#!/usr/bin/env python # coding: utf-8 # In[1]: # Using an edit-distance-like dynamic programming formulation, we can # look for approximate occurrences of p in t. import sys import numpy # Assume x is the string labeling rows of the matrix and y is the # string labeling the columns def trace(D, x, y): ''' Backtrace edit-distance matrix D for strings x and y ''' i, j = len(x), len(y) xscript = [] while i > 0: diag, vert, horz = sys.maxsize, sys.maxsize, sys.maxsize delt = None if i > 0 and j > 0: delt = 0 if x[i-1] == y[j-1] else 1 diag = D[i-1, j-1] + delt if i > 0: vert = D[i-1, j] + 1 if j > 0: horz = D[i, j-1] + 1 if diag <= vert and diag <= horz: # diagonal was best xscript.append('R' if delt == 1 else 'M') i -= 1; j -= 1 elif vert <= horz: # vertical was best; this is an insertion in x w/r/t y xscript.append('I') i -= 1 else: # horizontal was best xscript.append('D') j -= 1 # j = offset of the first (leftmost) character of t involved in the # alignment return j, (''.join(xscript))[::-1] # reverse and string-ize def kEditDp(p, t): ''' Find and return the alignment of p to a substring of t with the fewest edits. We return the edit distance, the offset of the substring aligned to, and the edit transcript. If multiple alignments tie for best, we report the leftmost. ''' D = numpy.zeros((len(p)+1, len(t)+1), dtype=int) # Note: First row gets zeros. First column initialized as usual. D[1:, 0] = range(1, len(p)+1) for i in range(1, len(p)+1): for j in range(1, len(t)+1): delt = 1 if p[i-1] != t[j-1] else 0 D[i, j] = min(D[i-1, j-1] + delt, D[i-1, j] + 1, D[i, j-1] + 1) # Find minimum edit distance in last row mnJ, mn = None, len(p) + len(t) for j in range(len(t)+1): if D[len(p), j] < mn: mnJ, mn = j, D[len(p), j] # Backtrace; note: stops as soon as it gets to first row off, xcript = trace(D, p, t[:mnJ]) # Return edit distance, offset into T, edit transcript return mn, off, xcript, D # In[2]: p, t = 'TACGTCAGC', 'AACCCTATGTCATGCCTTGGA' mn, off, xscript, D = kEditDp(p, t) mn, off, xscript # In[3]: D # In[ ]: