In [1]:

# Using an edit-distance-like dynamic programming formulation, we can
# look for approximate occurrences of p in t.

import sys
import numpy

# Assume x is the string labeling rows of the matrix and y is the
# string labeling the columns

def trace(D, x, y):
    ''' Backtrace edit-distance matrix D for strings x and y '''
    i, j = len(x), len(y)
    xscript = []
    while i > 0:
        diag, vert, horz = sys.maxsize, sys.maxsize, sys.maxsize
        delt = None
        if i > 0 and j > 0:
            delt = 0 if x[i-1] == y[j-1] else 1
            diag = D[i-1, j-1] + delt
        if i > 0:
            vert = D[i-1, j] + 1
        if j > 0:
            horz = D[i, j-1] + 1
        if diag <= vert and diag <= horz:
            # diagonal was best
            xscript.append('R' if delt == 1 else 'M')
            i -= 1; j -= 1
        elif vert <= horz:
            # vertical was best; this is an insertion in x w/r/t y
            xscript.append('I')
            i -= 1
        else:
            # horizontal was best
            xscript.append('D')
            j -= 1
    # j = offset of the first (leftmost) character of t involved in the
    # alignment
    return j, (''.join(xscript))[::-1] # reverse and string-ize

def kEditDp(p, t):
    ''' Find and return the alignment of p to a substring of t with the
        fewest edits.  We return the edit distance, the offset of the
        substring aligned to, and the edit transcript.  If multiple
        alignments tie for best, we report the leftmost. '''
    D = numpy.zeros((len(p)+1, len(t)+1), dtype=int)
    # Note: First row gets zeros.  First column initialized as usual.
    D[1:, 0] = range(1, len(p)+1)
    for i in range(1, len(p)+1):
        for j in range(1, len(t)+1):
            delt = 1 if p[i-1] != t[j-1] else 0
            D[i, j] = min(D[i-1, j-1] + delt, D[i-1, j] + 1, D[i, j-1] + 1)
    # Find minimum edit distance in last row
    mnJ, mn = None, len(p) + len(t)
    for j in range(len(t)+1):
        if D[len(p), j] < mn:
            mnJ, mn = j, D[len(p), j]
    # Backtrace; note: stops as soon as it gets to first row
    off, xcript = trace(D, p, t[:mnJ])
    # Return edit distance, offset into T, edit transcript
    return mn, off, xcript, D

In [2]:

p, t = 'TACGTCAGC', 'AACCCTATGTCATGCCTTGGA'
mn, off, xscript, D = kEditDp(p, t)
mn, off, xscript

Out[2]:

(2, 5, 'MMRMMMMDMM')

In [3]:

Out[3]:

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1],
       [2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1],
       [3, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2],
       [4, 3, 3, 2, 2, 3, 3, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 2, 3],
       [5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 1, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3],
       [6, 5, 5, 4, 3, 3, 4, 4, 3, 3, 2, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4],
       [7, 6, 5, 5, 4, 4, 4, 4, 4, 4, 3, 2, 1, 2, 3, 4, 4, 4, 4, 4, 5, 4],
       [8, 7, 6, 6, 5, 5, 5, 5, 5, 4, 4, 3, 2, 2, 2, 3, 4, 5, 5, 4, 4, 5],
       [9, 8, 7, 6, 6, 5, 6, 6, 6, 5, 5, 4, 3, 3, 3, 2, 3, 4, 5, 5, 5, 5]])

In [ ]: