### Local alignment¶

We define a simple cost function exampleCost and a function smithWaterman that, given strings x and y and a cost function, fills in and returns the corresponding local alignment matrix.

In [1]:
import numpy

def exampleCost(xc, yc):
''' Cost function: 2 to match, -6 to gap, -4 to mismatch '''
if xc == yc: return 2 # match
if xc == '-' or yc == '-': return -6 # gap
return -4

def smithWaterman(x, y, s):
''' Calculate local alignment values of sequences x and y using
dynamic programming.  Return maximal local alignment value. '''
V = numpy.zeros((len(x)+1, len(y)+1), dtype=int)
for i in range(1, len(x)+1):
for j in range(1, len(y)+1):
V[i, j] = max(V[i-1, j-1] + s(x[i-1], y[j-1]), # diagonal
V[i-1, j  ] + s(x[i-1], '-'),    # vertical
V[i  , j-1] + s('-',    y[j-1]), # horizontal
0)                               # empty
argmax = numpy.where(V == V.max())
return V, int(V[argmax])

In [2]:
x, y = 'GGTATGCTGGCGCTA', 'TATATGCGGCGTTT'
V, best = smithWaterman(x, y, exampleCost)
print(V)
print("Best score=%d, in cell %s" % (best, numpy.unravel_index(numpy.argmax(V), V.shape)))

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 0  0  0  0  0  0  2  0  2  2  0  2  0  0  0]
[ 0  0  0  0  0  0  2  0  2  4  0  2  0  0  0]
[ 0  2  0  2  0  2  0  0  0  0  0  0  4  2  2]
[ 0  0  4  0  4  0  0  0  0  0  0  0  0  0  0]
[ 0  2  0  6  0  6  0  0  0  0  0  0  2  2  2]
[ 0  0  0  0  2  0  8  2  2  2  0  2  0  0  0]
[ 0  0  0  0  0  0  2 10  4  0  4  0  0  0  0]
[ 0  2  0  2  0  2  0  4  6  0  0  0  2  2  2]
[ 0  0  0  0  0  0  4  0  6  8  2  2  0  0  0]
[ 0  0  0  0  0  0  2  0  2  8  4  4  0  0  0]
[ 0  0  0  0  0  0  0  4  0  2 10  4  0  0  0]
[ 0  0  0  0  0  0  2  0  6  2  4 12  6  0  0]
[ 0  0  0  0  0  0  0  4  0  2  4  6  8  2  0]
[ 0  2  0  2  0  2  0  0  0  0  0  0  8 10  4]
[ 0  0  4  0  4  0  0  0  0  0  0  0  2  4  6]]
Best score=12, in cell (12, 11)


Next we define a traceback function that, given a local alignment matrix, strings x and y, and a scoring function, returns the optimal edit transcript and alignment for the optimal substrings of x and y.

In [3]:
def traceback(V, x, y, s):
""" Trace back from given cell in local-alignment matrix V """
# get i, j for maximal cell
i, j = numpy.unravel_index(numpy.argmax(V), V.shape)
xscript, alx, aly, alm = [], [], [], []
while (i > 0 or j > 0) and V[i, j] != 0:
diag, vert, horz = 0, 0, 0
if i > 0 and j > 0:
diag = V[i-1, j-1] + s(x[i-1], y[j-1])
if i > 0:
vert = V[i-1, j] + s(x[i-1], '-')
if j > 0:
horz = V[i, j-1] + s('-', y[j-1])
if diag >= vert and diag >= horz:
match = x[i-1] == y[j-1]
xscript.append('M' if match else 'R')
alm.append('|' if match else ' ')
alx.append(x[i-1]); aly.append(y[j-1])
i -= 1; j -= 1
elif vert >= horz:
xscript.append('D')
alx.append(x[i-1]); aly.append('-'); alm.append(' ')
i -= 1
else:
xscript.append('I')
aly.append(y[j-1]); alx.append('-'); alm.append(' ')
j -= 1
xscript = (''.join(xscript))[::-1]
alignment = '\n'.join(map(lambda x: ''.join(x), [alx[::-1], alm[::-1], aly[::-1]]))
return xscript, alignment

In [4]:
print(traceback(V, x, y, exampleCost)[1])

TATGCTGGCG
||||| ||||
TATGC-GGCG

In [5]:
def editDistanceLikeCost(xc, yc):
return 1 if xc == yc else -1

x = 'he_will_after_his_sour_fashion_tell_you'
y = 'struts_and_frets_his_hour_upon_the_stage '
V, best = smithWaterman(x, y, editDistanceLikeCost)
print('Best score: %d' % best)
print(traceback(V, x, y, exampleCost)[1])

Best score: 8
_his_sour_
||||| ||||
_his_hour_