#!/usr/bin/env python # coding: utf-8 # ### Local alignment # We define a simple cost function `exampleCost` and a function `smithWaterman` that, given strings x and y and a cost function, fills in and returns the corresponding local alignment matrix. # In[1]: import numpy def exampleCost(xc, yc): ''' Cost function: 2 to match, -6 to gap, -4 to mismatch ''' if xc == yc: return 2 # match if xc == '-' or yc == '-': return -6 # gap return -4 def smithWaterman(x, y, s): ''' Calculate local alignment values of sequences x and y using dynamic programming. Return maximal local alignment value. ''' V = numpy.zeros((len(x)+1, len(y)+1), dtype=int) for i in range(1, len(x)+1): for j in range(1, len(y)+1): V[i, j] = max(V[i-1, j-1] + s(x[i-1], y[j-1]), # diagonal V[i-1, j ] + s(x[i-1], '-'), # vertical V[i , j-1] + s('-', y[j-1]), # horizontal 0) # empty argmax = numpy.where(V == V.max()) return V, int(V[argmax]) # In[2]: x, y = 'GGTATGCTGGCGCTA', 'TATATGCGGCGTTT' V, best = smithWaterman(x, y, exampleCost) print(V) print("Best score=%d, in cell %s" % (best, numpy.unravel_index(numpy.argmax(V), V.shape))) # Next we define a `traceback` function that, given a local alignment matrix, strings x and y, and a scoring function, returns the optimal edit transcript and alignment for the optimal substrings of x and y. # In[3]: def traceback(V, x, y, s): """ Trace back from given cell in local-alignment matrix V """ # get i, j for maximal cell i, j = numpy.unravel_index(numpy.argmax(V), V.shape) xscript, alx, aly, alm = [], [], [], [] while (i > 0 or j > 0) and V[i, j] != 0: diag, vert, horz = 0, 0, 0 if i > 0 and j > 0: diag = V[i-1, j-1] + s(x[i-1], y[j-1]) if i > 0: vert = V[i-1, j] + s(x[i-1], '-') if j > 0: horz = V[i, j-1] + s('-', y[j-1]) if diag >= vert and diag >= horz: match = x[i-1] == y[j-1] xscript.append('M' if match else 'R') alm.append('|' if match else ' ') alx.append(x[i-1]); aly.append(y[j-1]) i -= 1; j -= 1 elif vert >= horz: xscript.append('D') alx.append(x[i-1]); aly.append('-'); alm.append(' ') i -= 1 else: xscript.append('I') aly.append(y[j-1]); alx.append('-'); alm.append(' ') j -= 1 xscript = (''.join(xscript))[::-1] alignment = '\n'.join(map(lambda x: ''.join(x), [alx[::-1], alm[::-1], aly[::-1]])) return xscript, alignment # In[4]: print(traceback(V, x, y, exampleCost)[1]) # In[5]: def editDistanceLikeCost(xc, yc): return 1 if xc == yc else -1 x = 'he_will_after_his_sour_fashion_tell_you' y = 'struts_and_frets_his_hour_upon_the_stage ' V, best = smithWaterman(x, y, editDistanceLikeCost) print('Best score: %d' % best) print(traceback(V, x, y, exampleCost)[1])