#!/usr/bin/env python # coding: utf-8 # In[1]: def hammingDistance(x, y): ''' Return Hamming distance between x and y ''' assert len(x) == len(y) nmm = 0 for i in range(0, len(x)): if x[i] != y[i]: nmm += 1 return nmm # In[2]: hammingDistance('brown', 'blown') # In[3]: hammingDistance('cringe', 'orange') # In[4]: def boundEditDistance(x, y): ''' Return loose lower and upper bounds on the edit distance between x and y in O(|x| + |y|) time. ''' if x == y: return 0, 0 if len(x) == 0: return len(y), len(y) if len(y) == 0: return len(x), len(x) diff = abs(len(x) - len(y)) lower = diff if lower == 0 and x != y: lower = 1 minlen = min(len(x), len(y)) upper = hammingDistance(x[:minlen], y[:minlen]) + diff return lower, upper # In[5]: boundEditDistance('brown', 'blown') # In[6]: boundEditDistance('create', 'creation') # In[7]: boundEditDistance('aaa', 'bbb') # In[8]: # note: bounds can be pretty rough boundEditDistance('the longest', 'longest day') # In[9]: boundEditDistance('Shakespeare', 'shake spear') # In[10]: def edDistRecursive(x, y): if len(x) == 0: return len(y) if len(y) == 0: return len(x) delt = 1 if x[-1] != y[-1] else 0 diag = edDistRecursive(x[:-1], y[:-1]) + delt vert = edDistRecursive(x[:-1], y) + 1 horz = edDistRecursive(x, y[:-1]) + 1 return min(diag, vert, horz) # In[11]: edDistRecursive('Shakespeare', 'shake spear') # this takes a while! # In[12]: # let's see how long it takes from time import time st = time() edDistRecursive('Shakespeare', 'shake spear') print('Took %0.3f seconds' % (time() - st)) # In[13]: def edDistRecursiveMemo(x, y, memo=None): ''' A version of edDistRecursive with memoization. For each x, y we see, we record result from edDistRecursiveMemo(x, y). In the future, we retrieve recorded result rather than re-run the function. ''' if memo is None: memo = {} if len(x) == 0: return len(y) if len(y) == 0: return len(x) if (len(x), len(y)) in memo: return memo[(len(x), len(y))] delt = 1 if x[-1] != y[-1] else 0 diag = edDistRecursiveMemo(x[:-1], y[:-1], memo) + delt vert = edDistRecursiveMemo(x[:-1], y, memo) + 1 horz = edDistRecursiveMemo(x, y[:-1], memo) + 1 ans = min(diag, vert, horz) memo[(len(x), len(y))] = ans return ans # In[14]: edDistRecursiveMemo('Shakespeare', 'shake spear') # In[15]: # this time it's instantaneous from time import time st = time() edDistRecursiveMemo('Shakespeare', 'shake spear') print('Took %0.3f seconds' % (time() - st)) # In[16]: from numpy import zeros def edDistDp(x, y): """ Calculate edit distance between sequences x and y using matrix dynamic programming. Return distance. """ D = zeros((len(x)+1, len(y)+1), dtype=int) D[0, 1:] = range(1, len(y)+1) D[1:, 0] = range(1, len(x)+1) for i in range(1, len(x)+1): for j in range(1, len(y)+1): delt = 1 if x[i-1] != y[j-1] else 0 D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1) return D[len(x), len(y)] # In[17]: edDistDp('Shakespeare', 'shake spear') # In[18]: # again, instantaneous from time import time st = time() edDistDp('Shakespeare', 'shake spear') print('Took %0.3f seconds' % (time() - st)) # In[19]: edDistDp('GCGTATGCACGC', 'GCTATGCCACGC')