In [1]:
def hammingDistance(x, y):
''' Return Hamming distance between x and y '''
assert len(x) == len(y)
nmm = 0
for i in range(0, len(x)):
if x[i] != y[i]:
nmm += 1
return nmm

In [2]:
hammingDistance('brown', 'blown')

Out[2]:
1
In [3]:
hammingDistance('cringe', 'orange')

Out[3]:
2
In [4]:
def boundEditDistance(x, y):
''' Return loose lower and upper bounds on the edit distance
between x and y in O(|x| + |y|) time. '''
if x == y: return 0, 0
if len(x) == 0: return len(y), len(y)
if len(y) == 0: return len(x), len(x)
diff = abs(len(x) - len(y))
lower = diff
if lower == 0 and x != y:
lower = 1
minlen = min(len(x), len(y))
upper = hammingDistance(x[:minlen], y[:minlen]) + diff
return lower, upper

In [5]:
boundEditDistance('brown', 'blown')

Out[5]:
(1, 1)
In [6]:
boundEditDistance('create', 'creation')

Out[6]:
(2, 3)
In [7]:
boundEditDistance('aaa', 'bbb')

Out[7]:
(1, 3)
In [8]:
# note: bounds can be pretty rough
boundEditDistance('the longest', 'longest day')

Out[8]:
(1, 11)
In [9]:
boundEditDistance('Shakespeare', 'shake spear')

Out[9]:
(1, 7)
In [10]:
def edDistRecursive(x, y):
if len(x) == 0: return len(y)
if len(y) == 0: return len(x)
delt = 1 if x[-1] != y[-1] else 0
diag = edDistRecursive(x[:-1], y[:-1]) + delt
vert = edDistRecursive(x[:-1], y) + 1
horz = edDistRecursive(x, y[:-1]) + 1
return min(diag, vert, horz)

In [11]:
edDistRecursive('Shakespeare', 'shake spear') # this takes a while!

Out[11]:
3
In [12]:
# let's see how long it takes
from time import time
st = time()
edDistRecursive('Shakespeare', 'shake spear')
print('Took %0.3f seconds' % (time() - st))

Took 25.144 seconds

In [13]:
def edDistRecursiveMemo(x, y, memo=None):
''' A version of edDistRecursive with memoization.  For each x, y we see, we
record result from edDistRecursiveMemo(x, y).  In the future, we retrieve
recorded result rather than re-run the function. '''
if memo is None: memo = {}
if len(x) == 0: return len(y)
if len(y) == 0: return len(x)
if (len(x), len(y)) in memo:
return memo[(len(x), len(y))]
delt = 1 if x[-1] != y[-1] else 0
diag = edDistRecursiveMemo(x[:-1], y[:-1], memo) + delt
vert = edDistRecursiveMemo(x[:-1], y, memo) + 1
horz = edDistRecursiveMemo(x, y[:-1], memo) + 1
ans = min(diag, vert, horz)
memo[(len(x), len(y))] = ans
return ans

In [14]:
edDistRecursiveMemo('Shakespeare', 'shake spear')

Out[14]:
3
In [15]:
# this time it's instantaneous
from time import time
st = time()
edDistRecursiveMemo('Shakespeare', 'shake spear')
print('Took %0.3f seconds' % (time() - st))

Took 0.000 seconds

In [16]:
from numpy import zeros

def edDistDp(x, y):
""" Calculate edit distance between sequences x and y using
matrix dynamic programming.  Return distance. """
D = zeros((len(x)+1, len(y)+1), dtype=int)
D[0, 1:] = range(1, len(y)+1)
D[1:, 0] = range(1, len(x)+1)
for i in range(1, len(x)+1):
for j in range(1, len(y)+1):
delt = 1 if x[i-1] != y[j-1] else 0
D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1)
return D[len(x), len(y)]

In [17]:
edDistDp('Shakespeare', 'shake spear')

Out[17]:
3
In [18]:
# again, instantaneous
from time import time
st = time()
edDistDp('Shakespeare', 'shake spear')
print('Took %0.3f seconds' % (time() - st))

Took 0.000 seconds

In [19]:
edDistDp('GCGTATGCACGC', 'GCTATGCCACGC')

Out[19]:
2