In [1]:
def hammingDistance(x, y):
    ''' Return Hamming distance between x and y '''
    assert len(x) == len(y)
    nmm = 0
    for i in range(0, len(x)):
        if x[i] != y[i]:
            nmm += 1
    return nmm
In [2]:
hammingDistance('brown', 'blown')
Out[2]:
1
In [3]:
hammingDistance('cringe', 'orange')
Out[3]:
2
In [4]:
def boundEditDistance(x, y):
    ''' Return loose lower and upper bounds on the edit distance
        between x and y in O(|x| + |y|) time. '''
    if x == y: return 0, 0
    if len(x) == 0: return len(y), len(y)
    if len(y) == 0: return len(x), len(x)
    diff = abs(len(x) - len(y))
    lower = diff
    if lower == 0 and x != y:
        lower = 1
    minlen = min(len(x), len(y))
    upper = hammingDistance(x[:minlen], y[:minlen]) + diff
    return lower, upper
In [5]:
boundEditDistance('brown', 'blown')
Out[5]:
(1, 1)
In [6]:
boundEditDistance('create', 'creation')
Out[6]:
(2, 3)
In [7]:
boundEditDistance('aaa', 'bbb')
Out[7]:
(1, 3)
In [8]:
# note: bounds can be pretty rough
boundEditDistance('the longest', 'longest day')
Out[8]:
(1, 11)
In [9]:
boundEditDistance('Shakespeare', 'shake spear')
Out[9]:
(1, 7)
In [10]:
def edDistRecursive(x, y):
    if len(x) == 0: return len(y)
    if len(y) == 0: return len(x)
    delt = 1 if x[-1] != y[-1] else 0
    diag = edDistRecursive(x[:-1], y[:-1]) + delt 
    vert = edDistRecursive(x[:-1], y) + 1
    horz = edDistRecursive(x, y[:-1]) + 1
    return min(diag, vert, horz)
In [11]:
edDistRecursive('Shakespeare', 'shake spear') # this takes a while!
Out[11]:
3
In [12]:
# let's see how long it takes
from time import time
st = time()
edDistRecursive('Shakespeare', 'shake spear')
print('Took %0.3f seconds' % (time() - st))
Took 25.144 seconds
In [13]:
def edDistRecursiveMemo(x, y, memo=None):
    ''' A version of edDistRecursive with memoization.  For each x, y we see, we
        record result from edDistRecursiveMemo(x, y).  In the future, we retrieve
        recorded result rather than re-run the function. '''
    if memo is None: memo = {}
    if len(x) == 0: return len(y)
    if len(y) == 0: return len(x)
    if (len(x), len(y)) in memo:
        return memo[(len(x), len(y))]
    delt = 1 if x[-1] != y[-1] else 0
    diag = edDistRecursiveMemo(x[:-1], y[:-1], memo) + delt
    vert = edDistRecursiveMemo(x[:-1], y, memo) + 1
    horz = edDistRecursiveMemo(x, y[:-1], memo) + 1
    ans = min(diag, vert, horz)
    memo[(len(x), len(y))] = ans
    return ans
In [14]:
edDistRecursiveMemo('Shakespeare', 'shake spear')
Out[14]:
3
In [15]:
# this time it's instantaneous
from time import time
st = time()
edDistRecursiveMemo('Shakespeare', 'shake spear')
print('Took %0.3f seconds' % (time() - st))
Took 0.000 seconds
In [16]:
from numpy import zeros

def edDistDp(x, y):
    """ Calculate edit distance between sequences x and y using
        matrix dynamic programming.  Return distance. """
    D = zeros((len(x)+1, len(y)+1), dtype=int)
    D[0, 1:] = range(1, len(y)+1)
    D[1:, 0] = range(1, len(x)+1)
    for i in range(1, len(x)+1):
        for j in range(1, len(y)+1):
            delt = 1 if x[i-1] != y[j-1] else 0
            D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1)
    return D[len(x), len(y)]
In [17]:
edDistDp('Shakespeare', 'shake spear')
Out[17]:
3
In [18]:
# again, instantaneous
from time import time
st = time()
edDistDp('Shakespeare', 'shake spear')
print('Took %0.3f seconds' % (time() - st))
Took 0.000 seconds
In [19]:
edDistDp('GCGTATGCACGC', 'GCTATGCCACGC')
Out[19]:
2