def bmApproximate(p, t, k, alph="ACGT"):
""" Use the pigeonhole principle together with Boyer-Moore to find
approximate matches with up to a specified number of mismatches. """
assert len(p) >= k+1
ps = partition(p, k+1) # split p into list of k+1 non-empty, non-overlapping substrings
off = 0 # offset into p of current partition
occurrences = set() # note we might see the same occurrence >1 time
for pi in ps: # for each partition
# NOTE: I haven't given the implementation for the BMPreprocessing object.
# It implements the Boyer-Moore skipping rules as we discussed in class.
bm_prep = BMPreprocessing(pi, alph=alph) # BM preprocess the partition
for hit in bm_prep.match(t)[0]:
if hit - off < 0: continue # pattern falls off left end of T?
if hit + len(p) - off > len(t): continue # falls off right end?
# Count mismatches to left and right of the matching partition
nmm = 0
for i in range(0, off) + range(off+len(pi), len(p)):
if t[hit-off+i] != p[i]:
nmm += 1
if nmm > k: break # exceeded maximum # mismatches
if nmm <= k:
occurrences.add(hit-off) # approximate match
off += len(pi) # Update offset of current partition
return sorted(list(occurrences))