# First we make a function that splits a string p up into a set of # non-overlapping, non-empty substrings. def partition(p, pieces=2): assert len(p) >= pieces base, mod = len(p) / pieces, len(p) % pieces idx = 0 ps = [] modAdjust = 1 for i in xrange(0, pieces): if i >= mod: modAdjust = 0 newIdx = idx + base + modAdjust ps.append(p[idx:newIdx]) idx = newIdx return ps def bmApproximate(p, t, k, alph="ACGT"): """ Use the pigeonhole principle together with Boyer-Moore to find approximate matches with up to a specified number of mismatches. """ assert len(p) >= k+1 ps = partition(p, k+1) # split p into list of k+1 non-empty, non-overlapping substrings off = 0 # offset into p of current partition occurrences = set() # note we might see the same occurrence >1 time for pi in ps: # for each partition # NOTE: I haven't given the implementation for the BMPreprocessing object. # It implements the Boyer-Moore skipping rules as we discussed in class. bm_prep = BMPreprocessing(pi, alph=alph) # BM preprocess the partition for hit in bm_prep.match(t)[0]: if hit - off < 0: continue # pattern falls off left end of T? if hit + len(p) - off > len(t): continue # falls off right end? # Count mismatches to left and right of the matching partition nmm = 0 for i in range(0, off) + range(off+len(pi), len(p)): if t[hit-off+i] != p[i]: nmm += 1 if nmm > k: break # exceeded maximum # mismatches if nmm <= k: occurrences.add(hit-off) # approximate match off += len(pi) # Update offset of current partition return sorted(list(occurrences)) bmApproximate('needle', 'needle noodle nargle', 2, alph='abcdefghijklmnopqrstuvwxyz ')