We discussed the "string matching" problem and the Karp-Rabin(KR) algorithm for solving it. Then we solved a question about KR using generators.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Given a string $text$ of length $n$, and a short(er) string $pattern$ of length $m$ ($m\leq n$), report all occurrances of $pattern$ in $text$.
Example:
$text = $"abracadabra", $pattern = $"abr"
The requested output should be $[0,7]$, since $pattern$ appears in $text$ in indices $0,7$.
def fingerprint(text, basis=2**16, r=2**32-3):
""" used to compute karp-rabin fingerprint of the pattern
employs Horner method (modulo r) """
partial_sum = 0
for ch in text:
partial_sum =(partial_sum*basis + ord(ch)) % r
return partial_sum
def text_fingerprint(text, m, basis=2**16, r=2**32-3):
""" computes karp-rabin fingerprint of the text """
f=[]
b_power = pow(basis, m-1, r)
list.append(f, fingerprint(text[0:m], basis, r))
# f[0] equals first text fingerprint
for s in range(1, len(text)-m+1):
new_fingerprint = ( (f[s-1] - ord(text[s-1])*b_power)*basis
+ord(text[s+m-1]) ) % r
# compute f[s], based on f[s-1]
list.append(f,new_fingerprint)# append f[s] to existing f
return f
def find_matches_KR(pattern, text, basis=2**16, r=2**32-3):
""" find all occurances of pattern in text
using coin flipping Karp-Rabin algorithm """
if len(pattern) > len(text):
return []
p = fingerprint(pattern, basis, r)
f = text_fingerprint(text, len(pattern), basis, r)
matches = [s for (s,f_s) in enumerate(f) if f_s == p]
# list comprehension
return matches
text = "abracadabra"
pattern = "abr"
fingerprint("abr")
6422933
base = 2**16
arit = ord("a")*(base**2) + ord("b")*(base**1) + ord("r")*(base**0)
arit
r = 2**32 - 3
fp = arit%r
fp
416618250354
6422933
text_fingerprint(text, 3)
[6422933, 7471495, 6357433, 6488452, 6357389, 6553988, 6357390, 6422933, 7471495]
find_matches_KR(pattern, text)
[0, 7]
Makes sure no false positives occur. In the worst case, when all $n-m$ possible substrings are indeed matches, behaves as the naive solution in terms of time complexity.
def find_matches_KR_safe(pattern, text, basis=2**16, r=2**32-3):
""" a safe version of KR
checks every suspect for a match """
if len(pattern) > len(text):
return []
p = fingerprint(pattern, basis, r)
f = text_fingerprint(text, len(pattern), basis, r)
matches = [s for (s,f_s) in enumerate(f) if f_s == p \
and text[s:s+len(pattern)]==pattern]
#note that python performs "cleaver evaluation" of the 'and' statement
return matches
This is the worst-case scenario for the safe version. Changing $m$ has a greater effect on the safe version than on the standard KR.
import time
text = "a"*100000
print("text = 'a'*",len(text))
for pattern in ["a"*100, "a"*1000, "a"*10000]:
#for pattern in ["a"*30000, "a"*40000, "a"*50000, "a"*60000, "a"*70000]: #max in m=n/2
print("pattern = 'a'*",len(pattern))
for f in [find_matches_KR_safe, find_matches_KR]:
t0=time.clock()
res=f(pattern, text)
t1=time.clock()
print (f.__name__, t1-t0)
print("") #newline
text = 'a'* 100000 pattern = 'a'* 100 find_matches_KR_safe 0.17724202722919818 find_matches_KR 0.1276535441773806 pattern = 'a'* 1000 find_matches_KR_safe 0.18844752517457397 find_matches_KR 0.12176520046895167 pattern = 'a'* 10000 find_matches_KR_safe 0.27398147088196856 find_matches_KR 0.13507782308521266
Note that the standard and safe versions of KR has similar running times. Moreover, as $m$ increases, the running time slightly decreases since there are less candidates to consider.
import random
def gen_str(size):
''' Generate a random lowercase English string of length size'''
s=""
for i in range(size):
s+=random.choice("abcdefghijklmnopqrstuvwxyz")
return s
n=100000
m=1000
text = gen_str(n)
pattern = gen_str(m)
print("random str of len n=", n, " , random pattern of length m=",m)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.clock()
f(pattern, text)
t1=time.clock()
print (f.__name__, t1-t0)
n=100000
m=10000
text = gen_str(n)
pattern = gen_str(m)
print("random str of len n=", n, " , random pattern of length m=",m)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.clock()
f(pattern, text)
t1=time.clock()
print (f.__name__, t1-t0)
random str of len n= 100000 , random pattern of length m= 1000
[]
find_matches_KR 0.17595477853319608
[]
find_matches_KR_safe 0.1846639377035899 random str of len n= 100000 , random pattern of length m= 10000
[]
find_matches_KR 0.1361657278466737
[]
find_matches_KR_safe 0.17290264515031595
By setting $r$ to be a power of the base, say $r=base$, we will obtain more false-positives. This may serve as an intuition for choosing a prime $r$.
find_matches_KR("da", "abracadabra", basis=2**16, r=2**16)
find_matches_KR_safe("da", "abracadabra", basis=2**16, r=2**16)
[2, 4, 6, 9]
[6]
fingerprint("da", 2**16, r=2**16)
ord("d")*(2**16)**1 + ord("a")
ord("a")
fingerprint("ca", 2**16, r=2**16)
ord("c")*(2**16)**1 + ord("a")
(ord("c")*(2**16)**1 + ord("a") )%2**16
97
6553697
97
97
6488161
97
base = 2**16
r = 2**16
fingerprint("bda", base, r)
ord("b")*(base**2) + ord("d")*(base**1) + ord("a")
(ord("b")*base + ord("d"))*base + ord("a")
((ord("b")*base + ord("d"))*base + ord("a"))%r == ord("a")%r
fingerprint("cda", base, r)
(ord("c")*base + ord("d"))*base + ord("a")
((ord("c")*base + ord("d"))*base + ord("a"))%r == ord("a")%r
97
420913348705
420913348705
True
97
425208316001
True
find_matches_KR("Humpty", "Humpty Dumpty", r=2**(16*5))
[0, 7]
fingerprint("Humpty", r=2**(16*5))
fingerprint("Dumpty", r=2**(16*5))
2158299737877522940025
2158299737877522940025
text_fingerprint("Humpty Dumpty", 6, r=2**(16*5))
find_matches_KR("Humpty", "Humpty Dumpty", r=2**(16*6))
[0]
def fingerprint_sum(string, r=2**32-3):
''' a different kind of fingerprint: sum of all ords '''
partial_sum=0
for x in string:
partial_sum = (partial_sum + ord(x)) % r
return partial_sum
def text_fingerprint_sum(string, length, r=2**32-3):
""" used to compute a simple sum fingerprint of the text """
f =[]
list.append(f, fingerprint_sum(string[0:length], r))
for s in range (1, len(string)-length+1):
new_fingerprint = (f[s-1] - ord(string[s-1]) + ord(string[s+length-1])) % r
# compute f[s], based on f[s -1]
list.append(f, new_fingerprint) # append f[s] to existing f
return f
text_fingerprint_sum("abracadabra", 3)
[309, 309, 310, 293, 296, 294, 295, 309, 309]
def fingerprint(string, basis=2**16, r=2**32-3):
""" used to computes karp-rabin fingerprint of the pattern
employs Horner method (modulo r) """
partial_sum=0
for x in string:
partial_sum=(partial_sum*basis+ord(x)) % r
return partial_sum
def slide(prev_fp,prev_char,next_char,b_power,basis=2**16,r=2**32-3):
new_fp=((prev_fp-ord(prev_char)*b_power)*basis+ord(next_char)) % r
return new_fp
def kr_gen(text,length,basis=2**16,r=2**32-3):
fp = fingerprint(text[:length])
yield fp
b_power = pow(basis,length-1,r)
for s in range(1,len(text)-length+1):
fp = slide(fp,text[s-1],text[s-1+length],b_power)
yield fp
gen = kr_gen("abracadabra", 3)
next(gen)
next(gen)
6422933
7471495
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
6357433
6488452
6357389
6553988
6357390
6422933
7471495
--------------------------------------------------------------------------- StopIteration Traceback (most recent call last) <ipython-input-28-6fd1aa460113> in <module>() 6 next(gen) 7 next(gen) ----> 8 next(gen) 9 next(gen) 10 next(gen) StopIteration:
def generate_shared_substrings(text1,text2,length):
g1 = kr_gen(text1, length)
i1 = 0
for fp1 in g1:
g2 = kr_gen(text2, length)
i2 = 0
for fp2 in g2:
if fp1 == fp2:
yield(i1, i2)
i2 += 1
i1 += 1
g = generate_shared_substrings("abcdef","xcdefx",3)
next(g)
next(g)
(2, 1)
(3, 2)
next(g)
--------------------------------------------------------------------------- StopIteration Traceback (most recent call last) <ipython-input-31-e734f8aca5ac> in <module>() ----> 1 next(g) StopIteration: