We discussed iterators, generators, and the Karp-Rabin algorithm for string matching
A generator function is a function that contains the yield command and returns a genertor object.
The Karp-Rabin algorithm is a probabilistic string matching algorithm (find all occurences of a string of length $m$ in a string of length $n$) running in linear time $O(m+n)$ (as opposed to the naive solution which has running time $O(nm)$).
Make sure you read our KR summary.
Make sure you understand the way the algorithm works, and in particular the "rolling hash mechanism", that is, how to compute the fingerprint of the next substring in $O(1)$ time, given the fingerprint of the current substring.
Make sure you understand the "arithmetization" step used by the algorithm.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
l = [1,2,3]
li = iter(l)
type(li)
li2 = iter(l)
list_iterator
next(li)
1
next(li)
2
z = next(li)
print("z is", z)
z is 3
next(li)
--------------------------------------------------------------------------- StopIteration Traceback (most recent call last) <ipython-input-86-c9b8845252db> in <module> ----> 1 next(li) StopIteration:
next(li2)
1
for elem in li2:
print(elem)
2 3
def countdown_gen():
''' calling it creates an iterator for the values 5,4,3,2,1,'launch' '''
yield 5
yield 4
yield 3
yield 2
yield 1
yield 'launch'
cd = countdown_gen()
type(cd) #generator object (private case of iterator)
generator
next(cd)
5
next(cd)
4
next(cd)
3
for e in cd:
print(e)
2 1 launch
With generators we can have infinite iterations
def countup_infinite():
i = 0
while True:
yield i
#print(i)
i += 1
cd = countup_infinite()
lst = [next(cd) for i in range(10)]
lst
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
next(cd)
10
Section (b)
def SomePairs():
i=1
while True:
for j in range(i):
yield(i,j)
i=i+1
gen = SomePairs()
[next(gen) for x in range(10)]
[(1, 0), (2, 0), (2, 1), (3, 0), (3, 1), (3, 2), (4, 0), (4, 1), (4, 2), (4, 3)]
Section (c)
def RevGen(PairsGen):
pairs = PairsGen()
while True:
pair = next(pairs)
yield(pair[1],pair[0])
gen = RevGen(SomePairs)
[next(gen) for x in range(10)]
[(0, 1), (0, 2), (1, 2), (0, 3), (1, 3), (2, 3), (0, 4), (1, 4), (2, 4), (3, 4)]
Section (d1)
def UnionGenerators(gen1, gen2):
while True:
yield next(gen1)
yield next(gen2)
ug = UnionGenerators(SomePairs(), RevGen(SomePairs))
for x in range(10):
next(ug)
(1, 0)
(0, 1)
(2, 0)
(0, 2)
(2, 1)
(1, 2)
(3, 0)
(0, 3)
(3, 1)
(1, 3)
Section (d2)
def EqPairs():
i=0
while True:
yield (i,i)
i=i+1
def AllPairs():
return UnionGenerators(SomePairs(), \
UnionGenerators(EqPairs(),\
RevGen(SomePairs)))
gen = AllPairs()
[next(gen) for x in range(10)]
[(1, 0), (0, 0), (2, 0), (0, 1), (2, 1), (1, 1), (3, 0), (0, 2), (3, 1), (2, 2)]
Given a string $text$ of length $n$, and a short(er) string $pattern$ of length $m$ ($m\leq n$), report all occurrances of $pattern$ in $text$.
Example:
$text = $"abracadabra", $pattern = $"abr"
The requested output should be $[0,7]$, since $pattern$ appears in $text$ in indices $0,7$.
def is_prime(N, show_witness=False):
""" probabilistic test for N's compositeness """
for i in range(0,100):
a = random.randint(1,N-1) # a is a random integer in [1..N-1]
if pow(a,N-1,N) != 1:
if show_witness: # caller wishes to see a witness
print(N,"is composite","\n",a,"is a witness, i=",i+1)
return False
return True
def find_prime(n):
""" find random n-bit long prime """
while(True):
candidate = random.randrange(2**(n-1),2**n)
if is_prime(candidate):
return candidate
def fingerprint(string, basis, r):
""" used to computes karp-rabin fingerprint of the pattern
and of the first substring in the text
employs Horner method (modulo r) """
s = 0
for ch in string:
s = (s*basis + ord(ch)) % r # Horner
return s
def substring_fingerprints(string, m, basis, r):
""" return a list of all fingerprints of size m windows in string """
fp_list = []
b_power = pow(basis, m-1, r)
# compute first fingerprint
fp_list.append(fingerprint(string[0:m], basis, r))
# compute f_list[s], based on f_list[s-1]
for s in range(1,len(string)-m+1): # O(n-m-1), each iteration O(1)
next_fingerprint = \
((fp_list[s-1] - ord(string[s-1])*b_power) * basis \
+ ord(string[s+m-1])) % r
fp_list.append(next_fingerprint)
return fp_list
def find_matches_KR(text, pattern, r=None):
""" find all shifts of occurances of pattern in text """
if len(pattern) > len(text):
return [] #no matches
basis = 2**16 # assume 16 bits are enough for all chars
if r == None:
r = find_prime(32) # randomly pick a 32 bit long prime using
# good old find_prime from an earlier lecture
p = fingerprint(pattern, basis, r)
fp_list = substring_fingerprints(text, len(pattern), basis, r)
matches = [shift for shift in range(len(fp_list)) if fp_list[shift] == p]
return matches
text = "abracadabra"
pattern = "abr"
r = find_prime(32)
base = 2**16
r
2149219973
fingerprint("abr", base, r)
1818795565
base = 2**16
arit = ord("a")*(base**2) + ord("b")*(base**1) + ord("r")*(base**0)
arit
fp = arit%r
fp
416618250354
1818795565
substring_fingerprints(text, 3, base, r)
[1818795565, 1816371474, 1759694964, 1818861084, 1811784715, 1818926620, 1808312063, 1818795565, 1816371474]
find_matches_KR(text, pattern, r)
[0, 7]
Makes sure no false positives occur. In the worst case, when all $n-m$ possible substrings are indeed matches, behaves as the naive solution in terms of time complexity.
def find_matches_KR_safe(text, pattern, r=None):
""" a safe version of KR
checks every suspect for a match """
if len(pattern) > len(text):
return [] #no matches
basis = 2**16 # assume 16 bits are enough for all chars
if r == None:
r = find_prime(32) # randomly pick a 32 bit long prime using
# good old find_prime from an earlier lecture
p = fingerprint(pattern, basis, r)
fp_list = substring_fingerprints(text, len(pattern), basis, r)
matches = [shift for shift in range(len(fp_list)) \
if fp_list[shift] == p and \
text[shift:shift+len(pattern)]==pattern]
return matches
This is the worst-case scenario for the safe version. Changing $m$ has a greater effect on the safe version than on the standard KR.
import time
text = "a"*1000000
print("text = 'a'*",len(text))
print()
for pattern in ["a"*100, "a"*1000, "a"*10000, "a"*100000]:
print("pattern = 'a'*",len(pattern))
r = find_prime(32)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.perf_counter()
res=f(text, pattern, r)
t1=time.perf_counter()
print (f.__name__, t1-t0 )
print("") #newline
text = 'a'* 1000000 pattern = 'a'* 100 find_matches_KR 1.4179207529959967 find_matches_KR_safe 1.6589124150050338 pattern = 'a'* 1000 find_matches_KR 1.3420768009964377 find_matches_KR_safe 1.775463817990385 pattern = 'a'* 10000 find_matches_KR 1.2947049719950883 find_matches_KR_safe 2.638186254000175 pattern = 'a'* 100000 find_matches_KR 1.4077826730062952 find_matches_KR_safe 13.829727393997018
Note that the standard and safe versions of KR has similar running times. Moreover, as $m$ increases, the running time slightly decreases since there are less candidates to consider.
import random
def gen_str(size):
''' Generate a random lowercase English string of length size'''
s=""
for i in range(size):
s+=random.choice("abcdefghijklmnopqrstuvwxyz")
return s
n=1000000
m=1000
text = gen_str(n)
pattern = gen_str(m)
print("random str of len n=", n, " , random pattern of length m=",m)
r = find_prime(32)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.perf_counter()
f(text, pattern, r)
t1=time.perf_counter()
print (f.__name__, t1-t0)
m=10000
pattern = gen_str(m)
print("random str of len n=", n, " , random pattern of length m=",m)
r = find_prime(32)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.perf_counter()
f(text, pattern, r)
t1=time.perf_counter()
print (f.__name__, t1-t0)
m=100000
pattern = gen_str(m)
print("random str of len n=", n, " , random pattern of length m=",m)
r = find_prime(32)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.perf_counter()
f(text, pattern, r)
t1=time.perf_counter()
print (f.__name__, t1-t0)
random str of len n= 1000000 , random pattern of length m= 1000
[]
find_matches_KR 1.4785060709982645
[]
find_matches_KR_safe 1.3623063810082385 random str of len n= 1000000 , random pattern of length m= 10000
[]
find_matches_KR 1.383506228987244
[]
find_matches_KR_safe 1.3038542309950572 random str of len n= 1000000 , random pattern of length m= 100000
[]
find_matches_KR 1.25419370700547
[]
find_matches_KR_safe 1.6594200499966973
By setting $r$ to be a power of the base we will obtain more false-positives. Specifically, for $r=base$, we will get many more false positives. Why is that?
Note that the computation of the fingerprint is of the form $fp = (x \cdot base + y) \mod r$ for some $x,y$ (where $y$ is the arithmetization of the last character in the current substring).
In the case where $base = r$ this is equivalent to $(x \cdot r + y) \mod r = y \mod r$, i.e., the fingerprint takes into account only the last character of the substring!
Clearly, choosing $r$ which is a multiple of $base$ is not a good idea. This may also serve as an intuition for why we choose $r$ to be prime - say for example we know that all the fingerprints we compute are even, then if our $r$ is a multiple of $2$, we will always have $fp \mod r$ an even number, and thus we will only utilize half of our range!
# Many false positives
find_matches_KR("abracadabra", "da", r=2**16)
# Same result here since the "x" will be cancelled out
find_matches_KR("abracadabra", "xa", r=2**16)
# The safe check still works, of course
find_matches_KR_safe("abracadabra", "da", r=2**16)
[2, 4, 6, 9]
[2, 4, 6, 9]
[6]
fingerprint("da", 2**16, r=2**16)
ord("d")*(2**16)**1 + ord("a")
ord("a")
fingerprint("ca", 2**16, r=2**16)
ord("c")*(2**16)**1 + ord("a")
(ord("c")*(2**16)**1 + ord("a") )%2**16
97
6553697
97
97
6488161
97
base = 2**16
r = 2**16
fingerprint("bda", base, r)
ord("b")*(base**2) + ord("d")*(base**1) + ord("a")
(ord("b")*base + ord("d"))*base + ord("a")
((ord("b")*base + ord("d"))*base + ord("a"))%r == ord("a")%r
fingerprint("cda", base, r)
(ord("c")*base + ord("d"))*base + ord("a")
((ord("c")*base + ord("d"))*base + ord("a"))%r == ord("a")%r
97
420913348705
420913348705
True
97
425208316001
True
When $r = base^k$ for some $k$ we have a similar behaviour, though this time it means we only take the last $k$ characters of the rolling hash into account.
find_matches_KR("Humpty Dumpty", "Humpty", r=2**(16*5))
[0, 7]
fingerprint("Humpty", 2**16, r=2**(16*5))
fingerprint("Dumpty", 2**16, r=2**(16*5))
fingerprint("Xumpty", 2**16, r=2**(16*5))
2158299737877522940025
2158299737877522940025
2158299737877522940025
substring_fingerprints("Humpty Dumpty", 6, 2**16, r=2**(16*5))
[2158299737877522940025, 2010726629729956855840, 2066067987872461357124, 2139856371159933386869, 2232065040410175930477, 590314951159640293488, 1254411530052683432052, 2158299737877522940025]
# Why don't we return the last index?
find_matches_KR("Humpty Dumpty", "Humpty", r=2**(16*6))
[0]
def fingerprint(string, basis=2**16, r=2**32-3):
""" used to computes karp-rabin fingerprint of the pattern
employs Horner method (modulo r) """
partial_sum = 0
for x in string:
partial_sum = (partial_sum*basis + ord(x)) % r
return partial_sum
def slide(prev_fp, prev_char, next_char, b_power, basis=2**16, r=2**32-3):
new_fp=((prev_fp - ord(prev_char)*b_power)*basis + ord(next_char)) % r
return new_fp
Build a generator which, given a string text and a length parameter generates all KR fingerprints of desired length in text. Instructions:
def kr_gen(text, length, basis=2**16, r=2**32-3):
fp = fingerprint(text[:length])
yield fp
b_power = pow(basis, length - 1, r)
for s in range(1, len(text) - length + 1):
fp = slide(fp, text[s - 1], text[s - 1 + length], b_power)
yield fp
gen = kr_gen("abracadabra", 3)
next(gen)
next(gen)
6422933
7471495
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
6357433
6488452
6357389
6553988
6357390
6422933
7471495
--------------------------------------------------------------------------- StopIteration Traceback (most recent call last) <ipython-input-115-6fd1aa460113> in <module> 6 next(gen) 7 next(gen) ----> 8 next(gen) 9 next(gen) 10 next(gen) StopIteration:
Build a generator which, given two strings $text1, text2$ and a length parameter $\ell$ generates all pairs of indices $(i,j)$ such that $text1[i:i+\ell] == text2[j:j+\ell]$.
Instructions:
def generate_shared_substrings(text1, text2, length):
g1 = kr_gen(text1, length)
i1 = 0
for fp1 in g1:
g2 = kr_gen(text2, length)
i2 = 0
for fp2 in g2:
if fp1 == fp2:
yield(i1, i2)
i2 += 1
i1 += 1
g = generate_shared_substrings("abcdef","xcdefx",3)
next(g)
next(g)
(2, 1)
(3, 2)
next(g)
--------------------------------------------------------------------------- StopIteration Traceback (most recent call last) <ipython-input-118-e734f8aca5ac> in <module> ----> 1 next(g) StopIteration: