#!/usr/bin/env python
# coding: utf-8
#
cs1001.py , Tel Aviv University, Fall 2017-2018
#
# ## Recitation 11
#
# We discussed the "string matching" problem and the Karp-Rabin(KR) algorithm for solving it. Then we solved a question about KR using generators.
# ###### Takeaways:
# - Make sure you read our summary.
# - A naive solution for the string-matching problem has $O(m(n-m))$ time complexity.
# - By allowing "false-positives" (with small probability), we can obtain a linear time solution for the string-matching problem.
# - Make sure you understand the way the algorithm works, and in particular the "rolling hash mechanism", that is, how to compute the fingerprint of the next substring in $O(1)$ time, given the fingerprint of the current substring.
# - Make sure you understand the "aritmetization" step used by the algorithm.
# - Make sure you understand the question we solved.
# #### Code for printing several outputs in one cell (not part of the recitation):
# In[3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# ## The string-matching problem
# Given a string $text$ of length $n$, and a short(er) string $pattern$ of length $m$ ($m\leq n$), report all occurrances of $pattern$ in $text$.
#
# Example:
#
# $text = $"abracadabra", $pattern = $"abr"
#
# The requested output should be $[0,7]$, since $pattern$ appears in $text$ in indices $0,7$.
#
#
#
# ## Karp-Rabin Algorithm
# In[4]:
def fingerprint(text, basis=2**16, r=2**32-3):
""" used to compute karp-rabin fingerprint of the pattern
employs Horner method (modulo r) """
partial_sum = 0
for ch in text:
partial_sum =(partial_sum*basis + ord(ch)) % r
return partial_sum
def text_fingerprint(text, m, basis=2**16, r=2**32-3):
""" computes karp-rabin fingerprint of the text """
f=[]
b_power = pow(basis, m-1, r)
list.append(f, fingerprint(text[0:m], basis, r))
# f[0] equals first text fingerprint
for s in range(1, len(text)-m+1):
new_fingerprint = ( (f[s-1] - ord(text[s-1])*b_power)*basis
+ord(text[s+m-1]) ) % r
# compute f[s], based on f[s-1]
list.append(f,new_fingerprint)# append f[s] to existing f
return f
def find_matches_KR(pattern, text, basis=2**16, r=2**32-3):
""" find all occurances of pattern in text
using coin flipping Karp-Rabin algorithm """
if len(pattern) > len(text):
return []
p = fingerprint(pattern, basis, r)
f = text_fingerprint(text, len(pattern), basis, r)
matches = [s for (s,f_s) in enumerate(f) if f_s == p]
# list comprehension
return matches
# In[5]:
text = "abracadabra"
pattern = "abr"
# In[6]:
fingerprint("abr")
# In[32]:
base = 2**16
arit = ord("a")*(base**2) + ord("b")*(base**1) + ord("r")*(base**0)
arit
r = 2**32 - 3
fp = arit%r
fp
# In[8]:
text_fingerprint(text, 3)
# In[9]:
find_matches_KR(pattern, text)
# ### Safe version
# Makes sure no false positives occur. In the worst case, when all $n-m$ possible substrings are indeed matches, behaves as the naive solution in terms of time complexity.
# In[10]:
def find_matches_KR_safe(pattern, text, basis=2**16, r=2**32-3):
""" a safe version of KR
checks every suspect for a match """
if len(pattern) > len(text):
return []
p = fingerprint(pattern, basis, r)
f = text_fingerprint(text, len(pattern), basis, r)
matches = [s for (s,f_s) in enumerate(f) if f_s == p \
and text[s:s+len(pattern)]==pattern]
#note that python performs "cleaver evaluation" of the 'and' statement
return matches
# #### Competition between versions on single char string.
# This is the worst-case scenario for the safe version.
# Changing $m$ has a greater effect on the safe version than on the standard KR.
# In[11]:
import time
text = "a"*100000
print("text = 'a'*",len(text))
for pattern in ["a"*100, "a"*1000, "a"*10000]:
#for pattern in ["a"*30000, "a"*40000, "a"*50000, "a"*60000, "a"*70000]: #max in m=n/2
print("pattern = 'a'*",len(pattern))
for f in [find_matches_KR_safe, find_matches_KR]:
t0=time.clock()
res=f(pattern, text)
t1=time.clock()
print (f.__name__, t1-t0)
print("") #newline
# #### Competition between versions on random strings.
#
# Note that the standard and safe versions of KR has similar running times. Moreover, as $m$ increases, the running time slightly decreases since there are less candidates to consider.
# In[33]:
import random
def gen_str(size):
''' Generate a random lowercase English string of length size'''
s=""
for i in range(size):
s+=random.choice("abcdefghijklmnopqrstuvwxyz")
return s
n=100000
m=1000
text = gen_str(n)
pattern = gen_str(m)
print("random str of len n=", n, " , random pattern of length m=",m)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.clock()
f(pattern, text)
t1=time.clock()
print (f.__name__, t1-t0)
n=100000
m=10000
text = gen_str(n)
pattern = gen_str(m)
print("random str of len n=", n, " , random pattern of length m=",m)
for f in [find_matches_KR, find_matches_KR_safe]:
t0=time.clock()
f(pattern, text)
t1=time.clock()
print (f.__name__, t1-t0)
# ### Choice of $r$
# By setting $r$ to be a power of the base, say $r=base$, we will obtain more false-positives. This may serve as an intuition for choosing a prime $r$.
# In[13]:
find_matches_KR("da", "abracadabra", basis=2**16, r=2**16)
find_matches_KR_safe("da", "abracadabra", basis=2**16, r=2**16)
# In[18]:
fingerprint("da", 2**16, r=2**16)
ord("d")*(2**16)**1 + ord("a")
ord("a")
fingerprint("ca", 2**16, r=2**16)
ord("c")*(2**16)**1 + ord("a")
(ord("c")*(2**16)**1 + ord("a") )%2**16
# In[34]:
base = 2**16
r = 2**16
fingerprint("bda", base, r)
ord("b")*(base**2) + ord("d")*(base**1) + ord("a")
(ord("b")*base + ord("d"))*base + ord("a")
((ord("b")*base + ord("d"))*base + ord("a"))%r == ord("a")%r
fingerprint("cda", base, r)
(ord("c")*base + ord("d"))*base + ord("a")
((ord("c")*base + ord("d"))*base + ord("a"))%r == ord("a")%r
# In[19]:
find_matches_KR("Humpty", "Humpty Dumpty", r=2**(16*5))
# In[20]:
fingerprint("Humpty", r=2**(16*5))
fingerprint("Dumpty", r=2**(16*5))
# In[ ]:
text_fingerprint("Humpty Dumpty", 6, r=2**(16*5))
# In[21]:
find_matches_KR("Humpty", "Humpty Dumpty", r=2**(16*6))
# ### Alternative fingerprint
# In[22]:
def fingerprint_sum(string, r=2**32-3):
''' a different kind of fingerprint: sum of all ords '''
partial_sum=0
for x in string:
partial_sum = (partial_sum + ord(x)) % r
return partial_sum
def text_fingerprint_sum(string, length, r=2**32-3):
""" used to compute a simple sum fingerprint of the text """
f =[]
list.append(f, fingerprint_sum(string[0:length], r))
for s in range (1, len(string)-length+1):
new_fingerprint = (f[s-1] - ord(string[s-1]) + ord(string[s+length-1])) % r
# compute f[s], based on f[s -1]
list.append(f, new_fingerprint) # append f[s] to existing f
return f
# In[23]:
text_fingerprint_sum("abracadabra", 3)
# ### Solving question 2 in this exercise
# In[26]:
def fingerprint(string, basis=2**16, r=2**32-3):
""" used to computes karp-rabin fingerprint of the pattern
employs Horner method (modulo r) """
partial_sum=0
for x in string:
partial_sum=(partial_sum*basis+ord(x)) % r
return partial_sum
def slide(prev_fp,prev_char,next_char,b_power,basis=2**16,r=2**32-3):
new_fp=((prev_fp-ord(prev_char)*b_power)*basis+ord(next_char)) % r
return new_fp
# ### Section (a)
# In[35]:
def kr_gen(text,length,basis=2**16,r=2**32-3):
fp = fingerprint(text[:length])
yield fp
b_power = pow(basis,length-1,r)
for s in range(1,len(text)-length+1):
fp = slide(fp,text[s-1],text[s-1+length],b_power)
yield fp
# In[27]:
gen = kr_gen("abracadabra", 3)
next(gen)
next(gen)
# In[28]:
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
next(gen)
# ### Section (b)
# In[ ]:
def generate_shared_substrings(text1,text2,length):
g1 = kr_gen(text1, length)
i1 = 0
for fp1 in g1:
g2 = kr_gen(text2, length)
i2 = 0
for fp2 in g2:
if fp1 == fp2:
yield(i1, i2)
i2 += 1
i1 += 1
# In[30]:
g = generate_shared_substrings("abcdef","xcdefx",3)
next(g)
next(g)
# In[31]:
next(g)
# In[ ]: