#!/usr/bin/env python # coding: utf-8 #

cs1001.py , Tel Aviv University, Fall 2017-2018

# # ## Recitation 11 # # We discussed the "string matching" problem and the Karp-Rabin(KR) algorithm for solving it. Then we solved a question about KR using generators. # ###### Takeaways: # - Make sure you read our summary. # - A naive solution for the string-matching problem has $O(m(n-m))$ time complexity. # - By allowing "false-positives" (with small probability), we can obtain a linear time solution for the string-matching problem. # - Make sure you understand the way the algorithm works, and in particular the "rolling hash mechanism", that is, how to compute the fingerprint of the next substring in $O(1)$ time, given the fingerprint of the current substring. # - Make sure you understand the "aritmetization" step used by the algorithm. # - Make sure you understand the question we solved. # #### Code for printing several outputs in one cell (not part of the recitation): # In[3]: from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" # ## The string-matching problem # Given a string $text$ of length $n$, and a short(er) string $pattern$ of length $m$ ($m\leq n$), report all occurrances of $pattern$ in $text$. # # Example: # # $text = $"abracadabra", $pattern = $"abr" # # The requested output should be $[0,7]$, since $pattern$ appears in $text$ in indices $0,7$. # # # # ## Karp-Rabin Algorithm # In[4]: def fingerprint(text, basis=2**16, r=2**32-3): """ used to compute karp-rabin fingerprint of the pattern employs Horner method (modulo r) """ partial_sum = 0 for ch in text: partial_sum =(partial_sum*basis + ord(ch)) % r return partial_sum def text_fingerprint(text, m, basis=2**16, r=2**32-3): """ computes karp-rabin fingerprint of the text """ f=[] b_power = pow(basis, m-1, r) list.append(f, fingerprint(text[0:m], basis, r)) # f[0] equals first text fingerprint for s in range(1, len(text)-m+1): new_fingerprint = ( (f[s-1] - ord(text[s-1])*b_power)*basis +ord(text[s+m-1]) ) % r # compute f[s], based on f[s-1] list.append(f,new_fingerprint)# append f[s] to existing f return f def find_matches_KR(pattern, text, basis=2**16, r=2**32-3): """ find all occurances of pattern in text using coin flipping Karp-Rabin algorithm """ if len(pattern) > len(text): return [] p = fingerprint(pattern, basis, r) f = text_fingerprint(text, len(pattern), basis, r) matches = [s for (s,f_s) in enumerate(f) if f_s == p] # list comprehension return matches # In[5]: text = "abracadabra" pattern = "abr" # In[6]: fingerprint("abr") # In[32]: base = 2**16 arit = ord("a")*(base**2) + ord("b")*(base**1) + ord("r")*(base**0) arit r = 2**32 - 3 fp = arit%r fp # In[8]: text_fingerprint(text, 3) # In[9]: find_matches_KR(pattern, text) # ### Safe version # Makes sure no false positives occur. In the worst case, when all $n-m$ possible substrings are indeed matches, behaves as the naive solution in terms of time complexity. # In[10]: def find_matches_KR_safe(pattern, text, basis=2**16, r=2**32-3): """ a safe version of KR checks every suspect for a match """ if len(pattern) > len(text): return [] p = fingerprint(pattern, basis, r) f = text_fingerprint(text, len(pattern), basis, r) matches = [s for (s,f_s) in enumerate(f) if f_s == p \ and text[s:s+len(pattern)]==pattern] #note that python performs "cleaver evaluation" of the 'and' statement return matches # #### Competition between versions on single char string. # This is the worst-case scenario for the safe version. # Changing $m$ has a greater effect on the safe version than on the standard KR. # In[11]: import time text = "a"*100000 print("text = 'a'*",len(text)) for pattern in ["a"*100, "a"*1000, "a"*10000]: #for pattern in ["a"*30000, "a"*40000, "a"*50000, "a"*60000, "a"*70000]: #max in m=n/2 print("pattern = 'a'*",len(pattern)) for f in [find_matches_KR_safe, find_matches_KR]: t0=time.clock() res=f(pattern, text) t1=time.clock() print (f.__name__, t1-t0) print("") #newline # #### Competition between versions on random strings. # # Note that the standard and safe versions of KR has similar running times. Moreover, as $m$ increases, the running time slightly decreases since there are less candidates to consider. # In[33]: import random def gen_str(size): ''' Generate a random lowercase English string of length size''' s="" for i in range(size): s+=random.choice("abcdefghijklmnopqrstuvwxyz") return s n=100000 m=1000 text = gen_str(n) pattern = gen_str(m) print("random str of len n=", n, " , random pattern of length m=",m) for f in [find_matches_KR, find_matches_KR_safe]: t0=time.clock() f(pattern, text) t1=time.clock() print (f.__name__, t1-t0) n=100000 m=10000 text = gen_str(n) pattern = gen_str(m) print("random str of len n=", n, " , random pattern of length m=",m) for f in [find_matches_KR, find_matches_KR_safe]: t0=time.clock() f(pattern, text) t1=time.clock() print (f.__name__, t1-t0) # ### Choice of $r$ # By setting $r$ to be a power of the base, say $r=base$, we will obtain more false-positives. This may serve as an intuition for choosing a prime $r$. # In[13]: find_matches_KR("da", "abracadabra", basis=2**16, r=2**16) find_matches_KR_safe("da", "abracadabra", basis=2**16, r=2**16) # In[18]: fingerprint("da", 2**16, r=2**16) ord("d")*(2**16)**1 + ord("a") ord("a") fingerprint("ca", 2**16, r=2**16) ord("c")*(2**16)**1 + ord("a") (ord("c")*(2**16)**1 + ord("a") )%2**16 # In[34]: base = 2**16 r = 2**16 fingerprint("bda", base, r) ord("b")*(base**2) + ord("d")*(base**1) + ord("a") (ord("b")*base + ord("d"))*base + ord("a") ((ord("b")*base + ord("d"))*base + ord("a"))%r == ord("a")%r fingerprint("cda", base, r) (ord("c")*base + ord("d"))*base + ord("a") ((ord("c")*base + ord("d"))*base + ord("a"))%r == ord("a")%r # In[19]: find_matches_KR("Humpty", "Humpty Dumpty", r=2**(16*5)) # In[20]: fingerprint("Humpty", r=2**(16*5)) fingerprint("Dumpty", r=2**(16*5)) # In[ ]: text_fingerprint("Humpty Dumpty", 6, r=2**(16*5)) # In[21]: find_matches_KR("Humpty", "Humpty Dumpty", r=2**(16*6)) # ### Alternative fingerprint # In[22]: def fingerprint_sum(string, r=2**32-3): ''' a different kind of fingerprint: sum of all ords ''' partial_sum=0 for x in string: partial_sum = (partial_sum + ord(x)) % r return partial_sum def text_fingerprint_sum(string, length, r=2**32-3): """ used to compute a simple sum fingerprint of the text """ f =[] list.append(f, fingerprint_sum(string[0:length], r)) for s in range (1, len(string)-length+1): new_fingerprint = (f[s-1] - ord(string[s-1]) + ord(string[s+length-1])) % r # compute f[s], based on f[s -1] list.append(f, new_fingerprint) # append f[s] to existing f return f # In[23]: text_fingerprint_sum("abracadabra", 3) # ### Solving question 2 in this exercise # In[26]: def fingerprint(string, basis=2**16, r=2**32-3): """ used to computes karp-rabin fingerprint of the pattern employs Horner method (modulo r) """ partial_sum=0 for x in string: partial_sum=(partial_sum*basis+ord(x)) % r return partial_sum def slide(prev_fp,prev_char,next_char,b_power,basis=2**16,r=2**32-3): new_fp=((prev_fp-ord(prev_char)*b_power)*basis+ord(next_char)) % r return new_fp # ### Section (a) # In[35]: def kr_gen(text,length,basis=2**16,r=2**32-3): fp = fingerprint(text[:length]) yield fp b_power = pow(basis,length-1,r) for s in range(1,len(text)-length+1): fp = slide(fp,text[s-1],text[s-1+length],b_power) yield fp # In[27]: gen = kr_gen("abracadabra", 3) next(gen) next(gen) # In[28]: next(gen) next(gen) next(gen) next(gen) next(gen) next(gen) next(gen) next(gen) next(gen) next(gen) # ### Section (b) # In[ ]: def generate_shared_substrings(text1,text2,length): g1 = kr_gen(text1, length) i1 = 0 for fp1 in g1: g2 = kr_gen(text2, length) i2 = 0 for fp2 in g2: if fp1 == fp2: yield(i1, i2) i2 += 1 i1 += 1 # In[30]: g = generate_shared_substrings("abcdef","xcdefx",3) next(g) next(g) # In[31]: next(g) # In[ ]: