#!/usr/bin/env python # coding: utf-8 #

cs1001.py , Tel Aviv University, Fall 2017-2018

# ## Recitation 10 # # We discussed hash tables and iterators + generators # ###### Takeaways: # - Hash tables can be useful for many algorithms, including memoization. # - Make sure you understand the complexity analysis for hash tables (see the links below). # - Generators function is a function that contains the yield command and returns a genertor object. # #### Code for printing several outputs in one cell (not part of the recitation): # In[14]: from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" # ## Hash # We wish to have a data structure that implements the operations: insert, search and delete in **expected** $O(1)$ time. # # Summarizing the insert and search complexity of the data structures that we have seen already: #

# A detailed summary on the complexity of insert/search operations using hash tables can be found here . Make sure you read it. # ### Exercise: # given a string $st$ of length $n$ and a small integer $\ell$, write a function that checks whether there is a substring in $st$ of length $\ell$ that appears more than once. # # # Make sure you read the following summary that includes a detailed explanation on the experiments. # #### Naive solution # The complexity is $O(\ell(n-\ell)^2)$. # There $O((n-\ell)^2)$ iterations (make sure you undersand why) and in each iteration we perform operations in $O(\ell)$ time. # In[15]: def repeat_naive(st, l): for i in range(len(st)-l+1): for j in range(i+1,len(st)-l+1): if st[i:i+l]==st[j:j+l]: return True return False repeat_naive("hello", 1) repeat_naive("hello"*10, 45) repeat_naive("hello"*10, 46) # A function that generates a random string of a given size # In[16]: import random def gen_str(size, alphabet = "abcdefghijklmnopqrstuvwxyz"): ''' Generate a random string of length size over alphabet ''' s="" for i in range(size): s += random.choice(alphabet) return s rndstr = gen_str(1000) repeat_naive(rndstr, 3) repeat_naive(rndstr, 10) rndstr1 = gen_str(10000) repeat_naive(rndstr1, 10) # The class Hashtable from the lectures # In[ ]: class Hashtable: def __init__(self, m, hash_func=hash): """ initial hash table, m empty entries """ ##bogus initialization #1: #self.table = [[]*m] ##bogus initialization #2: #empty=[] #self.table = [empty for i in range(m)] self.table = [ [] for i in range(m)] self.hash_mod = lambda x: hash_func(x) % m # using python hash function def __repr__(self): L = [self.table[i] for i in range(len(self.table))] return "".join([str(i) + " " + str(L[i]) + "\n" for i in range(len(self.table))]) def find(self, item): """ returns True if item in hashtable, False otherwise """ i = self.hash_mod(item) return item in self.table[i] #if item in self.table[i]: # return True #else: # return False def insert(self, item): """ insert an item into table """ i = self.hash_mod(item) if item not in self.table[i]: self.table[i].append(item) # #### Solution using the class Hashtable # The expected (average) complexity is: $O(\ell(n-\ell))$ # # Creating the table takes $O(n-\ell)$ time, and there are $O(n-\ell)$ iterations, each taking expected $O(\ell)$ time. # # # # The worst case complexity is: $O(\ell(n-\ell)^2)$ # # Creating the table takes $O(n-\ell)$ time, and the time for executing the loop is # $\ell\cdot\sum_{i=0}^{n-\ell}{i}= O(\ell(n-\ell)^2)$ # # # # # In[ ]: def repeat_hash1(st, l): m=len(st)-l+1 htable = Hashtable(m) for i in range(len(st)-l+1): if htable.find(st[i:i+l])==False: htable.insert(st[i:i+l]) else: return True return False # Which of Python's naitive DS fits the solution? #

# #### Solution using Python's set implementation # In[ ]: def repeat_hash2(st, l): htable = set() #Python sets use hash functions for fast lookup for i in range(len(st)-l+1): if st[i:i+l] not in htable: htable.add(st[i:i+l]) else: return True return False # #### Competition between the 3 solutions # For a random string of size $n=1000$ and for $l=10$ the running time of repeat_hash2 is the smallest, while the one for repeat_naive is the largest. # # When increasing $n$ to 2000, the running time of repeat_naive increases by ~4, while the running time of repeat_hash1, repeat_hash2 increases by ~2. # In[17]: import time str_len=1000 st=gen_str(str_len) l=10 print("str_len=",str_len, "repeating substring len=",l) for f in [repeat_naive,repeat_hash1,repeat_hash2]: t0=time.clock() res=f(st, l) t1=time.clock() print(f.__name__, t1-t0, "found?",res) # In[24]: str_len=2000 st=gen_str(str_len) l=10 print("str_len=",str_len, "repeating substring len=",l) for f in [repeat_naive,repeat_hash1,repeat_hash2]: t0=time.clock() res=f(st, l) t1=time.clock() print(f.__name__, t1-t0, "found?",res) # When $st$ is "a"$*1000$, repeat_hash1 is the slowest, since it spends time on creating an empty table of size 991. # # In[27]: st="a"*1000 l=10 print("str_len=",str_len, "repeating substring len=",l) for f in [repeat_naive,repeat_hash1,repeat_hash2]: t0=time.clock() res=f(st, l) t1=time.clock() print(f.__name__, t1-t0, "found?",res) # ### The effect of table size # Our solution, with control over the table size # In[28]: def repeat_hash1_var_size(st, l, m=0): if m==0: #default hash table size is ~number of substrings to be inserted m=len(st)-l+1 htable = Hashtable(m) for i in range(len(st)-l+1): if htable.find(st[i:i+l])==False: htable.insert(st[i:i+l]) else: return True return False # Comparing different table sizes # In[30]: import time str_len=1000 st=gen_str(str_len) l=10 print("str_len=",str_len, "repeating substring len=",l) for m in [1, 10, 100, 1000, 1500, 10000, 100000]: t0=time.clock() res=repeat_hash1_var_size(st, l, m) t1=time.clock() print(t1-t0, "found?",res, "table size=",m) # ## Iterators and Generators # In[18]: l = [1,2,3] li = iter(l) type(li) li2 = iter(l) # In[19]: next(li) next(li) z = next(li) print("z is", z) next(li2) #next(li) print("before loop") for elem in li2: print(elem) # In[20]: next(li) # A function which produces a list of all positive even numbers up to $n$ # In[ ]: def Evens_list(n): ''' a list of evens up to n ''' return [num for num in range(n) if num%2==0] # A generator function (includes a "yield" statement) that returns a generator that generates all positive even numbers up to $n$ # In[ ]: def Evens_gen(n): ''' returns a generator of evens up to n ''' for i in range(n): print("current i:", i) if i%2 == 0: print("before yield") yield i print("after yield") # In[21]: g = Evens_gen(10) # In[22]: next(g) # In[23]: next(g) # A generator function which produces the **infinite** sequence of all positive even numbers # In[ ]: def All_evens_gen(): i=0 while True: yield i i+=2 # #### Solving this exam question about generators # Section (b) # In[ ]: def SomePairs(): i=0 while True: for j in range(i): yield(i,j) i=i+1 # Section (c) # In[ ]: def RevGen(PairsGen): pairs = PairsGen() while True: pair = next(pairs) yield(pair[1],pair[0]) # Section (d1) # In[ ]: def UnionGenerators(gen1, gen2): while True: yield next(gen1) yield next(gen2) # Section (d2) # In[ ]: def EqPairs(): i=0 while True: yield (i,i) i=i+1 # In[ ]: def AllPairs(): return UnionGenerators(SomePairs(), UnionGenerators(EqPairs(), RevGen(SomePairs)))