#!/usr/bin/env python
# coding: utf-8
#
cs1001.py , Tel Aviv University, Fall 2017-2018
#
# ## Recitation 10
#
# We discussed hash tables and iterators + generators
# ###### Takeaways:
# - Hash tables can be useful for many algorithms, including memoization.
# - Make sure you understand the complexity analysis for hash tables (see the links below).
# - Generators function is a function that contains the yield command and returns a genertor object.
# #### Code for printing several outputs in one cell (not part of the recitation):
# In[14]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# ## Hash
# We wish to have a data structure that implements the operations: insert, search and delete in **expected** $O(1)$ time.
#
# Summarizing the insert and search complexity of the data structures that we have seen already:
#
# A detailed summary on the complexity of insert/search operations using hash tables can be found here . Make sure you read it.
# ### Exercise:
# given a string $st$ of length $n$ and a small integer $\ell$, write a function that checks whether there is a substring in $st$ of length $\ell$ that appears more than once.
#
#
# Make sure you read the following summary that includes a detailed explanation on the experiments.
# #### Naive solution
# The complexity is $O(\ell(n-\ell)^2)$.
# There $O((n-\ell)^2)$ iterations (make sure you undersand why) and in each iteration we perform operations in $O(\ell)$ time.
# In[15]:
def repeat_naive(st, l):
for i in range(len(st)-l+1):
for j in range(i+1,len(st)-l+1):
if st[i:i+l]==st[j:j+l]:
return True
return False
repeat_naive("hello", 1)
repeat_naive("hello"*10, 45)
repeat_naive("hello"*10, 46)
# A function that generates a random string of a given size
# In[16]:
import random
def gen_str(size, alphabet = "abcdefghijklmnopqrstuvwxyz"):
''' Generate a random string of length size over alphabet '''
s=""
for i in range(size):
s += random.choice(alphabet)
return s
rndstr = gen_str(1000)
repeat_naive(rndstr, 3)
repeat_naive(rndstr, 10)
rndstr1 = gen_str(10000)
repeat_naive(rndstr1, 10)
# The class Hashtable from the lectures
# In[ ]:
class Hashtable:
def __init__(self, m, hash_func=hash):
""" initial hash table, m empty entries """
##bogus initialization #1:
#self.table = [[]*m]
##bogus initialization #2:
#empty=[]
#self.table = [empty for i in range(m)]
self.table = [ [] for i in range(m)]
self.hash_mod = lambda x: hash_func(x) % m # using python hash function
def __repr__(self):
L = [self.table[i] for i in range(len(self.table))]
return "".join([str(i) + " " + str(L[i]) + "\n" for i in range(len(self.table))])
def find(self, item):
""" returns True if item in hashtable, False otherwise """
i = self.hash_mod(item)
return item in self.table[i]
#if item in self.table[i]:
# return True
#else:
# return False
def insert(self, item):
""" insert an item into table """
i = self.hash_mod(item)
if item not in self.table[i]:
self.table[i].append(item)
# #### Solution using the class Hashtable
# The expected (average) complexity is: $O(\ell(n-\ell))$
#
# Creating the table takes $O(n-\ell)$ time, and there are $O(n-\ell)$ iterations, each taking expected $O(\ell)$ time.
#
#
#
# The worst case complexity is: $O(\ell(n-\ell)^2)$
#
# Creating the table takes $O(n-\ell)$ time, and the time for executing the loop is
# $\ell\cdot\sum_{i=0}^{n-\ell}{i}= O(\ell(n-\ell)^2)$
#
#
#
#
# In[ ]:
def repeat_hash1(st, l):
m=len(st)-l+1
htable = Hashtable(m)
for i in range(len(st)-l+1):
if htable.find(st[i:i+l])==False:
htable.insert(st[i:i+l])
else:
return True
return False
# Which of Python's naitive DS fits the solution?
#
# #### Solution using Python's set implementation
# In[ ]:
def repeat_hash2(st, l):
htable = set() #Python sets use hash functions for fast lookup
for i in range(len(st)-l+1):
if st[i:i+l] not in htable:
htable.add(st[i:i+l])
else: return True
return False
# #### Competition between the 3 solutions
# For a random string of size $n=1000$ and for $l=10$ the running time of repeat_hash2 is the smallest, while the one for repeat_naive is the largest.
#
# When increasing $n$ to 2000, the running time of repeat_naive increases by ~4, while the running time of repeat_hash1, repeat_hash2 increases by ~2.
# In[17]:
import time
str_len=1000
st=gen_str(str_len)
l=10
print("str_len=",str_len, "repeating substring len=",l)
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
t0=time.clock()
res=f(st, l)
t1=time.clock()
print(f.__name__, t1-t0, "found?",res)
# In[24]:
str_len=2000
st=gen_str(str_len)
l=10
print("str_len=",str_len, "repeating substring len=",l)
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
t0=time.clock()
res=f(st, l)
t1=time.clock()
print(f.__name__, t1-t0, "found?",res)
# When $st$ is "a"$*1000$, repeat_hash1 is the slowest, since it spends time on creating an empty table of size 991.
#
# In[27]:
st="a"*1000
l=10
print("str_len=",str_len, "repeating substring len=",l)
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
t0=time.clock()
res=f(st, l)
t1=time.clock()
print(f.__name__, t1-t0, "found?",res)
# ### The effect of table size
# Our solution, with control over the table size
# In[28]:
def repeat_hash1_var_size(st, l, m=0):
if m==0: #default hash table size is ~number of substrings to be inserted
m=len(st)-l+1
htable = Hashtable(m)
for i in range(len(st)-l+1):
if htable.find(st[i:i+l])==False:
htable.insert(st[i:i+l])
else:
return True
return False
# Comparing different table sizes
# In[30]:
import time
str_len=1000
st=gen_str(str_len)
l=10
print("str_len=",str_len, "repeating substring len=",l)
for m in [1, 10, 100, 1000, 1500, 10000, 100000]:
t0=time.clock()
res=repeat_hash1_var_size(st, l, m)
t1=time.clock()
print(t1-t0, "found?",res, "table size=",m)
# ## Iterators and Generators
# In[18]:
l = [1,2,3]
li = iter(l)
type(li)
li2 = iter(l)
# In[19]:
next(li)
next(li)
z = next(li)
print("z is", z)
next(li2)
#next(li)
print("before loop")
for elem in li2:
print(elem)
# In[20]:
next(li)
# A function which produces a list of all positive even numbers up to $n$
# In[ ]:
def Evens_list(n):
''' a list of evens up to n '''
return [num for num in range(n) if num%2==0]
# A generator function (includes a "yield" statement) that returns a generator that generates all positive even numbers up to $n$
# In[ ]:
def Evens_gen(n):
''' returns a generator of evens up to n '''
for i in range(n):
print("current i:", i)
if i%2 == 0:
print("before yield")
yield i
print("after yield")
# In[21]:
g = Evens_gen(10)
# In[22]:
next(g)
# In[23]:
next(g)
# A generator function which produces the **infinite** sequence of all positive even numbers
# In[ ]:
def All_evens_gen():
i=0
while True:
yield i
i+=2
# #### Solving this exam question about generators
# Section (b)
# In[ ]:
def SomePairs():
i=0
while True:
for j in range(i):
yield(i,j)
i=i+1
# Section (c)
# In[ ]:
def RevGen(PairsGen):
pairs = PairsGen()
while True:
pair = next(pairs)
yield(pair[1],pair[0])
# Section (d1)
# In[ ]:
def UnionGenerators(gen1, gen2):
while True:
yield next(gen1)
yield next(gen2)
# Section (d2)
# In[ ]:
def EqPairs():
i=0
while True:
yield (i,i)
i=i+1
# In[ ]:
def AllPairs():
return UnionGenerators(SomePairs(),
UnionGenerators(EqPairs(),
RevGen(SomePairs)))