We discussed hash tables and iterators + generators
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
We wish to have a data structure that implements the operations: insert, search and delete in expected $O(1)$ time.
Summarizing the insert and search complexity of the data structures that we have seen already:
A detailed summary on the complexity of insert/search operations using hash tables can be found here . Make sure you read it.
The complexity is $O(\ell(n-\ell)^2)$. There $O((n-\ell)^2)$ iterations (make sure you undersand why) and in each iteration we perform operations in $O(\ell)$ time.
def repeat_naive(st, l):
for i in range(len(st)-l+1):
for j in range(i+1,len(st)-l+1):
if st[i:i+l]==st[j:j+l]:
return True
return False
repeat_naive("hello", 1)
repeat_naive("hello"*10, 45)
repeat_naive("hello"*10, 46)
True
True
False
A function that generates a random string of a given size
import random
def gen_str(size, alphabet = "abcdefghijklmnopqrstuvwxyz"):
''' Generate a random string of length size over alphabet '''
s=""
for i in range(size):
s += random.choice(alphabet)
return s
rndstr = gen_str(1000)
repeat_naive(rndstr, 3)
repeat_naive(rndstr, 10)
rndstr1 = gen_str(10000)
repeat_naive(rndstr1, 10)
True
False
False
The class Hashtable from the lectures
class Hashtable:
def __init__(self, m, hash_func=hash):
""" initial hash table, m empty entries """
##bogus initialization #1:
#self.table = [[]*m]
##bogus initialization #2:
#empty=[]
#self.table = [empty for i in range(m)]
self.table = [ [] for i in range(m)]
self.hash_mod = lambda x: hash_func(x) % m # using python hash function
def __repr__(self):
L = [self.table[i] for i in range(len(self.table))]
return "".join([str(i) + " " + str(L[i]) + "\n" for i in range(len(self.table))])
def find(self, item):
""" returns True if item in hashtable, False otherwise """
i = self.hash_mod(item)
return item in self.table[i]
#if item in self.table[i]:
# return True
#else:
# return False
def insert(self, item):
""" insert an item into table """
i = self.hash_mod(item)
if item not in self.table[i]:
self.table[i].append(item)
The expected (average) complexity is: $O(\ell(n-\ell))$
Creating the table takes $O(n-\ell)$ time, and there are $O(n-\ell)$ iterations, each taking expected $O(\ell)$ time.
The worst case complexity is: $O(\ell(n-\ell)^2)$
Creating the table takes $O(n-\ell)$ time, and the time for executing the loop is $\ell\cdot\sum_{i=0}^{n-\ell}{i}= O(\ell(n-\ell)^2)$
def repeat_hash1(st, l):
m=len(st)-l+1
htable = Hashtable(m)
for i in range(len(st)-l+1):
if htable.find(st[i:i+l])==False:
htable.insert(st[i:i+l])
else:
return True
return False
Which of Python's naitive DS fits the solution?
def repeat_hash2(st, l):
htable = set() #Python sets use hash functions for fast lookup
for i in range(len(st)-l+1):
if st[i:i+l] not in htable:
htable.add(st[i:i+l])
else: return True
return False
For a random string of size $n=1000$ and for $l=10$ the running time of repeat_hash2 is the smallest, while the one for repeat_naive is the largest.
When increasing $n$ to 2000, the running time of repeat_naive increases by ~4, while the running time of repeat_hash1, repeat_hash2 increases by ~2.
import time
str_len=1000
st=gen_str(str_len)
l=10
print("str_len=",str_len, "repeating substring len=",l)
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
t0=time.clock()
res=f(st, l)
t1=time.clock()
print(f.__name__, t1-t0, "found?",res)
str_len= 1000 repeating substring len= 10 repeat_naive 0.17578042042896413 found? False repeat_hash1 0.0020885685707980883 found? False repeat_hash2 0.0005123723312863149 found? False
str_len=2000
st=gen_str(str_len)
l=10
print("str_len=",str_len, "repeating substring len=",l)
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
t0=time.clock()
res=f(st, l)
t1=time.clock()
print(f.__name__, t1-t0, "found?",res)
str_len= 2000 repeating substring len= 10 repeat_naive 0.7582052599900635 found? False repeat_hash1 0.0047206939198076725 found? False repeat_hash2 0.0010855345899472013 found? False
When $st$ is "a"$*1000$, repeat_hash1 is the slowest, since it spends time on creating an empty table of size 991.
st="a"*1000
l=10
print("str_len=",str_len, "repeating substring len=",l)
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
t0=time.clock()
res=f(st, l)
t1=time.clock()
print(f.__name__, t1-t0, "found?",res)
str_len= 2000 repeating substring len= 10 repeat_naive 1.1842188541777432e-05 found? True repeat_hash1 0.0003122392372461036 found? True repeat_hash2 8.684277418069541e-06 found? True
Our solution, with control over the table size
def repeat_hash1_var_size(st, l, m=0):
if m==0: #default hash table size is ~number of substrings to be inserted
m=len(st)-l+1
htable = Hashtable(m)
for i in range(len(st)-l+1):
if htable.find(st[i:i+l])==False:
htable.insert(st[i:i+l])
else:
return True
return False
Comparing different table sizes
import time
str_len=1000
st=gen_str(str_len)
l=10
print("str_len=",str_len, "repeating substring len=",l)
for m in [1, 10, 100, 1000, 1500, 10000, 100000]:
t0=time.clock()
res=repeat_hash1_var_size(st, l, m)
t1=time.clock()
print(t1-t0, "found?",res, "table size=",m)
str_len= 1000 repeating substring len= 10 0.0392379310214892 found? False table size= 1 0.00573320165858604 found? False table size= 10 0.0035834484006045386 found? False table size= 100 0.0035980537650175393 found? False table size= 1000 0.003550684981746599 found? False table size= 1500 0.0036312119191279635 found? False table size= 10000 0.045343372359639034 found? False table size= 100000
l = [1,2,3]
li = iter(l)
type(li)
li2 = iter(l)
list_iterator
next(li)
next(li)
z = next(li)
print("z is", z)
next(li2)
#next(li)
print("before loop")
for elem in li2:
print(elem)
1
2
z is 3
1
before loop 2 3
next(li)
--------------------------------------------------------------------------- StopIteration Traceback (most recent call last) <ipython-input-20-c9b8845252db> in <module>() ----> 1 next(li) StopIteration:
A function which produces a list of all positive even numbers up to $n$
def Evens_list(n):
''' a list of evens up to n '''
return [num for num in range(n) if num%2==0]
A generator function (includes a "yield" statement) that returns a generator that generates all positive even numbers up to $n$
def Evens_gen(n):
''' returns a generator of evens up to n '''
for i in range(n):
print("current i:", i)
if i%2 == 0:
print("before yield")
yield i
print("after yield")
g = Evens_gen(10)
next(g)
current i: 0 before yield
0
next(g)
after yield current i: 1 current i: 2 before yield
2
A generator function which produces the infinite sequence of all positive even numbers
def All_evens_gen():
i=0
while True:
yield i
i+=2
Section (b)
def SomePairs():
i=0
while True:
for j in range(i):
yield(i,j)
i=i+1
Section (c)
def RevGen(PairsGen):
pairs = PairsGen()
while True:
pair = next(pairs)
yield(pair[1],pair[0])
Section (d1)
def UnionGenerators(gen1, gen2):
while True:
yield next(gen1)
yield next(gen2)
Section (d2)
def EqPairs():
i=0
while True:
yield (i,i)
i=i+1
def AllPairs():
return UnionGenerators(SomePairs(),
UnionGenerators(EqPairs(),
RevGen(SomePairs)))