#!/usr/bin/env python
# coding: utf-8
#
# cs1001.py , Tel Aviv University, Spring 2019
#
# # Recitation 10
# We discussed Hash Tables and Nim
#
# #### Takeaways:
# - Important properties of Hash Tables:
# - Hash tables can be useful for many algorithms, including memoization.
# - Insert and find operation run in $O(1)$ on average, but $O(n)$ in the worst case (where $n$ is the number of elements in the table)
# - Make sure you understand the complexity analysis for hash tables (see the links below).
# #### Code for printing several outputs in one cell (not part of the recitation):
# In[1]:
# Make the notebook display the value of EVERY top-level expression in a
# cell, not only the last one (purely a display convenience for this file;
# not part of the recitation material).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# ## Hash Tables
#
# We wish to have a data structure that implements the operations: insert, search and delete in **expected** $O(1)$ time.
#
# Summarizing the insert and search complexity of the data structures that we have seen already:
#
# | implementation | insert | search | delete |
# |-------------------------------|--------------------------|----------|---------------------|
# | Python list | O(1) always at the end | O(n) | O(n) |
# | Python ordered list | O(n) | O(log n) | O(n) |
# | Linked list | O(1) always at the start | O(n) | O(1) given the node before the one to delete |
# | Sorted linked list | O(n) | O(n) | O(1) given the node before the one to delete |
# | Unbalanced Binary Search Tree | O(n) | O(n) | O(n) |
# | Balanced Binary Search Tree | O(log n) | O(log n) | O(log n) |
# Please read the following summary on the various data structures mentioned in class.
#
# A detailed summary on the complexity of insert/search operations using hash tables can be found here. Make sure you read it.
# ### Exercise:
# Given a string $st$ of length $n$ and a small integer $\ell$, write a function that checks whether there is a substring in $st$ of length $\ell$ that appears more than once.
#
# #### Solution \#1: Naive
# The complexity is $O(\ell(n-\ell)^2)$.
# There are $O((n-\ell)^2)$ iterations (make sure you understand why) and in each iteration we perform operations in $O(\ell)$ time.
# In[4]:
def repeat_naive(st, l):
    """Return True iff some substring of length l appears twice in st.

    Naive strategy: compare the window at every starting position with
    the window at every later starting position.  O(l*(n-l)^2) time.
    """
    n_windows = len(st) - l + 1
    for first in range(n_windows):
        window = st[first:first + l]
        for second in range(first + 1, n_windows):
            if window == st[second:second + l]:
                return True
    return False
# Sanity checks: "hello" repeats the character 'l'; "hello"*10 (length 50)
# repeats every window of length up to 45, but no window of length 46 fits
# twice without overlap mismatch.
repeat_naive("hello", 1)
repeat_naive("hello"*10, 45)
repeat_naive("hello"*10, 46)
# Let's test our algorithm by generating a random string of a given size.
# In[7]:
import random
def gen_str(size, alphabet = "abcdefghijklmnopqrstuvwxyz"):
    ''' Generate a random string of length size over alphabet.

    Each character is drawn independently and uniformly from alphabet.
    Built with str.join rather than repeated += so construction is
    linear in size instead of (potentially) quadratic.
    '''
    return "".join(random.choice(alphabet) for _ in range(size))
# Over a 26-letter alphabet a random 1000-char string almost surely
# contains a repeated 3-gram; a repeated 10-gram is far less likely.
rndstr = gen_str(1000)
print(rndstr)
repeat_naive(rndstr, 3)
repeat_naive(rndstr, 10)
# For bigger $n$ and $\ell$, this could be quite slow:
# In[8]:
# Same experiment, 10x longer string: the naive algorithm's quadratic
# number of comparisons becomes noticeably slow here.
rndstr = gen_str(10000)
repeat_naive(rndstr, 3)
repeat_naive(rndstr, 10)
# ### The class Hashtable from the lectures
# In[9]:
class Hashtable:
    """ A hash table with m buckets and separate chaining:
    each bucket is a Python list of the items that hashed to it. """

    def __init__(self, m, hash_func=hash):
        """ initial hash table, m empty entries

        hash_func maps an item to an int (Python's built-in hash by default);
        the bucket index is hash_func(item) % m.
        """
        ##bogus initialization #1 (one empty list, not m of them):
        #self.table = [[]*m]
        ##bogus initialization #2 (all m entries alias the SAME list):
        #empty=[]
        #self.table = [empty for i in range(m)]
        self.table = [ [] for i in range(m)]
        self.hash_mod = lambda x: hash_func(x) % m # using python hash function

    def __repr__(self):
        """ one line per bucket: '<index> <bucket contents>' """
        # iterate the table directly with enumerate -- no need to first
        # copy it into an intermediate list as the original version did
        return "".join(str(i) + " " + str(bucket) + "\n"
                       for i, bucket in enumerate(self.table))

    def find(self, item):
        """ returns True if item in hashtable, False otherwise """
        i = self.hash_mod(item)
        return item in self.table[i]

    def insert(self, item):
        """ insert an item into table (duplicates are ignored) """
        i = self.hash_mod(item)
        if item not in self.table[i]:
            self.table[i].append(item)
# #### Solution \#2: using the class Hashtable
# In[10]:
def repeat_hash1(st, l):
    """Return True iff st contains some length-l substring twice.

    Uses the course Hashtable class, sized to roughly the number of
    substrings, so expected chain lengths are O(1).
    """
    n_windows = len(st) - l + 1
    htable = Hashtable(n_windows)
    for start in range(n_windows):
        window = st[start:start + l]
        if htable.find(window):
            return True
        htable.insert(window)
    return False
# The expected (average) complexity is: $O(\ell(n-\ell))$
#
# Creating the table takes $O(n-\ell)$ time, and there are $O(n-\ell)$ iterations, each taking expected $O(\ell)$ time.
#
#
#
# The worst case complexity is: $O(\ell(n-\ell)^2)$
#
# Creating the table takes $O(n-\ell)$ time, and the time for executing the loop is
# $\ell\cdot\sum_{i=0}^{n-\ell}{i}= O(\ell(n-\ell)^2)$
#
# Which native Python data structure is most suitable for this problem?
#
# #### Solution #3: using Python's set implementation
#
# In[11]:
def repeat_hash2(st, l):
    """Return True iff st contains some length-l substring twice.

    Uses Python's built-in set (itself hash-based) for expected O(1)
    membership tests and insertions.
    """
    seen = set()
    for start in range(len(st) - l + 1):
        window = st[start:start + l]
        if window in seen:
            return True
        seen.add(window)
    return False
# #### Competition between the 3 solutions
# In[12]:
# Time each of the three implementations on the same random 1000-char
# input with window length 10, printing elapsed wall-clock time.
import time
str_len=1000
st=gen_str(str_len)
l=10
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
    t0=time.perf_counter()
    res=f(st, l)
    t1=time.perf_counter()
    print(f.__name__, t1-t0, "found?",res)
# In[13]:
# Repeat the timing with n doubled to 2000: the quadratic naive version
# should take ~4x longer, the hash-based ones ~2x (see discussion below).
str_len=2000
st=gen_str(str_len)
l=10
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
    t0=time.perf_counter()
    res=f(st, l)
    t1=time.perf_counter()
    print(f.__name__, t1-t0, "found?",res)
# For a random string of size $n=1000$ and for $l=10$ the running time of repeat_hash2 is the smallest, while the one for repeat_naive is the largest.
#
# When increasing $n$ to 2000, the running time of repeat_naive increases by ~4, while the running time of repeat_hash1, repeat_hash2 increases by ~2.
#
# #### Time spent on creating the table
# When $st$ is "a"$*1000$, repeat_hash1 is the slowest, since it spends time on creating an empty table of size 991.
# In[14]:
# Degenerate input "a"*1000: the very first comparison already finds a
# repeat, so the loop work is trivial for all three functions -- what is
# measured is mostly repeat_hash1's up-front construction of its table
# of 991 empty buckets.
st="a"*1000
l=10
for f in [repeat_naive,repeat_hash1,repeat_hash2]:
    t0=time.perf_counter()
    res=f(st, l)
    t1=time.perf_counter()
    print(f.__name__, t1-t0, "found?",res)
# ### The effect of table size
#
# The second solution, with control over the table size
# In[15]:
def repeat_hash1_var_size(st, l, m=0):
    """Variant of repeat_hash1 with explicit control of the table size m.

    m == 0 (the default) means "one bucket per substring", i.e. the same
    sizing that repeat_hash1 uses.
    """
    n_windows = len(st) - l + 1
    if m == 0:
        m = n_windows  # default: ~number of substrings to be inserted
    htable = Hashtable(m)
    for start in range(n_windows):
        window = st[start:start + l]
        if htable.find(window):
            return True
        htable.insert(window)
    return False
# #### Comparing tables sizes
# In[16]:
# Measure the effect of the table size m on the same input: a tiny table
# forces long chains (many comparisons per lookup), while a huge table
# wastes time on construction.
import time
str_len=1000
st=gen_str(str_len)
l=10
print("str_len=",str_len, "repeating substring len=",l)
for m in [1, 10, 100, 1000, 1500, 10000, 100000]:
    t0=time.perf_counter()
    res=repeat_hash1_var_size(st, l, m)
    t1=time.perf_counter()
    print(t1-t0, "found?",res, "table size=",m)
# ### Summary
# Make sure you read the following summary that includes a detailed explanation on the experiments.
# # Nim
#
# In HW4, we were asked to give a recursive solution to the game of Nim. As a reminder, in the game of Nim we are given $n$ stacks, each containing some non-zero amount of coins. There are two players in the game, at each turn the player whose turn it is picks a stack which has some $k > 0$ coins and removes $1 \leq i \leq k$ coins from the stack. The game ends when all stacks are empty and the winner is the last player to remove coins from the last stack.
#
# Our goal is to implement the function $can\_win\_nim$ which gets a list $stack$ of positive integers and returns $True$ iff the first player has a winning strategy in the game.
#
# Let's try and find a recursive solution for our problem:
# * As a base case, if all stacks are empty, then the player currently playing loses and therefore we simply return $False$
# * Otherwise, there are some "active" stacks. I.e., stacks with a positive amount of coins in them.
#
# We now recall the reasoning we used when solving Munch: in order to see if we can win the game, we can ask "is there a legal move we can make such that our opponent will lose?" If there exists such a move, we will play it. If no such move exists, this is the same as saying that for any legal move we can make, our opponent has a winning strategy and therefore we will lose.
#
# This gives rise to a simple recursive solution:
# * Go over all stacks
# * For each "active" stack with $i > 0$ coins:
# * For each $1 \leq j \leq i$, recursively check if the opponent loses if we remove $j$ coins from the stack.
# * If the opponent loses, this is a move that leads to a winning strategy and therefore we return "True"
# * If all possible moves in all non-zero stacks lead to the opponent winning, we lose and return "False"
# In[30]:
def can_win_nim(stacks):
    """Return True iff the player to move has a winning strategy in Nim.

    stacks is a list of (non-negative) stack sizes; it is mutated during
    the search but restored before returning.
    """
    # Base case: no coins left -- the previous player took the last coin.
    if not any(stacks):
        return False
    # We win iff SOME legal move leaves the opponent in a losing position.
    for idx, coins in enumerate(stacks):
        for taken in range(1, coins + 1):
            stacks[idx] = coins - taken
            opponent_wins = can_win_nim(stacks)
            stacks[idx] = coins  # undo the move
            if not opponent_wins:
                return True
    return False
# Sanity checks: one non-empty stack is a win (take everything); two equal
# stacks lose (the opponent can mirror every move); [2, 3, 1, 4] is a win
# for the first player.
can_win_nim([1])
can_win_nim([2, 2])
can_win_nim([2, 3, 1, 4])
# # Memoization
#
# In order to memoize this code we need to decide on the key for memoization. A natural candidate is the stacks themselves. We can convert our list into a tuple (so it will be immutable) and use this tuple as a key.
#
# The implementation is straightforward:
# In[9]:
def can_win_nim_mem(stacks):
    """Memoized Nim solver: True iff the player to move can force a win."""
    return can_win_nim_mem_inner(stacks, {})


def can_win_nim_mem_inner(stacks, mem):
    """Recursive worker; mem maps tuple(stacks) -> game result."""
    key = tuple(stacks)
    if key in mem:
        return key in mem and mem[key]
    if not any(stacks):
        # all stacks empty: the player to move has already lost
        return False
    for idx, coins in enumerate(stacks):
        for taken in range(1, coins + 1):
            stacks[idx] = coins - taken
            opponent_wins = can_win_nim_mem_inner(stacks, mem)
            stacks[idx] = coins  # undo the move
            if not opponent_wins:
                mem[key] = True
                return True
    mem[key] = False
    return False
# Ten stacks of 3: feasible with memoization on the raw tuple, but the
# key space (all tuples over 0..3 of length 10) is already large.
can_win_nim_mem([3 for i in range(10)])
# Let's mention a slight improvement for this: we first make an almost trivial observation, and that is the fact that given an input to the function $stacks$, the result of the function does not depend on the order of the entries in it.
#
# For example, if $stacks = [2, 3, 1, 4]$ and the function returns $True$ on $stacks$, we know that it will also return $True$ on $[1, 2, 3, 4], [4, 3, 2, 1]$ and so forth. Since our goal is to do as little work as possible, we would like to make sure that once we've computed the function on $stacks$, we will not compute it again on any permutation of the list. How can we do this?
#
# One possible solution is that once we know the value of the function on $stacks$, we can enter any permutation of it into our dictionary. This will work but would be prohibitive since merely manufacturing all permutations of $stacks$ can take a long time (if $stacks$ has 10 unique entries there are over 3 million possible permutations for it!).
#
# Can you think of a better way of avoiding extra work? Say we have a solution to $stacks$, we can enter it into the dictionary using the sorted list as a key. Sorting the list will take $O(n \log n)$ which is not too bad. Are we done? There's one more important thing to remember, and that is that now when we want to check if a given list is already in our dictionary, we need to search for it after sorting it.
#
# The easiest way not to get confused about all of this is to simply compute the key at the beginning of the function execution and use it throughout the function.
#
# Here is the implementation:
# In[10]:
def can_win_nim_mem_sorted(stacks):
    """Nim solver memoized on the SORTED stack profile.

    The game's outcome is invariant under permutations of the stacks, so
    all permutations share a single dictionary entry.
    """
    return can_win_nim_mem_inner_sorted(stacks, {})


def can_win_nim_mem_inner_sorted(stacks, mem):
    """Recursive worker; mem maps tuple(sorted(stacks)) -> game result."""
    # The only change from the unsorted version: canonicalize the key,
    # computed once here and used both for lookup and for storing.
    key = tuple(sorted(stacks))
    if key in mem:
        return mem[key]
    if not any(stacks):
        # all stacks empty: the player to move has already lost
        return False
    for idx, coins in enumerate(stacks):
        for taken in range(1, coins + 1):
            stacks[idx] = coins - taken
            opponent_wins = can_win_nim_mem_inner_sorted(stacks, mem)
            stacks[idx] = coins  # undo the move
            if not opponent_wins:
                mem[key] = True
                return True
    mem[key] = False
    return False
# Thirty stacks of 3: sorted keys collapse all permutations, so the number
# of distinct memoized states stays small and this finishes quickly.
can_win_nim_mem_sorted([3 for i in range(30)])
# How much did we gain? Our naive memoization started coughing for a list of length 10, whereas our improved implementation solved the problem for a list of length 30 almost instantaneously!
# In[ ]: