#!/usr/bin/env python # coding: utf-8 #

# cs1001.py , Tel Aviv University, Spring 2019

# # Recitation 10
# We discussed Hash Tables and Nim.
#
# #### Takeaways:
# - Important properties of Hash Tables:
#   - Hash tables can be useful for many algorithms, including memoization.
#   - Insert and find operations run in O(1) on average, but O(n) in the
#     worst case (where n is the number of elements in the table).
#   - Make sure you understand the complexity analysis for hash tables
#     (see the links below).
#
# ## Hash Tables
#
# We wish to have a data structure that implements the operations:
# insert, search and delete in *expected* O(1) time.
#
# Summarizing the insert, search and delete complexity of the data
# structures that we have seen already:
#
# | implementation                | insert                   | search   | delete                                       |
# |-------------------------------|--------------------------|----------|----------------------------------------------|
# | Python list                   | O(1) always at the end   | O(n)     | O(n)                                         |
# | Python ordered list           | O(n)                     | O(log n) | O(n)                                         |
# | Linked list                   | O(1) always at the start | O(n)     | O(1) given the node before the one to delete |
# | Sorted linked list            | O(n)                     | O(n)     | O(1) given the node before the one to delete |
# | Unbalanced Binary Search Tree | O(n)                     | O(n)     | O(n)                                         |
# | Balanced Binary Search Tree   | O(log n)                 | O(log n) | O(log n)                                     |
#
# Please read the summary on the various data structures mentioned in class,
# and the detailed summary on the complexity of insert/search operations
# using hash tables.
#
# ### Exercise:
# Given a string st of length n and a small integer l, write a function that
# checks whether there is a substring in st of length l that appears more
# than once.
#
# #### Solution #1: Naive
# The complexity is O(l * (n - l)^2).

# Code for printing several outputs in one cell (not part of the recitation).
# Guarded so this module also works when run outside IPython/Jupyter.
try:
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "all"
except ImportError:
    pass
# There are O((n-l)^2) iterations (make sure you understand why) and in each
# iteration we perform operations taking O(l) time.

import random


def repeat_naive(st, l):
    """Return True iff some substring of length l appears in st more than once.

    Naive solution: compare every ordered pair of length-l substrings.
    Complexity: O(l * (n - l)^2) where n == len(st).
    """
    for i in range(len(st) - l + 1):
        for j in range(i + 1, len(st) - l + 1):
            # Slicing and comparing costs O(l) per pair.
            if st[i:i + l] == st[j:j + l]:
                return True
    return False


def gen_str(size, alphabet="abcdefghijklmnopqrstuvwxyz"):
    """Generate a random string of length size over alphabet."""
    return "".join(random.choice(alphabet) for _ in range(size))


# ### The class Hashtable from the lectures
class Hashtable:
    """Hash table with chaining (each bucket is a Python list)."""

    def __init__(self, m, hash_func=hash):
        """Initialize a hash table with m empty buckets.

        Bogus initializations to avoid:
          self.table = [[] * m]                    # one bucket, not m
          empty = []; [empty for i in range(m)]    # m aliases of ONE list
        """
        self.table = [[] for _ in range(m)]
        # Compose Python's hash function with reduction modulo m.
        self.hash_mod = lambda x: hash_func(x) % m

    def __repr__(self):
        return "".join(
            str(i) + " " + str(bucket) + "\n"
            for i, bucket in enumerate(self.table)
        )

    def find(self, item):
        """Return True if item is in the hashtable, False otherwise."""
        i = self.hash_mod(item)
        return item in self.table[i]

    def insert(self, item):
        """Insert an item into the table (duplicates are ignored)."""
        i = self.hash_mod(item)
        if item not in self.table[i]:
            self.table[i].append(item)


# #### Solution #2: using the class Hashtable
def repeat_hash1(st, l):
    """Repeated-substring check using the Hashtable class.

    Expected (average) complexity: O(l * (n - l)).
    Worst case: O(l * (n - l)^2).
    """
    m = len(st) - l + 1
    htable = Hashtable(m)
    for i in range(len(st) - l + 1):
        if htable.find(st[i:i + l]):
            return True
        htable.insert(st[i:i + l])
    return False


if __name__ == "__main__":
    print(repeat_naive("hello", 1))        # True
    print(repeat_naive("hello" * 10, 45))  # True
    print(repeat_naive("hello" * 10, 46))  # False

    # Test the algorithm by generating a random string of a given size.
    rndstr = gen_str(1000)
    print(rndstr)
    print(repeat_naive(rndstr, 3))
    print(repeat_naive(rndstr, 10))

    # For bigger n and l, this can be quite slow:
    rndstr = gen_str(10000)
    print(repeat_naive(rndstr, 3))
    print(repeat_naive(rndstr, 10))
# The expected (average) complexity of repeat_hash1 is O(l * (n - l)):
# creating the table takes O(n - l) time, and there are O(n - l) iterations,
# each taking expected O(l) time.
#
# The worst-case complexity is O(l * (n - l)^2): creating the table takes
# O(n - l) time, and the loop costs l * sum_{i=0}^{n-l} i = O(l * (n - l)^2).
#
# Which native Python data structure is most suitable for this problem?

import time


# #### Solution #3: using Python's set implementation
def repeat_hash2(st, l):
    """Repeated-substring check using a Python set.

    Python sets use hash functions internally, giving expected O(1)
    membership tests and insertions.
    """
    seen = set()
    for i in range(len(st) - l + 1):
        sub = st[i:i + l]
        if sub in seen:
            return True
        seen.add(sub)
    return False


# #### Competition between the 3 solutions
def _benchmark(str_len, l):
    """Time the three implementations on a random string of length str_len."""
    st = gen_str(str_len)
    for f in [repeat_naive, repeat_hash1, repeat_hash2]:
        t0 = time.perf_counter()
        res = f(st, l)
        t1 = time.perf_counter()
        print(f.__name__, t1 - t0, "found?", res)


if __name__ == "__main__":
    _benchmark(1000, 10)
    _benchmark(2000, 10)

# For a random string of size n=1000 and l=10, the running time of
# repeat_hash2 is the smallest, while that of repeat_naive is the largest.
#
# When increasing n to 2000, the running time of repeat_naive increases by
# ~4x, while the running times of repeat_hash1 and repeat_hash2 increase
# by ~2x.
# #### Time spent on creating the table
# When st is "a"*1000, repeat_hash1 is the slowest, since it spends time on
# creating an (almost entirely empty) table of size 991.
if __name__ == "__main__":
    st = "a" * 1000
    l = 10
    for f in [repeat_naive, repeat_hash1, repeat_hash2]:
        t0 = time.perf_counter()
        res = f(st, l)
        t1 = time.perf_counter()
        print(f.__name__, t1 - t0, "found?", res)


# ### The effect of table size
def repeat_hash1_var_size(st, l, m=0):
    """repeat_hash1 with control over the hash-table size m.

    m == 0 (the default) means "use ~ the number of substrings to be
    inserted", i.e. len(st) - l + 1.
    """
    if m == 0:
        m = len(st) - l + 1
    htable = Hashtable(m)
    for i in range(len(st) - l + 1):
        if htable.find(st[i:i + l]):
            return True
        htable.insert(st[i:i + l])
    return False


# #### Comparing table sizes
if __name__ == "__main__":
    str_len = 1000
    st = gen_str(str_len)
    l = 10
    print("str_len=", str_len, "repeating substring len=", l)
    for m in [1, 10, 100, 1000, 1500, 10000, 100000]:
        t0 = time.perf_counter()
        res = repeat_hash1_var_size(st, l, m)
        t1 = time.perf_counter()
        print(t1 - t0, "found?", res, "table size=", m)

# ### Summary
# Make sure you read the summary that includes a detailed explanation of
# the experiments.
#
# # Nim
#
# In HW4, we were asked to give a recursive solution to the game of Nim.
# As a reminder: we are given n stacks, each containing some non-zero amount
# of coins. There are two players; at each turn, the player whose turn it is
# picks a stack which has some k > 0 coins and removes 1 <= i <= k coins
# from it. The game ends when all stacks are empty, and the winner is the
# last player to remove coins from the last stack.
#
# Our goal: implement can_win_nim, which gets a list `stacks` of positive
# integers and returns True iff the first player has a winning strategy.
#
# A recursive solution:
# - Base case: if all stacks are empty, the player currently playing loses,
#   so we simply return False.
# - Otherwise, there are some "active" stacks, i.e. stacks with a positive
#   amount of coins in them.
# Recall the reasoning used when solving Munch: to see if we can win, ask
# "is there a legal move after which our opponent loses?" If such a move
# exists, we play it. If no such move exists, then for any legal move our
# opponent has a winning strategy, and therefore we lose.
#
# This gives rise to a simple recursive solution:
# - Go over all stacks.
# - For each "active" stack with i > 0 coins:
#   - For each 1 <= j <= i, recursively check if the opponent loses if we
#     remove j coins from the stack.
#   - If the opponent loses, this move leads to a winning strategy, so
#     return True.
# - If all possible moves in all non-zero stacks lead to the opponent
#   winning, we lose and return False.
def can_win_nim(stacks):
    """Return True iff the player to move has a winning strategy.

    stacks is a list of non-negative ints. It is mutated during the search
    (one move applied, then undone), but is restored before returning.
    """
    # Base case: no coins anywhere -- the player to move has already lost.
    if not any(stacks):
        return False
    # Try every legal move: remove j coins from stack i.
    for i in range(len(stacks)):
        for j in range(1, stacks[i] + 1):
            stacks[i] -= j
            can_other_player_win = can_win_nim(stacks)
            stacks[i] += j  # undo the move (backtrack)
            if not can_other_player_win:
                return True
    return False


if __name__ == "__main__":
    print(can_win_nim([1]))            # True
    print(can_win_nim([2, 2]))         # False
    print(can_win_nim([2, 3, 1, 4]))   # True
# # Memoization
#
# To memoize this code we need a key for the memoization dictionary. A
# natural candidate is the stacks themselves: convert the list into a tuple
# (so it is immutable and hashable) and use the tuple as the key.
def can_win_nim_mem(stacks):
    """Memoized version of can_win_nim (same contract)."""
    return can_win_nim_mem_inner(stacks, {})


def can_win_nim_mem_inner(stacks, mem):
    """Recursive worker: mem maps tuple(stacks) -> bool result."""
    mem_key = tuple(stacks)
    if mem_key in mem:
        return mem[mem_key]
    if not any(stacks):
        return False
    for i in range(len(stacks)):
        for j in range(1, stacks[i] + 1):
            stacks[i] -= j
            can_other_player_win = can_win_nim_mem_inner(stacks, mem)
            stacks[i] += j  # undo the move (backtrack)
            if not can_other_player_win:
                mem[mem_key] = True
                return True
    mem[mem_key] = False
    return False


if __name__ == "__main__":
    print(can_win_nim_mem([3 for i in range(10)]))

# A slight improvement: the result of the function does not depend on the
# order of the entries in stacks. For example, if stacks = [2, 3, 1, 4] and
# the function returns True on it, it will also return True on
# [1, 2, 3, 4], [4, 3, 2, 1] and so forth. Since our goal is to do as little
# work as possible, we would like to make sure that once we've computed the
# function on stacks, we will NOT compute it again on any permutation of the
# list. How can we do this?
#
# One option: once we know the value for stacks, enter every permutation of
# it into the dictionary. This works but is prohibitive -- merely producing
# all permutations can take a long time (10 unique entries give over
# 3 million permutations!).
#
# A better way: use the sorted list (as a tuple) as the key. Sorting takes
# O(n log n), which is not too bad. One more important thing to remember:
# when checking whether a given list is already in the dictionary, we must
# search for it after sorting it as well.
# The easiest way not to get confused about all of this is to compute the
# key once at the beginning of the function execution and use it throughout.
def can_win_nim_mem_sorted(stacks):
    """Memoized can_win_nim using an order-independent (sorted) key."""
    return can_win_nim_mem_inner_sorted(stacks, {})


def can_win_nim_mem_inner_sorted(stacks, mem):
    """Recursive worker: mem maps tuple(sorted(stacks)) -> bool result."""
    # This is the only change relative to can_win_nim_mem_inner: the key is
    # the sorted tuple, so all permutations of a position share one entry.
    mem_key = tuple(sorted(stacks))
    if mem_key in mem:
        return mem[mem_key]
    if not any(stacks):
        return False
    for i in range(len(stacks)):
        for j in range(1, stacks[i] + 1):
            stacks[i] -= j
            can_other_player_win = can_win_nim_mem_inner_sorted(stacks, mem)
            stacks[i] += j  # undo the move (backtrack)
            if not can_other_player_win:
                mem[mem_key] = True
                return True
    mem[mem_key] = False
    return False


if __name__ == "__main__":
    print(can_win_nim_mem_sorted([3 for i in range(30)]))

# How much did we gain? Our naive memoization started coughing for a list of
# length 10, whereas the improved implementation solves the problem for a
# list of length 30 almost instantaneously!