#!/usr/bin/env python
# coding: utf-8

# cs1001.py , Tel Aviv University, Spring 2019

# # Recitation 9
# We discussed two data structures - Binary Search Trees and Hash Tables.
#
# #### Takeaways:
# - Important properties of Binary Search Trees:
#   - Insert and find take O(h) time where h is the height of the tree.
#   - When a tree containing n nodes is balanced, h = O(log n); in the worst
#     case (totally unbalanced), h = O(n).
#   - Many methods in this class are implemented using recursion.
# - Important properties of Hash Tables:
#   - Hash tables can be useful for many algorithms, including memoization.
#   - Insert and find operations run in O(1) on average, but O(n) in the
#     worst case (where n is the number of elements in the table).
#   - Make sure you understand the complexity analysis for hash tables
#     (see the links below).

# #### Code for printing several outputs in one cell (not part of the recitation):

# In[26]:

# Notebook-only convenience: make Jupyter display every expression in a cell.
# Guarded so the file also runs as a plain Python script without IPython.
try:
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "all"
except ImportError:
    pass

# In[27]:

## This file contains functions for the representation of binary trees.
## used in class Binary_search_tree's __repr__
## Written by a former student in the course - thanks to Amitai Cohen
## No need to fully understand this code


def printree(t, bykey=True):
    """Print a textual representation of t
    bykey=True: show keys instead of values"""
    # for row in trepr(t, bykey):
    #     print(row)
    return trepr(t, bykey)


def trepr(t, bykey=False):
    """Return a list of textual representations of the levels in t
    bykey=True: show keys instead of values"""
    if t == None:
        return ["#"]  # textual representation of an empty subtree
    thistr = str(t.key) if bykey else str(t.val)
    return conc(trepr(t.left, bykey), thistr, trepr(t.right, bykey))


def conc(left, root, right):
    """Return a concatenation of textual represantations of
    a root node, its left node, and its right node
    root is a string, and left and right are lists of strings"""
    lwid = len(left[-1])
    rwid = len(right[-1])
    rootwid = len(root)

    result = [(lwid + 1) * " " + root + (rwid + 1) * " "]

    ls = leftspace(left[0])
    rs = rightspace(right[0])
    result.append(ls * " " + (lwid - ls) * "_" + "/" + rootwid * " "
                  + "|" + rs * "_" + (rwid - rs) * " ")

    # NOTE(review): the body of this loop and the helpers leftspace/rightspace
    # below were lost in the mangled export (text between '<' and '>' was
    # stripped); reconstructed from the standard cs1001 printree module.
    for i in range(max(len(left), len(right))):
        row = ""
        if i < len(left):
            row += left[i]
        else:
            row += lwid * " "

        row += (rootwid + 2) * " "

        if i < len(right):
            row += right[i]
        else:
            row += rwid * " "

        result.append(row)

    return result


def leftspace(row):
    """helper for conc: row is the first row of a left node.
    Returns the index of where the second whitespace run starts."""
    i = len(row) - 1
    while row[i] == " ":
        i -= 1
    return i + 1


def rightspace(row):
    """helper for conc: row is the first row of a right node.
    Returns the index of where the first whitespace run ends."""
    i = 0
    while row[i] == " ":
        i += 1
    return i


# In[28]:

class Tree_node():
    """A node of a binary search tree: a key, a value and two children."""

    def __init__(self, key, val):
        self.key = key
        self.val = val
        self.left = None
        self.right = None

    def __repr__(self):
        return "(" + str(self.key) + ":" + str(self.val) + ")"


class Binary_search_tree():
    """Binary search tree keyed by node.key.
    insert and lookup take O(h) time, h being the height of the tree."""

    def __init__(self):
        self.root = None

    def __repr__(self):  # no need to understand the implementation of this one
        out = ""
        for row in printree(self.root):  # need printree.py file
            out = out + row + "\n"
        return out

    def lookup(self, key):
        ''' return node with key, uses recursion '''

        def lookup_rec(node, key):
            if node == None:
                return None
            elif key == node.key:
                return node
            elif key < node.key:
                return lookup_rec(node.left, key)
            else:
                return lookup_rec(node.right, key)

        return lookup_rec(self.root, key)

    def insert(self, key, val):
        ''' insert node with key,val into tree, uses recursion '''

        def insert_rec(node, key, val):
            if key == node.key:
                node.val = val  # update the val for this key
            elif key < node.key:
                if node.left == None:
                    node.left = Tree_node(key, val)
                else:
                    insert_rec(node.left, key, val)
            else:  # key > node.key
                if node.right == None:
                    node.right = Tree_node(key, val)
                else:
                    insert_rec(node.right, key, val)
            return

        if self.root == None:  # empty tree
            self.root = Tree_node(key, val)
        else:
            insert_rec(self.root, key, val)

    def minimum(self):
        ''' return node with minimal key '''
        if self.root == None:
            return None
        node = self.root
        left = node.left
        while left != None:  # walk left as far as possible
            node = left
            left = node.left
        return node

    def depth(self):
        ''' return depth of tree, uses recursion '''

        def depth_rec(node):
            if node == None:
                return -1  # an empty tree has depth -1, a single node depth 0
            else:
                return 1 + max(depth_rec(node.left), depth_rec(node.right))

        return depth_rec(self.root)

    def size(self):
        ''' return number of nodes in tree, uses recursion '''

        def size_rec(node):
            if node == None:
                return 0
            else:
                return 1 + size_rec(node.left) + size_rec(node.right)

        return size_rec(self.root)

    def inorder(self):
        '''prints the keys of the tree in a sorted order'''

        def inorder_rec(node):
            if node == None:
                return
            inorder_rec(node.left)
            print(node.key)
            inorder_rec(node.right)

        inorder_rec(self.root)


if __name__ == "__main__":
    # Demo cell from the notebook, guarded so importing this file is side-effect free.
    t = Binary_search_tree()
    t.insert(4, 'a')
    t.insert(2, 'a')
    t.insert(6, 'a')
    t.insert(1, 'a')
    t.insert(3, 'a')
    t.insert(5, 'a')
    t.insert(7, 'a')
    print(t)
    t.inorder()


# Claim: An inorder traversal of a binary search tree prints the keys in
# ascending order.
#
# Proof, by complete induction on the size of the tree:
# * Base - for n = 1 the claim is trivial
# * Assume the claim holds for all i <= n
# * For a tree of size n+1 with root v consider the following:
#   * Both the right and the left subtree of v have size at most n
#   * By induction, all keys in v.left are printed in ascending order
#     (and they are all smaller than v.key)
#   * Next, v.key is printed
#   * Finally, by induction, all keys in v.right are printed in ascending
#     order (and they are all larger than v.key)
#   * Thus, the keys of the tree (which has size n+1) are printed in
#     ascending order
#
# More information on complete induction:
# https://en.wikipedia.org/wiki/Mathematical_induction#Complete_(strong)_induction

# Question: Assume N = 2**n - 1 for some n; find a method of inserting the
# numbers [1,...,N] to a BST such that it is completely balanced (i.e. - it
# is a complete tree).
#
# Solution: As we usually do with trees, we give a recursive solution. Our
# input will be a node and a first and last index to enter into the tree
# rooted in the node. We start with the root, first = 1, last = 2**n - 1.
#
# - Base case: If first = last then we simply need to create a root with no
#   sons labeled first
# - Otherwise, we find the mid point mid = (first + last - 1) // 2. We
#   recursively insert first,...,mid into the left son of the node, mid + 1
#   into the current node and mid + 2, ..., last into the right son of the
#   node.
# In[30]:

class Tree_node():
    """A node of a binary search tree: a key, a value and two children."""

    def __init__(self, key, val):
        self.key = key
        self.val = val
        self.left = None
        self.right = None

    def __repr__(self):
        return "(" + str(self.key) + ":" + str(self.val) + ")"


class Binary_search_tree():
    """Binary search tree keyed by node.key.
    This version also supports building a complete (fully balanced) tree
    over the keys 1 .. 2**n - 1 via insert_balanced."""

    def __init__(self):
        self.root = None

    def __repr__(self):  # no need to understand the implementation of this one
        out = ""
        for row in printree(self.root):  # need printree.py file
            out = out + row + "\n"
        return out

    def lookup(self, key):
        ''' return node with key, uses recursion '''

        def lookup_rec(node, key):
            if node == None:
                return None
            elif key == node.key:
                return node
            elif key < node.key:
                return lookup_rec(node.left, key)
            else:
                return lookup_rec(node.right, key)

        return lookup_rec(self.root, key)

    def insert(self, key, val):
        ''' insert node with key,val into tree, uses recursion '''

        def insert_rec(node, key, val):
            if key == node.key:
                node.val = val  # update the val for this key
            elif key < node.key:
                if node.left == None:
                    node.left = Tree_node(key, val)
                else:
                    insert_rec(node.left, key, val)
            else:  # key > node.key
                if node.right == None:
                    node.right = Tree_node(key, val)
                else:
                    insert_rec(node.right, key, val)
            return

        if self.root == None:  # empty tree
            self.root = Tree_node(key, val)
        else:
            insert_rec(self.root, key, val)

    def minimum(self):
        ''' return node with minimal key '''
        if self.root == None:
            return None
        node = self.root
        left = node.left
        while left != None:  # walk left as far as possible
            node = left
            left = node.left
        return node

    def depth(self):
        ''' return depth of tree, uses recursion '''

        def depth_rec(node):
            if node == None:
                return -1  # empty tree has depth -1, single node depth 0
            else:
                return 1 + max(depth_rec(node.left), depth_rec(node.right))

        return depth_rec(self.root)

    def size(self):
        ''' return number of nodes in tree, uses recursion '''

        def size_rec(node):
            if node == None:
                return 0
            else:
                return 1 + size_rec(node.left) + size_rec(node.right)

        return size_rec(self.root)

    def inorder(self):
        '''prints the keys of the tree in a sorted order'''

        def inorder_rec(node):
            if node == None:
                return
            inorder_rec(node.left)
            print(node.key)
            inorder_rec(node.right)

        inorder_rec(self.root)

    def insert_balanced(self, n):
        '''inserts the numbers 1, ... , 2**n - 1 into a full BST'''

        def insert_balanced_rec(node, first, last):
            # Put the median of [first, last] at this node, then recurse
            # on the two halves; yields a complete tree.
            if first == last:
                node.key = first
                node.val = first
            else:
                mid = (first + last - 1) // 2
                node.key = mid + 1
                node.val = mid + 1
                node.left = Tree_node(0, 0)  # placeholder, overwritten below
                insert_balanced_rec(node.left, first, mid)
                node.right = Tree_node(0, 0)
                insert_balanced_rec(node.right, mid + 2, last)

        self.root = Tree_node(0, 0)
        insert_balanced_rec(self.root, 1, 2 ** n - 1)


if __name__ == "__main__":
    # Demo cells from the notebook, guarded so importing is side-effect free.
    t = Binary_search_tree()
    t.insert(4, 'a')
    t.insert(2, 'a')
    t.insert(6, 'a')
    t.insert(1, 'a')
    t.insert(3, 'a')
    t.insert(5, 'a')
    t.insert(7, 'a')
    # t.inorder()
    t = Binary_search_tree()
    t.insert_balanced(4)
    print(printree(t.root))  # printree is defined earlier in this file


# ## Hash Tables
#
# We wish to have a data structure that implements the operations:
# insert, search and delete in **expected** O(1) time.
#
# Summarizing the insert and search complexity of the data structures that
# we have seen already:
#
# | implementation                | insert                   | search   | delete              |
# |-------------------------------|--------------------------|----------|---------------------|
# | Python list                   | O(1) always at the end   | O(n)     | O(n)                |
# | Python ordered list           | O(n)                     | O(log n) | O(n)                |
# | Linked list                   | O(1) always at the start | O(n)     | O(1) given the node before the one to delete |
# | Sorted linked list            | O(n)                     | O(n)     | O(1) given the node before the one to delete |
# | Unbalanced Binary Search Tree | O(n)                     | O(n)     | O(n)                |
# | Balanced Binary Search Tree   | O(log n)                 | O(log n) | O(log n)            |

# Please read the following summary on the various data structures mentioned
# in class.
#
# A detailed summary on the complexity of insert/search operations using hash
# tables can be found here. Make sure you read it.

# ### Exercise:
# Given a string st of length n and a small integer l, write a function that
# checks whether there is a substring in st of length l that appears more
# than once.
# #### Solution #1: Naive
# The complexity is O(l*(n-l)**2):
# there are O((n-l)**2) iterations of the double loop (make sure you
# understand why), and in each iteration we perform operations in O(l) time
# (comparing two slices of length l).

# In[ ]:

def repeat_naive(st, l):
    """Return True iff st contains some substring of length l twice.

    Naive solution: compare every pair of length-l windows."""
    for i in range(len(st) - l + 1):
        for j in range(i + 1, len(st) - l + 1):
            if st[i:i + l] == st[j:j + l]:
                return True
    return False


# Let's test our algorithm by generating a random string of a given size.

# In[ ]:

import random


def gen_str(size, alphabet="abcdefghijklmnopqrstuvwxyz"):
    ''' Generate a random string of length size over alphabet '''
    s = ""
    for i in range(size):
        s += random.choice(alphabet)
    return s


# ### The class Hashtable from the lectures

# In[ ]:

class Hashtable:
    """Chaining hash table of m buckets, each bucket a Python list."""

    def __init__(self, m, hash_func=hash):
        """ initial hash table, m empty entries """
        ## bogus initialization #1:
        # self.table = [[]*m]
        ## bogus initialization #2 (all entries alias the SAME list):
        # empty = []
        # self.table = [empty for i in range(m)]
        self.table = [[] for i in range(m)]
        self.hash_mod = lambda x: hash_func(x) % m  # using python hash function

    def __repr__(self):
        L = [self.table[i] for i in range(len(self.table))]
        return "".join([str(i) + " " + str(L[i]) + "\n"
                        for i in range(len(self.table))])

    def find(self, item):
        """ returns True if item in hashtable, False otherwise """
        i = self.hash_mod(item)
        return item in self.table[i]

    def insert(self, item):
        """ insert an item into table """
        i = self.hash_mod(item)
        if item not in self.table[i]:
            self.table[i].append(item)


# #### Solution #2: using the class Hashtable

# In[ ]:

def repeat_hash1(st, l):
    """Return True iff st contains some substring of length l twice,
    using a Hashtable with ~n-l+1 buckets."""
    m = len(st) - l + 1
    htable = Hashtable(m)
    for i in range(len(st) - l + 1):
        if htable.find(st[i:i + l]) == False:
            htable.insert(st[i:i + l])
        else:
            return True
    return False


# The expected (average) complexity is O(l*(n-l)):
# creating the table takes O(n-l) time, and there are O(n-l) iterations,
# each taking expected O(l) time.
#
# The worst case complexity is O(l*(n-l)**2):
# creating the table takes O(n-l) time, and the time for executing the loop
# is l * sum_{i=0}^{n-l} i = O(l*(n-l)**2).
#
# Which native Python data structure is most suitable for this problem?

# #### Solution #3: using Python's set implementation

# In[ ]:

def repeat_hash2(st, l):
    """Return True iff st contains some substring of length l twice,
    using Python's built-in set."""
    htable = set()  # Python sets use hash functions for fast lookup
    for i in range(len(st) - l + 1):
        if st[i:i + l] not in htable:
            htable.add(st[i:i + l])
        else:
            return True
    return False


# #### Competition between the 3 solutions

# In[ ]:

import time

if __name__ == "__main__":
    # Demo cells from the notebook, guarded under __main__ because some of
    # them (repeat_naive on a 10000-char random string) take a long time.
    print(repeat_naive("hello", 1))
    print(repeat_naive("hello" * 10, 45))
    print(repeat_naive("hello" * 10, 46))

    rndstr = gen_str(1000)
    print(rndstr)
    print(repeat_naive(rndstr, 3))
    print(repeat_naive(rndstr, 10))

    # For bigger n and l, this could be quite slow:
    rndstr = gen_str(10000)
    print(repeat_naive(rndstr, 3))
    print(repeat_naive(rndstr, 10))

    for str_len in (1000, 2000):
        st = gen_str(str_len)
        l = 10
        for f in [repeat_naive, repeat_hash1, repeat_hash2]:
            t0 = time.perf_counter()
            res = f(st, l)
            t1 = time.perf_counter()
            print(f.__name__, t1 - t0, "found?", res)

# For a random string of size n=1000 and for l=10 the running time of
# repeat_hash2 is the smallest, while the one for repeat_naive is the
# largest.
#
# When increasing n to 2000, the running time of repeat_naive increases by
# ~4, while the running time of repeat_hash1, repeat_hash2 increases by ~2.
#
# #### Time spent on creating the table
# When st is "a"*1000, repeat_hash1 is the slowest, since it spends time on
# creating an empty table of size 991.
# In[ ]:

import time

if __name__ == "__main__":
    # Worst-case-ish demo for table creation cost (guarded under __main__).
    # Uses repeat_naive / repeat_hash1 / repeat_hash2 defined earlier in
    # this file.
    st = "a" * 1000
    l = 10
    for f in [repeat_naive, repeat_hash1, repeat_hash2]:
        t0 = time.perf_counter()
        res = f(st, l)
        t1 = time.perf_counter()
        print(f.__name__, t1 - t0, "found?", res)


# ### The effect of table size
#
# The second solution, with control over the table size.

# In[ ]:

def repeat_hash1_var_size(st, l, m=0):
    """Like repeat_hash1, but with control over the hash table size m.

    m == 0 selects the default size: ~the number of substrings inserted."""
    if m == 0:  # default hash table size is ~number of substrings to be inserted
        m = len(st) - l + 1
    htable = Hashtable(m)  # Hashtable is defined earlier in this file
    for i in range(len(st) - l + 1):
        if htable.find(st[i:i + l]) == False:
            htable.insert(st[i:i + l])
        else:
            return True
    return False


# #### Comparing table sizes

# In[ ]:

if __name__ == "__main__":
    str_len = 1000
    st = gen_str(str_len)
    l = 10
    print("str_len=", str_len, "repeating substring len=", l)
    for m in [1, 10, 100, 1000, 1500, 10000, 100000]:
        t0 = time.perf_counter()
        res = repeat_hash1_var_size(st, l, m)
        t1 = time.perf_counter()
        print(t1 - t0, "found?", res, "table size=", m)

# ### Summary
# Make sure you read the following summary that includes a detailed
# explanation on the experiments.