%load_ext watermark
%watermark -a 'Sebastian Raschka' -u -d -v
Sebastian Raschka last updated: 2017-08-14 CPython 3.6.1 IPython 6.1.0
This notebooks walks through a simple hash table implementation that behaves relatively similar to a Python dictionary but has a fixed size of elements that it can store for simplicity. The methods we are going to implement are listed below. Note that __setitem__
and __getitem__
are used to overload the []
index access for the insert
and get
methods, respectively.
class HashTable():
def __init__(self, size=17):
pass
def insert(self, key, value):
"""Insert a new key-value pair or overwrites an existing one."""
pass
def __setitem__(self, key, value):
return self.insert(key, value)
def get(self, key):
"""Retrieve the value corresponding to the key."""
pass
def __getitem__(self, key):
return self.get(key)
def keys(self):
"""Retrieves all keys from the hash table"""
pass
def _hash(self, key):
"""Computes the hash position"""
pass
def _rehash(self, previous_hash):
"""Find the next hash for linear probing"""
pass
For this simple example, we will be using a very naive hash function, the remainder method, which simply computes the remainder of the key when dividing it by the size of the hash table. For example:
hash_table_size = 17
key = 123467
position = key % hash_table_size
print('Position in the hash table:', position)
Position in the hash table: 13
Note that this function only works with integer keys. Thus, if the key is a string, we have to make a little modification:
string_key = 'abcdef'
key = sum(ord(c) for c in string_key)
position = key % hash_table_size
print('Position in the hash table:', position)
Position in the hash table: 2
For collision resolution we will use linear probing. That is, if a new key hashes to the same position as an existing one, we will increase the old hash value by a skip constant and check if the resulting slot is empty. If not, we will repeat this procedure until we find the next empty slot and insert the new key in this empty slot. We call this "*rehashing*."
For look-ups, we then have to use the following procedure:
For simplicity, we will use implement the reshashing function using a skip constant of 1. So let's now implement the __init__
, _hash
, _rehash
, keys
and insert
methods to get a simple hashtable set up in which we can already store some keys:
class HashTable():
def __init__(self, size=17):
self.size = size
self.stored_keys = [None] * self.size
self.stored_values = [None] * self.size
def insert(self, key, value):
"""Insert a new key-value pair or overwrites an existing one."""
hash_val = self._hash(key)
# insert new key if it doesn't exist yet
if self.stored_keys[hash_val] is None:
self.stored_keys[hash_val] = key
self.stored_values[hash_val] = value
# overwrite key if it already exists
else:
if self.stored_keys[hash_val] == key:
self.stored_values[hash_val] = value
# collision resolution
elif len(self.keys()) == self.size:
raise ValueError('Hash table is full')
else:
next_hash = self._rehash(hash_val)
while (self.stored_keys[next_hash] is not None
and self.stored_keys[next_hash] != key):
next_hash = self._rehash(next_hash)
# insert new key if it doesn't exist yet
if self.stored_keys[next_hash] is None:
self.stored_keys[next_hash] = key
self.stored_values[next_hash] = value
# overwrite key if it already exists
else:
self.stored_values[next_hash] = value
def __setitem__(self, key, value):
return self.insert(key, value)
def get(self, key):
"""Retrieve the value corresponding to the key."""
pass
def __getitem__(self, key):
return self.get(key)
def keys(self):
"""Retrieves all keys from the hash table"""
return [k for k in self.stored_keys if k is not None]
def _hash(self, key):
"""Computes the hash position."""
if isinstance(key, str):
key = sum(ord(c) for c in key)
position = key % hash_table_size
return position
def _rehash(self, previous_hash):
"""Find the next hash for linear probing"""
return (previous_hash + 1) % self.size
Let's start by inserting 2 different keys and check they have been stored by listing all keys:
hashtable = HashTable()
hashtable['abc'] = 1
hashtable['xyz'] = 2
hashtable.keys()
['abc', 'xyz']
Next, let's use a key that would result in a hash collision with one of the already stored keys:
print(hashtable._hash('abc'))
print(hashtable._hash('efg'))
print(hashtable._hash('abcdefgh'))
5 0 5
We can use this key, 'abcdefgh'
, now if hash collisions are resolved correctly:
hashtable['abcdefgh'] = 3
hashtable.keys()
['abc', 'xyz', 'abcdefgh']
Finally, let's add the get method to retrieve keys:
class HashTable():
def __init__(self, size=17):
self.size = size
self.stored_keys = [None] * self.size
self.stored_values = [None] * self.size
def insert(self, key, value):
"""Insert a new key-value pair or overwrites an existing one."""
hash_val = self._hash(key)
# insert new key if it doesn't exist yet
if self.stored_keys[hash_val] is None:
self.stored_keys[hash_val] = key
self.stored_values[hash_val] = value
# overwrite key if it already exists
else:
if self.stored_keys[hash_val] == key:
self.stored_values[hash_val] = value
# collision resolution
elif len(self.keys()) == self.size:
raise ValueError('Hash table is full')
else:
next_hash = self._rehash(hash_val)
while (self.stored_keys[next_hash] is not None
and self.stored_keys[next_hash] != key):
next_hash = self._rehash(next_hash)
# insert new key if it doesn't exist yet
if self.stored_keys[next_hash] is None:
self.stored_keys[next_hash] = key
self.stored_values[next_hash] = value
# overwrite key if it already exists
else:
self.stored_values[next_hash] = value
def __setitem__(self, key, value):
return self.insert(key, value)
def get(self, key):
"""Retrieve the value corresponding to the key."""
hash_val = self._hash(key)
if self.stored_keys[hash_val] == key:
return self.stored_values[hash_val]
elif self.stored_keys[hash_val] is None:
return KeyError(key)
else:
next_hash = self._rehash(hash_val)
while self.stored_keys[next_hash] != key:
next_hash = self._rehash(next_hash)
if next_hash == hash_val:
return KeyError(key)
elif self.stored_keys[next_hash] is None:
return KeyError(key)
return self.stored_values[next_hash]
def __getitem__(self, key):
return self.get(key)
def keys(self):
"""Retrieves all keys from the hash table"""
return [k for k in self.stored_keys if k is not None]
def _hash(self, key):
"""Computes the hash position."""
if isinstance(key, str):
key = sum(ord(c) for c in key)
position = key % hash_table_size
return position
def _rehash(self, previous_hash):
"""Find the next hash for linear probing"""
return (previous_hash + 1) % self.size
hashtable = HashTable()
hashtable['abc'] = 1
hashtable['xyz'] = 2
hashtable['abcdefgh'] = 3
hashtable['abc']
1
hashtable['xyz']
2
hashtable['abcdefgh']
3
As we can see, the hash collision resolution works correctly for both insert
and the get
.