known_urls = set()
known_urls.add('http://www.google.com')
known_urls.add('http://www.pydata.org')
known_urls2 = set()
known_urls2.add('http://www.python.org')
'http://www.pydata.org' in known_urls # Membership
True
print len(known_urls) # Cardinality
2
'http://www.pydata.org' in (known_urls | known_urls2) # Union
True
'http://www.pydata.org' in (known_urls & known_urls2) # Intersection
False
print known_urls # Retrieve the actual content
set(['http://www.google.com', 'http://www.pydata.org'])
known_urls.discard('http://www.google.com') # Delete an element in the set
print known_urls
set(['http://www.pydata.org'])
HashTable:
Available queries: membership, cardinality, union/intersection, content retrieval
Good: exact answers, O(1) lookups
Limitation: memory consumption grows linearly with the number (and size) of stored items
import numpy as np
import matplotlib.pyplot as plt
import sys,os
capacities = np.linspace(100,int(1E9))
typical_url = 'http://arstechnica.com/science/2013/10/new-type-of-quantum-excitation-behaves-like-a-solitary-particle/'
memory_footprint = len(typical_url) * 8 # bits per URL (8 bits per character)
memory_scaling = [memory_footprint * c for c in capacities] # total bits
plt.plot(capacities, [n / 8.0 / (1024 ** 3) for n in memory_scaling]) # bits -> GB
plt.grid()
plt.title('Memory consumption for a set')
plt.xlabel('Nbr. of items')
plt.ylabel('Memory consumption in GB')
from hashfunctions import generate_hashfunctions
class LightweightHashtable(object):
""" Almost-correct hashtable for membership query only """
def __init__(self, nbr_buckets=10):
self.bitarray = [0] * nbr_buckets
self.hashfunction = generate_hashfunctions(nbr_buckets,1)
def add(self, element):
self.bitarray[self.hashfunction(element)[0]] = 1
def __contains__(self, element):
""" Membership query """
return self.bitarray[self.hashfunction(element)[0]] == 1
s = LightweightHashtable(10)
for i in range(10):
s.add(str(i))
s.bitarray
[1, 1, 0, 1, 1, 0, 0, 1, 1, 1]
'1' in s # Membership query available
True
'105' in s
False
'22' in s # False positive
True
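With a single hash function and only 10 buckets, the false-positive probability is simply the fraction of set bits. A quick sanity check (an added illustration, not part of the original notebook):
# Expected fraction of set bits after n random insertions into m buckets
# is 1 - (1 - 1/m)**n; for n = m = 10 this gives ~0.65, consistent with
# the 7 set bits out of 10 observed above.
n, m = 10, 10
print 1 - (1 - 1.0 / m) ** n
0.6513215599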
How many hash functions should I use?
In this variant the optimal number of hashes for a specified error rate $P$ is $ k = \log_2 \left( \frac{1}{P} \right) $
If we want to store $n_{max}$ elements with an error rate $P$, the optimal number of bits per slice is: $ m = \frac{n_{max} \cdot |\ln(P)|}{k \cdot (\ln 2)^2} $
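Plugging in numbers (a worked example added here) for $P = 0.01$ and $n_{max} = 10^6$:
k = int(np.ceil(np.log2(1.0 / 0.01)))  # 7 slices
m = int(np.ceil((1E6 * abs(np.log(0.01))) / (k * (np.log(2) ** 2))))  # 1369295 bits per slice
print k * m / 8.0 / (1024 * 1024)  # total: ~9.6 Mbit, i.e. ~1.1 MB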
class BloomFilter(object):
""" Basic Bloom Filter """
def __init__(self, capacity, error_rate):
self.error_rate = error_rate
self.capacity = capacity
self.nbr_slices = int(np.ceil(np.log2(1.0 / error_rate)))
self.bits_per_slice = int(np.ceil((capacity * abs(np.log(error_rate))) / (self.nbr_slices * (np.log(2) ** 2))))
self.nbr_bits = self.nbr_slices * self.bits_per_slice
self.bitarray = np.zeros(self.nbr_bits, dtype=np.bool)
self.count = 0
self.hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
self.hashed_values = []
def __contains__(self, key):
""" Membership query """
self.hashed_values = self.hashes(key)
offset = 0
for value in self.hashed_values:
if not self.bitarray[offset + value]:
return False
offset += self.bits_per_slice
return True
def add(self, key):
""" Add an item to the set """
if key in self:
return True
offset = 0
if not self.hashed_values:
self.hashed_values = self.hashes(key)
for value in self.hashed_values:
self.bitarray[offset + value] = True
offset += self.bits_per_slice
self.count += 1
return False
bf = BloomFilter(1000, 0.1)
random_items = [str(r) for r in np.random.randn(20000)]
for item in random_items[:1000]:
bf.add(item)
false_positive = 0
for item in random_items[1000:20000]:
if item in bf:
false_positive += 1
print "Error rate (false positive): %s" % str(float(false_positive) / 19000)
Error rate (false positive): 0.105315789474
Good: constant, predictable memory for a given capacity and error rate; fast membership queries
Limitation: false positives, no deletion, no content retrieval; the error rate only holds up to the configured capacity
Trigger an action when you see an item for the first time in a given time period
Ex:
Two choices: store an explicit timestamp for each cell (e.g. 64 bits), or approximate it with a few bits (e.g. a 6-bit "timestamp"); the plots below compare the memory cost of both
def mem_vs_errors_and_capacity(error=0.01, capacity=10000, bit_mul=1):
    """ Memory (GB) of a Bloom filter using bit_mul bits per cell """
    nbr_slices = int(np.ceil(np.log2(1.0 / error)))
    bits_per_slice = int(np.ceil((capacity * abs(np.log(error))) / (nbr_slices * (np.log(2) ** 2))))
    mem = float((nbr_slices * bits_per_slice) / 8) / (1024 * 1024 * 1024) * bit_mul
    return mem
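For instance (a quick check added here), a plain filter sized for $10^9$ items at a 1% error rate already needs about 1.1 GB:
print mem_vs_errors_and_capacity(0.01, 1E9)  # ~1.1 GB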
errors = [0.01,0.02,0.03,0.05]
capacities = np.linspace(100,int(1E9))
mems = np.zeros((len(errors), len(capacities)))
for e,error in enumerate(errors):
for c,capacity in enumerate(capacities):
mems[e,c] = mem_vs_errors_and_capacity(error, capacity)
for e,error in enumerate(errors):
plt.plot(capacities,mems[e])
plt.grid(True)
plt.legend([str(item) for item in errors], loc=0)
plt.xlabel('Capacity')
plt.ylabel('Memory Consumption [GB]')
plt.title('Memory Consumption for a plain Bloom Filter')
for e,error in enumerate(errors):
for c,capacity in enumerate(capacities):
mems[e,c] = mem_vs_errors_and_capacity(error, capacity, 64)
for e,error in enumerate(errors):
plt.plot(capacities,mems[e])
plt.grid(True)
plt.legend([str(item) for item in errors], loc=0)
plt.xlabel('Capacity')
plt.ylabel('Memory Consumption [GB]')
plt.title('Memory Consumption for a Bloom Filter (with timestamp)')
for e,error in enumerate(errors):
for c,capacity in enumerate(capacities):
mems[e,c] = mem_vs_errors_and_capacity(error, capacity, 6)
for e,error in enumerate(errors):
plt.plot(capacities,mems[e])
plt.grid(True)
plt.legend([str(item) for item in errors], loc=0)
plt.xlabel('Capacity')
plt.ylabel('Memory Consumption [GB]')
plt.title('Memory Consumption for a Bloom Filter (6 bits "timestamp")')
- Refresh period *s*:
$$ s = \frac{t}{M \left( c - 1 + \frac{1}{z \cdot (k + 1)} \right)} $$
from pds import CountdownBloomFilter
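A direct transcription of the refresh-period formula above (an added helper; the argument names simply mirror the symbols in the formula, whose exact semantics come from the CountdownBloomFilter implementation):
def refresh_period(t, M, c, z, k):
    """ s = t / (M * (c - 1 + 1 / (z * (k + 1)))) """
    return t / (M * (c - 1 + 1.0 / (z * (k + 1))))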
from collections import Counter # <== the Count-Min Sketch is a probabilistic version of this
urls = Counter()
urls['foo'] += 2
urls['bar'] += 100
print urls
Counter({'bar': 100, 'foo': 2})
Count items in a multiset
Procedure: almost identical to a Counting Bloom Filter
Number of rows (hash functions): $ k = \lceil \ln \left( \frac{1}{\delta} \right) \rceil $
Counters per row: $ w = \lceil \frac{e}{\epsilon} \rceil $
Guarantee: $ \Pr[\hat{f}_i \le f_i + \epsilon m] \ge 1 - \delta $, with $m$ the total count in the stream
Works well for skewed distributions
Point query: estimated frequency of a single value ($\hat{f}_i$)
Range query ($\sum\limits_{i=1}^r{f_i}$): can be approximated by summing point queries (see the sketch after the demo below)
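With the parameters used in the demo below ($\delta = 0.1$, $\epsilon = 0.03$), a worked check of these formulas: $k = \lceil \ln(10) \rceil = 3$ rows and $w = \lceil e / 0.03 \rceil = 91$ counters per row, i.e. $3 \times 91$ int32 counters, about 1.1 kB.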
s = np.random.zipf(2.0, 10000)
skew_dist = ((s/float(max(s)))*1000).astype(np.int)
count, bins, ignored = plt.hist(skew_dist[skew_dist<50], 50)
plt.grid(True)
plt.show()
class CountMinSketch(object):
    """ Basic Count-Min Sketch """
    def __init__(self, delta, epsilon):
        self.bits_per_slice = int(np.ceil(np.exp(1) / epsilon))  # counters per row (w)
        self.nbr_slices = int(np.ceil(np.log(1 / delta)))  # number of rows (k)
        self.bitarray = np.zeros((self.nbr_slices, self.bits_per_slice), dtype=np.int32)
        self.make_hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
    def update(self, key, increment):
        """ Increment the counters of key in every row """
        for row, column in enumerate(self.make_hashes(key)):
            self.bitarray[int(row), int(column)] += increment
    def get(self, key):
        """ Point query: the smallest counter is the least-collided estimate """
        value = sys.maxint
        for row, column in enumerate(self.make_hashes(key)):
            value = min(self.bitarray[row, column], value)
        return value
import random
cms = CountMinSketch(1E-1, 0.03)
# Create a fake randomized stream with n occurrences of str(n)
stream = []
for i in range(100):
stream = stream + [str(i)] * i
random.shuffle(stream)
for s in stream:
    cms.update(s, 1)
cms.get('1')
1
cms.get('19')
19
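As noted above, a range query can be approximated naively by summing point estimates (a sketch added here; since each point estimate can only overestimate, the bias grows with the width of the range):
# Naive range query over [5, 9]; the exact answer is 5+6+7+8+9 = 35
print sum(cms.get(str(i)) for i in range(5, 10))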
Good: fixed, sublinear memory; never underestimates a count
Limitation: overestimates frequencies, especially for rare items; approximate counts only
Cool use case:
Ex:
Only cardinality, no membership, no content
=> Probabilistic counting (HyperLogLog)
The cardinality is observable in the bit pattern of the hashed values
import pandas as pd
ns = [50,200,1000,1E4,1E5,1E6]
def Rn(n, q):
    """ Typical longest-run length for n trials (q = 1 - p) """
    return int(round(np.log2(n * q)))
We can approximate the distribution of the longest run with an extreme value distribution:
def prob_longest(a, b, n, p):
    """ Approximate probability that the longest run length lies in [a, b] """
    r = Rn(n, 1 - p)
    cdf = lambda x: np.exp(-p ** x)
    return cdf(b + 1 - r) - cdf(a - r)
For example, for 1000 coin flips (n=1000, p=0.5 for a fair coin), the probability of having a longest run of 10 is 27%:
prob_longest(10,11,1000,0.5)
0.27596624287196203
ns = range(1,30)
ps_50 = pd.Series([prob_longest(ns[i],ns[i+1], 50, 0.5) for i in ns[:-2]])
ps_50.plot(kind='bar')
ps_200 = pd.Series([prob_longest(ns[i],ns[i+1], 200, 0.5) for i in ns[:-2]])
ps_200.plot(kind='bar')
ps_1m = pd.Series([prob_longest(ns[i],ns[i+1], 1E6, 0.5) for i in ns[:-2]])
ps_1m.plot(kind='bar')
The length of the longest run shifts towards larger values at a rate $\propto \log(n)$.
By looking at the longest run we can now estimate n simply using $n \approx (1/q) \cdot 2^{R_n}$.
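A quick numerical check of this estimator (added here):
print (1 / 0.5) * 2 ** Rn(1000, 0.5)  # estimate for n = 1000 fair-coin flips
1024.0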
The HyperLogLog: estimate the cardinality by counting the longest run of zeros in the hashed input stream
** Longest run **
** Bit-pattern $0^{\rho-1}1$ **
** Bit-pattern $0^{\rho-1}1$ with stochastic averaging over 32 registers **
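As a minimal sketch of that last variant (added here; it assumes a 32-bit slice of md5 as the hash and omits HyperLogLog's small- and large-range corrections):
import hashlib

def rho(w, width=27):
    """ 1-based position of the leftmost 1-bit in a width-bit word """
    return width - w.bit_length() + 1

m = 32  # number of registers (stochastic averaging)
registers = [0] * m
for i in range(100000):
    h = int(hashlib.md5(str(i)).hexdigest(), 16) & 0xFFFFFFFF  # 32-bit hash
    j = h & (m - 1)  # lowest 5 bits select a register
    w = h >> 5  # remaining 27 bits carry the 0^(rho-1)1 pattern
    registers[j] = max(registers[j], rho(w))

alpha = 0.697  # bias-correction constant for m = 32 registers
print alpha * m * m / sum(2.0 ** -r for r in registers)  # ~100000, within ~18% (1.04/sqrt(32))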