# Probabilistic data structures walkthrough (notebook-style script):
# exact Python sets, a one-hash "lightweight hashtable", Bloom filters,
# Count-Min sketches, and longest-run-of-heads probabilities.

import os
import random
import sys
from collections import Counter  # <== Probabilistic version of this is the Count-Min sketch

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from hashfunctions import generate_hashfunctions
from pds import CountdownBloomFilter


# --- Plain Python sets: exact membership, cardinality, set algebra ---------
known_urls = set()
known_urls.add('http://www.google.com')
known_urls.add('http://www.pydata.org')

known_urls2 = set()
known_urls2.add('http://www.python.org')

'http://www.pydata.org' in known_urls   # Membership
print(len(known_urls))                  # Cardinality

# FIX: `or`/`and` on sets are boolean short-circuit operators (they return
# one of the operands unchanged), NOT union/intersection. Use `|` and `&`.
'http://www.pydata.org' in (known_urls | known_urls2)   # Union
'http://www.pydata.org' in (known_urls & known_urls2)   # Intersection

print(known_urls)                           # Retrieve the actual content
known_urls.discard('http://www.google.com') # Delete an element in the set
print(known_urls)


# --- Memory cost of an exact set of URLs -----------------------------------
capacities = np.linspace(100, int(1E9))
typical_url = 'http://arstechnica.com/science/2013/10/new-type-of-quantum-excitation-behaves-like-a-solitary-particle/'
memory_footprint = len(typical_url) * 8     # bits for one URL (8 bits/char)
memory_scaling = [memory_footprint * c for c in capacities]

# Convert bits -> bytes -> GB for the plot.
plt.plot(capacities, [(n / 8) / (1024 * 1024 * 1024) for n in memory_scaling])
plt.grid()
plt.title('Memory consumption for a set')
plt.xlabel('Nbr. of items')
plt.ylabel('Memory consumption in GB')


class LightweightHashtable(object):
    """ Almost-correct hashtable for membership query only

    A single hash function mapping into a fixed bit array: membership
    queries can yield false positives (colliding keys) but never false
    negatives. This is a Bloom filter with one hash function.
    """

    def __init__(self, nbr_buckets=10):
        # One bit per bucket; generate_hashfunctions(m, k) builds k hash
        # functions with range [0, m).
        self.bitarray = [0] * nbr_buckets
        self.hashfunction = generate_hashfunctions(nbr_buckets, 1)

    def add(self, element):
        """ Set the single bit this element hashes to """
        self.bitarray[self.hashfunction(element)[0]] = 1

    def __contains__(self, element):
        """ Membership query """
        return self.bitarray[self.hashfunction(element)[0]] == 1


s = LightweightHashtable(10)
for i in range(10):
    s.add(str(i))
s.bitarray
'1' in s     # Membership query available
'105' in s
'22' in s    # False positive


class BloomFilter(object):
    """ Basic Bloom Filter

    Partitioned variant: the bit array is split into `nbr_slices` slices
    of `bits_per_slice` bits, one hash function per slice. Sizing follows
    the standard formulas for a target capacity and false-positive rate.
    """

    def __init__(self, capacity, error_rate):
        self.error_rate = error_rate
        self.capacity = capacity
        # k = ceil(log2(1/p)) hash functions; m/k bits per slice.
        self.nbr_slices = int(np.ceil(np.log2(1.0 / error_rate)))
        self.bits_per_slice = int(np.ceil(
            (capacity * abs(np.log(error_rate)))
            / (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        # FIX: np.bool was a deprecated alias of the builtin and has been
        # removed from NumPy; use plain `bool`.
        self.bitarray = np.zeros(self.nbr_bits, dtype=bool)
        self.count = 0
        self.hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
        self.hashed_values = []

    def __contains__(self, key):
        """ Membership query """
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key):
        """ Add an item to the set

        Returns True if the key was (probably) already present,
        False if it was inserted.
        """
        if key in self:
            return True
        offset = 0
        # `key in self` above already populated self.hashed_values.
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False


# Empirically check the false-positive rate: insert 1000 items, probe
# with 19000 items that were never inserted.
bf = BloomFilter(1000, 0.1)
random_items = [str(r) for r in np.random.randn(20000)]
for item in random_items[:1000]:
    bf.add(item)

false_positive = 0
for item in random_items[1000:20000]:
    if item in bf:
        false_positive += 1

print("Error rate (false positive): %s" % str(float(false_positive) / 19000))


def mem_vs_errors_and_capacity(error=0.01, capacity=10000, bit_mul=1):
    """Memory (GB) of a Bloom filter sized for `capacity` items at rate `error`.

    `bit_mul` scales the per-cell cost (e.g. 64 for a 64-bit timestamp
    per cell instead of a single bit).
    """
    nbr_slices = int(np.ceil(np.log2(1.0 / error)))
    bits_per_slice = int(np.ceil(
        (capacity * abs(np.log(error))) / (nbr_slices * (np.log(2) ** 2))))
    mem = float((nbr_slices * bits_per_slice) / 8) / (1024 * 1024 * 1024) * bit_mul
    return mem


errors = [0.01, 0.02, 0.03, 0.05]
capacities = np.linspace(100, int(1E9))
mems = np.zeros((len(errors), len(capacities)))

# Plain Bloom filter (1 bit per cell).
for e, error in enumerate(errors):
    for c, capacity in enumerate(capacities):
        mems[e, c] = mem_vs_errors_and_capacity(error, capacity)
for e, error in enumerate(errors):
    plt.plot(capacities, mems[e])
plt.grid(True)
plt.legend([str(item) for item in errors], loc=0)
plt.xlabel('Capacity')
plt.ylabel('Memory Consumtion [Gb]')
plt.title('Memory Consumption for a plain Bloom Filter')

# Bloom filter with a full 64-bit timestamp per cell.
for e, error in enumerate(errors):
    for c, capacity in enumerate(errors and capacities):
        mems[e, c] = mem_vs_errors_and_capacity(error, capacity, 64)
for e, error in enumerate(errors):
    plt.plot(capacities, mems[e])
plt.grid(True)
plt.legend([str(item) for item in errors], loc=0)
plt.xlabel('Capacity')
plt.ylabel('Memory Consumtion [Gb]')
plt.title('Memory Consumption for a Bloom Filter (with timestamp)')

# Bloom filter with a compact 6-bit "timestamp" per cell.
for e, error in enumerate(errors):
    for c, capacity in enumerate(capacities):
        mems[e, c] = mem_vs_errors_and_capacity(error, capacity, 6)
for e, error in enumerate(errors):
    plt.plot(capacities, mems[e])
plt.grid(True)
plt.legend([str(item) for item in errors], loc=0)
plt.xlabel('Capacity')
plt.ylabel('Memory Consumtion [Gb]')
plt.title('Memory Consumption for a Bloom Filter (6 bits "timestamp")')


# --- Counting: exact Counter vs Count-Min sketch ---------------------------
urls = Counter()
urls['foo'] += 2
urls['bar'] += 100
print(urls)

# A skewed (Zipfian) distribution, typical of URL/word frequencies.
s = np.random.zipf(2.0, 10000)
# FIX: np.int was a removed alias of the builtin; use plain `int`.
skew_dist = ((s / float(max(s))) * 1000).astype(int)
count, bins, ignored = plt.hist(skew_dist[skew_dist < 50], 50)
plt.grid(True)
plt.show()


class CountMinSketch(object):
    """ Basic Count-Min Sketch

    A (nbr_slices x bits_per_slice) grid of counters, one hash function
    per row. Point queries return the minimum counter across rows, an
    upper bound on the true count (error <= epsilon*N with prob 1-delta).
    """

    def __init__(self, delta, epsilon):
        self.bits_per_slice = int(np.ceil(np.exp(1) / epsilon))
        self.nbr_slices = int(np.ceil(np.log(1 / delta)))
        self.bitarray = np.zeros((self.nbr_slices, self.bits_per_slice), dtype=np.int32)
        self.make_hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)

    def update(self, key, increment):
        """ Add `increment` to this key's counter in every row """
        for row, column in enumerate(self.make_hashes(key)):
            self.bitarray[int(row), int(column)] += increment

    def get(self, key):
        """ Estimated count: minimum over rows (never underestimates) """
        # FIX: sys.maxint does not exist in Python 3; take the min directly.
        return min(self.bitarray[int(row), int(column)]
                   for row, column in enumerate(self.make_hashes(key)))


cms = CountMinSketch(1E-1, 0.03)
# Create a fake randomized stream with n occurrences of str(n).
# FIX: `stream = stream + [...]` copies the whole list each iteration
# (O(n^2)); extend in place instead.
stream = []
for i in range(100):
    stream.extend([str(i)] * i)
random.shuffle(stream)
for item in stream:
    cms.update(item, 1)
cms.get('1')
cms.get('19')


# --- Longest run of successes in n Bernoulli(p) trials ----------------------
ns = [50, 200, 1000, 1E4, 1E5, 1E6]


def Rn(n, q):
    """Location parameter log2(n*q) of the longest-run distribution."""
    return int(round(np.log2(n * q)))


def prob_longest(a, b, n, p):
    """P(a <= longest success run <= b) in n Bernoulli(p) trials.

    Uses the extreme-value (Gumbel-type) approximation
    P(L < x) ~= exp(-p**(x - Rn)).
    """
    r = Rn(n, 1 - p)
    cdf = lambda x: np.exp(-p ** x)
    return cdf(b + 1 - r) - cdf(a - r)


prob_longest(10, 11, 1000, 0.5)

ns = range(1, 30)
ps_50 = pd.Series([prob_longest(ns[i], ns[i + 1], 50, 0.5) for i in ns[:-2]])
ps_50.plot(kind='bar')
ps_200 = pd.Series([prob_longest(ns[i], ns[i + 1], 200, 0.5) for i in ns[:-2]])
ps_200.plot(kind='bar')
ps_100k = pd.Series([prob_longest(ns[i], ns[i + 1], 1E6, 0.5) for i in ns[:-2]])
ps_100k.plot(kind='bar')