nbuckets = 256  # How many hash buckets the user names are spread over
# One integer tally per bucket, all starting at zero.
counters = np.zeros(nbuckets, dtype=int)
def hash_index(s, nbuckets=nbuckets):
    """Map the string ``s`` to a bucket number in ``0..nbuckets-1``.

    The bucket is the built-in ``hash`` of ``s`` reduced modulo
    ``nbuckets``; with a positive ``nbuckets`` Python's ``%`` never
    yields a negative result, so the value is always a valid index.

    NOTE: on Python 3 the built-in string hash is salted per process
    (unless PYTHONHASHSEED is fixed), so bucket numbers are only
    comparable within a single run.
    """
    bucket = hash(s) % nbuckets
    return bucket
# Tally how many user names from the file land in each hash bucket.
with open('users.txt') as users_file:
    for raw_line in users_file:
        # Each line holds one user name; drop surrounding whitespace/newline.
        bucket = hash_index(raw_line.strip())
        counters[bucket] += 1
# Plot the distribution over buckets, with the mean count as a reference line
xs = np.arange(nbuckets)
mean_count = counters.mean()  # computed once, used for both ends of the line
plot(xs, counters, marker='o')
# The reference line spans the bucket index range 0..nbuckets-1, i.e. the
# same x-axis as `xs` (the undefined name `size` was used here before).
plot([0, nbuckets - 1], [mean_count, mean_count], color='red', label='mean')
grid()
xlim(0, nbuckets)
legend(frameon=False)
<matplotlib.legend.Legend at 0x51746d0>
np.std(counters)  # Standard deviation of the per-bucket counts
51.444682958135587
# Plot density: kernel density estimate of the per-bucket counts
try:
    # Current statsmodels exposes the univariate estimator under this name.
    from statsmodels.nonparametric.kde import KDEUnivariate as KDE
except ImportError:
    # Fall back to the legacy import used by very old statsmodels releases.
    from statsmodels.nonparametric import KDE
# The estimator is fed floating-point data, so cast the integer counts first.
kde = KDE(counters.astype(np.double))
kde.fit()  # populates kde.support (x grid) and kde.density (estimated pdf)
fill(kde.support, kde.density)
[<matplotlib.patches.Polygon at 0x530af50>]