In [52]:

nbuckets = 2 ** 8  # Total number of hash buckets (256)

In [53]:

# One integer counter per bucket, all starting at zero. np.zeros allocates the
# array directly (no throwaway Python list) and keeps an integer dtype even
# when nbuckets is 0.
counters = np.zeros(nbuckets, dtype=int)

In [54]:

def hash_index(s, nbuckets=nbuckets):
    '''Hash `s` to an integer bucket index in the range 0..nbuckets-1.

    The index is the built-in hash() of `s` reduced modulo `nbuckets`.
    The default `nbuckets` is the value the module-level `nbuckets` had
    when this cell was executed.

    NOTE: on Python 3, str hashes are salted per process unless
    PYTHONHASHSEED is fixed, so bucket assignments for strings may differ
    between kernel sessions.
    '''
    digest = hash(s)
    return digest % nbuckets

In [55]:

# Populate the counts: each line of users.txt is stripped of surrounding
# whitespace and the counter of the bucket it hashes to is incremented.
counters[:] = 0  # Reset first so re-running this cell does not double-count
with open('users.txt') as fo:
    for line in fo:
        counters[hash_index(line.strip())] += 1

In [56]:

# Plot the distribution over buckets
xs = np.arange(nbuckets)
mean_count = counters.mean()  # Computed once; drawn as the horizontal reference line
plot(xs, counters, marker='o')
# Span the mean line over the same bucket range as xs (0 .. nbuckets-1)
plot([0, nbuckets - 1], [mean_count, mean_count], color='red', label='mean')
grid()
xlim(0, nbuckets)
legend(frameon=False)

Out[56]:

<matplotlib.legend.Legend at 0x51746d0>

In [57]:

np.std(counters)  # Standard deviation of the per-bucket counts

Out[57]:

51.444682958135587

In [58]:

# Plot density
# Kernel density estimate of the per-bucket counts, drawn as a filled curve.
try:
    # Current statsmodels name for the univariate estimator
    from statsmodels.nonparametric.kde import KDEUnivariate as KDE
except ImportError:
    # Older statsmodels releases that only expose the original KDE class
    from statsmodels.nonparametric import KDE
kde = KDE(counters.astype(np.double))  # Cast the counts to float before fitting
kde.fit()
fill(kde.support, kde.density)

Out[58]:

[<matplotlib.patches.Polygon at 0x530af50>]

In [58]: