Both the outlink and inlink distributions have the vast majority of their mass near zero and decay very rapidly.

In [58]:
import numpy as np
import statsmodels.api as sm
from numpy import array
from pandas import Series
from scipy.io import loadmat
In [2]:
# Read the variable named 'A' out of the MATLAB file, then cast its entries
# to 32-bit integers.
# NOTE(review): presumably a (sparse) link/adjacency matrix -- confirm against the data source.
A = loadmat('A.mat')['A']
A = A.astype('int32')
In [26]:
# Per-row and per-column totals of A as flat 1-D Series.
# array(...).ravel() flattens the sum whether it comes back as an (n, 1) /
# (1, n) matrix or as a plain 1-D array, whereas indexing [0] after
# .flatten() is only correct for the 2-D matrix case.
outlinks = Series(array(A.sum(axis=1)).ravel())  # row sums
inlinks = Series(array(A.sum(axis=0)).ravel())   # column sums
In [22]:
# Summary statistics (count, mean, std, quartiles, extremes) of the outlink counts.
outlinks.describe()
Out[22]:
count    185314.000000
mean         13.126801
std          46.984216
min           0.000000
25%           1.000000
50%           1.000000
75%           3.000000
max        2799.000000
dtype: float64
In [57]:
# Histogram of the raw outlink counts in 20 bins; the trailing ';' hides the
# returned object's repr in the notebook output.
outlinks.hist(bins=20);
In [42]:
# Histogram of log(outlinks + 1) with a log-scaled count axis; the +1 keeps
# zero counts finite. np.log is named explicitly so the cell does not depend
# on a bare `log` having been injected into the namespace (e.g. by %pylab).
np.log(outlinks + 1).hist(bins=20, log=True)
In [54]:
# Normalized cumulative histogram of the outlink counts, drawn as a step outline.
# `density=True` replaces the `normed=True` keyword, which current matplotlib
# no longer accepts.
ax = outlinks.hist(density=True, cumulative=True, bins=30, histtype='step')
# Zoom the y-axis onto the upper tail. Uses the Axes returned by hist()
# instead of a bare `ylim`, which only exists under %pylab.
ax.set_ylim(.94, 1)
Out[54]:
(0.94, 1)
In [62]:
# Empirical CDF of the outlink counts.
ecdf = sm.distributions.ECDF(outlinks)

# Evaluate the ECDF on an evenly spaced grid over the observed range
# (np.linspace defaults to 50 points). Series.min()/.max() are vectorized,
# unlike the Python builtins, which iterate element by element.
x = np.linspace(outlinks.min(), outlinks.max())
y = ecdf(x)

# Step plot against log(x + 1), drawn through pandas so that no bare pylab
# names (step, log, ylim) are required. 'steps-pre' matches the default
# draw style of pyplot.step.
ax = Series(y, index=np.log(x + 1)).plot(drawstyle='steps-pre')
ax.set_xlabel('log(outlinks + 1)')
ax.set_ylabel('ECDF')
ax.set_ylim(.9, 1);

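The inlink distribution tells the same story, but is even more extreme: the median is 0 and the maximum is 46,769 (versus a median of 1 and a maximum of 2,799 for outlinks).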

In [29]:
# Summary statistics (count, mean, std, quartiles, extremes) of the inlink counts.
inlinks.describe()
Out[29]:
count    185314.000000
mean         13.126801
std         138.280052
min           0.000000
25%           0.000000
50%           0.000000
75%           2.000000
max       46769.000000
dtype: float64
In [59]:
# Histogram of the raw inlink counts in 20 bins.
inlinks.hist(bins=20)
Out[59]:
<matplotlib.axes.AxesSubplot at 0x1187fa890>
In [40]:
# Histogram of log(inlinks + 1) with a log-scaled count axis; the +1 keeps
# zero counts finite. np.log is named explicitly so the cell does not depend
# on a bare `log` having been injected into the namespace (e.g. by %pylab).
np.log(inlinks + 1).hist(bins=20, log=True)
Out[40]:
<matplotlib.axes.AxesSubplot at 0x116f0f110>
In [65]:
# Empirical CDF of the inlink counts.
ecdf = sm.distributions.ECDF(inlinks)

# Evaluate the ECDF on an evenly spaced grid over the observed range
# (np.linspace defaults to 50 points). Series.min()/.max() are vectorized,
# unlike the Python builtins, which iterate element by element.
x = np.linspace(inlinks.min(), inlinks.max())
y = ecdf(x)

# Step plot against log(x + 1), drawn through pandas so that no bare pylab
# names (step, log, ylim) are required. 'steps-pre' matches the default
# draw style of pyplot.step.
ax = Series(y, index=np.log(x + 1)).plot(drawstyle='steps-pre')
ax.set_xlabel('log(inlinks + 1)')
ax.set_ylabel('ECDF')
ax.set_ylim(.95, 1);