from lxml import etree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Data source: Stack Exchange data dump, https://archive.org/details/stackexchange
# Module-level flag, off by default. No code visible in this file reads it;
# presumably it controls re-downloading the data dump -- TODO confirm.
update_from_web = False
def get_rank(s):
    """Load one site's user reputations and rank them in descending order.

    Parses ``Users-<s>.xml`` (a path relative to the working directory),
    reads the ``Reputation`` attribute of every ``row`` element and sorts
    the values from highest to lowest.

    Args:
        s: Site name interpolated into the file name, e.g.
            ``'superuser.com'`` reads ``Users-superuser.com.xml``.

    Returns:
        A ``(rank, reputation)`` tuple of equal-length NumPy arrays.
        ``rank`` is ``1, 2, ..., n``; ``reputation`` holds the values as
        floats in descending order, so ``reputation[i]`` is the value at
        rank ``rank[i]``. Both arrays are empty when the file has no
        ``row`` elements.

    Raises:
        TypeError: If a ``row`` element has no ``Reputation`` attribute
            (``float(None)``).
        ValueError: If a ``Reputation`` attribute is not a valid number.
    """
    tree = etree.parse('Users-%s.xml' % s)

    # iter() is the supported replacement for getiterator(), which lxml
    # deprecated and the standard-library ElementTree has removed.
    reputation = np.array(
        [float(row.get('Reputation')) for row in tree.iter(tag='row')],
        dtype=float,
    )

    # Ascending sort followed by a reversal gives descending order.
    reputation = np.sort(reputation)[::-1]

    # Ranks are 1-based so they can be drawn on a logarithmic axis.
    rank = np.arange(1, len(reputation) + 1)
    return rank, reputation
def plot(rank, rep, site):
    """Show a log-log plot of reputation against rank for one site.

    Args:
        rank: Sequence of x values (ranks).
        rep: Sequence of y values (reputations), same length as ``rank``.
        site: Site name interpolated into the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    figure, axes = plt.subplots()
    figure.set_size_inches(8, 8)

    # Both axes use a logarithmic scale; the labels are set in the same call.
    axes.set(
        xscale='log',
        yscale='log',
        xlabel='log rank',
        ylabel='log reputation',
    )

    # '-.b' is matplotlib shorthand for a blue dash-dot line.
    axes.plot(rank, rep, '-.b')
    axes.title.set_text('Reputation distribution on %s' % site)

    plt.show()
# Sites to analyse; each name must match a local ``Users-<site>.xml`` file.
sites = [
    'superuser.com',
    'askubuntu.com',
    'math.stackexchange.com',
    'serverfault.com',
]

# Produce one rank/reputation figure per site, in the order listed above.
for site in sites:
    rank, rep = get_rank(site)
    plot(rank, rep, site)