#!/usr/bin/env python
# coding: utf-8

# In[1]:

from lxml import etree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# `get_ipython` only exists inside an IPython/Jupyter session. Guard the call
# so this exported notebook also runs as a plain Python script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Data from:
# https://archive.org/details/stackexchange


# In[2]:

update_from_web = False


# In[3]:

def get_rank(s):
    """Return the rank/reputation series for one site's user dump.

    Parses the file ``Users-<s>.xml`` (path relative to the current working
    directory), reads the ``Reputation`` attribute of every ``row`` element
    and orders the values from highest to lowest.

    Parameters
    ----------
    s : str
        Site name interpolated into the file name ``Users-%s.xml``.

    Returns
    -------
    rank : numpy.ndarray
        Integers ``1 .. N`` where ``N`` is the number of ``row`` elements.
    a : numpy.ndarray
        Reputation values as floats, sorted in descending order, so that
        ``a[i]`` is the reputation of the user with rank ``rank[i]``.

    Raises
    ------
    TypeError
        If a ``row`` element has no ``Reputation`` attribute
        (``float(None)``).
    ValueError
        If a ``Reputation`` attribute is not parseable as a float.
    """
    # Read dataset
    tree = etree.parse('Users-%s.xml' % s)

    # `iter()` replaces the deprecated `getiterator()`. Streaming the values
    # straight into the array avoids building an intermediate Python list.
    reputation = np.fromiter(
        (float(row.get('Reputation')) for row in tree.iter('row')),
        dtype=float,
    )

    # Frequency analysis by ranking of data: sort in descending order.
    a = np.sort(reputation)[::-1]

    # Rank 1 is the highest reputation.
    rank = np.arange(1, len(a) + 1)
    return rank, a


def plot(rank, rep, site):
    """Show a log-log plot of reputation against rank for one site.

    Parameters
    ----------
    rank : array-like
        X values (ranks), as returned by ``get_rank``.
    rep : array-like
        Y values (reputations), as returned by ``get_rank``.
    site : str
        Site name used in the figure title.

    The figure is 8x8 inches, both axes use a logarithmic scale, and the data
    is drawn as a blue dash-dot line. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('log rank')
    ax.set_ylabel('log reputation')
    ax.plot(rank, rep, '-.b')
    ax.set_title('Reputation distribution on %s' % site)
    plt.show()


# In[4]:

sites = [
    'superuser.com',
    'askubuntu.com',
    'math.stackexchange.com',
    'serverfault.com',
]

# One figure per site; each needs its `Users-<site>.xml` dump to be present.
for site in sites:
    rank, rep = get_rank(site)
    plot(rank, rep, site)


# In[ ]: