from lxml.html import document_fromstring
import requests
import pandas as pd
from dateutil.parser import parse as dateparse
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
def package_versions(package_name, name_pwd):
    """
    Scrape the package's pkg_edit page for all uploaded versions.

    package_name: official package name
    name_pwd: PyPI (username, password) tuple, handed to requests as the
        ``auth`` argument

    Returns a list of the text found in the 'Version' column, one entry
    per table row, in page order.

    Raises requests.HTTPError when PyPI answers with an error status, and
    IndexError when the page has no usable table or no 'Version' column.
    """
    pl = {':action': 'pkg_edit', 'name': package_name}
    r = requests.get('https://pypi.python.org/pypi', params=pl, auth=name_pwd)
    # Surface auth/404 failures here rather than scraping an error page.
    r.raise_for_status()
    html = document_fromstring(r.content)
    # The final <tr> is discarded -- presumably a non-release footer row
    # (TODO confirm against the live page).
    rows = html.cssselect('table.list tr')[:-1]
    if not rows:
        raise IndexError(
            'no release table found on the pkg_edit page for %r' % package_name)
    header, data = rows[0], rows[1:]
    # Locate the column whose header cell reads 'version' (case-insensitive).
    # next() works on both Python 2 and 3, unlike indexing filter()'s result.
    version_index = next(
        (i for i, cell in enumerate(header)
         if cell.text_content().lower() == 'version'),
        None)
    if version_index is None:
        raise IndexError(
            "no 'Version' column on the pkg_edit page for %r" % package_name)
    # Build a real list so callers can iterate the result more than once.
    return [row[version_index].text_content() for row in data]
def package_data(package, version):
    """
    Hit the official json feed for a package and version.

    package: package name as used in the PyPI URL
    version: version string as used in the PyPI URL

    Returns a (metadata, url_info) tuple: the feed's 'info' entry and the
    first element of its 'urls' list.

    Raises requests.HTTPError when PyPI answers with an error status, and
    IndexError when the feed lists no uploaded files for this version.
    """
    # https, matching the scheme used for the pkg_edit request.
    url = 'https://pypi.python.org/pypi/%s/%s/json' % (package, version)
    response = requests.get(url)
    # Fail on 4xx/5xx before attempting to decode the body as JSON.
    response.raise_for_status()
    data = response.json()
    metadata = data['info']
    # PyPI publishes the version information as a list, we're interested in first
    urls = data['urls']
    if not urls:
        raise IndexError(
            'no uploaded files listed for %s %s' % (package, version))
    return metadata, urls[0]
def get_pypi_auth():
    """
    Try to read ~/.pypirc and return the (username, password) tuple.

    Both values come from the [pypi] section of the file.

    Raises the config parser's NoSectionError / NoOptionError when the
    file is missing or lacks the [pypi] section or either option
    (a missing file is silently skipped by ``read``).
    """
    import os
    try:
        from ConfigParser import RawConfigParser  # Python 2 module name
    except ImportError:
        from configparser import RawConfigParser  # Python 3 module name
    # RawConfigParser performs no '%' interpolation, so credentials that
    # contain a literal '%' are returned as written.
    cp = RawConfigParser()
    cp.read(os.path.expanduser('~/.pypirc'))
    name = cp.get('pypi', 'username')
    passwd = cp.get('pypi', 'password')
    return name, passwd
Putting it all together...
# Collect the upload record of every released version of one package.
package = 'PyCap'
versions = package_versions(package, get_pypi_auth())
data = []
for v in versions:
    # package_data returns (metadata, url_info); only url_info is tabulated
    data.append(package_data(package, v)[1])
df = pd.DataFrame(data, index=versions)
# Parse each upload_time value with dateutil
df['upload_time'] = df['upload_time'].map(dateparse)
# Gap between each row's upload and the upload in the row below it
df['elapsed'] = (
    df['upload_time']
    - df['upload_time'].shift(-1)
)
def to_days(x):
    """
    Return the ``days`` attribute of a timedelta-like value.

    x: any object; typically a timedelta

    Returns x.days when the attribute exists, otherwise the sentinel -1.
    """
    # getattr's default only covers a missing attribute; unlike the former
    # bare ``except``, unrelated errors (and KeyboardInterrupt) propagate.
    return getattr(x, 'days', -1)
# Reduce every elapsed interval to its day count
df['elapsed'] = df['elapsed'].map(to_days)
# Downloads divided by the number of days in each version's interval
df['downloads / day'] = df['downloads'] / df['elapsed']
# Leave out the final row, which has no row below it to measure against
to_plot = df['downloads / day'][:-1]

# Draw one bar per version on a 10x8 figure
mpl.rcParams['figure.figsize'] = (10, 8)
to_plot.plot(kind='bar')
plt.title('Downloads/day per version for %s' % package)
plt.xlabel('Version')
plt.ylabel('Downloads Per Day')
plt.show()