Using lxml and pandas to examine PyPI package stats

A post written for PyNash describes this notebook in more detail.

In [14]:
from lxml.html import document_fromstring
import requests
import pandas as pd
from dateutil.parser import parse as dateparse
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.
In [15]:
def package_versions(package_name, name_pwd):
    """Scrape the package's pkg_edit page for all uploaded versions

    package_name: official package name
    name_pwd: PyPI username, password tuple

    Returns a list of versions for the package, in the order the rows
    appear on the page.

    Raises IndexError when the page has no release table or the table has
    no 'version' column (e.g. after a failed login).
    """
    pl = {':action': 'pkg_edit', 'name': package_name}
    # NOTE(review): the URL literal was blank in this copy of the notebook;
    # restored to the legacy PyPI index endpoint that served ':action'
    # queries -- confirm it is still the intended host.
    r = requests.get('https://pypi.python.org/pypi', params=pl, auth=name_pwd)
    html = document_fromstring(r.content)
    # The final row of the table is not a release row, so it is dropped.
    rows = html.cssselect('table.list tr')[:-1]
    if not rows:
        raise IndexError("no 'table.list' rows found for %s" % package_name)
    header = rows[0]
    data = rows[1:]
    # Locate the column whose header cell reads 'version' (case-insensitive).
    # Built as a list so this works identically on Python 2 and Python 3,
    # where filter()/map() return iterators that cannot be indexed.
    matches = [i for i, cell in enumerate(header)
               if cell.text_content().lower() == 'version']
    if not matches:
        raise IndexError("no 'version' column found for %s" % package_name)
    version_index = matches[0]
    return [row[version_index].text_content() for row in data]

def package_data(package, version):
    """Hit the official json feed for a package and version

    package: package name as used in the PyPI URL
    version: release version string

    Return the package info and url data as a (metadata, url_info) tuple,
    where metadata is the feed's 'info' entry and url_info is the first
    entry of its 'urls' list.

    Raises IndexError if the release has no entries under 'urls'.
    """
    # NOTE(review): the URL template was blank in this copy of the notebook
    # (so the '%' formatting raised TypeError); restored to the per-release
    # PyPI JSON API path -- confirm the host is the intended one.
    url = 'https://pypi.org/pypi/%s/%s/json' % (package, version)
    data = requests.get(url).json()
    metadata = data['info']
    # PyPI publishes the version information as a list, we're interested in first
    url_info = data['urls'][0]
    return metadata, url_info

def get_pypi_auth():
    """Try to read ~/.pypirc and return the (username, password) tuple

    Reads the 'username' and 'password' options from the [pypi] section.

    Raises the config parser's NoSectionError / NoOptionError when the file
    is missing (a missing file is silently skipped by read()) or does not
    define those options.
    """
    # The module was renamed between Python 2 and Python 3; accept either.
    try:
        from configparser import ConfigParser
    except ImportError:
        from ConfigParser import ConfigParser
    import os
    cp = ConfigParser()
    cp.read(os.path.expanduser('~/.pypirc'))
    # raw=True: credentials are literal text, so a '%' in a password must not
    # be treated as an interpolation marker.
    name = cp.get('pypi', 'username', raw=True)
    passwd = cp.get('pypi', 'password', raw=True)
    return name, passwd

Putting it all together...

In [16]:
package = 'PyCap'
versions = package_versions(package, get_pypi_auth())
# package_data returns (metadata, url_info); only url_info feeds the frame
data = []
for v in versions:
    data.append(package_data(package, v)[1])

# One row per version, labelled by its version string
df = pd.DataFrame(data, index=versions)
# Parse the upload_time values with dateutil
upload_times = df['upload_time'].map(dateparse)
df['upload_time'] = upload_times
# Gap between each row's upload and the following row's upload;
# the last row has no following row, so its gap is missing
df['elapsed'] = upload_times - upload_times.shift(-1)

def to_days(x):
    """Return the whole-day component of an elapsed-time value.

    x: a timedelta-like object exposing a ``days`` attribute, or a missing
       value (None / NaN / NaT)

    Returns x.days, or -1 as a sentinel when x is missing or has no
    ``days`` attribute.
    """
    # NOTE(review): the guarding condition was lost from this copy of the
    # notebook (the body was left with a stray indented assignment); it is
    # reconstructed here as a missing-value check -- confirm against the
    # original if available.
    if pd.isnull(x):
        return -1
    return getattr(x, 'days', -1)

# Reduce each elapsed value to a day count via to_days
elapsed_days = df['elapsed'].map(to_days)
df['elapsed'] = elapsed_days
# downloads / day = downloads divided by the elapsed day count
df['downloads / day'] = df['downloads'] / elapsed_days
In [17]:
# Select the downloads-per-day column, leaving out the final row
to_plot = df['downloads / day'][:-1]
# Set the default figure size to 10 x 8 before any figure is created
mpl.rc('figure', figsize=(10, 8))
# Label the y axis and title the chart with the package name
plt.ylabel('Downloads Per Day')
plt.title('Downloads/day per version for %s' % package)
# NOTE(review): to_plot is not drawn in the lines visible here -- confirm
# that a plotting call follows in this cell.