from lxml.html import document_fromstring import requests import pandas as pd from dateutil.parser import parse as dateparse %pylab inline def package_versions(package_name, name_pwd): """ Scrape the package's pkg_edit page for all uploaded versions package_name: official package name name_pwd: PyPI username, password tuple Returns a list of versions for the package """ pl = {':action': 'pkg_edit', 'name': package_name} r = requests.get('https://pypi.python.org/pypi', params=pl, auth=name_pwd) html = document_fromstring(r.content) rows = html.cssselect('table.list tr')[:-1] header = rows[0] data = rows[1:] version_index, _ = filter(lambda x: x[1].text_content().lower() == 'version', enumerate(header))[0] versions = map(lambda x: x[version_index].text_content(), data) return versions def package_data(package, version): """ Hit the official json feed for a package and version Return the package info and url data """ url = 'http://pypi.python.org/pypi/%s/%s/json' % (package, version) data = requests.get(url).json() metadata = data['info'] # PyPI publishes the version information as a list, we're interested in first url_info = data['urls'][0] return metadata, url_info def get_pypi_auth(): """ Try to read ~/.pypirc and return the (username, password) tuple """ from ConfigParser import ConfigParser import os cp = ConfigParser() cp.read(os.path.expanduser('~/.pypirc')) name = cp.get('pypi', 'username') passwd = cp.get('pypi', 'password') return name, passwd package = 'PyCap' versions = package_versions(package, get_pypi_auth()) data = [package_data(package, v)[1] for v in versions] df = pd.DataFrame(data, index=versions) # convert upload_time df['upload_time'] = df['upload_time'].map(dateparse) df['elapsed'] = df['upload_time'] - df['upload_time'].shift(-1) def to_days(x): days = -1 try: days = x.days except: pass return days df['elapsed'] = df['elapsed'].map(to_days) # compute downloads / day df['downloads / day'] = df['downloads'] / df['elapsed'] to_plot = df['downloads / day'][:-1] mpl.rc('figure', figsize=(10, 8)) to_plot.plot(kind='bar') plt.ylabel('Downloads Per Day') plt.xlabel('Version') plt.title('Downloads/day per version for %s' % package) plt.show()