#!/usr/bin/env python
# coding: utf-8

# # Estimate of Jupyter Notebooks on GitHub
# 
# This notebook shows the historical count and a future estimate of the number of `*.ipynb` files on GitHub. The daily count comes from executing the query [extension:ipynb nbformat_minor](https://github.com/search?utf8=%E2%9C%93&q=extension%3Aipynb+nbformat_minor) once a day, on most days. We re-render and publish this notebook daily after the update.

# ## Assumptions
# 
# 1. That the search query hits and notebooks on GitHub are in 1:1 correspondence.
# 1. That GitHub is accurately reporting the total number of `*.ipynb` file hits.
# 1. That the result is **not** inflated due to GitHub forks.
#     * Evidence: We do not see the tutorial notebooks from the ipython/ipython GitHub repository duplicated in the search results despite the 2,000+ forks of the ipython/ipython repo.
# 1. That the result **is** inflated a tiny bit by manually created duplicates of notebooks.
#     * Evidence: Some people seem to download their favorite notebooks and then upload them into their own git repositories for safekeeping.

# ## Prerequisites
# 
# 1. The `jupyter/datascience-notebook:9c0c4a1fc008` Docker image
# 2. `beautifulsoup4==4.4.1`, which is installed in the next cell

# In[1]:

get_ipython().system("pip install 'beautifulsoup4==4.4.*' > /dev/null")

# In[2]:

import warnings
warnings.simplefilter('ignore')

# In[3]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[4]:

# Python 2 compatibility: the __future__ import must precede the other statements in this cell
from __future__ import division

import time

import requests
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm
from bs4 import BeautifulSoup

# In[5]:

mpl.style.use('ggplot')
figsize = (14, 7)

# In[6]:

today = time.strftime("%Y-%m-%d")
print('This notebook was last rendered on {}.'.format(today))

# ## Raw Hits
# 
# First, let's load the historical data into a DataFrame indexed by date.

# In[7]:

hits_df = pd.read_csv('ipynb_counts.csv', index_col=0, header=0, parse_dates=True)
hits_df.reset_index(inplace=True)
# keep a single sample per date ('cols' is the older pandas spelling of what newer versions call 'subset')
hits_df.drop_duplicates(cols='date', inplace=True)
hits_df.set_index('date', inplace=True)

# In[8]:

hits_df.tail(3)

# Now let's fetch (and save) today's count if we haven't already.

# In[9]:

if today not in hits_df.index:
    resp = requests.get('https://github.com/search?p=2&q=extension%3Aipynb+nbformat_minor&ref=searchresults&type=Code&utf8=%E2%9C%93')
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'html.parser')
    # the total hit count appears in a span with the "counter" class
    elem = soup.find('span', {'class': 'counter'})
    count_now = int(elem.text.replace(',', ''))
    # make sure the index remains a timestamp
    hits_df.loc[pd.Timestamp(today)] = count_now
    # save to the same format we read
    hits_df.to_csv('ipynb_counts.csv', date_format='%Y-%m-%d', index_label='date')

# In[10]:

hits_df.tail(3)

# There might be missing counts for days that we failed to sample. We build up the expected date range and insert NaNs for the dates we missed.

# In[11]:

til_today = pd.date_range(hits_df.index[0], hits_df.index[-1])

# In[12]:

hits_df = hits_df.reindex(til_today)

# Now we plot the known notebook counts for each day we've been tracking the query results.

# In[13]:

ax = hits_df.plot(title="GitHub search hits for {} days".format(len(hits_df)), figsize=figsize)
ax.set_xlabel('Date')
ax.set_ylabel('# of ipynb files')

# ## Smoothed Hits
# 
# The outliers in the data come from GitHub occasionally reporting drastically different counts when we sample. We suspect this happens when GitHub rebuilds its search index. We'll filter them out by removing any daily change greater than 2.5 standard deviations from the mean daily change.
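# Before applying the rule to the real counts, here is a minimal sketch of how the threshold behaves on synthetic data (the series and the spike below are made up purely for illustration): a single reporting glitch produces two large daily deltas, the jump and the recovery, and both get flagged while the ordinary day-to-day growth does not.

# Illustrative sketch only: synthetic, steadily growing "counts" with one artificial glitch.
demo = pd.Series(np.arange(100) * 50.0)
demo.iloc[60] += 10000  # a one-day reporting spike
demo_deltas = demo.diff().fillna(0)
demo_outliers = abs(demo_deltas - demo_deltas.mean()) > 2.5 * demo_deltas.std()
print(demo[demo_outliers])  # flags the spike (index 60) and the drop back (index 61)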
# In[14]:

daily_deltas = (hits_df.hits - hits_df.hits.shift()).fillna(0)

# In[15]:

outliers = abs(daily_deltas - daily_deltas.mean()) > 2.5 * daily_deltas.std()

# In[16]:

# blank out the outlier samples so we can interpolate over them below
hits_df.loc[outliers] = np.NaN

# Now we linearly interpolate the missing values, covering both the days we failed to sample and the days that had outlier counts.

# In[17]:

hits_df = hits_df.interpolate(method='time')

# In[18]:

ax = hits_df.plot(title="GitHub search hits for {} days sans outliers".format(len(hits_df)), figsize=figsize)
ax.set_xlabel('Date')
_ = ax.set_ylabel('# of ipynb files')

# The total change in the number of `*.ipynb` hits between the tracking start date and today is:

# In[19]:

total_delta_nbs = hits_df.iloc[-1] - hits_df.iloc[0]
total_delta_nbs

# ## Daily Change
# 
# The average daily change is:

# In[20]:

avg_delta_nbs = total_delta_nbs / len(hits_df)
avg_delta_nbs

# We can look at the daily change over the entire period, along with the rolling 30-day mean of the daily deltas.

# In[21]:

daily_deltas = (hits_df.hits - hits_df.hits.shift()).fillna(0)

# In[22]:

fig, ax = plt.subplots(figsize=figsize)
ax.plot(pd.rolling_mean(daily_deltas, window=30, min_periods=0), label='30-day rolling mean of daily change')
ax.plot(daily_deltas, label='24-hour change')
ax.set_xlabel('Date')
ax.set_ylabel('Delta notebook count')
ax.set_title('Change in notebook count')
_ = ax.legend(loc='upper left')

# Let's look at the rolling mean in isolation.

# In[23]:

fig, ax = plt.subplots(figsize=figsize)
ax.plot(pd.rolling_mean(daily_deltas, window=30, min_periods=0))
ax.set_xlabel('Date')
ax.set_ylabel('Delta notebook count')
_ = ax.set_title('30-day rolling mean of daily change')

# ## Count Prediction
# 
# We next train an [autoregressive model](http://en.wikipedia.org/wiki/Autoregressive_model) on the time series data. We then use the model to predict the number of notebooks on GitHub a few months out.

# In[24]:

def train(df):
    """Fit an autoregressive model, selecting the lag order by BIC."""
    ar_model = sm.tsa.AR(df, freq='D')
    ar_model_res = ar_model.fit(ic='bic')
    return ar_model_res

# We fit one model using all of the data up to and including today's count, plus two historical models that only see data up to earlier cutoff dates.

# In[25]:

start_date = '2014-10-20'
end_date = '2017-01-01'
model_dates = [today, '2015-06-01', '2014-11-15']

# In[26]:

models = [train(hits_df.loc[:date]) for date in model_dates]

# We see that the most recent model retains more lag parameters than the earlier ones under the [BIC](https://en.wikipedia.org/wiki/Bayesian_information_criterion) selection criterion.

# In[27]:

pd.DataFrame([m.params for m in models], index=model_dates).T

# We predict from the start date through the end date with `dynamic=True`, meaning each model feeds its own predictions back in as inputs rather than using the known counts, even over the range where the truth is available.

# In[28]:

predictions = [model.predict(start=start_date, end=end_date, dynamic=True) for model in models]

# We put all of the predictions in a DataFrame alongside the ground truth for plotting.

# In[29]:

eval_df = pd.DataFrame(predictions, index=model_dates).T

# In[30]:

eval_df['truth'] = hits_df.hits

# In[31]:

title = 'GitHub search hits predicted from {} until {}'.format(start_date, end_date)
ax = eval_df.plot(title=title, figsize=figsize)
_ = ax.set_ylabel('# of ipynb files')
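# As an aside (a minimal sketch, not part of the published analysis), we can see what the dynamic setting does by comparing the fully dynamic forecast with one-step-ahead prediction for the most recent model; the one-step-ahead series conditions on the observed counts at every step, so it tracks the truth much more closely.

# Sketch: dynamic vs. one-step-ahead predictions for the model trained on all data to date.
one_step = models[0].predict(start=start_date, end=today, dynamic=False)
dynamic = models[0].predict(start=start_date, end=today, dynamic=True)
compare_df = pd.DataFrame({'one-step-ahead': one_step, 'dynamic': dynamic, 'truth': hits_df.hits})
compare_df.tail(3)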
# We plot the residuals for each model to get a sense of how accurate each one remains as time goes on.

# In[32]:

# residual = truth - prediction for each model
residual_df = -eval_df.subtract(eval_df.truth, axis=0).dropna().drop('truth', axis=1)

# In[33]:

fig, ax = plt.subplots(figsize=figsize)
for i, (name, column) in enumerate(residual_df.iteritems()):
    ax.scatter(residual_df.index, column, c=mpl.rcParams['axes.color_cycle'][i], label=name)
ax.legend(loc='upper left')
ax.set_ylabel('# of ipynb files')
ax.set_title('Residuals between predicted and truth')
fig.autofmt_xdate()
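# As a rough numeric companion to the plot (a small sketch, not a formal evaluation), the mean absolute residual gives a single accuracy figure per model over the period where the truth is known.

# Mean absolute residual, in notebook counts, for each model
residual_df.abs().mean()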