#!/usr/bin/env python
# coding: utf-8

# In[1]:

from IPython.display import Markdown

# Render the project README at the top of the notebook.
Markdown(filename='README.md')

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[3]:

import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# In[4]:

mpl.style.use('ggplot')
figsize = (14, 7)

# In[5]:

# datetime.utcnow() is deprecated since Python 3.12; build an aware UTC time
# and drop the tzinfo so the printed value keeps the original naive format.
now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
print(f'This notebook was last rendered at {now} UTC')

# First, let's load the historical data into a DataFrame indexed by date.

# In[6]:

hits_df = pd.read_csv('ipynb_counts.csv', index_col=0, header=0,
                      parse_dates=True)
# De-duplicate on the date column (repeated samples for the same day),
# then restore the date index in ascending order.
hits_df.reset_index(inplace=True)
hits_df.drop_duplicates(subset='date', inplace=True)
hits_df.set_index('date', inplace=True)
hits_df.sort_index(ascending=True, inplace=True)

# In[7]:

hits_df.tail(3)

# There might be missing counts for days that we failed to sample. We build up
# the expected date range and insert NaNs for dates we missed.

# In[8]:

til_today = pd.date_range(hits_df.index[0], hits_df.index[-1])

# In[9]:

hits_df = hits_df.reindex(til_today)

# Now we plot the known notebook counts.

# In[10]:

fig, ax = plt.subplots(figsize=figsize)
ax.set_title(f'GitHub search hits for {len(hits_df)} days')
ax.plot(hits_df.hits, 'ko', markersize=1, label='hits')
ax.legend(loc='upper left')
ax.set_xlabel('Date')
ax.set_ylabel('# of ipynb files');

# Growth appears exponential until December 2020, at which point the count
# dropped suddenly and resumed growth from a new origin.
# The total change in the number of `*.ipynb` hits between the first day we
# have data and today is:

# In[11]:

total_delta_nbs = hits_df.iloc[-1] - hits_df.iloc[0]
total_delta_nbs

# The mean daily change for the entire duration is:

# In[12]:

avg_delta_nbs = total_delta_nbs / len(hits_df)
avg_delta_nbs

# The change in hit count between any two consecutive days for which we have
# data looks like the following:

# In[13]:

daily_deltas = hits_df.hits - hits_df.hits.shift()

# In[14]:

fig, ax = plt.subplots(figsize=figsize)
ax.plot(daily_deltas, 'ko', markersize=2)
ax.set_xlabel('Date')
# Raw string: '\D' is an invalid escape sequence (SyntaxWarning, future error).
ax.set_ylabel(r'$\Delta$ # of ipynb files')
ax.set_title('Day-to-Day Change');

# The large jumps in the data are from GitHub reporting drastically different
# counts from one day to the next.
#
# Let's drop outliers defined as values more than two standard deviations away
# from a centered 90 day rolling mean.

# In[15]:

daily_delta_rolling = daily_deltas.rolling(window=90, min_periods=0,
                                           center=True)
outliers = (abs(daily_deltas - daily_delta_rolling.mean())
            > 2 * daily_delta_rolling.std())
outliers.value_counts()

# In[16]:

cleaned_hits_df = hits_df.copy()
# np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0.
cleaned_hits_df[outliers] = np.nan
cleaned_daily_deltas = cleaned_hits_df.hits - cleaned_hits_df.hits.shift()

# In[17]:

fig, ax = plt.subplots(figsize=figsize)
ax.plot(cleaned_daily_deltas, 'ko', markersize=2)
ax.set_xlabel('Date')
ax.set_ylabel(r'$\Delta$ # of ipynb files')
ax.set_title('Day-to-Day Change Sans Outliers');