#!/usr/bin/env python
# coding: utf-8
# In[101]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# Initialise Plotly's offline notebook mode; the figures below are displayed with `iplot`.
init_notebook_mode(connected=True)
# We will use the `gbq.read_gbq` function to read BigQuery datasets into Pandas `DataFrame` objects.
# In[102]:
import pandas as pd
from pandas.io import gbq
# In[103]:
import numpy as np
# We will use the `linregress` function for linear regression of scatter plots.
# In[104]:
from scipy.stats import linregress
# ### Data Collection
# Read the post [Using Google BigQuery with Plotly and Pandas](moderndata.plot.ly/using-google-bigquery-with-plotly-and-pandas/) to create a new project.
# In[105]:
# Project ID that `package_df` passes to `gbq.read_gbq` as `project_id`.
project_id = 'sixth-edition-678'
# This query returns one row per day for a single package: the `day`, its `month` number, the package name (`file.project`) and the `total_downloads` count for that day.
# In[106]:
# Query template: `{0}` is filled in with the package name by `str.format` in `package_df`.
# Rows come back ordered by `day` ascending, starting from the TIMESTAMP("20120701") lower bound.
daily_download_query = """
SELECT
DATE(timestamp) as day,
MONTH(timestamp) as month,
file.project,
COUNT(*) as total_downloads,
FROM
TABLE_DATE_RANGE(
[the-psf:pypi.downloads],
TIMESTAMP("20120701"),
CURRENT_TIMESTAMP()
)
WHERE
file.project = '{0}'
GROUP BY
day, file.project, month
ORDER BY
day asc
"""
# The following function runs the query and returns a DataFrame object, if successful.
# In[107]:
def package_df(package):
    """Return the query result as a pandas.DataFrame object.

    Fills `daily_download_query` with the package name and runs it through
    `gbq.read_gbq` against `project_id`.

    param: package(str): Name of the package on PyPI
    return: pandas.DataFrame holding the rows returned by the query
    raises: IOError when the query cannot be executed; the message carries
        the underlying error so the cause is not lost
    """
    query = daily_download_query.format(package)
    try:
        return gbq.read_gbq(query, project_id=project_id)
    except Exception as err:
        # Callers still get an IOError, but with the reason attached.
        # KeyboardInterrupt/SystemExit are no longer swallowed and re-labelled.
        raise IOError(
            "BigQuery download query for package '{0}' failed: {1}".format(package, err)
        )
# We will construct different DataFrames for each package.
# In[108]:
# One DataFrame per package; the queries run in the order the names are listed.
plotly_df, bokeh_df, matplotlib_df, mpld3_df, vincent_df = map(
    package_df,
    ('plotly', 'bokeh', 'matplotlib', 'mpld3', 'vincent')
)
# ### Inspection for Missing Data
# Using a simple `TimeDelta` calculation, we can find if some rows are missing from the DataFrame.
# In[113]:
from datetime import datetime, timedelta

# Number of rows in the DataFrame
actual_rows = len(plotly_df)
# `day` holds 'YYYY-MM-DD' strings and the query orders rows by day, so the
# first and last rows bound the covered period.
start_date = datetime.strptime(plotly_df.iloc[0]['day'], '%Y-%m-%d')  # 2016-01-22
end_date = datetime.strptime(plotly_df.iloc[-1]['day'], '%Y-%m-%d')  # 2016-08-29
# Expected rows if there was no missing data (one row per day, both ends included)
expected_rows = (end_date - start_date).days + 1
if actual_rows != expected_rows:
    # The parenthesised form works as a statement on Python 2 and as a call on Python 3.
    print("{0} rows are missing in the DataFrame.".format(expected_rows - actual_rows))
# We find that there are no rows from **2016-03-06** to **2016-05-21**.
# ### Data Transformation
#
# Here, we will concatenate the missing values to the DataFrames.
# In[118]:
# First and last day (inclusive) for which the DataFrames have no rows.
missing_data_start_date = '2016-03-06'
missing_data_end_date = '2016-05-21'
# starting/ending date for missing data and time difference (1 day)
s = datetime.strptime(missing_data_start_date, '%Y-%m-%d')
e = datetime.strptime(missing_data_end_date, '%Y-%m-%d')
diff = timedelta(days=1)
# generate all the missing dates in the same format as the `day` column,
# together with the month number of each date
missing_dates = []
missing_dates_month = []
while s <= e:
    missing_dates.append(s.strftime('%Y-%m-%d'))
    # Use the month attribute directly: taking one character of the '%m'
    # string would turn October-December into 0, 1 and 2.
    missing_dates_month.append(s.month)
    s += diff
missing_row_count = len(missing_dates)  # 77
# In[123]:
def append_missing_data(dataframe, package):
    """Add zero-download rows for the missing dates to a package DataFrame.

    One placeholder row is built per entry of `missing_dates` (day, month,
    package name and a download count of 0). The rows are concatenated to
    `dataframe` and the result is returned ordered by `day`.

    param: dataframe(pandas.DataFrame): DataFrame to append
    param: package(str): Name of package on PyPI
    return: pandas.DataFrame with the placeholder rows included
    """
    placeholder_rows = pd.DataFrame({
        'day': missing_dates,
        'month': missing_dates_month,
        'file_project': [package] * missing_row_count,
        'total_downloads': [0] * missing_row_count,
    })
    combined = pd.concat([dataframe, placeholder_rows])
    # Sorting by day moves the placeholder rows to their chronological position.
    return combined.sort_values('day')
# Updated DataFrames with the recovered missing data.
# In[127]:
# Rebind every package DataFrame to its copy with the missing dates filled in.
bokeh_df, matplotlib_df, mpld3_df, plotly_df, vincent_df = map(
    append_missing_data,
    (bokeh_df, matplotlib_df, mpld3_df, plotly_df, vincent_df),
    ('bokeh', 'matplotlib', 'mpld3', 'plotly', 'vincent')
)
# ### Package Downloads Comparison (Daily)
# In[128]:
# One thin, area-filled line per package: day on the x-axis, that day's
# download count on the y-axis.
trace1 = go.Scatter(
    x=plotly_df['day'],
    y=plotly_df['total_downloads'],
    name='Plotly',
    mode='lines',
    # The rgb() components must be comma-separated to form a valid colour.
    line=dict(width=0.5,
              color='rgb(10, 240, 10)'),
    fill='tonexty'
)
trace2 = go.Scatter(
    x=bokeh_df['day'],
    y=bokeh_df['total_downloads'],
    name='Bokeh',
    mode='lines',
    line=dict(width=0.5,
              color='rgb(42, 77, 20)'),
    fill='tonexty'
)
trace3 = go.Scatter(
    x=mpld3_df['day'],
    y=mpld3_df['total_downloads'],
    name='MPLD3',
    mode='lines',
    line=dict(width=0.5,
              color='rgb(20, 33, 61)'),
    fill='tonexty'
)
trace4 = go.Scatter(
    x=vincent_df['day'],
    y=vincent_df['total_downloads'],
    name='Vincent',
    mode='lines',
    line=dict(width=0.5,
              color='rgb(0, 0, 0)'),
    fill='tonexty'
)
data = [trace1, trace2, trace3, trace4]
# Layout for the daily comparison. The x-axis is categorical (one category per
# `day` string), so the x positions of the shape and annotations below are
# category positions, not dates.
layout = go.Layout(
    title='Package Downloads Comparison (Daily)',
    showlegend=True,
    xaxis=dict(
        type='category',
        showgrid=False
    ),
    yaxis=dict(
        title='No. of downloads (daily)',
        type='linear',
        range=[1, 10000]
    ),
    plot_bgcolor='rgba(250, 250, 250, 1)',
    shapes=[
        # Horizontal marker line across the stretch with no collected stats.
        # Coordinates are plain numbers, like the annotations that share
        # these axes.
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=45,
            y0=2000,
            x1=120,
            y1=2000
        )
    ],
    annotations=[
        # Caption above the marker line.
        dict(
            x=75,
            y=2400,
            xref='x',
            yref='y',
            text="PyPI's stats collection service was down from March 6 to May 21",
            showarrow=False
        ),
        # Date range covered by the data.
        dict(
            x=115,
            y=9600,
            xref='x',
            yref='y',
            text='From Jan 22, 2016 To Aug 29, 2016',
            showarrow=False
        ),
        # Text-less annotations used only for their arrows, one at each end
        # of the marker line.
        dict(
            x=121,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=-5
        ),
        dict(
            x=45,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=5
        )
    ]
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# ### Package Downloads Comparison (Monthly)
# The dataset was created on Jan 22, 2016. We will use these months on the x-axis.
# In[129]:
# Labels for the bars; they are paired by position with the per-month sums
# below, which `groupby` returns ordered by month number.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']
# We use pandas' `groupby` method to gather all the rows by their `month` value and then add up their counts to get the total downloads in each month.
# In[130]:
plotly_df.groupby('month').sum()
# In[131]:


def _monthly_downloads(dataframe):
    """Return the summed `total_downloads` per `month` value of a package DataFrame.

    param: dataframe(pandas.DataFrame): DataFrame with `month` and `total_downloads` columns
    return: pandas.Series of totals indexed by month number
    """
    # Select the column before aggregating so the string columns (`day`,
    # `file_project`) are not summed along the way.
    return dataframe.groupby('month')['total_downloads'].sum()


trace1 = go.Bar(x=months, y=_monthly_downloads(plotly_df), name='Plotly')
trace2 = go.Bar(x=months, y=_monthly_downloads(vincent_df), name='Vincent')
trace3 = go.Bar(x=months, y=_monthly_downloads(bokeh_df), name='Bokeh')
trace4 = go.Bar(x=months, y=_monthly_downloads(mpld3_df), name='MPLD3')
# In[132]:
data = [trace1, trace2, trace3, trace4]
# Grouped bar chart: one group per month, one bar per package.
layout = go.Layout(
    barmode='group',
    title="Package Downloads Comparison (PyPI)",
    yaxis=dict(
        title='No. of downloads (monthly)'
    ),
    xaxis=dict(
        title='Month'
    ),
    annotations=[
        # Arrow anchored at the x-axis, with a two-line caption.
        dict(
            x=3,
            y=0,
            xref='x',
            yref='y',
            # A quoted string cannot span two source lines; the line break in
            # the rendered caption is written as `<br>`.
            text="PyPI's stats collection service<br>was down from March 6 to May 21",
            showarrow=True,
            arrowhead=2,
            ax=0,
            ay=-150
        ),
        # Date range covered by the data.
        dict(
            x=3.7,
            y=90000,
            xref='x',
            yref='y',
            text='From Jan 22, 2016 To Aug 29, 2016',
            showarrow=False
        )
    ]
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# ### Growth of Plotly package downloads
# Following the tutorial [Linear fit in Python](https://plot.ly/python/linear-fits/), we will try to find an
# approximate regression line for the scatter graph of Plotly package's downloads.
# In[155]:
# Day index (0, 1, 2, ...) for every row of the Plotly DataFrame.
xvals = np.arange(0, len(plotly_df))
# The following traces are the scatter plot of Plotly's daily downloads.
# In[207]:
# Only rows before index 44 (trace1) and from index 121 on (trace2) are drawn;
# the rows in between are presumably the zero-download placeholders that were
# appended for the missing dates.
# Marker settings are plain dicts, like the `line=dict(...)` settings in this
# file, instead of the deprecated `go.Marker` class.
trace1 = go.Scatter(
    x=xvals[:44],
    y=plotly_df['total_downloads'].iloc[:44],
    mode='markers',
    marker=dict(color='rgb(255, 127, 14)', size=5, symbol='x'),
    name='Plotly Downloads'
)
trace2 = go.Scatter(
    x=xvals[121:],
    y=plotly_df['total_downloads'].iloc[121:],
    mode='markers',
    marker=dict(color='rgb(255, 127, 14)', size=5, symbol='x'),
    name='Plotly Downloads',
    showlegend=False
)
# linear regression line for Plotly package downloads
# NOTE(review): the fit uses every row, including the zero-download rows added
# for the missing dates, while the scatter traces above leave them out.
# Confirm that this is intended.
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope * xvals + pintercept
trace3 = go.Scatter(
    x=xvals,
    y=plotly_line,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression Line',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)
# Axis titles plus a text label carrying the equation of the regression line.
layout = go.Layout(
    title="Linear Regression Line for Plotly's Package Downloads Growth",
    xaxis={'title': '# days'},
    yaxis={'title': 'No. of downloads (daily)'},
    annotations=[
        {
            'text': "Y = 13.29X - 282.55",
            'x': 85,
            'y': 2000,
            'xref': 'x',
            'yref': 'y',
            'showarrow': False,
        },
    ]
)
data = [trace1, trace2, trace3]
fig = go.Figure(layout=layout, data=data)
iplot(fig)
# Similarly, we can find the approximate growth line for 'Matplotlib'.
# In[204]:
# Fit against Matplotlib's own day index rather than `xvals`, which is sized
# from `plotly_df`: `linregress` needs x and y of the same length, and the two
# DataFrames are not guaranteed to have the same number of rows.
matplotlib_xvals = np.arange(len(matplotlib_df))
mslope, mintercept, mr_value, mp_value, mstd_err = linregress(
    matplotlib_xvals, matplotlib_df['total_downloads'])
matplotlib_line = mslope * matplotlib_xvals + mintercept
# Daily download counts for 'Matplotlib' range around 7000-8000 as of now.
# #### Let's find out how much time it will take for Plotly to reach that level.
# Using Plotly's growth line equation $Y = 13.29X - 282.55$, we can find the approximate number of days for downloads to reach 8000.
# Setting $Y = 8000$ gives $X \approx 624$ (rounded up to a whole day), where the current day index is 220 as of Aug 29, 2016.
#
# #### That means it will take around 404 days for Plotly's download range to reach 8000.
# In[229]:
# linear regression line for Plotly package downloads
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope * xvals + pintercept
# Fitted line over the observed day indices. Marker settings are plain dicts,
# like the `line=dict(...)` settings, instead of the deprecated `go.Marker` class.
trace1 = go.Scatter(
    x=xvals,
    y=plotly_line,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Actual)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)
# Day indices 221..624: the 404 days that follow the last observed index (220).
future_xvals = np.arange(221, 221 + 404)
# The same line, extended over the future day indices and drawn dotted.
trace2 = go.Scatter(
    x=future_xvals,
    y=pslope * future_xvals + pintercept,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Prediction)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='dot'
    )
)
# Layout for the prediction figure: two text labels and two guide lines that
# meet at the point (624, 8000).
layout = go.Layout(
    title="Prediction for Plotly's Package Downloads Growth",
    xaxis={'title': '# days'},
    yaxis={'title': 'No. of downloads (daily)'},
    annotations=[
        # Equation of the regression line.
        {
            'text': "Y = 13.29X - 282.55",
            'x': 85,
            'y': 2000,
            'xref': 'x',
            'yref': 'y',
            'showarrow': False,
        },
        # Label placed just under the horizontal guide line.
        {
            'text': "Current download range for Matplotlib",
            'x': 400,
            'y': 7800,
            'xref': 'x',
            'yref': 'y',
            'showarrow': False,
        },
    ],
    shapes=[
        # Horizontal guide at 8000 downloads, from day 0 to day 624.
        {
            'type': 'line',
            'xref': 'x',
            'yref': 'y',
            'x0': 0,
            'y0': 8000,
            'x1': 624,
            'y1': 8000,
            'line': {
                'color': 'rgba(10, 10, 10, 1)',
                'width': 1,
                'dash': 'solid',
            },
        },
        # Vertical guide at day 624, from 0 up to 8000 downloads.
        {
            'type': 'line',
            'xref': 'x',
            'yref': 'y',
            'x0': 624,
            'y0': 0,
            'x1': 624,
            'y1': 8000,
            'line': {
                'color': 'rgba(10, 10, 10, 1)',
                'width': 1,
                'dash': 'solid',
            },
        },
    ]
)
data = [trace1, trace2]
fig = go.Figure(layout=layout, data=data)
iplot(fig)
# In[ ]: