import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)
We will use the gbq.read_gbq
function to read BigQuery datasets into Pandas DataFrame
objects.
import pandas as pd
from pandas.io import gbq
import numpy as np
We will use linregress
function for linear regression of scatter plots.
from scipy.stats import linregress
Read the post Using Google BigQuery with Plotly and Pandas to create a new project.
project_id = 'sixth-edition-678'
This query will collect the timestamp
, package name
, and total download count
columns from the table (on a daily basis).
daily_download_query = """
SELECT
DATE(timestamp) as day,
MONTH(timestamp) as month,
file.project,
COUNT(*) as total_downloads,
FROM
TABLE_DATE_RANGE(
[the-psf:pypi.downloads],
TIMESTAMP("20120701"),
CURRENT_TIMESTAMP()
)
WHERE
file.project = '{0}'
GROUP BY
day, file.project, month
ORDER BY
day asc
"""
The following function run the query and returns a DataFrame object, if successful.
def package_df(package):
""" Return the query result as a pandas.DataFrame object
param: package(str): Name of the package on PyPI
"""
try:
df = gbq.read_gbq(daily_download_query.format(package), project_id=project_id)
return df
except:
raise IOError
We will construct different DataFrames for each package.
plotly_df = package_df('plotly')
Requesting query... ok. Query running... Query done. Processed: 35.8 Gb Retrieving results... Got page: 1; 100.0% done. Elapsed 13.68 s. Got 144 rows. Total time taken 14.04 s. Finished at 2016-08-29 13:04:38.
bokeh_df = package_df('bokeh')
Requesting query... ok. Query running... Query done. Processed: 35.8 Gb Retrieving results... Got page: 1; 100.0% done. Elapsed 15.13 s. Got 144 rows. Total time taken 15.15 s. Finished at 2016-08-29 13:04:53.
matplotlib_df = package_df('matplotlib')
Requesting query... ok. Query running... Elapsed 14.52 s. Waiting... Query done. Processed: 35.8 Gb Retrieving results... Got page: 1; 100.0% done. Elapsed 15.37 s. Got 144 rows. Total time taken 15.39 s. Finished at 2016-08-29 13:05:09.
mpld3_df = package_df('mpld3')
Requesting query... ok. Query running... Query done. Processed: 35.8 Gb Retrieving results... Got page: 1; 100.0% done. Elapsed 13.45 s. Got 144 rows. Total time taken 13.47 s. Finished at 2016-08-29 13:05:22.
vincent_df = package_df('vincent')
Requesting query... ok. Query running... Query done. Processed: 35.8 Gb Retrieving results... Got page: 1; 100.0% done. Elapsed 14.06 s. Got 144 rows. Total time taken 14.08 s. Finished at 2016-08-29 13:05:36.
Using a simple TimeDelta
calculation, we can find if some rows are missing from the DataFrame.
from datetime import datetime, timedelta
# Number of rows in the DataFrame
actual_rows = len(plotly_df)
start_date = datetime.strptime(plotly_df.iloc[0]['day'], '%Y-%m-%d') # 2016-01-22
end_date = datetime.strptime(plotly_df.iloc[actual_rows - 1]['day'], '%Y-%m-%d') # 2016-08-29
# Expected rows if there was no missing data (day)
expected_rows = (end_date - start_date).days + 1
if (actual_rows != expected_rows):
print "{0} rows are missing in the DataFrame.".format(expected_rows - actual_rows)
77 rows are missing in the DataFrame.
We find that there are no rows from 2016-03-06 to 2016-05-21.
Here, we will concate the missing values in the DataFrames.
missing_data_start_date = '2016-03-06'
missing_data_end_date = '2016-05-21'
# starting/ending date for missing data and time differene (1 day)
s = datetime.strptime(missing_data_start_date, '%Y-%m-%d')
e = datetime.strptime(missing_data_end_date, '%Y-%m-%d')
diff = timedelta(days=1)
# generate all the missing dates in the same format
missing_dates = []
missing_dates_month = []
while (s <= e):
missing_dates.append(s.strftime('%Y-%m-%d'))
missing_dates_month.append(int(s.strftime('%m')[1]))
s += diff
missing_row_count = len(missing_dates) # 77
def append_missing_data(dataframe, package):
"""Append the missing dates DataFrame to a given DataFrame
param: dataframe(pandas.DataFrame): DataFrame to append
param: package(str): Name of package on PyPI
"""
missing_dates_df = pd.DataFrame({'day': missing_dates,
'month': missing_dates_month,
'file_project': [package for i in range(missing_row_count)],
'total_downloads': [0 for i in range(missing_row_count)]}
)
# place the appended columns at their right place by sorting
new_df = pd.concat([dataframe, missing_dates_df])
return new_df.sort_values('day')
Updated DataFrames with the recovered missing data.
bokeh_df = append_missing_data(bokeh_df, 'bokeh')
matplotlib_df = append_missing_data(matplotlib_df, 'matplotlib')
mpld3_df = append_missing_data(mpld3_df, 'mpld3')
plotly_df = append_missing_data(plotly_df, 'plotly')
vincent_df = append_missing_data(vincent_df, 'vincent')
trace1 = go.Scatter(
x=plotly_df['day'],
y=plotly_df['total_downloads'],
name='Plotly',
mode='lines',
line=dict(width=0.5,
color='rgb(10. 240, 10)'),
fill='tonexty'
)
trace2 = go.Scatter(
x=bokeh_df['day'],
y=bokeh_df['total_downloads'],
name='Bokeh',
mode='lines',
line=dict(width=0.5,
color='rgb(42, 77, 20)'),
fill='tonexty'
)
trace3 = go.Scatter(
x=mpld3_df['day'],
y=mpld3_df['total_downloads'],
name='MPLD3',
mode='lines',
line=dict(width=0.5,
color='rgb(20, 33, 61)'),
fill='tonexty'
)
trace4 = go.Scatter(
x=vincent_df['day'],
y=vincent_df['total_downloads'],
name='Vincent',
mode='lines',
line=dict(width=0.5,
color='rgb(0, 0, 0)'),
fill='tonexty'
)
data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
title='Package Downloads Comparison (Daily)',
showlegend=True,
xaxis=dict(
type='category',
showgrid=False
),
yaxis=dict(
title='No. of downloads (daily)',
type='linear',
range=[1, 10000]
),
plot_bgcolor='rgba(250, 250, 250, 1)',
shapes=[
dict(
type='line',
xref='x',
yref='y',
x0='45',
y0='2000',
x1='120',
y1='2000'
)
],
annotations=[
dict(
x=75,
y=2400,
xref='x',
yref='y',
text="PyPI's stats collection service was down from March 6 to May 21",
showarrow=False
),
dict(
x=115,
y=9600,
xref='x',
yref='y',
text='From Jan 22, 2016 To Aug 29, 2016',
showarrow=False
),
dict(
x=121,
y=2000,
xref='x',
yref='y',
text="",
showarrow=True,
ay=0,
ax=-5
),
dict(
x=45,
y=2000,
xref='x',
yref='y',
text="",
showarrow=True,
ay=0,
ax=5
)
]
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
The dataset was created on Jan 22, 2016. We will use these months on the x-axis.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']
We are using pandas' groupby
method to gather all the row by their month
value and then adding their count to find out 'total downloads' in the month.
plotly_df.groupby('month').sum()
total_downloads | |
---|---|
month | |
1 | 6791 |
2 | 25920 |
3 | 6008 |
4 | 0 |
5 | 15946 |
6 | 52043 |
7 | 61338 |
8 | 92813 |
trace1 = go.Bar(x=months, y=plotly_df.groupby('month').sum()['total_downloads'], name='Plotly')
trace2 = go.Bar(x=months, y=vincent_df.groupby('month').sum()['total_downloads'], name='Vincent')
trace3 = go.Bar(x=months, y=bokeh_df.groupby('month').sum()['total_downloads'], name='Bokeh')
trace4 = go.Bar(x=months, y=mpld3_df.groupby('month').sum()['total_downloads'], name='MPLD3')
data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
barmode='group',
title="Package Downloads Comparison (PyPI)",
yaxis=dict(
title='No. of downloads (monthly)'
),
xaxis=dict(
title='Month'
),
annotations=[
dict(
x=3,
y=0,
xref='x',
yref='y',
text="PyPI's stats collection service<br>was down from March 6 to May 21",
showarrow=True,
arrowhead=2,
ax=0,
ay=-150
),
dict(
x=3.7,
y=90000,
xref='x',
yref='y',
text='From Jan 22, 2016 To Aug 29, 2016',
showarrow=False
)
]
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Following the tutorial Linear fit in Python, we will try to find an approximate regression line for the scatter graph of Plotly package's downloads.
xvals = np.arange(0, len(plotly_df))
The following traces
are for the package downloads scatter plot (for each package).
trace1 = go.Scatter(
x=xvals[:44],
y=plotly_df['total_downloads'].iloc[:44],
mode='markers',
marker=go.Marker(color='rgb(255, 127, 14)',size=5,symbol='x'),
name='Plotly Downloads'
)
trace2 = go.Scatter(
x=xvals[121:],
y=plotly_df['total_downloads'].iloc[121:],
mode='markers',
marker=go.Marker(color='rgb(255, 127, 14)',size=5,symbol='x'),
name='Plotly Downloads',
showlegend=False
)
# linear regression line for Plotly package downloads
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept
trace3 = go.Scatter(
x=xvals,
y=plotly_line,
mode='lines',
marker=go.Marker(color='rgb(10, 20, 30)'),
name='Plotly Regression Line',
line=dict(
color='rgba(10, 10, 10, 1)',
width=1,
dash='longdashdot'
)
)
layout = go.Layout(
title='Linear Regression Line for Plotly\'s Package Downloads Growth',
yaxis = dict(
title='No. of downloads (daily)'
),
xaxis = dict(
title='# days'
),
annotations=[
dict(
x=85,
y=2000,
xref='x',
yref='y',
text="<b>Y = 13.29X - 282.55</b>",
showarrow=False
)
]
)
data = [trace1, trace2, trace3]
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Similary, we can find the approximate growth line for 'Matplotlib'.
mslope, mintercept, mr_value, mp_value, mstd_err = linregress(xvals, matplotlib_df['total_downloads'])
matplotlib_line = mslope*xvals + mintercept
Daily download counts for 'Matplotlib' ranges around 7000-8000 as of now.
Using the Plotly's growth line equation $Y = 13.29X - 282.55$, we can find out the approximate no. of days for downloads to reach 8000.
$Y(8000)$, results in X = 624 (nearest integer value), where current day index is 220 as of Aug 29, 2016.
# linear regression line for Plotly package downloads
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept
trace1 = go.Scatter(
x=xvals,
y=plotly_line,
mode='lines',
marker=go.Marker(color='rgb(10, 20, 30)'),
name='Plotly Regression (Actual)',
line=dict(
color='rgba(10, 10, 10, 1)',
width=1,
dash='longdashdot'
)
)
future_xvals = np.arange(221, 221 + 404)
trace2 = go.Scatter(
x=future_xvals,
y=pslope*future_xvals+pintercept,
mode='lines',
marker=go.Marker(color='rgb(10, 20, 30)'),
name='Plotly Regression (Prediction)',
line=dict(
color='rgba(10, 10, 10, 1)',
width=1,
dash='dot'
)
)
layout = go.Layout(
title='Prediction for Plotly\'s Package Downloads Growth',
yaxis = dict(
title='No. of downloads (daily)'
),
xaxis = dict(
title='# days'
),
annotations=[
dict(
x=85,
y=2000,
xref='x',
yref='y',
text="<b>Y = 13.29X - 282.55</b>",
showarrow=False
),
dict(
x=400,
y=7800,
xref='x',
yref='y',
text="Current download range for Matplotlib",
showarrow=False
)
],
shapes=[
dict(
type='line',
xref='x',
yref='y',
x0=0,
y0=8000,
x1=624,
y1=8000,
line=dict(
color='rgba(10, 10, 10, 1)',
width=1,
dash='solid'
)
),
dict(
type='line',
xref='x',
yref='y',
x0=624,
y0=0,
x1=624,
y1=8000,
line=dict(
color='rgba(10, 10, 10, 1)',
width=1,
dash='solid'
)
)
]
)
data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)
iplot(fig)