#!/usr/bin/env python
# coding: utf-8

# In[101]:

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

init_notebook_mode(connected=True)

# We will use the `gbq.read_gbq` function to read BigQuery datasets into
# Pandas `DataFrame` objects.

# In[102]:

import pandas as pd
from pandas.io import gbq

# In[103]:

import numpy as np

# We will use the `linregress` function for linear regression of scatter plots.

# In[104]:

from scipy.stats import linregress

# ### Data Collection

# Read the post [Using Google BigQuery with Plotly and Pandas](moderndata.plot.ly/using-google-bigquery-with-plotly-and-pandas/)
# to create a new project.

# In[105]:

project_id = 'sixth-edition-678'

# This query collects the `day`, `month`, `package name`, and
# `total download count` columns from the table (on a daily basis).
# `{0}` is replaced with the package name by `package_df`.
# NOTE(review): the query uses BigQuery legacy SQL syntax
# (`TABLE_DATE_RANGE`, `[project:dataset.table]`); newer pandas/pandas-gbq
# releases may need an explicit legacy dialect -- confirm against the
# installed version.

# In[106]:

daily_download_query = """
SELECT
  DATE(timestamp) as day,
  MONTH(timestamp) as month,
  file.project,
  COUNT(*) as total_downloads,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20120701"),
    CURRENT_TIMESTAMP()
  )
WHERE
  file.project = '{0}'
GROUP BY
  day, file.project, month
ORDER BY
  day asc
"""

# The following function runs the query and returns a DataFrame object,
# if successful.

# In[107]:


def package_df(package):
    """Return the daily download counts of a package as a pandas.DataFrame.

    The package name is substituted into `daily_download_query` and the
    query is run against the BigQuery project `project_id`.

    param: package(str): Name of the package on PyPI
    return: pandas.DataFrame with the query result
    raises: IOError if the query could not be run; the message carries the
            package name and the underlying error.
    """
    # NOTE: the package name is interpolated directly into the SQL text, so
    # only pass trusted, hard-coded package names.
    query = daily_download_query.format(package)
    try:
        return gbq.read_gbq(query, project_id=project_id)
    except Exception as err:
        # Keep the IOError contract callers rely on, but do not discard the
        # cause (and do not trap KeyboardInterrupt/SystemExit).
        raise IOError(
            "BigQuery download query for package '{0}' failed: {1}".format(
                package, err
            )
        )


# We will construct different DataFrames for each package.

# In[108]:

plotly_df = package_df('plotly')

# In[109]:

bokeh_df = package_df('bokeh')

# In[110]:

matplotlib_df = package_df('matplotlib')

# In[111]:

mpld3_df = package_df('mpld3')

# In[112]:

vincent_df = package_df('vincent')

# ### Inspection for Missing Data

# Using a simple `TimeDelta` calculation, we can find if some rows are
# missing from the DataFrame.
# In[113]:

from datetime import datetime, timedelta

# Number of rows in the DataFrame
actual_rows = len(plotly_df)

# The `day` column holds 'YYYY-MM-DD' strings, ordered ascending by the query.
start_date = datetime.strptime(plotly_df.iloc[0]['day'], '%Y-%m-%d')  # 2016-01-22
end_date = datetime.strptime(plotly_df.iloc[actual_rows - 1]['day'], '%Y-%m-%d')  # 2016-08-29

# Expected rows if there was no missing data (one row per day, inclusive)
expected_rows = (end_date - start_date).days + 1

if actual_rows != expected_rows:
    # Single-argument print(...) works on both Python 2 and Python 3.
    print("{0} rows are missing in the DataFrame.".format(expected_rows - actual_rows))

# We find that there are no rows from **2016-03-06** to **2016-05-21**.

# ### Data Transformation
#
# Here, we will concatenate the missing values to the DataFrames.

# In[118]:

missing_data_start_date = '2016-03-06'
missing_data_end_date = '2016-05-21'

# starting/ending date for missing data and time difference (1 day)
s = datetime.strptime(missing_data_start_date, '%Y-%m-%d')
e = datetime.strptime(missing_data_end_date, '%Y-%m-%d')
diff = timedelta(days=1)

# generate all the missing dates in the same format as the `day` column,
# together with their month number (matching the query's `month` column)
missing_dates = []
missing_dates_month = []

while s <= e:
    missing_dates.append(s.strftime('%Y-%m-%d'))
    # Use the real month number; taking only the second character of '%m'
    # would turn October-December into 0, 1, 2.
    missing_dates_month.append(s.month)
    s += diff

missing_row_count = len(missing_dates)  # 77

# In[123]:


def append_missing_data(dataframe, package):
    """Append zero-download rows for the missing dates to a given DataFrame.

    One row is added per entry of the module-level `missing_dates` list,
    with `total_downloads` set to 0 and `file_project` set to `package`.
    The input DataFrame is not modified.

    param: dataframe(pandas.DataFrame): DataFrame to append to
    param: package(str): Name of package on PyPI
    return: a new pandas.DataFrame sorted by the `day` column
    """
    missing_dates_df = pd.DataFrame({
        'day': missing_dates,
        'month': missing_dates_month,
        'file_project': [package] * missing_row_count,
        'total_downloads': [0] * missing_row_count,
    })

    # place the appended rows at their right place by sorting on the date
    new_df = pd.concat([dataframe, missing_dates_df])
    return new_df.sort_values('day')


# Updated DataFrames with the recovered missing data.
# In[127]:

# Fill the stats-outage gap of every package with zero-download rows.
bokeh_df = append_missing_data(bokeh_df, 'bokeh')
matplotlib_df = append_missing_data(matplotlib_df, 'matplotlib')
mpld3_df = append_missing_data(mpld3_df, 'mpld3')
plotly_df = append_missing_data(plotly_df, 'plotly')
vincent_df = append_missing_data(vincent_df, 'vincent')

# ### Package Downloads Comparison (Daily)

# In[128]:

# One filled line trace per package: x is the day, y the daily download count.
trace1 = go.Scatter(
    x=plotly_df['day'],
    y=plotly_df['total_downloads'],
    name='Plotly',
    mode='lines',
    # Fixed: the color read 'rgb(10. 240, 10)' (period instead of comma).
    line=dict(width=0.5, color='rgb(10, 240, 10)'),
    fill='tonexty'
)

trace2 = go.Scatter(
    x=bokeh_df['day'],
    y=bokeh_df['total_downloads'],
    name='Bokeh',
    mode='lines',
    line=dict(width=0.5, color='rgb(42, 77, 20)'),
    fill='tonexty'
)

trace3 = go.Scatter(
    x=mpld3_df['day'],
    y=mpld3_df['total_downloads'],
    name='MPLD3',
    mode='lines',
    line=dict(width=0.5, color='rgb(20, 33, 61)'),
    fill='tonexty'
)

trace4 = go.Scatter(
    x=vincent_df['day'],
    y=vincent_df['total_downloads'],
    name='Vincent',
    mode='lines',
    line=dict(width=0.5, color='rgb(0, 0, 0)'),
    fill='tonexty'
)

data = [trace1, trace2, trace3, trace4]

layout = go.Layout(
    title='Package Downloads Comparison (Daily)',
    showlegend=True,
    xaxis=dict(
        type='category',
        showgrid=False
    ),
    yaxis=dict(
        title='No. of downloads (daily)',
        type='linear',
        range=[1, 10000]
    ),
    plot_bgcolor='rgba(250, 250, 250, 1)',
    # Horizontal line spanning the outage period; the x positions are
    # hard-coded for the Jan 22 - Aug 29, 2016 data set.
    shapes=[
        dict(
            type='line',
            xref='x',
            yref='y',
            x0='45',
            y0='2000',
            x1='120',
            y1='2000'
        )
    ],
    annotations=[
        # Label above the outage line
        dict(
            x=75,
            y=2400,
            xref='x',
            yref='y',
            text="PyPI's stats collection service was down from March 6 to May 21",
            showarrow=False
        ),
        # Date range covered by the chart
        dict(
            x=115,
            y=9600,
            xref='x',
            yref='y',
            text='From Jan 22, 2016 To Aug 29, 2016',
            showarrow=False
        ),
        # Text-less annotations used only for the arrow heads at both ends
        # of the outage line
        dict(
            x=121,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=-5
        ),
        dict(
            x=45,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=5
        )
    ]
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# ### Package Downloads Comparison (Monthly)

# The dataset was created on Jan 22, 2016. We will use these months on the x-axis.
# In[129]:

# NOTE: hard-coded for the Jan - Aug 2016 data set; the query itself runs up
# to CURRENT_TIMESTAMP(), so this list must match the months actually present.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']

# We use pandas' `groupby` method to gather all the rows by their `month`
# value and then sum their counts to find the 'total downloads' in each month.

# In[130]:

plotly_df.groupby('month').sum()

# In[131]:

# Select the numeric column before summing so the string columns
# (`day`, `file_project`) are never aggregated.
trace1 = go.Bar(x=months, y=plotly_df.groupby('month')['total_downloads'].sum(), name='Plotly')
trace2 = go.Bar(x=months, y=vincent_df.groupby('month')['total_downloads'].sum(), name='Vincent')
trace3 = go.Bar(x=months, y=bokeh_df.groupby('month')['total_downloads'].sum(), name='Bokeh')
trace4 = go.Bar(x=months, y=mpld3_df.groupby('month')['total_downloads'].sum(), name='MPLD3')

# In[132]:

data = [trace1, trace2, trace3, trace4]

layout = go.Layout(
    barmode='group',
    title="Package Downloads Comparison (PyPI)",
    yaxis=dict(
        title='No. of downloads (monthly)'
    ),
    xaxis=dict(
        title='Month'
    ),
    annotations=[
        dict(
            x=3,
            y=0,
            xref='x',
            yref='y',
            # NOTE(review): the exported source had a raw line break inside
            # this literal; it is restored here as Plotly's `<br>` line
            # break -- confirm against the original notebook.
            text="PyPI's stats collection service<br>was down from March 6 to May 21",
            showarrow=True,
            arrowhead=2,
            ax=0,
            ay=-150
        ),
        dict(
            x=3.7,
            y=90000,
            xref='x',
            yref='y',
            text='From Jan 22, 2016 To Aug 29, 2016',
            showarrow=False
        )
    ]
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# ### Growth of Plotly package downloads

# Following the tutorial [Linear fit in Python](https://plot.ly/python/linear-fits/), we will try to find an
# approximate regression line for the scatter graph of Plotly package's downloads.

# In[155]:

# Day index (0 = first day in the DataFrame) used as the regression x-axis.
xvals = np.arange(0, len(plotly_df))

# The following `traces` are for the package downloads scatter plot.

# In[207]:

# Rows 44..120 are the 77 zero-filled outage days (2016-03-06 to 2016-05-21),
# so the markers are drawn in two pieces that skip them.
trace1 = go.Scatter(
    x=xvals[:44],
    y=plotly_df['total_downloads'].iloc[:44],
    mode='markers',
    marker=dict(color='rgb(255, 127, 14)', size=5, symbol='x'),
    name='Plotly Downloads'
)

trace2 = go.Scatter(
    x=xvals[121:],
    y=plotly_df['total_downloads'].iloc[121:],
    mode='markers',
    marker=dict(color='rgb(255, 127, 14)', size=5, symbol='x'),
    name='Plotly Downloads',
    showlegend=False
)

# linear regression line for Plotly package downloads
# NOTE: the fit is computed over every row, including the zero-filled
# outage days that the scatter traces above leave out.
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept

trace3 = go.Scatter(
    x=xvals,
    y=plotly_line,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression Line',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)

layout = go.Layout(
    title='Linear Regression Line for Plotly\'s Package Downloads Growth',
    yaxis=dict(
        title='No. of downloads (daily)'
    ),
    xaxis=dict(
        title='# days'
    ),
    annotations=[
        dict(
            x=85,
            y=2000,
            xref='x',
            yref='y',
            # NOTE: hard-coded equation text; it is not derived from
            # pslope/pintercept and goes stale if the data changes.
            text="Y = 13.29X - 282.55",
            showarrow=False
        )
    ]
)

data = [trace1, trace2, trace3]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Similarly, we can find the approximate growth line for 'Matplotlib'.
# In[204]:

mslope, mintercept, mr_value, mp_value, mstd_err = linregress(xvals, matplotlib_df['total_downloads'])
matplotlib_line = mslope*xvals + mintercept

# Daily download counts for 'Matplotlib' range around 7000-8000 as of now.

# #### Let's find out how much time it will take for Plotly to reach that level.

# Using Plotly's growth line equation $Y = 13.29X - 282.55$, we can find out the approximate no. of days for downloads to reach 8000.
# Solving $Y = 8000$ gives X = 624 (rounded up to a whole day), where the current day index is 220 as of Aug 29, 2016.
#
# #### That means it will take around 404 days for Plotly's download range to reach 8000.

# In[229]:

# linear regression line for Plotly package downloads
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept

# Fitted line over the observed days
trace1 = go.Scatter(
    x=xvals,
    y=plotly_line,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Actual)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)

# Day indices 221..624: the 404 days after the last observed day (index 220).
future_xvals = np.arange(221, 221 + 404)

# The same line extrapolated over the future day indices
trace2 = go.Scatter(
    x=future_xvals,
    y=pslope*future_xvals+pintercept,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Prediction)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='dot'
    )
)

layout = go.Layout(
    title='Prediction for Plotly\'s Package Downloads Growth',
    yaxis=dict(
        title='No. of downloads (daily)'
    ),
    xaxis=dict(
        title='# days'
    ),
    annotations=[
        dict(
            x=85,
            y=2000,
            xref='x',
            yref='y',
            # NOTE: hard-coded equation text, not derived from the fit above.
            text="Y = 13.29X - 282.55",
            showarrow=False
        ),
        dict(
            x=400,
            y=7800,
            xref='x',
            yref='y',
            text="Current download range for Matplotlib",
            showarrow=False
        )
    ],
    shapes=[
        # Horizontal guide at the 8000 downloads/day target
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=0,
            y0=8000,
            x1=624,
            y1=8000,
            line=dict(
                color='rgba(10, 10, 10, 1)',
                width=1,
                dash='solid'
            )
        ),
        # Vertical guide at day index 624, where the line reaches the target
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=624,
            y0=0,
            x1=624,
            y1=8000,
            line=dict(
                color='rgba(10, 10, 10, 1)',
                width=1,
                dash='solid'
            )
        )
    ]
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# In[ ]: