In [101]:

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

init_notebook_mode(connected=True)

We will use the gbq.read_gbq function to read BigQuery datasets into Pandas DataFrame objects.

In [102]:

import pandas as pd
from pandas.io import gbq

In [103]:

import numpy as np

We will use linregress function for linear regression of scatter plots.

In [104]:

from scipy.stats import linregress

Data Collection¶

Read the post Using Google BigQuery with Plotly and Pandas to create a new project.

In [105]:

project_id = 'sixth-edition-678'

This query will collect the timestamp, package name, and total download count columns from the table (on a daily basis).

In [106]:

daily_download_query = """
SELECT
  DATE(timestamp) as day,
  MONTH(timestamp) as month,
  file.project,
  COUNT(*) as total_downloads,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20120701"),
    CURRENT_TIMESTAMP()
  )
WHERE
  file.project = '{0}'
GROUP BY
  day, file.project, month
ORDER BY
  day asc
"""

The following function run the query and returns a DataFrame object, if successful.

In [107]:

def package_df(package):
    """ Return the query result as a pandas.DataFrame object
    
    param: package(str): Name of the package on PyPI
    """
    
    try:
        df = gbq.read_gbq(daily_download_query.format(package), project_id=project_id)
        return df
    except:
        raise IOError

We will construct different DataFrames for each package.

In [108]:

plotly_df = package_df('plotly')

Requesting query... ok.
Query running...
Query done.
Processed: 35.8 Gb

Retrieving results...
  Got page: 1; 100.0% done. Elapsed 13.68 s.
Got 144 rows.

Total time taken 14.04 s.
Finished at 2016-08-29 13:04:38.

In [109]:

bokeh_df = package_df('bokeh')

Requesting query... ok.
Query running...
Query done.
Processed: 35.8 Gb

Retrieving results...
  Got page: 1; 100.0% done. Elapsed 15.13 s.
Got 144 rows.

Total time taken 15.15 s.
Finished at 2016-08-29 13:04:53.

In [110]:

matplotlib_df = package_df('matplotlib')

Requesting query... ok.
Query running...
  Elapsed 14.52 s. Waiting...
Query done.
Processed: 35.8 Gb

Retrieving results...
  Got page: 1; 100.0% done. Elapsed 15.37 s.
Got 144 rows.

Total time taken 15.39 s.
Finished at 2016-08-29 13:05:09.

In [111]:

mpld3_df = package_df('mpld3')

Requesting query... ok.
Query running...
Query done.
Processed: 35.8 Gb

Retrieving results...
  Got page: 1; 100.0% done. Elapsed 13.45 s.
Got 144 rows.

Total time taken 13.47 s.
Finished at 2016-08-29 13:05:22.

In [112]:

vincent_df = package_df('vincent')

Requesting query... ok.
Query running...
Query done.
Processed: 35.8 Gb

Retrieving results...
  Got page: 1; 100.0% done. Elapsed 14.06 s.
Got 144 rows.

Total time taken 14.08 s.
Finished at 2016-08-29 13:05:36.

Inspection for Missing Data¶

Using a simple TimeDelta calculation, we can find if some rows are missing from the DataFrame.

In [113]:

from datetime import datetime, timedelta

# Number of rows in the DataFrame
actual_rows = len(plotly_df)

start_date = datetime.strptime(plotly_df.iloc[0]['day'], '%Y-%m-%d') # 2016-01-22
end_date = datetime.strptime(plotly_df.iloc[actual_rows - 1]['day'], '%Y-%m-%d') # 2016-08-29

# Expected rows if there was no missing data (day)
expected_rows = (end_date - start_date).days + 1

if (actual_rows != expected_rows):
    print "{0} rows are missing in the DataFrame.".format(expected_rows - actual_rows)

77 rows are missing in the DataFrame.

We find that there are no rows from 2016-03-06 to 2016-05-21.

Data Transformation¶

Here, we will concate the missing values in the DataFrames.

In [118]:

missing_data_start_date = '2016-03-06'
missing_data_end_date = '2016-05-21'

# starting/ending date for missing data and time differene (1 day)
s = datetime.strptime(missing_data_start_date, '%Y-%m-%d')
e = datetime.strptime(missing_data_end_date, '%Y-%m-%d')
diff = timedelta(days=1)

# generate all the missing dates in the same format
missing_dates = []
missing_dates_month = []

while (s <= e):
    missing_dates.append(s.strftime('%Y-%m-%d'))
    missing_dates_month.append(int(s.strftime('%m')[1]))
    s += diff
    
missing_row_count = len(missing_dates) # 77

In [123]:

def append_missing_data(dataframe, package):
    """Append the missing dates DataFrame to a given DataFrame
    
    param: dataframe(pandas.DataFrame): DataFrame to append
    param: package(str): Name of package on PyPI
    """
    
    missing_dates_df = pd.DataFrame({'day': missing_dates,
                                    'month': missing_dates_month,
                                    'file_project': [package for i in range(missing_row_count)],
                                    'total_downloads': [0 for i in range(missing_row_count)]}
                                   )
    
    # place the appended columns at their right place by sorting
    new_df = pd.concat([dataframe, missing_dates_df])
    
    return new_df.sort_values('day')

Updated DataFrames with the recovered missing data.

In [127]:

bokeh_df = append_missing_data(bokeh_df, 'bokeh')
matplotlib_df = append_missing_data(matplotlib_df, 'matplotlib')
mpld3_df = append_missing_data(mpld3_df, 'mpld3')
plotly_df = append_missing_data(plotly_df, 'plotly')
vincent_df = append_missing_data(vincent_df, 'vincent')

Package Downloads Comparison (Daily)¶

In [128]:

trace1 = go.Scatter(
    x=plotly_df['day'],
    y=plotly_df['total_downloads'],
    name='Plotly',
    mode='lines',
    line=dict(width=0.5,
              color='rgb(10. 240, 10)'),
    fill='tonexty'
)

trace2 = go.Scatter(
    x=bokeh_df['day'],
    y=bokeh_df['total_downloads'],
    name='Bokeh',
    mode='lines',
    line=dict(width=0.5,
              color='rgb(42, 77, 20)'),
    fill='tonexty'
)

trace3 = go.Scatter(
    x=mpld3_df['day'],
    y=mpld3_df['total_downloads'],
    name='MPLD3',
    mode='lines',
    line=dict(width=0.5,
              color='rgb(20, 33, 61)'),
    fill='tonexty'
)

trace4 = go.Scatter(
    x=vincent_df['day'],
    y=vincent_df['total_downloads'],
    name='Vincent',
    mode='lines',
    line=dict(width=0.5,
              color='rgb(0, 0, 0)'),
    fill='tonexty'
)

data = [trace1, trace2, trace3, trace4]

layout = go.Layout(
    title='Package Downloads Comparison (Daily)',
    showlegend=True,
    xaxis=dict(
        type='category',
        showgrid=False
    ),
    yaxis=dict(
        title='No. of downloads (daily)',
        type='linear',
        range=[1, 10000]
    ),
    plot_bgcolor='rgba(250, 250, 250, 1)',
    shapes=[
        dict(
            type='line',
            xref='x',
            yref='y',
            x0='45',
            y0='2000',
            x1='120',
            y1='2000'
        )
    ],
    annotations=[
        dict(
            x=75,
            y=2400,
            xref='x',
            yref='y',
            text="PyPI's stats collection service was down from March 6 to May 21",
            showarrow=False
        ),
        dict(
            x=115,
            y=9600,
            xref='x',
            yref='y',
            text='From Jan 22, 2016 To Aug 29, 2016',
            showarrow=False
        ),
        dict(
            x=121,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=-5
        ),
        dict(
            x=45,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=5
        )
    ]
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Package Downloads Comparison (Monthly)¶

The dataset was created on Jan 22, 2016. We will use these months on the x-axis.

In [129]:

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']

We are using pandas' groupby method to gather all the row by their month value and then adding their count to find out 'total downloads' in the month.

In [130]:

plotly_df.groupby('month').sum()

Out[130]:

	total_downloads
month
1	6791
2	25920
3	6008
4	0
5	15946
6	52043
7	61338
8	92813

In [131]:

trace1 = go.Bar(x=months, y=plotly_df.groupby('month').sum()['total_downloads'], name='Plotly')
trace2 = go.Bar(x=months, y=vincent_df.groupby('month').sum()['total_downloads'], name='Vincent')
trace3 = go.Bar(x=months, y=bokeh_df.groupby('month').sum()['total_downloads'], name='Bokeh')
trace4 = go.Bar(x=months, y=mpld3_df.groupby('month').sum()['total_downloads'], name='MPLD3')

In [132]:

data = [trace1, trace2, trace3, trace4]

layout = go.Layout(
    barmode='group',
    title="Package Downloads Comparison (PyPI)",
    yaxis=dict(
        title='No. of downloads (monthly)'
    ),
    xaxis=dict(
        title='Month'
    ),
    annotations=[
        dict(
            x=3,
            y=0,
            xref='x',
            yref='y',
            text="PyPI's stats collection service<br>was down from March 6 to May 21",
            showarrow=True,
            arrowhead=2,
            ax=0,
            ay=-150
        ),
        dict(
            x=3.7,
            y=90000,
            xref='x',
            yref='y',
            text='From Jan 22, 2016 To Aug 29, 2016',
            showarrow=False
        )
    ]
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

Growth of Plotly package downloads¶

Following the tutorial Linear fit in Python, we will try to find an approximate regression line for the scatter graph of Plotly package's downloads.

In [155]:

xvals = np.arange(0, len(plotly_df))

The following traces are for the package downloads scatter plot (for each package).

In [207]:

trace1 = go.Scatter(
    x=xvals[:44], 
    y=plotly_df['total_downloads'].iloc[:44], 
    mode='markers',
    marker=go.Marker(color='rgb(255, 127, 14)',size=5,symbol='x'),
    name='Plotly Downloads'
)

trace2 = go.Scatter(
    x=xvals[121:], 
    y=plotly_df['total_downloads'].iloc[121:],
    mode='markers',
    marker=go.Marker(color='rgb(255, 127, 14)',size=5,symbol='x'),
    name='Plotly Downloads',
    showlegend=False
)

# linear regression line for Plotly package downloads
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept

trace3 = go.Scatter(
    x=xvals, 
    y=plotly_line, 
    mode='lines',
    marker=go.Marker(color='rgb(10, 20, 30)'),
    name='Plotly Regression Line',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)

layout = go.Layout(
    title='Linear Regression Line for Plotly\'s Package Downloads Growth',
    yaxis = dict(
        title='No. of downloads (daily)'
    ),
    xaxis = dict(
        title='# days'
    ),
    annotations=[
        dict(
            x=85,
            y=2000,
            xref='x',
            yref='y',
            text="<b>Y = 13.29X - 282.55</b>",
            showarrow=False
        )
    ]
)

data = [trace1, trace2, trace3]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Similary, we can find the approximate growth line for 'Matplotlib'.

In [204]:

mslope, mintercept, mr_value, mp_value, mstd_err = linregress(xvals, matplotlib_df['total_downloads'])
matplotlib_line = mslope*xvals + mintercept

Daily download counts for 'Matplotlib' ranges around 7000-8000 as of now.

Let's find out how much time will it take for Plotly to reach that level.¶

Using the Plotly's growth line equation $Y = 13.29X - 282.55$, we can find out the approximate no. of days for downloads to reach 8000.

$Y(8000)$, results in X = 624 (nearest integer value), where current day index is 220 as of Aug 29, 2016.

That means it will take around 404 days for Plotly's download range to reach 8000.¶

In [229]:

# linear regression line for Plotly package downloads
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept

trace1 = go.Scatter(
    x=xvals, 
    y=plotly_line, 
    mode='lines',
    marker=go.Marker(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Actual)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)

future_xvals = np.arange(221, 221 + 404)

trace2 = go.Scatter(
    x=future_xvals, 
    y=pslope*future_xvals+pintercept, 
    mode='lines',
    marker=go.Marker(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Prediction)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='dot'
    )
)

layout = go.Layout(
    title='Prediction for Plotly\'s Package Downloads Growth',
    yaxis = dict(
        title='No. of downloads (daily)'
    ),
    xaxis = dict(
        title='# days'
    ),
    annotations=[
        dict(
            x=85,
            y=2000,
            xref='x',
            yref='y',
            text="<b>Y = 13.29X - 282.55</b>",
            showarrow=False
        ),
        dict(
            x=400,
            y=7800,
            xref='x',
            yref='y',
            text="Current download range for Matplotlib",
            showarrow=False
        )
    ],
    shapes=[
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=0,
            y0=8000,
            x1=624,
            y1=8000,
            line=dict(
                color='rgba(10, 10, 10, 1)',
                width=1,
                dash='solid'
            )
        ),
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=624,
            y0=0,
            x1=624,
            y1=8000,
            line=dict(
                color='rgba(10, 10, 10, 1)',
                width=1,
                dash='solid'
            )
        )
    ]
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [ ]: