#!/usr/bin/env python
# coding: utf-8

# In[101]:


import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

# Set up plotly's offline notebook mode before any of the `iplot` calls below.
init_notebook_mode(connected=True)


# We will use the `gbq.read_gbq` function to read BigQuery datasets into Pandas `DataFrame` objects.

# In[102]:


import pandas as pd
from pandas.io import gbq


# In[103]:


import numpy as np


# We will use `linregress` function for linear regression of scatter plots.

# In[104]:


from scipy.stats import linregress


# ### Data Collection

# Read the post [Using Google BigQuery with Plotly and Pandas](http://moderndata.plot.ly/using-google-bigquery-with-plotly-and-pandas/) to create a new project.

# In[105]:


# Project ID handed to `gbq.read_gbq` (as `project_id`) when the queries are run.
project_id = 'sixth-edition-678'


# This query will collect the `timestamp`, `package name`, and `total download count` columns from the table (on a daily basis).

# In[106]:


# Query template: `{0}` is replaced with the package name through `str.format`
# (see `package_df`). The result has one row per day for that package, holding
# the day, its month number, the project name and the download count, ordered
# by day.
# NOTE(review): the `[project:table]` / TABLE_DATE_RANGE syntax looks like
# BigQuery legacy SQL -- confirm before switching SQL dialects.
daily_download_query = """
SELECT
  DATE(timestamp) as day,
  MONTH(timestamp) as month,
  file.project,
  COUNT(*) as total_downloads,
FROM
  TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP("20120701"),
    CURRENT_TIMESTAMP()
  )
WHERE
  file.project = '{0}'
GROUP BY
  day, file.project, month
ORDER BY
  day asc
"""


# The following function runs the query and returns a DataFrame object, if successful.

# In[107]:


def package_df(package):
    """ Return the query result as a pandas.DataFrame object

    The package name is substituted into `daily_download_query` and the
    resulting query is run with `gbq.read_gbq` against `project_id`.

    param: package(str): Name of the package on PyPI

    returns: pandas.DataFrame holding the rows returned by the query
    raises: IOError if running the query fails; the message names the
        package and carries the text of the underlying error
    """

    query = daily_download_query.format(package)

    try:
        return gbq.read_gbq(query, project_id=project_id)
    except Exception as err:
        # Callers still get an IOError, but the reason for the failure is kept
        # in the message. Catching `Exception` (not a bare `except`) also lets
        # KeyboardInterrupt / SystemExit propagate unchanged.
        raise IOError(
            "BigQuery query for package '{0}' failed: {1}".format(package, err)
        )


# We will construct different DataFrames for each package.

# In[108]:


# Fetch the daily download history of every package being compared.
# `package_df` is called once per name, in the order listed, and each result
# is bound to the matching `<package>_df` variable.
plotly_df, bokeh_df, matplotlib_df, mpld3_df, vincent_df = map(
    package_df,
    ('plotly', 'bokeh', 'matplotlib', 'mpld3', 'vincent'),
)


# ### Inspection for Missing Data

# Using a simple `TimeDelta` calculation, we can find if some rows are missing from the DataFrame.

# In[113]:


from datetime import datetime, timedelta

# Number of rows in the DataFrame
actual_rows = len(plotly_df)

# First and last day present in the query result (the query orders rows by day)
start_date = datetime.strptime(plotly_df.iloc[0]['day'], '%Y-%m-%d') # 2016-01-22
end_date = datetime.strptime(plotly_df.iloc[-1]['day'], '%Y-%m-%d') # 2016-08-29

# Expected rows if there was no missing data (day): both end points count,
# hence the + 1
expected_rows = (end_date - start_date).days + 1

if actual_rows != expected_rows:
    # print(...) with a single argument gives the same output on Python 2 and
    # Python 3, unlike the Python 2-only print statement.
    print("{0} rows are missing in the DataFrame.".format(expected_rows - actual_rows))


# We find that there are no rows from **2016-03-06** to **2016-05-21**.

# ### Data Transformation
# 
# Here, we will fill the gap by concatenating zero-download rows for the missing dates to the DataFrames.

# In[118]:


missing_data_start_date = '2016-03-06'
missing_data_end_date = '2016-05-21'

# starting/ending date for missing data and time difference (1 day)
s = datetime.strptime(missing_data_start_date, '%Y-%m-%d')
e = datetime.strptime(missing_data_end_date, '%Y-%m-%d')
diff = timedelta(days=1)

# generate all the missing dates in the same 'YYYY-MM-DD' format, together
# with the month number of each date
missing_dates = []
missing_dates_month = []

# walk from the first to the last missing day (both included), one day at a time
while s <= e:
    missing_dates.append(s.strftime('%Y-%m-%d'))
    # `s.month` is the month as an int (1-12). Taking the second character of
    # the '%m' string is only right for months 01-09: it would turn
    # October, November and December into 0, 1 and 2.
    missing_dates_month.append(s.month)
    s += diff

missing_row_count = len(missing_dates) # 77


# In[123]:


def append_missing_data(dataframe, package):
    """Return a copy of a package DataFrame extended with rows for the missing dates

    One row is built for every entry of the module-level `missing_dates`
    list (with its month from `missing_dates_month`), carrying the given
    package name and a download count of 0. These rows are concatenated to
    `dataframe` and the result is returned sorted by `day`.

    param: dataframe(pandas.DataFrame): DataFrame to append
    param: package(str): Name of package on PyPI
    """

    filler_rows = pd.DataFrame({
        'day': missing_dates,
        'month': missing_dates_month,
        'file_project': [package] * missing_row_count,
        'total_downloads': [0] * missing_row_count,
    })

    # stack the filler rows under the given rows, then sort by day so that
    # they end up at their right place
    combined = pd.concat([dataframe, filler_rows])
    return combined.sort_values('day')


# Updated DataFrames with the recovered missing data.

# In[127]:


# Extend every package's DataFrame with the rows for the missing dates.
# The (DataFrame, name) pairs are collected before any variable is rebound,
# and each result goes back to the variable it came from.
bokeh_df, matplotlib_df, mpld3_df, plotly_df, vincent_df = (
    append_missing_data(frame, label)
    for frame, label in (
        (bokeh_df, 'bokeh'),
        (matplotlib_df, 'matplotlib'),
        (mpld3_df, 'mpld3'),
        (plotly_df, 'plotly'),
        (vincent_df, 'vincent'),
    )
)


# ### Package Downloads Comparison (Daily)

# In[128]:


def _daily_downloads_trace(frame, label, rgb):
    """Build the thin, filled line trace of one package's daily downloads.

    param: frame(pandas.DataFrame): DataFrame with `day` and `total_downloads` columns
    param: label(str): Name shown for the trace
    param: rgb(str): Line colour as an 'rgb(r, g, b)' string
    """
    return go.Scatter(
        x=frame['day'],
        y=frame['total_downloads'],
        name=label,
        mode='lines',
        line=dict(width=0.5,
                  color=rgb),
        fill='tonexty'
    )


# One trace per package. All four share the same styling and differ only in
# data, name and line colour.
# The Plotly colour used to read 'rgb(10. 240, 10)' (a period instead of the
# first comma), which is not a well-formed rgb() colour string.
trace1 = _daily_downloads_trace(plotly_df, 'Plotly', 'rgb(10, 240, 10)')
trace2 = _daily_downloads_trace(bokeh_df, 'Bokeh', 'rgb(42, 77, 20)')
trace3 = _daily_downloads_trace(mpld3_df, 'MPLD3', 'rgb(20, 33, 61)')
trace4 = _daily_downloads_trace(vincent_df, 'Vincent', 'rgb(0, 0, 0)')

data = [trace1, trace2, trace3, trace4]

# Layout of the daily comparison chart. The x-axis is a category axis, so the
# x positions of the shape and the annotations below are given as numbers
# (NOTE(review): presumably the serial number of the day on the axis, counted
# from zero -- confirm against the plotly reference).
layout = go.Layout(
    title='Package Downloads Comparison (Daily)',
    showlegend=True,
    xaxis=dict(
        type='category',
        showgrid=False
    ),
    yaxis=dict(
        title='No. of downloads (daily)',
        type='linear',
        range=[1, 10000]
    ),
    plot_bgcolor='rgba(250, 250, 250, 1)',
    shapes=[
        # Horizontal line at y = 2000 spanning the outage period.
        # Coordinates are numeric, like those of the annotations that sit on
        # the same positions (they used to be the strings '45', '2000', ...).
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=45,
            y0=2000,
            x1=120,
            y1=2000
        )
    ],
    annotations=[
        # Caption placed above the outage line
        dict(
            x=75,
            y=2400,
            xref='x',
            yref='y',
            text="PyPI's stats collection service was down from March 6 to May 21",
            showarrow=False
        ),
        # Date range covered by the chart
        dict(
            x=115,
            y=9600,
            xref='x',
            yref='y',
            text='From Jan 22, 2016 To Aug 29, 2016',
            showarrow=False
        ),
        # Text-less arrow annotations located at the two ends of the outage
        # line (presumably to give the line arrowheads)
        dict(
            x=121,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=-5
        ),
        dict(
            x=45,
            y=2000,
            xref='x',
            yref='y',
            text="",
            showarrow=True,
            ay=0,
            ax=5
        )
    ]
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)


# ### Package Downloads Comparison (Monthly)

# The dataset was created on Jan 22, 2016. We will use these months on the x-axis.

# In[129]:


# Month labels for the x-axis (Jan-Aug). They are paired by position with the
# per-month totals computed below.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']


# We are using pandas' `groupby` method to gather all the rows by their `month` value and then summing their download counts to find the 'total downloads' of each month.

# In[130]:


plotly_df.groupby('month').sum()


# In[131]:


# Select the `total_downloads` column before aggregating, so that only this
# column is summed instead of every column of the DataFrame (the others hold
# strings). The resulting per-month totals are the same.
trace1 = go.Bar(x=months, y=plotly_df.groupby('month')['total_downloads'].sum(), name='Plotly')
trace2 = go.Bar(x=months, y=vincent_df.groupby('month')['total_downloads'].sum(), name='Vincent')
trace3 = go.Bar(x=months, y=bokeh_df.groupby('month')['total_downloads'].sum(), name='Bokeh')
trace4 = go.Bar(x=months, y=mpld3_df.groupby('month')['total_downloads'].sum(), name='MPLD3')


# In[132]:


# Grouped bar chart of the monthly totals, one bar per package and month.
data = [trace1, trace2, trace3, trace4]

layout = go.Layout(
    title="Package Downloads Comparison (PyPI)",
    barmode='group',
    xaxis={'title': 'Month'},
    yaxis={'title': 'No. of downloads (monthly)'},
    annotations=[
        # Note about the outage, with an arrow pointing at x = 3, y = 0
        {
            'text': "PyPI's stats collection service<br>was down from March 6 to May 21",
            'xref': 'x',
            'yref': 'y',
            'x': 3,
            'y': 0,
            'showarrow': True,
            'arrowhead': 2,
            'ax': 0,
            'ay': -150,
        },
        # Date range covered by the chart
        {
            'text': 'From Jan 22, 2016 To Aug 29, 2016',
            'xref': 'x',
            'yref': 'y',
            'x': 3.7,
            'y': 90000,
            'showarrow': False,
        },
    ],
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)


# ### Growth of Plotly package downloads

# Following the tutorial [Linear fit in Python](https://plot.ly/python/linear-fits/), we will try to find an
# approximate regression line for the scatter graph of Plotly package's downloads.

# In[155]:


# Day index (0, 1, 2, ...) with one entry per row of `plotly_df`; used as the
# x values of the scatter plots and regression lines below.
xvals = np.arange(len(plotly_df))


# The following traces show Plotly's daily downloads as a scatter plot (the day indices 44-120 are left out) together with the fitted regression line.

# In[207]:


# Scatter points are drawn in two pieces, day indices 0-43 and 121 onwards, so
# the indices in between are not plotted. Both pieces share one look and one
# legend entry (the second piece is hidden from the legend).
# Marker settings are passed as plain dicts rather than through the deprecated
# `go.Marker` class.
trace1 = go.Scatter(
    x=xvals[:44],
    y=plotly_df['total_downloads'].iloc[:44],
    mode='markers',
    marker=dict(color='rgb(255, 127, 14)', size=5, symbol='x'),
    name='Plotly Downloads'
)

trace2 = go.Scatter(
    x=xvals[121:],
    y=plotly_df['total_downloads'].iloc[121:],
    mode='markers',
    marker=dict(color='rgb(255, 127, 14)', size=5, symbol='x'),
    name='Plotly Downloads',
    showlegend=False
)

# linear regression line for Plotly package downloads
# NOTE(review): the fit uses every row of `plotly_df`, i.e. also the day
# indices 44-120 that the scatter traces above leave out -- confirm that
# including them in the regression is intended.
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept

trace3 = go.Scatter(
    x=xvals,
    y=plotly_line,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression Line',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)

layout = go.Layout(
    title='Linear Regression Line for Plotly\'s Package Downloads Growth',
    yaxis = dict(
        title='No. of downloads (daily)'
    ),
    xaxis = dict(
        title='# days'
    ),
    annotations=[
        # Equation of the regression line. The numbers are written out by
        # hand, they are not read from `pslope` / `pintercept`.
        dict(
            x=85,
            y=2000,
            xref='x',
            yref='y',
            text="<b>Y = 13.29X - 282.55</b>",
            showarrow=False
        )
    ]
)

data = [trace1, trace2, trace3]

fig = go.Figure(data=data, layout=layout)
iplot(fig)


# Similarly, we can find the approximate growth line for 'Matplotlib'.

# In[204]:


# Least-squares line through Matplotlib's daily download counts, evaluated at
# every day index in `xvals`.
matplotlib_downloads = matplotlib_df['total_downloads']
mslope, mintercept, mr_value, mp_value, mstd_err = linregress(xvals, matplotlib_downloads)
matplotlib_line = mintercept + mslope*xvals


# Daily download counts for 'Matplotlib' range around 7000-8000 as of now.

# #### Let's find out how much time it will take for Plotly to reach that level.

# Using the Plotly's growth line equation $Y = 13.29X - 282.55$, we can find out the approximate no. of days for downloads to reach 8000.

# Solving $Y = 8000$ for $X$ gives $X \approx 623.2$, i.e. day index 624 when rounded up to a whole day; the current day index is 220 as of Aug 29, 2016.
# 
# #### That means it will take around 404 days for Plotly's download range to reach 8000.

# In[229]:


# linear regression line for Plotly package downloads
# linear regression line for Plotly package downloads
pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])
plotly_line = pslope*xvals + pintercept

# Regression line over the observed day indices.
# Marker settings are passed as plain dicts rather than through the deprecated
# `go.Marker` class.
trace1 = go.Scatter(
    x=xvals,
    y=plotly_line,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Actual)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='longdashdot'
    )
)

# Day indices 221 .. 624 on which the same line is extended.
# NOTE(review): 221, 404 and the 624 / 8000 used in the shapes below are
# written out by hand (they match the figures quoted in the text above), they
# are not derived from `xvals`, `pslope` or `pintercept`.
future_xvals = np.arange(221, 221 + 404)

trace2 = go.Scatter(
    x=future_xvals,
    y=pslope*future_xvals+pintercept,
    mode='lines',
    marker=dict(color='rgb(10, 20, 30)'),
    name='Plotly Regression (Prediction)',
    line=dict(
        color='rgba(10, 10, 10, 1)',
        width=1,
        dash='dot'
    )
)

layout = go.Layout(
    title='Prediction for Plotly\'s Package Downloads Growth',
    yaxis = dict(
        title='No. of downloads (daily)'
    ),
    xaxis = dict(
        title='# days'
    ),
    annotations=[
        # Equation of the regression line (hand-written numbers)
        dict(
            x=85,
            y=2000,
            xref='x',
            yref='y',
            text="<b>Y = 13.29X - 282.55</b>",
            showarrow=False
        ),
        # Label for the horizontal line at y = 8000
        dict(
            x=400,
            y=7800,
            xref='x',
            yref='y',
            text="Current download range for Matplotlib",
            showarrow=False
        )
    ],
    shapes=[
        # Horizontal line at y = 8000, from x = 0 to x = 624
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=0,
            y0=8000,
            x1=624,
            y1=8000,
            line=dict(
                color='rgba(10, 10, 10, 1)',
                width=1,
                dash='solid'
            )
        ),
        # Vertical line at x = 624, from y = 0 up to y = 8000
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=624,
            y0=0,
            x1=624,
            y1=8000,
            line=dict(
                color='rgba(10, 10, 10, 1)',
                width=1,
                dash='solid'
            )
        )
    ]
)

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
iplot(fig)


# In[ ]: