import pandas as pd
from glob import glob
import numpy as np
import plotly.express as pe
import plotly.io as pio
from IPython.display import Markdown
# --- Plot configuration shared by every figure below ---
pio.renderers.default = "plotly_mimetype+notebook_connected"
template = 'plotly_white'
width = 1000

# --- Billing data: parsed from the published JSON, with `date` as datetimes ---
url_data = "https://github.com/jupyterhub/binder-data/raw/master/billing/data/proc/data.json"
data = pd.read_json(url_data, orient='split').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# --- Launch counts: first HTML table on the analytics archive page ---
launches = pd.read_html("https://archive.analytics.mybinder.org/")[0]
launches['Date'] = pd.to_datetime(launches['Date'])

# --- Date range covered by the billing data ---
end_date = data['date'].max()
start_date = data['date'].min()
Markdown(f"**Showing data from {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}**")
# Output: Showing data from 2017-12-13 to 2019-10-15
# Daily cost per billing project.
# Select `cost` *before* aggregating so only that column is summed; summing
# the whole frame first is wasteful and, on recent pandas, tries to "sum"
# the non-numeric columns as well.
costs = data.groupby(['project_id', 'date'])['cost'].sum().reset_index()

# Weekly cost per project: resample each project's daily series into weekly
# bins. Restricting to `cost` keeps the grouping key out of the aggregation,
# so `reset_index()` yields exactly project_id / date / cost.
weekly_costs = (
    costs.groupby("project_id")
    .resample("W", on="date")["cost"]
    .sum()
    .reset_index()
)

# `costs` already has a flat index, so it is passed to plotly as-is.
pe.line(costs, x='date', y='cost', color='project_id',
        width=width, template=template, title="Daily cost")
pe.line(weekly_costs, x='date', y='cost', color='project_id',
        width=width, template=template, title="Weekly cost")
# Daily cost per billing category.
# The aggregation is done in two stages (per line item, then per category),
# as before, so rows with a missing `line_item` are still excluded by the
# first groupby. `cost` is selected before each sum so the string-valued
# `line_item` column is never fed to the aggregation.
categories = (
    data.groupby(['date', 'category', 'line_item'])['cost']
    .sum()
    .reset_index()
)
categories = categories.groupby(['date', 'category'])['cost'].sum().reset_index()

# Same series twice: linear scale first, then log scale so that small
# categories remain visible next to the dominant ones.
pe.line(categories, x='date', y='cost', color='category',
        width=width, template=template, title="Category daily cost")
pe.line(categories, x='date', y='cost', color='category',
        width=width, template=template, title="Category log daily cost", log_y=True)
# Total cost per day across all categories (indexed by `date`).
daily = categories.groupby('date').agg({'cost': 'sum'})

# Merge cost with the launch counts. The outer join followed by dropping rows
# without a launch `Date` keeps every day that has launch data, whether or
# not billing data exists for it.
daily = pd.merge(daily, launches, left_on='date', right_on='Date', how='outer').dropna(subset=['Date'])
daily['cost_per_session'] = daily['cost'] / daily['Number of Events']
pe.line(daily, x='Date', y='Number of Events', template=template, title="Number of sessions")

# 7-row rolling mean, used to smooth out day-to-day variation.
# - Only `Date` and the numeric columns are kept: the launches table is
#   scraped from HTML and may carry text columns, which cannot be averaged.
# - Rolling with `on=` requires a monotonic `Date`, so sort explicitly rather
#   than relying on the row order produced by the merge.
rolling_columns = [
    column for column in daily.columns
    if column == 'Date' or pd.api.types.is_numeric_dtype(daily[column])
]
weekly = daily.sort_values('Date')[rolling_columns].rolling(7, on='Date').mean()
pe.line(weekly, x='Date', y='Number of Events', template=template, title="Number of sessions")
pe.line(weekly, x='Date', y='cost_per_session', template=template, title="Cost per session")
# Total cost per calendar month, labelled by month end.
# A MonthEnd offset object is used instead of the 'M' string alias: the alias
# is deprecated in recent pandas releases, while the offset object produces
# the same month-end bins on both old and new versions.
monthly = (
    categories.resample(pd.offsets.MonthEnd(), on='date')
    .agg({'cost': 'sum'})
    .reset_index()
)
pe.line(monthly, x='date', y='cost', template=template, title="Monthly cost")