#!/usr/bin/env python
# coding: utf-8

# In[1]:


import gcsfs
import pandas as pd
# Google Cloud Storage filesystem handle, created with gcsfs's default
# settings; used below to open the benchmark CSV.
gcs = gcsfs.GCSFileSystem()


# In[2]:


# Read the benchmark results into a DataFrame.  The rest of the script reads
# the columns 'collection', 'name', 'n', 'rate' and 'unit' from it.
with gcs.open('dask-data/scaling-data.csv') as f:
    df = pd.read_csv(f)
    
# Bare expression: displays the frame when run as a notebook cell.
df


# In[3]:


from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import Row, Column, gridplot
# Configure bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()


# In[4]:


def scaling_plot(part, axis_type='linear'):
    """Plot measured rate against core count for a single benchmark.

    Parameters
    ----------
    part : pandas.DataFrame
        Rows belonging to one benchmark.  The columns ``collection``,
        ``name``, ``n``, ``rate`` and ``unit`` are read; ``collection``,
        ``name`` and ``unit`` are taken from the first row only.
    axis_type : str, optional
        Passed to bokeh as both ``x_axis_type`` and ``y_axis_type``.

    Returns
    -------
    The bokeh figure, containing the measured line with circle markers and
    a dashed gray "perfect scaling" reference line.
    """
    row = part.iloc[0]
    # str.title() also capitalises the duration suffixes ("100Ms", "1S"),
    # so put those back in lower case.
    title = (row['collection'] + ': ' + row['name']).title().replace('0M', '0m').replace('1S', '1s')

    fig = figure(title=title, sizing_mode='scale_width', x_axis_type=axis_type, y_axis_type=axis_type)
    fig.line(x=part.n, y=part.rate)
    fig.circle(x=part.n, y=part.rate)
    fig.xaxis.axis_label = 'cores'
    fig.yaxis.axis_label = row['unit']
    fig.x_range.start = 0
    fig.y_range.start = 0

    # Perfect scaling reference: extrapolate the per-core rate observed at
    # the smallest core count linearly from the origin to the largest one.
    mn = part.n.min()
    mx = part.n.max()
    slope = part[part.n == mn].iloc[0]['rate'] / mn
    fig.line(x=[0, mx], y=[0, slope * mx], color='gray', line_dash='dashed')
    # Cap the y axis at the best measured rate; the reference line is
    # clipped wherever it rises above that.
    fig.y_range.end = part.rate.max()

    # Put ticks exactly at the measured core counts.
    fig.xaxis.ticker = part.n

    return fig


# In[5]:


# Build one scaling figure per (collection, name) benchmark.  The result is
# a Series of bokeh figures indexed by (collection, name).
#
# The column selection has to be a list (double brackets): selecting several
# columns from a groupby with bare comma-separated keys was deprecated in
# pandas 1.0 and raises in pandas 2.0.  The grouping columns are selected
# explicitly because scaling_plot reads them to build the title.
df2 = df.groupby(['collection', 'name'])[['collection', 'name', 'n', 'rate', 'unit']].apply(scaling_plot)
df2


# In[6]:


from toolz import partition_all

# Task benchmarks to display, in display order.
names = [
    'task map 1s tasks',
    'task map 100ms tasks',
    'task map fast tasks',
    'tree reduction 100ms tasks',
    'tree reduction fast tasks',
    'sequential',
    'nearest neighbor 100ms tasks',
    'nearest neighbor fast tasks',
    'dynamic tree reduction 100ms tasks',
    'dynamic tree reduction fast tasks',
]

# Pick the matching figures out of df2 and lay them out three per row.
L = df2.loc['tasks'].loc[names].tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))


# In[7]:


from toolz import partition_all

# Array benchmarks to display, in display order.
names = [
    'create random',
    'blockwise 100ms tasks',
    'elementwise computation',
    'reduction',
    'reduction along axis',
    'random access',
    'transpose addition',
    'rechunk large',
    'nearest neighbor fast tasks',
    'nearest neighbor 100ms tasks',
]

# Pick the matching figures out of df2 and lay them out three per row.
L = df2.loc['arrays'].loc[names].tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))


# In[8]:


# Imported here as well so this cell does not depend on an earlier cell
# having been run first.
from toolz import partition_all

# Every dataframe benchmark in df2 is shown, in the order df2 holds them, so
# no list of names is needed to select or order the figures.
L = df2.loc['dataframes'].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))


# In[9]:


from bokeh.palettes import viridis, Category10
from bokeh.models.widgets import Panel, Tabs

# Benchmark measurements indexed by (collection, name) so that one
# benchmark's rows can be pulled out with a single .loc lookup.
df3 = df.set_index(['collection', 'name']).loc[:, ['n', 'rate', 'unit']]

# Core counts recorded for the 1s task-map benchmark.
n = df3.loc['tasks', 'task map 1s tasks']['n'].values

# Three-colour categorical palette: one colour per line drawn on a figure.
colors = Category10[3]


def log_linear_plot(collection, names, title, legends=('100ms', '1us')):
    """Build a two-tab widget plotting benchmarks on log and linear axes.

    Parameters
    ----------
    collection : str
        First-level key into the module-level ``df3`` index.
    names : sequence of str
        Benchmark names (second-level keys into ``df3``), one line each.
        Must be non-empty and no longer than ``legends`` or the
        module-level ``colors`` palette.
    title : str
        Title placed on both figures.
    legends : sequence of str, optional
        Legend label for each entry of ``names``, in the same order.

    Returns
    -------
    A bokeh ``Tabs`` widget with one panel titled ``'log'`` and one titled
    ``'linear'``, each holding a 400x400 figure.
    """
    # Look every benchmark up once rather than once per use and axis type.
    parts = [df3.loc[collection, name] for name in names]
    unit = parts[0].iloc[0]['unit']
    # The y axis is capped at the best measured rate over all benchmarks.
    top_rate = max([part.rate.max() for part in parts])

    panels = []
    for axis_type in ['log', 'linear']:
        fig = figure(title=title, # sizing_mode='scale_width', 
                     x_axis_type=axis_type, y_axis_type=axis_type, height=400, width=400)
        for i, part in enumerate(parts):
            x = part.n.values
            y = part.rate.values
            fig.line(x=x, y=y, color=colors[i], legend=legends[i])
            fig.circle(x=x, y=y)

        # Perfect scaling reference lines, added after all measured lines.
        for part in parts:
            x = part.n.values
            y = part.rate.values
            mn = x.min()
            mx = x.max()
            # Use the rate measured at the smallest core count, wherever
            # that row sits, instead of assuming the rows are sorted by n.
            slope = y[x.argmin()] / mn
            fig.line(x=[0, mx], y=[0, slope * mx], color='gray', line_dash='dashed')

        fig.y_range.end = top_rate

        fig.xaxis.axis_label = 'cores'
        fig.yaxis.axis_label = unit
        fig.x_range.start = 0
        fig.y_range.start = 0

        # Ticks sit at the core counts of the last benchmark in `names`.
        fig.xaxis.ticker = x
        fig.legend.location = 'bottom_right'

        panel = Panel(child=fig, title=axis_type)
        panels.append(panel)

    tabs = Tabs(tabs=panels)
    return tabs


# In[10]:


# Registry of every widget built from here on, keyed by a short slug.
figures = {}

# Embarrassingly parallel task maps at three task durations.
names = [
    'task map 1s tasks',
    'task map 100ms tasks',
    'task map fast tasks',
]
legends = ['1s', '100ms', '1us']
fig = figures['tasks-embarrasssing'] = log_linear_plot(
    'tasks', names, 'Tasks: Embarrassingly Parallel', legends)

# The two-line plots below rely on log_linear_plot's default legend labels.
fig = figures['tasks-reduction'] = log_linear_plot(
    'tasks',
    ['tree reduction 100ms tasks', 'tree reduction fast tasks'],
    'Tasks: Tree Reduction')

fig = figures['tasks-nearest-neighbor'] = log_linear_plot(
    'tasks',
    ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'],
    'Tasks: Nearest Neighbor')

fig = figures['tasks-sequential'] = log_linear_plot(
    'tasks', ['sequential'], 'Tasks: Sequential', legends=['fast'])

fig = figures['tasks-dynamic-reduction'] = log_linear_plot(
    'tasks',
    ['dynamic tree reduction 100ms tasks', 'dynamic tree reduction fast tasks'],
    'Tasks: Dynamic Reduction')

# 2x2 layout; 'tasks-reduction' is stored in `figures` but not placed here.
L = [
    ['tasks-embarrasssing', 'tasks-sequential'],
    ['tasks-nearest-neighbor', 'tasks-dynamic-reduction'],
]
figs = [[figures[key] for key in row] for row in L]
grid = figures['tasks-grid'] = gridplot(figs, sizing_mode='scale_width')


# In[11]:


# Render the tasks grid assembled in the previous cell.
show(grid)


# ### Arrays

# In[12]:


# List the distinct benchmark names recorded for the 'arrays' collection.
df[df.collection == 'arrays'].name.unique()


# In[13]:


# One log/linear Tabs widget per array benchmark family, stored in `figures`.
fig = log_linear_plot('arrays', ['create random'], 'Arrays: Create', legends=['random'])
figures['array-create'] = fig

fig = log_linear_plot('arrays', ['elementwise computation'], 'Arrays: Elementwise', legends=['sin(x)**2 + cos(x)**2'])
figures['array-elementwise'] = fig

# Title corrected: this figure plots the two reduction benchmarks, not the
# nearest-neighbor ones (those are plotted as 'Arrays: Map Overlap' below).
fig = log_linear_plot('arrays', ['reduction', 'reduction along axis'], 'Arrays: Reductions', legends=['x.std()', 'x.std(axis=0)'])
figures['array-reductions'] = fig

fig = log_linear_plot('arrays', ['random access'], 'Arrays: Random Access', legends=['x[12345, 23456]'])
figures['array-random-access'] = fig

fig = log_linear_plot('arrays', ['transpose addition'], 'Arrays: Bulk Communication', legends=['x + x.T'])
figures['array-transpose'] = fig

fig = log_linear_plot('arrays', ['rechunk large'], 'Arrays: Rechunking', legends=['x.rechunk(...)'])
figures['array-rechunk'] = fig

# Uses log_linear_plot's default legend labels.
fig = log_linear_plot('arrays', ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'], 'Arrays: Map Overlap')
figures['array-overlap'] = fig

# 3x2 layout; 'array-overlap' is stored in `figures` but not placed here.
grid = [['array-create', 'array-elementwise'],
        ['array-reductions', 'array-random-access'],
        ['array-transpose', 'array-rechunk']]
grid = gridplot([[figures[name] for name in L] for L in grid], sizing_mode='scale_width')
figures['array-grid'] = grid


# In[14]:


# Render the arrays grid stored under 'array-grid'.
show(figures['array-grid'])


# ### Dataframes

# In[15]:


# List the distinct benchmark names recorded for the 'dataframes' collection.
df[df.collection == 'dataframes'].name.unique()


# In[16]:


# One entry per figure:
# (key in `figures`, benchmark names, figure title, legend labels).
dataframe_plots = [
    ('dataframe-create',
     ['create random'],
     'DataFrames: Create',
     ['random']),
    ('dataframe-elementwise',
     ['blockwise 100ms tasks', 'arithmetic'],
     'DataFrames: Elementwise',
     ['100ms', 'arithmetic']),
    ('dataframe-random-access',
     ['random access'],
     'DataFrames: Random Access',
     ['df.loc[123456]']),
    ('dataframe-reductions',
     ['dataframe reduction', 'series reduction', 'groupby reduction'],
     'DataFrames: Reductions',
     ['df.std()', 'df[0].std()', 'df.groupby(0)[1].mean()']),
    ('dataframe-shuffle',
     ['groupby apply (full shuffle)', 'set index (full shuffle)'],
     'DataFrames: Full Shuffle',
     ['df.groupby(...).apply(...)', 'df.set_index(...)']),
    ('dataframe-time-series',
     ['rolling aggregations'],
     'DataFrames: Time Series',
     ['df.rolling(...).mean()']),
]

# Build the widgets in the listed order and register each under its key.
for plot_key, plot_names, plot_title, plot_legends in dataframe_plots:
    fig = log_linear_plot('dataframes', plot_names, plot_title, legends=plot_legends)
    figures[plot_key] = fig

# Arrange the six widgets in a 3x2 grid and register the grid as well.
grid = gridplot(
    [
        [figures['dataframe-create'], figures['dataframe-elementwise']],
        [figures['dataframe-random-access'], figures['dataframe-reductions']],
        [figures['dataframe-shuffle'], figures['dataframe-time-series']],
    ],
    sizing_mode='scale_width',
)
figures['dataframe-grid'] = grid


# In[17]:


# Render the dataframes grid assembled in the previous cell.
show(grid)


# In[18]:


from bokeh.embed import components

# Generate embeddable output for everything stored in `figures`: `script`
# and `divs` are what the commented-out snippet below writes into the blog
# post (each div first, then the script).
script, divs = components(list(figures.values()))


# ```python
# with open('/home/mrocklin/workspace/blog/_posts/work/2017-06-27-scaling.md', 'at') as f:
#     
#     for div in divs:
#         f.write(div)
#         f.write('\n\n')
#     f.write(script)
# ```