#!/usr/bin/env python
# coding: utf-8

# In[1]:

import gcsfs
import pandas as pd

gcs = gcsfs.GCSFileSystem()

# In[2]:

with gcs.open('dask-data/scaling-data.csv') as f:
    df = pd.read_csv(f)

df

# In[3]:

from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import Row, Column, gridplot

output_notebook()

# In[4]:


def scaling_plot(part, axis_type='linear'):
    """Plot ``rate`` against ``n`` for a single benchmark.

    Parameters
    ----------
    part : pandas.DataFrame
        Rows of one benchmark.  The columns ``collection``, ``name``, ``n``,
        ``rate`` and ``unit`` are read; ``collection``, ``name`` and ``unit``
        are taken from the first row only.
    axis_type : str
        Passed to ``figure`` as both ``x_axis_type`` and ``y_axis_type``.

    Returns
    -------
    The Bokeh figure, containing the measured line and markers plus a dashed
    gray "perfect scaling" reference line.
    """
    row = part.iloc[0]
    # ``str.title`` also capitalises the unit suffixes ("100Ms", "1S");
    # the two replaces put them back in lower case.
    title = (row['collection'] + ': ' + row['name']).title()
    title = title.replace('0M', '0m').replace('1S', '1s')

    fig = figure(title=title,
                 sizing_mode='scale_width',
                 x_axis_type=axis_type,
                 y_axis_type=axis_type)

    fig.line(x=part.n, y=part.rate)
    fig.circle(x=part.n, y=part.rate)

    fig.xaxis.axis_label = 'cores'
    fig.yaxis.axis_label = row['unit']
    fig.x_range.start = 0
    fig.y_range.start = 0

    # Add in perfect scaling line: extrapolate the rate measured at the
    # smallest ``n`` linearly through the origin.
    mn = part.n.min()
    mx = part.n.max()
    slope = part[part.n == mn].iloc[0]['rate'] / mn
    fig.line(x=[0, mx], y=[0, slope * mx], color='gray', line_dash='dashed')

    # Clip the y range to the measured data so the reference line does not
    # stretch the axis.
    fig.y_range.end = part.rate.max()
    fig.xaxis.ticker = part.n
    return fig


# In[5]:

# Columns are selected with a list: selecting with a bare tuple
# (``groupby(...)['a', 'b']``) was deprecated in pandas 1.0 and is an error
# in pandas 2.x.
df2 = (df.groupby(['collection', 'name'])
         [['collection', 'name', 'n', 'rate', 'unit']]
         .apply(scaling_plot))
df2

# In[6]:

names = ['task map 1s tasks',
         'task map 100ms tasks',
         'task map fast tasks',
         'tree reduction 100ms tasks',
         'tree reduction fast tasks',
         'sequential',
         'nearest neighbor 100ms tasks',
         'nearest neighbor fast tasks',
         'dynamic tree reduction 100ms tasks',
         'dynamic tree reduction fast tasks']

from toolz import partition_all

L = df2.loc['tasks'].loc[names].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))

# In[7]:

names = ['create random',
         'blockwise 100ms tasks',
         'elementwise computation',
         'reduction',
         'reduction along axis',
         'random access',
         'transpose addition',
         'rechunk large',
         'nearest neighbor fast tasks',
         'nearest neighbor 100ms tasks']

from toolz import partition_all

L = df2.loc['arrays'].loc[names].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))

# In[8]:

# Every dataframe benchmark is shown, in index order; no name list is needed.
L = df2.loc['dataframes'].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))

# In[9]:

from bokeh.palettes import viridis, Category10
from bokeh.models.widgets import Panel, Tabs

df3 = df.set_index(['collection', 'name'])[['n', 'rate', 'unit']]
n = df3.loc['tasks', 'task map 1s tasks'].n.values

colors = Category10[3]


def log_linear_plot(collection, names, title, legends=('100ms', '1us')):
    """Build a two-tab widget showing benchmarks on log and on linear axes.

    Parameters
    ----------
    collection : str
        First-level key into the module-level ``df3`` index.
    names : sequence of str
        Second-level keys into ``df3``; one colored line is drawn per name.
        Line colors come from the module-level ``colors`` palette, indexed by
        position in ``names``.
    title : str
        Title used for both figures.
    legends : sequence of str
        Legend entry for each name, matched by position.

    Returns
    -------
    A ``Tabs`` widget with one ``Panel`` per axis type ('log', then 'linear').
    """
    # Look each benchmark up once instead of once per use per axis type.
    parts = [df3.loc[collection, name] for name in names]
    unit = parts[0].iloc[0]['unit']
    y_max = max([part.rate.max() for part in parts])
    # The x ticks are taken from the last benchmark listed.
    ticks = parts[-1].n.values

    panels = []
    for axis_type in ['log', 'linear']:
        fig = figure(title=title,
                     # sizing_mode='scale_width',
                     x_axis_type=axis_type,
                     y_axis_type=axis_type,
                     height=400,
                     width=400)

        for i, part in enumerate(parts):
            x = part.n.values
            y = part.rate.values
            fig.line(x=x, y=y, color=colors[i], legend=legends[i])
            fig.circle(x=x, y=y)

        for part in parts:
            # Add in perfect scaling line
            x = part.n.values
            y = part.rate.values
            mn = x.min()
            mx = x.max()
            # Use the rate measured at the smallest ``n`` rather than
            # whichever row happens to come first, as ``scaling_plot`` does.
            slope = y[x.argmin()] / mn
            fig.line(x=[0, mx], y=[0, slope * mx],
                     color='gray', line_dash='dashed')

        # Clip the y range to the measured data so the reference lines do
        # not stretch the axis.
        fig.y_range.end = y_max
        fig.xaxis.axis_label = 'cores'
        fig.yaxis.axis_label = unit
        fig.x_range.start = 0
        fig.y_range.start = 0
        fig.xaxis.ticker = ticks
        fig.legend.location = 'bottom_right'

        panels.append(Panel(child=fig, title=axis_type))

    tabs = Tabs(tabs=panels)
    return tabs


# In[10]:

figures = {}

names = ['task map 1s tasks',
         'task map 100ms tasks',
         'task map fast tasks',]
legends = ['1s', '100ms', '1us']
fig = log_linear_plot('tasks', names, 'Tasks: Embarrassingly Parallel', legends)
figures['tasks-embarrasssing'] = fig

fig = log_linear_plot('tasks',
                      ['tree reduction 100ms tasks', 'tree reduction fast tasks'],
                      'Tasks: Tree Reduction')
figures['tasks-reduction'] = fig

fig = log_linear_plot('tasks',
                      ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'],
                      'Tasks: Nearest Neighbor')
figures['tasks-nearest-neighbor'] = fig

fig = log_linear_plot('tasks', ['sequential'], 'Tasks: Sequential',
                      legends=['fast'])
figures['tasks-sequential'] = fig

fig = log_linear_plot('tasks',
                      ['dynamic tree reduction 100ms tasks',
                       'dynamic tree reduction fast tasks'],
                      'Tasks: Dynamic Reduction')
figures['tasks-dynamic-reduction'] = fig

L = [['tasks-embarrasssing', 'tasks-sequential'],
     ['tasks-nearest-neighbor', 'tasks-dynamic-reduction']]
figs = [[figures[k] for k in kk] for kk in L]
grid = gridplot(figs, sizing_mode='scale_width')
figures['tasks-grid'] = grid

# In[11]:

show(grid)

# ### Arrays

# In[12]:

df[df.collection == 'arrays'].name.unique()

# In[13]:

fig = log_linear_plot('arrays', ['create random'], 'Arrays: Create',
                      legends=['random'])
figures['array-create'] = fig

fig = log_linear_plot('arrays', ['elementwise computation'],
                      'Arrays: Elementwise',
                      legends=['sin(x)**2 + cos(x)**2'])
figures['array-elementwise'] = fig

# Title matches the benchmarks plotted (it previously read
# 'Arrays: Nearest Neighbor', copied from the tasks section).
fig = log_linear_plot('arrays', ['reduction', 'reduction along axis'],
                      'Arrays: Reductions',
                      legends=['x.std()', 'x.std(axis=0)'])
figures['array-reductions'] = fig

fig = log_linear_plot('arrays', ['random access'], 'Arrays: Random Access',
                      legends=['x[12345, 23456]'])
figures['array-random-access'] = fig

fig = log_linear_plot('arrays', ['transpose addition'],
                      'Arrays: Bulk Communication',
                      legends=['x + x.T'])
figures['array-transpose'] = fig

fig = log_linear_plot('arrays', ['rechunk large'], 'Arrays: Rechunking',
                      legends=['x.rechunk(...)'])
figures['array-rechunk'] = fig

fig = log_linear_plot('arrays',
                      ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'],
                      'Arrays: Map Overlap')
figures['array-overlap'] = fig

grid = [['array-create', 'array-elementwise'],
        ['array-reductions', 'array-random-access'],
        ['array-transpose', 'array-rechunk']]
grid = gridplot([[figures[name] for name in L] for L in grid],
                sizing_mode='scale_width')
figures['array-grid'] = grid

# In[14]:

show(figures['array-grid'])

# ### Dataframes

# In[15]:

df[df.collection == 'dataframes'].name.unique()

# In[16]:

fig = log_linear_plot('dataframes', ['create random'], 'DataFrames: Create',
                      legends=['random'])
figures['dataframe-create'] = fig

fig = log_linear_plot('dataframes', ['blockwise 100ms tasks', 'arithmetic'],
                      'DataFrames: Elementwise',
                      legends=['100ms', 'arithmetic'])
figures['dataframe-elementwise'] = fig

fig = log_linear_plot('dataframes', ['random access'],
                      'DataFrames: Random Access',
                      legends=['df.loc[123456]'])
figures['dataframe-random-access'] = fig

fig = log_linear_plot('dataframes',
                      ['dataframe reduction', 'series reduction', 'groupby reduction'],
                      'DataFrames: Reductions',
                      legends=['df.std()', 'df[0].std()', 'df.groupby(0)[1].mean()'])
figures['dataframe-reductions'] = fig

fig = log_linear_plot('dataframes',
                      ['groupby apply (full shuffle)', 'set index (full shuffle)'],
                      'DataFrames: Full Shuffle',
                      legends=['df.groupby(...).apply(...)', 'df.set_index(...)'])
figures['dataframe-shuffle'] = fig

fig = log_linear_plot('dataframes', ['rolling aggregations'],
                      'DataFrames: Time Series',
                      legends=['df.rolling(...).mean()'])
figures['dataframe-time-series'] = fig

grid = [['dataframe-create', 'dataframe-elementwise'],
        ['dataframe-random-access', 'dataframe-reductions'],
        ['dataframe-shuffle', 'dataframe-time-series']]
grid = gridplot([[figures[name] for name in L] for L in grid],
                sizing_mode='scale_width')
figures['dataframe-grid'] = grid

# In[17]:

show(grid)

# In[18]:

from bokeh.embed import components

script, divs = components(list(figures.values()))

# ```python
# with open('/home/mrocklin/workspace/blog/_posts/work/2017-06-27-scaling.md', 'at') as f:
#
#     for div in divs:
#         f.write(div)
#         f.write('\n\n')
#     f.write(script)
# ```