import gcsfs
import pandas as pd
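# Load the benchmark timings from the dask-data bucket on Google Cloud Storage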
gcs = gcsfs.GCSFileSystem()
with gcs.open('dask-data/scaling-data.csv') as f:
    df = pd.read_csv(f)
df
 | collection | name | n | unit | duration | rate |
---|---|---|---|---|---|---|
0 | arrays | blockwise 100ms tasks | 2 | MB/s | 0.930781 | 429.738441 |
1 | arrays | blockwise 100ms tasks | 4 | MB/s | 0.847067 | 944.435197 |
2 | arrays | blockwise 100ms tasks | 8 | MB/s | 1.053046 | 1519.373375 |
3 | arrays | blockwise 100ms tasks | 16 | MB/s | 0.967746 | 3306.651083 |
4 | arrays | blockwise 100ms tasks | 32 | MB/s | 1.113454 | 5747.768011 |
5 | arrays | blockwise 100ms tasks | 64 | MB/s | 0.967302 | 13232.684357 |
6 | arrays | blockwise 100ms tasks | 128 | MB/s | 1.289514 | 19852.058765 |
7 | arrays | blockwise 100ms tasks | 256 | MB/s | 1.730550 | 29585.964835 |
8 | arrays | blockwise 100ms tasks | 512 | MB/s | 6.030538 | 16980.218490 |
9 | arrays | create random | 2 | MB/s | 0.584174 | 684.713900 |
10 | arrays | create random | 4 | MB/s | 0.655029 | 1221.319419 |
11 | arrays | create random | 8 | MB/s | 0.721304 | 2218.161714 |
12 | arrays | create random | 16 | MB/s | 0.664351 | 4816.727556 |
13 | arrays | create random | 32 | MB/s | 0.718007 | 8913.387113 |
14 | arrays | create random | 64 | MB/s | 0.738002 | 17344.136192 |
15 | arrays | create random | 128 | MB/s | 1.200436 | 21325.182436 |
16 | arrays | create random | 256 | MB/s | 2.564872 | 19962.007739 |
17 | arrays | create random | 512 | MB/s | 1.880598 | 54450.695152 |
18 | arrays | elementwise computation | 2 | MB/s | 2.708794 | 147.664339 |
19 | arrays | elementwise computation | 4 | MB/s | 2.908163 | 275.087715 |
20 | arrays | elementwise computation | 8 | MB/s | 3.580060 | 446.911371 |
21 | arrays | elementwise computation | 16 | MB/s | 3.964876 | 807.087044 |
22 | arrays | elementwise computation | 32 | MB/s | 5.441955 | 1176.025319 |
23 | arrays | elementwise computation | 64 | MB/s | 7.383476 | 1733.600921 |
24 | arrays | elementwise computation | 128 | MB/s | 13.116995 | 1951.629145 |
25 | arrays | elementwise computation | 256 | MB/s | 21.965428 | 2330.935700 |
26 | arrays | elementwise computation | 512 | MB/s | 42.592467 | 2404.177413 |
27 | arrays | nearest neighbor 100ms tasks | 2 | MB/s | 1.236899 | 323.383141 |
28 | arrays | nearest neighbor 100ms tasks | 4 | MB/s | 1.160823 | 689.166299 |
29 | arrays | nearest neighbor 100ms tasks | 8 | MB/s | 1.509198 | 1060.145264 |
... | ... | ... | ... | ... | ... | ... |
249 | tasks | task map 1s tasks | 128 | tasks/s | 5.590187 | 91.589068 |
250 | tasks | task map 1s tasks | 256 | tasks/s | 5.644582 | 181.412908 |
251 | tasks | task map 1s tasks | 512 | tasks/s | 7.549867 | 271.263068 |
252 | tasks | task map fast tasks | 2 | tasks/s | 0.221133 | 1808.870110 |
253 | tasks | task map fast tasks | 4 | tasks/s | 0.241904 | 3307.103271 |
254 | tasks | task map fast tasks | 8 | tasks/s | 0.486054 | 3291.817309 |
255 | tasks | task map fast tasks | 16 | tasks/s | 0.800579 | 3997.107929 |
256 | tasks | task map fast tasks | 32 | tasks/s | 1.796089 | 3563.297941 |
257 | tasks | task map fast tasks | 64 | tasks/s | 3.346050 | 3825.405776 |
258 | tasks | task map fast tasks | 128 | tasks/s | 6.896783 | 3711.875589 |
259 | tasks | task map fast tasks | 256 | tasks/s | 14.315529 | 3576.535636 |
260 | tasks | task map fast tasks | 512 | tasks/s | 29.077214 | 3521.657944 |
261 | tasks | tree reduction 100ms tasks | 2 | tasks/s | 6.856049 | 37.339291 |
262 | tasks | tree reduction 100ms tasks | 4 | tasks/s | 7.081078 | 72.305377 |
263 | tasks | tree reduction 100ms tasks | 8 | tasks/s | 7.219373 | 141.840563 |
264 | tasks | tree reduction 100ms tasks | 16 | tasks/s | 7.429270 | 275.666383 |
265 | tasks | tree reduction 100ms tasks | 32 | tasks/s | 7.793831 | 525.543849 |
266 | tasks | tree reduction 100ms tasks | 64 | tasks/s | 8.285013 | 988.773376 |
267 | tasks | tree reduction 100ms tasks | 128 | tasks/s | 8.970450 | 1826.441178 |
268 | tasks | tree reduction 100ms tasks | 256 | tasks/s | 12.109690 | 2705.932256 |
269 | tasks | tree reduction 100ms tasks | 512 | tasks/s | 18.223943 | 3596.148117 |
270 | tasks | tree reduction fast tasks | 2 | tasks/s | 0.202223 | 1265.927313 |
271 | tasks | tree reduction fast tasks | 4 | tasks/s | 0.294252 | 1740.005646 |
272 | tasks | tree reduction fast tasks | 8 | tasks/s | 0.505821 | 2024.432563 |
273 | tasks | tree reduction fast tasks | 16 | tasks/s | 0.864661 | 2368.557067 |
274 | tasks | tree reduction fast tasks | 32 | tasks/s | 1.395554 | 2935.035983 |
275 | tasks | tree reduction fast tasks | 64 | tasks/s | 2.916794 | 2808.563041 |
276 | tasks | tree reduction fast tasks | 128 | tasks/s | 5.950359 | 2753.447264 |
277 | tasks | tree reduction fast tasks | 256 | tasks/s | 11.853272 | 2764.468701 |
278 | tasks | tree reduction fast tasks | 512 | tasks/s | 25.379409 | 2582.250800 |
279 rows × 6 columns
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import Row, Column, gridplot
output_notebook()
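# scaling_plot draws one figure per benchmark: measured rate against number of cores,
# with a dashed gray line marking perfect linear scaling extrapolated from the smallest run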
def scaling_plot(part, axis_type='linear'):
    row = part.iloc[0]
    title = (row['collection'] + ': ' + row['name']).title().replace('0M', '0m').replace('1S', '1s')
    fig = figure(title=title, sizing_mode='scale_width', x_axis_type=axis_type, y_axis_type=axis_type)
    fig.line(x=part.n, y=part.rate)
    fig.circle(x=part.n, y=part.rate)
    fig.xaxis.axis_label = 'cores'
    fig.yaxis.axis_label = row['unit']
    fig.x_range.start = 0
    fig.y_range.start = 0

    # Add in perfect scaling line
    mn = part.n.min()
    mx = part.n.max()
    slope = part[part.n == mn].iloc[0]['rate'] / mn
    fig.line(x=[0, mx], y=[0, slope * mx], color='gray', line_dash='dashed')

    fig.y_range.end = part.rate.max()
    fig.xaxis.ticker = part.n
    return fig
df2 = df.groupby(['collection', 'name'])[['collection', 'name', 'n', 'rate', 'unit']].apply(scaling_plot)
df2
collection  name
arrays      blockwise 100ms tasks                 Figure(id='bcca8fc0-7fe6-458b-9c19-6cb29d44131...
            create random                         Figure(id='d8bcdc1e-f3d9-42c6-94ce-75464f676e8...
            elementwise computation               Figure(id='617352a4-ce31-45bd-ae48-9f273d04978...
            nearest neighbor 100ms tasks          Figure(id='c533f293-6da4-4fcd-9981-cd4a228fcec...
            nearest neighbor fast tasks           Figure(id='181550ff-8183-4494-899d-9e7b16db240...
            random access                         Figure(id='e68756de-ce1c-440f-b554-3910b12598f...
            rechunk large                         Figure(id='98b62a74-238a-4dbb-a00f-aac75621172...
            rechunk small                         Figure(id='2a98d534-b0e7-4c5a-bd28-32d5aae870f...
            reduction                             Figure(id='0834997a-d482-4e1c-a0cf-8c980bd5da8...
            reduction along axis                  Figure(id='60a7509c-35e7-46cc-b547-2072701d7fc...
            transpose addition                    Figure(id='f4429e42-0003-4c4a-b53f-d32be1fe2a7...
dataframes  arithmetic                            Figure(id='bb3cdc16-306a-4eca-8bea-d78af7d011d...
            blockwise 100ms tasks                 Figure(id='6c694c69-c303-49b3-8b45-b3aaa65486e...
            create random                         Figure(id='7fa63a9f-6a95-4d73-a8dc-a138c0e305d...
            dataframe reduction                   Figure(id='3b2726ce-eefe-42f8-8771-fd816e2065c...
            groupby apply (full shuffle)          Figure(id='fe8d51a6-bedf-421b-abd7-3c1c36d5eeb...
            groupby reduction                     Figure(id='345dd894-9c0e-4029-a5f3-93a7bcf4646...
            random access                         Figure(id='52e54ceb-d341-4a3f-a7ec-927da4854b5...
            rolling aggregations                  Figure(id='d9b90423-449b-4214-8a57-5f4c2834716...
            series reduction                      Figure(id='6a1b3578-4c22-4fcf-99c9-aaff125fc6e...
            set index (full shuffle)              Figure(id='57ed361b-be6c-488d-938b-2c723fb1750...
tasks       dynamic tree reduction 100ms tasks    Figure(id='e4df49b6-7857-4bfb-b716-fb3ddaed3f9...
            dynamic tree reduction fast tasks     Figure(id='efb02e2b-e8e1-41e7-9146-b183ed9475f...
            nearest neighbor 100ms tasks          Figure(id='3dfadbf7-7a01-41e6-a4e3-907cf0a6189...
            nearest neighbor fast tasks           Figure(id='8e774f01-9d70-4656-a01f-71f1186dbda...
            sequential                            Figure(id='c9d51a70-70c8-4b7b-b976-260fd0756c9...
            task map 100ms tasks                  Figure(id='682118d9-44c2-4cd3-bd82-9716c23ec89...
            task map 1s tasks                     Figure(id='504aa97b-a65b-470f-bf26-42d88766680...
            task map fast tasks                   Figure(id='b2a2e084-afb3-4e2d-b29f-9802b4ddd34...
            tree reduction 100ms tasks            Figure(id='ee7ad037-8c0a-4b84-baa0-77e64dff90b...
            tree reduction fast tasks             Figure(id='5599840f-08e1-436d-afd9-0049ff7905f...
dtype: object
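# Arrange the task-scheduling figures three per row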
names = ['task map 1s tasks', 'task map 100ms tasks', 'task map fast tasks',
         'tree reduction 100ms tasks', 'tree reduction fast tasks', 'sequential',
         'nearest neighbor 100ms tasks', 'nearest neighbor fast tasks',
         'dynamic tree reduction 100ms tasks', 'dynamic tree reduction fast tasks']
from toolz import partition_all
L = df2.loc['tasks'].loc[names].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))
names = ['create random', 'blockwise 100ms tasks', 'elementwise computation', 'reduction',
         'reduction along axis', 'random access', 'transpose addition', 'rechunk large',
         'nearest neighbor fast tasks', 'nearest neighbor 100ms tasks']
from toolz import partition_all
L = df2.loc['arrays'].loc[names].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))
L = df2.loc['dataframes'].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))
from bokeh.palettes import viridis, Category10
from bokeh.models.widgets import Panel, Tabs
df3 = df.set_index(['collection', 'name'])[['n', 'rate', 'unit']]
n = df3.loc['tasks', 'task map 1s tasks'].n.values
colors = Category10[3]
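# log_linear_plot overlays several benchmarks on one figure and returns a Tabs widget
# with log-scaled and linear-scaled versions of the same plot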
def log_linear_plot(collection, names, title, legends=('100ms', '1us')):
    panels = []
    unit = df3.loc[collection, names[0]].iloc[0]['unit']
    for axis_type in ['log', 'linear']:
        fig = figure(title=title,  # sizing_mode='scale_width',
                     x_axis_type=axis_type, y_axis_type=axis_type, height=400, width=400)
        for i, name in enumerate(names):
            x = df3.loc[collection, name].n.values
            y = df3.loc[collection, name].rate.values
            fig.line(x=x, y=y, color=colors[i], legend=legends[i])
            fig.circle(x=x, y=y)
        for name in names:
            # Add in perfect scaling line
            x = df3.loc[collection, name].n.values
            y = df3.loc[collection, name].rate.values
            mn = x.min()
            mx = x.max()
            slope = y[0] / mn
            fig.line(x=[0, mx], y=[0, slope * mx], color='gray', line_dash='dashed')
        fig.y_range.end = max([df3.loc[collection, name].rate.max() for name in names])
        fig.xaxis.axis_label = 'cores'
        fig.yaxis.axis_label = unit
        fig.x_range.start = 0
        fig.y_range.start = 0
        fig.xaxis.ticker = x
        fig.legend.location = 'bottom_right'
        panel = Panel(child=fig, title=axis_type)
        panels.append(panel)
    tabs = Tabs(tabs=panels)
    return tabs
figures = {}
names = ['task map 1s tasks', 'task map 100ms tasks', 'task map fast tasks',]
legends = ['1s', '100ms', '1us']
fig = log_linear_plot('tasks', names, 'Tasks: Embarrassingly Parallel', legends)
figures['tasks-embarrassing'] = fig
fig = log_linear_plot('tasks', ['tree reduction 100ms tasks', 'tree reduction fast tasks'], 'Tasks: Tree Reduction')
figures['tasks-reduction'] = fig
fig = log_linear_plot('tasks', ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'], 'Tasks: Nearest Neighbor')
figures['tasks-nearest-neighbor'] = fig
fig = log_linear_plot('tasks', ['sequential'], 'Tasks: Sequential', legends=['fast'])
figures['tasks-sequential'] = fig
fig = log_linear_plot('tasks', ['dynamic tree reduction 100ms tasks', 'dynamic tree reduction fast tasks'], 'Tasks: Dynamic Reduction')
figures['tasks-dynamic-reduction'] = fig
L = [['tasks-embarrassing', 'tasks-sequential'], ['tasks-nearest-neighbor', 'tasks-dynamic-reduction']]
figs = [[figures[k] for k in kk] for kk in L]
grid = gridplot(figs, sizing_mode='scale_width')
figures['tasks-grid'] = grid
show(grid)
df[df.collection == 'arrays'].name.unique()
array(['blockwise 100ms tasks', 'create random', 'elementwise computation', 'nearest neighbor 100ms tasks', 'nearest neighbor fast tasks', 'random access', 'rechunk large', 'rechunk small', 'reduction', 'reduction along axis', 'transpose addition'], dtype=object)
fig = log_linear_plot('arrays', ['create random'], 'Arrays: Create', legends=['random'])
figures['array-create'] = fig
fig = log_linear_plot('arrays', ['elementwise computation'], 'Arrays: Elementwise', legends=['sin(x)**2 + cos(x)**2'])
figures['array-elementwise'] = fig
fig = log_linear_plot('arrays', ['reduction', 'reduction along axis'], 'Arrays: Reductions', legends=['x.std()', 'x.std(axis=0)'])
figures['array-reductions'] = fig
fig = log_linear_plot('arrays', ['random access'], 'Arrays: Random Access', legends=['x[12345, 23456]'])
figures['array-random-access'] = fig
fig = log_linear_plot('arrays', ['transpose addition'], 'Arrays: Bulk Communication', legends=['x + x.T'])
figures['array-transpose'] = fig
fig = log_linear_plot('arrays', ['rechunk large'], 'Arrays: Rechunking', legends=['x.rechunk(...)'])
figures['array-rechunk'] = fig
fig = log_linear_plot('arrays', ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'], 'Arrays: Map Overlap')
figures['array-overlap'] = fig
grid = [['array-create', 'array-elementwise'],
        ['array-reductions', 'array-random-access'],
        ['array-transpose', 'array-rechunk']]
grid = gridplot([[figures[name] for name in L] for L in grid], sizing_mode='scale_width')
figures['array-grid'] = grid
show(figures['array-grid'])
df[df.collection == 'dataframes'].name.unique()
array(['arithmetic', 'blockwise 100ms tasks', 'create random', 'dataframe reduction', 'groupby apply (full shuffle)', 'groupby reduction', 'random access', 'rolling aggregations', 'series reduction', 'set index (full shuffle)'], dtype=object)
fig = log_linear_plot('dataframes', ['create random'], 'DataFrames: Create', legends=['random'])
figures['dataframe-create'] = fig
fig = log_linear_plot('dataframes', ['blockwise 100ms tasks', 'arithmetic'], 'DataFrames: Elementwise', legends=['100ms', 'arithmetic'])
figures['dataframe-elementwise'] = fig
fig = log_linear_plot('dataframes', ['random access'], 'DataFrames: Random Access', legends=['df.loc[123456]'])
figures['dataframe-random-access'] = fig
fig = log_linear_plot('dataframes', ['dataframe reduction', 'series reduction', 'groupby reduction'],
                      'DataFrames: Reductions', legends=['df.std()', 'df[0].std()', 'df.groupby(0)[1].mean()'])
figures['dataframe-reductions'] = fig
fig = log_linear_plot('dataframes', ['groupby apply (full shuffle)', 'set index (full shuffle)'],
                      'DataFrames: Full Shuffle', legends=['df.groupby(...).apply(...)', 'df.set_index(...)'])
figures['dataframe-shuffle'] = fig
fig = log_linear_plot('dataframes', ['rolling aggregations'], 'DataFrames: Time Series', legends=['df.rolling(...).mean()'])
figures['dataframe-time-series'] = fig
grid = [['dataframe-create', 'dataframe-elementwise'],
        ['dataframe-random-access', 'dataframe-reductions'],
        ['dataframe-shuffle', 'dataframe-time-series']]
grid = gridplot([[figures[name] for name in L] for L in grid], sizing_mode='scale_width')
figures['dataframe-grid'] = grid
show(grid)
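# Embed the figures into the blog post: components() returns one <script> block
# and one <div> per figure, which get appended to the markdown source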
from bokeh.embed import components
script, divs = components(list(figures.values()))
with open('/home/mrocklin/workspace/blog/_posts/work/2017-06-27-scaling.md', 'at') as f:
    for div in divs:
        f.write(div)
        f.write('\n\n')
    f.write(script)