import gcsfs
import pandas as pd
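# Load the benchmark timings from the dask-data bucket on Google Cloud Storage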
gcs = gcsfs.GCSFileSystem()
with gcs.open('dask-data/scaling-data.csv') as f:
    df = pd.read_csv(f)
df
 | collection | name | n | unit | duration | rate |
---|---|---|---|---|---|---|
0 | arrays | blockwise 100ms tasks | 2 | MB/s | 0.930781 | 429.738441 |
1 | arrays | blockwise 100ms tasks | 4 | MB/s | 0.847067 | 944.435197 |
2 | arrays | blockwise 100ms tasks | 8 | MB/s | 1.053046 | 1519.373375 |
3 | arrays | blockwise 100ms tasks | 16 | MB/s | 0.967746 | 3306.651083 |
4 | arrays | blockwise 100ms tasks | 32 | MB/s | 1.113454 | 5747.768011 |
5 | arrays | blockwise 100ms tasks | 64 | MB/s | 0.967302 | 13232.684357 |
6 | arrays | blockwise 100ms tasks | 128 | MB/s | 1.289514 | 19852.058765 |
7 | arrays | blockwise 100ms tasks | 256 | MB/s | 1.730550 | 29585.964835 |
8 | arrays | blockwise 100ms tasks | 512 | MB/s | 6.030538 | 16980.218490 |
9 | arrays | create random | 2 | MB/s | 0.584174 | 684.713900 |
10 | arrays | create random | 4 | MB/s | 0.655029 | 1221.319419 |
11 | arrays | create random | 8 | MB/s | 0.721304 | 2218.161714 |
12 | arrays | create random | 16 | MB/s | 0.664351 | 4816.727556 |
13 | arrays | create random | 32 | MB/s | 0.718007 | 8913.387113 |
14 | arrays | create random | 64 | MB/s | 0.738002 | 17344.136192 |
15 | arrays | create random | 128 | MB/s | 1.200436 | 21325.182436 |
16 | arrays | create random | 256 | MB/s | 2.564872 | 19962.007739 |
17 | arrays | create random | 512 | MB/s | 1.880598 | 54450.695152 |
18 | arrays | elementwise computation | 2 | MB/s | 2.708794 | 147.664339 |
19 | arrays | elementwise computation | 4 | MB/s | 2.908163 | 275.087715 |
20 | arrays | elementwise computation | 8 | MB/s | 3.580060 | 446.911371 |
21 | arrays | elementwise computation | 16 | MB/s | 3.964876 | 807.087044 |
22 | arrays | elementwise computation | 32 | MB/s | 5.441955 | 1176.025319 |
23 | arrays | elementwise computation | 64 | MB/s | 7.383476 | 1733.600921 |
24 | arrays | elementwise computation | 128 | MB/s | 13.116995 | 1951.629145 |
25 | arrays | elementwise computation | 256 | MB/s | 21.965428 | 2330.935700 |
26 | arrays | elementwise computation | 512 | MB/s | 42.592467 | 2404.177413 |
27 | arrays | nearest neighbor 100ms tasks | 2 | MB/s | 1.236899 | 323.383141 |
28 | arrays | nearest neighbor 100ms tasks | 4 | MB/s | 1.160823 | 689.166299 |
29 | arrays | nearest neighbor 100ms tasks | 8 | MB/s | 1.509198 | 1060.145264 |
... | ... | ... | ... | ... | ... | ... |
249 | tasks | task map 1s tasks | 128 | tasks/s | 5.590187 | 91.589068 |
250 | tasks | task map 1s tasks | 256 | tasks/s | 5.644582 | 181.412908 |
251 | tasks | task map 1s tasks | 512 | tasks/s | 7.549867 | 271.263068 |
252 | tasks | task map fast tasks | 2 | tasks/s | 0.221133 | 1808.870110 |
253 | tasks | task map fast tasks | 4 | tasks/s | 0.241904 | 3307.103271 |
254 | tasks | task map fast tasks | 8 | tasks/s | 0.486054 | 3291.817309 |
255 | tasks | task map fast tasks | 16 | tasks/s | 0.800579 | 3997.107929 |
256 | tasks | task map fast tasks | 32 | tasks/s | 1.796089 | 3563.297941 |
257 | tasks | task map fast tasks | 64 | tasks/s | 3.346050 | 3825.405776 |
258 | tasks | task map fast tasks | 128 | tasks/s | 6.896783 | 3711.875589 |
259 | tasks | task map fast tasks | 256 | tasks/s | 14.315529 | 3576.535636 |
260 | tasks | task map fast tasks | 512 | tasks/s | 29.077214 | 3521.657944 |
261 | tasks | tree reduction 100ms tasks | 2 | tasks/s | 6.856049 | 37.339291 |
262 | tasks | tree reduction 100ms tasks | 4 | tasks/s | 7.081078 | 72.305377 |
263 | tasks | tree reduction 100ms tasks | 8 | tasks/s | 7.219373 | 141.840563 |
264 | tasks | tree reduction 100ms tasks | 16 | tasks/s | 7.429270 | 275.666383 |
265 | tasks | tree reduction 100ms tasks | 32 | tasks/s | 7.793831 | 525.543849 |
266 | tasks | tree reduction 100ms tasks | 64 | tasks/s | 8.285013 | 988.773376 |
267 | tasks | tree reduction 100ms tasks | 128 | tasks/s | 8.970450 | 1826.441178 |
268 | tasks | tree reduction 100ms tasks | 256 | tasks/s | 12.109690 | 2705.932256 |
269 | tasks | tree reduction 100ms tasks | 512 | tasks/s | 18.223943 | 3596.148117 |
270 | tasks | tree reduction fast tasks | 2 | tasks/s | 0.202223 | 1265.927313 |
271 | tasks | tree reduction fast tasks | 4 | tasks/s | 0.294252 | 1740.005646 |
272 | tasks | tree reduction fast tasks | 8 | tasks/s | 0.505821 | 2024.432563 |
273 | tasks | tree reduction fast tasks | 16 | tasks/s | 0.864661 | 2368.557067 |
274 | tasks | tree reduction fast tasks | 32 | tasks/s | 1.395554 | 2935.035983 |
275 | tasks | tree reduction fast tasks | 64 | tasks/s | 2.916794 | 2808.563041 |
276 | tasks | tree reduction fast tasks | 128 | tasks/s | 5.950359 | 2753.447264 |
277 | tasks | tree reduction fast tasks | 256 | tasks/s | 11.853272 | 2764.468701 |
278 | tasks | tree reduction fast tasks | 512 | tasks/s | 25.379409 | 2582.250800 |
279 rows × 6 columns
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import Row, Column, gridplot
output_notebook()
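# scaling_plot draws one figure per benchmark: measured rate against number of cores,
# with a dashed gray line marking perfect linear scaling extrapolated from the smallest run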
def scaling_plot(part, axis_type='linear'):
    row = part.iloc[0]
    title = (row['collection'] + ': ' + row['name']).title().replace('0M', '0m').replace('1S', '1s')
    fig = figure(title=title, sizing_mode='scale_width', x_axis_type=axis_type, y_axis_type=axis_type)
    fig.line(x=part.n, y=part.rate)
    fig.circle(x=part.n, y=part.rate)
    fig.xaxis.axis_label = 'cores'
    fig.yaxis.axis_label = row['unit']
    fig.x_range.start = 0
    fig.y_range.start = 0

    # Add in perfect scaling line
    mn = part.n.min()
    mx = part.n.max()
    slope = part[part.n == mn].iloc[0]['rate'] / mn
    fig.line(x=[0, mx], y=[0, slope * mx], color='gray', line_dash='dashed')

    fig.y_range.end = part.rate.max()
    fig.xaxis.ticker = part.n
    return fig
df2 = df.groupby(['collection', 'name'])[['collection', 'name', 'n', 'rate', 'unit']].apply(scaling_plot)
df2
collection  name
arrays      blockwise 100ms tasks                 Figure(id='bcca8fc0-7fe6-458b-9c19-6cb29d44131...
            create random                         Figure(id='d8bcdc1e-f3d9-42c6-94ce-75464f676e8...
            elementwise computation               Figure(id='617352a4-ce31-45bd-ae48-9f273d04978...
            nearest neighbor 100ms tasks          Figure(id='c533f293-6da4-4fcd-9981-cd4a228fcec...
            nearest neighbor fast tasks           Figure(id='181550ff-8183-4494-899d-9e7b16db240...
            random access                         Figure(id='e68756de-ce1c-440f-b554-3910b12598f...
            rechunk large                         Figure(id='98b62a74-238a-4dbb-a00f-aac75621172...
            rechunk small                         Figure(id='2a98d534-b0e7-4c5a-bd28-32d5aae870f...
            reduction                             Figure(id='0834997a-d482-4e1c-a0cf-8c980bd5da8...
            reduction along axis                  Figure(id='60a7509c-35e7-46cc-b547-2072701d7fc...
            transpose addition                    Figure(id='f4429e42-0003-4c4a-b53f-d32be1fe2a7...
dataframes  arithmetic                            Figure(id='bb3cdc16-306a-4eca-8bea-d78af7d011d...
            blockwise 100ms tasks                 Figure(id='6c694c69-c303-49b3-8b45-b3aaa65486e...
            create random                         Figure(id='7fa63a9f-6a95-4d73-a8dc-a138c0e305d...
            dataframe reduction                   Figure(id='3b2726ce-eefe-42f8-8771-fd816e2065c...
            groupby apply (full shuffle)          Figure(id='fe8d51a6-bedf-421b-abd7-3c1c36d5eeb...
            groupby reduction                     Figure(id='345dd894-9c0e-4029-a5f3-93a7bcf4646...
            random access                         Figure(id='52e54ceb-d341-4a3f-a7ec-927da4854b5...
            rolling aggregations                  Figure(id='d9b90423-449b-4214-8a57-5f4c2834716...
            series reduction                      Figure(id='6a1b3578-4c22-4fcf-99c9-aaff125fc6e...
            set index (full shuffle)              Figure(id='57ed361b-be6c-488d-938b-2c723fb1750...
tasks       dynamic tree reduction 100ms tasks    Figure(id='e4df49b6-7857-4bfb-b716-fb3ddaed3f9...
            dynamic tree reduction fast tasks     Figure(id='efb02e2b-e8e1-41e7-9146-b183ed9475f...
            nearest neighbor 100ms tasks          Figure(id='3dfadbf7-7a01-41e6-a4e3-907cf0a6189...
            nearest neighbor fast tasks           Figure(id='8e774f01-9d70-4656-a01f-71f1186dbda...
            sequential                            Figure(id='c9d51a70-70c8-4b7b-b976-260fd0756c9...
            task map 100ms tasks                  Figure(id='682118d9-44c2-4cd3-bd82-9716c23ec89...
            task map 1s tasks                     Figure(id='504aa97b-a65b-470f-bf26-42d88766680...
            task map fast tasks                   Figure(id='b2a2e084-afb3-4e2d-b29f-9802b4ddd34...
            tree reduction 100ms tasks            Figure(id='ee7ad037-8c0a-4b84-baa0-77e64dff90b...
            tree reduction fast tasks             Figure(id='5599840f-08e1-436d-afd9-0049ff7905f...
dtype: object
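# Arrange the task-scheduling figures three per row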
names = ['task map 1s tasks', 'task map 100ms tasks', 'task map fast tasks',
         'tree reduction 100ms tasks', 'tree reduction fast tasks', 'sequential',
         'nearest neighbor 100ms tasks', 'nearest neighbor fast tasks',
         'dynamic tree reduction 100ms tasks', 'dynamic tree reduction fast tasks']
from toolz import partition_all
L = df2.loc['tasks'].loc[names].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))
names = ['create random', 'blockwise 100ms tasks', 'elementwise computation', 'reduction',
         'reduction along axis', 'random access', 'transpose addition', 'rechunk large',
         'nearest neighbor fast tasks', 'nearest neighbor 100ms tasks']
from toolz import partition_all
L = df2.loc['arrays'].loc[names].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))
L = df2.loc['dataframes'].values.tolist()
grid = list(partition_all(3, L))
# show(Column(*[Row(*g, sizing_mode='scale_width') for g in grid], sizing_mode='scale_width'))
show(gridplot(grid, sizing_mode='scale_width'))
from bokeh.palettes import viridis, Category10
from bokeh.models.widgets import Panel, Tabs
df3 = df.set_index(['collection', 'name'])[['n', 'rate', 'unit']]
n = df3.loc['tasks', 'task map 1s tasks'].n.values
colors = Category10[3]
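# log_linear_plot overlays several benchmarks on one figure and returns a Tabs widget
# with log-scaled and linear-scaled versions of the same plot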
def log_linear_plot(collection, names, title, legends=('100ms', '1us')):
    panels = []
    unit = df3.loc[collection, names[0]].iloc[0]['unit']
    for axis_type in ['log', 'linear']:
        fig = figure(title=title,  # sizing_mode='scale_width',
                     x_axis_type=axis_type, y_axis_type=axis_type, height=400, width=400)
        for i, name in enumerate(names):
            x = df3.loc[collection, name].n.values
            y = df3.loc[collection, name].rate.values
            fig.line(x=x, y=y, color=colors[i], legend=legends[i])
            fig.circle(x=x, y=y)
        for name in names:
            # Add in perfect scaling line
            x = df3.loc[collection, name].n.values
            y = df3.loc[collection, name].rate.values
            mn = x.min()
            mx = x.max()
            slope = y[0] / mn
            fig.line(x=[0, mx], y=[0, slope * mx], color='gray', line_dash='dashed')
        fig.y_range.end = max([df3.loc[collection, name].rate.max() for name in names])
        fig.xaxis.axis_label = 'cores'
        fig.yaxis.axis_label = unit
        fig.x_range.start = 0
        fig.y_range.start = 0
        fig.xaxis.ticker = x
        fig.legend.location = 'bottom_right'
        panel = Panel(child=fig, title=axis_type)
        panels.append(panel)
    tabs = Tabs(tabs=panels)
    return tabs
figures = {}
names = ['task map 1s tasks', 'task map 100ms tasks', 'task map fast tasks',]
legends = ['1s', '100ms', '1us']
fig = log_linear_plot('tasks', names, 'Tasks: Embarrassingly Parallel', legends)
figures['tasks-embarrassing'] = fig
fig = log_linear_plot('tasks', ['tree reduction 100ms tasks', 'tree reduction fast tasks'], 'Tasks: Tree Reduction')
figures['tasks-reduction'] = fig
fig = log_linear_plot('tasks', ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'], 'Tasks: Nearest Neighbor')
figures['tasks-nearest-neighbor'] = fig
fig = log_linear_plot('tasks', ['sequential'], 'Tasks: Sequential', legends=['fast'])
figures['tasks-sequential'] = fig
fig = log_linear_plot('tasks', ['dynamic tree reduction 100ms tasks', 'dynamic tree reduction fast tasks'], 'Tasks: Dynamic Reduction')
figures['tasks-dynamic-reduction'] = fig
L = [['tasks-embarrassing', 'tasks-sequential'], ['tasks-nearest-neighbor', 'tasks-dynamic-reduction']]
figs = [[figures[k] for k in kk] for kk in L]
grid = gridplot(figs, sizing_mode='scale_width')
figures['tasks-grid'] = grid
show(grid)
df[df.collection == 'arrays'].name.unique()
array(['blockwise 100ms tasks', 'create random', 'elementwise computation', 'nearest neighbor 100ms tasks', 'nearest neighbor fast tasks', 'random access', 'rechunk large', 'rechunk small', 'reduction', 'reduction along axis', 'transpose addition'], dtype=object)
fig = log_linear_plot('arrays', ['create random'], 'Arrays: Create', legends=['random'])
figures['array-create'] = fig
fig = log_linear_plot('arrays', ['elementwise computation'], 'Arrays: Elementwise', legends=['sin(x)**2 + cos(x)**2'])
figures['array-elementwise'] = fig
fig = log_linear_plot('arrays', ['reduction', 'reduction along axis'], 'Arrays: Reductions', legends=['x.std()', 'x.std(axis=0)'])
figures['array-reductions'] = fig
fig = log_linear_plot('arrays', ['random access'], 'Arrays: Random Access', legends=['x[12345, 23456]'])
figures['array-random-access'] = fig
fig = log_linear_plot('arrays', ['transpose addition'], 'Arrays: Bulk Communication', legends=['x + x.T'])
figures['array-transpose'] = fig
fig = log_linear_plot('arrays', ['rechunk large'], 'Arrays: Rechunking', legends=['x.rechunk(...)'])
figures['array-rechunk'] = fig
fig = log_linear_plot('arrays', ['nearest neighbor 100ms tasks', 'nearest neighbor fast tasks'], 'Arrays: Map Overlap')
figures['array-overlap'] = fig
grid = [['array-create', 'array-elementwise'],
        ['array-reductions', 'array-random-access'],
        ['array-transpose', 'array-rechunk']]
grid = gridplot([[figures[name] for name in L] for L in grid], sizing_mode='scale_width')
figures['array-grid'] = grid
show(figures['array-grid'])
df[df.collection == 'dataframes'].name.unique()
array(['arithmetic', 'blockwise 100ms tasks', 'create random', 'dataframe reduction', 'groupby apply (full shuffle)', 'groupby reduction', 'random access', 'rolling aggregations', 'series reduction', 'set index (full shuffle)'], dtype=object)
fig = log_linear_plot('dataframes', ['create random'], 'DataFrames: Create', legends=['random'])
figures['dataframe-create'] = fig
fig = log_linear_plot('dataframes', ['blockwise 100ms tasks', 'arithmetic'], 'DataFrames: Elementwise', legends=['100ms', 'arithmetic'])
figures['dataframe-elementwise'] = fig
fig = log_linear_plot('dataframes', ['random access'], 'DataFrames: Random Access', legends=['df.loc[123456]'])
figures['dataframe-random-access'] = fig
fig = log_linear_plot('dataframes', ['dataframe reduction', 'series reduction', 'groupby reduction'],
                      'DataFrames: Reductions', legends=['df.std()', 'df[0].std()', 'df.groupby(0)[1].mean()'])
figures['dataframe-reductions'] = fig
fig = log_linear_plot('dataframes', ['groupby apply (full shuffle)', 'set index (full shuffle)'],
                      'DataFrames: Full Shuffle', legends=['df.groupby(...).apply(...)', 'df.set_index(...)'])
figures['dataframe-shuffle'] = fig
fig = log_linear_plot('dataframes', ['rolling aggregations'], 'DataFrames: Time Series', legends=['df.rolling(...).mean()'])
figures['dataframe-time-series'] = fig
grid = [['dataframe-create', 'dataframe-elementwise'],
        ['dataframe-random-access', 'dataframe-reductions'],
        ['dataframe-shuffle', 'dataframe-time-series']]
grid = gridplot([[figures[name] for name in L] for L in grid], sizing_mode='scale_width')
figures['dataframe-grid'] = grid
show(grid)
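# Embed the figures into the blog post: components() returns one <script> block
# and one <div> per figure, which get appended to the markdown source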
from bokeh.embed import components
script, divs = components(list(figures.values()))
with open('/home/mrocklin/workspace/blog/_posts/work/2017-06-27-scaling.md', 'at') as f:
    for div in divs:
        f.write(div)
        f.write('\n\n')
    f.write(script)