import sys
sys.version
'3.6.7 (default, Oct 21 2018, 04:56:05) \n[GCC 5.4.0 20160609]'
import pandas as pd
import numpy as np
import swifter
pd.__version__, np.__version__,swifter.__version__
('0.25.1', '1.16.3', '0.292')
# Seed NumPy's global RNG first so the generated column is the same on every run.
np.random.seed(42)
# Benchmark input: a one-column DataFrame ('x') holding 30,000,000 random floats.
df1 = pd.DataFrame({
'x': np.random.random(size=30000000)
})
apply
df1['x'].mean()
0.5000156711783587
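Vectorizable functions: swifter series.apply is fastest (190 ms wall time), narrowly ahead of swifter dataframe.apply (284 ms) and plain dataframe.apply (548 ms); plain series.apply is far slower (6.37 s)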
def apply_to_array(arr):
    """Return ``arr * 2 + 3`` computed element-wise with NumPy ufuncs.

    Used as the array-at-a-time benchmark function: it is handed to
    ``DataFrame.apply`` and therefore receives a whole column per call.

    Parameters
    ----------
    arr : array-like or scalar
        Anything ``np.multiply`` accepts (ndarray, Series, list, number).

    Returns
    -------
    The result of ``np.add(np.multiply(arr, 2), 3)``.
    """
    doubled = np.multiply(arr, 2)
    return np.add(doubled, 3)
def apply_to_element(elem):
    """Return ``elem * 2 + 3``.

    Used as the element-at-a-time benchmark function: it is handed to
    ``Series.apply``. It relies only on the ``*`` and ``+`` operators, so it
    works on any operand that supports them.

    Parameters
    ----------
    elem : number (or any object supporting ``*`` and ``+``)

    Returns
    -------
    ``(elem * 2) + 3``
    """
    return (elem * 2) + 3
%%time
#dataframe.apply
df1[['x']].apply(apply_to_array)
True
CPU times: user 172 ms, sys: 376 ms, total: 548 ms Wall time: 548 ms
%%time
# series.apply
df1['x'].apply(apply_to_element)
True
CPU times: user 5.8 s, sys: 576 ms, total: 6.38 s Wall time: 6.37 s
%%time
# swifter dataframe.apply
df1[['x']].swifter.apply(apply_to_array)
True
CPU times: user 140 ms, sys: 148 ms, total: 288 ms Wall time: 284 ms
%%time
# swifter series.apply
df1['x'].swifter.apply(apply_to_element)
True
CPU times: user 72 ms, sys: 120 ms, total: 192 ms Wall time: 190 ms
String functions: regular series.apply wins (23.8 s wall time); swifter series.apply is nearly six times slower (2 min 17 s), with the progress bar showing it ran as a Dask apply
def num_to_str(num):
    """Return the ``str()`` representation of ``num``.

    Used as the string-conversion benchmark function passed to
    ``Series.apply``.

    Parameters
    ----------
    num : object
        Value to convert; the benchmark passes floats.

    Returns
    -------
    str
    """
    return str(num)
%%time
# series.apply
df1['x'].apply(num_to_str)
True
CPU times: user 22.8 s, sys: 952 ms, total: 23.8 s Wall time: 23.8 s
%%time
# swifter series.apply
df1['x'].swifter.apply(num_to_str)
True
HBox(children=(IntProgress(value=0, description='Dask Apply', max=32, style=ProgressStyle(description_width='i…
CPU times: user 1min 40s, sys: 5.66 s, total: 1min 45s Wall time: 2min 17s
If-then-else: swifter series.apply wins by a small margin (3.83 s vs. 4.57 s wall time for regular series.apply)
def if_then_else(x):
    """Return ``True`` when ``x >= 0.5``, otherwise ``False``.

    Used as the branching benchmark function passed to ``Series.apply``.
    The explicit ``if`` is kept on purpose: the branch is the thing being
    benchmarked, so it is not collapsed into ``return x >= 0.5``.

    Parameters
    ----------
    x : number
        Scalar compared against the 0.5 threshold.

    Returns
    -------
    bool
        Always a plain Python ``True`` or ``False``.
    """
    if x >= 0.5:
        return True
    return False
%%time
# series.apply
df1['x'].apply(if_then_else)
True
CPU times: user 4.15 s, sys: 428 ms, total: 4.58 s Wall time: 4.57 s
%%time
# swifter series.apply
df1['x'].swifter.apply(if_then_else)
True
HBox(children=(IntProgress(value=0, description='Dask Apply', max=32, style=ProgressStyle(description_width='i…
CPU times: user 1 s, sys: 412 ms, total: 1.41 s Wall time: 3.83 s