# group creator
def create_frame(n, n_groups):
    """Build a random benchmark DataFrame with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of rows. Also used as the exclusive upper bound for the
        random integers in the ``value`` column.
    n_groups : int
        Exclusive upper bound for the random integer group labels in the
        ``name`` column (labels are drawn from ``[0, n_groups)``).

    Returns
    -------
    DataFrame
        Columns ``name`` (random group label), ``stamp`` (the ``n``
        millisecond-spaced timestamps starting at 2001-01-01, in shuffled
        order), ``value`` (random integers in ``[0, n)``) and ``value2``
        (samples from ``np.random.randn``).
    """
    stamps = pd.date_range('20010101', periods=n, freq='ms')
    # Shuffle a private copy rather than the index's own backing array:
    # mutating ``stamps.values`` in place corrupts the (supposedly immutable)
    # index, and raises on pandas versions that hand out a read-only view.
    shuffled = stamps.values.copy()
    random.shuffle(shuffled)
    return DataFrame({'name': np.random.randint(0, n_groups, size=n),
                      'stamp': shuffled,
                      'value': np.random.randint(0, n, size=n),
                      'value2': np.random.randn(n)})
# Benchmark input: 1000000 rows with group labels drawn from [0, 10000).
df = create_frame(n=1000000, n_groups=10000)
def f_apply(df):
    """Standardize ``value2`` within each ``name`` group via ``groupby.apply``.

    Each group's values have the group mean subtracted and are then divided
    by the group standard deviation (``Series.std`` with its defaults). The
    per-group function runs in Python once per group.
    """
    def _standardize(group):
        centered = group - group.mean()
        return centered / group.std()

    value2_by_name = df.groupby('name')['value2']
    return value2_by_name.apply(_standardize)
def f_unwrap(df):
    """Standardize ``value2`` within each ``name`` group using ``transform``.

    Computes the same quantity as a per-group ``(x - mean) / std``, but the
    group mean and standard deviation are first broadcast back to the
    original row order with ``transform`` and the arithmetic is then done
    once on whole columns.
    """
    values = df['value2']
    grouped = df.groupby('name')['value2']
    group_mean = grouped.transform('mean')
    group_std = grouped.transform('std')
    return (values - group_mean) / group_std
# Sanity check: both implementations must agree numerically before they are timed.
np.allclose(f_apply(df),f_unwrap(df))
True
# Recorded timings on the frame built with create_frame(1000000,10000).
%timeit f_apply(df)
1 loops, best of 3: 2.68 s per loop
%timeit f_unwrap(df)
1 loops, best of 3: 635 ms per loop
# Rebuild with the same row count but n_groups=100, then time both again.
df = create_frame(1000000,100)
%timeit f_apply(df)
1 loops, best of 3: 310 ms per loop
%timeit f_unwrap(df)
1 loops, best of 3: 267 ms per loop
# Smaller frame (n=100000, n_groups=1000) used for the result_apply / result_where comparison.
df = create_frame(100000,1000)
def result_apply(df):
    """Blank out small entries of ``value`` one element at a time.

    Returns the ``value`` column with every entry below 30000 replaced by
    NaN; the replacement is decided by a Python function called per element
    through ``Series.apply``.
    """
    def _nan_if_small(x):
        if x < 30000:
            return np.nan
        return x

    column = df['value']
    return column.apply(_nan_if_small)
def result_where(df):
    """Blank out small entries of ``value`` with a boolean mask.

    Returns the ``value`` column with entries kept where they are at least
    30000 and replaced by NaN elsewhere, using ``Series.where``.
    """
    column = df['value']
    keep = column >= 30000
    return column.where(keep)
# Sanity check: the per-element apply and the Series.where version must return equal Series.
result_apply(df).equals(result_where(df))
True
# Recorded timings: 43.7 ms per loop for result_apply vs 2.1 ms per loop for result_where.
%timeit result_apply(df)
10 loops, best of 3: 43.7 ms per loop
%timeit result_where(df)
100 loops, best of 3: 2.1 ms per loop