idioms

In [2]:
# group creator
def create_frame(n, n_groups):
    """Build a random benchmark frame with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    n_groups : int
        Exclusive upper bound for the integer group ids in ``name``.

    Returns
    -------
    DataFrame
        Columns:
        ``name``   - random integer group id in ``[0, n_groups)``;
        ``stamp``  - the ``n`` timestamps starting at 2001-01-01 (freq
        ``'ms'``), in random order;
        ``value``  - random integers in ``[0, n)``;
        ``value2`` - samples drawn with ``np.random.randn``.
    """
    stamps = pd.date_range('20010101', periods=n, freq='ms')
    # Shuffle a *copy* of the timestamps. An Index is meant to be immutable,
    # so scrambling ``stamps.values`` in place would leave its ``freq``
    # metadata out of step with the data; ``np.random.permutation`` is also
    # vectorized, unlike the per-element swaps of ``random.shuffle``.
    shuffled_stamps = np.random.permutation(stamps.values)
    return pd.DataFrame({'name'  : np.random.randint(0, n_groups, size=n),
                         'stamp' : shuffled_stamps,
                         'value' : np.random.randint(0, n, size=n),
                         'value2' : np.random.randn(n)})

Unwrapping groupby: replacing a per-group apply with vectorized transform calls

In [23]:
# 1,000,000 rows with group ids drawn from 10,000 possible values (many small groups).
df = create_frame(1000000,10000)

def f_apply(df):
    """Standardize ``value2`` within each ``name`` group via ``apply``.

    This is the slow baseline: the lambda runs once per group in Python.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``name`` and ``value2``.

    Returns
    -------
    Series
        ``(value2 - group mean) / group std``, indexed like ``df``.
    """
    # group_keys=False keeps the result on df's original index. Without it,
    # newer pandas releases (2.0+) prepend the group key as an index level and
    # order the rows by group, so a positional comparison with the vectorized
    # version would no longer line up.
    grouped = df.groupby('name', group_keys=False)['value2']
    return grouped.apply(lambda x: (x - x.mean()) / x.std())
def f_unwrap(df):
    """Standardize ``value2`` within each ``name`` group without ``apply``.

    The per-group mean and std are broadcast back to the original rows with
    ``transform``, and the arithmetic is then done once on whole columns.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``name`` and ``value2``.

    Returns
    -------
    Series
        ``(value2 - group mean) / group std``, indexed like ``df``.
    """
    per_group = df.groupby('name')['value2']
    group_mean = per_group.transform('mean')
    group_std = per_group.transform('std')
    centered = df['value2'] - group_mean
    return centered / group_std
In [24]:
# Check that both implementations agree numerically before comparing their timings.
np.allclose(f_apply(df),f_unwrap(df))
Out[24]:
True
In [25]:
%timeit f_apply(df)
1 loops, best of 3: 2.68 s per loop
In [26]:
%timeit f_unwrap(df)
1 loops, best of 3: 635 ms per loop
In [27]:
# Same 1,000,000 rows, but group ids drawn from only 100 possible values (few large groups).
df = create_frame(1000000,100)
In [28]:
%timeit f_apply(df)
1 loops, best of 3: 310 ms per loop
In [29]:
%timeit f_unwrap(df)
1 loops, best of 3: 267 ms per loop

If-then: conditional replacement with Series.where instead of an element-wise apply

In [30]:
# 100,000 rows; the 'value' column holds random integers in [0, 100000).
df = create_frame(100000,1000)
In [44]:
def result_apply(df, threshold=30000):
    """Blank out small entries of ``df.value`` with an element-wise ``apply``.

    This is the slow baseline: the lambda is called once per element.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``value`` column.
    threshold : number, default 30000
        Entries strictly below this become NaN; entries at or above it are
        kept unchanged.

    Returns
    -------
    Series
        ``df.value`` with every entry below ``threshold`` replaced by NaN.
    """
    return df['value'].apply(lambda x: np.nan if x < threshold else x)
def result_where(df, threshold=30000):
    """Blank out small entries of ``df.value`` with a vectorized ``where``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``value`` column.
    threshold : number, default 30000
        Entries at or above this are kept; all other entries become NaN.

    Returns
    -------
    Series
        ``df.value`` with every entry not satisfying ``value >= threshold``
        replaced by NaN.
    """
    values = df['value']
    # ``where`` keeps entries whose condition is True and fills the rest with NaN.
    return values.where(values >= threshold)
In [45]:
# Series.equals treats NaNs in matching positions as equal, so it can compare
# the two NaN-filled results directly.
result_apply(df).equals(result_where(df))
Out[45]:
True
In [46]:
%timeit result_apply(df)
10 loops, best of 3: 43.7 ms per loop
In [47]:
%timeit result_where(df)
100 loops, best of 3: 2.1 ms per loop
In [ ]: