#!/usr/bin/env python
# coding: utf-8
"""Pandas performance idioms, exported from a notebook.

Two comparisons are benchmarked on randomly generated frames:

* "unwrapping groupby": per-group standardisation written with
  ``groupby.apply`` versus the same computation expressed with
  ``groupby.transform``.
* "if then": element-wise masking written with ``Series.apply`` versus
  ``Series.where``.

Importing this module only defines the functions; the benchmarks run when
the file is executed as a script.
"""

import timeit

import numpy as np
import pandas as pd


# # idioms

# group creator
def create_frame(n, n_groups):
    """Build a random frame with ``n`` rows for the benchmarks below.

    Parameters
    ----------
    n : int
        Number of rows.
    n_groups : int
        Exclusive upper bound of the integer labels drawn for ``name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` (random ints in ``[0, n_groups)``), ``stamp``
        (``n`` consecutive millisecond timestamps starting at 2001-01-01,
        in random order), ``value`` (random ints in ``[0, n)``) and
        ``value2`` (standard-normal floats).
    """
    stamps = pd.date_range('20010101', periods=n, freq='ms')
    # Reorder through a permutation rather than shuffling ``stamps.values``
    # in place: an Index is meant to be immutable and its backing array may
    # be read-only.
    stamps = stamps[np.random.permutation(n)]
    return pd.DataFrame({
        'name': np.random.randint(0, n_groups, size=n),
        'stamp': stamps,
        'value': np.random.randint(0, n, size=n),
        'value2': np.random.randn(n),
    })


# # unwrapping groupby

def f_apply(df):
    """Standardise ``value2`` within each ``name`` group via ``apply``.

    Returns a Series of ``(x - mean) / std`` per group, indexed like ``df``.
    """
    # group_keys=False keeps the result on the caller's index; without it,
    # newer pandas prepends the group key as an extra index level and
    # returns the rows group by group, which breaks positional comparison
    # with f_unwrap.
    grouped = df.groupby('name', group_keys=False).value2
    return grouped.apply(lambda x: (x - x.mean()) / x.std())


def f_unwrap(df):
    """Standardise ``value2`` within each ``name`` group via ``transform``.

    Computes the same quantity as :func:`f_apply`, but broadcasts the group
    mean and standard deviation back onto the rows with ``transform``
    instead of calling a Python lambda once per group.
    """
    g = df.groupby('name').value2
    v = df.value2
    return (v - g.transform('mean')) / g.transform('std')


# # if then

def result_apply(df, threshold=30000):
    """Replace entries of ``df.value`` below ``threshold`` with NaN.

    Element-wise version: a Python lambda is called for every value.
    """
    return df.value.apply(lambda x: np.nan if x < threshold else x)


def result_where(df, threshold=30000):
    """Replace entries of ``df.value`` below ``threshold`` with NaN.

    Vectorised version of :func:`result_apply` using ``Series.where``.
    """
    return df.value.where(df.value >= threshold)


def _bench(label, func, frame, repeat=3):
    """Print the best per-call wall time of ``func(frame)``."""
    timer = timeit.Timer(lambda: func(frame))
    # autorange picks a loop count large enough for a measurable total time.
    loops, _ = timer.autorange()
    best = min(timer.repeat(repeat=repeat, number=loops)) / loops
    print(f"{label}: {best * 1e3:.3g} ms per loop "
          f"(best of {repeat}, {loops} loops each)")


def main():
    """Run the notebook's checks and timings in their original order."""
    # unwrapping groupby: many small groups
    df = create_frame(1000000, 10000)
    print('f_apply matches f_unwrap:',
          np.allclose(f_apply(df), f_unwrap(df)))
    _bench('f_apply  (10000 groups)', f_apply, df)
    _bench('f_unwrap (10000 groups)', f_unwrap, df)

    # unwrapping groupby: few large groups
    df = create_frame(1000000, 100)
    _bench('f_apply  (100 groups)', f_apply, df)
    _bench('f_unwrap (100 groups)', f_unwrap, df)

    # if then
    df = create_frame(100000, 1000)
    print('result_apply equals result_where:',
          result_apply(df).equals(result_where(df)))
    _bench('result_apply', result_apply, df)
    _bench('result_where', result_where, df)


if __name__ == '__main__':
    main()