import pandas as pd
import numpy as np
from IPython.display import display
df = pd.DataFrame({'a':[1,2,3]})
df.values
array([[1], [2], [3]])
df.to_numpy()
array([[1], [2], [3]])
broadcasting
df = pd.DataFrame(np.random.rand(3,3))
df
0 | 1 | 2 | |
---|---|---|---|
0 | 0.631536 | 0.719294 | 0.356047 |
1 | 0.338154 | 0.220811 | 0.408885 |
2 | 0.824067 | 0.678876 | 0.352343 |
row = df.iloc[1]
col = df[1]
df.sub(row)
0 | 1 | 2 | |
---|---|---|---|
0 | 0.293382 | 0.498484 | -0.052838 |
1 | 0.000000 | 0.000000 | 0.000000 |
2 | 0.485913 | 0.458065 | -0.056542 |
row
0 0.338154 1 0.220811 2 0.408885 Name: 1, dtype: float64
df.sub(row, axis=0)
0 | 1 | 2 | |
---|---|---|---|
0 | 0.293382 | 0.381140 | 0.017893 |
1 | 0.117343 | 0.000000 | 0.188074 |
2 | 0.415183 | 0.269991 | -0.056542 |
df.sub(col,axis=0)
0 | 1 | 2 | |
---|---|---|---|
0 | -0.087759 | 0.0 | -0.363248 |
1 | 0.117343 | 0.0 | 0.188074 |
2 | 0.145191 | 0.0 | -0.326533 |
df.div(row)
0 | 1 | 2 | |
---|---|---|---|
0 | 1.867599 | 3.257516 | 0.870775 |
1 | 1.000000 | 1.000000 | 1.000000 |
2 | 2.436960 | 3.074471 | 0.861718 |
df.gt(.5).any()
0 True 1 True 2 False dtype: bool
df[df.gt(.5)] = np.nan
df
0 | 1 | 2 | |
---|---|---|---|
0 | NaN | NaN | 0.356047 |
1 | 0.338154 | 0.220811 | 0.408885 |
2 | NaN | NaN | 0.352343 |
df.combine_first(pd.DataFrame(np.random.rand(5,3), index=[0,1,2,4,5]).gt(.2))
0 | 1 | 2 | |
---|---|---|---|
0 | True | True | 0.356047 |
1 | 0.338154 | 0.220811 | 0.408885 |
2 | True | True | 0.352343 |
4 | True | False | False |
5 | False | True | True |
df.cumsum(axis=1)
0 | 1 | 2 | |
---|---|---|---|
0 | NaN | NaN | 0.356047 |
1 | 0.338154 | 0.558965 | 0.967849 |
2 | NaN | NaN | 0.352343 |
df.cummin()
0 | 1 | 2 | |
---|---|---|---|
0 | NaN | NaN | 0.356047 |
1 | 0.338154 | 0.220811 | 0.356047 |
2 | NaN | NaN | 0.352343 |
df = pd.DataFrame({
'a':np.linspace(1,10,4),
'b': ['Yes','No',np.nan, np.nan],
'c': np.linspace(11,20,4)
})
df
a | b | c | |
---|---|---|---|
0 | 1.0 | Yes | 11.0 |
1 | 4.0 | No | 14.0 |
2 | 7.0 | NaN | 17.0 |
3 | 10.0 | NaN | 20.0 |
df.describe(include='all')
a | b | c | |
---|---|---|---|
count | 4.000000 | 2 | 4.000000 |
unique | NaN | 2 | NaN |
top | NaN | No | NaN |
freq | NaN | 1 | NaN |
mean | 5.500000 | NaN | 15.500000 |
std | 3.872983 | NaN | 3.872983 |
min | 1.000000 | NaN | 11.000000 |
25% | 3.250000 | NaN | 13.250000 |
50% | 5.500000 | NaN | 15.500000 |
75% | 7.750000 | NaN | 17.750000 |
max | 10.000000 | NaN | 20.000000 |
df.drop('b', axis=1).idxmax()
a 3 c 3 dtype: int64
for each row get column with highest value:
df.drop('b', axis=1).idxmax(axis=1)
0 c 1 c 2 c 3 c dtype: object
for each column get index for highest value:
df.drop('b', axis=1).idxmax(axis='index')
a 3 c 3 dtype: int64
s = pd.Series(map(lambda _: np.random.randint(10),range(15)))
s.value_counts().sort_index()
0 1 1 2 2 1 3 2 4 1 5 1 6 2 7 1 8 2 9 2 dtype: int64
most frequent value:
s.mode()
0 1 1 3 2 6 3 8 4 9 dtype: int64
s_dicretized = pd.cut(s, 3)
s_dicretized
0 (5.333, 8.0] 1 (2.667, 5.333] 2 (5.333, 8.0] 3 (5.333, 8.0] 4 (-0.008, 2.667] 5 (-0.008, 2.667] 6 (2.667, 5.333] 7 (5.333, 8.0] 8 (5.333, 8.0] 9 (-0.008, 2.667] 10 (2.667, 5.333] 11 (-0.008, 2.667] 12 (5.333, 8.0] 13 (-0.008, 2.667] 14 (2.667, 5.333] dtype: category Categories (3, interval[float64]): [(-0.008, 2.667] < (2.667, 5.333] < (5.333, 8.0]]
s_dicretized.value_counts()
(5.333, 8.0] 6 (-0.008, 2.667] 5 (2.667, 5.333] 4 dtype: int64
s_dicretized.unique()
[(5.333, 8.0], (2.667, 5.333], (-0.008, 2.667]] Categories (3, interval[float64]): [(-0.008, 2.667] < (2.667, 5.333] < (5.333, 8.0]]
s_quantilized = pd.qcut(s, q=3)
display(s_quantilized)
0 (4.0, 6.333] 1 (4.0, 6.333] 2 (6.333, 8.0] 3 (6.333, 8.0] 4 (-0.001, 4.0] 5 (-0.001, 4.0] 6 (4.0, 6.333] 7 (6.333, 8.0] 8 (6.333, 8.0] 9 (-0.001, 4.0] 10 (4.0, 6.333] 11 (-0.001, 4.0] 12 (6.333, 8.0] 13 (-0.001, 4.0] 14 (4.0, 6.333] dtype: category Categories (3, interval[float64]): [(-0.001, 4.0] < (4.0, 6.333] < (6.333, 8.0]]
splitting into 2 quantiles (q=2) = splitting at the median
pd.qcut(pd.Series(np.linspace(1,100,100)), q=2).unique()
[(0.999, 50.5], (50.5, 100.0]] Categories (2, interval[float64]): [(0.999, 50.5] < (50.5, 100.0]]
pd.cut(pd.Series(np.linspace(1,100,100)), 2).unique()
[(0.901, 50.5], (50.5, 100.0]] Categories (2, interval[float64]): [(0.901, 50.5] < (50.5, 100.0]]
s = pd.Series([0,0,1,1,2,3,4,5,6,7])
pd.qcut(s, q=4)
0 (-0.001, 1.0] 1 (-0.001, 1.0] 2 (-0.001, 1.0] 3 (-0.001, 1.0] 4 (1.0, 2.5] 5 (2.5, 4.75] 6 (2.5, 4.75] 7 (4.75, 7.0] 8 (4.75, 7.0] 9 (4.75, 7.0] dtype: category Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.5] < (2.5, 4.75] < (4.75, 7.0]]
pd.cut(s,4)
0 (-0.007, 1.75] 1 (-0.007, 1.75] 2 (-0.007, 1.75] 3 (-0.007, 1.75] 4 (1.75, 3.5] 5 (1.75, 3.5] 6 (3.5, 5.25] 7 (3.5, 5.25] 8 (5.25, 7.0] 9 (5.25, 7.0] dtype: category Categories (4, interval[float64]): [(-0.007, 1.75] < (1.75, 3.5] < (3.5, 5.25] < (5.25, 7.0]]
def is_greater(greater, frame):
    """Return an element-wise boolean mask: True where `frame` exceeds `greater`."""
    mask = frame > greater
    return mask
df[['a', 'c']].pipe((is_greater, 'frame'), 4)
a | c | |
---|---|---|
0 | False | True |
1 | False | True |
2 | True | True |
3 | True | True |
df.apply('mean', axis='index'), df.apply('mean', axis=1)
(a 5.5 c 15.5 dtype: float64, 0 6.0 1 9.0 2 12.0 3 15.0 dtype: float64)
The transform() method returns an object that is indexed the same (same size) as the original
def do(a):
    """Print the received column/row, then return `len(a)` values evenly spaced over [-10, 10]."""
    n = len(a)
    print(a)
    return np.linspace(-10, 10, n)
df[['a','c']].transform(do, axis=0)
0 1.0 1 4.0 2 7.0 3 10.0 Name: a, dtype: float64 0 1.0 1 4.0 2 7.0 3 10.0 Name: a, dtype: float64 0 11.0 1 14.0 2 17.0 3 20.0 Name: c, dtype: float64
a | c | |
---|---|---|
0 | -10.000000 | -10.000000 |
1 | -3.333333 | -3.333333 |
2 | 3.333333 | 3.333333 |
3 | 10.000000 | 10.000000 |
df[['a','c']].transform([np.abs, np.log], axis='index')
a | c | |||
---|---|---|---|---|
absolute | log | absolute | log | |
0 | 1.0 | 0.000000 | 11.0 | 2.397895 |
1 | 4.0 | 1.386294 | 14.0 | 2.639057 |
2 | 7.0 | 1.945910 | 17.0 | 2.833213 |
3 | 10.0 | 2.302585 | 20.0 | 2.995732 |
df.applymap(lambda x: len(str(x)))
a | b | c | |
---|---|---|---|
0 | 3 | 3 | 4 |
1 | 3 | 2 | 4 |
2 | 3 | 3 | 4 |
3 | 4 | 3 | 4 |
df['b'].map({'Yes': 'No', 'No': 'No'})
0 No 1 No 2 NaN 3 NaN Name: b, dtype: object
note: need to provide 1:1 mapping such as 'No': 'No'
s = pd.Series([1,2,3,4,5], index=list('abcde'))
s
a 1 b 2 c 3 d 4 e 5 dtype: int64
s.reindex(list('eedcbaf'))
e 5.0 e 5.0 d 4.0 c 3.0 b 2.0 a 1.0 f NaN dtype: float64
note: a duplicated index label shows its value twice. A non-existing index label becomes NaN
s = pd.Series([1,2,2,3,4,5], index=list('bcdeaa'))
s
b 1 c 2 d 2 e 3 a 4 a 5 dtype: int64
s.reindex()
b 1 c 2 d 2 e 3 a 4 a 5 dtype: int64
df = pd.DataFrame(np.linspace(1,9,9).reshape(3,3), columns=list('cba'))
df
c | b | a | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
1 | 4.0 | 5.0 | 6.0 |
2 | 7.0 | 8.0 | 9.0 |
df.reindex(sorted(df.columns.values), axis=1)
a | b | c | |
---|---|---|---|
0 | 3.0 | 2.0 | 1.0 |
1 | 6.0 | 5.0 | 4.0 |
2 | 9.0 | 8.0 | 7.0 |
Become a reindex ninja if big or heavy tasks are on the way: many operations are faster on pre-aligned data
df_2 = pd.DataFrame(np.linspace(101,109,9).reshape(3,3), columns=['a', 'c', 'h'], index=[3,2,1])
df_2
a | c | h | |
---|---|---|---|
3 | 101.0 | 102.0 | 103.0 |
2 | 104.0 | 105.0 | 106.0 |
1 | 107.0 | 108.0 | 109.0 |
df_2.reindex_like(df)
c | b | a | |
---|---|---|---|
0 | NaN | NaN | NaN |
1 | 108.0 | NaN | 107.0 |
2 | 105.0 | NaN | 104.0 |
df_aligned, df_2_aligned = df.align(df_2, join='outer') # outer is default
display(df_aligned)
display(df_2_aligned)
a | b | c | h | |
---|---|---|---|---|
0 | 3.0 | 2.0 | 1.0 | NaN |
1 | 6.0 | 5.0 | 4.0 | NaN |
2 | 9.0 | 8.0 | 7.0 | NaN |
3 | NaN | NaN | NaN | NaN |
a | b | c | h | |
---|---|---|---|---|
0 | NaN | NaN | NaN | NaN |
1 | 107.0 | NaN | 108.0 | 109.0 |
2 | 104.0 | NaN | 105.0 | 106.0 |
3 | 101.0 | NaN | 102.0 | 103.0 |
df_aligned, df_2_aligned = df.align(df_2, join='left')
display(df_aligned)
display(df_2_aligned)
c | b | a | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
1 | 4.0 | 5.0 | 6.0 |
2 | 7.0 | 8.0 | 9.0 |
c | b | a | |
---|---|---|---|
0 | NaN | NaN | NaN |
1 | 108.0 | NaN | 107.0 |
2 | 105.0 | NaN | 104.0 |
df_aligned, df_2_aligned = df.align(df_2, join='right')
display(df_aligned)
display(df_2_aligned)
a | c | h | |
---|---|---|---|
3 | NaN | NaN | NaN |
2 | 9.0 | 7.0 | NaN |
1 | 6.0 | 4.0 | NaN |
a | c | h | |
---|---|---|---|
3 | 101.0 | 102.0 | 103.0 |
2 | 104.0 | 105.0 | 106.0 |
1 | 107.0 | 108.0 | 109.0 |
df_aligned, df_2_aligned = df.align(df_2, join='inner')
display(df_aligned)
display(df_2_aligned)
c | a | |
---|---|---|
1 | 4.0 | 6.0 |
2 | 7.0 | 9.0 |
c | a | |
---|---|---|
1 | 108.0 | 107.0 |
2 | 105.0 | 104.0 |
df_aligned, df_2_aligned = df.align(df_2, join='outer', axis=0)
display(df_aligned)
display(df_2_aligned)
c | b | a | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
1 | 4.0 | 5.0 | 6.0 |
2 | 7.0 | 8.0 | 9.0 |
3 | NaN | NaN | NaN |
a | c | h | |
---|---|---|---|
0 | NaN | NaN | NaN |
1 | 107.0 | 108.0 | 109.0 |
2 | 104.0 | 105.0 | 106.0 |
3 | 101.0 | 102.0 | 103.0 |
Filling while reindexing
df_2.reindex([0, 1,1.5, 2, 2.5,3, 4], method='ffill')
a | c | h | |
---|---|---|---|
0.0 | 107.0 | 108.0 | 109.0 |
1.0 | 107.0 | 108.0 | 109.0 |
1.5 | 104.0 | 105.0 | 106.0 |
2.0 | 104.0 | 105.0 | 106.0 |
2.5 | 101.0 | 102.0 | 103.0 |
3.0 | 101.0 | 102.0 | 103.0 |
4.0 | NaN | NaN | NaN |
df_2
a | c | h | |
---|---|---|---|
3 | 101.0 | 102.0 | 103.0 |
2 | 104.0 | 105.0 | 106.0 |
1 | 107.0 | 108.0 | 109.0 |
df = pd.DataFrame(np.linspace(1,9,9).reshape(3,3),
columns=list('abc'),
index=[3,2,1])
df
a | b | c | |
---|---|---|---|
3 | 1.0 | 2.0 | 3.0 |
2 | 4.0 | 5.0 | 6.0 |
1 | 7.0 | 8.0 | 9.0 |
df.reindex([0,1,2,3,4], method='ffill')
a | b | c | |
---|---|---|---|
0 | 7.0 | 8.0 | 9.0 |
1 | 7.0 | 8.0 | 9.0 |
2 | 4.0 | 5.0 | 6.0 |
3 | 1.0 | 2.0 | 3.0 |
4 | NaN | NaN | NaN |
df.sort_index().reindex([0,1,2,3,4], method='ffill')
a | b | c | |
---|---|---|---|
0 | NaN | NaN | NaN |
1 | 7.0 | 8.0 | 9.0 |
2 | 4.0 | 5.0 | 6.0 |
3 | 1.0 | 2.0 | 3.0 |
4 | 1.0 | 2.0 | 3.0 |
closely related to reindex is drop
df.reindex(df.index.difference([-1,0,1]))
a | b | c | |
---|---|---|---|
2 | 4.0 | 5.0 | 6.0 |
3 | 1.0 | 2.0 | 3.0 |
df.drop([1], axis=0)
a | b | c | |
---|---|---|---|
3 | 1.0 | 2.0 | 3.0 |
2 | 4.0 | 5.0 | 6.0 |
renaming
df.rename(str.upper, axis=1)
A | B | C | |
---|---|---|---|
3 | 1.0 | 2.0 | 3.0 |
2 | 4.0 | 5.0 | 6.0 |
1 | 7.0 | 8.0 | 9.0 |
df.rename({'a':'A',}, axis=1)
A | b | c | |
---|---|---|---|
3 | 1.0 | 2.0 | 3.0 |
2 | 4.0 | 5.0 | 6.0 |
1 | 7.0 | 8.0 | 9.0 |
for col_name, series_values in df.items(): # dict like
print(f'col name: {col_name}, values')
display(series_values)
col name: a, values
3 1.0 2 4.0 1 7.0 Name: a, dtype: float64
col name: b, values
3 2.0 2 5.0 1 8.0 Name: b, dtype: float64
col name: c, values
3 3.0 2 6.0 1 9.0 Name: c, dtype: float64
for i,v in df.iteritems():
print(i,v)
a 3 1.0 2 4.0 1 7.0 Name: a, dtype: float64 b 3 2.0 2 5.0 1 8.0 Name: b, dtype: float64 c 3 3.0 2 6.0 1 9.0 Name: c, dtype: float64
for index, row in df.iterrows():
print(f'index: {index}, row:')
display(row)
index: 3, row:
a 1.0 b 2.0 c 3.0 Name: 3, dtype: float64
index: 2, row:
a 4.0 b 5.0 c 6.0 Name: 2, dtype: float64
index: 1, row:
a 7.0 b 8.0 c 9.0 Name: 1, dtype: float64
for row in df.itertuples(): # faster than iterrows
print(row)
Pandas(Index=3, a=1.0, b=2.0, c=3.0) Pandas(Index=2, a=4.0, b=5.0, c=6.0) Pandas(Index=1, a=7.0, b=8.0, c=9.0)
s = pd.Series(pd.date_range('2020/01/01', periods=5, freq='0.5D'))
s
0 2020-01-01 00:00:00 1 2020-01-01 12:00:00 2 2020-01-02 00:00:00 3 2020-01-02 12:00:00 4 2020-01-03 00:00:00 dtype: datetime64[ns]
s.dt.day_name()
0 Wednesday 1 Wednesday 2 Thursday 3 Thursday 4 Friday dtype: object
s.dt.day==2
0 False 1 False 2 True 3 True 4 False dtype: bool
# set tz to US/Eastern, then say what time it was in UTC
display(s.dt.tz_localize('US/Eastern'))
display(s.dt.tz_localize('US/Eastern').dt.tz_convert('UTC'))
0 2020-01-01 00:00:00-05:00 1 2020-01-01 12:00:00-05:00 2 2020-01-02 00:00:00-05:00 3 2020-01-02 12:00:00-05:00 4 2020-01-03 00:00:00-05:00 dtype: datetime64[ns, US/Eastern]
0 2020-01-01 05:00:00+00:00 1 2020-01-01 17:00:00+00:00 2 2020-01-02 05:00:00+00:00 3 2020-01-02 17:00:00+00:00 4 2020-01-03 05:00:00+00:00 dtype: datetime64[ns, UTC]
# set tz to UTC, then say what time it is in US/Eastern
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
0 2019-12-31 19:00:00-05:00 1 2020-01-01 07:00:00-05:00 2 2020-01-01 19:00:00-05:00 3 2020-01-02 07:00:00-05:00 4 2020-01-02 19:00:00-05:00 dtype: datetime64[ns, US/Eastern]
df
a | b | c | |
---|---|---|---|
3 | 1.0 | 2.0 | 3.0 |
2 | 4.0 | 5.0 | 6.0 |
1 | 7.0 | 8.0 | 9.0 |
df.sort_index(axis=1, ascending=False).sort_index()
c | b | a | |
---|---|---|---|
1 | 9.0 | 8.0 | 7.0 |
2 | 6.0 | 5.0 | 4.0 |
3 | 3.0 | 2.0 | 1.0 |
df[df.mod(2).eq(0)].sort_values('b', na_position='first')
c | b | a | |
---|---|---|---|
1 | 4.0 | NaN | 6.0 |
0 | NaN | 2.0 | NaN |
2 | NaN | 8.0 | NaN |
df.sort_values(by=['a', 'b'], ascending=[True,False])
a | b | c | |
---|---|---|---|
3 | 1.0 | 2.0 | 3.0 |
2 | 4.0 | 5.0 | 6.0 |
1 | 7.0 | 8.0 | 9.0 |
df.nsmallest(2, columns=['a','b'])
a | b | c | |
---|---|---|---|
3 | 1.0 | 2.0 | 3.0 |
2 | 4.0 | 5.0 | 6.0 |
search sorted:
display(df.a.sort_values())
# find the indices at which to insert values so that sorted order is maintained
values = [0,4]
df.a.searchsorted(values)
3 1.0 2 4.0 1 7.0 Name: a, dtype: float64
array([0, 1])
df = pd.DataFrame({
'a': [0,1,2,3],
'b': [1,2,3,4]
})
df.loc[lambda df: df.a.mod(2).eq(0), ['b']] = 10
df
a | b | |
---|---|---|
0 | 0 | 10 |
1 | 1 | 2 |
2 | 2 | 10 |
3 | 3 | 4 |