import pandas as pd
import numpy as np
None==None
True
np.nan == np.nan
False
s = pd.Series([np.nan,1,np.nan])
# Comparing against np.nan with == yields False for every element (NaN never
# equals anything, itself included); isna() is what flags the missing entries.
s == np.nan, s.isna()
(0 False 1 False 2 False dtype: bool, 0 True 1 False 2 True dtype: bool)
`np.nan` is a float, so it forces an integer array to be upcast to a float dtype
pd.Series([1,2, np.nan]).dtype, pd.Series([1,2]).dtype
(dtype('float64'), dtype('int64'))
Use a nullable integer dtype (e.g. `pd.Int32Dtype()`) to keep the integer type when values are missing
pd.Series([1,2,np.nan], dtype=pd.Int32Dtype()).dtype
Int32Dtype()
s = pd.Series([np.nan, 1,2, pd.NaT])
s
0 NaN 1 1 2 2 3 NaT dtype: object
s.fillna('missing')
0 missing 1 1 2 2 3 missing dtype: object
# Forward fill: 'pad' and 'ffill' are two spellings of the same method, so both
# results are identical. The leading NaN stays because nothing precedes it.
# NOTE(review): fillna(method=...) is believed to be deprecated in pandas 2.1+
# in favour of s.ffill() — confirm against the pandas version in use.
s.fillna(method='pad'), s.fillna(method='ffill')
(0 NaN 1 1.0 2 2.0 3 2.0 dtype: float64, 0 NaN 1 1.0 2 2.0 3 2.0 dtype: float64)
# Backward fill: 'bfill', 'backfill' and Series.bfill() give the same result.
# The trailing NaT stays because no later value exists to copy from.
# NOTE(review): fillna(method=...) is believed to be deprecated in pandas 2.1+
# in favour of s.bfill() — confirm against the pandas version in use.
s.fillna(method='bfill'), s.fillna(method='backfill'), s.bfill()
(0 1 1 1 2 2 3 NaT dtype: object, 0 1 1 1 2 2 3 NaT dtype: object, 0 1 1 1 2 2 3 NaT dtype: object)
Limit the number of filled values
# limit=1 replaces only the first missing entry; the later NaT is left as is.
s.fillna('missing', limit=1)
0 missing 1 1 2 2 3 NaT dtype: object
df = pd.DataFrame({'a': [1 ,2, 3], 'b': [np.nan, 2,3]})
df
a | b | |
---|---|---|
0 | 1 | NaN |
1 | 2 | 2.0 |
2 | 3 | 3.0 |
# axis=0 drops every row that contains a missing value (row 0 here).
df.dropna(axis=0)
a | b | |
---|---|---|
1 | 2 | 2.0 |
2 | 3 | 3.0 |
# axis=1 drops every column that contains a missing value (column 'b' here).
df.dropna(axis=1)
a | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
df.b.dropna()
1 2.0 2 3.0 Name: b, dtype: float64
ts = pd.Series([1,2,3,1,2,2, np.nan, np.nan, 4,3,2], index=pd.date_range('2020/1/1', '2020/1/11'))
ts.count()
9
ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f97c1c0da60>
# count() ignores NaN: 9 before interpolation, 11 after, because the default
# interpolation fills both missing points.
ts.interpolate().count()
11
ts.interpolate().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f97c1f09ca0>
# Fill the gaps by carrying the last valid observation forward, then plot.
# NOTE(review): method='pad' in interpolate() is believed to be deprecated in
# pandas 2.1+ (use ts.ffill() instead) — confirm against the version in use.
ts.interpolate(method='pad').plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f97c1f05100>
ts.interpolate(method='polynomial', order=2).plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f97c26aaa30>
s.replace(np.nan, 'missing')
0 missing 1 1 2 2 3 missing dtype: object
s.replace(1,'invalid')
0 NaN 1 invalid 2 2 3 NaT dtype: object
s.replace({1:-1, 2: 'over bountries'})
0 NaN 1 -1 2 over bountries 3 NaT dtype: object
`pd.NA` was introduced to consistently indicate missing data across all types
# Identity (`is`) on pd.NA is True, but equality returns <NA> rather than a
# boolean, whereas np.nan == np.nan is plain False.
pd.NA is pd.NA, pd.NA == pd.NA, np.nan == np.nan
(True, <NA>, False)
pd.NA * 1, pd.isna(pd.NA), pd.isnull(pd.NA)
(<NA>, True, True)