import pandas as pd
import numpy as np
raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
'age': [42, np.nan, 36, 24, 73],
'sex': ['m', np.nan, 'f', 'm', 'f'],
'preTestScore': [4, np.nan, np.nan, 2, 3],
'postTestScore': [25, np.nan, np.nan, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])
df
first_name | last_name | age | sex | preTestScore | postTestScore | |
---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 |
1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Tina | Ali | 36 | f | NaN | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 |
4 | Amy | Cooze | 73 | f | 3 | 70 |
df_no_missing = df.dropna()
df_no_missing
first_name | last_name | age | sex | preTestScore | postTestScore | |
---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 |
3 | Jake | Milner | 24 | m | 2 | 62 |
4 | Amy | Cooze | 73 | f | 3 | 70 |
df_cleaned = df.dropna(how='all')
df_cleaned
first_name | last_name | age | sex | preTestScore | postTestScore | |
---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 |
2 | Tina | Ali | 36 | f | NaN | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 |
4 | Amy | Cooze | 73 | f | 3 | 70 |
df['location'] = [np.nan, np.nan, np.nan, np.nan, np.nan]
df
first_name | last_name | age | sex | preTestScore | postTestScore | location | |
---|---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 | NaN |
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Tina | Ali | 36 | f | NaN | NaN | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 | NaN |
4 | Amy | Cooze | 73 | f | 3 | 70 | NaN |
df.dropna(axis=1, how='all')
first_name | last_name | age | sex | preTestScore | postTestScore | |
---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 |
1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Tina | Ali | 36 | f | NaN | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 |
4 | Amy | Cooze | 73 | f | 3 | 70 |
This is really mostly useful for time series
df.dropna(thresh=5)
first_name | last_name | age | sex | preTestScore | postTestScore | location | |
---|---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 | NaN |
4 | Amy | Cooze | 73 | f | 3 | 70 | NaN |
df.fillna(0)
first_name | last_name | age | sex | preTestScore | postTestScore | location | |
---|---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | Tina | Ali | 36 | f | 0 | 0 | 0 |
3 | Jake | Milner | 24 | m | 2 | 62 | 0 |
4 | Amy | Cooze | 73 | f | 3 | 70 | 0 |
inplace=True means that the changes are saved to the df right away
df["preTestScore"].fillna(df["preTestScore"].mean(), inplace=True)
df
first_name | last_name | age | sex | preTestScore | postTestScore | location | |
---|---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 | NaN |
1 | NaN | NaN | NaN | NaN | 3 | NaN | NaN |
2 | Tina | Ali | 36 | f | 3 | NaN | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 | NaN |
4 | Amy | Cooze | 73 | f | 3 | 70 | NaN |
df["postTestScore"].fillna(df.groupby("sex")["postTestScore"].transform("mean"), inplace=True)
df
first_name | last_name | age | sex | preTestScore | postTestScore | location | |
---|---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 | NaN |
1 | NaN | NaN | NaN | NaN | 3 | NaN | NaN |
2 | Tina | Ali | 36 | f | 3 | 70 | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 | NaN |
4 | Amy | Cooze | 73 | f | 3 | 70 | NaN |
# Select the rows of df where age is not NaN and sex is not NaN
df[df['age'].notnull() & df['sex'].notnull()]
first_name | last_name | age | sex | preTestScore | postTestScore | location | |
---|---|---|---|---|---|---|---|
0 | Jason | Miller | 42 | m | 4 | 25 | NaN |
2 | Tina | Ali | 36 | f | 3 | 70 | NaN |
3 | Jake | Milner | 24 | m | 2 | 62 | NaN |
4 | Amy | Cooze | 73 | f | 3 | 70 | NaN |