import pandas as pd
import os
import numpy as np
# Show the library versions this notebook was run with.
# The parenthesised form is valid on both Python 2 and Python 3.
print(np.__version__)
print(pd.__version__)
1.14.2 0.20.3
# Read the credit data into a pandas DataFrame.
# Missing values in this dataset are denoted by '?', so we tell pandas up
# front to treat '?' as NaN.
DATA_DIR = '../data'
# The file is comma-separated (read_csv's default separator) and is read with
# header=None, so the columns get integer labels instead of names.
df = pd.read_csv(
    os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),
    header=None,
    na_values='?',
)
df.head(5)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | b | 30.83 | 0.000 | u | g | w | v | 1.25 | t | t | 1 | f | g | 202.0 | 0 | + |
1 | a | 58.67 | 4.460 | u | g | q | h | 3.04 | t | t | 6 | f | g | 43.0 | 560 | + |
2 | a | 24.50 | 0.500 | u | g | q | h | 1.50 | t | f | 0 | f | g | 280.0 | 824 | + |
3 | b | 27.83 | 1.540 | u | g | w | v | 3.75 | t | t | 5 | t | g | 100.0 | 3 | + |
4 | b | 20.17 | 5.625 | u | g | w | v | 1.71 | t | f | 0 | f | s | 120.0 | 0 | + |
# Count the missing (NaN) values in each column of df
df.isnull().sum()
0 12 1 12 2 0 3 6 4 6 5 9 6 9 7 0 8 0 9 0 10 0 11 0 12 0 13 13 14 0 15 0 dtype: int64
# Look at the dtype of each column to get an idea of which imputation
# strategy is appropriate for it
df.dtypes
0 object 1 float64 2 float64 3 object 4 object 5 object 6 object 7 float64 8 object 9 object 10 int64 11 object 12 object 13 float64 14 int64 15 object dtype: object
# Simplest strategy: discard every row that contains at least one NaN.
# dropna() returns a new frame by default, so df itself is left untouched.
df_ = df.dropna()
df_.isnull().sum()
0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 10 0 11 0 12 0 13 0 14 0 15 0 dtype: int64
# Mean imputation, shown here for column 1 only; the same pattern can be
# applied to any number of numeric columns.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on the df[1] selection mutates an intermediate object,
# which is not guaranteed to update df (and never does under Copy-on-Write).
df[1] = df[1].fillna(df[1].mean())
df.isnull().sum()
0 12 1 0 2 0 3 6 4 6 5 9 6 9 7 0 8 0 9 0 10 0 11 0 12 0 13 13 14 0 15 0 dtype: int64
# Mode imputation for column 0: replace NaN with the most frequent value.
# mode() returns a Series (there can be ties), so [0] takes its first entry.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on the df[0] selection mutates an intermediate object,
# which is not guaranteed to update df (and never does under Copy-on-Write).
df[0] = df[0].fillna(df[0].mode()[0])
df.isnull().sum()
0 0 1 0 2 0 3 6 4 6 5 9 6 9 7 0 8 0 9 0 10 0 11 0 12 0 13 13 14 0 15 0 dtype: int64
# Distinct values that occur in the column labelled 3 (NaN is reported as one of them)
df[3].unique()
array(['u', 'y', nan, 'l'], dtype=object)
# Treat "missing" as a category of its own, so the model can learn how the
# absence of a value relates to the other features when making predictions.
# This adds one more column under one-hot encoding, or one more integer code
# under label encoding.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on the df[3] selection mutates an intermediate object,
# which is not guaranteed to update df (and never does under Copy-on-Write).
df[3] = df[3].fillna('UNK')
df.isnull().sum()
0 0 1 0 2 0 3 0 4 6 5 9 6 9 7 0 8 0 9 0 10 0 11 0 12 0 13 13 14 0 15 0 dtype: int64
# Show the rows whose column 3 holds the 'UNK' placeholder
df.loc[df[3] == 'UNK']
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
206 | a | 71.58 | 0.0 | UNK | NaN | NaN | NaN | 0.0 | f | f | 0 | f | p | NaN | 0 | + |
270 | b | 37.58 | 0.0 | UNK | NaN | NaN | NaN | 0.0 | f | f | 0 | f | p | NaN | 0 | + |
330 | b | 20.42 | 0.0 | UNK | NaN | NaN | NaN | 0.0 | f | f | 0 | f | p | NaN | 0 | - |
456 | b | 34.58 | 0.0 | UNK | NaN | NaN | NaN | 0.0 | f | f | 0 | f | p | NaN | 0 | - |
592 | b | 23.17 | 0.0 | UNK | NaN | NaN | NaN | 0.0 | f | f | 0 | f | p | NaN | 0 | + |
622 | a | 25.58 | 0.0 | UNK | NaN | NaN | NaN | 0.0 | f | f | 0 | f | p | NaN | 0 | + |
# Back-fill: propagate the next valid value backward into each gap.
# Series.bfill() is used instead of fillna(method='bfill'), whose `method`
# argument is deprecated in newer pandas releases. The result is assigned
# back because an in-place fill on the df[13] selection is not guaranteed to
# update df.
df[13] = df[13].bfill()
df.isnull().sum()
0 0 1 0 2 0 3 0 4 6 5 9 6 9 7 0 8 0 9 0 10 0 11 0 12 0 13 0 14 0 15 0 dtype: int64
# Forward-fill: propagate the previous valid value forward into each gap.
# Series.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated in newer pandas releases. The result is assigned
# back because an in-place fill on the df[4] selection is not guaranteed to
# update df.
df[4] = df[4].ffill()
df.isnull().sum()
0 0 1 0 2 0 3 0 4 0 5 9 6 9 7 0 8 0 9 0 10 0 11 0 12 0 13 0 14 0 15 0 dtype: int64