Imports

In [1]:

import pandas as pd
import os
import numpy as np

# Record the library versions this notebook was run with, so the outputs
# below can be reproduced. The call form of print works on Python 2 and 3.
print(np.__version__)
print(pd.__version__)

1.14.2
0.20.3

In [2]:

# Load the credit dataset into a pandas DataFrame.
# The file has no header row (header=None), so pandas assigns integer column
# labels. Missing values in this dataset are written as '?', so we tell
# pandas up front to parse '?' as NaN.
DATA_DIR = '../data'

credit_csv_path = os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv'))

# read_csv is the idiomatic reader for comma-separated files (it is what
# read_table with sep=',' amounts to).
df = pd.read_csv(credit_csv_path, header=None, na_values='?')
df.head(5)

Out[2]:

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0	b	30.83	0.000	u	g	w	v	1.25	t	t	1	f	g	202.0	0	+
1	a	58.67	4.460	u	g	q	h	3.04	t	t	6	f	g	43.0	560	+
2	a	24.50	0.500	u	g	q	h	1.50	t	f	0	f	g	280.0	824	+
3	b	27.83	1.540	u	g	w	v	3.75	t	t	5	t	g	100.0	3	+
4	b	20.17	5.625	u	g	w	v	1.71	t	f	0	f	s	120.0	0	+

In [3]:

# Count the missing (NaN) entries in each column of df: build the boolean
# null mask, then sum it down the rows.
df.isnull().sum(axis=0)

Out[3]:

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [4]:

# Look at each column's data type to decide which kind of imputation suits
# it (e.g. mean for float columns, mode or a new category for object columns).
df.dtypes

Out[4]:

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
15     object
dtype: object

Deleting Rows that have missing values

In [5]:

# Simplest strategy: discard every row that contains at least one NaN.
# dropna returns a new frame, so df itself keeps its missing values for the
# imputation examples that follow.
df_ = df.dropna(axis=0, how='any')
df_.isnull().sum(axis=0)

Out[5]:

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

Imputing with Mean value - (For Continuous Data)

In [6]:

# Shown for column 1 only; the same pattern applies to any numeric column.
# Assign the filled Series back to the frame instead of calling
# fillna(inplace=True) on df[1]: the in-place form acts on an intermediate
# selection, triggers chained-assignment warnings on recent pandas, and no
# longer updates df once Copy-on-Write is enabled.
df[1] = df[1].fillna(df[1].mean())
df.isnull().sum()

Out[6]:

0     12
1      0
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

Imputing with Mode value - (For Categorical Data)

In [7]:

# Replace missing values in column 0 with its most frequent value.
# mode() returns a Series (there can be ties), so take its first entry.
# Assign the result back rather than using fillna(inplace=True) on df[0],
# which acts on an intermediate selection and does not update df under
# pandas' Copy-on-Write behaviour.
df[0] = df[0].fillna(df[0].mode()[0])
df.isnull().sum()

Out[7]:

0      0
1      0
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

Imputing by adding one more category - (For Categorical Data)

In [8]:

# List the distinct values that occur in the column labelled 3
# (NaN shows up as one of them).
pd.unique(df[3])

Out[8]:

array(['u', 'y', nan, 'l'], dtype=object)

In [9]:

# Treat "missing" as a category of its own, so a model can learn how the
# absence of a value relates to the other features when making predictions.
# The cost is one extra column under one-hot encoding, or one extra integer
# code under label encoding.
# Assign the result back rather than using fillna(inplace=True) on df[3],
# which acts on an intermediate selection and does not update df under
# pandas' Copy-on-Write behaviour.
df[3] = df[3].fillna('UNK')
df.isnull().sum()

Out[9]:

0      0
1      0
2      0
3      0
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [10]:

# Show the rows whose column 3 now carries the placeholder category.
df.loc[df[3] == 'UNK']

Out[10]:

	0	1	3	4	5	6	8	9	11	12	13	15
206	a	71.58	UNK	NaN	NaN	NaN	f	f	f	p	NaN	+
270	b	37.58	UNK	NaN	NaN	NaN	f	f	f	p	NaN	+
330	b	20.42	UNK	NaN	NaN	NaN	f	f	f	p	NaN	-
456	b	34.58	UNK	NaN	NaN	NaN	f	f	f	p	NaN	-
592	b	23.17	UNK	NaN	NaN	NaN	f	f	f	p	NaN	+
622	a	25.58	UNK	NaN	NaN	NaN	f	f	f	p	NaN	+

Imputing by back filling

In [11]:

# Back-fill: propagate the next valid value backward into each gap.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas. The result is assigned back because an
# in-place call on df[13] acts on an intermediate selection and does not
# update df under pandas' Copy-on-Write behaviour.
df[13] = df[13].bfill()
df.isnull().sum()

Out[11]:

0     0
1     0
2     0
3     0
4     6
5     9
6     9
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

Imputing by forward filling

In [12]:

# Forward-fill: propagate the previous valid value forward into each gap.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas. The result is assigned back because an
# in-place call on df[4] acts on an intermediate selection and does not
# update df under pandas' Copy-on-Write behaviour.
df[4] = df[4].ffill()
df.isnull().sum()

Out[12]:

0     0
1     0
2     0
3     0
4     0
5     9
6     9
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64