In [10]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import scipy.stats

datapath = 'Data/'
In [11]:
# Import data
train_df = pd.read_csv(datapath + 'train_u6lujuX_CVtuZ9i.csv')
test_df = pd.read_csv(datapath + 'test_Y3wMUE5_7gLdaTN.csv')
In [12]:
train_df.head()
Out[12]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y
4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y
In [13]:
# Target class proportions
train_df.Loan_Status.value_counts() / len(train_df)
Out[13]:
Y    0.687296
N    0.312704
Name: Loan_Status, dtype: float64
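About 69% of applications are approved, so a majority-class baseline already scores ≈ 0.687 accuracy; any model needs to beat that.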
In [14]:
# Percentage of null values per column
train_df.isnull().sum() * 100 / len(train_df)
Out[14]:
Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64
In [15]:
train_df['Credit_History'].nunique()
Out[15]:
2
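Despite its float dtype, Credit_History takes only two values (0 and 1), so it is effectively a binary flag.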
In [16]:
# Add an EMI feature. Formula here: https://javatutoring.com/wp-content/uploads/2016/12/emi-calculation-formula.jpg
# EMI = P*r*(1+r)^n / ((1+r)^n - 1), with principal P, monthly interest rate r and n monthly installments.
# For lack of time, the interest rate was simply averaged per gender; a better approach is worth investigating later.

dataset = [train_df, test_df]
for df in dataset:
    r = np.where(df.Gender == "Male", 8.65, 8.6) / (12 * 100)  # monthly interest rate
    P = df.LoanAmount * 1000                                   # principal (LoanAmount is in thousands)
    n = df.Loan_Amount_Term                                    # number of monthly installments
    df["EMI"] = P * r * (1 + r) ** n / ((1 + r) ** n - 1)
In [17]:
# Add a combined-income feature and an income-to-loan ratio per record

for df in dataset:
    df["income"] = df["ApplicantIncome"] + df["CoapplicantIncome"]
    # df["app_income/loan"] = df["ApplicantIncome"] / df["LoanAmount"]  # alternative ratio, not used
    df["income_loan_ratio"] = df["income"] / df["LoanAmount"]
    df.drop(["CoapplicantIncome"], axis=1, inplace=True)
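Spot-checking the new columns on the same row (income = 4583 + 1508 = 6091, LoanAmount = 128):

print(train_df.loc[1, ["income", "income_loan_ratio"]])  # 6091.0 and 6091/128 ≈ 47.59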
In [18]:
train_df.shape,test_df.shape
Out[18]:
((614, 15), (367, 14))
In [19]:
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 15 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
EMI                  578 non-null float64
income               614 non-null float64
income_loan_ratio    592 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 72.0+ KB
In [20]:
# Nothing fancy. Simple imputation by dtype: mode for categoricals, mean for numerics.
# Test-set gaps are filled with training-set statistics for consistency.

for i in train_df.columns[train_df.dtypes == "object"]:
    train_df[i].fillna(train_df[i].mode()[0], inplace=True)
for i in train_df.columns[train_df.dtypes != "object"]:
    train_df[i].fillna(train_df[i].mean(), inplace=True)
for i in test_df.columns[test_df.dtypes == "object"]:
    test_df[i].fillna(train_df[i].mode()[0], inplace=True)
for i in test_df.columns[test_df.dtypes != "object"]:
    test_df[i].fillna(train_df[i].mean(), inplace=True)
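A quick assertion (not in the original run) confirms the imputation left no gaps in either frame:

assert train_df.isnull().sum().sum() == 0
assert test_df.isnull().sum().sum() == 0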
In [21]:
# Label encode categorical columns. Encoders are fit on train and applied to test,
# which assumes every category in the test set also appears in the training set.

cat_cols = sorted(set(train_df.columns[train_df.dtypes == "object"]) - {"Loan_ID", "Loan_Status"})
for i in cat_cols:
    le = LabelEncoder()
    train_df[i] = le.fit_transform(train_df[i].astype("str"))
    test_df[i] = le.transform(test_df[i].astype("str"))
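If the test set could ever contain a category absent from training, transform would raise a ValueError; a minimal defensive variant (a sketch, not part of the original run) fits each encoder on the union of both frames instead:

for i in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[i], test_df[i]]).astype("str"))
    train_df[i] = le.transform(train_df[i].astype("str"))
    test_df[i] = le.transform(test_df[i].astype("str"))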
In [22]:
# Create and fit the model

tra_col = sorted(set(train_df.columns) - {"Loan_ID", "Loan_Status"})
test_df_col = sorted(set(test_df.columns) - {"Loan_ID"})
print(tra_col, test_df_col)

from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(train_df[tra_col], train_df["Loan_Status"])
['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio'] ['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio']
Out[22]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
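The notebook goes straight from fitting to prediction. A minimal sketch of an out-of-sample estimate (not in the original run; assumes this xgboost version's sklearn wrapper accepts the string labels, as the fit above did):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(XGBClassifier(), train_df[tra_col], train_df["Loan_Status"], cv=5)
print(scores.mean(), scores.std())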
In [23]:
# Predict and create the submission file

result = pd.DataFrame({"Loan_ID": test_df.Loan_ID,
                       "Loan_Status": model.predict(test_df[test_df_col])}).reset_index(drop=True)

result[["Loan_ID", "Loan_Status"]].to_csv("loan_prediction_analyticsvidhya.csv", index=False)
In [ ]: