#!/usr/bin/env python
# coding: utf-8
#
# Loan-approval prediction (Analytics Vidhya practice problem).
# Pipeline: load train/test CSVs -> engineer EMI & income features ->
# impute missing values -> label-encode categoricals -> train an
# XGBoost classifier -> write the submission file.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import scipy.stats
import xgboost as xgb
from xgboost import XGBClassifier

datapath = 'Data/'

# ---- Load data ----
train_df = pd.read_csv(datapath + 'train_u6lujuX_CVtuZ9i.csv')
test_df = pd.read_csv(datapath + 'test_Y3wMUE5_7gLdaTN.csv')

# Quick EDA (original notebook cells): preview, target balance,
# null percentage per column, cardinality of Credit_History.
train_df.head()
train_df.Loan_Status.value_counts() / len(train_df)
train_df.isnull().sum() * 100 / len(train_df)
train_df['Credit_History'].nunique()

# ---- Feature engineering ----
# EMI = P*r*(1+r)^n / ((1+r)^n - 1)
# Interest rate is a per-gender average (8.65% for Male, 8.6% otherwise;
# a missing Gender deliberately falls into the non-Male rate, matching
# the original if/else). LoanAmount is in thousands, hence the *1000.
for df in (train_df, test_df):
    rate = np.where(df['Gender'] == 'Male', 8.65, 8.6) / (12 * 100)
    principal = df['LoanAmount'] * 1000
    growth = (1.0 + rate) ** df['Loan_Amount_Term']
    df['EMI'] = principal * rate * growth / (growth - 1)

# Combined household income and its ratio to the loan amount; the
# coapplicant column is folded into `income` and dropped.
for df in (train_df, test_df):
    df['income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    df['income_loan_ratio'] = df['income'] / df['LoanAmount']
    df.drop(['CoapplicantIncome'], axis=1, inplace=True)

train_df.shape, test_df.shape
train_df.info()

# ---- Missing-value imputation ----
# Categoricals get the mode, numerics get the mean. The test set is
# imputed with TRAIN statistics so no test-distribution information
# leaks in. (The original iterated train_df's columns while filling
# test_df -- a latent KeyError since test has no Loan_Status -- and
# filled test numerics with test means.)
for col in train_df.columns[train_df.dtypes == 'object']:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
for col in train_df.columns[train_df.dtypes != 'object']:
    train_df[col] = train_df[col].fillna(train_df[col].mean())
for col in test_df.columns[test_df.dtypes == 'object']:
    test_df[col] = test_df[col].fillna(train_df[col].mode()[0])
for col in test_df.columns[test_df.dtypes != 'object']:
    test_df[col] = test_df[col].fillna(train_df[col].mean())

# ---- Label-encode categoricals ----
# Fit each encoder on the UNION of train and test values so a category
# that appears only in the test set cannot crash transform().
# sorted(): deterministic column order (sets iterate unordered).
cat_cols = set(train_df.columns[train_df.dtypes == 'object']) - {'Loan_ID', 'Loan_Status'}
for col in sorted(cat_cols):
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], test_df[col]]).astype('str'))
    train_df[col] = le.transform(train_df[col].astype('str'))
    test_df[col] = le.transform(test_df[col].astype('str'))

# ---- Train model ----
# Sorted feature lists keep train/test column order aligned.
tra_col = sorted(set(train_df.columns) - {'Loan_ID', 'Loan_Status'})
test_df_col = sorted(set(test_df.columns) - {'Loan_ID'})
print(tra_col, test_df_col)

model = XGBClassifier()
model.fit(train_df[tra_col], train_df['Loan_Status'])

# ---- Predict and write submission ----
result = pd.DataFrame({
    'Loan_ID': test_df.Loan_ID,
    'Loan_Status': model.predict(test_df[test_df_col]),
}).reset_index(drop=True)
result[['Loan_ID', 'Loan_Status']].to_csv('loan_prediction_analyticsvidhya.csv', index=False)