In [2]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Loading the dataset

In [3]:
# Control delimiters, rows, column names with read_csv (see later) 
df = pd.read_csv("loans.csv", engine='python')

What does it look like?

In [4]:
# show all columns
pd.set_option('display.max_columns', None)
In [5]:
df.head()
Out[5]:
Unnamed: 0 id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util chargeoff_within_12_mths delinq_amnt mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit revol_bal_joint sec_app_earliest_cr_line sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med sec_app_mths_since_last_major_derog hardship_flag hardship_type hardship_reason hardship_status deferral_term hardship_amount hardship_start_date hardship_end_date payment_plan_start_date hardship_length hardship_dpd hardship_loan_status orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount disbursement_method debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date settlement_amount settlement_percentage settlement_term
0 1040017 NaN NaN 14000 14000 14000.0 36 months 12.69 469.63 C C2 Receiving Dock Worker 9 years MORTGAGE 40000.0 Not Verified 2015-10-01 Charged Off n NaN NaN debt_consolidation Debt consolidation 166xx PA 17.07 0.0 Jun-2001 1.0 NaN NaN 5.0 0.0 5848 90.0 15.0 f 0.0 0.0 6057.790000 6057.79 4091.51 1556.41 0.0 409.87 73.7766 Oct-2016 469.63 NaN Jul-2018 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 119776.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6500.0 NaN NaN NaN 4.0 23955.0 2167.0 90.0 0.0 0.0 141.0 172.0 3.0 3.0 1.0 3.0 NaN 3.0 NaN 0.0 3.0 3.0 8.0 8.0 6.0 3.0 8.0 3.0 5.0 NaN 0.0 0.0 2.0 100.0 100.0 0.0 0.0 123292.0 29809.0 6500.0 25992.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
1 1050463 NaN NaN 1000 1000 1000.0 36 months 9.17 31.88 B B2 Portfolio Manager 1 year MORTGAGE 80000.0 Verified 2015-10-01 Fully Paid n NaN NaN credit_card Credit card refinancing 949xx CA 12.51 0.0 Oct-1967 3.0 NaN 22.0 9.0 1.0 7634 37.2 32.0 w 0.0 0.0 1021.730000 1021.73 999.99 21.74 0.0 0.00 0.0000 Feb-2016 27.85 NaN Feb-2017 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 53994.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20500.0 NaN NaN NaN 4.0 5999.0 12866.0 37.2 0.0 0.0 188.0 575.0 4.0 4.0 3.0 4.0 NaN 1.0 NaN 0.0 3.0 3.0 6.0 16.0 9.0 6.0 20.0 3.0 9.0 0.0 0.0 0.0 3.0 100.0 0.0 1.0 0.0 80788.0 53994.0 20500.0 60288.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
2 1056254 NaN NaN 13000 13000 13000.0 36 months 6.89 400.75 A A3 mailhandler 10+ years RENT 70000.0 Not Verified 2015-10-01 Fully Paid n NaN NaN debt_consolidation Debt consolidation 751xx TX 21.72 0.0 Oct-1996 0.0 NaN NaN 16.0 0.0 11113 24.4 23.0 w 0.0 0.0 14425.915413 14425.92 13000.00 1425.92 0.0 0.00 0.0000 Oct-2018 401.38 NaN Jan-2019 0.0 NaN 1 Individual NaN NaN NaN 0.0 316.0 93918.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 45600.0 NaN NaN NaN 10.0 5870.0 28646.0 26.9 0.0 0.0 227.0 60.0 7.0 5.0 3.0 7.0 NaN 7.0 NaN 0.0 7.0 8.0 8.0 8.0 8.0 10.0 10.0 8.0 16.0 0.0 0.0 0.0 4.0 100.0 0.0 0.0 0.0 141771.0 43988.0 39200.0 42191.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
3 1013860 NaN NaN 5600 5600 5600.0 36 months 6.24 170.98 A A2 Managing Director 4 years RENT 49000.0 Source Verified 2015-10-01 Fully Paid n NaN NaN debt_consolidation Debt consolidation 968xx HI 2.45 0.0 Dec-1988 0.0 NaN NaN 4.0 0.0 3402 41.5 8.0 w 0.0 0.0 6051.344476 6051.34 5600.00 451.34 0.0 0.00 0.0000 Aug-2017 2635.62 NaN Aug-2017 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 3402.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8200.0 NaN NaN NaN 0.0 851.0 3898.0 46.6 0.0 0.0 NaN 322.0 26.0 26.0 0.0 26.0 NaN NaN NaN 0.0 3.0 3.0 3.0 6.0 0.0 4.0 8.0 3.0 4.0 0.0 0.0 0.0 0.0 100.0 33.3 0.0 0.0 8200.0 3402.0 7300.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
4 1018431 NaN NaN 11000 11000 11000.0 60 months 12.29 246.31 C C1 RTI Administrator 4 years MORTGAGE 86000.0 Not Verified 2015-10-01 Fully Paid n NaN NaN debt_consolidation Debt consolidation 388xx MS 24.35 2.0 Nov-2001 1.0 2.0 NaN 12.0 0.0 6062 68.9 24.0 w 0.0 0.0 14008.069373 14008.07 11000.00 3008.07 0.0 0.00 0.0000 Aug-2018 5894.86 NaN Feb-2019 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 331363.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8800.0 NaN NaN NaN 3.0 27614.0 1083.0 81.9 0.0 0.0 167.0 160.0 10.0 10.0 2.0 23.0 2.0 6.0 2.0 0.0 2.0 3.0 2.0 5.0 13.0 4.0 9.0 3.0 12.0 0.0 0.0 0.0 1.0 87.5 50.0 0.0 0.0 393082.0 81179.0 6000.0 118795.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
In [17]:
df.tail()
Out[17]:
Unnamed: 0 id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade emp_title emp_length home_ownership annual_inc verification_status issue_d loan_status pymnt_plan url desc purpose title zip_code addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal open_acc_6m open_act_il open_il_12m open_il_24m mths_since_rcnt_il total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util chargeoff_within_12_mths delinq_amnt mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_bc_dlq mths_since_recent_inq mths_since_recent_revol_delinq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit revol_bal_joint sec_app_earliest_cr_line sec_app_inq_last_6mths sec_app_mort_acc sec_app_open_acc sec_app_revol_util sec_app_open_act_il sec_app_num_rev_accts sec_app_chargeoff_within_12_mths sec_app_collections_12_mths_ex_med sec_app_mths_since_last_major_derog hardship_flag hardship_type hardship_reason hardship_status deferral_term hardship_amount hardship_start_date hardship_end_date payment_plan_start_date hardship_length hardship_dpd hardship_loan_status orig_projected_additional_accrued_interest hardship_payoff_balance_amount hardship_last_payment_amount disbursement_method debt_settlement_flag debt_settlement_flag_date settlement_status settlement_date settlement_amount settlement_percentage settlement_term
749995 7605 NaN NaN 15000 15000 15000.0 36 months 13.56 509.47 C C1 Driver 4 years RENT 55000.0 Not Verified 2018-12-01 Current n NaN NaN debt_consolidation Debt consolidation 171xx PA 17.15 0.0 Oct-2012 0.0 NaN NaN 9.0 0.0 7277 40.0 14.0 w 14316.22 14316.22 1001.99 1001.99 683.78 318.21 0.0 0.0 0.0 Feb-2019 509.47 Mar-2019 Feb-2019 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 21229.0 0.0 2.0 1.0 3.0 8.0 13952.0 85.0 1.0 1.0 4778.0 61.0 18200.0 1.0 0.0 2.0 4.0 2359.0 2457.0 69.7 0.0 0.0 74.0 54.0 12.0 8.0 0.0 35.0 NaN 8.0 NaN 0.0 2.0 5.0 2.0 2.0 4.0 7.0 10.0 5.0 9.0 0.0 0.0 0.0 2.0 100.0 50.0 0.0 0.0 34632.0 21229.0 8100.0 16432.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
749996 35399 NaN NaN 8000 8000 8000.0 36 months 6.46 245.05 A A1 Realtor 2 years MORTGAGE 100000.0 Not Verified 2018-12-01 Current n NaN NaN debt_consolidation Debt consolidation 231xx VA 9.90 1.0 Sep-2001 0.0 21.0 NaN 13.0 0.0 17646 51.7 29.0 w 7389.00 7389.00 692.18 692.18 611.00 81.18 0.0 0.0 0.0 Feb-2019 245.05 Mar-2019 Feb-2019 0.0 NaN 1 Individual NaN NaN NaN 0.0 3323.0 326631.0 1.0 2.0 0.0 0.0 31.0 13286.0 57.0 2.0 3.0 2173.0 54.0 34100.0 0.0 3.0 0.0 3.0 27219.0 12434.0 26.0 0.0 0.0 82.0 206.0 2.0 2.0 2.0 2.0 63.0 17.0 21.0 0.0 4.0 5.0 5.0 11.0 3.0 10.0 24.0 5.0 13.0 0.0 0.0 0.0 2.0 82.8 40.0 0.0 0.0 381832.0 30932.0 16800.0 23370.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN DirectPay N NaN NaN NaN NaN NaN NaN
749997 8452 NaN NaN 40000 40000 40000.0 60 months 16.14 975.71 C C4 Director of Operations 3 years MORTGAGE 157630.0 Verified 2018-12-01 Current n NaN NaN credit_card Credit card refinancing 972xx OR 16.61 1.0 May-2004 0.0 18.0 NaN 11.0 0.0 15761 56.7 21.0 w 39562.28 39562.28 1155.05 1155.05 437.72 717.33 0.0 0.0 0.0 Feb-2019 1208.85 Mar-2019 Feb-2019 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 474781.0 1.0 4.0 1.0 1.0 5.0 59310.0 38.0 0.0 0.0 9029.0 43.0 27800.0 0.0 5.0 0.0 1.0 43162.0 9453.0 59.8 0.0 0.0 71.0 175.0 27.0 5.0 2.0 27.0 NaN NaN NaN 0.0 3.0 4.0 3.0 4.0 6.0 6.0 13.0 4.0 11.0 0.0 0.0 0.0 1.0 95.2 66.7 0.0 0.0 542479.0 75071.0 23500.0 100389.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
749998 21664 NaN NaN 7000 7000 7000.0 36 months 8.19 219.97 A A4 Bill Poster 10+ years MORTGAGE 42000.0 Verified 2018-12-01 Current n NaN NaN other Other 707xx LA 6.49 0.0 Oct-2003 0.0 NaN NaN 7.0 0.0 6898 53.1 22.0 w 6654.43 6654.43 456.87 456.87 345.57 111.30 0.0 0.0 0.0 Feb-2019 243.27 Mar-2019 Feb-2019 0.0 NaN 1 Joint App 70000.0 4.35 Verified 0.0 440.0 114983.0 0.0 0.0 0.0 0.0 49.0 0.0 NaN 1.0 1.0 1048.0 53.0 13000.0 0.0 0.0 0.0 1.0 16426.0 1620.0 64.0 0.0 0.0 127.0 160.0 8.0 8.0 4.0 8.0 NaN NaN NaN 0.0 3.0 5.0 3.0 3.0 13.0 5.0 5.0 5.0 7.0 0.0 0.0 0.0 1.0 100.0 33.3 0.0 0.0 232200.0 6898.0 4500.0 0.0 7445.0 Jul-2003 0.0 4.0 6.0 59.8 0.0 6.0 0.0 0.0 58.0 N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN
749999 24316 NaN NaN 15000 15000 15000.0 60 months 17.97 380.66 D D1 Document 4 years RENT 110000.0 Not Verified 2018-12-01 Current n NaN NaN debt_consolidation Debt consolidation 950xx CA 13.28 0.0 Mar-2011 0.0 NaN NaN 10.0 0.0 13439 72.3 13.0 w 14685.59 14685.59 746.34 746.34 314.41 431.93 0.0 0.0 0.0 Feb-2019 380.66 Mar-2019 Feb-2019 0.0 NaN 1 Individual NaN NaN NaN 0.0 0.0 24340.0 0.0 2.0 0.0 2.0 16.0 10901.0 54.0 1.0 1.0 3313.0 63.0 18600.0 1.0 1.0 0.0 3.0 2704.0 3131.0 73.7 0.0 0.0 68.0 92.0 7.0 7.0 0.0 7.0 NaN 16.0 NaN 0.0 4.0 7.0 4.0 4.0 3.0 8.0 10.0 7.0 10.0 0.0 0.0 0.0 1.0 100.0 75.0 0.0 0.0 38908.0 24340.0 11900.0 20308.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Cash N NaN NaN NaN NaN NaN NaN

What is the total number of loans?

In [18]:
len(df)
Out[18]:
750000

How many columns does the dataset have?

In [6]:
len(df.columns)
Out[6]:
146

How many columns contain missing data?

In [6]:
# count missing values per column, keeping only the columns that have any
miss_values_count = df.isnull().sum()
miss_values_count = miss_values_count[miss_values_count != 0]

What is the count of missing data for each of these columns?

In [7]:
print(f"Number of columns with missing values: {miss_values_count.shape[0]}")
if miss_values_count.shape[0]:
    print("Null value count per column: ")
    for name, miss_vals in miss_values_count.items():
        p = miss_vals > 1
        print(f"  - For column name '{name}' there are{'n' if p else ''} "
              f"{miss_vals} missing values{'s' if p else ''}.")
Number of columns with missing values: 74
Null value count per column: 
  - For column name 'id' there are 750000 missing values.
  - For column name 'member_id' there are 750000 missing values.
  - For column name 'emp_title' there are 61669 missing values.
  - For column name 'emp_length' there are 54976 missing values.
  - For column name 'url' there are 750000 missing values.
  - For column name 'desc' there are 749986 missing values.
  - For column name 'title' there are 11781 missing values.
  - For column name 'dti' there are 835 missing values.
  - For column name 'inq_last_6mths' there is 1 missing value.
  - For column name 'mths_since_last_delinq' there are 382775 missing values.
  - For column name 'mths_since_last_record' there are 627757 missing values.
  - For column name 'revol_util' there are 659 missing values.
  - For column name 'last_pymnt_d' there are 909 missing values.
  - For column name 'next_pymnt_d' there are 295358 missing values.
  - For column name 'last_credit_pull_d' there are 18 missing values.
  - For column name 'mths_since_last_major_derog' there are 552182 missing values.
  - For column name 'annual_inc_joint' there are 689705 missing values.
  - For column name 'dti_joint' there are 689705 missing values.
  - For column name 'verification_status_joint' there are 692145 missing values.
  - For column name 'open_acc_6m' there are 52540 missing values.
  - For column name 'open_act_il' there are 52539 missing values.
  - For column name 'open_il_12m' there are 52539 missing values.
  - For column name 'open_il_24m' there are 52539 missing values.
  - For column name 'mths_since_rcnt_il' there are 74494 missing values.
  - For column name 'total_bal_il' there are 52539 missing values.
  - For column name 'il_util' there are 153788 missing values.
  - For column name 'open_rv_12m' there are 52539 missing values.
  - For column name 'open_rv_24m' there are 52539 missing values.
  - For column name 'max_bal_bc' there are 52539 missing values.
  - For column name 'all_util' there are 52662 missing values.
  - For column name 'inq_fi' there are 52539 missing values.
  - For column name 'total_cu_tl' there are 52540 missing values.
  - For column name 'inq_last_12m' there are 52540 missing values.
  - For column name 'avg_cur_bal' there are 32 missing values.
  - For column name 'bc_open_to_buy' there are 9089 missing values.
  - For column name 'bc_util' there are 9415 missing values.
  - For column name 'mo_sin_old_il_acct' there are 23463 missing values.
  - For column name 'mths_since_recent_bc' there are 8576 missing values.
  - For column name 'mths_since_recent_bc_dlq' there are 577966 missing values.
  - For column name 'mths_since_recent_inq' there are 86405 missing values.
  - For column name 'mths_since_recent_revol_delinq' there are 502543 missing values.
  - For column name 'num_tl_120dpd_2m' there are 31530 missing values.
  - For column name 'percent_bc_gt_75' there are 9163 missing values.
  - For column name 'revol_bal_joint' there are 696039 missing values.
  - For column name 'sec_app_earliest_cr_line' there are 696039 missing values.
  - For column name 'sec_app_inq_last_6mths' there are 696039 missing values.
  - For column name 'sec_app_mort_acc' there are 696039 missing values.
  - For column name 'sec_app_open_acc' there are 696039 missing values.
  - For column name 'sec_app_revol_util' there are 696953 missing values.
  - For column name 'sec_app_open_act_il' there are 696039 missing values.
  - For column name 'sec_app_num_rev_accts' there are 696039 missing values.
  - For column name 'sec_app_chargeoff_within_12_mths' there are 696039 missing values.
  - For column name 'sec_app_collections_12_mths_ex_med' there are 696039 missing values.
  - For column name 'sec_app_mths_since_last_major_derog' there are 732082 missing values.
  - For column name 'hardship_type' there are 745844 missing values.
  - For column name 'hardship_reason' there are 745844 missing values.
  - For column name 'hardship_status' there are 745844 missing values.
  - For column name 'deferral_term' there are 745844 missing values.
  - For column name 'hardship_amount' there are 745844 missing values.
  - For column name 'hardship_start_date' there are 745844 missing values.
  - For column name 'hardship_end_date' there are 745844 missing values.
  - For column name 'payment_plan_start_date' there are 745844 missing values.
  - For column name 'hardship_length' there are 745844 missing values.
  - For column name 'hardship_dpd' there are 745844 missing values.
  - For column name 'hardship_loan_status' there are 745844 missing values.
  - For column name 'orig_projected_additional_accrued_interest' there are 746723 missing values.
  - For column name 'hardship_payoff_balance_amount' there are 745844 missing values.
  - For column name 'hardship_last_payment_amount' there are 745844 missing values.
  - For column name 'debt_settlement_flag_date' there are 740517 missing values.
  - For column name 'settlement_status' there are 740517 missing values.
  - For column name 'settlement_date' there are 740517 missing values.
  - For column name 'settlement_amount' there are 740517 missing values.
  - For column name 'settlement_percentage' there are 740517 missing values.
  - For column name 'settlement_term' there are 740517 missing values.

How has the average interest rate of a loan varied over time?

If the rate were variable, its variation would live in a table of payments rather than in the table of loans, which is our sample; in any case, variable rates have been rare since the 1990s. With a fixed rate there is no variation over the life of a loan, and the rate is simply int_rate. To speak of the variation of any value we would also have to define whether we mean the scheduled variation (computed with financial formulas, of which there are many) or the realised one (derived from the data, normally the payments).
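
To make the scheduled-rate view concrete, here is a minimal sketch that averages int_rate by month of issuance (assuming issue_d parses as a date, which it does in this dump):

In [ ]:
# Average contractual rate per issuance month (sketch; issue_d assumed parseable)
issue_month = pd.to_datetime(df["issue_d"]).dt.to_period("M")
avg_rate = df.groupby(issue_month)["int_rate"].mean()

plt.figure(figsize=(12, 4))
avg_rate.plot()
plt.xlabel("issue month")
plt.ylabel("average int_rate (%)")
plt.title("Average interest rate by month of issuance")
plt.show()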

What is the distribution of sub-grades, and can we easily segment borrowers into prime and sub-prime loans?

In [91]:
plt.figure(figsize=(16, 6))
sns.countplot(x="sub_grade", data=df, order = df['sub_grade'].value_counts().index)
Out[91]:
<matplotlib.axes._subplots.AxesSubplot at 0x16a730644e0>

But can we easily segment borrowers into prime and sub-prime loans? Maybe we need fewer or more categories? I would rather segment them by whether they pay their loan back.
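
As a hedged sketch of that idea (the prime/sub-prime boundary at grade B is my assumption, not an official definition), we can compare repayment outcomes on completed loans:

In [ ]:
# Compare repayment outcomes under an assumed prime (A-B) / sub-prime (C-G) split
tmp = df[["grade", "loan_status"]].copy()
tmp["segment"] = np.where(tmp["grade"].isin(["A", "B"]), "prime", "sub-prime")
done = tmp[tmp["loan_status"].isin(["Fully Paid", "Charged Off"])]
print(done.groupby("segment")["loan_status"].value_counts(normalize=True))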

In [35]:
df[['sub_grade','loan_amnt','int_rate']].groupby('sub_grade').describe().reset_index()
Out[35]:
sub_grade int_rate loan_amnt
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
0 A1 34251.0 5.544917 0.389030 5.31 5.32 5.32 6.11 6.46 34251.0 15778.950980 9849.357821 1000.0 8400.0 13000.0 20000.0 40000.0
1 A2 25190.0 6.587943 0.381601 6.00 6.19 6.67 6.99 7.07 25190.0 14552.648869 9554.564306 1000.0 7500.0 12000.0 20000.0 40000.0
2 A3 26431.0 7.062373 0.266037 6.00 6.83 7.21 7.21 7.59 26431.0 14292.664863 9544.563417 1000.0 7000.0 12000.0 20000.0 40000.0
3 A4 33057.0 7.557079 0.283809 6.00 7.35 7.39 7.84 8.19 33057.0 15257.654203 9961.010378 1000.0 8000.0 12000.0 20000.0 40000.0
4 A5 34741.0 8.118880 0.285978 6.00 7.96 7.97 8.46 8.81 34741.0 14407.389684 9448.639545 1000.0 7200.0 12000.0 20000.0 40000.0
5 B1 43870.0 9.075405 0.682017 6.00 8.39 9.43 9.44 10.33 43870.0 14550.149305 9491.496416 1000.0 7500.0 12000.0 20000.0 40000.0
6 B2 42643.0 9.892638 0.510048 6.00 9.49 9.93 10.47 10.72 42643.0 14847.658467 9750.694550 1000.0 7200.0 12000.0 20000.0 40000.0
7 B3 41821.0 10.494923 0.545622 6.00 9.99 10.42 11.06 11.39 41821.0 14366.098372 9434.344664 1000.0 7000.0 12000.0 20000.0 40000.0
8 B4 46010.0 11.108060 0.335650 6.00 10.90 10.99 11.44 11.80 46010.0 14328.037927 9387.670226 1000.0 7000.0 12000.0 20000.0 40000.0
9 B5 49638.0 11.844107 0.462537 6.00 11.49 11.53 11.99 12.98 49638.0 14274.199706 9438.912459 1000.0 7000.0 12000.0 20000.0 40000.0
10 C1 50421.0 12.674281 0.472692 6.00 12.61 12.62 12.79 13.56 50421.0 14868.141746 9538.444558 1000.0 7500.0 12000.0 20000.0 40000.0
11 C2 43310.0 13.525962 0.525845 6.00 13.49 13.49 13.59 14.47 43310.0 14893.247518 9528.672902 1000.0 7450.0 12000.0 20000.0 40000.0
12 C3 42908.0 14.082712 0.512439 6.00 13.99 14.07 14.08 15.02 42908.0 15515.432437 9502.942268 1000.0 8000.0 14000.0 21000.0 40000.0
13 C4 43154.0 14.942308 0.636235 6.00 14.49 14.99 15.05 16.14 43154.0 15772.832182 9479.332442 1000.0 8500.0 14000.0 21000.0 40000.0
14 C5 40569.0 15.911166 0.628875 6.00 15.59 16.01 16.02 16.91 40569.0 15884.065419 9564.053499 1000.0 8450.0 14400.0 21525.0 40000.0
15 D1 25053.0 16.962046 0.725618 6.00 16.29 16.99 17.47 17.97 25053.0 15663.602163 9352.801072 1000.0 8325.0 14400.0 21000.0 40000.0
16 D2 23256.0 18.004723 0.691684 6.00 17.99 18.06 18.45 18.94 23256.0 15631.319874 9398.099400 1000.0 8000.0 14400.0 21600.0 40000.0
17 D3 20643.0 18.955224 0.809744 6.00 18.99 19.03 19.42 19.92 20643.0 16024.160732 9436.456261 1000.0 9000.0 15000.0 22250.0 40000.0
18 D4 17112.0 19.819662 0.970901 6.00 19.99 20.00 20.39 20.89 17112.0 16005.144051 9351.079353 1000.0 9175.0 15000.0 22000.0 40000.0
19 D5 14683.0 21.075908 1.350807 6.00 21.45 21.45 21.85 22.35 14683.0 16716.408432 9438.190236 1000.0 10000.0 15000.0 24000.0 40000.0
20 E1 8793.0 21.606883 1.829379 6.00 19.99 22.74 22.91 23.40 8793.0 16125.039804 9578.769471 1000.0 9000.0 15000.0 23000.0 40000.0
21 E2 7513.0 22.540733 2.144985 6.00 20.75 23.88 23.99 24.37 7513.0 16362.631439 9718.351712 1000.0 9000.0 15000.0 24000.0 40000.0
22 E3 7218.0 23.377350 2.274183 6.00 21.18 24.74 24.85 25.34 7218.0 17393.821003 9418.336317 1000.0 10000.0 16000.0 24000.0 40000.0
23 E4 6285.0 24.343181 2.246686 6.00 21.97 25.49 25.82 26.31 6285.0 17965.588703 9295.016271 1000.0 10975.0 16600.0 25000.0 40000.0
24 E5 7036.0 25.514105 1.981731 6.00 25.29 26.30 26.77 27.27 7036.0 18276.190307 8944.332753 1000.0 12000.0 17350.0 25000.0 40000.0
25 F1 3563.0 26.225897 2.774035 6.00 23.13 28.69 28.72 28.72 3563.0 19065.401347 8867.203485 1000.0 12000.0 18000.0 25000.0 40000.0
26 F2 2333.0 26.784303 2.794799 6.00 24.11 26.49 29.69 29.69 2333.0 18874.882126 9168.830490 1000.0 12000.0 18000.0 25000.0 40000.0
27 F3 1877.0 27.699020 2.507817 23.99 24.99 26.99 30.17 30.17 1877.0 19339.637720 9038.327304 1000.0 12000.0 18125.0 26000.0 40000.0
28 F4 1525.0 28.276000 2.494425 6.00 25.88 27.49 30.65 30.65 1525.0 18940.065574 8977.055891 1000.0 12000.0 18000.0 25600.0 40000.0
29 F5 1442.0 28.909563 2.035321 25.78 26.57 30.74 30.75 30.75 1442.0 20080.582524 9452.258874 1000.0 12500.0 19000.0 28000.0 40000.0
30 G1 1264.0 29.582540 1.746886 6.00 28.18 30.79 30.79 30.79 1264.0 20690.901899 9086.732784 1000.0 14000.0 20000.0 28000.0 40000.0
31 G2 715.0 29.560979 1.640559 6.00 28.14 30.84 30.84 30.84 715.0 18973.846154 9553.073015 1000.0 12000.0 17675.0 26600.0 40000.0
32 G3 613.0 29.837504 1.555991 6.00 28.34 30.89 30.89 30.89 613.0 20049.755302 9058.333894 1800.0 12900.0 18425.0 27300.0 40000.0
33 G4 550.0 30.309164 1.388560 6.00 29.96 30.94 30.94 30.94 550.0 20732.318182 8969.498118 1000.0 13000.0 19787.5 28287.5 40000.0
34 G5 512.0 30.622813 0.775062 28.99 30.99 30.99 30.99 30.99 512.0 20290.039062 9246.073949 2000.0 12212.5 19750.0 28000.0 40000.0

Can we visualise a low-dimensional split in the dataset when the target variable is the grade?

In [155]:
collist = df.columns.tolist()
# you can now select any arbitrary range from this list
df1 = df[collist[0:1]]
# or remove a column
collist.remove('sub_grade')
# now select
df1 = df[collist]

Since we are considering the grade rather than the interest rate, we drop the latter.

In [156]:
df1 = df1.drop(columns = ['int_rate'])

We transform the grades to numeric values

In [157]:
df1['ascii'] = [ord(x) for x in df1['grade']]
In [158]:
df1 = df1.drop(columns=['grade'])

Here we first plot the Pearson correlation heatmap and look at the correlation of the independent variables with the output variable, grade. We will only keep features whose correlation with the output variable is above 0.5 in absolute value.

In [182]:
#Using Pearson Correlation
cor = df1.corr()
In [180]:
mask = np.zeros_like(cor, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(12,10))
sns.heatmap(cor,
            vmin=-1,
            cmap='coolwarm',
            annot=False,
            mask = mask);
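
The 0.5 threshold mentioned above can be made explicit with a small sketch on the matrix just computed:

In [ ]:
# Absolute Pearson correlation of each feature with the encoded grade ("ascii")
target_corr = cor["ascii"].drop("ascii").abs().sort_values(ascending=False)
print(target_corr[target_corr > 0.5])  # empty: no feature clears the threshold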

As we can see, no features are highly correlated with the output variable grade (named ascii here so that it can be represented numerically). Hence we cannot get insights from the Pearson correlation alone. However, this is not the end of the process: we can still handpick the explanatory features.
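
To answer the visualisation question directly, here is a hedged sketch: project the numeric columns that have no missing values onto their first two principal components and colour the points by the encoded grade (the sampling and column choices are illustrative, not the original analysis):

In [ ]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Keep complete numeric columns only, and a sample of rows to stay fast
numeric = df1.select_dtypes(include=[np.number]).dropna(axis=1)
sample = numeric.sample(20000, random_state=0)
X = sample.drop(columns=["ascii", "Unnamed: 0"], errors="ignore")
y = sample["ascii"]

coords = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))
plt.figure(figsize=(10, 8))
plt.scatter(coords[:, 0], coords[:, 1], c=y, cmap="viridis", s=4, alpha=0.5)
plt.colorbar(label="grade (ASCII code)")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()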

In [166]:
count = 0
# `subset` is the row subset of df prepared earlier (its definition is not shown in this excerpt)
for y in subset.columns:
    if subset[y].dtype == object:
        count += 1
print("number of categorical variables : " + str(count))

print(f"{subset.dtypes}\n")
print(f"Sum of null values in each feature:\n{35 * '-'}")
print(f"{subset.isnull().sum()}")
number of categorical variables : 36
Unnamed: 0                                      int64
id                                            float64
member_id                                     float64
loan_amnt                                       int64
funded_amnt                                     int64
funded_amnt_inv                               float64
term                                           object
int_rate                                      float64
installment                                   float64
grade                                          object
sub_grade                                      object
emp_title                                      object
emp_length                                     object
home_ownership                                 object
annual_inc                                    float64
verification_status                            object
issue_d                                        object
loan_status                                    object
pymnt_plan                                     object
url                                           float64
desc                                           object
purpose                                        object
title                                          object
zip_code                                       object
addr_state                                     object
dti                                           float64
delinq_2yrs                                   float64
earliest_cr_line                               object
inq_last_6mths                                float64
mths_since_last_delinq                        float64
                                               ...   
sec_app_open_acc                              float64
sec_app_revol_util                            float64
sec_app_open_act_il                           float64
sec_app_num_rev_accts                         float64
sec_app_chargeoff_within_12_mths              float64
sec_app_collections_12_mths_ex_med            float64
sec_app_mths_since_last_major_derog           float64
hardship_flag                                  object
hardship_type                                  object
hardship_reason                                object
hardship_status                                object
deferral_term                                 float64
hardship_amount                               float64
hardship_start_date                            object
hardship_end_date                              object
payment_plan_start_date                        object
hardship_length                               float64
hardship_dpd                                  float64
hardship_loan_status                           object
orig_projected_additional_accrued_interest    float64
hardship_payoff_balance_amount                float64
hardship_last_payment_amount                  float64
disbursement_method                            object
debt_settlement_flag                           object
debt_settlement_flag_date                      object
settlement_status                              object
settlement_date                                object
settlement_amount                             float64
settlement_percentage                         float64
settlement_term                               float64
Length: 146, dtype: object

Sum of null values in each feature:
-----------------------------------
Unnamed: 0                                         0
id                                            295358
member_id                                     295358
loan_amnt                                          0
funded_amnt                                        0
funded_amnt_inv                                    0
term                                               0
int_rate                                           0
installment                                        0
grade                                              0
sub_grade                                          0
emp_title                                      20400
emp_length                                     20036
home_ownership                                     0
annual_inc                                         0
verification_status                                0
issue_d                                            0
loan_status                                        0
pymnt_plan                                         0
url                                           295358
desc                                          295348
purpose                                            0
title                                           7834
zip_code                                           0
addr_state                                         0
dti                                              168
delinq_2yrs                                        0
earliest_cr_line                                   0
inq_last_6mths                                     1
mths_since_last_delinq                        144176
                                               ...  
sec_app_open_acc                              287070
sec_app_revol_util                            287222
sec_app_open_act_il                           287070
sec_app_num_rev_accts                         287070
sec_app_chargeoff_within_12_mths              287070
sec_app_collections_12_mths_ex_med            287070
sec_app_mths_since_last_major_derog           292362
hardship_flag                                      0
hardship_type                                 293455
hardship_reason                               293455
hardship_status                               293455
deferral_term                                 293455
hardship_amount                               293455
hardship_start_date                           293455
hardship_end_date                             293455
payment_plan_start_date                       293455
hardship_length                               293455
hardship_dpd                                  293455
hardship_loan_status                          293455
orig_projected_additional_accrued_interest    294215
hardship_payoff_balance_amount                293455
hardship_last_payment_amount                  293455
disbursement_method                                0
debt_settlement_flag                               0
debt_settlement_flag_date                     286344
settlement_status                             286344
settlement_date                               286344
settlement_amount                             286344
settlement_percentage                         286344
settlement_term                               286344
Length: 146, dtype: int64

Things worth knowing for the predictive model

  • The data covers 110909 loans funded by the platform in 2018.
  • 74 columns contain missing values.
  • 36 columns hold categorical data.
  • The data set is quite imbalanced: the G sub-grades together account for fewer than 10,000 loans, whereas C1 alone has more than 50,000 (see the quick checks below).
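
These bullets can be verified directly from the frame (a sketch; issue_d is assumed to parse as a date):

In [ ]:
# Quick checks behind the bullet points above
issue_year = pd.to_datetime(df["issue_d"]).dt.year
print((issue_year == 2018).sum(), "loans funded in 2018")
print(df.isnull().any().sum(), "columns with missing values")
print(df["grade"].value_counts())  # class imbalance across grade letters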

Cleaning the data

As we have observed, some columns like annual_inc and int_rate may be very useful for building our model, while others like id and member_id will not help. So we hand-pick a set of columns.

In [116]:
APPLICANT_NUMERIC = ['annual_inc', 'dti', 'loan_amnt', 'installment']
APPLICANT_CATEGORICAL = ['application_type', 'emp_length', 'home_ownership', 'addr_state', 'term']
CREDIT_NUMERIC = ['acc_now_delinq', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy',
                  'bc_util', 'delinq_2yrs', 'delinq_amnt',
                  'open_acc', 'pub_rec', 'revol_util',
                  'revol_bal', 'tot_coll_amt', 'tot_cur_bal', 'total_acc', 'total_rev_hi_lim',
                  'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats',
                  'num_bc_tl', 'num_il_tl', 'num_rev_tl_bal_gt_0', 'pct_tl_nvr_dlq',
                  'percent_bc_gt_75', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
                  'total_il_high_credit_limit', 'all_util',
                  'il_util', 'total_bal_il', 'total_cu_tl']
TARGET = ['grade']

df = pd.read_csv("loans.csv", usecols = APPLICANT_NUMERIC +APPLICANT_CATEGORICAL + CREDIT_NUMERIC + TARGET)

We did not select columns like 'title' and 'emp_title': they contain arbitrary free text that cannot be one-hot or label encoded, since they have a huge number of distinct values with very few observations each.
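
A quick sketch supports that choice: re-reading just those two columns (they were dropped from df by the usecols re-load above) shows their cardinality:

In [ ]:
# How many distinct values each free-text column has, and how rare each value is
text_cols = pd.read_csv("loans.csv", usecols=["title", "emp_title"])
print(text_cols.nunique())
print(text_cols["emp_title"].value_counts().head())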

In [125]:
# We order our grade category so the order of grades doesn't appear random in graphs
from pandas.api.types import CategoricalDtype
grade_categories = list("ABCDEFG")
df["grade"] = df["grade"].astype(CategoricalDtype(categories=grade_categories, ordered=True))

# Sanity check that we're working with cleaned data
bad_rows = df.isnull().any(axis=1).sum()
if bad_rows > 0:
    print("Rows with null/NaN values: {}".format(bad_rows))
    print("Columns with null/NaN values:")
    print(pd.isnull(df).sum() > 0)
    print("Dropping bad rows...")
    df.dropna(axis=0, how='any', inplace=True)
    print("Rows with null/NaN values: {}".format(df.isnull().T.any().T.sum()))
    
print(df.info(null_counts = True, memory_usage = "deep", verbose = True))
Rows with null/NaN values: 197840
Columns with null/NaN values:
loan_amnt                     False
term                          False
installment                   False
grade                         False
emp_length                     True
home_ownership                False
annual_inc                    False
addr_state                    False
dti                            True
delinq_2yrs                   False
open_acc                      False
pub_rec                       False
revol_bal                     False
revol_util                     True
total_acc                     False
application_type              False
acc_now_delinq                False
tot_coll_amt                  False
tot_cur_bal                   False
total_bal_il                   True
il_util                        True
all_util                       True
total_rev_hi_lim              False
total_cu_tl                    True
acc_open_past_24mths          False
avg_cur_bal                    True
bc_open_to_buy                 True
bc_util                        True
delinq_amnt                   False
num_accts_ever_120_pd         False
num_actv_bc_tl                False
num_actv_rev_tl               False
num_bc_sats                   False
num_bc_tl                     False
num_il_tl                     False
num_rev_tl_bal_gt_0           False
pct_tl_nvr_dlq                False
percent_bc_gt_75               True
tot_hi_cred_lim               False
total_bal_ex_mort             False
total_bc_limit                False
total_il_high_credit_limit    False
dtype: bool
Dropping bad rows...
Rows with null/NaN values: 0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 552160 entries, 41131 to 749999
Data columns (total 42 columns):
loan_amnt                     552160 non-null int64
term                          552160 non-null object
installment                   552160 non-null float64
grade                         552160 non-null category
emp_length                    552160 non-null object
home_ownership                552160 non-null object
annual_inc                    552160 non-null float64
addr_state                    552160 non-null object
dti                           552160 non-null float64
delinq_2yrs                   552160 non-null float64
open_acc                      552160 non-null float64
pub_rec                       552160 non-null float64
revol_bal                     552160 non-null int64
revol_util                    552160 non-null float64
total_acc                     552160 non-null float64
application_type              552160 non-null object
acc_now_delinq                552160 non-null float64
tot_coll_amt                  552160 non-null float64
tot_cur_bal                   552160 non-null float64
total_bal_il                  552160 non-null float64
il_util                       552160 non-null float64
all_util                      552160 non-null float64
total_rev_hi_lim              552160 non-null float64
total_cu_tl                   552160 non-null float64
acc_open_past_24mths          552160 non-null float64
avg_cur_bal                   552160 non-null float64
bc_open_to_buy                552160 non-null float64
bc_util                       552160 non-null float64
delinq_amnt                   552160 non-null float64
num_accts_ever_120_pd         552160 non-null float64
num_actv_bc_tl                552160 non-null float64
num_actv_rev_tl               552160 non-null float64
num_bc_sats                   552160 non-null float64
num_bc_tl                     552160 non-null float64
num_il_tl                     552160 non-null float64
num_rev_tl_bal_gt_0           552160 non-null float64
pct_tl_nvr_dlq                552160 non-null float64
percent_bc_gt_75              552160 non-null float64
tot_hi_cred_lim               552160 non-null float64
total_bal_ex_mort             552160 non-null float64
total_bc_limit                552160 non-null float64
total_il_high_credit_limit    552160 non-null float64
dtypes: category(1), float64(34), int64(2), object(5)
memory usage: 325.2 MB
None
In [126]:
df.head()
Out[126]:
loan_amnt term installment grade emp_length home_ownership annual_inc addr_state dti delinq_2yrs ... num_bc_sats num_bc_tl num_il_tl num_rev_tl_bal_gt_0 pct_tl_nvr_dlq percent_bc_gt_75 tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit
41131 24000 36 months 757.51 B 1 year MORTGAGE 80000.0 WA 11.80 0.0 ... 11.0 11.0 29.0 7.0 100.0 9.1 501190.0 167271.0 137400.0 121695.0
41135 8000 36 months 257.39 B 6 years RENT 55000.0 CO 10.16 0.0 ... 4.0 13.0 6.0 4.0 95.7 25.0 27933.0 21907.0 7800.0 17349.0
41136 19975 60 months 453.27 C 1 year MORTGAGE 92000.0 GA 19.05 0.0 ... 4.0 9.0 36.0 6.0 83.0 50.0 406013.0 211797.0 12300.0 217689.0
41139 24000 60 months 603.85 D 3 years RENT 72500.0 CA 22.89 0.0 ... 5.0 7.0 9.0 7.0 100.0 80.0 73333.0 64376.0 17300.0 41233.0
41141 20000 60 months 461.96 C 8 years MORTGAGE 62000.0 NC 7.14 0.0 ... 6.0 10.0 5.0 6.0 90.9 16.7 188975.0 14423.0 18800.0 6914.0

5 rows × 42 columns

In [127]:
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as ticker

sns.reset_orig()
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)
corr = df.corr()  # only the numeric columns enter the correlation matrix
cax = ax.matshow(corr, cmap=plt.cm.Blues)
fig.colorbar(cax)

# label ticks with the columns actually present in the correlation matrix
ax.set_xticklabels([''] + corr.columns.tolist(), rotation=90)
ax.set_yticklabels([''] + corr.columns.tolist())
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

Next we can take a look at the distribution of some values by loan grade for some features of interest using "violin" charts.

In [128]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
plt.rcParams["figure.figsize"] = (20,8)
plt.rcParams["axes.titlesize"] = 18
plt.rcParams["axes.labelsize"] = 16
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12

# Create violinplot
plt.subplot(121)
v1 = sns.violinplot(x = "revol_util", y="grade", data=df)
v1.axes.set_title("Amount of Credit Line Used Relative to Total Credit Available by Loan Grade", fontsize=20)

plt.subplot(122)
v2 = sns.violinplot(x = "acc_open_past_24mths", y="grade", data=df)
v2.axes.set_title("Accts Opened in Past 24 Months by Loan Grade", fontsize=20)

# Show the plot
plt.show()

The distribution of credit used relative to total credit available differs by loan grade, with A-grade loans showing higher values than G-grade loans. There is also an interesting trend in the second graph: the distribution of accounts opened in the past 24 months shifts between the higher-grade and the lower-grade loans. So even though the linear correlation is much lower, the inner distribution of these account openings gives some major insights.
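
A numeric companion to the violin plots (a quick sketch) backs up that reading with per-grade medians:

In [ ]:
# Median utilization and accounts opened in the past 24 months, per grade
print(df.groupby("grade")[["revol_util", "acc_open_past_24mths"]].median())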

Predictive model

We started with a model using PCA and one-hot encoding of every categorical variable. It created a dataframe of 214 features, over 555 MB in size. With the same model as below, it initially gave 30% correct classifications; with K-Fold cross-validation this dropped to a mean of 26.10% (standard deviation 7.89%). So we decided to hand-pick the features and reuse the architecture the same way James Andersen did in his Medium article.

We prepare a machine learning model on the data. We first need to split it into "X", the features fed to the model as input, and "y", the column we are trying to predict: the loan grade.

Then we'll split the data by row; 80% will be used to train/validate the model and 20% will be used to test how well the model performs on unseen data.

Lastly, the target, which is multi-class, has to be one-hot encoded.

In [129]:
from sklearn.model_selection import train_test_split

def split_data(data, continuous_cols, categorical_cols, label_col, test_size=0.2, row_limit=None):
    """Divide the data into X and y dataframes and a train/test split"""

    # Subset to get feature data
    x_df = data.loc[:, continuous_cols + categorical_cols]

    # Update our X dataframe with categorical values replaced by one-hot encoded values
    x_df = encode_categorical(x_df, categorical_cols)

    # Ensure all numeric features are on the same scale
    for col in continuous_cols:
        x_df[col] = (x_df[col] - x_df[col].mean()) / x_df[col].std()

    # Specify the target labels and flatten the array
    y = pd.get_dummies(data[label_col])

    # When requested, limit the amount of data that will be used
    # Using entire data set can be painfully slow without a GPU!
    if row_limit is not None:
        # draw a ~10% Bernoulli sample (row_limit acts as a flag here, not an exact count)
        rows = np.random.binomial(1, 0.1, size=len(data)).astype('bool')
        x_df = x_df[rows]
        y = y[rows]
        print("Using only a sample of {} observations".format(x_df.shape[0]))
        #data = self.lcdata.sample(int(row_limit))
    else:
        print("Using the full set of {} observations".format(data.shape[0]))
        #data = self.lcdata

    # Create train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x_df, y, test_size=test_size, random_state=23)
    print("x_train contains {} rows and {} features".format(x_train.shape[0], x_train.shape[1]))
    return x_train, x_test, y_train, y_test
In [130]:
def encode_categorical(frame, categorical_cols):
    """Replace categorical variables with one-hot encoding in-place"""
    for col in categorical_cols:
        # use get_dummies() to do one hot encoding of categorical column
        frame = frame.merge(pd.get_dummies(frame[col]), left_index=True, right_index=True)
        
        # drop the original categorical column
        frame.drop(col, axis=1, inplace=True)
  
    return frame
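A tiny usage example on made-up data (illustrative only) shows what encode_categorical returns. Note that pd.get_dummies(frame[col], prefix=col) would be safer if two categorical columns could share level names; here the level sets are disjoint, so the plain call works.

In [ ]:
# Illustration of encode_categorical on a two-row toy frame
demo = pd.DataFrame({"term": [" 36 months", " 60 months"], "x": [1.0, 2.0]})
print(encode_categorical(demo, ["term"]))  # x plus one 0/1 column per term level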
In [131]:
import os

# Divide the data set into training and test sets
x_train, x_test, y_train, y_test = split_data(df, APPLICANT_NUMERIC + CREDIT_NUMERIC,
                                              APPLICANT_CATEGORICAL,
                                              TARGET,
                                              test_size=0.2,
                                              row_limit=os.environ.get("sample"))

# Inspect our training data
print("x_train contains {} rows and {} features".format(x_train.shape[0], x_train.shape[1]))
print("y_train contains {} rows and {} features".format(y_train.shape[0], y_train.shape[1]))

print("x_test contains {} rows and {} features".format(x_test.shape[0], x_test.shape[1]))
print("y_test contains {} rows and {} features".format(y_test.shape[0], y_test.shape[1]))

# Loan grade has been one-hot encoded
print("Sample one-hot encoded 'y' value: \n{}".format(y_train.sample()))
Using the full set of 552160 observations
x_train contains 441728 rows and 105 features
x_train contains 441728 rows and 105 features
y_train contains 441728 rows and 7 features
x_test contains 110432 rows and 105 features
y_test contains 110432 rows and 7 features
Sample one-hot encoded 'y' value: 
        grade_A  grade_B  grade_C  grade_D  grade_E  grade_F  grade_G
447861        1        0        0        0        0        0        0
In [132]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.constraints import maxnorm

def create_model(input_dim, output_dim):
    # create model
    model = Sequential()
    # input layer
    model.add(Dense(100, input_dim=input_dim, activation='relu', kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    
    # hidden layer
    model.add(Dense(60, activation='relu', kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    
    # output layer
    model.add(Dense(output_dim, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
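As a quick sanity check of the architecture (the input/output sizes come from the split above), we can print the layer summary:

In [ ]:
# 105 encoded input features, 7 grade classes
create_model(105, 7).summary()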
In [134]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

model = create_model(x_train.shape[1], y_train.shape[1])

epochs =  35
batch_sz = 64

print("Beginning model training with batch size {} and {} epochs".format(batch_sz, epochs))

checkpoint = ModelCheckpoint("lc_model.h5", monitor='val_acc', verbose=0, save_best_only=True, mode='auto', period=1)
# train the model
history = model.fit(x_train.values,
                y_train.values,
                validation_split=0.2,
                epochs=epochs,  
                batch_size=batch_sz, 
                verbose=2,
                callbacks=[checkpoint])

# revert to the best model encountered during training
model = load_model("lc_model.h5")
Beginning model training with batch size 64 and 35 epochs
Train on 353382 samples, validate on 88346 samples
Epoch 1/35
 - 31s - loss: 0.9299 - acc: 0.6013 - val_loss: 0.4764 - val_acc: 0.8208
Epoch 2/35
 - 25s - loss: 0.5274 - acc: 0.7795 - val_loss: 0.4230 - val_acc: 0.8219
Epoch 3/35
 - 24s - loss: 0.4681 - acc: 0.8038 - val_loss: 0.3759 - val_acc: 0.8448
Epoch 4/35
 - 27s - loss: 0.4437 - acc: 0.8144 - val_loss: 0.3513 - val_acc: 0.8593
Epoch 5/35
 - 26s - loss: 0.4300 - acc: 0.8205 - val_loss: 0.3510 - val_acc: 0.8581
Epoch 6/35
 - 29s - loss: 0.4197 - acc: 0.8249 - val_loss: 0.3651 - val_acc: 0.8478
Epoch 7/35
 - 31s - loss: 0.4119 - acc: 0.8279 - val_loss: 0.3275 - val_acc: 0.8648
Epoch 8/35
 - 29s - loss: 0.4055 - acc: 0.8312 - val_loss: 0.3168 - val_acc: 0.8702
Epoch 9/35
 - 27s - loss: 0.3998 - acc: 0.8335 - val_loss: 0.3298 - val_acc: 0.8652
Epoch 10/35
 - 27s - loss: 0.3958 - acc: 0.8362 - val_loss: 0.3154 - val_acc: 0.8697
Epoch 11/35
 - 26s - loss: 0.3931 - acc: 0.8365 - val_loss: 0.3155 - val_acc: 0.8736
Epoch 12/35
 - 26s - loss: 0.3868 - acc: 0.8391 - val_loss: 0.3006 - val_acc: 0.8793
Epoch 13/35
 - 25s - loss: 0.3839 - acc: 0.8406 - val_loss: 0.3218 - val_acc: 0.8647
Epoch 14/35
 - 27s - loss: 0.3805 - acc: 0.8424 - val_loss: 0.3137 - val_acc: 0.8698
Epoch 15/35
 - 26s - loss: 0.3773 - acc: 0.8428 - val_loss: 0.3047 - val_acc: 0.8742
Epoch 16/35
 - 26s - loss: 0.3746 - acc: 0.8453 - val_loss: 0.2908 - val_acc: 0.8826
Epoch 17/35
 - 26s - loss: 0.3735 - acc: 0.8454 - val_loss: 0.3047 - val_acc: 0.8745
Epoch 18/35
 - 25s - loss: 0.3701 - acc: 0.8465 - val_loss: 0.2946 - val_acc: 0.8788
Epoch 19/35
 - 29s - loss: 0.3694 - acc: 0.8462 - val_loss: 0.2958 - val_acc: 0.8772
Epoch 20/35
 - 26s - loss: 0.3673 - acc: 0.8483 - val_loss: 0.3035 - val_acc: 0.8773
Epoch 21/35
 - 25s - loss: 0.3650 - acc: 0.8483 - val_loss: 0.2870 - val_acc: 0.8847
Epoch 22/35
 - 26s - loss: 0.3648 - acc: 0.8491 - val_loss: 0.2881 - val_acc: 0.8834
Epoch 23/35
 - 28s - loss: 0.3640 - acc: 0.8489 - val_loss: 0.3021 - val_acc: 0.8751
Epoch 24/35
 - 30s - loss: 0.3612 - acc: 0.8502 - val_loss: 0.3163 - val_acc: 0.8665
Epoch 25/35
 - 26s - loss: 0.3624 - acc: 0.8499 - val_loss: 0.2869 - val_acc: 0.8822
Epoch 26/35
 - 25s - loss: 0.3596 - acc: 0.8512 - val_loss: 0.2905 - val_acc: 0.8832
Epoch 27/35
 - 26s - loss: 0.3563 - acc: 0.8528 - val_loss: 0.2839 - val_acc: 0.8822
Epoch 28/35
 - 27s - loss: 0.3570 - acc: 0.8524 - val_loss: 0.3027 - val_acc: 0.8745
Epoch 29/35
 - 25s - loss: 0.3568 - acc: 0.8529 - val_loss: 0.3116 - val_acc: 0.8683
Epoch 30/35
 - 24s - loss: 0.3550 - acc: 0.8537 - val_loss: 0.2901 - val_acc: 0.8800
Epoch 31/35
 - 24s - loss: 0.3542 - acc: 0.8530 - val_loss: 0.2898 - val_acc: 0.8797
Epoch 32/35
 - 24s - loss: 0.3531 - acc: 0.8544 - val_loss: 0.2853 - val_acc: 0.8841
Epoch 33/35
 - 24s - loss: 0.3541 - acc: 0.8536 - val_loss: 0.2992 - val_acc: 0.8760
Epoch 34/35
 - 24s - loss: 0.3541 - acc: 0.8538 - val_loss: 0.2810 - val_acc: 0.8867
Epoch 35/35
 - 24s - loss: 0.3550 - acc: 0.8537 - val_loss: 0.3013 - val_acc: 0.8760
In [135]:
plt.rcParams["figure.figsize"] = [16, 5]
plt.rcParams["axes.titlesize"] = 18
plt.rcParams["axes.labelsize"] = 16
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12
plt.rcParams['legend.loc'] = 'upper right'
plt.rcParams['legend.framealpha'] = 0.7
plt.rcParams["legend.fontsize"] = 14

# Plot accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid.'], loc='upper left')

# Plot loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid.'], loc='upper left')

# adjust size
plt.tight_layout()
plt.show()

We can now see how our trained model performs on the "test" data that we held out of training. We measure the performance using the F1 score.

In [137]:
import numpy as np
from sklearn.metrics import f1_score

y_pred = model.predict(x_test.values)

# Revert one-hot encoding to classes
y_pred_classes = pd.DataFrame((y_pred.argmax(1)[:,None] == np.arange(y_pred.shape[1])),
                              columns=y_test.columns,
                              index=y_test.index)

y_test_vals = y_test.idxmax(1)
y_pred_vals = y_pred_classes.idxmax(1)

# F1 score
# Use idxmax() to convert back from one-hot encoding
f1 = f1_score(y_test_vals, y_pred_vals, average='weighted')
print("Test Set Accuracy: {:.2%}   (But results would have been better if we trained on the FULL dataset)".format(f1))
Test set weighted F1 score: 88.54%   (results might improve further if we trained on the full 750,000-row dataset rather than only the rows that survived dropna)
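
Before drawing the confusion matrix, a per-class report (a sketch using scikit-learn's classification_report) makes the grade-by-grade behaviour explicit:

In [ ]:
# Per-class precision / recall / F1 for each grade letter
from sklearn.metrics import classification_report
print(classification_report(y_test_vals, y_pred_vals))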
In [141]:
from sklearn.metrics import confusion_matrix

# Confusion matrix
conf_matrix = confusion_matrix(y_test_vals, y_pred_vals).astype(float)
In [143]:
# drawing it 

def fmt_pct(x, pos):
    """Format as percentage"""
    return '{:.0%}'.format(x)

classes = [l for l in "ABCDEFG"]
title='Confusion matrix'

correct_mask = np.ones(conf_matrix.shape, dtype=bool)
wrong_mask = np.zeros(conf_matrix.shape, dtype=bool)
for i in range(conf_matrix.shape[0]):
    correct_mask[i,i] = False
    wrong_mask[i,i] = True
    row_sum = sum(conf_matrix[i])
    for j in range(conf_matrix.shape[1]):
        conf_matrix[i, j] = conf_matrix[i, j] / row_sum

correct_matrix = np.ma.masked_array(conf_matrix, mask=correct_mask)
wrong_matrix = np.ma.masked_array(conf_matrix, wrong_mask)

fig,ax = plt.subplots(figsize=(8, 8))
blue_map = colors.LinearSegmentedColormap.from_list('custom blue', ['#f1eef6', '#025a90'], N=10)
blue_map.set_under(color='white')

red_map = colors.LinearSegmentedColormap.from_list('custom red', ['#feeddd', '#a83500'], N=10)
red_map.set_under(color='white')

plot_correct = ax.imshow(correct_matrix,interpolation='nearest',cmap=blue_map, vmin=0.000001, vmax=1)
plot_wrong = ax.imshow(wrong_matrix,interpolation='nearest',cmap=red_map, vmin=0.000001, vmax=1)

colorbar_wrong = plt.colorbar(plot_wrong, shrink=0.35, orientation='horizontal', pad=-0.11, format=ticker.FuncFormatter(fmt_pct))
colorbar_correct = plt.colorbar(plot_correct, shrink=0.35, orientation='horizontal', pad=0.03)
plt.xlabel('Predicted Values')
plt.ylabel('True Values')
ax.xaxis.set_label_position('top')
ax.xaxis.tick_top()
ax.set_xticklabels([''] + classes)
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_yticklabels([''] + classes)
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
colorbar_correct.ax.text(-0.3,0.25,'CORRECT',rotation=0)
colorbar_wrong.ax.text(-0.35,0.25,'INCORRECT',rotation=0)
plt.setp(colorbar_correct.ax.get_xticklabels(), visible=False)

thresh = conf_matrix.max() / 2.
for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
    cell_label = "{:.1%}".format(conf_matrix[i, j])
    plt.text(j, i, cell_label,
             horizontalalignment="center",
             color="white" if conf_matrix[i, j] > thresh else "black")

plt.tight_layout()
plt.show()

Conclusion

From the very beginning James Andersen's model performed better than mine, even though his neural network architecture was almost the same as ours. The first thing he did was to remove any rows with NAs from the columns he had selected; unlike me, he did not select the columns with too many missing values in the first place. This made for a lighter model.

Yet, while his results were already pretty good, we improved on them by removing a few features. However, the lower the grade, the lower our classification accuracy, falling to 19% for G-grade borrowers, who were mostly classified as F and some as E. So the next thing to work on would be improving the feature selection.

In [ ]: