import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
Loading the dataset
# Control delimiters, rows, column names with read_csv (see later)
# Load the full loans dataset. engine='python' selects the slower but more
# flexible Python parser — presumably chosen for tricky quoting/delimiters
# in the raw file; TODO confirm the C engine doesn't suffice.
df = pd.read_csv("loans.csv", engine='python')
What does it look like?
# show all columns: lift pandas' display cap so every column of this wide
# (~146-column) frame is printed instead of being elided with "...".
pd.set_option('display.max_columns', None)
# Peek at the first five rows.
df.head()
Unnamed: 0 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | bc_open_to_buy | bc_util | chargeoff_within_12_mths | delinq_amnt | mo_sin_old_il_acct | mo_sin_old_rev_tl_op | mo_sin_rcnt_rev_tl_op | mo_sin_rcnt_tl | mort_acc | mths_since_recent_bc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_actv_bc_tl | num_actv_rev_tl | num_bc_sats | num_bc_tl | num_il_tl | num_op_rev_tl | num_rev_accts | num_rev_tl_bal_gt_0 | num_sats | num_tl_120dpd_2m | num_tl_30dpd | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | revol_bal_joint | sec_app_earliest_cr_line | sec_app_inq_last_6mths | sec_app_mort_acc | sec_app_open_acc | sec_app_revol_util | sec_app_open_act_il | sec_app_num_rev_accts | 
sec_app_chargeoff_within_12_mths | sec_app_collections_12_mths_ex_med | sec_app_mths_since_last_major_derog | hardship_flag | hardship_type | hardship_reason | hardship_status | deferral_term | hardship_amount | hardship_start_date | hardship_end_date | payment_plan_start_date | hardship_length | hardship_dpd | hardship_loan_status | orig_projected_additional_accrued_interest | hardship_payoff_balance_amount | hardship_last_payment_amount | disbursement_method | debt_settlement_flag | debt_settlement_flag_date | settlement_status | settlement_date | settlement_amount | settlement_percentage | settlement_term | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1040017 | NaN | NaN | 14000 | 14000 | 14000.0 | 36 months | 12.69 | 469.63 | C | C2 | Receiving Dock Worker | 9 years | MORTGAGE | 40000.0 | Not Verified | 2015-10-01 | Charged Off | n | NaN | NaN | debt_consolidation | Debt consolidation | 166xx | PA | 17.07 | 0.0 | Jun-2001 | 1.0 | NaN | NaN | 5.0 | 0.0 | 5848 | 90.0 | 15.0 | f | 0.0 | 0.0 | 6057.790000 | 6057.79 | 4091.51 | 1556.41 | 0.0 | 409.87 | 73.7766 | Oct-2016 | 469.63 | NaN | Jul-2018 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 119776.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6500.0 | NaN | NaN | NaN | 4.0 | 23955.0 | 2167.0 | 90.0 | 0.0 | 0.0 | 141.0 | 172.0 | 3.0 | 3.0 | 1.0 | 3.0 | NaN | 3.0 | NaN | 0.0 | 3.0 | 3.0 | 8.0 | 8.0 | 6.0 | 3.0 | 8.0 | 3.0 | 5.0 | NaN | 0.0 | 0.0 | 2.0 | 100.0 | 100.0 | 0.0 | 0.0 | 123292.0 | 29809.0 | 6500.0 | 25992.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 1050463 | NaN | NaN | 1000 | 1000 | 1000.0 | 36 months | 9.17 | 31.88 | B | B2 | Portfolio Manager | 1 year | MORTGAGE | 80000.0 | Verified | 2015-10-01 | Fully Paid | n | NaN | NaN | credit_card | Credit card refinancing | 949xx | CA | 12.51 | 0.0 | Oct-1967 | 3.0 | NaN | 22.0 | 9.0 | 1.0 | 7634 | 37.2 | 32.0 | w | 0.0 | 0.0 | 1021.730000 | 1021.73 | 999.99 | 21.74 | 0.0 | 0.00 | 0.0000 | Feb-2016 | 27.85 | NaN | Feb-2017 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 53994.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 20500.0 | NaN | NaN | NaN | 4.0 | 5999.0 | 12866.0 | 37.2 | 0.0 | 0.0 | 188.0 | 575.0 | 4.0 | 4.0 | 3.0 | 4.0 | NaN | 1.0 | NaN | 0.0 | 3.0 | 3.0 | 6.0 | 16.0 | 9.0 | 6.0 | 20.0 | 3.0 | 9.0 | 0.0 | 0.0 | 0.0 | 3.0 | 100.0 | 0.0 | 1.0 | 0.0 | 80788.0 | 53994.0 | 20500.0 | 60288.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 1056254 | NaN | NaN | 13000 | 13000 | 13000.0 | 36 months | 6.89 | 400.75 | A | A3 | mailhandler | 10+ years | RENT | 70000.0 | Not Verified | 2015-10-01 | Fully Paid | n | NaN | NaN | debt_consolidation | Debt consolidation | 751xx | TX | 21.72 | 0.0 | Oct-1996 | 0.0 | NaN | NaN | 16.0 | 0.0 | 11113 | 24.4 | 23.0 | w | 0.0 | 0.0 | 14425.915413 | 14425.92 | 13000.00 | 1425.92 | 0.0 | 0.00 | 0.0000 | Oct-2018 | 401.38 | NaN | Jan-2019 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 316.0 | 93918.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 45600.0 | NaN | NaN | NaN | 10.0 | 5870.0 | 28646.0 | 26.9 | 0.0 | 0.0 | 227.0 | 60.0 | 7.0 | 5.0 | 3.0 | 7.0 | NaN | 7.0 | NaN | 0.0 | 7.0 | 8.0 | 8.0 | 8.0 | 8.0 | 10.0 | 10.0 | 8.0 | 16.0 | 0.0 | 0.0 | 0.0 | 4.0 | 100.0 | 0.0 | 0.0 | 0.0 | 141771.0 | 43988.0 | 39200.0 | 42191.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 1013860 | NaN | NaN | 5600 | 5600 | 5600.0 | 36 months | 6.24 | 170.98 | A | A2 | Managing Director | 4 years | RENT | 49000.0 | Source Verified | 2015-10-01 | Fully Paid | n | NaN | NaN | debt_consolidation | Debt consolidation | 968xx | HI | 2.45 | 0.0 | Dec-1988 | 0.0 | NaN | NaN | 4.0 | 0.0 | 3402 | 41.5 | 8.0 | w | 0.0 | 0.0 | 6051.344476 | 6051.34 | 5600.00 | 451.34 | 0.0 | 0.00 | 0.0000 | Aug-2017 | 2635.62 | NaN | Aug-2017 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 3402.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8200.0 | NaN | NaN | NaN | 0.0 | 851.0 | 3898.0 | 46.6 | 0.0 | 0.0 | NaN | 322.0 | 26.0 | 26.0 | 0.0 | 26.0 | NaN | NaN | NaN | 0.0 | 3.0 | 3.0 | 3.0 | 6.0 | 0.0 | 4.0 | 8.0 | 3.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 100.0 | 33.3 | 0.0 | 0.0 | 8200.0 | 3402.0 | 7300.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 1018431 | NaN | NaN | 11000 | 11000 | 11000.0 | 60 months | 12.29 | 246.31 | C | C1 | RTI Administrator | 4 years | MORTGAGE | 86000.0 | Not Verified | 2015-10-01 | Fully Paid | n | NaN | NaN | debt_consolidation | Debt consolidation | 388xx | MS | 24.35 | 2.0 | Nov-2001 | 1.0 | 2.0 | NaN | 12.0 | 0.0 | 6062 | 68.9 | 24.0 | w | 0.0 | 0.0 | 14008.069373 | 14008.07 | 11000.00 | 3008.07 | 0.0 | 0.00 | 0.0000 | Aug-2018 | 5894.86 | NaN | Feb-2019 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 331363.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8800.0 | NaN | NaN | NaN | 3.0 | 27614.0 | 1083.0 | 81.9 | 0.0 | 0.0 | 167.0 | 160.0 | 10.0 | 10.0 | 2.0 | 23.0 | 2.0 | 6.0 | 2.0 | 0.0 | 2.0 | 3.0 | 2.0 | 5.0 | 13.0 | 4.0 | 9.0 | 3.0 | 12.0 | 0.0 | 0.0 | 0.0 | 1.0 | 87.5 | 50.0 | 0.0 | 0.0 | 393082.0 | 81179.0 | 6000.0 | 118795.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
# Inspect the last five rows to sanity-check the end of the file as well.
df.tail()
Unnamed: 0 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | bc_open_to_buy | bc_util | chargeoff_within_12_mths | delinq_amnt | mo_sin_old_il_acct | mo_sin_old_rev_tl_op | mo_sin_rcnt_rev_tl_op | mo_sin_rcnt_tl | mort_acc | mths_since_recent_bc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_actv_bc_tl | num_actv_rev_tl | num_bc_sats | num_bc_tl | num_il_tl | num_op_rev_tl | num_rev_accts | num_rev_tl_bal_gt_0 | num_sats | num_tl_120dpd_2m | num_tl_30dpd | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | revol_bal_joint | sec_app_earliest_cr_line | sec_app_inq_last_6mths | sec_app_mort_acc | sec_app_open_acc | sec_app_revol_util | sec_app_open_act_il | sec_app_num_rev_accts | 
sec_app_chargeoff_within_12_mths | sec_app_collections_12_mths_ex_med | sec_app_mths_since_last_major_derog | hardship_flag | hardship_type | hardship_reason | hardship_status | deferral_term | hardship_amount | hardship_start_date | hardship_end_date | payment_plan_start_date | hardship_length | hardship_dpd | hardship_loan_status | orig_projected_additional_accrued_interest | hardship_payoff_balance_amount | hardship_last_payment_amount | disbursement_method | debt_settlement_flag | debt_settlement_flag_date | settlement_status | settlement_date | settlement_amount | settlement_percentage | settlement_term | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
749995 | 7605 | NaN | NaN | 15000 | 15000 | 15000.0 | 36 months | 13.56 | 509.47 | C | C1 | Driver | 4 years | RENT | 55000.0 | Not Verified | 2018-12-01 | Current | n | NaN | NaN | debt_consolidation | Debt consolidation | 171xx | PA | 17.15 | 0.0 | Oct-2012 | 0.0 | NaN | NaN | 9.0 | 0.0 | 7277 | 40.0 | 14.0 | w | 14316.22 | 14316.22 | 1001.99 | 1001.99 | 683.78 | 318.21 | 0.0 | 0.0 | 0.0 | Feb-2019 | 509.47 | Mar-2019 | Feb-2019 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 21229.0 | 0.0 | 2.0 | 1.0 | 3.0 | 8.0 | 13952.0 | 85.0 | 1.0 | 1.0 | 4778.0 | 61.0 | 18200.0 | 1.0 | 0.0 | 2.0 | 4.0 | 2359.0 | 2457.0 | 69.7 | 0.0 | 0.0 | 74.0 | 54.0 | 12.0 | 8.0 | 0.0 | 35.0 | NaN | 8.0 | NaN | 0.0 | 2.0 | 5.0 | 2.0 | 2.0 | 4.0 | 7.0 | 10.0 | 5.0 | 9.0 | 0.0 | 0.0 | 0.0 | 2.0 | 100.0 | 50.0 | 0.0 | 0.0 | 34632.0 | 21229.0 | 8100.0 | 16432.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
749996 | 35399 | NaN | NaN | 8000 | 8000 | 8000.0 | 36 months | 6.46 | 245.05 | A | A1 | Realtor | 2 years | MORTGAGE | 100000.0 | Not Verified | 2018-12-01 | Current | n | NaN | NaN | debt_consolidation | Debt consolidation | 231xx | VA | 9.90 | 1.0 | Sep-2001 | 0.0 | 21.0 | NaN | 13.0 | 0.0 | 17646 | 51.7 | 29.0 | w | 7389.00 | 7389.00 | 692.18 | 692.18 | 611.00 | 81.18 | 0.0 | 0.0 | 0.0 | Feb-2019 | 245.05 | Mar-2019 | Feb-2019 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 3323.0 | 326631.0 | 1.0 | 2.0 | 0.0 | 0.0 | 31.0 | 13286.0 | 57.0 | 2.0 | 3.0 | 2173.0 | 54.0 | 34100.0 | 0.0 | 3.0 | 0.0 | 3.0 | 27219.0 | 12434.0 | 26.0 | 0.0 | 0.0 | 82.0 | 206.0 | 2.0 | 2.0 | 2.0 | 2.0 | 63.0 | 17.0 | 21.0 | 0.0 | 4.0 | 5.0 | 5.0 | 11.0 | 3.0 | 10.0 | 24.0 | 5.0 | 13.0 | 0.0 | 0.0 | 0.0 | 2.0 | 82.8 | 40.0 | 0.0 | 0.0 | 381832.0 | 30932.0 | 16800.0 | 23370.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | DirectPay | N | NaN | NaN | NaN | NaN | NaN | NaN |
749997 | 8452 | NaN | NaN | 40000 | 40000 | 40000.0 | 60 months | 16.14 | 975.71 | C | C4 | Director of Operations | 3 years | MORTGAGE | 157630.0 | Verified | 2018-12-01 | Current | n | NaN | NaN | credit_card | Credit card refinancing | 972xx | OR | 16.61 | 1.0 | May-2004 | 0.0 | 18.0 | NaN | 11.0 | 0.0 | 15761 | 56.7 | 21.0 | w | 39562.28 | 39562.28 | 1155.05 | 1155.05 | 437.72 | 717.33 | 0.0 | 0.0 | 0.0 | Feb-2019 | 1208.85 | Mar-2019 | Feb-2019 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 474781.0 | 1.0 | 4.0 | 1.0 | 1.0 | 5.0 | 59310.0 | 38.0 | 0.0 | 0.0 | 9029.0 | 43.0 | 27800.0 | 0.0 | 5.0 | 0.0 | 1.0 | 43162.0 | 9453.0 | 59.8 | 0.0 | 0.0 | 71.0 | 175.0 | 27.0 | 5.0 | 2.0 | 27.0 | NaN | NaN | NaN | 0.0 | 3.0 | 4.0 | 3.0 | 4.0 | 6.0 | 6.0 | 13.0 | 4.0 | 11.0 | 0.0 | 0.0 | 0.0 | 1.0 | 95.2 | 66.7 | 0.0 | 0.0 | 542479.0 | 75071.0 | 23500.0 | 100389.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
749998 | 21664 | NaN | NaN | 7000 | 7000 | 7000.0 | 36 months | 8.19 | 219.97 | A | A4 | Bill Poster | 10+ years | MORTGAGE | 42000.0 | Verified | 2018-12-01 | Current | n | NaN | NaN | other | Other | 707xx | LA | 6.49 | 0.0 | Oct-2003 | 0.0 | NaN | NaN | 7.0 | 0.0 | 6898 | 53.1 | 22.0 | w | 6654.43 | 6654.43 | 456.87 | 456.87 | 345.57 | 111.30 | 0.0 | 0.0 | 0.0 | Feb-2019 | 243.27 | Mar-2019 | Feb-2019 | 0.0 | NaN | 1 | Joint App | 70000.0 | 4.35 | Verified | 0.0 | 440.0 | 114983.0 | 0.0 | 0.0 | 0.0 | 0.0 | 49.0 | 0.0 | NaN | 1.0 | 1.0 | 1048.0 | 53.0 | 13000.0 | 0.0 | 0.0 | 0.0 | 1.0 | 16426.0 | 1620.0 | 64.0 | 0.0 | 0.0 | 127.0 | 160.0 | 8.0 | 8.0 | 4.0 | 8.0 | NaN | NaN | NaN | 0.0 | 3.0 | 5.0 | 3.0 | 3.0 | 13.0 | 5.0 | 5.0 | 5.0 | 7.0 | 0.0 | 0.0 | 0.0 | 1.0 | 100.0 | 33.3 | 0.0 | 0.0 | 232200.0 | 6898.0 | 4500.0 | 0.0 | 7445.0 | Jul-2003 | 0.0 | 4.0 | 6.0 | 59.8 | 0.0 | 6.0 | 0.0 | 0.0 | 58.0 | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
749999 | 24316 | NaN | NaN | 15000 | 15000 | 15000.0 | 60 months | 17.97 | 380.66 | D | D1 | Document | 4 years | RENT | 110000.0 | Not Verified | 2018-12-01 | Current | n | NaN | NaN | debt_consolidation | Debt consolidation | 950xx | CA | 13.28 | 0.0 | Mar-2011 | 0.0 | NaN | NaN | 10.0 | 0.0 | 13439 | 72.3 | 13.0 | w | 14685.59 | 14685.59 | 746.34 | 746.34 | 314.41 | 431.93 | 0.0 | 0.0 | 0.0 | Feb-2019 | 380.66 | Mar-2019 | Feb-2019 | 0.0 | NaN | 1 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 24340.0 | 0.0 | 2.0 | 0.0 | 2.0 | 16.0 | 10901.0 | 54.0 | 1.0 | 1.0 | 3313.0 | 63.0 | 18600.0 | 1.0 | 1.0 | 0.0 | 3.0 | 2704.0 | 3131.0 | 73.7 | 0.0 | 0.0 | 68.0 | 92.0 | 7.0 | 7.0 | 0.0 | 7.0 | NaN | 16.0 | NaN | 0.0 | 4.0 | 7.0 | 4.0 | 4.0 | 3.0 | 8.0 | 10.0 | 7.0 | 10.0 | 0.0 | 0.0 | 0.0 | 1.0 | 100.0 | 75.0 | 0.0 | 0.0 | 38908.0 | 24340.0 | 11900.0 | 20308.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
What is the total number of loans?
# Total number of loans = number of rows in the frame.
len(df)
750000
How many columns does the dataset have?
# Number of columns in the dataset.
len(df.columns)
146
How many columns contain missing data?
# Count missing values per column. Note: the original passed min_count=1 to
# sum(), which is meaningless when counting booleans over a non-empty frame
# (it only affects all-NA/empty columns, turning 0 into NaN) — dropped.
miss_values_count = df.isnull().sum()
# Keep only the columns that actually contain missing data.
miss_values_count = miss_values_count[miss_values_count != 0]
What is the count of missing data for each of these columns?
# Report how many columns have missing data, then list the per-column counts.
# Bug fix: the original built "there aren ... missing valuess" by appending
# 'n'/'s' to already-plural words; use correct singular/plural forms instead.
print(f"Number of columns with missing values: {miss_values_count.shape[0]}")
if miss_values_count.shape[0]:
    print("Null value count per column: ")
    for name, miss_vals in miss_values_count.items():
        plural = miss_vals > 1
        # "there is 1 missing value" vs. "there are N missing values"
        print(f" - For column name '{name}' there {'are' if plural else 'is'} "
              f"{miss_vals} missing value{'s' if plural else ''}.")
Number of columns with missing values: 74 Null value count per column: - For column name 'id' there aren 750000 missing valuess. - For column name 'member_id' there aren 750000 missing valuess. - For column name 'emp_title' there aren 61669 missing valuess. - For column name 'emp_length' there aren 54976 missing valuess. - For column name 'url' there aren 750000 missing valuess. - For column name 'desc' there aren 749986 missing valuess. - For column name 'title' there aren 11781 missing valuess. - For column name 'dti' there aren 835 missing valuess. - For column name 'inq_last_6mths' there are 1 missing values. - For column name 'mths_since_last_delinq' there aren 382775 missing valuess. - For column name 'mths_since_last_record' there aren 627757 missing valuess. - For column name 'revol_util' there aren 659 missing valuess. - For column name 'last_pymnt_d' there aren 909 missing valuess. - For column name 'next_pymnt_d' there aren 295358 missing valuess. - For column name 'last_credit_pull_d' there aren 18 missing valuess. - For column name 'mths_since_last_major_derog' there aren 552182 missing valuess. - For column name 'annual_inc_joint' there aren 689705 missing valuess. - For column name 'dti_joint' there aren 689705 missing valuess. - For column name 'verification_status_joint' there aren 692145 missing valuess. - For column name 'open_acc_6m' there aren 52540 missing valuess. - For column name 'open_act_il' there aren 52539 missing valuess. - For column name 'open_il_12m' there aren 52539 missing valuess. - For column name 'open_il_24m' there aren 52539 missing valuess. - For column name 'mths_since_rcnt_il' there aren 74494 missing valuess. - For column name 'total_bal_il' there aren 52539 missing valuess. - For column name 'il_util' there aren 153788 missing valuess. - For column name 'open_rv_12m' there aren 52539 missing valuess. - For column name 'open_rv_24m' there aren 52539 missing valuess. 
- For column name 'max_bal_bc' there aren 52539 missing valuess. - For column name 'all_util' there aren 52662 missing valuess. - For column name 'inq_fi' there aren 52539 missing valuess. - For column name 'total_cu_tl' there aren 52540 missing valuess. - For column name 'inq_last_12m' there aren 52540 missing valuess. - For column name 'avg_cur_bal' there aren 32 missing valuess. - For column name 'bc_open_to_buy' there aren 9089 missing valuess. - For column name 'bc_util' there aren 9415 missing valuess. - For column name 'mo_sin_old_il_acct' there aren 23463 missing valuess. - For column name 'mths_since_recent_bc' there aren 8576 missing valuess. - For column name 'mths_since_recent_bc_dlq' there aren 577966 missing valuess. - For column name 'mths_since_recent_inq' there aren 86405 missing valuess. - For column name 'mths_since_recent_revol_delinq' there aren 502543 missing valuess. - For column name 'num_tl_120dpd_2m' there aren 31530 missing valuess. - For column name 'percent_bc_gt_75' there aren 9163 missing valuess. - For column name 'revol_bal_joint' there aren 696039 missing valuess. - For column name 'sec_app_earliest_cr_line' there aren 696039 missing valuess. - For column name 'sec_app_inq_last_6mths' there aren 696039 missing valuess. - For column name 'sec_app_mort_acc' there aren 696039 missing valuess. - For column name 'sec_app_open_acc' there aren 696039 missing valuess. - For column name 'sec_app_revol_util' there aren 696953 missing valuess. - For column name 'sec_app_open_act_il' there aren 696039 missing valuess. - For column name 'sec_app_num_rev_accts' there aren 696039 missing valuess. - For column name 'sec_app_chargeoff_within_12_mths' there aren 696039 missing valuess. - For column name 'sec_app_collections_12_mths_ex_med' there aren 696039 missing valuess. - For column name 'sec_app_mths_since_last_major_derog' there aren 732082 missing valuess. - For column name 'hardship_type' there aren 745844 missing valuess. 
- For column name 'hardship_reason' there aren 745844 missing valuess. - For column name 'hardship_status' there aren 745844 missing valuess. - For column name 'deferral_term' there aren 745844 missing valuess. - For column name 'hardship_amount' there aren 745844 missing valuess. - For column name 'hardship_start_date' there aren 745844 missing valuess. - For column name 'hardship_end_date' there aren 745844 missing valuess. - For column name 'payment_plan_start_date' there aren 745844 missing valuess. - For column name 'hardship_length' there aren 745844 missing valuess. - For column name 'hardship_dpd' there aren 745844 missing valuess. - For column name 'hardship_loan_status' there aren 745844 missing valuess. - For column name 'orig_projected_additional_accrued_interest' there aren 746723 missing valuess. - For column name 'hardship_payoff_balance_amount' there aren 745844 missing valuess. - For column name 'hardship_last_payment_amount' there aren 745844 missing valuess. - For column name 'debt_settlement_flag_date' there aren 740517 missing valuess. - For column name 'settlement_status' there aren 740517 missing valuess. - For column name 'settlement_date' there aren 740517 missing valuess. - For column name 'settlement_amount' there aren 740517 missing valuess. - For column name 'settlement_percentage' there aren 740517 missing valuess. - For column name 'settlement_term' there aren 740517 missing valuess.
How has the average interest rate of a loan varied over time?
If it is a variable rate, the variation of the rate would be determined from the table of payments and not from the table of loans, which is our sample; since the 1990s it has been rare to use a variable rate. If it is a fixed rate, there is no variation in the rate and it would simply be int_rate. As for the variation of any value, it would be necessary to define whether we mean the scheduled one or the actual one: the first is computed with financial formulas (of which there are many), the second comes from the data, normally from the payments.
What is the distribution of sub-grades, and can we easily segment borrowers based on prime and sub-prime loans?
# Bar chart of the number of loans per sub-grade, ordered from the most to
# the least frequent sub-grade.
plt.figure(figsize=(16, 6))
sns.countplot(x="sub_grade", data=df, order = df['sub_grade'].value_counts().index)
<matplotlib.axes._subplots.AxesSubplot at 0x16a730644e0>
But can we easily segment borrowers based on prime and sub-prime loans? Maybe we need fewer/more categories? I would segment them depending on whether they pay their loan back.
# Summary statistics (count/mean/std/quartiles) of loan amount and interest
# rate for each sub-grade.
df[['sub_grade','loan_amnt','int_rate']].groupby('sub_grade').describe().reset_index()
sub_grade | int_rate | loan_amnt | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | ||
0 | A1 | 34251.0 | 5.544917 | 0.389030 | 5.31 | 5.32 | 5.32 | 6.11 | 6.46 | 34251.0 | 15778.950980 | 9849.357821 | 1000.0 | 8400.0 | 13000.0 | 20000.0 | 40000.0 |
1 | A2 | 25190.0 | 6.587943 | 0.381601 | 6.00 | 6.19 | 6.67 | 6.99 | 7.07 | 25190.0 | 14552.648869 | 9554.564306 | 1000.0 | 7500.0 | 12000.0 | 20000.0 | 40000.0 |
2 | A3 | 26431.0 | 7.062373 | 0.266037 | 6.00 | 6.83 | 7.21 | 7.21 | 7.59 | 26431.0 | 14292.664863 | 9544.563417 | 1000.0 | 7000.0 | 12000.0 | 20000.0 | 40000.0 |
3 | A4 | 33057.0 | 7.557079 | 0.283809 | 6.00 | 7.35 | 7.39 | 7.84 | 8.19 | 33057.0 | 15257.654203 | 9961.010378 | 1000.0 | 8000.0 | 12000.0 | 20000.0 | 40000.0 |
4 | A5 | 34741.0 | 8.118880 | 0.285978 | 6.00 | 7.96 | 7.97 | 8.46 | 8.81 | 34741.0 | 14407.389684 | 9448.639545 | 1000.0 | 7200.0 | 12000.0 | 20000.0 | 40000.0 |
5 | B1 | 43870.0 | 9.075405 | 0.682017 | 6.00 | 8.39 | 9.43 | 9.44 | 10.33 | 43870.0 | 14550.149305 | 9491.496416 | 1000.0 | 7500.0 | 12000.0 | 20000.0 | 40000.0 |
6 | B2 | 42643.0 | 9.892638 | 0.510048 | 6.00 | 9.49 | 9.93 | 10.47 | 10.72 | 42643.0 | 14847.658467 | 9750.694550 | 1000.0 | 7200.0 | 12000.0 | 20000.0 | 40000.0 |
7 | B3 | 41821.0 | 10.494923 | 0.545622 | 6.00 | 9.99 | 10.42 | 11.06 | 11.39 | 41821.0 | 14366.098372 | 9434.344664 | 1000.0 | 7000.0 | 12000.0 | 20000.0 | 40000.0 |
8 | B4 | 46010.0 | 11.108060 | 0.335650 | 6.00 | 10.90 | 10.99 | 11.44 | 11.80 | 46010.0 | 14328.037927 | 9387.670226 | 1000.0 | 7000.0 | 12000.0 | 20000.0 | 40000.0 |
9 | B5 | 49638.0 | 11.844107 | 0.462537 | 6.00 | 11.49 | 11.53 | 11.99 | 12.98 | 49638.0 | 14274.199706 | 9438.912459 | 1000.0 | 7000.0 | 12000.0 | 20000.0 | 40000.0 |
10 | C1 | 50421.0 | 12.674281 | 0.472692 | 6.00 | 12.61 | 12.62 | 12.79 | 13.56 | 50421.0 | 14868.141746 | 9538.444558 | 1000.0 | 7500.0 | 12000.0 | 20000.0 | 40000.0 |
11 | C2 | 43310.0 | 13.525962 | 0.525845 | 6.00 | 13.49 | 13.49 | 13.59 | 14.47 | 43310.0 | 14893.247518 | 9528.672902 | 1000.0 | 7450.0 | 12000.0 | 20000.0 | 40000.0 |
12 | C3 | 42908.0 | 14.082712 | 0.512439 | 6.00 | 13.99 | 14.07 | 14.08 | 15.02 | 42908.0 | 15515.432437 | 9502.942268 | 1000.0 | 8000.0 | 14000.0 | 21000.0 | 40000.0 |
13 | C4 | 43154.0 | 14.942308 | 0.636235 | 6.00 | 14.49 | 14.99 | 15.05 | 16.14 | 43154.0 | 15772.832182 | 9479.332442 | 1000.0 | 8500.0 | 14000.0 | 21000.0 | 40000.0 |
14 | C5 | 40569.0 | 15.911166 | 0.628875 | 6.00 | 15.59 | 16.01 | 16.02 | 16.91 | 40569.0 | 15884.065419 | 9564.053499 | 1000.0 | 8450.0 | 14400.0 | 21525.0 | 40000.0 |
15 | D1 | 25053.0 | 16.962046 | 0.725618 | 6.00 | 16.29 | 16.99 | 17.47 | 17.97 | 25053.0 | 15663.602163 | 9352.801072 | 1000.0 | 8325.0 | 14400.0 | 21000.0 | 40000.0 |
16 | D2 | 23256.0 | 18.004723 | 0.691684 | 6.00 | 17.99 | 18.06 | 18.45 | 18.94 | 23256.0 | 15631.319874 | 9398.099400 | 1000.0 | 8000.0 | 14400.0 | 21600.0 | 40000.0 |
17 | D3 | 20643.0 | 18.955224 | 0.809744 | 6.00 | 18.99 | 19.03 | 19.42 | 19.92 | 20643.0 | 16024.160732 | 9436.456261 | 1000.0 | 9000.0 | 15000.0 | 22250.0 | 40000.0 |
18 | D4 | 17112.0 | 19.819662 | 0.970901 | 6.00 | 19.99 | 20.00 | 20.39 | 20.89 | 17112.0 | 16005.144051 | 9351.079353 | 1000.0 | 9175.0 | 15000.0 | 22000.0 | 40000.0 |
19 | D5 | 14683.0 | 21.075908 | 1.350807 | 6.00 | 21.45 | 21.45 | 21.85 | 22.35 | 14683.0 | 16716.408432 | 9438.190236 | 1000.0 | 10000.0 | 15000.0 | 24000.0 | 40000.0 |
20 | E1 | 8793.0 | 21.606883 | 1.829379 | 6.00 | 19.99 | 22.74 | 22.91 | 23.40 | 8793.0 | 16125.039804 | 9578.769471 | 1000.0 | 9000.0 | 15000.0 | 23000.0 | 40000.0 |
21 | E2 | 7513.0 | 22.540733 | 2.144985 | 6.00 | 20.75 | 23.88 | 23.99 | 24.37 | 7513.0 | 16362.631439 | 9718.351712 | 1000.0 | 9000.0 | 15000.0 | 24000.0 | 40000.0 |
22 | E3 | 7218.0 | 23.377350 | 2.274183 | 6.00 | 21.18 | 24.74 | 24.85 | 25.34 | 7218.0 | 17393.821003 | 9418.336317 | 1000.0 | 10000.0 | 16000.0 | 24000.0 | 40000.0 |
23 | E4 | 6285.0 | 24.343181 | 2.246686 | 6.00 | 21.97 | 25.49 | 25.82 | 26.31 | 6285.0 | 17965.588703 | 9295.016271 | 1000.0 | 10975.0 | 16600.0 | 25000.0 | 40000.0 |
24 | E5 | 7036.0 | 25.514105 | 1.981731 | 6.00 | 25.29 | 26.30 | 26.77 | 27.27 | 7036.0 | 18276.190307 | 8944.332753 | 1000.0 | 12000.0 | 17350.0 | 25000.0 | 40000.0 |
25 | F1 | 3563.0 | 26.225897 | 2.774035 | 6.00 | 23.13 | 28.69 | 28.72 | 28.72 | 3563.0 | 19065.401347 | 8867.203485 | 1000.0 | 12000.0 | 18000.0 | 25000.0 | 40000.0 |
26 | F2 | 2333.0 | 26.784303 | 2.794799 | 6.00 | 24.11 | 26.49 | 29.69 | 29.69 | 2333.0 | 18874.882126 | 9168.830490 | 1000.0 | 12000.0 | 18000.0 | 25000.0 | 40000.0 |
27 | F3 | 1877.0 | 27.699020 | 2.507817 | 23.99 | 24.99 | 26.99 | 30.17 | 30.17 | 1877.0 | 19339.637720 | 9038.327304 | 1000.0 | 12000.0 | 18125.0 | 26000.0 | 40000.0 |
28 | F4 | 1525.0 | 28.276000 | 2.494425 | 6.00 | 25.88 | 27.49 | 30.65 | 30.65 | 1525.0 | 18940.065574 | 8977.055891 | 1000.0 | 12000.0 | 18000.0 | 25600.0 | 40000.0 |
29 | F5 | 1442.0 | 28.909563 | 2.035321 | 25.78 | 26.57 | 30.74 | 30.75 | 30.75 | 1442.0 | 20080.582524 | 9452.258874 | 1000.0 | 12500.0 | 19000.0 | 28000.0 | 40000.0 |
30 | G1 | 1264.0 | 29.582540 | 1.746886 | 6.00 | 28.18 | 30.79 | 30.79 | 30.79 | 1264.0 | 20690.901899 | 9086.732784 | 1000.0 | 14000.0 | 20000.0 | 28000.0 | 40000.0 |
31 | G2 | 715.0 | 29.560979 | 1.640559 | 6.00 | 28.14 | 30.84 | 30.84 | 30.84 | 715.0 | 18973.846154 | 9553.073015 | 1000.0 | 12000.0 | 17675.0 | 26600.0 | 40000.0 |
32 | G3 | 613.0 | 29.837504 | 1.555991 | 6.00 | 28.34 | 30.89 | 30.89 | 30.89 | 613.0 | 20049.755302 | 9058.333894 | 1800.0 | 12900.0 | 18425.0 | 27300.0 | 40000.0 |
33 | G4 | 550.0 | 30.309164 | 1.388560 | 6.00 | 29.96 | 30.94 | 30.94 | 30.94 | 550.0 | 20732.318182 | 8969.498118 | 1000.0 | 13000.0 | 19787.5 | 28287.5 | 40000.0 |
34 | G5 | 512.0 | 30.622813 | 0.775062 | 28.99 | 30.99 | 30.99 | 30.99 | 30.99 | 512.0 | 20290.039062 | 9246.073949 | 2000.0 | 12212.5 | 19750.0 | 28000.0 | 40000.0 |
Can we visualise a low-dimensional split in the dataset when the target variable is the grade?
collist = df.columns.tolist()
# you can now select from this list any arbitrary range
# (demo only — df1 is reassigned two statements below, so this first
# single-column selection is discarded)
df1 = df[collist[0:1]]
# or remove a column
collist.remove('sub_grade')
# now select every remaining column (all columns except 'sub_grade')
df1 = df[collist]
Since we are considering the grade and not the interest rate, we are going to drop the latter.
# The analysis targets grade rather than interest rate, so drop int_rate
# (it is essentially determined by the grade and would dominate otherwise).
df1 = df1.drop(columns = ['int_rate'])
We transform the grades to numeric values
# Encode the letter grade as its ASCII code point (A=65 ... G=71) in a new
# 'ascii' column, then drop the original categorical 'grade' column.
df1 = df1.assign(ascii=df1['grade'].map(ord)).drop(columns=['grade'])
Here we will first plot the Pearson correlation heatmap and see the correlation of the independent variables with the output variable grade. We will only select features that have a correlation above 0.5 (in absolute value) with the output variable.
#Using Pearson Correlation
cor = df1.corr()
# Mask the upper triangle (it mirrors the lower one).
# NOTE: np.bool was deprecated in NumPy 1.20 and removed in 1.24 --
# the builtin `bool` is the supported dtype alias.
mask = np.zeros_like(cor, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(12, 10))
sns.heatmap(cor,
            vmin=-1,
            cmap='coolwarm',
            annot=False,
            mask=mask);
As we can see, no features are highly correlated with the output variable grade
, named ascii
here to be able to represent it. Hence we can't get insights from the Pearson correlation. However this is not the end of the process. We can still handpick the explanatory features.
# Count how many features are categorical (object dtype).
# NOTE: np.object was deprecated in NumPy 1.20 and removed in 1.24 --
# compare against the builtin `object` instead.
count = 0
for y in subset.columns:
    if subset[y].dtype == object:
        count += 1
print("number of categorical variables : " + str(count))
print(f"{subset.dtypes}\n")
print(f"Sum of null values in each feature:\n{35 * '-'}")
print(f"{subset.isnull().sum()}")
number of categorical variables : 36 Unnamed: 0 int64 id float64 member_id float64 loan_amnt int64 funded_amnt int64 funded_amnt_inv float64 term object int_rate float64 installment float64 grade object sub_grade object emp_title object emp_length object home_ownership object annual_inc float64 verification_status object issue_d object loan_status object pymnt_plan object url float64 desc object purpose object title object zip_code object addr_state object dti float64 delinq_2yrs float64 earliest_cr_line object inq_last_6mths float64 mths_since_last_delinq float64 ... sec_app_open_acc float64 sec_app_revol_util float64 sec_app_open_act_il float64 sec_app_num_rev_accts float64 sec_app_chargeoff_within_12_mths float64 sec_app_collections_12_mths_ex_med float64 sec_app_mths_since_last_major_derog float64 hardship_flag object hardship_type object hardship_reason object hardship_status object deferral_term float64 hardship_amount float64 hardship_start_date object hardship_end_date object payment_plan_start_date object hardship_length float64 hardship_dpd float64 hardship_loan_status object orig_projected_additional_accrued_interest float64 hardship_payoff_balance_amount float64 hardship_last_payment_amount float64 disbursement_method object debt_settlement_flag object debt_settlement_flag_date object settlement_status object settlement_date object settlement_amount float64 settlement_percentage float64 settlement_term float64 Length: 146, dtype: object Sum of null values in each feature: ----------------------------------- Unnamed: 0 0 id 295358 member_id 295358 loan_amnt 0 funded_amnt 0 funded_amnt_inv 0 term 0 int_rate 0 installment 0 grade 0 sub_grade 0 emp_title 20400 emp_length 20036 home_ownership 0 annual_inc 0 verification_status 0 issue_d 0 loan_status 0 pymnt_plan 0 url 295358 desc 295348 purpose 0 title 7834 zip_code 0 addr_state 0 dti 168 delinq_2yrs 0 earliest_cr_line 0 inq_last_6mths 1 mths_since_last_delinq 144176 ... 
sec_app_open_acc 287070 sec_app_revol_util 287222 sec_app_open_act_il 287070 sec_app_num_rev_accts 287070 sec_app_chargeoff_within_12_mths 287070 sec_app_collections_12_mths_ex_med 287070 sec_app_mths_since_last_major_derog 292362 hardship_flag 0 hardship_type 293455 hardship_reason 293455 hardship_status 293455 deferral_term 293455 hardship_amount 293455 hardship_start_date 293455 hardship_end_date 293455 payment_plan_start_date 293455 hardship_length 293455 hardship_dpd 293455 hardship_loan_status 293455 orig_projected_additional_accrued_interest 294215 hardship_payoff_balance_amount 293455 hardship_last_payment_amount 293455 disbursement_method 0 debt_settlement_flag 0 debt_settlement_flag_date 286344 settlement_status 286344 settlement_date 286344 settlement_amount 286344 settlement_percentage 286344 settlement_term 286344 Length: 146, dtype: int64
Things worth knowing for predictive model
As we have observed, some columns like annual_inc, int_rate, etc. may be very useful for building our model but, on the other hand, some columns like id, member_id, etc. will not help. So we hand-pick some columns.
# Hand-picked feature groups for the predictive model.
# Numeric facts about the applicant:
APPLICANT_NUMERIC = ['annual_inc', 'dti', 'loan_amnt', 'installment']
# Categorical facts about the applicant:
APPLICANT_CATEGORICAL = ['application_type', 'emp_length', 'home_ownership', 'addr_state', 'term']
# Numeric credit-history features:
CREDIT_NUMERIC = [
    'acc_now_delinq', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy',
    'bc_util', 'delinq_2yrs', 'delinq_amnt',
    'open_acc', 'pub_rec', 'revol_util',
    'revol_bal', 'tot_coll_amt', 'tot_cur_bal', 'total_acc', 'total_rev_hi_lim',
    'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats',
    'num_bc_tl', 'num_il_tl', 'num_rev_tl_bal_gt_0', 'pct_tl_nvr_dlq',
    'percent_bc_gt_75', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
    'total_il_high_credit_limit', 'all_util',
    'il_util', 'total_bal_il', 'total_cu_tl',
]
# The column we are predicting:
TARGET = ['grade']
# Reload only the columns we actually need
df = pd.read_csv("loans.csv", usecols=APPLICANT_NUMERIC + APPLICANT_CATEGORICAL + CREDIT_NUMERIC + TARGET)
We didn't select columns like 'title' and 'emp_title' because they contain arbitrary free text which cannot be one-hot encoded / label encoded, and there is very little data for each of their categories.
# We order our grade category so order of grades doesn't appear random in graphs
grade_categories = [g for g in "ABCDEFG"]
# Passing categories/ordered straight to .astype() is deprecated (it emitted
# the FutureWarning seen below); build an explicit ordered CategoricalDtype.
df["grade"] = df["grade"].astype(
    pd.CategoricalDtype(categories=grade_categories, ordered=True))
# Sanity check that we're working with cleaned data:
# count rows that contain at least one NaN (the original `.T.any().T` was a
# needlessly transposed spelling of `.any(axis=1)`).
bad_rows = df.isnull().any(axis=1).sum()
if bad_rows > 0:
    print("Rows with null/NaN values: {}".format(bad_rows))
    print("Columns with null/NaN values:")
    print(pd.isnull(df).sum() > 0)
    print("Dropping bad rows...")
    df.dropna(axis=0, how='any', inplace=True)
    print("Rows with null/NaN values: {}".format(df.isnull().any(axis=1).sum()))
# df.info prints and returns None, hence the trailing "None" in the output.
# NOTE(review): `null_counts` was renamed `show_counts` in pandas 1.2+ -- kept
# as-is to match the pandas version this notebook was run with.
print(df.info(null_counts=True, memory_usage="deep", verbose=True))
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead This is separate from the ipykernel package so we can avoid doing imports until
Rows with null/NaN values: 197840 Columns with null/NaN values: loan_amnt False term False installment False grade False emp_length True home_ownership False annual_inc False addr_state False dti True delinq_2yrs False open_acc False pub_rec False revol_bal False revol_util True total_acc False application_type False acc_now_delinq False tot_coll_amt False tot_cur_bal False total_bal_il True il_util True all_util True total_rev_hi_lim False total_cu_tl True acc_open_past_24mths False avg_cur_bal True bc_open_to_buy True bc_util True delinq_amnt False num_accts_ever_120_pd False num_actv_bc_tl False num_actv_rev_tl False num_bc_sats False num_bc_tl False num_il_tl False num_rev_tl_bal_gt_0 False pct_tl_nvr_dlq False percent_bc_gt_75 True tot_hi_cred_lim False total_bal_ex_mort False total_bc_limit False total_il_high_credit_limit False dtype: bool Dropping bad rows... Rows with null/NaN values: 0 <class 'pandas.core.frame.DataFrame'> Int64Index: 552160 entries, 41131 to 749999 Data columns (total 42 columns): loan_amnt 552160 non-null int64 term 552160 non-null object installment 552160 non-null float64 grade 552160 non-null category emp_length 552160 non-null object home_ownership 552160 non-null object annual_inc 552160 non-null float64 addr_state 552160 non-null object dti 552160 non-null float64 delinq_2yrs 552160 non-null float64 open_acc 552160 non-null float64 pub_rec 552160 non-null float64 revol_bal 552160 non-null int64 revol_util 552160 non-null float64 total_acc 552160 non-null float64 application_type 552160 non-null object acc_now_delinq 552160 non-null float64 tot_coll_amt 552160 non-null float64 tot_cur_bal 552160 non-null float64 total_bal_il 552160 non-null float64 il_util 552160 non-null float64 all_util 552160 non-null float64 total_rev_hi_lim 552160 non-null float64 total_cu_tl 552160 non-null float64 acc_open_past_24mths 552160 non-null float64 avg_cur_bal 552160 non-null float64 bc_open_to_buy 552160 non-null float64 bc_util 552160 non-null 
float64 delinq_amnt 552160 non-null float64 num_accts_ever_120_pd 552160 non-null float64 num_actv_bc_tl 552160 non-null float64 num_actv_rev_tl 552160 non-null float64 num_bc_sats 552160 non-null float64 num_bc_tl 552160 non-null float64 num_il_tl 552160 non-null float64 num_rev_tl_bal_gt_0 552160 non-null float64 pct_tl_nvr_dlq 552160 non-null float64 percent_bc_gt_75 552160 non-null float64 tot_hi_cred_lim 552160 non-null float64 total_bal_ex_mort 552160 non-null float64 total_bc_limit 552160 non-null float64 total_il_high_credit_limit 552160 non-null float64 dtypes: category(1), float64(34), int64(2), object(5) memory usage: 325.2 MB None
df.head()
loan_amnt | term | installment | grade | emp_length | home_ownership | annual_inc | addr_state | dti | delinq_2yrs | ... | num_bc_sats | num_bc_tl | num_il_tl | num_rev_tl_bal_gt_0 | pct_tl_nvr_dlq | percent_bc_gt_75 | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
41131 | 24000 | 36 months | 757.51 | B | 1 year | MORTGAGE | 80000.0 | WA | 11.80 | 0.0 | ... | 11.0 | 11.0 | 29.0 | 7.0 | 100.0 | 9.1 | 501190.0 | 167271.0 | 137400.0 | 121695.0 |
41135 | 8000 | 36 months | 257.39 | B | 6 years | RENT | 55000.0 | CO | 10.16 | 0.0 | ... | 4.0 | 13.0 | 6.0 | 4.0 | 95.7 | 25.0 | 27933.0 | 21907.0 | 7800.0 | 17349.0 |
41136 | 19975 | 60 months | 453.27 | C | 1 year | MORTGAGE | 92000.0 | GA | 19.05 | 0.0 | ... | 4.0 | 9.0 | 36.0 | 6.0 | 83.0 | 50.0 | 406013.0 | 211797.0 | 12300.0 | 217689.0 |
41139 | 24000 | 60 months | 603.85 | D | 3 years | RENT | 72500.0 | CA | 22.89 | 0.0 | ... | 5.0 | 7.0 | 9.0 | 7.0 | 100.0 | 80.0 | 73333.0 | 64376.0 | 17300.0 | 41233.0 |
41141 | 20000 | 60 months | 461.96 | C | 8 years | MORTGAGE | 62000.0 | NC | 7.14 | 0.0 | ... | 6.0 | 10.0 | 5.0 | 6.0 | 90.9 | 16.7 | 188975.0 | 14423.0 | 18800.0 | 6914.0 |
5 rows × 42 columns
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as ticker
sns.reset_orig()
# Correlation heatmap of the selected features.
# df.corr() silently drops non-numeric columns (term, emp_length, ...), so
# the tick labels must come from the correlation matrix itself -- labelling
# with df's own columns would misalign every label past the first object column.
corr = df.corr()
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)
cax = ax.matshow(corr, cmap=plt.cm.Blues)
fig.colorbar(cax)
# NOTE(review): on some matplotlib versions a leading '' label is needed to
# align FixedFormatter labels with MultipleLocator ticks -- verify visually.
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.columns)
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
Next we can take a look at the distribution of some values by loan grade for some features of interest using "violin" charts.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Shared styling for the pair of violin plots
plt.rcParams.update({
    "figure.figsize": (20, 8),
    "axes.titlesize": 18,
    "axes.labelsize": 16,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Left: revolving-credit utilisation, split by grade
plt.subplot(121)
ax_util = sns.violinplot(x="revol_util", y="grade", data=df)
ax_util.axes.set_title("Amount of Credit Line Used Relative to Total Credit Available by Loan Grade", fontsize=20)

# Right: recently opened accounts, split by grade
plt.subplot(122)
ax_open = sns.violinplot(x="acc_open_past_24mths", y="grade", data=df)
ax_open.axes.set_title("Accts Opened in Past 24 Months by Loan Grade", fontsize=20)

# Show the plot
plt.show()
The distribution of the credit line used relative to total credit available by loan grade shows A grade loans having higher values than the G grade loans. There is an interesting trend on the second graph with the distribution of accounts opened in the past 24 months for higher grade loans relative to the lower grade loans. So even if the correlation is much lower, the inner distribution of these account openings gives some major insights.
We started with a model with PCA and one-hot encoding on every categorical variable. It created a dataframe of 214 features, 555M+ large. With the same model as below it first gave 30% of good classifications; with KFold it dropped to a mean of 26.10% (with a standard deviation of 7.89%). So we decided to handpick the data and use the architecture the same way James Andersen did in his Medium article.
We prepare a machine learning model on the data. We first need to split the data into "X" for the data that will be fed into the model as input and "Y" the column we're trying to predict: loan grade.
Then we'll split the data by row; 80% will be used to train/validate the model and 20% will be used to test how well the model performs on unseen data.
Last, the target, which is multi-class, has to be one-hot encoded.
def split_data(data, continuous_cols, categorical_cols, label_col, test_size=0.2, row_limit=None):
    """Divide the data into X and y dataframes and create a train/test split.

    Parameters
    ----------
    data : pd.DataFrame
        Source data containing the features and the label column.
    continuous_cols : list of str
        Numeric feature columns; each is standardised (zero mean, unit variance).
    categorical_cols : list of str
        Categorical feature columns; replaced by one-hot encoded columns.
    label_col : str or list of str
        Target column(s); one-hot encoded via pd.get_dummies (multi-class).
    test_size : float
        Fraction of rows held out as the test set.
    row_limit : optional
        When not None, only a ~10% Bernoulli sample of the rows is used.
        NOTE: the value itself is currently ignored -- it acts purely as an
        on/off flag (see the commented-out sample() call below).

    Returns
    -------
    x_train, x_test, y_train, y_test
    """
    # Subset to get feature data
    x_df = data.loc[:, continuous_cols + categorical_cols]
    # Update our X dataframe with categorical values replaced by one-hot encoded values
    x_df = encode_categorical(x_df, categorical_cols)
    # Ensure all numeric features are on the same scale (z-score)
    for col in continuous_cols:
        x_df[col] = (x_df[col] - x_df[col].mean()) / x_df[col].std()
    # Specify the target labels and flatten the array
    y = pd.get_dummies(data[label_col])
    # When requested, limit the amount of data that will be used.
    # Using entire data set can be painfully slow without a GPU!
    if row_limit is not None:  # identity check -- `!= None` is unidiomatic
        # Keep each row with probability 0.1 (~10% sample)
        rows = np.random.binomial(1, 0.1, size=len(data)).astype('bool')
        x_df = x_df[rows]
        y = y[rows]
        print("Using only a sample of {} observations".format(x_df.shape[0]))
        #data = self.lcdata.sample(int(row_limit))
    else:
        print("Using the full set of {} observations".format(data.shape[0]))
        #data = self.lcdata
    # Create train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x_df, y, test_size=test_size, random_state=23)
    print("x_train contains {} rows and {} features".format(x_train.shape[0], x_train.shape[1]))
    return x_train, x_test, y_train, y_test
def encode_categorical(frame, categorical_cols):
    """Return *frame* with each categorical column replaced by its
    one-hot encoded (dummy) columns.

    The dummies are appended after the existing columns (index-aligned)
    and the source column is removed. The caller's frame is not mutated.
    """
    for col in categorical_cols:
        dummies = pd.get_dummies(frame[col])
        # Index-aligned append of the dummy columns, then drop the source
        frame = pd.concat([frame, dummies], axis=1)
        frame = frame.drop(columns=[col])
    return frame
import os  # needed for os.environ below; not imported at the top of the notebook

# Divide the data set into training and test sets.
# The "sample" environment variable acts as an on/off flag: when set,
# split_data keeps only a ~10% random sample of the rows.
x_train, x_test, y_train, y_test = split_data(df, APPLICANT_NUMERIC + CREDIT_NUMERIC,
                                              APPLICANT_CATEGORICAL,
                                              TARGET,
                                              test_size=0.2,
                                              row_limit=os.environ.get("sample"))
# Inspect our training data
print("x_train contains {} rows and {} features".format(x_train.shape[0], x_train.shape[1]))
print("y_train contains {} rows and {} features".format(y_train.shape[0], y_train.shape[1]))
print("x_test contains {} rows and {} features".format(x_test.shape[0], x_test.shape[1]))
print("y_test contains {} rows and {} features".format(y_test.shape[0], y_test.shape[1]))
# Loan grade has been one-hot encoded
print("Sample one-hot encoded 'y' value: \n{}".format(y_train.sample()))
Using the full set of 552160 observations x_train contains 441728 rows and 105 features x_train contains 441728 rows and 105 features y_train contains 441728 rows and 7 features x_test contains 110432 rows and 105 features y_test contains 110432 rows and 7 features Sample one-hot encoded 'y' value: grade_A grade_B grade_C grade_D grade_E grade_F grade_G 447861 1 0 0 0 0 0 0
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.constraints import maxnorm
def create_model(input_dim, output_dim):
    """Build and compile the grade-classification network.

    A small fully-connected net: two ReLU layers with dropout and
    max-norm weight constraints, followed by a softmax output with
    one unit per grade class.
    """
    model = Sequential([
        # input layer: 100 units fed by `input_dim` features
        Dense(100, input_dim=input_dim, activation='relu', kernel_constraint=maxnorm(3)),
        Dropout(0.2),
        # hidden layer
        Dense(60, activation='relu', kernel_constraint=maxnorm(3)),
        Dropout(0.2),
        # output layer
        Dense(output_dim, activation='softmax'),
    ])
    # Multi-class setup: categorical cross-entropy with the Adam optimizer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

model = create_model(x_train.shape[1], y_train.shape[1])

epochs = 35
batch_sz = 64
print("Beginning model training with batch size {} and {} epochs".format(batch_sz, epochs))

# Save the weights whenever validation accuracy improves, so the best
# epoch can be restored after training.
checkpoint = ModelCheckpoint("lc_model.h5", monitor='val_acc', verbose=0, save_best_only=True, mode='auto', period=1)

# train the model
# DataFrame.as_matrix() is deprecated (it produced the FutureWarning seen
# below); .values is the supported replacement and yields the same ndarray.
history = model.fit(x_train.values,
                    y_train.values,
                    validation_split=0.2,
                    epochs=epochs,
                    batch_size=batch_sz,
                    verbose=2,
                    callbacks=[checkpoint])

# revert to the best model encountered during training
model = load_model("lc_model.h5")
Beginning model training with batch size 64 and 35 epochs
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. from ipykernel import kernelapp as app C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:16: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. app.launch_new_instance()
Train on 353382 samples, validate on 88346 samples Epoch 1/35 - 31s - loss: 0.9299 - acc: 0.6013 - val_loss: 0.4764 - val_acc: 0.8208 Epoch 2/35 - 25s - loss: 0.5274 - acc: 0.7795 - val_loss: 0.4230 - val_acc: 0.8219 Epoch 3/35 - 24s - loss: 0.4681 - acc: 0.8038 - val_loss: 0.3759 - val_acc: 0.8448 Epoch 4/35 - 27s - loss: 0.4437 - acc: 0.8144 - val_loss: 0.3513 - val_acc: 0.8593 Epoch 5/35 - 26s - loss: 0.4300 - acc: 0.8205 - val_loss: 0.3510 - val_acc: 0.8581 Epoch 6/35 - 29s - loss: 0.4197 - acc: 0.8249 - val_loss: 0.3651 - val_acc: 0.8478 Epoch 7/35 - 31s - loss: 0.4119 - acc: 0.8279 - val_loss: 0.3275 - val_acc: 0.8648 Epoch 8/35 - 29s - loss: 0.4055 - acc: 0.8312 - val_loss: 0.3168 - val_acc: 0.8702 Epoch 9/35 - 27s - loss: 0.3998 - acc: 0.8335 - val_loss: 0.3298 - val_acc: 0.8652 Epoch 10/35 - 27s - loss: 0.3958 - acc: 0.8362 - val_loss: 0.3154 - val_acc: 0.8697 Epoch 11/35 - 26s - loss: 0.3931 - acc: 0.8365 - val_loss: 0.3155 - val_acc: 0.8736 Epoch 12/35 - 26s - loss: 0.3868 - acc: 0.8391 - val_loss: 0.3006 - val_acc: 0.8793 Epoch 13/35 - 25s - loss: 0.3839 - acc: 0.8406 - val_loss: 0.3218 - val_acc: 0.8647 Epoch 14/35 - 27s - loss: 0.3805 - acc: 0.8424 - val_loss: 0.3137 - val_acc: 0.8698 Epoch 15/35 - 26s - loss: 0.3773 - acc: 0.8428 - val_loss: 0.3047 - val_acc: 0.8742 Epoch 16/35 - 26s - loss: 0.3746 - acc: 0.8453 - val_loss: 0.2908 - val_acc: 0.8826 Epoch 17/35 - 26s - loss: 0.3735 - acc: 0.8454 - val_loss: 0.3047 - val_acc: 0.8745 Epoch 18/35 - 25s - loss: 0.3701 - acc: 0.8465 - val_loss: 0.2946 - val_acc: 0.8788 Epoch 19/35 - 29s - loss: 0.3694 - acc: 0.8462 - val_loss: 0.2958 - val_acc: 0.8772 Epoch 20/35 - 26s - loss: 0.3673 - acc: 0.8483 - val_loss: 0.3035 - val_acc: 0.8773 Epoch 21/35 - 25s - loss: 0.3650 - acc: 0.8483 - val_loss: 0.2870 - val_acc: 0.8847 Epoch 22/35 - 26s - loss: 0.3648 - acc: 0.8491 - val_loss: 0.2881 - val_acc: 0.8834 Epoch 23/35 - 28s - loss: 0.3640 - acc: 0.8489 - val_loss: 0.3021 - val_acc: 0.8751 Epoch 24/35 - 30s - loss: 
0.3612 - acc: 0.8502 - val_loss: 0.3163 - val_acc: 0.8665 Epoch 25/35 - 26s - loss: 0.3624 - acc: 0.8499 - val_loss: 0.2869 - val_acc: 0.8822 Epoch 26/35 - 25s - loss: 0.3596 - acc: 0.8512 - val_loss: 0.2905 - val_acc: 0.8832 Epoch 27/35 - 26s - loss: 0.3563 - acc: 0.8528 - val_loss: 0.2839 - val_acc: 0.8822 Epoch 28/35 - 27s - loss: 0.3570 - acc: 0.8524 - val_loss: 0.3027 - val_acc: 0.8745 Epoch 29/35 - 25s - loss: 0.3568 - acc: 0.8529 - val_loss: 0.3116 - val_acc: 0.8683 Epoch 30/35 - 24s - loss: 0.3550 - acc: 0.8537 - val_loss: 0.2901 - val_acc: 0.8800 Epoch 31/35 - 24s - loss: 0.3542 - acc: 0.8530 - val_loss: 0.2898 - val_acc: 0.8797 Epoch 32/35 - 24s - loss: 0.3531 - acc: 0.8544 - val_loss: 0.2853 - val_acc: 0.8841 Epoch 33/35 - 24s - loss: 0.3541 - acc: 0.8536 - val_loss: 0.2992 - val_acc: 0.8760 Epoch 34/35 - 24s - loss: 0.3541 - acc: 0.8538 - val_loss: 0.2810 - val_acc: 0.8867 Epoch 35/35 - 24s - loss: 0.3550 - acc: 0.8537 - val_loss: 0.3013 - val_acc: 0.8760
# Shared styling for the two training-history plots
plt.rcParams.update({
    "figure.figsize": [16, 5],
    "axes.titlesize": 18,
    "axes.labelsize": 16,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "legend.loc": "upper right",
    "legend.framealpha": 0.7,
    "legend.fontsize": 14,
})

# Draw accuracy (left) and loss (right) side by side
panels = [
    (('acc', 'val_acc'), 'model accuracy', 'accuracy'),
    (('loss', 'val_loss'), 'model loss', 'loss'),
]
for pos, (keys, panel_title, ylab) in enumerate(panels, start=1):
    plt.subplot(1, 2, pos)
    for key in keys:
        plt.plot(history.history[key])
    plt.title(panel_title)
    plt.ylabel(ylab)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid.'], loc='upper left')

# adjust size
plt.tight_layout()
plt.show()
We can now see how our train model performs on the "test" data that we held out of the training data. We measure the performance using the F1 score.
import numpy as np
from sklearn.metrics import f1_score

# .as_matrix() is deprecated (it produced the FutureWarning seen below);
# .values is the supported replacement.
y_pred = model.predict(x_test.values)

# Revert one-hot encoding to classes: per-row argmax -> boolean indicator frame
y_pred_classes = pd.DataFrame((y_pred.argmax(1)[:, None] == np.arange(y_pred.shape[1])),
                              columns=y_test.columns,
                              index=y_test.index)

# Use idxmax() to convert back from one-hot encoding
y_test_vals = y_test.idxmax(1)
y_pred_vals = y_pred_classes.idxmax(1)

# Weighted F1 over the seven grade classes
f1 = f1_score(y_test_vals, y_pred_vals, average='weighted')
# The reported metric is the weighted F1 score, not raw accuracy --
# the original print label mislabelled it as "Accuracy".
print("Test Set F1 Score: {:.2%} (But results would have been better if we trained on the FULL dataset)".format(f1))
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. after removing the cwd from sys.path.
Test Set Accuracy: 88.54% (But results would have been better if we trained on the FULL dataset)
from sklearn.metrics import confusion_matrix
from importlib import reload
# Confusion matrix
# Row i = true grade, column j = predicted grade; cast to float so each
# row can later be normalised to per-class fractions.
conf_matrix = confusion_matrix(y_test_vals, y_pred_vals).astype(float)
# drawing it
def fmt_pct(x, pos):
    """Colorbar tick formatter: render a fraction as a whole-number percentage."""
    return format(x, '.0%')
classes = [l for l in "ABCDEFG"]

n_classes = conf_matrix.shape[0]
# Diagonal = correct predictions, off-diagonal = misclassifications.
# np.eye builds both boolean masks directly, replacing the hand-written loop.
diag = np.eye(n_classes, dtype=bool)
correct_mask = ~diag  # masked_array hides True cells -> shows only the diagonal
wrong_mask = diag     # hides the diagonal -> shows only the errors

# Normalise each row to fractions of that true class's total
# (vectorised replacement of the per-element Python loop).
conf_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)

correct_matrix = np.ma.masked_array(conf_matrix, mask=correct_mask)
wrong_matrix = np.ma.masked_array(conf_matrix, wrong_mask)

fig, ax = plt.subplots(figsize=(8, 8))

# Blues for correct cells, reds for errors; values below vmin render white
blue_map = colors.LinearSegmentedColormap.from_list('custom blue', ['#f1eef6', '#025a90'], N=10)
blue_map.set_under(color='white')
red_map = colors.LinearSegmentedColormap.from_list('custom blue', ['#feeddd', '#a83500'], N=10)
red_map.set_under(color='white')

plot_correct = ax.imshow(correct_matrix, interpolation='nearest', cmap=blue_map, vmin=0.000001, vmax=1)
plot_wrong = ax.imshow(wrong_matrix, interpolation='nearest', cmap=red_map, vmin=0.000001, vmax=1)

# Two horizontal colorbars, one per colormap
colorbar_wrong = plt.colorbar(plot_wrong, shrink=0.35, orientation='horizontal', pad=-0.11, format=ticker.FuncFormatter(fmt_pct))
colorbar_correct = plt.colorbar(plot_correct, shrink=0.35, orientation='horizontal', pad=0.03)

plt.xlabel('Predicted Values')
plt.ylabel('True Values')
ax.xaxis.set_label_position('top')
ax.xaxis.tick_top()
ax.set_xticklabels([''] + classes)
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_yticklabels([''] + classes)
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

colorbar_correct.ax.text(-0.3, 0.25, 'CORRECT', rotation=0)
colorbar_wrong.ax.text(-0.35, 0.25, 'INCORRECT', rotation=0)
plt.setp(colorbar_correct.ax.get_xticklabels(), visible=False)

# Annotate every cell with its percentage; flip the text colour on dark cells
thresh = conf_matrix.max() / 2.
for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
    cell_label = "{:.1%}".format(conf_matrix[i, j])
    plt.text(j, i, cell_label,
             horizontalalignment="center",
             color="white" if conf_matrix[i, j] > thresh else "black")

plt.tight_layout()
plt.show()
From the very beginning James Anderson's model performed better than mine, even though his neural network architecture was almost the same as ours. The first thing he did was to remove any rows with NAs from the columns he selected. So, unlike me, he didn't select the columns where there were too many missing values. This created a lighter model.
Yet, even though his results were already pretty good, we improved them by removing a few features. However, the lower the grade, the lower our classification accuracy was — reaching 19% for G grade borrowers, who were mainly classified as F and some as E. So the next thing to work on would be to improve feature selection.