In [2]:

import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

loading the dataset

In [3]:

# Control delimiters, rows, column names with read_csv (see later) 
df = pd.read_csv("loans.csv", engine='python')

How does it look like ?

In [4]:

# show all columns
pd.set_option('display.max_columns', None)

In [5]:

df.head()

Out[5]:

	Unnamed: 0	id	member_id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	sub_grade	emp_title	emp_length	home_ownership	annual_inc	verification_status	issue_d	loan_status	pymnt_plan	url	desc	purpose	title	zip_code	addr_state	dti	delinq_2yrs	earliest_cr_line	inq_last_6mths	mths_since_last_delinq	mths_since_last_record	open_acc	pub_rec	revol_bal	revol_util	total_acc	initial_list_status	total_pymnt	total_pymnt_inv	total_rec_prncp	total_rec_int	recoveries	collection_recovery_fee	last_pymnt_d	last_pymnt_amnt	next_pymnt_d	last_credit_pull_d	mths_since_last_major_derog	policy_code	application_type	annual_inc_joint	dti_joint	verification_status_joint	tot_coll_amt	tot_cur_bal	open_acc_6m	open_act_il	open_il_12m	open_il_24m	mths_since_rcnt_il	total_bal_il	il_util	open_rv_12m	open_rv_24m	max_bal_bc	all_util	total_rev_hi_lim	inq_fi	total_cu_tl	inq_last_12m	acc_open_past_24mths	avg_cur_bal	bc_open_to_buy	bc_util	mo_sin_old_il_acct	mo_sin_old_rev_tl_op	mo_sin_rcnt_rev_tl_op	mo_sin_rcnt_tl	mort_acc	mths_since_recent_bc	mths_since_recent_bc_dlq	mths_since_recent_inq	mths_since_recent_revol_delinq	num_actv_bc_tl	num_actv_rev_tl	num_bc_sats	num_bc_tl	num_il_tl	num_op_rev_tl	num_rev_accts	num_rev_tl_bal_gt_0	num_sats	num_tl_120dpd_2m	num_tl_op_past_12m	pct_tl_nvr_dlq	percent_bc_gt_75	pub_rec_bankruptcies	tot_hi_cred_lim	total_bal_ex_mort	total_bc_limit	total_il_high_credit_limit	revol_bal_joint	sec_app_earliest_cr_line	sec_app_inq_last_6mths	sec_app_mort_acc	sec_app_open_acc	sec_app_revol_util	sec_app_open_act_il	sec_app_num_rev_accts	sec_app_chargeoff_within_12_mths	sec_app_collections_12_mths_ex_med	sec_app_mths_since_last_major_derog	hardship_flag	hardship_type	hardship_reason	hardship_status	deferral_term	hardship_amount	hardship_start_date	hardship_end_date	payment_plan_start_date	hardship_length	hardship_dpd	hardship_loan_status	orig_projected_additional_accrued_interest	hardship_payoff_balance_amount	hardship_last_payment_amount	disbursement_method	debt_settlement_flag	debt_settlement_flag_date	settlement_status	settlement_date	settlement_amount	settlement_percentage	settlement_term
0	1040017	NaN	NaN	14000	14000	14000.0	36 months	12.69	469.63	C	C2	Receiving Dock Worker	9 years	MORTGAGE	40000.0	Not Verified	2015-10-01	Charged Off	n	NaN	NaN	debt_consolidation	Debt consolidation	166xx	PA	17.07	0.0	Jun-2001	1.0	NaN	NaN	5.0	0.0	5848	90.0	15.0	f	6057.790000	6057.79	4091.51	1556.41	409.87	73.7766	Oct-2016	469.63	NaN	Jul-2018	NaN	1	Individual	NaN	NaN	NaN	0.0	119776.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6500.0	NaN	NaN	NaN	4.0	23955.0	2167.0	90.0	141.0	172.0	3.0	3.0	1.0	3.0	NaN	3.0	NaN	3.0	3.0	8.0	8.0	6.0	3.0	8.0	3.0	5.0	NaN	2.0	100.0	100.0	0.0	123292.0	29809.0	6500.0	25992.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN
1	1050463	NaN	NaN	1000	1000	1000.0	36 months	9.17	31.88	B	B2	Portfolio Manager	1 year	MORTGAGE	80000.0	Verified	2015-10-01	Fully Paid	n	NaN	NaN	credit_card	Credit card refinancing	949xx	CA	12.51	0.0	Oct-1967	3.0	NaN	22.0	9.0	1.0	7634	37.2	32.0	w	1021.730000	1021.73	999.99	21.74	0.00	0.0000	Feb-2016	27.85	NaN	Feb-2017	NaN	1	Individual	NaN	NaN	NaN	0.0	53994.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	20500.0	NaN	NaN	NaN	4.0	5999.0	12866.0	37.2	188.0	575.0	4.0	4.0	3.0	4.0	NaN	1.0	NaN	3.0	3.0	6.0	16.0	9.0	6.0	20.0	3.0	9.0	0.0	3.0	100.0	0.0	1.0	80788.0	53994.0	20500.0	60288.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN
2	1056254	NaN	NaN	13000	13000	13000.0	36 months	6.89	400.75	A	A3	mailhandler	10+ years	RENT	70000.0	Not Verified	2015-10-01	Fully Paid	n	NaN	NaN	debt_consolidation	Debt consolidation	751xx	TX	21.72	0.0	Oct-1996	0.0	NaN	NaN	16.0	0.0	11113	24.4	23.0	w	14425.915413	14425.92	13000.00	1425.92	0.00	0.0000	Oct-2018	401.38	NaN	Jan-2019	NaN	1	Individual	NaN	NaN	NaN	316.0	93918.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	45600.0	NaN	NaN	NaN	10.0	5870.0	28646.0	26.9	227.0	60.0	7.0	5.0	3.0	7.0	NaN	7.0	NaN	7.0	8.0	8.0	8.0	8.0	10.0	10.0	8.0	16.0	0.0	4.0	100.0	0.0	0.0	141771.0	43988.0	39200.0	42191.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN
3	1013860	NaN	NaN	5600	5600	5600.0	36 months	6.24	170.98	A	A2	Managing Director	4 years	RENT	49000.0	Source Verified	2015-10-01	Fully Paid	n	NaN	NaN	debt_consolidation	Debt consolidation	968xx	HI	2.45	0.0	Dec-1988	0.0	NaN	NaN	4.0	0.0	3402	41.5	8.0	w	6051.344476	6051.34	5600.00	451.34	0.00	0.0000	Aug-2017	2635.62	NaN	Aug-2017	NaN	1	Individual	NaN	NaN	NaN	0.0	3402.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	8200.0	NaN	NaN	NaN	0.0	851.0	3898.0	46.6	NaN	322.0	26.0	26.0	0.0	26.0	NaN	NaN	NaN	3.0	3.0	3.0	6.0	0.0	4.0	8.0	3.0	4.0	0.0	0.0	100.0	33.3	0.0	8200.0	3402.0	7300.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN
4	1018431	NaN	NaN	11000	11000	11000.0	60 months	12.29	246.31	C	C1	RTI Administrator	4 years	MORTGAGE	86000.0	Not Verified	2015-10-01	Fully Paid	n	NaN	NaN	debt_consolidation	Debt consolidation	388xx	MS	24.35	2.0	Nov-2001	1.0	2.0	NaN	12.0	0.0	6062	68.9	24.0	w	14008.069373	14008.07	11000.00	3008.07	0.00	0.0000	Aug-2018	5894.86	NaN	Feb-2019	NaN	1	Individual	NaN	NaN	NaN	0.0	331363.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	8800.0	NaN	NaN	NaN	3.0	27614.0	1083.0	81.9	167.0	160.0	10.0	10.0	2.0	23.0	2.0	6.0	2.0	2.0	3.0	2.0	5.0	13.0	4.0	9.0	3.0	12.0	0.0	1.0	87.5	50.0	0.0	393082.0	81179.0	6000.0	118795.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN

In [17]:

df.tail()

Out[17]:

	Unnamed: 0	id	member_id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	sub_grade	emp_title	emp_length	home_ownership	annual_inc	verification_status	issue_d	loan_status	pymnt_plan	url	desc	purpose	title	zip_code	addr_state	dti	delinq_2yrs	earliest_cr_line	mths_since_last_delinq	mths_since_last_record	open_acc	revol_bal	revol_util	total_acc	initial_list_status	out_prncp	out_prncp_inv	total_pymnt	total_pymnt_inv	total_rec_prncp	total_rec_int	last_pymnt_d	last_pymnt_amnt	next_pymnt_d	last_credit_pull_d	mths_since_last_major_derog	policy_code	application_type	annual_inc_joint	dti_joint	verification_status_joint	tot_coll_amt	tot_cur_bal	open_acc_6m	open_act_il	open_il_12m	open_il_24m	mths_since_rcnt_il	total_bal_il	il_util	open_rv_12m	open_rv_24m	max_bal_bc	all_util	total_rev_hi_lim	inq_fi	total_cu_tl	inq_last_12m	acc_open_past_24mths	avg_cur_bal	bc_open_to_buy	bc_util	mo_sin_old_il_acct	mo_sin_old_rev_tl_op	mo_sin_rcnt_rev_tl_op	mo_sin_rcnt_tl	mort_acc	mths_since_recent_bc	mths_since_recent_bc_dlq	mths_since_recent_inq	mths_since_recent_revol_delinq	num_actv_bc_tl	num_actv_rev_tl	num_bc_sats	num_bc_tl	num_il_tl	num_op_rev_tl	num_rev_accts	num_rev_tl_bal_gt_0	num_sats	num_tl_op_past_12m	pct_tl_nvr_dlq	percent_bc_gt_75	tot_hi_cred_lim	total_bal_ex_mort	total_bc_limit	total_il_high_credit_limit	revol_bal_joint	sec_app_earliest_cr_line	sec_app_inq_last_6mths	sec_app_mort_acc	sec_app_open_acc	sec_app_revol_util	sec_app_open_act_il	sec_app_num_rev_accts	sec_app_chargeoff_within_12_mths	sec_app_collections_12_mths_ex_med	sec_app_mths_since_last_major_derog	hardship_flag	hardship_type	hardship_reason	hardship_status	deferral_term	hardship_amount	hardship_start_date	hardship_end_date	payment_plan_start_date	hardship_length	hardship_dpd	hardship_loan_status	orig_projected_additional_accrued_interest	hardship_payoff_balance_amount	hardship_last_payment_amount	disbursement_method	debt_settlement_flag	debt_settlement_flag_date	settlement_status	settlement_date	settlement_amount	settlement_percentage	settlement_term
749995	7605	NaN	NaN	15000	15000	15000.0	36 months	13.56	509.47	C	C1	Driver	4 years	RENT	55000.0	Not Verified	2018-12-01	Current	n	NaN	NaN	debt_consolidation	Debt consolidation	171xx	PA	17.15	0.0	Oct-2012	NaN	NaN	9.0	7277	40.0	14.0	w	14316.22	14316.22	1001.99	1001.99	683.78	318.21	Feb-2019	509.47	Mar-2019	Feb-2019	NaN	1	Individual	NaN	NaN	NaN	0.0	21229.0	0.0	2.0	1.0	3.0	8.0	13952.0	85.0	1.0	1.0	4778.0	61.0	18200.0	1.0	0.0	2.0	4.0	2359.0	2457.0	69.7	74.0	54.0	12.0	8.0	0.0	35.0	NaN	8.0	NaN	2.0	5.0	2.0	2.0	4.0	7.0	10.0	5.0	9.0	2.0	100.0	50.0	34632.0	21229.0	8100.0	16432.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN
749996	35399	NaN	NaN	8000	8000	8000.0	36 months	6.46	245.05	A	A1	Realtor	2 years	MORTGAGE	100000.0	Not Verified	2018-12-01	Current	n	NaN	NaN	debt_consolidation	Debt consolidation	231xx	VA	9.90	1.0	Sep-2001	21.0	NaN	13.0	17646	51.7	29.0	w	7389.00	7389.00	692.18	692.18	611.00	81.18	Feb-2019	245.05	Mar-2019	Feb-2019	NaN	1	Individual	NaN	NaN	NaN	3323.0	326631.0	1.0	2.0	0.0	0.0	31.0	13286.0	57.0	2.0	3.0	2173.0	54.0	34100.0	0.0	3.0	0.0	3.0	27219.0	12434.0	26.0	82.0	206.0	2.0	2.0	2.0	2.0	63.0	17.0	21.0	4.0	5.0	5.0	11.0	3.0	10.0	24.0	5.0	13.0	2.0	82.8	40.0	381832.0	30932.0	16800.0	23370.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	DirectPay	N	NaN	NaN	NaN	NaN	NaN	NaN
749997	8452	NaN	NaN	40000	40000	40000.0	60 months	16.14	975.71	C	C4	Director of Operations	3 years	MORTGAGE	157630.0	Verified	2018-12-01	Current	n	NaN	NaN	credit_card	Credit card refinancing	972xx	OR	16.61	1.0	May-2004	18.0	NaN	11.0	15761	56.7	21.0	w	39562.28	39562.28	1155.05	1155.05	437.72	717.33	Feb-2019	1208.85	Mar-2019	Feb-2019	NaN	1	Individual	NaN	NaN	NaN	0.0	474781.0	1.0	4.0	1.0	1.0	5.0	59310.0	38.0	0.0	0.0	9029.0	43.0	27800.0	0.0	5.0	0.0	1.0	43162.0	9453.0	59.8	71.0	175.0	27.0	5.0	2.0	27.0	NaN	NaN	NaN	3.0	4.0	3.0	4.0	6.0	6.0	13.0	4.0	11.0	1.0	95.2	66.7	542479.0	75071.0	23500.0	100389.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN
749998	21664	NaN	NaN	7000	7000	7000.0	36 months	8.19	219.97	A	A4	Bill Poster	10+ years	MORTGAGE	42000.0	Verified	2018-12-01	Current	n	NaN	NaN	other	Other	707xx	LA	6.49	0.0	Oct-2003	NaN	NaN	7.0	6898	53.1	22.0	w	6654.43	6654.43	456.87	456.87	345.57	111.30	Feb-2019	243.27	Mar-2019	Feb-2019	NaN	1	Joint App	70000.0	4.35	Verified	440.0	114983.0	0.0	0.0	0.0	0.0	49.0	0.0	NaN	1.0	1.0	1048.0	53.0	13000.0	0.0	0.0	0.0	1.0	16426.0	1620.0	64.0	127.0	160.0	8.0	8.0	4.0	8.0	NaN	NaN	NaN	3.0	5.0	3.0	3.0	13.0	5.0	5.0	5.0	7.0	1.0	100.0	33.3	232200.0	6898.0	4500.0	0.0	7445.0	Jul-2003	0.0	4.0	6.0	59.8	0.0	6.0	0.0	0.0	58.0	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN
749999	24316	NaN	NaN	15000	15000	15000.0	60 months	17.97	380.66	D	D1	Document	4 years	RENT	110000.0	Not Verified	2018-12-01	Current	n	NaN	NaN	debt_consolidation	Debt consolidation	950xx	CA	13.28	0.0	Mar-2011	NaN	NaN	10.0	13439	72.3	13.0	w	14685.59	14685.59	746.34	746.34	314.41	431.93	Feb-2019	380.66	Mar-2019	Feb-2019	NaN	1	Individual	NaN	NaN	NaN	0.0	24340.0	0.0	2.0	0.0	2.0	16.0	10901.0	54.0	1.0	1.0	3313.0	63.0	18600.0	1.0	1.0	0.0	3.0	2704.0	3131.0	73.7	68.0	92.0	7.0	7.0	0.0	7.0	NaN	16.0	NaN	4.0	7.0	4.0	4.0	3.0	8.0	10.0	7.0	10.0	1.0	100.0	75.0	38908.0	24340.0	11900.0	20308.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	N	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Cash	N	NaN	NaN	NaN	NaN	NaN	NaN

What are the total number of loans?

In [18]:

len(df)

Out[18]:

How many columns does the dataset have?

In [6]:

len(df.columns)

Out[6]:

How may columns contain missing data?

In [6]:

miss_values_count = df.isnull().sum(min_count=1)
miss_values_count = miss_values_count[miss_values_count != 0]

What is the count of missing data for each of this column?

In [7]:

print(f"Number of columns with missing values: {miss_values_count.shape[0]}")
if miss_values_count.shape[0]:
    print("Null value count per column: ")
    for name, miss_vals in miss_values_count.items():
        p = miss_vals > 1
        print(f"  - For column name '{name}' there are{'n' if p else ''} "
              f"{miss_vals} missing values{'s' if p else ''}.")

Number of columns with missing values: 74
Null value count per column:
- For column name 'id' there aren 750000 missing valuess.
- For column name 'member_id' there aren 750000 missing valuess.
- For column name 'emp_title' there aren 61669 missing valuess.
- For column name 'emp_length' there aren 54976 missing valuess.
- For column name 'url' there aren 750000 missing valuess.
- For column name 'desc' there aren 749986 missing valuess.
- For column name 'title' there aren 11781 missing valuess.
- For column name 'dti' there aren 835 missing valuess.
- For column name 'inq_last_6mths' there are 1 missing values.
- For column name 'mths_since_last_delinq' there aren 382775 missing valuess.
- For column name 'mths_since_last_record' there aren 627757 missing valuess.
- For column name 'revol_util' there aren 659 missing valuess.
- For column name 'last_pymnt_d' there aren 909 missing valuess.
- For column name 'next_pymnt_d' there aren 295358 missing valuess.
- For column name 'last_credit_pull_d' there aren 18 missing valuess.
- For column name 'mths_since_last_major_derog' there aren 552182 missing valuess.
- For column name 'annual_inc_joint' there aren 689705 missing valuess.
- For column name 'dti_joint' there aren 689705 missing valuess.
- For column name 'verification_status_joint' there aren 692145 missing valuess.
- For column name 'open_acc_6m' there aren 52540 missing valuess.
- For column name 'open_act_il' there aren 52539 missing valuess.
- For column name 'open_il_12m' there aren 52539 missing valuess.
- For column name 'open_il_24m' there aren 52539 missing valuess.
- For column name 'mths_since_rcnt_il' there aren 74494 missing valuess.
- For column name 'total_bal_il' there aren 52539 missing valuess.
- For column name 'il_util' there aren 153788 missing valuess.
- For column name 'open_rv_12m' there aren 52539 missing valuess.
- For column name 'open_rv_24m' there aren 52539 missing valuess.
- For column name 'max_bal_bc' there aren 52539 missing valuess.
- For column name 'all_util' there aren 52662 missing valuess.
- For column name 'inq_fi' there aren 52539 missing valuess.
- For column name 'total_cu_tl' there aren 52540 missing valuess.
- For column name 'inq_last_12m' there aren 52540 missing valuess.
- For column name 'avg_cur_bal' there aren 32 missing valuess.
- For column name 'bc_open_to_buy' there aren 9089 missing valuess.
- For column name 'bc_util' there aren 9415 missing valuess.
- For column name 'mo_sin_old_il_acct' there aren 23463 missing valuess.
- For column name 'mths_since_recent_bc' there aren 8576 missing valuess.
- For column name 'mths_since_recent_bc_dlq' there aren 577966 missing valuess.
- For column name 'mths_since_recent_inq' there aren 86405 missing valuess.
- For column name 'mths_since_recent_revol_delinq' there aren 502543 missing valuess.
- For column name 'num_tl_120dpd_2m' there aren 31530 missing valuess.
- For column name 'percent_bc_gt_75' there aren 9163 missing valuess.
- For column name 'revol_bal_joint' there aren 696039 missing valuess.
- For column name 'sec_app_earliest_cr_line' there aren 696039 missing valuess.
- For column name 'sec_app_inq_last_6mths' there aren 696039 missing valuess.
- For column name 'sec_app_mort_acc' there aren 696039 missing valuess.
- For column name 'sec_app_open_acc' there aren 696039 missing valuess.
- For column name 'sec_app_revol_util' there aren 696953 missing valuess.
- For column name 'sec_app_open_act_il' there aren 696039 missing valuess.
- For column name 'sec_app_num_rev_accts' there aren 696039 missing valuess.
- For column name 'sec_app_chargeoff_within_12_mths' there aren 696039 missing valuess.
- For column name 'sec_app_collections_12_mths_ex_med' there aren 696039 missing valuess.
- For column name 'sec_app_mths_since_last_major_derog' there aren 732082 missing valuess.
- For column name 'hardship_type' there aren 745844 missing valuess.
- For column name 'hardship_reason' there aren 745844 missing valuess.
- For column name 'hardship_status' there aren 745844 missing valuess.
- For column name 'deferral_term' there aren 745844 missing valuess.
- For column name 'hardship_amount' there aren 745844 missing valuess.
- For column name 'hardship_start_date' there aren 745844 missing valuess.
- For column name 'hardship_end_date' there aren 745844 missing valuess.
- For column name 'payment_plan_start_date' there aren 745844 missing valuess.
- For column name 'hardship_length' there aren 745844 missing valuess.
- For column name 'hardship_dpd' there aren 745844 missing valuess.
- For column name 'hardship_loan_status' there aren 745844 missing valuess.
- For column name 'orig_projected_additional_accrued_interest' there aren 746723 missing valuess.
- For column name 'hardship_payoff_balance_amount' there aren 745844 missing valuess.
- For column name 'hardship_last_payment_amount' there aren 745844 missing valuess.
- For column name 'debt_settlement_flag_date' there aren 740517 missing valuess.
- For column name 'settlement_status' there aren 740517 missing valuess.
- For column name 'settlement_date' there aren 740517 missing valuess.
- For column name 'settlement_amount' there aren 740517 missing valuess.
- For column name 'settlement_percentage' there aren 740517 missing valuess.
- For column name 'settlement_term' there aren 740517 missing valuess.

How has the average interest rate of a loan varied over time?

If it is a variable rate, the variation of the rate would be determined with the table of payments and not in the table of loans, which is our sample; since the 90's it is rare to use a variable rate. If it is a fixed rate, there is no variation in the rate and it would be int_rate. As for variation of any value it would be necessary to define if it is the programmed one or the real one; the first one is with financial formulas and there are many, the second one is from the data, normally from the payments.

What is the distribution of sub-grades, and can we easily segment borrowers based on prime and sub prime loans

In [91]:

plt.figure(figsize=(16, 6))
sns.countplot(x="sub_grade", data=df, order = df['sub_grade'].value_counts().index)

Out[91]:

<matplotlib.axes._subplots.AxesSubplot at 0x16a730644e0>

But can we easily segment borrowers based on prime and sub prime loans ? Maybe we need less/more categories ? I would segment them depending on if they pay their loan back

In [35]:

df[['sub_grade','loan_amnt','int_rate']].groupby('sub_grade').describe().reset_index()

Out[35]:

	sub_grade	int_rate								loan_amnt
		count	mean	std	min	25%	50%	75%	max	count	mean	std	min	25%	50%	75%	max
0	A1	34251.0	5.544917	0.389030	5.31	5.32	5.32	6.11	6.46	34251.0	15778.950980	9849.357821	1000.0	8400.0	13000.0	20000.0	40000.0
1	A2	25190.0	6.587943	0.381601	6.00	6.19	6.67	6.99	7.07	25190.0	14552.648869	9554.564306	1000.0	7500.0	12000.0	20000.0	40000.0
2	A3	26431.0	7.062373	0.266037	6.00	6.83	7.21	7.21	7.59	26431.0	14292.664863	9544.563417	1000.0	7000.0	12000.0	20000.0	40000.0
3	A4	33057.0	7.557079	0.283809	6.00	7.35	7.39	7.84	8.19	33057.0	15257.654203	9961.010378	1000.0	8000.0	12000.0	20000.0	40000.0
4	A5	34741.0	8.118880	0.285978	6.00	7.96	7.97	8.46	8.81	34741.0	14407.389684	9448.639545	1000.0	7200.0	12000.0	20000.0	40000.0
5	B1	43870.0	9.075405	0.682017	6.00	8.39	9.43	9.44	10.33	43870.0	14550.149305	9491.496416	1000.0	7500.0	12000.0	20000.0	40000.0
6	B2	42643.0	9.892638	0.510048	6.00	9.49	9.93	10.47	10.72	42643.0	14847.658467	9750.694550	1000.0	7200.0	12000.0	20000.0	40000.0
7	B3	41821.0	10.494923	0.545622	6.00	9.99	10.42	11.06	11.39	41821.0	14366.098372	9434.344664	1000.0	7000.0	12000.0	20000.0	40000.0
8	B4	46010.0	11.108060	0.335650	6.00	10.90	10.99	11.44	11.80	46010.0	14328.037927	9387.670226	1000.0	7000.0	12000.0	20000.0	40000.0
9	B5	49638.0	11.844107	0.462537	6.00	11.49	11.53	11.99	12.98	49638.0	14274.199706	9438.912459	1000.0	7000.0	12000.0	20000.0	40000.0
10	C1	50421.0	12.674281	0.472692	6.00	12.61	12.62	12.79	13.56	50421.0	14868.141746	9538.444558	1000.0	7500.0	12000.0	20000.0	40000.0
11	C2	43310.0	13.525962	0.525845	6.00	13.49	13.49	13.59	14.47	43310.0	14893.247518	9528.672902	1000.0	7450.0	12000.0	20000.0	40000.0
12	C3	42908.0	14.082712	0.512439	6.00	13.99	14.07	14.08	15.02	42908.0	15515.432437	9502.942268	1000.0	8000.0	14000.0	21000.0	40000.0
13	C4	43154.0	14.942308	0.636235	6.00	14.49	14.99	15.05	16.14	43154.0	15772.832182	9479.332442	1000.0	8500.0	14000.0	21000.0	40000.0
14	C5	40569.0	15.911166	0.628875	6.00	15.59	16.01	16.02	16.91	40569.0	15884.065419	9564.053499	1000.0	8450.0	14400.0	21525.0	40000.0
15	D1	25053.0	16.962046	0.725618	6.00	16.29	16.99	17.47	17.97	25053.0	15663.602163	9352.801072	1000.0	8325.0	14400.0	21000.0	40000.0
16	D2	23256.0	18.004723	0.691684	6.00	17.99	18.06	18.45	18.94	23256.0	15631.319874	9398.099400	1000.0	8000.0	14400.0	21600.0	40000.0
17	D3	20643.0	18.955224	0.809744	6.00	18.99	19.03	19.42	19.92	20643.0	16024.160732	9436.456261	1000.0	9000.0	15000.0	22250.0	40000.0
18	D4	17112.0	19.819662	0.970901	6.00	19.99	20.00	20.39	20.89	17112.0	16005.144051	9351.079353	1000.0	9175.0	15000.0	22000.0	40000.0
19	D5	14683.0	21.075908	1.350807	6.00	21.45	21.45	21.85	22.35	14683.0	16716.408432	9438.190236	1000.0	10000.0	15000.0	24000.0	40000.0
20	E1	8793.0	21.606883	1.829379	6.00	19.99	22.74	22.91	23.40	8793.0	16125.039804	9578.769471	1000.0	9000.0	15000.0	23000.0	40000.0
21	E2	7513.0	22.540733	2.144985	6.00	20.75	23.88	23.99	24.37	7513.0	16362.631439	9718.351712	1000.0	9000.0	15000.0	24000.0	40000.0
22	E3	7218.0	23.377350	2.274183	6.00	21.18	24.74	24.85	25.34	7218.0	17393.821003	9418.336317	1000.0	10000.0	16000.0	24000.0	40000.0
23	E4	6285.0	24.343181	2.246686	6.00	21.97	25.49	25.82	26.31	6285.0	17965.588703	9295.016271	1000.0	10975.0	16600.0	25000.0	40000.0
24	E5	7036.0	25.514105	1.981731	6.00	25.29	26.30	26.77	27.27	7036.0	18276.190307	8944.332753	1000.0	12000.0	17350.0	25000.0	40000.0
25	F1	3563.0	26.225897	2.774035	6.00	23.13	28.69	28.72	28.72	3563.0	19065.401347	8867.203485	1000.0	12000.0	18000.0	25000.0	40000.0
26	F2	2333.0	26.784303	2.794799	6.00	24.11	26.49	29.69	29.69	2333.0	18874.882126	9168.830490	1000.0	12000.0	18000.0	25000.0	40000.0
27	F3	1877.0	27.699020	2.507817	23.99	24.99	26.99	30.17	30.17	1877.0	19339.637720	9038.327304	1000.0	12000.0	18125.0	26000.0	40000.0
28	F4	1525.0	28.276000	2.494425	6.00	25.88	27.49	30.65	30.65	1525.0	18940.065574	8977.055891	1000.0	12000.0	18000.0	25600.0	40000.0
29	F5	1442.0	28.909563	2.035321	25.78	26.57	30.74	30.75	30.75	1442.0	20080.582524	9452.258874	1000.0	12500.0	19000.0	28000.0	40000.0
30	G1	1264.0	29.582540	1.746886	6.00	28.18	30.79	30.79	30.79	1264.0	20690.901899	9086.732784	1000.0	14000.0	20000.0	28000.0	40000.0
31	G2	715.0	29.560979	1.640559	6.00	28.14	30.84	30.84	30.84	715.0	18973.846154	9553.073015	1000.0	12000.0	17675.0	26600.0	40000.0
32	G3	613.0	29.837504	1.555991	6.00	28.34	30.89	30.89	30.89	613.0	20049.755302	9058.333894	1800.0	12900.0	18425.0	27300.0	40000.0
33	G4	550.0	30.309164	1.388560	6.00	29.96	30.94	30.94	30.94	550.0	20732.318182	8969.498118	1000.0	13000.0	19787.5	28287.5	40000.0
34	G5	512.0	30.622813	0.775062	28.99	30.99	30.99	30.99	30.99	512.0	20290.039062	9246.073949	2000.0	12212.5	19750.0	28000.0	40000.0

Visualise a low-dimensional split in the dataset when the target variable is the grade?

In [155]:

collist = df.columns.tolist()
# you can now select from this list any arbritrary range
df1 = df[collist[0:1]]
# or remove a column
collist.remove('sub_grade')
# now select
df1 = df[collist]

As far as we are considering the grade and not the interest rate we are going to drop the latest

In [156]:

df1 = df1.drop(columns = ['int_rate'])

We transform the grades to numeric values

In [157]:

df1['ascii'] = [ord(x) for x in df1['grade']]

In [158]:

df1 = df1.drop(columns=['grade'])

Here we will first plot the Pearson correlation heatmap and see the correlation of independent variables with the output variable grade. We will only select features which has correlation of above 0.5 (taking absolute value) with the output variable.

In [182]:

#Using Pearson Correlation
cor = df1.corr()

In [180]:

mask = np.zeros_like(cor, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(12,10))
sns.heatmap(cor,
            vmin=-1,
            cmap='coolwarm',
            annot=False,
            mask = mask);

As we can see, no features are highly correlated with the output variable grade, named ascii here to be able to respresent it. Hence we can't have insights from Pearson correlation. However this is not the end of the process. We can still handpick the explenatory features.

In [166]:

count = 0
for y in subset.columns:
    if subset[y].dtype == np.object:
        count+=1
print("number of categorical variables : " + str(count))

print(f"{subset.dtypes}\n")
print(f"Sum of null values in each feature:\n{35 * '-'}")
print(f"{subset.isnull().sum()}")

number of categorical variables : 36
Unnamed: 0                                      int64
id                                            float64
member_id                                     float64
loan_amnt                                       int64
funded_amnt                                     int64
funded_amnt_inv                               float64
term                                           object
int_rate                                      float64
installment                                   float64
grade                                          object
sub_grade                                      object
emp_title                                      object
emp_length                                     object
home_ownership                                 object
annual_inc                                    float64
verification_status                            object
issue_d                                        object
loan_status                                    object
pymnt_plan                                     object
url                                           float64
desc                                           object
purpose                                        object
title                                          object
zip_code                                       object
addr_state                                     object
dti                                           float64
delinq_2yrs                                   float64
earliest_cr_line                               object
inq_last_6mths                                float64
mths_since_last_delinq                        float64
                                               ...   
sec_app_open_acc                              float64
sec_app_revol_util                            float64
sec_app_open_act_il                           float64
sec_app_num_rev_accts                         float64
sec_app_chargeoff_within_12_mths              float64
sec_app_collections_12_mths_ex_med            float64
sec_app_mths_since_last_major_derog           float64
hardship_flag                                  object
hardship_type                                  object
hardship_reason                                object
hardship_status                                object
deferral_term                                 float64
hardship_amount                               float64
hardship_start_date                            object
hardship_end_date                              object
payment_plan_start_date                        object
hardship_length                               float64
hardship_dpd                                  float64
hardship_loan_status                           object
orig_projected_additional_accrued_interest    float64
hardship_payoff_balance_amount                float64
hardship_last_payment_amount                  float64
disbursement_method                            object
debt_settlement_flag                           object
debt_settlement_flag_date                      object
settlement_status                              object
settlement_date                                object
settlement_amount                             float64
settlement_percentage                         float64
settlement_term                               float64
Length: 146, dtype: object

Sum of null values in each feature:
-----------------------------------
Unnamed: 0                                         0
id                                            295358
member_id                                     295358
loan_amnt                                          0
funded_amnt                                        0
funded_amnt_inv                                    0
term                                               0
int_rate                                           0
installment                                        0
grade                                              0
sub_grade                                          0
emp_title                                      20400
emp_length                                     20036
home_ownership                                     0
annual_inc                                         0
verification_status                                0
issue_d                                            0
loan_status                                        0
pymnt_plan                                         0
url                                           295358
desc                                          295348
purpose                                            0
title                                           7834
zip_code                                           0
addr_state                                         0
dti                                              168
delinq_2yrs                                        0
earliest_cr_line                                   0
inq_last_6mths                                     1
mths_since_last_delinq                        144176
                                               ...  
sec_app_open_acc                              287070
sec_app_revol_util                            287222
sec_app_open_act_il                           287070
sec_app_num_rev_accts                         287070
sec_app_chargeoff_within_12_mths              287070
sec_app_collections_12_mths_ex_med            287070
sec_app_mths_since_last_major_derog           292362
hardship_flag                                      0
hardship_type                                 293455
hardship_reason                               293455
hardship_status                               293455
deferral_term                                 293455
hardship_amount                               293455
hardship_start_date                           293455
hardship_end_date                             293455
payment_plan_start_date                       293455
hardship_length                               293455
hardship_dpd                                  293455
hardship_loan_status                          293455
orig_projected_additional_accrued_interest    294215
hardship_payoff_balance_amount                293455
hardship_last_payment_amount                  293455
disbursement_method                                0
debt_settlement_flag                               0
debt_settlement_flag_date                     286344
settlement_status                             286344
settlement_date                               286344
settlement_amount                             286344
settlement_percentage                         286344
settlement_term                               286344
Length: 146, dtype: int64

Things worth knowing for predictive model

the data covers 110909 loans funded by the platform in 2018.
Number of columns with missing values: 74
36 columns have categorical data
the data set is pretty imbalanced : the proportion of G loans is not even 10000 whereas the C1 loans are at least 50000

Cleaning the data¶

As we had observe, some columns like annual_inc, int_rate, etc. may be much useful for building our model but on the other hand, some columns like id, member_id, etc. will not be helping. So we hand pick some columns.

In [116]:

APPLICANT_NUMERIC = ['annual_inc', 'dti', 'loan_amnt', 'installment']
APPLICANT_CATEGORICAL = ['application_type', 'emp_length', 'home_ownership', 'addr_state', 'term']
CREDIT_NUMERIC = ['acc_now_delinq', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy',
                  'bc_util', 'delinq_2yrs', 'delinq_amnt',
                  'open_acc', 'pub_rec', 'revol_util',
                  'revol_bal', 'tot_coll_amt', 'tot_cur_bal', 'total_acc', 'total_rev_hi_lim',
                  'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats',
                  'num_bc_tl', 'num_il_tl', 'num_rev_tl_bal_gt_0', 'pct_tl_nvr_dlq',
                  'percent_bc_gt_75', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
                  'total_il_high_credit_limit', 'all_util',
                  'il_util', 'total_bal_il', 'total_cu_tl']
TARGET = ['grade']

df = pd.read_csv("loans.csv", usecols = APPLICANT_NUMERIC +APPLICANT_CATEGORICAL + CREDIT_NUMERIC + TARGET)

We didn't selected columns like 'title' and 'emp_title' are text which cannot be one-hot encoded / label encoded as they have arbitrary categorical text and very less unique data for each of their categories.

In [125]:

# We order our grade category so order of grades doesn't appear random in graphs
grade_categories = [g for g in "ABCDEFG"]
df["grade"] = df["grade"].astype("category", categories=grade_categories, ordered=True)

# Sanity check that we're working with cleaned data
bad_rows = df.isnull().T.any().T.sum()
if bad_rows > 0:
    print("Rows with null/NaN values: {}".format(bad_rows))
    print("Columns with null/NaN values:")
    print(pd.isnull(df).sum() > 0)
    print("Dropping bad rows...")
    df.dropna(axis=0, how='any', inplace=True)
    print("Rows with null/NaN values: {}".format(df.isnull().T.any().T.sum()))
    
print(df.info(null_counts = True, memory_usage = "deep", verbose = True))

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  This is separate from the ipykernel package so we can avoid doing imports until

Rows with null/NaN values: 197840
Columns with null/NaN values:
loan_amnt                     False
term                          False
installment                   False
grade                         False
emp_length                     True
home_ownership                False
annual_inc                    False
addr_state                    False
dti                            True
delinq_2yrs                   False
open_acc                      False
pub_rec                       False
revol_bal                     False
revol_util                     True
total_acc                     False
application_type              False
acc_now_delinq                False
tot_coll_amt                  False
tot_cur_bal                   False
total_bal_il                   True
il_util                        True
all_util                       True
total_rev_hi_lim              False
total_cu_tl                    True
acc_open_past_24mths          False
avg_cur_bal                    True
bc_open_to_buy                 True
bc_util                        True
delinq_amnt                   False
num_accts_ever_120_pd         False
num_actv_bc_tl                False
num_actv_rev_tl               False
num_bc_sats                   False
num_bc_tl                     False
num_il_tl                     False
num_rev_tl_bal_gt_0           False
pct_tl_nvr_dlq                False
percent_bc_gt_75               True
tot_hi_cred_lim               False
total_bal_ex_mort             False
total_bc_limit                False
total_il_high_credit_limit    False
dtype: bool
Dropping bad rows...
Rows with null/NaN values: 0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 552160 entries, 41131 to 749999
Data columns (total 42 columns):
loan_amnt                     552160 non-null int64
term                          552160 non-null object
installment                   552160 non-null float64
grade                         552160 non-null category
emp_length                    552160 non-null object
home_ownership                552160 non-null object
annual_inc                    552160 non-null float64
addr_state                    552160 non-null object
dti                           552160 non-null float64
delinq_2yrs                   552160 non-null float64
open_acc                      552160 non-null float64
pub_rec                       552160 non-null float64
revol_bal                     552160 non-null int64
revol_util                    552160 non-null float64
total_acc                     552160 non-null float64
application_type              552160 non-null object
acc_now_delinq                552160 non-null float64
tot_coll_amt                  552160 non-null float64
tot_cur_bal                   552160 non-null float64
total_bal_il                  552160 non-null float64
il_util                       552160 non-null float64
all_util                      552160 non-null float64
total_rev_hi_lim              552160 non-null float64
total_cu_tl                   552160 non-null float64
acc_open_past_24mths          552160 non-null float64
avg_cur_bal                   552160 non-null float64
bc_open_to_buy                552160 non-null float64
bc_util                       552160 non-null float64
delinq_amnt                   552160 non-null float64
num_accts_ever_120_pd         552160 non-null float64
num_actv_bc_tl                552160 non-null float64
num_actv_rev_tl               552160 non-null float64
num_bc_sats                   552160 non-null float64
num_bc_tl                     552160 non-null float64
num_il_tl                     552160 non-null float64
num_rev_tl_bal_gt_0           552160 non-null float64
pct_tl_nvr_dlq                552160 non-null float64
percent_bc_gt_75              552160 non-null float64
tot_hi_cred_lim               552160 non-null float64
total_bal_ex_mort             552160 non-null float64
total_bc_limit                552160 non-null float64
total_il_high_credit_limit    552160 non-null float64
dtypes: category(1), float64(34), int64(2), object(5)
memory usage: 325.2 MB
None

In [126]:

df.head()

Out[126]:

	loan_amnt	term	installment	grade	emp_length	home_ownership	annual_inc	addr_state	dti	...	num_bc_sats	num_bc_tl	num_il_tl	num_rev_tl_bal_gt_0	pct_tl_nvr_dlq	percent_bc_gt_75	tot_hi_cred_lim	total_bal_ex_mort	total_bc_limit	total_il_high_credit_limit
41131	24000	36 months	757.51	B	1 year	MORTGAGE	80000.0	WA	11.80	...	11.0	11.0	29.0	7.0	100.0	9.1	501190.0	167271.0	137400.0	121695.0
41135	8000	36 months	257.39	B	6 years	RENT	55000.0	CO	10.16	...	4.0	13.0	6.0	4.0	95.7	25.0	27933.0	21907.0	7800.0	17349.0
41136	19975	60 months	453.27	C	1 year	MORTGAGE	92000.0	GA	19.05	...	4.0	9.0	36.0	6.0	83.0	50.0	406013.0	211797.0	12300.0	217689.0
41139	24000	60 months	603.85	D	3 years	RENT	72500.0	CA	22.89	...	5.0	7.0	9.0	7.0	100.0	80.0	73333.0	64376.0	17300.0	41233.0
41141	20000	60 months	461.96	C	8 years	MORTGAGE	62000.0	NC	7.14	...	6.0	10.0	5.0	6.0	90.9	16.7	188975.0	14423.0	18800.0	6914.0

5 rows × 42 columns

In [127]:

import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as ticker

sns.reset_orig()
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111)
cax = ax.matshow(df.corr(), cmap=plt.cm.Blues)
fig.colorbar(cax)

ax.set_xticklabels(df, rotation=90)
ax.set_yticklabels(df)
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

Next we can take a look at the distribution of some values by loan grade for some features of interest using "violin" charts.

In [128]:

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
plt.rcParams["figure.figsize"] = (20,8)
plt.rcParams["axes.titlesize"] = 18
plt.rcParams["axes.labelsize"] = 16
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12

# Create violinplot
plt.subplot(121)
v1 = sns.violinplot(x = "revol_util", y="grade", data=df)
v1.axes.set_title("Amount of Credit Line Used Relative to Total Credit Available by Loan Grade", fontsize=20)

plt.subplot(122)
v2 = sns.violinplot(x = "acc_open_past_24mths", y="grade", data=df)
v2.axes.set_title("Accts Opened in Past 24 Months by Loan Grade", fontsize=20)

# Show the plot
plt.show()

The distribution of the credit line used relative to total credit available by loan grade scores give A grade loans having higher values than the G grade loans. There is an interesting trend on the second graph with distribution of accounts opened in the past 24 months for higher grade loans relative to the lower grade loans. So even if the correlation is much lower, the inner distribution of these accounts oppenings give some major insights.

Predicting model¶

We started with a model with PCA and one hot encoding on every categorical variables. It created a dataframe of 214 features, 555M+ large. With the same model as below it first gave 30% of good classifcations, with KFold it dropped to a mean of 26.10% (with a standard deviation of 7.89%). So we decided to handpick the data and use the architecure the same way James Andersen did in his Medium article.

We prepare a machine learning model on the data. We first need to split the data into "X" for the data that will be fed into the model as input and "Y" the column we're trying to predict: loan grade.

Then we'll split the data by row; 80% will be used to train/validate the model and 20% will be used to test how well the model performs on unseen data.

Last the target, which is multi-class, has to been one-hot encoded.

In [129]:

def split_data(data, continuous_cols, categorical_cols, label_col, test_size=0.2, row_limit=None):
    """Divide the data in to X and y dataframes and train/test split"""

    # Subset to get feature data
    x_df = data.loc[:, continuous_cols + categorical_cols]

    # Update our X dataframe with categorical values replaced by one-hot encoded values
    x_df = encode_categorical(x_df, categorical_cols)

    # Ensure all numeric features are on the same scale
    for col in continuous_cols:
        x_df[col] = (x_df[col] - x_df[col].mean()) / x_df[col].std()

    # Specify the target labels and flatten the array
    y = pd.get_dummies(data[label_col])

    # When requested, limit the amount of data that will be used
    # Using entire data set can be painfully slow without a GPU!
    if row_limit != None:
        rows = np.random.binomial(1, 0.1, size=len(data)).astype('bool')
        x_df = x_df[rows]
        y = y[rows]
        print("Using only a sample of {} observations".format(x_df.shape[0]))
        #data = self.lcdata.sample(int(row_limit))
    else:
        print("Using the full set of {} observations".format(data.shape[0]))
        #data = self.lcdata

    # Create train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x_df, y, test_size=test_size, random_state=23)
    print("x_train contains {} rows and {} features".format(x_train.shape[0], x_train.shape[1]))
    return x_train, x_test, y_train, y_test

In [130]:

def encode_categorical(frame, categorical_cols):
    """Replace categorical variables with one-hot encoding in-place"""
    for col in categorical_cols:
        # use get_dummies() to do one hot encoding of categorical column
        frame = frame.merge(pd.get_dummies(frame[col]), left_index=True, right_index=True)
        
        # drop the original categorical column
        frame.drop(col, axis=1, inplace=True)
  
    return frame

In [131]:

# Divide the data set into training and test sets
x_train, x_test, y_train, y_test = split_data(df, APPLICANT_NUMERIC + CREDIT_NUMERIC,
                  APPLICANT_CATEGORICAL,
                  TARGET,
                  test_size = 0.2,
                  row_limit = os.environ.get("sample"))

# Inspect our training data
print("x_train contains {} rows and {} features".format(x_train.shape[0], x_train.shape[1]))
print("y_train contains {} rows and {} features".format(y_train.shape[0], y_train.shape[1]))

print("x_test contains {} rows and {} features".format(x_test.shape[0], x_test.shape[1]))
print("y_test contains {} rows and {} features".format(y_test.shape[0], y_test.shape[1]))

# Loan grade has been one-hot encoded
print("Sample one-hot encoded 'y' value: \n{}".format(y_train.sample()))

Using the full set of 552160 observations
x_train contains 441728 rows and 105 features
x_train contains 441728 rows and 105 features
y_train contains 441728 rows and 7 features
x_test contains 110432 rows and 105 features
y_test contains 110432 rows and 7 features
Sample one-hot encoded 'y' value: 
        grade_A  grade_B  grade_C  grade_D  grade_E  grade_F  grade_G
447861        1        0        0        0        0        0        0

In [132]:

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.constraints import maxnorm

def create_model(input_dim, output_dim):
    # create model
    model = Sequential()
    # input layer
    model.add(Dense(100, input_dim=input_dim, activation='relu', kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    
    # hidden layer
    model.add(Dense(60, activation='relu', kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    
    # output layer
    model.add(Dense(output_dim, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [134]:

from keras.callbacks import ModelCheckpoint
from keras.models import load_model



model = create_model(x_train.shape[1], y_train.shape[1])

epochs =  35
batch_sz = 64

print("Beginning model training with batch size {} and {} epochs".format(batch_sz, epochs))

checkpoint = ModelCheckpoint("lc_model.h5", monitor='val_acc', verbose=0, save_best_only=True, mode='auto', period=1)
# train the model
history = model.fit(x_train.as_matrix(),
                y_train.as_matrix(),
                validation_split=0.2,
                epochs=epochs,  
                batch_size=batch_sz, 
                verbose=2,
                callbacks=[checkpoint])

# revert to the best model encountered during training
model = load_model("lc_model.h5")

Beginning model training with batch size 64 and 35 epochs

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  from ipykernel import kernelapp as app
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:16: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  app.launch_new_instance()

Train on 353382 samples, validate on 88346 samples
Epoch 1/35
 - 31s - loss: 0.9299 - acc: 0.6013 - val_loss: 0.4764 - val_acc: 0.8208
Epoch 2/35
 - 25s - loss: 0.5274 - acc: 0.7795 - val_loss: 0.4230 - val_acc: 0.8219
Epoch 3/35
 - 24s - loss: 0.4681 - acc: 0.8038 - val_loss: 0.3759 - val_acc: 0.8448
Epoch 4/35
 - 27s - loss: 0.4437 - acc: 0.8144 - val_loss: 0.3513 - val_acc: 0.8593
Epoch 5/35
 - 26s - loss: 0.4300 - acc: 0.8205 - val_loss: 0.3510 - val_acc: 0.8581
Epoch 6/35
 - 29s - loss: 0.4197 - acc: 0.8249 - val_loss: 0.3651 - val_acc: 0.8478
Epoch 7/35
 - 31s - loss: 0.4119 - acc: 0.8279 - val_loss: 0.3275 - val_acc: 0.8648
Epoch 8/35
 - 29s - loss: 0.4055 - acc: 0.8312 - val_loss: 0.3168 - val_acc: 0.8702
Epoch 9/35
 - 27s - loss: 0.3998 - acc: 0.8335 - val_loss: 0.3298 - val_acc: 0.8652
Epoch 10/35
 - 27s - loss: 0.3958 - acc: 0.8362 - val_loss: 0.3154 - val_acc: 0.8697
Epoch 11/35
 - 26s - loss: 0.3931 - acc: 0.8365 - val_loss: 0.3155 - val_acc: 0.8736
Epoch 12/35
 - 26s - loss: 0.3868 - acc: 0.8391 - val_loss: 0.3006 - val_acc: 0.8793
Epoch 13/35
 - 25s - loss: 0.3839 - acc: 0.8406 - val_loss: 0.3218 - val_acc: 0.8647
Epoch 14/35
 - 27s - loss: 0.3805 - acc: 0.8424 - val_loss: 0.3137 - val_acc: 0.8698
Epoch 15/35
 - 26s - loss: 0.3773 - acc: 0.8428 - val_loss: 0.3047 - val_acc: 0.8742
Epoch 16/35
 - 26s - loss: 0.3746 - acc: 0.8453 - val_loss: 0.2908 - val_acc: 0.8826
Epoch 17/35
 - 26s - loss: 0.3735 - acc: 0.8454 - val_loss: 0.3047 - val_acc: 0.8745
Epoch 18/35
 - 25s - loss: 0.3701 - acc: 0.8465 - val_loss: 0.2946 - val_acc: 0.8788
Epoch 19/35
 - 29s - loss: 0.3694 - acc: 0.8462 - val_loss: 0.2958 - val_acc: 0.8772
Epoch 20/35
 - 26s - loss: 0.3673 - acc: 0.8483 - val_loss: 0.3035 - val_acc: 0.8773
Epoch 21/35
 - 25s - loss: 0.3650 - acc: 0.8483 - val_loss: 0.2870 - val_acc: 0.8847
Epoch 22/35
 - 26s - loss: 0.3648 - acc: 0.8491 - val_loss: 0.2881 - val_acc: 0.8834
Epoch 23/35
 - 28s - loss: 0.3640 - acc: 0.8489 - val_loss: 0.3021 - val_acc: 0.8751
Epoch 24/35
 - 30s - loss: 0.3612 - acc: 0.8502 - val_loss: 0.3163 - val_acc: 0.8665
Epoch 25/35
 - 26s - loss: 0.3624 - acc: 0.8499 - val_loss: 0.2869 - val_acc: 0.8822
Epoch 26/35
 - 25s - loss: 0.3596 - acc: 0.8512 - val_loss: 0.2905 - val_acc: 0.8832
Epoch 27/35
 - 26s - loss: 0.3563 - acc: 0.8528 - val_loss: 0.2839 - val_acc: 0.8822
Epoch 28/35
 - 27s - loss: 0.3570 - acc: 0.8524 - val_loss: 0.3027 - val_acc: 0.8745
Epoch 29/35
 - 25s - loss: 0.3568 - acc: 0.8529 - val_loss: 0.3116 - val_acc: 0.8683
Epoch 30/35
 - 24s - loss: 0.3550 - acc: 0.8537 - val_loss: 0.2901 - val_acc: 0.8800
Epoch 31/35
 - 24s - loss: 0.3542 - acc: 0.8530 - val_loss: 0.2898 - val_acc: 0.8797
Epoch 32/35
 - 24s - loss: 0.3531 - acc: 0.8544 - val_loss: 0.2853 - val_acc: 0.8841
Epoch 33/35
 - 24s - loss: 0.3541 - acc: 0.8536 - val_loss: 0.2992 - val_acc: 0.8760
Epoch 34/35
 - 24s - loss: 0.3541 - acc: 0.8538 - val_loss: 0.2810 - val_acc: 0.8867
Epoch 35/35
 - 24s - loss: 0.3550 - acc: 0.8537 - val_loss: 0.3013 - val_acc: 0.8760

In [135]:

plt.rcParams["figure.figsize"] = [16, 5]
plt.rcParams["axes.titlesize"] = 18
plt.rcParams["axes.labelsize"] = 16
plt.rcParams["xtick.labelsize"] = 12
plt.rcParams["ytick.labelsize"] = 12
plt.rcParams['legend.loc'] = 'upper right'
plt.rcParams['legend.framealpha'] = 0.7
plt.rcParams["legend.fontsize"] = 14

# Plot accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'valid.'], loc='upper left')

# Plot loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'valid.'], loc='upper left')

# adjust size
plt.tight_layout()
plt.show()

We can now see how our train model performs on the "test" data that we held out of the training data. We measure the performance using the F1 score.

In [137]:

import numpy as np
from sklearn.metrics import f1_score

y_pred = model.predict(x_test.as_matrix())

# Revert one-hot encoding to classes
y_pred_classes = pd.DataFrame((y_pred.argmax(1)[:,None] == np.arange(y_pred.shape[1])),
                              columns=y_test.columns,
                              index=y_test.index)

y_test_vals = y_test.idxmax(1)
y_pred_vals = y_pred_classes.idxmax(1)

# F1 score
# Use idxmax() to convert back from one-hot encoding
f1 = f1_score(y_test_vals, y_pred_vals, average='weighted')
print("Test Set Accuracy: {:.2%}   (But results would have been better if we trained on the FULL dataset)".format(f1))

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  after removing the cwd from sys.path.

Test Set Accuracy: 88.54%   (But results would have been better if we trained on the FULL dataset)

In [141]:

from sklearn.metrics import confusion_matrix

from importlib import reload

# Confusion matrix
conf_matrix = confusion_matrix(y_test_vals, y_pred_vals).astype(float)

In [143]:

# drawing it 

def fmt_pct(x, pos):
    """Format as percentage"""
    return '{:.0%}'.format(x)

classes = [l for l in "ABCDEFG"]
title='Confusion matrix'

correct_mask = np.ones(conf_matrix.shape, dtype=bool)
wrong_mask = np.zeros(conf_matrix.shape, dtype=bool)
for i in range(conf_matrix.shape[0]):
    correct_mask[i,i] = False
    wrong_mask[i,i] = True
    row_sum = sum(conf_matrix[i])
    for j in range(conf_matrix.shape[1]):
        conf_matrix[i, j] = conf_matrix[i, j] / row_sum

correct_matrix = np.ma.masked_array(conf_matrix, mask=correct_mask)
wrong_matrix = np.ma.masked_array(conf_matrix, wrong_mask)

fig,ax = plt.subplots(figsize=(8, 8))
blue_map = colors.LinearSegmentedColormap.from_list('custom blue', ['#f1eef6', '#025a90'], N=10)
blue_map.set_under(color='white')

red_map = colors.LinearSegmentedColormap.from_list('custom blue', ['#feeddd', '#a83500'], N=10)
red_map.set_under(color='white')

plot_correct = ax.imshow(correct_matrix,interpolation='nearest',cmap=blue_map, vmin=0.000001, vmax=1)
plot_wrong = ax.imshow(wrong_matrix,interpolation='nearest',cmap=red_map, vmin=0.000001, vmax=1)

colorbar_wrong = plt.colorbar(plot_wrong, shrink=0.35, orientation='horizontal', pad=-0.11, format=ticker.FuncFormatter(fmt_pct))
colorbar_correct = plt.colorbar(plot_correct, shrink=0.35, orientation='horizontal', pad=0.03)
plt.xlabel('Predicted Values')
plt.ylabel('True Values')
ax.xaxis.set_label_position('top')
ax.xaxis.tick_top()
ax.set_xticklabels([''] + classes)
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_yticklabels([''] + classes)
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
colorbar_correct.ax.text(-0.3,0.25,'CORRECT',rotation=0)
colorbar_wrong.ax.text(-0.35,0.25,'INCORRECT',rotation=0)
plt.setp(colorbar_correct.ax.get_xticklabels(), visible=False)

thresh = conf_matrix.max() / 2.
for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
    cell_label = "{:.1%}".format(conf_matrix[i, j])
    plt.text(j, i, cell_label,
             horizontalalignment="center",
             color="white" if conf_matrix[i, j] > thresh else "black")

plt.tight_layout()
plt.show()

Conclusion¶

From the very beginning James Anderson's model performed better than my model, however his neural network architecture was almost the same as ours. The first thing he did was to remove any rows with nas from the columns he selected. So he didn't selected the columns where there were too many missing values, unlike me. This created a lighter model.

Yet, if his results were already pretty good we improved them by removing a few features. However the lower the grade the lower were our classification accuracy. Reaching to 19% for G grade borrowers, which were mainly classified in F and some in E. So the next thing to work on would be to improve features selection.

In [ ]: