In [3]:

import graphlab 
graphlab.canvas.set_target('ipynb')

Load Loan Dataset¶

In [4]:

loans = graphlab.SFrame('lending-club-data.gl/')

2016-03-24 07:49:12,384 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: C:\Users\erigits\AppData\Local\Temp\graphlab_server_1458794950.log.0

This non-commercial license of GraphLab Create is assigned to ericgithua2011@gmail.com and will expire on November 01, 2016. For commercial licensing options, visit https://dato.com/buy/.

In [5]:

loans.head()

Out[5]:

id	member_id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	sub_grade
1077501	1296599	5000	5000	4975	36 months	10.65	162.87	B	B2
1077430	1314167	2500	2500	2500	60 months	15.27	59.83	C	C4
1077175	1313524	2400	2400	2400	36 months	15.96	84.33	C	C5
1076863	1277178	10000	10000	10000	36 months	13.49	339.31	C	C1
1075269	1311441	5000	5000	5000	36 months	7.9	156.46	A	A4
1072053	1288686	3000	3000	3000	36 months	18.64	109.43	E	E1
1071795	1306957	5600	5600	5600	60 months	21.28	152.39	F	F2
1071570	1306721	5375	5375	5350	60 months	12.69	121.45	B	B5
1070078	1305201	6500	6500	6500	60 months	14.65	153.45	C	C3
1069908	1305008	12000	12000	12000	36 months	12.69	402.54	B	B5

emp_title	emp_length	home_ownership	annual_inc	is_inc_v	issue_d	loan_status	pymnt_plan
	10+ years	RENT	24000	Verified	20111201T000000	Fully Paid	n
Ryder	< 1 year	RENT	30000	Source Verified	20111201T000000	Charged Off	n
	10+ years	RENT	12252	Not Verified	20111201T000000	Fully Paid	n
AIR RESOURCES BOARD	10+ years	RENT	49200	Source Verified	20111201T000000	Fully Paid	n
Veolia Transportaton	3 years	RENT	36000	Source Verified	20111201T000000	Fully Paid	n
MKC Accounting	9 years	RENT	48000	Source Verified	20111201T000000	Fully Paid	n
	4 years	OWN	40000	Source Verified	20111201T000000	Charged Off	n
Starbucks	< 1 year	RENT	15000	Verified	20111201T000000	Charged Off	n
Southwest Rural metro	5 years	OWN	72000	Not Verified	20111201T000000	Fully Paid	n
UCLA	10+ years	OWN	75000	Source Verified	20111201T000000	Fully Paid	n

url	desc	purpose	title	zip_code
https://www.lendingclub.c om/browse/loanDetail. ...	Borrower added on 12/22/11 > I need to ...	credit_card	Computer	860xx
https://www.lendingclub.c om/browse/loanDetail. ...	Borrower added on 12/22/11 > I plan to use ...	car	bike	309xx
https://www.lendingclub.c om/browse/loanDetail. ...		small_business	real estate business	606xx
https://www.lendingclub.c om/browse/loanDetail. ...	Borrower added on 12/21/11 > to pay for ...	other	personel	917xx
https://www.lendingclub.c om/browse/loanDetail. ...		wedding	My wedding loan I promise to pay back ...	852xx
https://www.lendingclub.c om/browse/loanDetail. ...	Borrower added on 12/16/11 > Downpayment ...	car	Car Downpayment	900xx
https://www.lendingclub.c om/browse/loanDetail. ...	Borrower added on 12/21/11 > I own a small ...	small_business	Expand Business & Buy Debt Portfolio ...	958xx
https://www.lendingclub.c om/browse/loanDetail. ...	Borrower added on 12/16/11 > I'm trying to ...	other	Building my credit history. ...	774xx
https://www.lendingclub.c om/browse/loanDetail. ...	Borrower added on 12/15/11 > I had recived ...	debt_consolidation	High intrest Consolidation ...	853xx
https://www.lendingclub.c om/browse/loanDetail. ...		debt_consolidation	Consolidation	913xx

addr_state	dti	earliest_cr_line	inq_last_6mths	mths_since_last_delinq	mths_since_last_record
AZ	27.65	19850101T000000	1	None	None
GA	1.0	19990401T000000	5	None	None
IL	8.72	20011101T000000	2	None	None
CA	20.0	19960201T000000	1	35	None
AZ	11.2	20041101T000000	3	None	None
CA	5.35	20070101T000000	2	None	None
CA	5.55	20040401T000000	2	None	None
TX	18.08	20040901T000000	0	None	None
AZ	16.12	19980101T000000	2	None	None
CA	10.78	19891001T000000	0	None	None

open_acc	revol_bal	revol_util	total_acc	initial_list_status	total_pymnt
3	13648	83.7	9	f	5861.07
3	1687	9.4	4	f	1008.71
2	2956	98.5	10	f	3003.65
10	5598	21.0	37	f	12226.3
9	7963	28.3	12	f	5631.38
4	8221	87.5	4	f	3938.14
11	5210	32.6	13	f	646.02
2	9279	36.5	3	f	1476.19
14	4032	20.6	23	f	7677.52
12	23336	67.1	34	f	13943.1

total_pymnt_inv	...
5831.78	...
1008.71	...
3003.65	...
12226.3	...
5631.38	...
3938.14	...
646.02	...
1469.34	...
7677.52	...
13943.1	...

[10 rows x 68 columns]

In [7]:

# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')

In [8]:

loans['safe_loans'].show(view = 'Categorical')

In [18]:

features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                   # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

In [10]:

safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150

In [11]:

print "Percentage of safe loans  :", (len(safe_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))
print "Percentage of risky loans :", (len(risky_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))

Percentage of safe loans  : 0.811185331996
Percentage of risky loans : 0.188814668004

In [12]:

# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))

risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)

In [13]:

print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

Percentage of safe loans                 : 0.502236174422
Percentage of risky loans                : 0.497763825578
Total number of loans in our new dataset : 46508

In [22]:

train_data, test_data = loans_data.random_split(.8, seed=1)
print len(train_data)
print len(test_data)

37224
9284

In [23]:

Logistic_Regression_Model = graphlab.logistic_classifier.create(train_data,
                                                                target = loans_data,
                                                                validation_set=None) 

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-23-122866ccca7b> in <module>()
      1 Logistic_Regression_Model = graphlab.logistic_classifier.create(train_data,
      2                                                                 target = loans_data,
----> 3                                                                 validation_set=None) 

C:\Anaconda2\lib\site-packages\graphlab\toolkits\classifier\logistic_classifier.pyc in create(dataset, target, features, l2_penalty, l1_penalty, solver, feature_rescaling, convergence_threshold, step_size, lbfgs_memory_level, max_iterations, class_weights, validation_set, verbose)
    306                         lbfgs_memory_level = lbfgs_memory_level,
    307                         max_iterations = max_iterations,
--> 308                         class_weights = class_weights)
    309 
    310     return LogisticClassifier(model.__proxy__)

C:\Anaconda2\lib\site-packages\graphlab\toolkits\_supervised_learning.pyc in create(dataset, target, model_name, features, validation_set, verbose, distributed, **kwargs)
    398 
    399     # Target
--> 400     target_sframe = _toolkits_select_columns(dataset, [target])
    401 
    402     # Features

C:\Anaconda2\lib\site-packages\graphlab\toolkits\_internal_utils.pyc in _toolkits_select_columns(dataset, columns)
    296     """
    297     try:
--> 298         return dataset.select_columns(columns)
    299 
    300     except RuntimeError:

C:\Anaconda2\lib\site-packages\graphlab\data_structures\sframe.pyc in select_columns(self, keylist)
   3630         if not (all([isinstance(x, str) or isinstance(x, type) or isinstance(x, bytes)
   3631                      for x in keylist])):
-> 3632             raise TypeError("Invalid key type: must be str, bytes or type")
   3633 
   3634         column_names_set = set(self.column_names())

TypeError: Invalid key type: must be str, bytes or type