In [3]:
import graphlab 
graphlab.canvas.set_target('ipynb')

Load Loan Dataset

In [4]:
loans = graphlab.SFrame('lending-club-data.gl/')
2016-03-24 07:49:12,384 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: C:\Users\erigits\AppData\Local\Temp\graphlab_server_1458794950.log.0
This non-commercial license of GraphLab Create is assigned to [email protected] and will expire on November 01, 2016. For commercial licensing options, visit https://dato.com/buy/.
In [5]:
loans.head()
Out[5]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade
1077501 1296599 5000 5000 4975 36 months 10.65 162.87 B B2
1077430 1314167 2500 2500 2500 60 months 15.27 59.83 C C4
1077175 1313524 2400 2400 2400 36 months 15.96 84.33 C C5
1076863 1277178 10000 10000 10000 36 months 13.49 339.31 C C1
1075269 1311441 5000 5000 5000 36 months 7.9 156.46 A A4
1072053 1288686 3000 3000 3000 36 months 18.64 109.43 E E1
1071795 1306957 5600 5600 5600 60 months 21.28 152.39 F F2
1071570 1306721 5375 5375 5350 60 months 12.69 121.45 B B5
1070078 1305201 6500 6500 6500 60 months 14.65 153.45 C C3
1069908 1305008 12000 12000 12000 36 months 12.69 402.54 B B5
emp_title emp_length home_ownership annual_inc is_inc_v issue_d loan_status pymnt_plan
10+ years RENT 24000 Verified 20111201T000000 Fully Paid n
Ryder < 1 year RENT 30000 Source Verified 20111201T000000 Charged Off n
10+ years RENT 12252 Not Verified 20111201T000000 Fully Paid n
AIR RESOURCES BOARD 10+ years RENT 49200 Source Verified 20111201T000000 Fully Paid n
Veolia Transportaton 3 years RENT 36000 Source Verified 20111201T000000 Fully Paid n
MKC Accounting 9 years RENT 48000 Source Verified 20111201T000000 Fully Paid n
4 years OWN 40000 Source Verified 20111201T000000 Charged Off n
Starbucks < 1 year RENT 15000 Verified 20111201T000000 Charged Off n
Southwest Rural metro 5 years OWN 72000 Not Verified 20111201T000000 Fully Paid n
UCLA 10+ years OWN 75000 Source Verified 20111201T000000 Fully Paid n
url desc purpose title zip_code
https://www.lendingclub.c
om/browse/loanDetail. ...
Borrower added on
12/22/11 > I need to ...
credit_card Computer 860xx
https://www.lendingclub.c
om/browse/loanDetail. ...
Borrower added on
12/22/11 > I plan to use ...
car bike 309xx
https://www.lendingclub.c
om/browse/loanDetail. ...
small_business real estate business 606xx
https://www.lendingclub.c
om/browse/loanDetail. ...
Borrower added on
12/21/11 > to pay for ...
other personel 917xx
https://www.lendingclub.c
om/browse/loanDetail. ...
wedding My wedding loan I promise
to pay back ...
852xx
https://www.lendingclub.c
om/browse/loanDetail. ...
Borrower added on
12/16/11 > Downpayment ...
car Car Downpayment 900xx
https://www.lendingclub.c
om/browse/loanDetail. ...
Borrower added on
12/21/11 > I own a small ...
small_business Expand Business & Buy
Debt Portfolio ...
958xx
https://www.lendingclub.c
om/browse/loanDetail. ...
Borrower added on
12/16/11 > I'm trying to ...
other Building my credit
history. ...
774xx
https://www.lendingclub.c
om/browse/loanDetail. ...
Borrower added on
12/15/11 > I had recived ...
debt_consolidation High intrest
Consolidation ...
853xx
https://www.lendingclub.c
om/browse/loanDetail. ...
debt_consolidation Consolidation 913xx
addr_state dti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record
AZ 27.65 0 19850101T000000 1 None None
GA 1.0 0 19990401T000000 5 None None
IL 8.72 0 20011101T000000 2 None None
CA 20.0 0 19960201T000000 1 35 None
AZ 11.2 0 20041101T000000 3 None None
CA 5.35 0 20070101T000000 2 None None
CA 5.55 0 20040401T000000 2 None None
TX 18.08 0 20040901T000000 0 None None
AZ 16.12 0 19980101T000000 2 None None
CA 10.78 0 19891001T000000 0 None None
open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt
3 0 13648 83.7 9 f 0.0 0.0 5861.07
3 0 1687 9.4 4 f 0.0 0.0 1008.71
2 0 2956 98.5 10 f 0.0 0.0 3003.65
10 0 5598 21.0 37 f 0.0 0.0 12226.3
9 0 7963 28.3 12 f 0.0 0.0 5631.38
4 0 8221 87.5 4 f 0.0 0.0 3938.14
11 0 5210 32.6 13 f 0.0 0.0 646.02
2 0 9279 36.5 3 f 0.0 0.0 1476.19
14 0 4032 20.6 23 f 0.0 0.0 7677.52
12 0 23336 67.1 34 f 0.0 0.0 13943.1
total_pymnt_inv ...
5831.78 ...
1008.71 ...
3003.65 ...
12226.3 ...
5631.38 ...
3938.14 ...
646.02 ...
1469.34 ...
7677.52 ...
13943.1 ...
[10 rows x 68 columns]
In [7]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
In [8]:
loans['safe_loans'].show(view = 'Categorical')
In [18]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                   # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]
In [10]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)
Number of safe loans  : 99457
Number of risky loans : 23150
In [11]:
print "Percentage of safe loans  :", (len(safe_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))
print "Percentage of risky loans :", (len(risky_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))
Percentage of safe loans  : 0.811185331996
Percentage of risky loans : 0.188814668004
In [12]:
# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))

risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)
In [13]:
print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)
Percentage of safe loans                 : 0.502236174422
Percentage of risky loans                : 0.497763825578
Total number of loans in our new dataset : 46508
In [22]:
train_data, test_data = loans_data.random_split(.8, seed=1)
print len(train_data)
print len(test_data)
37224
9284
In [23]:
Logistic_Regression_Model = graphlab.logistic_classifier.create(train_data,
                                                                target = loans_data,
                                                                validation_set=None) 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-23-122866ccca7b> in <module>()
      1 Logistic_Regression_Model = graphlab.logistic_classifier.create(train_data,
      2                                                                 target = loans_data,
----> 3                                                                 validation_set=None) 

C:\Anaconda2\lib\site-packages\graphlab\toolkits\classifier\logistic_classifier.pyc in create(dataset, target, features, l2_penalty, l1_penalty, solver, feature_rescaling, convergence_threshold, step_size, lbfgs_memory_level, max_iterations, class_weights, validation_set, verbose)
    306                         lbfgs_memory_level = lbfgs_memory_level,
    307                         max_iterations = max_iterations,
--> 308                         class_weights = class_weights)
    309 
    310     return LogisticClassifier(model.__proxy__)

C:\Anaconda2\lib\site-packages\graphlab\toolkits\_supervised_learning.pyc in create(dataset, target, model_name, features, validation_set, verbose, distributed, **kwargs)
    398 
    399     # Target
--> 400     target_sframe = _toolkits_select_columns(dataset, [target])
    401 
    402     # Features

C:\Anaconda2\lib\site-packages\graphlab\toolkits\_internal_utils.pyc in _toolkits_select_columns(dataset, columns)
    296     """
    297     try:
--> 298         return dataset.select_columns(columns)
    299 
    300     except RuntimeError:

C:\Anaconda2\lib\site-packages\graphlab\data_structures\sframe.pyc in select_columns(self, keylist)
   3630         if not (all([isinstance(x, str) or isinstance(x, type) or isinstance(x, bytes)
   3631                      for x in keylist])):
-> 3632             raise TypeError("Invalid key type: must be str, bytes or type")
   3633 
   3634         column_names_set = set(self.column_names())

TypeError: Invalid key type: must be str, bytes or type