import graphlab
# Direct GraphLab Canvas output to the IPython notebook ('ipynb' target).
graphlab.canvas.set_target('ipynb')
# Load the loans dataset as an SFrame from the 'lending-club-data.gl/' path
# (relative to the notebook's working directory).
loans = graphlab.SFrame('lending-club-data.gl/')
2016-03-24 07:49:12,384 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: C:\Users\erigits\AppData\Local\Temp\graphlab_server_1458794950.log.0
This non-commercial license of GraphLab Create is assigned to ericgithua2011@gmail.com and will expire on November 01, 2016. For commercial licensing options, visit https://dato.com/buy/.
# Preview the first rows of the loans SFrame; as the last expression in the
# cell, the result is rendered as the cell's output.
loans.head()
id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade |
---|---|---|---|---|---|---|---|---|---|
1077501 | 1296599 | 5000 | 5000 | 4975 | 36 months | 10.65 | 162.87 | B | B2 |
1077430 | 1314167 | 2500 | 2500 | 2500 | 60 months | 15.27 | 59.83 | C | C4 |
1077175 | 1313524 | 2400 | 2400 | 2400 | 36 months | 15.96 | 84.33 | C | C5 |
1076863 | 1277178 | 10000 | 10000 | 10000 | 36 months | 13.49 | 339.31 | C | C1 |
1075269 | 1311441 | 5000 | 5000 | 5000 | 36 months | 7.9 | 156.46 | A | A4 |
1072053 | 1288686 | 3000 | 3000 | 3000 | 36 months | 18.64 | 109.43 | E | E1 |
1071795 | 1306957 | 5600 | 5600 | 5600 | 60 months | 21.28 | 152.39 | F | F2 |
1071570 | 1306721 | 5375 | 5375 | 5350 | 60 months | 12.69 | 121.45 | B | B5 |
1070078 | 1305201 | 6500 | 6500 | 6500 | 60 months | 14.65 | 153.45 | C | C3 |
1069908 | 1305008 | 12000 | 12000 | 12000 | 36 months | 12.69 | 402.54 | B | B5 |
emp_title | emp_length | home_ownership | annual_inc | is_inc_v | issue_d | loan_status | pymnt_plan |
---|---|---|---|---|---|---|---|
10+ years | RENT | 24000 | Verified | 20111201T000000 | Fully Paid | n | |
Ryder | < 1 year | RENT | 30000 | Source Verified | 20111201T000000 | Charged Off | n |
10+ years | RENT | 12252 | Not Verified | 20111201T000000 | Fully Paid | n | |
AIR RESOURCES BOARD | 10+ years | RENT | 49200 | Source Verified | 20111201T000000 | Fully Paid | n |
Veolia Transportaton | 3 years | RENT | 36000 | Source Verified | 20111201T000000 | Fully Paid | n |
MKC Accounting | 9 years | RENT | 48000 | Source Verified | 20111201T000000 | Fully Paid | n |
4 years | OWN | 40000 | Source Verified | 20111201T000000 | Charged Off | n | |
Starbucks | < 1 year | RENT | 15000 | Verified | 20111201T000000 | Charged Off | n |
Southwest Rural metro | 5 years | OWN | 72000 | Not Verified | 20111201T000000 | Fully Paid | n |
UCLA | 10+ years | OWN | 75000 | Source Verified | 20111201T000000 | Fully Paid | n |
url | desc | purpose | title | zip_code |
---|---|---|---|---|
https://www.lendingclub.c om/browse/loanDetail. ... |
Borrower added on 12/22/11 > I need to ... |
credit_card | Computer | 860xx |
https://www.lendingclub.c om/browse/loanDetail. ... |
Borrower added on 12/22/11 > I plan to use ... |
car | bike | 309xx |
https://www.lendingclub.c om/browse/loanDetail. ... |
small_business | real estate business | 606xx | |
https://www.lendingclub.c om/browse/loanDetail. ... |
Borrower added on 12/21/11 > to pay for ... |
other | personel | 917xx |
https://www.lendingclub.c om/browse/loanDetail. ... |
wedding | My wedding loan I promise to pay back ... |
852xx | |
https://www.lendingclub.c om/browse/loanDetail. ... |
Borrower added on 12/16/11 > Downpayment ... |
car | Car Downpayment | 900xx |
https://www.lendingclub.c om/browse/loanDetail. ... |
Borrower added on 12/21/11 > I own a small ... |
small_business | Expand Business & Buy Debt Portfolio ... |
958xx |
https://www.lendingclub.c om/browse/loanDetail. ... |
Borrower added on 12/16/11 > I'm trying to ... |
other | Building my credit history. ... |
774xx |
https://www.lendingclub.c om/browse/loanDetail. ... |
Borrower added on 12/15/11 > I had recived ... |
debt_consolidation | High intrest Consolidation ... |
853xx |
https://www.lendingclub.c om/browse/loanDetail. ... |
debt_consolidation | Consolidation | 913xx |
addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record |
---|---|---|---|---|---|---|
AZ | 27.65 | 0 | 19850101T000000 | 1 | None | None |
GA | 1.0 | 0 | 19990401T000000 | 5 | None | None |
IL | 8.72 | 0 | 20011101T000000 | 2 | None | None |
CA | 20.0 | 0 | 19960201T000000 | 1 | 35 | None |
AZ | 11.2 | 0 | 20041101T000000 | 3 | None | None |
CA | 5.35 | 0 | 20070101T000000 | 2 | None | None |
CA | 5.55 | 0 | 20040401T000000 | 2 | None | None |
TX | 18.08 | 0 | 20040901T000000 | 0 | None | None |
AZ | 16.12 | 0 | 19980101T000000 | 2 | None | None |
CA | 10.78 | 0 | 19891001T000000 | 0 | None | None |
open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt |
---|---|---|---|---|---|---|---|---|
3 | 0 | 13648 | 83.7 | 9 | f | 0.0 | 0.0 | 5861.07 |
3 | 0 | 1687 | 9.4 | 4 | f | 0.0 | 0.0 | 1008.71 |
2 | 0 | 2956 | 98.5 | 10 | f | 0.0 | 0.0 | 3003.65 |
10 | 0 | 5598 | 21.0 | 37 | f | 0.0 | 0.0 | 12226.3 |
9 | 0 | 7963 | 28.3 | 12 | f | 0.0 | 0.0 | 5631.38 |
4 | 0 | 8221 | 87.5 | 4 | f | 0.0 | 0.0 | 3938.14 |
11 | 0 | 5210 | 32.6 | 13 | f | 0.0 | 0.0 | 646.02 |
2 | 0 | 9279 | 36.5 | 3 | f | 0.0 | 0.0 | 1476.19 |
14 | 0 | 4032 | 20.6 | 23 | f | 0.0 | 0.0 | 7677.52 |
12 | 0 | 23336 | 67.1 | 34 | f | 0.0 | 0.0 | 13943.1 |
total_pymnt_inv | ... |
---|---|
5831.78 | ... |
1008.71 | ... |
3003.65 | ... |
12226.3 | ... |
5631.38 | ... |
3938.14 | ... |
646.02 | ... |
1469.34 | ... |
7677.52 | ... |
13943.1 | ... |
# Recode the label column:
#   safe_loans = +1  => safe loan   (bad_loans == 0)
#   safe_loans = -1  => risky loan  (any other bad_loans value)
loans['safe_loans'] = loans['bad_loans'].apply(
    lambda bad_flag: -1 if bad_flag != 0 else +1)

# The original flag is redundant once 'safe_loans' exists, so drop it.
loans = loans.remove_column('bad_loans')

# Visualize the new target column using the categorical view.
loans['safe_loans'].show(view='Categorical')
# Columns used as model inputs.
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    'dti',                    # debt to income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Prediction target (y): +1 means safe, -1 means risky.
target = 'safe_loans'

# Keep only the feature columns plus the target column.
selected_columns = features + [target]
loans = loans[selected_columns]

# Partition the rows by label.
risky_loans_raw = loans[loans[target] == -1]
safe_loans_raw = loans[loans[target] == +1]
print "Number of safe loans : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)
Number of safe loans : 99457 Number of risky loans : 23150
print "Percentage of safe loans :", (len(safe_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))
print "Percentage of risky loans :", (len(risky_loans_raw)/float(len(safe_loans_raw) + len(risky_loans_raw)))
Percentage of safe loans : 0.811185331996 Percentage of risky loans : 0.188814668004
# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)
print "Percentage of safe loans :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)
Percentage of safe loans : 0.502236174422 Percentage of risky loans : 0.497763825578 Total number of loans in our new dataset : 46508
train_data, test_data = loans_data.random_split(.8, seed=1)
print len(train_data)
print len(test_data)
37224 9284
# Fit a logistic-regression classifier on the training split.
#
# `target` must be the *name* of the label column (the string 'safe_loans'
# held in the `target` variable), not an SFrame. Passing `loans_data` here
# made GraphLab try to select a column keyed by an SFrame and raise
# "TypeError: Invalid key type: must be str, bytes or type".
#
# `features` is passed explicitly so the model inputs are exactly the listed
# feature columns, independent of which columns train_data happens to carry.
# validation_set=None: no validation set is supplied to the trainer.
Logistic_Regression_Model = graphlab.logistic_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-23-122866ccca7b> in <module>() 1 Logistic_Regression_Model = graphlab.logistic_classifier.create(train_data, 2 target = loans_data, ----> 3 validation_set=None) C:\Anaconda2\lib\site-packages\graphlab\toolkits\classifier\logistic_classifier.pyc in create(dataset, target, features, l2_penalty, l1_penalty, solver, feature_rescaling, convergence_threshold, step_size, lbfgs_memory_level, max_iterations, class_weights, validation_set, verbose) 306 lbfgs_memory_level = lbfgs_memory_level, 307 max_iterations = max_iterations, --> 308 class_weights = class_weights) 309 310 return LogisticClassifier(model.__proxy__) C:\Anaconda2\lib\site-packages\graphlab\toolkits\_supervised_learning.pyc in create(dataset, target, model_name, features, validation_set, verbose, distributed, **kwargs) 398 399 # Target --> 400 target_sframe = _toolkits_select_columns(dataset, [target]) 401 402 # Features C:\Anaconda2\lib\site-packages\graphlab\toolkits\_internal_utils.pyc in _toolkits_select_columns(dataset, columns) 296 """ 297 try: --> 298 return dataset.select_columns(columns) 299 300 except RuntimeError: C:\Anaconda2\lib\site-packages\graphlab\data_structures\sframe.pyc in select_columns(self, keylist) 3630 if not (all([isinstance(x, str) or isinstance(x, type) or isinstance(x, bytes) 3631 for x in keylist])): -> 3632 raise TypeError("Invalid key type: must be str, bytes or type") 3633 3634 column_names_set = set(self.column_names()) TypeError: Invalid key type: must be str, bytes or type