#!/usr/bin/env python
# coding: utf-8

# Notebook export: load the Lending Club loan data, derive a +1/-1
# "safe_loans" label from "bad_loans", balance the two classes by
# undersampling the safe loans, and fit a logistic regression classifier.
#
# NOTE: print calls below take a single pre-formatted string wrapped in
# parentheses so they emit the same text under Python 2 and also parse
# under Python 3.

# In[3]:

import graphlab

graphlab.canvas.set_target('ipynb')


# # Load Loan Dataset

# In[4]:

loans = graphlab.SFrame('lending-club-data.gl/')


# In[5]:

loans.head()


# In[7]:

# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x == 0 else -1)
loans = loans.remove_column('bad_loans')


# In[8]:

loans['safe_loans'].show(view='Categorical')


# In[18]:

features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home_ownership status: own, mortgage or rent
    'dti',                    # debt to income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

target = 'safe_loans'  # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]


# In[10]:

safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]

# Count each class once; the counts are reused for the ratios below.
num_safe_raw = len(safe_loans_raw)
num_risky_raw = len(risky_loans_raw)

print("Number of safe loans : %s" % num_safe_raw)
print("Number of risky loans : %s" % num_risky_raw)


# In[11]:

# float() forces true division under Python 2.
total_raw = float(num_safe_raw + num_risky_raw)
print("Percentage of safe loans : %s" % (num_safe_raw / total_raw))
print("Percentage of risky loans : %s" % (num_risky_raw / total_raw))


# In[12]:

# Since there are fewer risky loans than safe loans, find the ratio of the
# sizes and use that percentage to undersample the safe loans.
percentage = num_risky_raw / float(num_safe_raw)

risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)


# In[13]:

total_balanced = float(len(loans_data))
print("Percentage of safe loans : %s" % (len(safe_loans) / total_balanced))
print("Percentage of risky loans : %s" % (len(risky_loans) / total_balanced))
print("Total number of loans in our new dataset : %s" % len(loans_data))


# In[22]:

train_data, test_data = loans_data.random_split(.8, seed=1)
print(len(train_data))
print(len(test_data))


# In[23]:

# `target` must be the NAME of the label column, not the SFrame itself.
# `features` is passed explicitly so the model inputs do not depend on
# which columns happen to be present in train_data.
Logistic_Regression_Model = graphlab.logistic_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)