#!/usr/bin/env python
# coding: utf-8

# # Predicting a bank's customer response
#
# Banks strive to increase the efficiency of their contacts with customers. One of the areas where this matters is offering new products to existing clients (cross-selling). Instead of offering new products to all clients, it is a good idea to predict the probability of a positive response and send the offers only to those clients whose predicted probability is higher than some threshold value.
#
# In this notebook I try to solve this problem. In 2011 OTP-Bank in Russia organized a competition reflecting the aforementioned situation. The data is taken from that [site](http://machinelearning.ru/wiki/index.php?title=%D0%97%D0%B0%D0%B4%D0%B0%D1%87%D0%B0_%D0%BF%D1%80%D0%B5%D0%B4%D1%81%D0%BA%D0%B0%D0%B7%D0%B0%D0%BD%D0%B8%D1%8F_%D0%BE%D1%82%D0%BA%D0%BB%D0%B8%D0%BA%D0%B0_%D0%BA%D0%BB%D0%B8%D0%B5%D0%BD%D1%82%D0%BE%D0%B2_%D0%9E%D0%A2%D0%9F_%D0%91%D0%B0%D0%BD%D0%BA%D0%B0_%28%D0%BA%D0%BE%D0%BD%D0%BA%D1%83%D1%80%D1%81%29).
# The competition's description and some of the data are in Russian, but I'll translate the necessary terms. Column names are already in English.
#
# The dataset contains 15223 clients; 1812 of them had a positive response. I can't use the competition's test set, as the competition is finished and the quality of predictions on the test data can't be verified. So I can only split the data into train and test parts and check the quality this way.
#
# The metric for the competition is AUC (area under the ROC curve). The winner achieved 0.6935, and the top-7 places have AUC higher than 0.67.
#
# I don't aim to beat these values; my goal is to explore and visualize the data. I also want to show how to process the data and make predictions so that the model is stable and can be interpreted.

# In[1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn import linear_model
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 100)
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"
import functions
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# ### Data loading and initial preprocessing

# In[2]:

get_ipython().run_cell_magic('time', '', "data = pd.read_excel('data set.xls', sheetname='данные')\n")

# First of all I'll rename the values of several columns, and I'll drop some unnecessary columns.
# In[3]:

# Translate the Russian EDUCATION values into English.
education_map = {
    'Среднее специальное': 'Professional School',
    'Среднее': 'Some High School',
    'Неполное среднее': 'Some Primary School',
    'Высшее': 'Undergraduate Degree',
    'Неоконченное высшее': 'No Formal Education',
    'Два и более высших образования': 'Post-Graduate Work',
    'Ученая степень': 'Graduate Degree',
}
data['EDUCATION'] = data['EDUCATION'].replace(education_map)

# In[4]:

# Translate the Russian MARITAL_STATUS values into English.
marital_map = {
    'Состою в браке': 'Married',
    'Гражданский брак': 'Partner',
    'Разведен(а)': 'Separated',
    'Не состоял в браке': 'Single',
    'Вдовец/Вдова': 'Widowed',
}
data['MARITAL_STATUS'] = data['MARITAL_STATUS'].replace(marital_map)

# In[5]:

# Translate the Russian GEN_INDUSTRY values into English.
industry_map = {
    'Металлургия/Промышленность/Машиностроение': 'Iron & Steel',
    'Строительство': 'Construction - Raw Materials',
    'Нефтегазовая промышленность': 'Oil & Gas Operations',
    'Энергетика': 'Oil Well Services & Equipment',
    'Страхование': 'Insurance (Accident & Health)',
    'Банк/Финансы': 'Regional Banks',
    'Здравоохранение': 'Healthcare',
    'Управляющая компания': 'Business Services',
    'Туризм': 'Hotels & Motels',
    'Юридические услуги/нотариальные услуги': 'Personal Services',
    'Недвижимость': 'Real Estate Operations',
    'Развлечения/Искусство': 'Recreational Activities',
    'Ресторанный бизнес /общественное питание': 'Restaurants',
    'Образование': 'Schools',
    'Наука': 'Scientific & Technical Instr.',
    'Информационные технологии': 'Software & Programming',
    'Транспорт': 'Transportation',
    'Логистика': 'Trucking',
    'Ресторанный бизнес/Общественное питание': 'Restaurant & Catering',
    'Коммунальное хоз-во/Дорожные службы': 'Municipal economy/Road service',
    'Салоны красоты и здоровья': 'Beauty shop',
    'Сборочные производства': 'Assembly production',
    'Сельское хозяйство': 'Agriculture',
    'Химия/Парфюмерия/Фармацевтика': 'Chemistry/Perfumery/Pharmaceut',
    'ЧОП/Детективная д-ть': 'Detective',
    'Другие сферы': 'Others fields',
    'Государственная служба': 'Public & municipal administ.',
    'Информационные услуги': 'Information service',
    'Торговля': 'Market, real estate',
    'Маркетинг': 'Marketing',
    'Подбор персонала': 'Staff recruitment',
    'СМИ/Реклама/PR-агенства': 'Mass media',
}
data['GEN_INDUSTRY'] = data['GEN_INDUSTRY'].replace(industry_map)

# In[6]:

# Translate the Russian FAMILY_INCOME ranges into English.
income_map = {
    'от 10000 до 20000 руб.': '10000-20000',
    'от 20000 до 50000 руб.': '20000-50000',
    'от 5000 до 10000 руб.': '5000-10000',
    'свыше 50000 руб.': '50000+',
    'до 5000 руб.': 'up to 5000',
}
data['FAMILY_INCOME'] = data['FAMILY_INCOME'].replace(income_map)

# In[7]:

data.drop(['GEN_TITLE', 'ORG_TP_STATE', 'ORG_TP_FCAPITAL', 'JOB_DIR', 'REG_ADDRESS_PROVINCE',
           'FACT_ADDRESS_PROVINCE', 'POSTAL_ADDRESS_PROVINCE', 'TP_PROVINCE', 'REGION_NM'],
          axis=1, inplace=True)

# In[8]:

data.head()

# In[9]:

data.info()

# This is how the data looks: 43 columns, several of which have missing values.
# I'll do the following things:
#
# * drop the columns where one of the values is too prevalent (95% or more). This is an arbitrary threshold and can be changed. The reasoning is that if the other categories of a variable make up less than 5% in total while the target has only ~11% positive responses, the variable will hardly be useful. Of course, one of the less common classes could always have a positive response (this needs to be checked), in which case the feature should be kept;
# * process continuous variables;
# * process categorical variables;
# * select variables and build the model.

# In[10]:

for col in data.columns:
    if data[col].value_counts(dropna=False, normalize=True).values[0] > 0.95:
        if col == 'TARGET':
            pass
        else:
            print(col)
            data.drop([col], axis=1, inplace=True)

# ## Continuous
#
# It is worth noting that it often makes sense to create new variables from the existing ones. While separate variables can have some impact on model performance, their interactions may bring much more value. As an example I create a new variable: income divided by the credit amount.
# If the credit amount is much higher than the income, there could be problems with paying it back; if the credit is many times lower, it could be of little interest to the customer. Of course, the real dependencies are more complicated, but you get the gist.

# In[11]:

data['Income_to_limit'] = data['PERSONAL_INCOME'] / data['CREDIT']

# And now there is a question of what to do with the continuous variables. Usually I use them as they are, apply some transformation (for example a log) if necessary, or normalize the values. But if the model needs to be interpretable, this won't do. The model should show how certain value ranges impact the probability of a positive response. So I'll split the continuous variables into bins, so that each bin gets a separate coefficient in the model. I have written the function **split_best_iv** for this in [this file](https://github.com/Erlemar/Erlemar.github.io/blob/master/Notebooks/functions.py). It splits a continuous variable into bins so as to maximize IV (Information Value).
#
# What is IV? It was and still is widely used in bank analytics. In simple terms it shows how useful a variable is for predicting the target. It is calculated in the following way (you can see an example below for "GENDER"):
# * for each category the % of responders is calculated - what share of all responders falls into the category;
# * the same is calculated for the negative class;
# * WOE (Weight of Evidence) is calculated as the logarithm of the responders rate divided by the non-responders rate. WOE shows how well the category separates positive and negative outcomes; a negative WOE means the category contains relatively more non-responders, a positive WOE implies more responders;
# * the difference between the distributions of positive and negative outcomes is calculated;
# * IV for each category is the product of the aforementioned difference and WOE;
# * IV for the variable is the sum of IV over its categories.
#
# A rule of thumb for IV:
# * < 0.02 - the feature isn't useful for prediction;
# * 0.02 - 0.1 - weak impact on prediction quality;
# * 0.1 - 0.3 - medium impact;
# * 0.3 - 0.5 - strong impact;
# * 0.5+ - may cause overfitting.
#
# These aren't definite thresholds, but we should pay attention to them.

# In[12]:

df = pd.DataFrame(index=data['GENDER'].unique(),
                  data={'% responders': data.groupby('GENDER')['TARGET'].sum() / np.sum(data['TARGET'])})
df['% non-responders'] = (data.groupby('GENDER')['TARGET'].count() - data.groupby('GENDER')['TARGET'].sum()) \
                         / (len(data['TARGET']) - np.sum(data['TARGET']))
df['WOE'] = np.log(df['% responders'] / df['% non-responders'])
df['DG-DB'] = df['% responders'] - df['% non-responders']
df['IV'] = df['WOE'] * df['DG-DB']
df
print('IV is {:.2f}.'.format(np.sum(df['IV'])))

# Back to the function. **split_best_iv** calls **cont_split**, which tries to split the variable into bins. I use DecisionTreeClassifier for this, which suits the purpose really well. The interesting parameters:
# * criterion='entropy': to maximize information gain while growing the tree;
# * min_samples_split=0.05, min_samples_leaf=0.05: so that each bin contains at least 5% of the values. The reasons for choosing this value were mentioned above;
# * class_weight='balanced': a great option for working with imbalanced classes;
# * max_leaf_nodes=leafs: how many bins will be created, more on this below.
#
# After this I use the **tree_to_thresholds** function to walk the tree and gather the thresholds from the decision rules. The code was adapted from [this](https://stackoverflow.com/questions/20224526/how-to-extract-the-decision-rules-from-scikit-learn-decision-tree) stackoverflow question. I round the thresholds, as a fractional age, for example, makes little sense. Then I calculate and save the IV value.
# **split_best_iv** starts with 2 leaves and increases the number of leaves until IV stops increasing. This gives the optimal number of leaves and the optimal split into bins. Examples are shown below.
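# The actual implementations of **split_best_iv**, **cont_split** and **tree_to_thresholds** live in the external functions.py and aren't shown here. As a rough sketch of the idea only (not the author's code; the helper name and defaults below are illustrative), binning a single feature with a DecisionTreeClassifier and reading the split thresholds back from the fitted tree might look like this:

# Illustrative sketch only: approximates the idea behind cont_split / tree_to_thresholds,
# not the actual code in functions.py. Assumes missing values are handled beforehand,
# as is done in the cells below. np and pd are imported at the top of the notebook.
from sklearn.tree import DecisionTreeClassifier

def tree_bins(values, target, leafs=3):
    """Fit a small decision tree on a single feature and bin it by the tree's split thresholds."""
    tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=0.05,
                                  min_samples_leaf=0.05, class_weight='balanced',
                                  max_leaf_nodes=leafs)
    tree.fit(values.to_frame(), target)
    # Internal nodes carry a real threshold; leaves are marked with -2 in sklearn's tree structure.
    thresholds = sorted(t for t in tree.tree_.threshold if t != -2)
    # Round the thresholds (a fractional age makes little sense) and build bin edges.
    edges = [-np.inf] + [round(t) for t in thresholds] + [np.inf]
    return pd.cut(values, bins=edges, duplicates='drop')

# Hypothetical usage: binned_age = tree_bins(data['AGE'], data['TARGET'], leafs=4)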
# ## Outliers
#
# It is very important to deal with outliers. Some of the usual approaches are:
# * dropping the rows with such values;
# * replacing these values with more reasonable figures;
# * building a separate model for them.
#
# I'll go with the first option.
#
# To identify outliers I either use boxplots or simply look at the top values.

# In[13]:

data['PERSONAL_INCOME'].plot(kind='box')

# The boxplot shows that while the median value is reasonable, the maximum values are very high. It would be necessary to investigate whether these values are normal: maybe they are VIP clients, maybe there is an error in the data, maybe everything is fine, or there could be some other reason. I have no additional data, so I'll simply get rid of the top 1% and the bottom 1%.

# In[14]:

data = data[(data.PERSONAL_INCOME < np.percentile(data.PERSONAL_INCOME, 99)) &
            (data.PERSONAL_INCOME > np.percentile(data.PERSONAL_INCOME, 1))]

# In[15]:

data['WORK_TIME'].nlargest(20)

# I can believe that people work at their current place for 10, 30, maybe even 50 years. More than that is quite unlikely, so I'll drop these values. They could be replaced with more adequate figures instead, but there is enough data, so dropping is fine.

# In[16]:

data.drop([8984, 4296, 2532, 5375, 9852, 1092, 11720, 13928, 9983, 10677, 10171, 676, 7711, 3323], inplace=True)

# In[17]:

data['FST_PAYMENT'].nlargest()

# In[18]:

data.loc[data['FST_PAYMENT'] > data['CREDIT']][['CREDIT', 'FST_PAYMENT']][:10]
len(data.loc[data['FST_PAYMENT'] > data['CREDIT']][['CREDIT', 'FST_PAYMENT']])

# There are 485 rows where the initial payment is higher than the credit amount. This definitely isn't normal.

# In[19]:

data = data.loc[data['FST_PAYMENT'] < data['CREDIT']]

# In[20]:

# Time lived at the current place, in months.
data['FACT_LIVING_TERM'].nlargest(20)

# While people can live in the same place all their life, I don't think there are many people who have lived anywhere for 100+ years :)

# In[21]:

data.drop([6186, 12261, 8562, 14739, 988, 12869, 7650, 12134, 5681, 11004, 14707], inplace=True)

# In[22]:

data.shape, np.sum(data['TARGET'])

# In[23]:

# This will be used later.
initial_data = data.copy()

# 947 rows were dropped, but only 92 of them had a positive response.

# ### PERSONAL_INCOME

# In[24]:

data['PERSONAL_INCOME'].plot(kind='box')

# It is time to try splitting the variable.

# In[25]:

data['PERSONAL_INCOME'] = functions.split_best_iv(data, 'PERSONAL_INCOME', 'TARGET')

# Done, and here are two more functions. The second one was already used above: it calculates IV. The first one shows the following things:
# * counts of each category;
# * normalized counts including missing values;
# * a graph with blue bars for the counts and a red line for the mean value of the target (i.e. what share of values in the category have a positive response).

# In[26]:

functions.feature_stat(data, 'PERSONAL_INCOME', 'TARGET')
functions.calc_iv(data, 'TARGET', 'PERSONAL_INCOME')[0]

# People with higher income tend to have a higher positive response rate.
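# `functions.feature_stat` also lives in the external functions.py, so its code isn't shown here. A minimal sketch of a similar counts-plus-response-rate plot (assuming the same `(df, col, target)` argument order; this is not the author's actual implementation) could look like this:

# Minimal sketch of a feature_stat-like plot, not the implementation from functions.py.
def feature_stat_sketch(df, col, target):
    """Bar chart of category counts with the mean target rate overlaid on a second axis."""
    stats = df.groupby(col)[target].agg(['count', 'mean'])
    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax1.bar(range(len(stats)), stats['count'], color='steelblue', alpha=0.7)
    ax1.set_ylabel('Count')
    ax1.set_xticks(range(len(stats)))
    ax1.set_xticklabels(stats.index.astype(str), rotation=45, ha='right')
    ax2 = ax1.twinx()
    ax2.plot(range(len(stats)), stats['mean'], color='red', marker='o')
    ax2.set_ylabel('Mean target rate')
    plt.title(col)
    plt.show()

# Hypothetical usage: feature_stat_sketch(data, 'PERSONAL_INCOME', 'TARGET')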
# ### Age

# In[27]:

data['AGE'].plot(kind='box')

# In[28]:

data['AGE'] = functions.split_best_iv(data, 'AGE', 'TARGET')

# In[29]:

functions.feature_stat(data, 'AGE', 'TARGET')
functions.calc_iv(data, 'TARGET', 'AGE')[0]

# Younger people take out more credits, while only a small fraction of older people have a positive response.

# ### WORK_TIME
#
# Time worked at the current workplace, in months.

# In[30]:

# I assume that missing values mean that the person didn't work at all.
data['WORK_TIME'].fillna(0, inplace=True)

# In[31]:

data['WORK_TIME'].plot(kind='box')

# Here I add one more line: if the variable has zero values, DecisionTreeClassifier has problems with them, so I combine the zero values with the nearest interval.

# In[32]:

data['WORK_TIME'] = functions.split_best_iv(data, 'WORK_TIME', 'TARGET')
data['WORK_TIME'].fillna(data['WORK_TIME'].cat.categories[0], inplace=True)

# In[33]:

functions.feature_stat(data, 'WORK_TIME', 'TARGET')
functions.calc_iv(data, 'TARGET', 'WORK_TIME')[0]

# ### CREDIT
#
# Credit amount in roubles.

# In[34]:

data['CREDIT'].plot(kind='box')

# Some credits are much higher than the median, but maybe these are special kinds of credit for which such amounts are normal.

# In[35]:

data['CREDIT'] = functions.split_best_iv(data, 'CREDIT', 'TARGET')

# In[36]:

functions.feature_stat(data, 'CREDIT', 'TARGET')
functions.calc_iv(data, 'TARGET', 'CREDIT')[0]

# ### TERM
#
# Credit length, I think in months.

# In[37]:

data['TERM'].plot(kind='box')

# In[38]:

data['TERM'] = functions.split_best_iv(data, 'TERM', 'TARGET')

# In[39]:

functions.feature_stat(data, 'TERM', 'TARGET')
functions.calc_iv(data, 'TARGET', 'TERM')[0]

# ### FST_PAYMENT
#
# Initial payment amount in roubles.

# In[40]:

data['FST_PAYMENT'].plot(kind='box')

# In[41]:

data['FST_PAYMENT'] = functions.split_best_iv(data, 'FST_PAYMENT', 'TARGET')
data['FST_PAYMENT'].fillna(data['FST_PAYMENT'].cat.categories[0], inplace=True)

# In[42]:

functions.feature_stat(data, 'FST_PAYMENT', 'TARGET')
functions.calc_iv(data, 'TARGET', 'FST_PAYMENT')[0]

# ### FACT_LIVING_TERM
#
# How long the person has lived at the current address, in months.
# In[43]:

data['FACT_LIVING_TERM'].plot(kind='box')

# In[44]:

data['FACT_LIVING_TERM'] = functions.split_best_iv(data, 'FACT_LIVING_TERM', 'TARGET')
data['FACT_LIVING_TERM'].fillna(data['FACT_LIVING_TERM'].cat.categories[0], inplace=True)

# In[45]:

functions.feature_stat(data, 'FACT_LIVING_TERM', 'TARGET')
functions.calc_iv(data, 'TARGET', 'FACT_LIVING_TERM')[0]

# ### LOAN_NUM_PAYM
#
# Number of payments made by the client.

# In[46]:

data['LOAN_NUM_PAYM'].plot(kind='box')

# In[47]:

data['LOAN_NUM_PAYM'] = functions.split_best_iv(data, 'LOAN_NUM_PAYM', 'TARGET')

# In[48]:

functions.feature_stat(data, 'LOAN_NUM_PAYM', 'TARGET')
functions.calc_iv(data, 'TARGET', 'LOAN_NUM_PAYM')[0]

# ### LOAN_AVG_DLQ_AMT
#
# Average delinquency amount.

# In[49]:

data['LOAN_AVG_DLQ_AMT'].plot(kind='box')

# In[50]:

data['LOAN_AVG_DLQ_AMT'] = functions.split_best_iv(data, 'LOAN_AVG_DLQ_AMT', 'TARGET')
data['LOAN_AVG_DLQ_AMT'].fillna(data['LOAN_AVG_DLQ_AMT'].cat.categories[0], inplace=True)

# In[51]:

functions.feature_stat(data, 'LOAN_AVG_DLQ_AMT', 'TARGET')
functions.calc_iv(data, 'TARGET', 'LOAN_AVG_DLQ_AMT')[0]

# ### LOAN_MAX_DLQ_AMT

# In[52]:

data['LOAN_MAX_DLQ_AMT'].plot(kind='box')

# In[53]:

data['LOAN_MAX_DLQ_AMT'] = functions.split_best_iv(data, 'LOAN_MAX_DLQ_AMT', 'TARGET')
data['LOAN_MAX_DLQ_AMT'].fillna(data['LOAN_MAX_DLQ_AMT'].cat.categories[0], inplace=True)

# In[54]:

functions.feature_stat(data, 'LOAN_MAX_DLQ_AMT', 'TARGET')
functions.calc_iv(data, 'TARGET', 'LOAN_MAX_DLQ_AMT')[0]

# ### Income_to_limit

# In[55]:

data['Income_to_limit'].plot(kind='box')

# In[56]:

data['Income_to_limit'] = functions.split_best_iv(data, 'Income_to_limit', 'TARGET')

# In[57]:

functions.feature_stat(data, 'Income_to_limit', 'TARGET')
functions.calc_iv(data, 'TARGET', 'Income_to_limit')[0]

# ## Categorical
#
# Now for the categorical variables. The main problem is usually that some categories have too few values. Again I'll try to ensure that no category holds less than 5% of the values. Most of the time it is necessary to combine categories based on common sense or the business case (a generic sketch of such lumping is shown after the next cell). I convert the variables into the "category" type for easier processing. Missing values are treated as a separate category.

# In[58]:

for col in ['GENDER', 'CHILD_TOTAL', 'DEPENDANTS', 'EDUCATION', 'MARITAL_STATUS', 'GEN_INDUSTRY', 'OWN_AUTO',
            'FAMILY_INCOME', 'LOAN_NUM_TOTAL', 'LOAN_NUM_CLOSED', 'LOAN_DLQ_NUM', 'LOAN_MAX_DLQ']:
    data[col] = data[col].astype('category')
    if data[col].isnull().any():
        data[col].cat.add_categories(['Unknown'], inplace=True)
        data[col].fillna('Unknown', inplace=True)
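# In the cells below the categories are merged by hand into domain-meaningful groups. As a rough, hypothetical illustration of the 5% rule only (the helper name below is not part of the notebook or functions.py), a generic function that lumps all rare categories into a single bucket might look like this:

# Hypothetical helper: lump categories rarer than `threshold` into a single 'other' bucket,
# so that no category holds less than ~5% of the rows. The notebook instead merges categories
# by hand into groups that make business sense.
def lump_rare_categories(series, threshold=0.05, other_label='other'):
    freq = series.value_counts(normalize=True, dropna=False)
    rare = freq[freq < threshold].index
    if len(rare) == 0:
        return series
    result = series.astype('object').where(~series.isin(rare), other_label)
    return result.astype('category')

# Hypothetical usage: data['GEN_INDUSTRY'] = lump_rare_categories(data['GEN_INDUSTRY'])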
# ### OWN_AUTO
#
# Number of cars owned.

# In[59]:

data['OWN_AUTO'].value_counts(dropna=False, normalize=True)

# In[60]:

data.loc[data['OWN_AUTO'] == 2, 'OWN_AUTO'] = 1
data['OWN_AUTO'] = data['OWN_AUTO'].cat.remove_unused_categories()

# In[61]:

functions.feature_stat(data, 'OWN_AUTO', 'TARGET')
functions.calc_iv(data, 'TARGET', 'OWN_AUTO')[0]

# ### GENDER

# In[62]:

functions.feature_stat(data, 'GENDER', 'TARGET')
functions.calc_iv(data, 'TARGET', 'GENDER')[0]

# ### CHILD_TOTAL

# In[63]:

data['CHILD_TOTAL'].value_counts(dropna=False, normalize=True)

# In[64]:

data['CHILD_TOTAL'].cat.add_categories(['3 or more'], inplace=True)
data.loc[data['CHILD_TOTAL'].isin([1.0, 0.0, 2.0]) == False, 'CHILD_TOTAL'] = '3 or more'
data['CHILD_TOTAL'] = data['CHILD_TOTAL'].cat.remove_unused_categories()

# In[65]:

functions.feature_stat(data, 'CHILD_TOTAL', 'TARGET')
functions.calc_iv(data, 'TARGET', 'CHILD_TOTAL')[0]

# ### DEPENDANTS

# In[66]:

data['DEPENDANTS'].value_counts(dropna=False, normalize=True)

# In[67]:

data['DEPENDANTS'].cat.add_categories(['2 or more'], inplace=True)
data.loc[data['DEPENDANTS'].isin([1.0, 2.0]) == False, 'DEPENDANTS'] = '2 or more'
data['DEPENDANTS'] = data['DEPENDANTS'].cat.remove_unused_categories()

# In[68]:

functions.feature_stat(data, 'DEPENDANTS', 'TARGET')
functions.calc_iv(data, 'TARGET', 'DEPENDANTS')[0]

# ### EDUCATION

# In[69]:

data['EDUCATION'].value_counts(dropna=False, normalize=True)

# In[70]:

data.loc[data['EDUCATION'].isin(['Undergraduate Degree', 'Post-Graduate Work', 'Graduate Degree']), 'EDUCATION'] = 'Undergraduate Degree'
data.loc[data['EDUCATION'].isin(['Some High School', 'No Formal Education', 'Some Primary School']), 'EDUCATION'] = 'Some High School'
data['EDUCATION'] = data['EDUCATION'].cat.remove_unused_categories()

# In[71]:

functions.feature_stat(data, 'EDUCATION', 'TARGET')
functions.calc_iv(data, 'TARGET', 'EDUCATION')[0]

# ### MARITAL_STATUS

# In[72]:

data['MARITAL_STATUS'].value_counts(dropna=False, normalize=True)

# In[73]:

data.loc[data['MARITAL_STATUS'].isin(['Married', 'Partner']), 'MARITAL_STATUS'] = 'Married'
data.loc[data['MARITAL_STATUS'].isin(['Single', 'Separated', 'Widowed']), 'MARITAL_STATUS'] = 'Single'
data['MARITAL_STATUS'] = data['MARITAL_STATUS'].cat.remove_unused_categories()

# In[74]:

functions.feature_stat(data, 'MARITAL_STATUS', 'TARGET')
functions.calc_iv(data, 'TARGET', 'MARITAL_STATUS')[0]

# ### GEN_INDUSTRY

# In[75]:

data['GEN_INDUSTRY'].value_counts(dropna=False, normalize=True)

# In[76]:

data['GEN_INDUSTRY'].cat.add_categories(['others'], inplace=True)
data.loc[data['GEN_INDUSTRY'].isin(['Market, real estate', 'Others fields', 'Iron & Steel', 'Unknown', 'Transportation',
                                    'Public & municipal administ.', 'Healthcare', 'Schools']) == False, 'GEN_INDUSTRY'] = 'others'
data['GEN_INDUSTRY'] = data['GEN_INDUSTRY'].cat.remove_unused_categories()

# In[77]:

functions.feature_stat(data, 'GEN_INDUSTRY', 'TARGET')
functions.calc_iv(data, 'TARGET', 'GEN_INDUSTRY')[0]

# ### FAMILY_INCOME

# In[78]:

data['FAMILY_INCOME'].value_counts(dropna=False, normalize=True)

# In[79]:

data['FAMILY_INCOME'].cat.add_categories(['up to 10000', '20000+'], inplace=True)
data.loc[data['FAMILY_INCOME'].isin(['up to 5000', '5000-10000']), 'FAMILY_INCOME'] = 'up to 10000'
data.loc[data['FAMILY_INCOME'].isin(['20000-50000', '50000+']), 'FAMILY_INCOME'] = '20000+'
data['FAMILY_INCOME'] = data['FAMILY_INCOME'].cat.remove_unused_categories()

# In[80]:

functions.feature_stat(data, 'FAMILY_INCOME', 'TARGET')
functions.calc_iv(data, 'TARGET', 'FAMILY_INCOME')[0]

# ### LOAN_NUM_TOTAL

# In[81]:
data['LOAN_NUM_TOTAL'].value_counts(dropna=False, normalize=True)

# In[82]:

data['LOAN_NUM_TOTAL'].cat.add_categories(['3 or more'], inplace=True)
data.loc[data['LOAN_NUM_TOTAL'].isin([1, 2]) == False, 'LOAN_NUM_TOTAL'] = '3 or more'
data['LOAN_NUM_TOTAL'] = data['LOAN_NUM_TOTAL'].cat.remove_unused_categories()

# In[83]:

functions.feature_stat(data, 'LOAN_NUM_TOTAL', 'TARGET')
functions.calc_iv(data, 'TARGET', 'LOAN_NUM_TOTAL')[0]

# ### LOAN_NUM_CLOSED

# In[84]:

data['LOAN_NUM_CLOSED'].value_counts(dropna=False, normalize=True)

# In[85]:

data['LOAN_NUM_CLOSED'].cat.add_categories(['3 or more'], inplace=True)
data.loc[data['LOAN_NUM_CLOSED'].isin([0, 1, 2]) == False, 'LOAN_NUM_CLOSED'] = '3 or more'
data['LOAN_NUM_CLOSED'] = data['LOAN_NUM_CLOSED'].cat.remove_unused_categories()

# In[86]:

functions.feature_stat(data, 'LOAN_NUM_CLOSED', 'TARGET')
functions.calc_iv(data, 'TARGET', 'LOAN_NUM_CLOSED')[0]

# ### LOAN_DLQ_NUM

# In[87]:

data['LOAN_DLQ_NUM'].value_counts(dropna=False, normalize=True)

# In[88]:

data['LOAN_DLQ_NUM'].cat.add_categories(['1 or more'], inplace=True)
data.loc[data['LOAN_DLQ_NUM'].isin([0]) == False, 'LOAN_DLQ_NUM'] = '1 or more'
data['LOAN_DLQ_NUM'] = data['LOAN_DLQ_NUM'].cat.remove_unused_categories()

# In[89]:

functions.feature_stat(data, 'LOAN_DLQ_NUM', 'TARGET')
functions.calc_iv(data, 'TARGET', 'LOAN_DLQ_NUM')[0]

# ### LOAN_MAX_DLQ

# In[90]:

data['LOAN_MAX_DLQ'].value_counts(dropna=False, normalize=True)

# In[91]:

data['LOAN_MAX_DLQ'].cat.add_categories(['1 or more'], inplace=True)
data.loc[data['LOAN_MAX_DLQ'].isin([0]) == False, 'LOAN_MAX_DLQ'] = '1 or more'
data['LOAN_MAX_DLQ'] = data['LOAN_MAX_DLQ'].cat.remove_unused_categories()

# In[92]:

functions.feature_stat(data, 'LOAN_MAX_DLQ', 'TARGET')
functions.calc_iv(data, 'TARGET', 'LOAN_MAX_DLQ')[0]

# In[93]:

data.head(10)

# That's it, all the variables are transformed. I didn't do anything to the several variables which are simple flags; they are fine as they are.

# # Feature selection based on IV
#
# Now it is time to choose the columns. This could have been done earlier, while processing the individual columns, but I prefer to do it for all columns at once. I calculate IV for every column and keep only those whose IV is higher than a threshold (0.02 in this case).

# In[94]:

columns_to_try = [col for col in list(data.columns) if col not in ('AGREEMENT_RK', 'CARD_ID_SB8', 'CARD_NUM', 'TARGET')]

# In[95]:

ivs = []
for col in columns_to_try:
    data[col] = data[col].astype('category')
    if data[col].isnull().any():
        print(col)
        if 'Unknown' not in data[col].cat.categories:
            data[col].cat.add_categories(['Unknown'], inplace=True)
        data[col].fillna('Unknown', inplace=True)
    data[col] = data[col].cat.remove_unused_categories()
    _, iv = functions.calc_iv(data, 'TARGET', col)
    ivs.append((col, np.round(iv, 4)))

# In[96]:

good_cols = [i[0] for i in sorted(ivs, key=lambda tup: tup[1], reverse=True) if i[1] > 0.02]
for i in ['TARGET', 'AGREEMENT_RK']:
    good_cols.append(i)

# In[97]:

good_cols

# ## Some additional visualization
#
# Plotting variables by themselves is useful, but visualizing their interactions can reveal interesting things. Some examples follow.
#
# The pointplots show the mean target rate for pairs of variables. I show only a few plots, as there are too many possible combinations.
# In[98]:

data_viz = data[good_cols]
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
sns.pointplot(x='SOCSTATUS_WORK_FL', y="TARGET", hue='SOCSTATUS_PENS_FL', data=data_viz, ax=ax[0])
sns.pointplot(x='LOAN_MAX_DLQ', y="TARGET", hue='SOCSTATUS_PENS_FL', data=data_viz, ax=ax[1])

# SOCSTATUS_PENS_FL is 1 if the person is a pensioner, 0 otherwise.
# SOCSTATUS_WORK_FL is 1 if the person works, 0 otherwise.
#
# The three features on the plots above show clear differences in mean target rates, so it could be a good idea to create new variables capturing these interactions.

# In[99]:

data['work_pens'] = 0
data.loc[data['SOCSTATUS_WORK_FL'] == 0, 'work_pens'] = 1
data.loc[(data['SOCSTATUS_WORK_FL'] == 1) & (data['SOCSTATUS_PENS_FL'] == 1), 'work_pens'] = 2
data.loc[(data['SOCSTATUS_WORK_FL'] == 1) & (data['SOCSTATUS_PENS_FL'] == 0), 'work_pens'] = 3

# In[100]:

# Encode each combination of the delinquency flag and the pensioner flag with its own value.
data['pens_dlq'] = 0
data.loc[(data['LOAN_MAX_DLQ'] == 0) & (data['SOCSTATUS_PENS_FL'] == 0), 'pens_dlq'] = 1
data.loc[(data['LOAN_MAX_DLQ'] == 0) & (data['SOCSTATUS_PENS_FL'] == 1), 'pens_dlq'] = 2
data.loc[(data['LOAN_MAX_DLQ'] == '1 or more') & (data['SOCSTATUS_PENS_FL'] == 0), 'pens_dlq'] = 3
data.loc[(data['LOAN_MAX_DLQ'] == '1 or more') & (data['SOCSTATUS_PENS_FL'] == 1), 'pens_dlq'] = 4

# For the next graphs I'll need the data where the continuous variables aren't binned. It is also necessary to label-encode the categorical variables, as sns.pairplot doesn't work well with them.

# In[101]:

le = preprocessing.LabelEncoder()
for col in ['GENDER', 'CHILD_TOTAL', 'DEPENDANTS', 'EDUCATION', 'MARITAL_STATUS', 'GEN_INDUSTRY', 'OWN_AUTO',
            'FAMILY_INCOME', 'LOAN_NUM_TOTAL', 'LOAN_NUM_CLOSED', 'LOAN_DLQ_NUM', 'LOAN_MAX_DLQ']:
    initial_data[col] = initial_data[col].astype('category')
    if initial_data[col].isnull().any():
        initial_data[col].cat.add_categories(['Unknown'], inplace=True)
        initial_data[col].fillna('Unknown', inplace=True)
    initial_data[col] = le.fit_transform(initial_data[col])

# In[102]:

data_viz1 = initial_data[good_cols].drop(['AGREEMENT_RK'], axis=1)

# In[103]:

plt.figure(figsize=(32, 32))
sns.pairplot(data_viz1[['LOAN_AVG_DLQ_AMT', 'LOAN_MAX_DLQ_AMT', 'AGE', 'TARGET', 'FAMILY_INCOME']], hue='TARGET')

# I included only a few variables in this pairplot, but it shows how variables can interact. Sometimes their values create visible clusters with respect to the target, and new variables can be created based on this.
# Another use of the graph is to find correlated features: 'LOAN_AVG_DLQ_AMT' and 'LOAN_MAX_DLQ_AMT' seem to be highly correlated, let's have a look.

# In[104]:

with sns.axes_style("white"):
    sns.jointplot(x=data_viz1['LOAN_AVG_DLQ_AMT'], y=data_viz1['LOAN_MAX_DLQ_AMT'], kind="hex", color="k");

# The Pearson correlation coefficient is 1, so the two features carry essentially the same information. I'll drop one of these columns.

# In[105]:

data.drop(['LOAN_AVG_DLQ_AMT'], axis=1, inplace=True)
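# As a small supplementary check (not part of the original notebook), the correlation suggested by the jointplot can be confirmed numerically, and a heatmap over all unbinned features can reveal other strongly correlated pairs:

# Supplementary check: Pearson correlation of the unbinned delinquency features from the jointplot above.
print(data_viz1[['LOAN_AVG_DLQ_AMT', 'LOAN_MAX_DLQ_AMT']].corr())

# A heatmap over all (label-encoded / numeric) columns of the unbinned data.
plt.figure(figsize=(10, 8))
sns.heatmap(data_viz1.corr(), cmap='coolwarm', center=0)
plt.title('Correlation matrix of unbinned features')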
# Let's try selecting variables based on IV again.

# In[106]:

columns_to_try = [col for col in list(data.columns) if col not in ('AGREEMENT_RK', 'CARD_ID_SB8', 'CARD_NUM', 'TARGET')]
ivs = []
for col in columns_to_try:
    data[col] = data[col].astype('category')
    if data[col].isnull().any():
        print(col)
        if 'Unknown' not in data[col].cat.categories:
            data[col].cat.add_categories(['Unknown'], inplace=True)
        data[col].fillna('Unknown', inplace=True)
    data[col] = data[col].cat.remove_unused_categories()
    _, iv = functions.calc_iv(data, 'TARGET', col)
    ivs.append((col, np.round(iv, 4)))
good_cols = [i[0] for i in sorted(ivs, key=lambda tup: tup[1], reverse=True) if i[1] > 0.02]
for i in ['TARGET', 'AGREEMENT_RK']:
    good_cols.append(i)
good_cols

# One of the newly created features proved to be useful! Now it's time to go further: I'll dummify all the features.

# In[107]:

columns_dummify = [col for col in good_cols if col not in ('TARGET', 'AGREEMENT_RK')]
data = data[good_cols]
for col in columns_dummify:
    data[col] = data[col].astype('category')
    dummies = pd.get_dummies(data[col])
    dummies = dummies.add_prefix('{}_:_'.format(col))
    data.drop([col], axis=1, inplace=True)
    data = data.join(dummies)

# In[108]:

X = data.drop(['TARGET', 'AGREEMENT_RK'], axis=1)
Y = data['TARGET']

# In[109]:

X.shape

# 87 variables could be okay, but I think it is a good idea to reduce their number. There are various ways to select features: greedy algorithms, feature importances and so on. As I'm going to use Logistic Regression, I'll use sklearn's RandomizedLogisticRegression for this.
#
# [RandomizedLogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html) repeatedly fits Logistic Regression on subsamples of the data with randomly perturbed penalties; features that are selected consistently across the runs receive high stability scores.

# In[110]:

randomized_logistic = linear_model.RandomizedLogisticRegression(C=0.1, selection_threshold=0.5, n_resampling=50,
                                                                normalize=False)
X_train_log = randomized_logistic.fit_transform(X=X, y=Y)
randomized_logistic.get_support()

# In[111]:

X_train_log.shape
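# A note for reproducibility: RandomizedLogisticRegression was deprecated and later removed from scikit-learn, so the cell above only runs on older versions. A rough substitute for the same stability-selection idea (a sketch, not an exact replacement; C, n_resampling and the 0.5 threshold simply mirror the parameters used above) is to refit an L1-penalised LogisticRegression on random subsamples and keep the features selected in at least half of the runs:

# Sketch of stability selection with L1-penalised logistic regression, as a stand-in for the
# removed sklearn.linear_model.RandomizedLogisticRegression. Parameter values are illustrative.
def stability_selection(X, y, n_resampling=50, sample_fraction=0.75, C=0.1, threshold=0.5, random_state=0):
    rng = np.random.RandomState(random_state)
    counts = np.zeros(X.shape[1])
    for _ in range(n_resampling):
        idx = rng.choice(len(X), size=int(sample_fraction * len(X)), replace=False)
        clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear', C=C,
                                              class_weight='balanced')
        clf.fit(X.iloc[idx], y.iloc[idx])
        counts += (clf.coef_[0] != 0)  # count how often each feature keeps a non-zero coefficient
    return counts / n_resampling >= threshold

# Hypothetical usage on recent scikit-learn versions:
# support = stability_selection(X, Y)
# X_train_log = X.loc[:, support].values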
# 36 of the 87 features were selected. Now it's time for the model. I split the data into train, validation and test sets. LogisticRegressionCV is used to choose an optimal regularization strength.

# In[112]:

X_train, X_test, y_train, y_test = train_test_split(X_train_log, Y, test_size=0.2, stratify=Y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

logreg = linear_model.LogisticRegressionCV(class_weight='balanced', n_jobs=-1, fit_intercept=True)
logreg.fit(X_train, y_train)

y_pred_log_val = logreg.predict_proba(X_val)
y_pred_log_val_1 = [i[1] for i in y_pred_log_val]
fpr_val, tpr_val, thresholds_val = roc_curve(y_val, y_pred_log_val_1)
plt.plot(fpr_val, tpr_val, label='Validation')
scores_val = cross_val_score(logreg, X_val, y_val, cv=5, scoring='roc_auc')

y_pred_log_test = logreg.predict_proba(X_test)
y_pred_log_test_1 = [i[1] for i in y_pred_log_test]
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_pred_log_test_1)
plt.plot(fpr_test, tpr_test, label='Test')
scores_test = cross_val_score(logreg, X_test, y_test, cv=5, scoring='roc_auc')

plt.title('ROC AUC curve')
plt.legend(loc='lower right')

# In[113]:

print('Validation auc: ', np.round(auc(fpr_val, tpr_val), 4))
print('Cross-validation: mean value is {0} with std {1}.'.format(np.round(np.mean(scores_val), 4),
                                                                 np.round(np.std(scores_val), 4)))
print('Test auc: ', np.round(auc(fpr_test, tpr_test), 4))
print('Cross-validation: mean value is {0} with std {1}.'.format(np.round(np.mean(scores_test), 4),
                                                                 np.round(np.std(scores_test), 4)))

# In[114]:

coefs = pd.DataFrame(list(zip(X[X.columns[randomized_logistic.get_support()]].columns, logreg.coef_[0])),
                     columns=['Feature', 'Coefficient'])
coefs

# And here we can see how each category influenced the result.
# So, this is it. The score is quite high; the AUC on the real (competition) test set would likely be lower, but hopefully not by much. There are many ways to enhance the model, of course:
# - transform the variables with more care - maybe change the DecisionTreeClassifier parameters for specific variables to create better bins;
# - fill missing values with something else;
# - treat the outliers instead of dropping the rows containing them;
# - create more variables based on feature interactions;
# - try different thresholds for feature selection.
#
# And if interpretability isn't required, the continuous variables can be used without binning, perhaps transformed or scaled in some way, and more sophisticated algorithms such as XGBoost can be used.
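# Finally, to tie this back to the cross-selling scenario from the introduction: in practice the predicted probabilities would be compared against a threshold to decide which clients receive the offer. A minimal sketch (the 0.5 threshold is an arbitrary placeholder and would normally be tuned against campaign costs and expected response value):

# Sketch of applying the model for campaign targeting; the threshold here is only an example.
probabilities = logreg.predict_proba(X_test)[:, 1]
threshold = 0.5
contact_mask = probabilities > threshold
print('Clients selected for the offer: {} of {}'.format(contact_mask.sum(), len(contact_mask)))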