#!/usr/bin/env python
# coding: utf-8
#
# # Predict Blood Donations
#
# The objective of this notebook is to predict whether a blood donor will donate within a given time window, given features such as months since last donation (recency), number of donations made (frequency), total volume of blood donated in cc (monetary), and months since first donation (time).
#
# This is a warm-up problem in an ongoing competition at https://www.drivendata.org/competitions/2/warm-up-predict-blood-donations/page/7/

# In[387]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

from imblearn import over_sampling as os
from imblearn import under_sampling as us
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTETomek

from collections import Counter

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

RANDOM_STATE = 2017

get_ipython().run_line_magic('matplotlib', 'inline')

# In[214]:

from jupyterthemes import jtplot
jtplot.style('grade3', context='poster', fscale=1.5)
jtplot.style(ticks=True, grid=False)

# In[405]:

df = pd.read_csv("transfusion.data.txt")

# In[406]:

df.head()

# In[415]:

df.columns = ['months_since_last_donation', 'num_donations', 'vol_donations',
              'months_since_first_donation', 'class']

# In[416]:

df.head()
#
# class 1 => the donor donated blood in March 2007 [let's call them donors]
# class 0 => the donor did not donate blood in March 2007 [let's call them non-donors]
#
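# A quick sanity check of dtypes and summary statistics (my addition; purely descriptive):

# In[ ]:

# Basic structure and summary statistics of the raw data
df.info()
df.describe()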
#
# **Number of instances of each class**
#
# In[417]:

df['class'].value_counts()
#
# We see that there is a class imbalance problem here. We can address it by oversampling the minority class (1) or undersampling the majority class (0), as sketched below.
#
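# A minimal sketch of what that would look like (my addition, illustration only; the modelling section further down applies SMOTE + Tomek links to the scaled training split instead), using the imblearn samplers already imported above:

# In[ ]:

# Illustrative only: rebalance the raw feature matrix with random over-/under-sampling.
# Note: older imblearn versions expose fit_sample(); newer ones call it fit_resample().
X_raw, y_raw = df.drop('class', axis=1), df['class']

ros = os.RandomOverSampler(random_state=RANDOM_STATE)
X_over, y_over = ros.fit_sample(X_raw, y_raw)

rus = us.RandomUnderSampler(random_state=RANDOM_STATE)
X_under, y_under = rus.fit_sample(X_raw, y_raw)

print(Counter(y_raw), Counter(y_over), Counter(y_under))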
#
# ---
# ## EDA
# ---
#
#
# Let's explore the dataset and see whether we can find some interesting insights.
# In[418]:

def ecdf(data):
    """Return the empirical CDF (x, y) of a 1-D sequence of data."""
    n = len(data)
    x = np.array(sorted(data))
    y = np.arange(1, n + 1) / (n * 1.0)
    return x, y

# In[419]:

plt.figure(figsize=(15, 10))
_ = sns.kdeplot(df[df['class'] == 0]['months_since_last_donation'], cumulative=True, label='0')
_ = sns.kdeplot(df[df['class'] == 1]['months_since_last_donation'], cumulative=True, label='1')
_ = plt.xlabel('months since last donation')
_ = plt.ylabel('CDF')
_ = plt.title('CDF of recency of donors vs non-donors')
plt.show()
#
# People who are likely to donate blood again have donated more recently: the CDF of donors (class 1) rises faster at low recency values than the CDF of non-donors (class 0).
#
# In[413]:

plt.figure(figsize=(15, 10))
_ = sns.kdeplot(df[df['class'] == 0]['num_donations'], cumulative=True, label='0')
_ = sns.kdeplot(df[df['class'] == 1]['num_donations'], cumulative=True, label='1')
_ = plt.xlabel('number of donations')
_ = plt.ylabel('CDF')
_ = plt.title('CDF of frequency of donors vs non-donors')
plt.show()
#
# From the above plot, we observe that donors donate more frequently than non-donors.
#
# In[420]:

plt.figure(figsize=(15, 10))
_ = sns.kdeplot(df[df['class'] == 0]['vol_donations'], cumulative=True, label='0')
_ = sns.kdeplot(df[df['class'] == 1]['vol_donations'], cumulative=True, label='1')
_ = plt.xlabel('volume of blood donated')
_ = plt.ylabel('CDF')
_ = plt.title('CDF of monetary value of donors vs non-donors')
plt.show()
#
# This CDF looks similar to the previous one: frequency and monetary value are highly correlated. Here again, donors donate a larger volume of blood than non-donors.
#
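# Before plotting, a quick numeric check of that claim (my addition): the Pearson correlation between frequency and monetary value.

# In[ ]:

# Pearson correlation between number of donations and volume donated
df[['num_donations', 'vol_donations']].corr()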
# In[421]:

plt.figure(figsize=(15, 10))
_ = sns.lmplot(x='num_donations', y='vol_donations', hue='class', fit_reg=False, data=df)
_ = plt.title("Correlation between frequency and monetary")
plt.show()

# In[423]:

plt.figure(figsize=(15, 10))
_ = sns.kdeplot(df[df['class'] == 0]['months_since_first_donation'], cumulative=True, label='0')
_ = sns.kdeplot(df[df['class'] == 1]['months_since_first_donation'], cumulative=True, label='1')
_ = plt.xlabel('months since first donation')
_ = plt.ylabel('CDF')
_ = plt.title('CDF of months since first donation of donors vs non-donors')
plt.show()
#
# The CDF of months since first donation does not differ much between the two classes, so this feature is less discriminative on its own.
#
# In[425]:

df_long = pd.melt(df, id_vars='class', var_name='feature')
plt.figure(figsize=(15, 10))
_ = sns.boxplot(x='feature', y='value', hue='class', data=df_long[df_long.feature != 'vol_donations'])
plt.margins(0.02)
_ = plt.title('Boxplot of recency, frequency and time')
plt.show()
#
# We see a few outliers in recency and frequency for each of the classes. Should we remove them? What's the story behind these outliers? One way to flag them is sketched below.
#
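# A minimal sketch of flagging them with the 1.5 * IQR rule (my addition; nothing is dropped here, this just surfaces the rows worth a closer look):

# In[ ]:

# Flag rows whose recency or frequency falls outside the 1.5*IQR whiskers
def iqr_outliers(series):
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)

outlier_mask = iqr_outliers(df['months_since_last_donation']) | iqr_outliers(df['num_donations'])
df[outlier_mask].head(10)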
# ---
# ## Prepare Data
# ---

# In[227]:

from sklearn.model_selection import train_test_split
#
# Let's split our dataset into training and test sets. We'll drop the 'monetary' feature (vol_donations) since it is highly correlated with 'frequency' (num_donations). It is also good practice to standardise the data.
#
# In[426]:

x_train, x_test, y_train, y_test = train_test_split(df.drop(['class', 'vol_donations'], axis=1),
                                                     df['class'],
                                                     test_size=0.1,
                                                     random_state=2017)

# In[427]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# In[428]:

x_train_scaled.mean(), x_train_scaled.std()

# In[429]:

x_test_scaled.mean(), x_test_scaled.std()

# ---
# ## Resampling and Visualizations
# ---

# In[430]:

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2, random_state=RANDOM_STATE)

# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(x_train_scaled)
pca_original_df = pd.DataFrame(X_vis, y_train).reset_index()
pca_original_df.columns = ['class', 'x', 'y']

# Apply SMOTE + Tomek links
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(x_train_scaled, y_train)
X_res_vis = pca.transform(X_resampled)
pca_resampled_df = pd.DataFrame(X_res_vis, y_resampled).reset_index()
pca_resampled_df.columns = ['class', 'x', 'y']
#
# **Visualizing original data in 2D**
#
# In[431]:

_ = sns.lmplot(x='x', y='y', hue='class', x_jitter=True, y_jitter=True,
               fit_reg=False, size=7, aspect=1.5,
               data=pca_original_df, scatter_kws={'alpha': 0.5})
#
# **Visualizing resampled data in 2D**
#
# In[432]:

_ = sns.lmplot(x='x', y='y', hue='class', x_jitter=True, y_jitter=True,
               fit_reg=False, size=7, aspect=1.5,
               data=pca_resampled_df, scatter_kws={'alpha': 0.5})
#
#
# I have tried building models on the resampled dataset, but the performance did not improve. Let's try some feature engineering on the dataset instead.
#
# In[433]:

df.head()
#
# **Submission**
#
# In[434]:

def create_submission(clf):
    test = pd.read_csv("test.csv")
    test.columns = ['id', 'months_since_last_donation', 'num_donations',
                    'vol_donations', 'months_since_first_donation']
    submit_id, submit_test = test['id'], test.drop(['id', 'vol_donations'], axis=1)
    submit_test_scaled = scaler.transform(submit_test)   # scale the data
    predictions = clf.predict_proba(submit_test_scaled)
    predictions = predictions[:, 1]   # only predictions for class 1 need to be submitted
    pred_report = pd.DataFrame(predictions.tolist(), index=submit_id,
                               columns=["Made Donation in March 2007"])
    return pred_report

# ---
# ## Utility Functions
# ---

# In[25]:

import itertools

def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Accent):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.figure(figsize=(10, 7))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    return

def prediction_report(true_label, predicted_label, classes=[0, 1]):
    report = classification_report(true_label, predicted_label)
    print("classification report:\n", report)
    cnf_matrix = confusion_matrix(true_label, predicted_label)
    np.set_printoptions(precision=2)
    # Plot non-normalized confusion matrix
    plt.figure(figsize=(15, 10))
    plot_confusion_matrix(cnf_matrix, classes=classes,
                          title='Confusion matrix, without normalization')
    return

# ---
# ## Models
# ---

# In[381]:

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from keras.wrappers.scikit_learn import KerasClassifier

np.random.seed(2017)
early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=5)

def create_mlp():
    model = Sequential()
    model.add(Dropout(0.25, input_shape=(x_train_scaled.shape[1],)))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(2, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

svc = SVC(verbose=1, C=10, gamma=0.1, kernel='rbf', random_state=2017, probability=True)
svc.fit(x_train_scaled, y_train)

logit_model = LogisticRegression(random_state=2017)
logit_model.fit(x_train_scaled, y_train)

rf = RandomForestClassifier(n_estimators=100, max_depth=6, class_weight='balanced',
                            n_jobs=4, random_state=2017)
rf.fit(x_train_scaled, y_train)

xgb = XGBClassifier(learning_rate=0.03, max_depth=5, n_estimators=250, reg_alpha=0.01)
xgb.fit(x_train_scaled, y_train)

mlp = KerasClassifier(build_fn=create_mlp, epochs=20, batch_size=10,
                      validation_split=0.2, verbose=False)

models = {'SVC': svc,
          'Logistic Regression': logit_model,
          'Random Forest': rf,
          'XGBoost': xgb,
          'mlp': mlp}
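# A hedged usage sketch of create_submission (my addition): writing a submission file with, say, the fitted XGBoost model from above. The file name is arbitrary.

# In[ ]:

# Generate and save predictions for the competition's test set
submission = create_submission(xgb)
submission.to_csv("xgb_submission.csv")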
#
# Grid search for the best parameters of our models
#
# In[327]:

from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'),
              'C': [1, 10, 100, 1000],
              'gamma': [1e-1, 1e-2, 1e-3, 1, 10, 100]}
svc_grid = SVC(probability=True, verbose=True)
clf = GridSearchCV(svc_grid, parameters, scoring='neg_log_loss', verbose=True, n_jobs=4)
clf.fit(x_train_scaled, y_train)

# In[342]:

pd.DataFrame(clf.cv_results_)[['mean_test_score', 'mean_train_score', 'param_C',
                               'param_gamma', 'param_kernel',
                               'rank_test_score']].sort_values('rank_test_score')[:5]
#
# **Cross-validation**
#
# In[386]:

np.random.seed(2017)
print("Cross-validation score")
print("=========================")
for model_name, model in models.items():
    if model_name == 'mlp':
        cv_score = cross_val_score(estimator=model, X=x_train_scaled, y=to_categorical(y_train),
                                   scoring='neg_log_loss', cv=10, n_jobs=4, verbose=False,
                                   fit_params={'callbacks': [early_stopping_monitor]})
    else:
        cv_score = cross_val_score(estimator=model, X=x_train_scaled, y=y_train,
                                   cv=10, n_jobs=4, scoring='neg_log_loss', verbose=False)
    print(model_name, " ", -cv_score.mean(), "+/-", cv_score.std())
#
# **Test the models on the test set**
#
# In[402]:

np.random.seed(2017)
print("Log loss")
print("==============")
for model_name, model in models.items():
    if model_name == 'mlp':
        mlp.fit(x_train_scaled, to_categorical(y_train))   # the Keras wrapper needs to be fit before predicting
    y_pred = model.predict_proba(x_test_scaled)
    print(model_name, ": ", log_loss(y_test, y_pred))
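# The prediction_report helper defined earlier isn't invoked above; a minimal usage sketch with the fitted random forest (my addition):

# In[ ]:

# Classification report and confusion matrix for hard predictions on the test set
rf_test_predictions = rf.predict(x_test_scaled)
prediction_report(y_test, rf_test_predictions, classes=[0, 1])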
#
# ## Feature Engineering
#
# In[435]:

df.head()

# In[436]:

df['num_donations_per_month'] = df['num_donations'] * 1.0 / df['months_since_first_donation']

# In[437]:

df['months_between_first_n_last_donation'] = df['months_since_first_donation'] - df['months_since_last_donation']

# In[438]:

df.head()

# In[448]:

np.corrcoef(df.months_since_first_donation, df.months_between_first_n_last_donation)

# In[444]:

corr = df.drop('class', axis=1).corr()
plt.figure(figsize=(15, 10))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True)
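#
# A natural next step (my sketch; not run in the original notebook) would be to rebuild the train/test split with the engineered features, drop the redundant columns flagged by the correlation checks above, rescale, and re-run the cross-validation loop.

# In[ ]:

# Hypothetical re-split with engineered features; the column choices here are my assumption
feature_cols = ['months_since_last_donation', 'num_donations',
                'months_since_first_donation', 'num_donations_per_month']
x_train2, x_test2, y_train2, y_test2 = train_test_split(df[feature_cols], df['class'],
                                                         test_size=0.1, random_state=2017)
scaler2 = StandardScaler()
x_train2_scaled = scaler2.fit_transform(x_train2)
x_test2_scaled = scaler2.transform(x_test2)

cv_score2 = cross_val_score(LogisticRegression(random_state=2017), x_train2_scaled, y_train2,
                            cv=10, scoring='neg_log_loss')
print("logistic regression log loss with engineered features:", -cv_score2.mean())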