#!/usr/bin/env python
# coding: utf-8

# Created on Fri Oct 4 14:32:45 2019
#
# @author: dezsoribli

# 1. Load hurricane data from the article "Hurricane-induced selection on the
#    morphology of an island lizard".
#    https://www.nature.com/articles/s41586-018-0352-3
#    - A, Drop the lizard with the most missing values.
#    - B, Drop the ID column.
#    - C, Encode the Sex, Origin and Hurricane values into binary columns,
#         and drop the original text columns.
#    - D, Make sure all your columns are encoded as floating point values,
#         not unsigned integers!
#
# 2. Use logistic regression from the statsmodels package to predict whether
#    the lizard was measured after or before the hurricane, using the whole dataset.
#    - A, Investigate the Toe and Finger area coefficients; what's going on?
#         Fix this problem by only keeping the mean measurements.
#    - B, Which measured quality had the most significant positive effect on survival?
#    - C, Which measured quality had the most significant negative effect on survival?
#    - D, Try to explain the results in your own words. Check the abstract of the paper.
#    - E, Repeat the fit after scaling each input column to 0 mean and 1 variance.
#         Have the coefficients changed? Have the predictions changed?
#
# 3. Repeat the fit with scikit-learn on the unnormalized dataset.
#    - A, Compare the coefficients with the ones you got from statsmodels.
#         Are they the same? If not, try to answer why.
#    - B, Try to tweak the parameters of the scikit-learn method to reproduce
#         the coefficients produced by statsmodels.
#    - C, Plot the ROC curve for the full dataset, and calculate the AUC.
#    - D, Repeat the fit after scaling each input column to 0 mean and 1 variance.
#         Have the coefficients changed? Have the predictions changed?
#
# 4. Split the dataset into 5 folds and predict each fold by training on the other 4.
#    - A, Make sure to fix the seed of the splitting to 0 to make it reproducible.
#    - B, Plot the ROC for the 5 folds separately as curves on the same plot.
#    - C, Calculate the AUC values for the 5 folds separately.


# In[1]:

# %pylab star-imports numpy and matplotlib into the notebook namespace,
# providing `mpl`, `np`, `figsize`, `plot`, `legend`, ... used below.
get_ipython().run_line_magic('pylab', 'inline')
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve
import statsmodels.api as sm

figsize(8, 8)
mpl.rcParams['font.size'] = 16


# In[2]:

# Load the hurricane dataset (CSV renamed locally from the article's supplement).
data = pd.read_csv('hurricane.csv')

# 1/A: drop the lizard (row) with the most missing values.
# The original hard-coded index 39; find that row programmatically instead,
# so the step stays correct if the CSV row order ever changes.
# NOTE(review): assumes a unique worst row — idxmax picks the first on ties.
data = data.drop(data.isnull().sum(axis=1).idxmax())

# 1/C + 1/D: encode the text columns into binary *float* columns
# (statsmodels chokes on unsigned-integer dummies).
data['Female'] = pd.get_dummies(data.Sex)['Female'].astype('float')
data['Pine Cay'] = pd.get_dummies(data.Origin)['Pine Cay'].astype('float')
data['After hurricane'] = pd.get_dummies(data.Hurricane)['After'].astype('float')

# 1/B: only keep numerical data — drop the ID and the original text columns.
data_num = data.drop(columns=['ID', 'Sex', 'Origin', 'Hurricane'])
# Drop aggregate columns that were derived rather than directly measured.
data_num = data_num.drop(columns=['SumFingers', 'SumToes', 'MaxFingerForce'])
# 2/A: the individual toe/finger pad areas are nearly collinear with their
# means, which makes the logit coefficients unstable — keep only the means.
data_num = data_num.drop(columns=['FingerArea1', 'FingerArea2', 'FingerArea3',
                                  'ToeArea1', 'ToeArea2', 'ToeArea3'])

# %%
# Intercept column for statsmodels (sklearn adds its own automatically).
data_num = sm.add_constant(data_num)


# In[4]:

# %%
# 2: logistic regression predicting 'After hurricane' from all kept measurements.
logit = sm.Logit(data_num['After hurricane'],
                 data_num.drop(columns=['After hurricane']))
result = logit.fit()
print(result.summary())
pred1 = result.predict(data_num.drop(columns=['After hurricane']))


# In[5]:

# %% 2/E: repeat the fit on inputs scaled to 0 mean / 1 variance.
X = StandardScaler().fit_transform(
    data_num.drop(columns=['After hurricane', 'const']))
logit = sm.Logit(data_num['After hurricane'].values.astype('float'),
                 sm.add_constant(X))
result = logit.fit()
print(result.summary())
pred2 = result.predict(sm.add_constant(X))


# In[6]:

# %% The coefficients change under scaling, but the (unregularized)
# predictions do not — these differences are numerically ~0.
pred1 - pred2


# In[7]:

# %% 3: same fit with scikit-learn. It applies L2 regularization by default,
# so its coefficients differ from the statsmodels ones.
cls = LogisticRegression()
cls.fit(data_num.drop(columns=['After hurricane', 'const']),
        data_num['After hurricane'])
print(cls.coef_)
pred1 = cls.predict_proba(
    data_num.drop(columns=['After hurricane', 'const']))[:, 1]

# %% 3/D: scikit-learn fit on the standardized inputs.
cls = LogisticRegression()
cls.fit(sm.add_constant(X), data_num['After hurricane'])
print(cls.coef_)
pred2 = cls.predict_proba(sm.add_constant(X))[:, 1]

# %% With regularization the scaled and unscaled fits are no longer
# equivalent, so the predictions now differ.
pred1 - pred2


# In[12]:

# %% 3/B: a huge C (inverse regularization strength) effectively disables the
# L2 penalty and reproduces the unregularized statsmodels coefficients.
cls = LogisticRegression(C=1e26)
cls.fit(data_num.drop(columns=['After hurricane', 'const']),
        data_num['After hurricane'])
print(cls.coef_)
pred1 = cls.predict_proba(
    data_num.drop(columns=['After hurricane', 'const']))[:, 1]

# %% 3/C: ROC curve and AUC on the full dataset.
fpr, tpr, _ = roc_curve(data_num['After hurricane'], pred1)
plot(fpr, tpr,
     label='AUC=%.3f' % roc_auc_score(data_num['After hurricane'], pred1))
legend()


# In[13]:

# %% 4: predict each of 5 folds by training on the other 4.
X = data_num.drop(columns=['After hurricane']).values
y = data_num['After hurricane'].values

# 4/A: fix the seed of the splitter itself via random_state=0 instead of
# relying on np.random.seed(0) mutating the global RNG that KFold implicitly
# consumes — any other numpy call in between would silently change the folds.
# (check_random_state(0) yields the same MT19937 state, so the splits are
# identical to the original's.)
cv = KFold(5, shuffle=True, random_state=0)
for train_idx, test_idx in cv.split(X, y):
    logit = sm.Logit(y[train_idx], X[train_idx])
    result = logit.fit()
    p = result.predict(X[test_idx])
    # 4/B + 4/C: one ROC curve and one AUC per fold, all on the same plot.
    fpr, tpr, _ = roc_curve(y[test_idx], p)
    plot(fpr, tpr, label='AUC=%.3f' % roc_auc_score(y[test_idx], p))
legend()