Created on Fri Oct 4 14:32:45 2019
@author: dezsoribli
https://www.nature.com/articles/s41586-018-0352-3 - A, Drop the lizard with the most missing values - B, Drop the ID column - C, Encode the Sex, Origin and Hurricane values into binary columns, and drop the original text columns. - D, Make sure all your columns are encoded as floating point values, not unsigned integers!
Use logistic regression from the statsmodels package to predict whether the lizard was measured after or before the hurricane, using the whole dataset
Fix this problem by only keeping the mean measurements.
Have the coefficients changed? Have the predictions changed?
Repeat the fit with scikit-learn on the unnormalized dataset.
Are they the same? If not, try to explain why.
the coefficients produced by statsmodels.
Have the coefficients changed? Have the predictions changed?
Split the dataset into 5 folds and predict each fold by training on the other 4.
%pylab inline
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.api as sm
figsize(8,8)
mpl.rcParams['font.size']=16
Populating the interactive namespace from numpy and matplotlib
data = pd.read_csv('hurricane.csv') # renamed it
data = data.drop([39])
data['Female'] = pd.get_dummies(data.Sex)['Female'].astype('float')
data['Pine Cay'] = pd.get_dummies(data.Origin)['Pine Cay'].astype('float')
data['After hurricane'] = pd.get_dummies(data.Hurricane)['After'].astype('float')
# only keep numercal data
data_num = data.drop(columns=['ID','Sex','Origin','Hurricane'])
# drop which was not measured
data_num = data_num.drop(columns=['SumFingers','SumToes','MaxFingerForce'])
data_num = data_num.drop(columns=['FingerArea1','FingerArea2','FingerArea3',
'ToeArea1','ToeArea2','ToeArea3'])
#data_num = data_num.drop(columns=['FingerCount','ToeCount'])
#
#data_num = data_num.drop(columns=['MeanFingerArea','MeanToeArea'])
#data_num = data_num.drop(columns=['Female','Metatarsal','SVL','Tibia'])
# %%
data_num = sm.add_constant(data_num)
/opt/conda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:52: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return getattr(obj, method)(*args, **kwds)
# %%
logit = sm.Logit(data_num['After hurricane'],
data_num.drop(columns=['After hurricane']))
result = logit.fit()
print(result.summary())
pred1=result.predict(data_num.drop(columns=['After hurricane']))
Optimization terminated successfully. Current function value: 0.256168 Iterations 8 Logit Regression Results ============================================================================== Dep. Variable: After hurricane No. Observations: 163 Model: Logit Df Residuals: 147 Method: MLE Df Model: 15 Date: Fri, 04 Oct 2019 Pseudo R-squ.: 0.6259 Time: 15:41:55 Log-Likelihood: -41.755 converged: True LL-Null: -111.63 LLR p-value: 2.588e-22 ================================================================================== coef std err z P>|z| [0.025 0.975] ---------------------------------------------------------------------------------- const 46.4732 11.204 4.148 0.000 24.514 68.433 SVL -0.4982 0.263 -1.894 0.058 -1.014 0.017 Femur -2.5733 0.723 -3.557 0.000 -3.991 -1.156 Tibia -1.1555 1.114 -1.037 0.300 -3.339 1.028 Metatarsal 0.8029 1.514 0.530 0.596 -2.165 3.771 LongestToe -3.3138 0.917 -3.614 0.000 -5.111 -1.517 Humerus 2.2870 0.846 2.704 0.007 0.629 3.945 Radius -1.1332 1.718 -0.660 0.509 -4.500 2.234 Metacarpal 1.0017 1.242 0.807 0.420 -1.432 3.435 LongestFinger -0.2999 1.280 -0.234 0.815 -2.810 2.210 FingerCount 1.5508 0.536 2.893 0.004 0.500 2.602 ToeCount -0.9490 0.468 -2.028 0.043 -1.866 -0.032 MeanFingerArea 5.4238 2.153 2.519 0.012 1.204 9.643 MeanToeArea 5.3069 1.827 2.905 0.004 1.726 8.888 Female -4.1135 1.528 -2.693 0.007 -7.108 -1.119 Pine Cay 1.6539 0.715 2.312 0.021 0.252 3.056 ==================================================================================
# %% ???
X = StandardScaler().fit_transform(data_num.drop(columns=['After hurricane','const']))
logit = sm.Logit(data_num['After hurricane'].values.astype('float'),
sm.add_constant(X))
result = logit.fit()
print (result.summary())
pred2=result.predict(sm.add_constant(X))
Optimization terminated successfully. Current function value: 0.256168 Iterations 8 Logit Regression Results ============================================================================== Dep. Variable: y No. Observations: 163 Model: Logit Df Residuals: 147 Method: MLE Df Model: 15 Date: Fri, 04 Oct 2019 Pseudo R-squ.: 0.6259 Time: 15:41:58 Log-Likelihood: -41.755 converged: True LL-Null: -111.63 LLR p-value: 2.588e-22 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const 0.7351 0.324 2.267 0.023 0.100 1.371 x1 -3.1191 1.647 -1.894 0.058 -6.347 0.109 x2 -4.4528 1.252 -3.557 0.000 -6.906 -1.999 x3 -1.9250 1.856 -1.037 0.300 -5.562 1.712 x4 0.7944 1.498 0.530 0.596 -2.142 3.731 x5 -3.4991 0.968 -3.614 0.000 -5.397 -1.602 x6 3.1678 1.171 2.704 0.007 0.872 5.464 x7 -1.2024 1.823 -0.660 0.509 -4.775 2.370 x8 0.3932 0.487 0.807 0.420 -0.562 1.348 x9 -0.1784 0.762 -0.234 0.815 -1.672 1.315 x10 1.6952 0.586 2.893 0.004 0.547 2.844 x11 -1.2302 0.606 -2.028 0.043 -2.419 -0.041 x12 2.9580 1.174 2.519 0.012 0.657 5.259 x13 4.7231 1.626 2.905 0.004 1.536 7.910 x14 -2.0427 0.759 -2.693 0.007 -3.530 -0.556 x15 0.8262 0.357 2.312 0.021 0.126 1.526 ==============================================================================
# %%
pred1-pred2
0 7.771561e-15 1 1.110223e-15 2 0.000000e+00 3 0.000000e+00 4 2.886580e-15 5 5.551115e-15 6 2.609024e-15 7 0.000000e+00 8 3.441691e-15 9 0.000000e+00 10 1.110223e-15 11 2.220446e-16 12 1.110223e-15 13 -3.663736e-15 14 7.771561e-16 15 3.330669e-15 16 6.661338e-16 17 4.440892e-16 18 1.554312e-15 19 9.992007e-16 20 2.220446e-16 21 2.220446e-16 22 9.992007e-16 23 2.220446e-16 24 6.661338e-15 25 2.220446e-16 26 -2.220446e-15 27 2.997602e-15 28 2.220446e-16 29 4.440892e-16 ... 134 -4.274359e-15 135 -4.874573e-16 136 -8.586881e-17 137 -2.484124e-15 138 -1.491862e-16 139 -8.361367e-16 140 4.662937e-15 141 5.995204e-15 142 1.276756e-15 143 -1.110223e-16 144 -1.762479e-15 145 -2.144118e-15 146 1.110223e-15 147 -8.153200e-16 148 -1.706968e-15 149 5.384582e-15 150 1.942890e-16 151 -1.179612e-15 152 3.885781e-15 153 -4.510281e-16 154 -3.642919e-16 155 9.020562e-17 156 -8.337515e-17 157 1.942890e-16 158 -1.179612e-15 159 -2.275957e-15 160 1.387779e-15 161 8.187895e-16 162 -1.110223e-16 163 -8.812395e-16 Length: 163, dtype: float64
# %%
cls = LogisticRegression()
cls.fit(data_num.drop(columns=['After hurricane','const']),
data_num['After hurricane'])
print (cls.coef_)
pred1 = cls.predict_proba(data_num.drop(columns=['After hurricane','const']))[:,1]
# %%
cls = LogisticRegression()
cls.fit(sm.add_constant(X),
data_num['After hurricane'])
print (cls.coef_)
pred2 = cls.predict_proba(sm.add_constant(X))[:,1]
# %%
pred1-pred2
[[ 0.04554114 -1.7823243 -0.30337529 -0.06108083 -1.70506902 1.39230212 0.2203313 0.39376842 0.07844526 1.09892878 -0.02088438 1.18629206 1.33274758 -0.34572881 0.7630456 ]] [[ 0.22234396 -0.94557182 -2.42393522 -0.7036805 -0.12256229 -1.89094196 1.41110982 -0.32962143 0.24677151 -0.07331174 0.9290967 -0.37626081 1.6842705 1.73926583 -1.01235636 0.45617172]]
array([-2.85954451e-01, -7.09270087e-02, 2.52931727e-02, -3.75112573e-02, -1.82868902e-01, 1.22361822e-01, -1.50453050e-01, -1.73057026e-02, 1.63652417e-02, -4.94673199e-03, -4.76133539e-02, -7.45871174e-02, 3.71719032e-02, 7.37748272e-02, 1.42450982e-02, -4.13348974e-02, -7.55508641e-03, -7.41726008e-03, -1.49086939e-02, -1.84649655e-01, 5.52245766e-03, -3.44212123e-02, 1.62892696e-02, -6.35571092e-02, -1.45604870e-01, 5.98540199e-03, -1.81378497e-02, -5.48721913e-02, 5.13643126e-02, 2.92396130e-03, 7.37866577e-04, -8.07552701e-02, -1.08030644e-02, -1.08872743e-01, 4.59388407e-02, -2.43881395e-02, 6.87644881e-03, -2.11623133e-02, -7.75186961e-02, -3.38727951e-01, -2.86183583e-02, -2.22651867e-01, 2.24155232e-02, 4.58415270e-03, -1.91523133e-01, -6.99155595e-02, 1.28553775e-02, -1.05525038e-01, -3.51705014e-02, -3.87082513e-03, 5.69454174e-02, 5.72313675e-01, 1.05548107e-01, -1.20334502e-01, -3.69877277e-02, 5.06248915e-02, -3.84380256e-02, -9.88710820e-02, -1.33746066e-01, -5.90592508e-02, 1.02610137e-01, 1.25463655e-01, 1.34944972e-01, 3.14467176e-02, -4.36721034e-03, -8.86148267e-03, 6.81541575e-02, 4.15408665e-02, -5.50043401e-04, 3.29535918e-02, -5.44593458e-03, -8.34609281e-02, 1.38843867e-03, -2.55038949e-01, -3.47943015e-02, -2.01995303e-03, -1.31039949e-02, 1.62094362e-02, -3.60741982e-02, 4.37611807e-02, 1.38785864e-03, -2.95167509e-02, 7.62205743e-02, 2.30056790e-02, 6.32798575e-02, -1.23848837e-01, 2.23321476e-02, 9.54246185e-03, -8.82850129e-03, -3.15829305e-02, -4.82295231e-03, 1.21588387e-03, 2.53952544e-02, 9.10338709e-02, 7.65128525e-02, 3.61167833e-02, 1.44186831e-01, -1.18203561e-02, 1.24079508e-01, 2.44486443e-01, -3.52003302e-02, -7.52588020e-02, -2.38736482e-02, 1.92849359e-01, 3.61035508e-02, 4.67047087e-02, -7.70786751e-03, 1.94409045e-01, 1.09520993e-02, 1.60127413e-01, 3.91685079e-02, 1.81068551e-01, -4.48520019e-02, -4.06360250e-02, 6.30416931e-02, 1.21917300e-01, 8.71965110e-03, -1.83710758e-02, -4.01640593e-02, -2.40315956e-02, 
7.38867704e-02, 4.34943102e-02, 1.26695194e-01, -6.88656728e-02, 1.80926449e-02, -2.52644581e-02, -4.01952768e-01, 1.02597272e-01, -2.54281687e-02, 2.80492870e-02, -5.91302178e-02, 9.37796339e-03, 2.24533202e-01, 1.80117157e-01, -1.67296287e-02, -2.50218577e-02, -7.97443395e-02, 4.75757576e-02, 1.75027906e-02, -8.57089345e-02, 6.00440838e-02, -8.39154470e-02, -2.33856384e-02, -3.51504632e-02, 3.12975733e-01, 1.20293119e-01, -1.52068053e-02, -5.90593632e-02, -1.56078165e-01, -8.71987743e-02, -6.48835278e-02, -5.15719769e-02, -4.64757255e-02, 1.03821659e-01, 1.22298260e-01, 8.60908120e-02, -1.00220379e-01, -3.68931373e-02, -9.03496047e-02, -6.44586586e-02, 1.36674815e-01, -8.18798700e-02, -1.99549748e-02])
# %%
cls = LogisticRegression(C=1e26)
cls.fit(data_num.drop(columns=['After hurricane','const']),
data_num['After hurricane'])
print (cls.coef_)
pred1 = cls.predict_proba(data_num.drop(columns=['After hurricane','const']))[:,1]
# %%
xroc,yroc,_ = roc_curve(data_num['After hurricane'],pred1 )
plot(xroc,yroc, label='AUC=%.3f'%roc_auc_score(data_num['After hurricane'],pred1))
legend()
[[-0.48940382 -2.56746169 -1.14277845 0.79773787 -3.30123013 2.27390672 -1.11591419 0.9976535 -0.28057054 1.54624263 -0.93732781 5.36681844 5.25424454 -4.05273037 1.64582975]]
<matplotlib.legend.Legend at 0x7f1e69fc5d68>
# %%
X = data_num.drop(columns=['After hurricane']).values
y=data_num['After hurricane'].values
np.random.seed(0)
cv = KFold(5, shuffle=True)
for train_idx, test_idx in cv.split(X,y):
logit = sm.Logit( y[train_idx], X[train_idx])
result = logit.fit()
p = result.predict(X[test_idx])
xroc,yroc,_ = roc_curve(y[test_idx],p)
plot(xroc,yroc, label='AUC=%.3f'%(roc_auc_score(y[test_idx],p)))
legend()
Optimization terminated successfully. Current function value: 0.187765 Iterations 9 Optimization terminated successfully. Current function value: 0.264155 Iterations 8 Optimization terminated successfully. Current function value: 0.203891 Iterations 9 Optimization terminated successfully. Current function value: 0.230124 Iterations 9 Optimization terminated successfully. Current function value: 0.289693 Iterations 8
<matplotlib.legend.Legend at 0x7f1e69fea588>