#!/usr/bin/env python
# coding: utf-8

# Created on Fri Oct 4 14:32:45 2019
#
# @author: dezsoribli

# 1. Load hurricane data from the article "Hurricane-induced selection on the
#    morphology of an island lizard".
#    https://www.nature.com/articles/s41586-018-0352-3
#    - A, Drop the lizard with the most missing values.
#    - B, Drop the ID column.
#    - C, Encode the Sex, Origin and Hurricane values into binary columns,
#         and drop the original text columns.
#    - D, Make sure all your columns are encoded as floating point values,
#         not unsigned integers!
#
# 2. Use logistic regression from the statsmodels package to predict whether
#    the lizard was measured after or before the hurricane, using the whole dataset.
#    - A, Investigate the Toe and Finger area coefficients; what's going on?
#         Fix this problem by only keeping the mean measurements.
#    - B, Which measured quality had the most significant positive effect on survival?
#    - C, Which measured quality had the most significant negative effect on survival?
#    - D, Try to explain the results in your own words. Check the abstract of the paper.
#    - E, Repeat the fit after scaling each input column to 0 mean and 1 variance.
#         Have the coefficients changed? Have the predictions changed?
#
# 3. Repeat the fit with scikit-learn on the unnormalized dataset.
#    - A, Compare the coefficients with the ones you got from statsmodels.
#         Are they the same? If not, try to answer why.
#    - B, Try to tweak the parameters of the scikit-learn method to reproduce
#         the coefficients produced by statsmodels.
#    - C, Plot the ROC curve for the full dataset, and calculate the AUC.
#    - D, Repeat the fit after scaling each input column to 0 mean and 1 variance.
#         Have the coefficients changed? Have the predictions changed?
#
# 4. Split the dataset into 5 folds and predict each fold by training on the other 4.
#    - A, Make sure to fix the seed of the splitting to 0 to make it reproducible.
#    - B, Plot the ROC for the 5 folds separately as curves on the same plot.
#    - C, Calculate the AUC values for the 5 folds separately.


# In[1]:

# %pylab star-imports numpy and matplotlib into the notebook namespace,
# providing `mpl`, `np`, `figsize`, `plot`, `legend`, ... used below.
get_ipython().run_line_magic('pylab', 'inline')
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve
import statsmodels.api as sm

figsize(8, 8)
mpl.rcParams['font.size'] = 16


# In[2]:

# Load the hurricane dataset (CSV renamed locally from the article's supplement).
data = pd.read_csv('hurricane.csv')

# 1/A: drop the lizard (row) with the most missing values.
# The original hard-coded index 39; find that row programmatically instead,
# so the step stays correct if the CSV row order ever changes.
# NOTE(review): assumes a unique worst row — idxmax picks the first on ties.
data = data.drop(data.isnull().sum(axis=1).idxmax())

# 1/C + 1/D: encode the text columns into binary *float* columns
# (statsmodels chokes on unsigned-integer dummies).
data['Female'] = pd.get_dummies(data.Sex)['Female'].astype('float')
data['Pine Cay'] = pd.get_dummies(data.Origin)['Pine Cay'].astype('float')
data['After hurricane'] = pd.get_dummies(data.Hurricane)['After'].astype('float')

# 1/B: only keep numerical data — drop the ID and the original text columns.
data_num = data.drop(columns=['ID', 'Sex', 'Origin', 'Hurricane'])
# Drop aggregate columns that were derived rather than directly measured.
data_num = data_num.drop(columns=['SumFingers', 'SumToes', 'MaxFingerForce'])
# 2/A: the individual toe/finger pad areas are nearly collinear with their
# means, which makes the logit coefficients unstable — keep only the means.
data_num = data_num.drop(columns=['FingerArea1', 'FingerArea2', 'FingerArea3',
                                  'ToeArea1', 'ToeArea2', 'ToeArea3'])

# %%
# Intercept column for statsmodels (sklearn adds its own automatically).
data_num = sm.add_constant(data_num)


# In[4]:

# %%
# 2: logistic regression predicting 'After hurricane' from all kept measurements.
logit = sm.Logit(data_num['After hurricane'],
                 data_num.drop(columns=['After hurricane']))
result = logit.fit()
print(result.summary())
pred1 = result.predict(data_num.drop(columns=['After hurricane']))


# In[5]:

# %% 2/E: repeat the fit on inputs scaled to 0 mean / 1 variance.
X = StandardScaler().fit_transform(
    data_num.drop(columns=['After hurricane', 'const']))
logit = sm.Logit(data_num['After hurricane'].values.astype('float'),
                 sm.add_constant(X))
result = logit.fit()
print(result.summary())
pred2 = result.predict(sm.add_constant(X))


# In[6]:

# %% The coefficients change under scaling, but the (unregularized)
# predictions do not — these differences are numerically ~0.
pred1 - pred2


# In[7]:

# %% 3: same fit with scikit-learn. It applies L2 regularization by default,
# so its coefficients differ from the statsmodels ones.
cls = LogisticRegression()
cls.fit(data_num.drop(columns=['After hurricane', 'const']),
        data_num['After hurricane'])
print(cls.coef_)
pred1 = cls.predict_proba(
    data_num.drop(columns=['After hurricane', 'const']))[:, 1]

# %% 3/D: scikit-learn fit on the standardized inputs.
cls = LogisticRegression()
cls.fit(sm.add_constant(X), data_num['After hurricane'])
print(cls.coef_)
pred2 = cls.predict_proba(sm.add_constant(X))[:, 1]

# %% With regularization the scaled and unscaled fits are no longer
# equivalent, so the predictions now differ.
pred1 - pred2


# In[12]:

# %% 3/B: a huge C (inverse regularization strength) effectively disables the
# L2 penalty and reproduces the unregularized statsmodels coefficients.
cls = LogisticRegression(C=1e26)
cls.fit(data_num.drop(columns=['After hurricane', 'const']),
        data_num['After hurricane'])
print(cls.coef_)
pred1 = cls.predict_proba(
    data_num.drop(columns=['After hurricane', 'const']))[:, 1]

# %% 3/C: ROC curve and AUC on the full dataset.
fpr, tpr, _ = roc_curve(data_num['After hurricane'], pred1)
plot(fpr, tpr,
     label='AUC=%.3f' % roc_auc_score(data_num['After hurricane'], pred1))
legend()


# In[13]:

# %% 4: predict each of 5 folds by training on the other 4.
X = data_num.drop(columns=['After hurricane']).values
y = data_num['After hurricane'].values

# 4/A: fix the seed of the splitter itself via random_state=0 instead of
# relying on np.random.seed(0) mutating the global RNG that KFold implicitly
# consumes — any other numpy call in between would silently change the folds.
# (check_random_state(0) yields the same MT19937 state, so the splits are
# identical to the original's.)
cv = KFold(5, shuffle=True, random_state=0)
for train_idx, test_idx in cv.split(X, y):
    logit = sm.Logit(y[train_idx], X[train_idx])
    result = logit.fit()
    p = result.predict(X[test_idx])
    # 4/B + 4/C: one ROC curve and one AUC per fold, all on the same plot.
    fpr, tpr, _ = roc_curve(y[test_idx], p)
    plot(fpr, tpr, label='AUC=%.3f' % roc_auc_score(y[test_idx], p))
legend()