#!/usr/bin/env python
# coding: utf-8
"""Predict therapy response for Black hospitalized patients from REDCap data.

Exported from a Jupyter notebook and cleaned up for Python 3 and modern
scikit-learn (>= 0.20):

* ``print`` statement -> ``print()`` function (Python 3).
* ``sklearn.cross_validation`` / ``sklearn.grid_search`` -> ``sklearn.model_selection``.
* the removed private helper ``sklearn.preprocessing._weights._balance_weights``
  -> the supported ``class_weight='balanced'`` option.
* deprecated timedelta-to-int nanosecond cast -> ``.dt.days``.
* the IPython ``%matplotlib inline`` magic is dropped (it only works inside
  a notebook; plotting setup via seaborn is kept).
"""

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from redcap import Project
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

sns.set_context('notebook')

# --- Pull raw records from REDCap ----------------------------------------
api_url = 'https://redcap.vanderbilt.edu/api/'
# Context manager so the token file handle is closed promptly.
with open("token.txt") as fh:
    hospitalized_key = fh.read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized_raw = hospitalized_proj.export_records(
    format='df',
    df_kwargs={'index_col': hospitalized_proj.field_names[0]})

# --- Restrict to patients with race code 3 --------------------------------
black_subset = hospitalized_raw[hospitalized_raw.race == 3].copy()

# Binary outcome: therapy_response___0 == 1 -> 1, therapy_response___1 == 1 -> 0.
# NOTE(review): the 0/1 coding of the checkbox fields vs. the response label
# is taken verbatim from the original notebook — confirm against the REDCap
# data dictionary.
black_subset['response'] = None
black_subset.loc[black_subset.therapy_response___0 == 1, 'response'] = 1
black_subset.loc[black_subset.therapy_response___1 == 1, 'response'] = 0
black_subset['response'] = black_subset['response'].astype(float)

print(black_subset.response.value_counts())

# Age at admission in years. `.dt.days / 365.25` replaces the deprecated
# `.astype(int) / (365.25 * 8.64e13)` nanosecond trick (same result).
dob = pd.to_datetime(black_subset.dob)
adm = pd.to_datetime(black_subset.date_admission)
black_subset['age'] = (adm - dob).dt.days / 365.25

# Drop outcome-derived, identifying, and echo columns from the predictors
# (the therapy_response___* columns would leak the label).
response_cols = black_subset.columns[
    black_subset.columns.str.startswith('therapy_response')]
echo_cols = black_subset.columns[black_subset.columns.str.startswith('echo_')]
meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']
drop_cols = np.concatenate([response_cols.values, meaningless_cols, echo_cols])
black_subset = (black_subset.dropna(subset=['response'])
                            .drop(drop_cols, axis=1))

# Keep numeric columns with fewer than 5 missing values; mean-impute the rest.
low_missing_mask = ((black_subset.isnull().sum(0) < 5)
                    & (black_subset.dtypes != object))
black_subset_low_missing = black_subset.loc[:, low_missing_mask]
black_subset_complete = black_subset_low_missing.apply(
    lambda col: col.fillna(col.mean()))
print(black_subset_complete.shape)

X = black_subset_complete.copy()
y = X.pop('response')
X_scaled = preprocessing.scale(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.4, random_state=0)

# `class_weight='balanced'` reweights classes inversely proportional to
# their frequencies — the supported replacement for the removed private
# `_balance_weights` helper the notebook originally used.
rfc = RandomForestClassifier(n_jobs=4, class_weight='balanced')
grid = GridSearchCV(rfc,
                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},
                    scoring='precision', cv=5)
# NOTE(review): the original fit the grid search on the *unscaled* X while
# the train/test split used X_scaled. Scaling is irrelevant for trees, so
# the unscaled X is kept here, as in the original.
grid.fit(X, y)
print("best parameter choice:", grid.best_params_)

rf = RandomForestClassifier(n_jobs=4, max_depth=25, class_weight='balanced')
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print(pd.crosstab(y_test, preds, rownames=['actual'], colnames=['prediction']))

# 3-fold cross-validated precision for a smaller, regularized forest.
print(cross_val_score(
    RandomForestClassifier(class_weight='balanced',
                           n_estimators=50, max_depth=5),
    X, y, cv=3, scoring='precision'))

# Top-25 predictors by impurity-based importance from the held-out-test model.
importance = pd.Series(rf.feature_importances_, index=X.columns)
print(importance.sort_values(ascending=False)[:25])