In [1]:
%matplotlib inline
from redcap import Project
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
sns.set_context('notebook')
In [2]:
api_url = 'https://redcap.vanderbilt.edu/api/'

hospitalized_key = open("token.txt").read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized_raw = hospitalized_proj.export_records(format='df', 
                            df_kwargs={'index_col': hospitalized_proj.field_names[0]})
In [3]:
black_subset = hospitalized_raw[hospitalized_raw.race==3].copy()
In [4]:
black_subset['response'] = None
black_subset.loc[black_subset.therapy_response___0==1, 'response'] = 1
black_subset.loc[black_subset.therapy_response___1==1, 'response'] = 0
black_subset.response = black_subset.response.astype(float)
In [5]:
black_subset.response.value_counts()
Out[5]:
1    63
0    24
Name: response, dtype: int64
In [6]:
dob = pd.to_datetime(black_subset.dob)
adm = pd.to_datetime(black_subset.date_admission)
black_subset['age'] = (adm-dob).astype(int)/(365.25 * 8.64e13)
In [7]:
response_cols = black_subset.columns[black_subset.columns.str.startswith('therapy_response')]
echo_cols = black_subset.columns[black_subset.columns.str.startswith('echo_')]
meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']
drop_cols = np.concatenate([response_cols.values, meaningless_cols, echo_cols])

black_subset = black_subset.dropna(subset=['response']).drop(drop_cols, axis=1)
In [8]:
black_subset_low_missing = black_subset.loc[:, (black_subset.isnull().sum(0) < 5) & (black_subset.dtypes != object)]
black_subset_complete = black_subset_low_missing.apply(lambda x: x.fillna(x.mean()))
In [9]:
black_subset_complete.shape
Out[9]:
(87, 70)
In [11]:
from sklearn import preprocessing

X = black_subset_complete.copy()
y = X.pop('response')

X_scaled = preprocessing.scale(X)
In [12]:
from sklearn.preprocessing._weights import _balance_weights

w = _balance_weights(y)
In [13]:
black_subset.response.value_counts()
Out[13]:
1    63
0    24
Name: response, dtype: int64
In [14]:
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_scaled, y, test_size=0.4, random_state=0)
In [15]:
from sklearn.ensemble import RandomForestClassifier
In [20]:
from sklearn.grid_search import GridSearchCV

rfc = RandomForestClassifier(n_jobs=4, class_weight={0:w.max(), 1:w.min()})

grid = GridSearchCV(rfc, 
                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},
                    scoring='precision', cv=5)
grid.fit(X, y)

print "best parameter choice:", grid.best_params_
best parameter choice: {'max_depth': 25}
In [26]:
rf = RandomForestClassifier(n_jobs=4, max_depth=25,
                            class_weight={0:w.max(), 1:w.min()})
rf.fit(X_train, y_train)

preds = rf.predict(X_test)
pd.crosstab(y_test, preds, rownames=['actual'], 
            colnames=['prediction'])
Out[26]:
prediction 0.0 1.0
actual
0 8 1
1 1 25
In [27]:
from sklearn.cross_validation import cross_val_score

cross_val_score(RandomForestClassifier(class_weight={0:w.max(), 1:w.min()}, 
                                      n_estimators=50, max_depth=5,), 
                X, y, cv=3, 
                scoring='precision')
Out[27]:
array([ 0.91304348,  0.83333333,  0.91304348])
In [32]:
importance = pd.Series(rf.feature_importances_, index=X.columns)
importance.sort_values(ascending=False)[:25]
Out[32]:
kd_therapy___1                    0.319578
hd_num_days                       0.132306
age                               0.085432
kd_therapy___9                    0.050139
abnormality___5                   0.044890
abnormality___3                   0.039355
lab_criteria___8                  0.038673
num_echo_post_eval                0.038341
illness_day_at_rx                 0.036826
lab_criteria___7                  0.023252
subsequent_diagnosis              0.018851
abnormality___1                   0.018535
abnormality___8                   0.016046
lab_criteria___5                  0.012319
lab_criteria___2                  0.012073
clinical_criteria___3             0.011801
clinical_criteria___5             0.009152
clinical_criteria___1             0.008939
picu_admission                    0.008713
lab_criteria___6                  0.008495
clinical_criteria___4             0.008262
subsequent_diagnosis_of_cv___2    0.007222
lab_criteria___1                  0.007044
type_of_subsequent_diagnos___4    0.005527
sex                               0.005014
dtype: float64