%matplotlib inline
from redcap import Project
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
sns.set_context('notebook')
# Connect to the REDCap project and export every record as a DataFrame.
api_url = 'https://redcap.vanderbilt.edu/api/'
# The API token lives in a local file rather than in the notebook. The context
# manager closes the handle right away, and strip() drops the trailing newline
# most editors append, which would otherwise be sent as part of the token.
with open("token.txt") as token_file:
    hospitalized_key = token_file.read().strip()
hospitalized_proj = Project(api_url, hospitalized_key)
# Index the exported frame by the project's first field.
hospitalized_raw = hospitalized_proj.export_records(
    format='df',
    df_kwargs={'index_col': hospitalized_proj.field_names[0]})
# Work on an independent copy of the records whose race code is 3.
race_is_3 = hospitalized_raw.race == 3
black_subset = hospitalized_raw[race_is_3].copy()

# Collapse the two therapy_response indicator columns into one numeric
# 'response' column: 1 where indicator ___0 is set, 0 where indicator ___1 is
# set (the latter wins if both are set), missing otherwise.
indicator_0_set = black_subset.therapy_response___0 == 1
indicator_1_set = black_subset.therapy_response___1 == 1
black_subset['response'] = None
black_subset.loc[indicator_0_set, 'response'] = 1
black_subset.loc[indicator_1_set, 'response'] = 0
black_subset['response'] = black_subset['response'].astype(float)

# Show how many records fall into each outcome class.
black_subset.response.value_counts()
1 63 0 24 Name: response, dtype: int64
# Age in years at admission, from date of birth and admission date.
dob = pd.to_datetime(black_subset.dob)
adm = pd.to_datetime(black_subset.date_admission)
# Convert the timedelta to days by dividing by a one-day timedelta rather than
# casting to integer nanoseconds. An integer cast cannot represent a missing
# date (NaT): depending on the pandas version it either errors or produces a
# huge negative sentinel that would pass as a real age. Dividing keeps missing
# values as NaN so they are treated as missing downstream.
black_subset['age'] = (adm - dob) / np.timedelta64(1, 'D') / 365.25
# Collect the columns to discard: the raw therapy_response indicators, every
# echo_* column, and a few hand-picked fields.
starts_with_response = black_subset.columns.str.startswith('therapy_response')
starts_with_echo = black_subset.columns.str.startswith('echo_')
response_cols = black_subset.columns[starts_with_response]
echo_cols = black_subset.columns[starts_with_echo]
meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']
drop_cols = np.concatenate([response_cols.values, meaningless_cols, echo_cols])

# Keep only rows with a known outcome, then remove the unwanted columns.
rows_with_outcome = black_subset.dropna(subset=['response'])
black_subset = rows_with_outcome.drop(drop_cols, axis=1)
# Keep columns that have fewer than five missing values and are not of
# object dtype.
missing_per_col = black_subset.isnull().sum(0)
few_missing = missing_per_col < 5
not_object = black_subset.dtypes != object
black_subset_low_missing = black_subset.loc[:, few_missing & not_object]

# Fill each remaining gap with the mean of its own column.
black_subset_complete = black_subset_low_missing.apply(
    lambda col: col.fillna(col.mean()))
black_subset_complete.shape
(87, 70)
from sklearn import preprocessing

# Separate the outcome column from the predictors; the imputed frame itself is
# left untouched.
y = black_subset_complete['response'].copy()
X = black_subset_complete.drop('response', axis=1)
# Standardise the predictors with scikit-learn's scale().
X_scaled = preprocessing.scale(X)
# Per-sample weights; their max/min are used later as class weights.
# _balance_weights is a private scikit-learn helper (leading underscore, not
# part of the public API), so fall back to a local equivalent when the import
# is unavailable instead of failing the whole notebook.
try:
    from sklearn.preprocessing._weights import _balance_weights
except ImportError:
    def _balance_weights(y):
        """Return one weight per sample, inversely proportional to class size.

        Each sample gets ``smallest_class_count / its_class_count``, so samples
        of the rarest class weigh 1.0 and more common classes weigh less.
        NOTE(review): intended to mirror the private helper; confirm against
        the scikit-learn version in use if exact parity matters.
        """
        _, class_idx = np.unique(np.asarray(y), return_inverse=True)
        class_counts = np.bincount(class_idx)
        return class_counts.min() / class_counts[class_idx].astype(float)

w = _balance_weights(y)
# Class counts after rows without an outcome were dropped.
black_subset.response.value_counts()
1 63 0 24 Name: response, dtype: int64
from sklearn import cross_validation

# Hold out 40% of the scaled rows for testing; the fixed seed keeps the
# partition the same across runs.
split_parts = cross_validation.train_test_split(
    X_scaled, y, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = split_parts
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

# Random forest whose class weights come from the per-sample balance weights
# (largest weight for class 0, smallest for class 1). The seed makes the
# search reproducible across runs.
rfc = RandomForestClassifier(n_jobs=4, random_state=0,
                             class_weight={0: w.max(), 1: w.min()})
grid = GridSearchCV(rfc,
                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},
                    scoring='precision', cv=5)
# Tune on the training split only. Fitting on the full data would let the
# held-out test rows influence the chosen depth and bias the later evaluation;
# it would also mix unscaled inputs with the scaled train/test arrays.
grid.fit(X_train, y_train)
# Single-string form prints identically under Python 2 and Python 3.
print("best parameter choice: %s" % (grid.best_params_,))
best parameter choice: {'max_depth': 25}
# Final model: reuse the depth selected by the grid search instead of a
# hand-copied constant, so the two cannot drift apart when the search is
# re-run. The seed makes the fitted forest (and its importances) reproducible.
rf = RandomForestClassifier(n_jobs=4, random_state=0,
                            max_depth=grid.best_params_['max_depth'],
                            class_weight={0: w.max(), 1: w.min()})
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
# Confusion table of actual vs. predicted outcome on the held-out rows.
pd.crosstab(y_test, preds, rownames=['actual'],
            colnames=['prediction'])
prediction | 0.0 | 1.0 |
---|---|---|
actual | ||
0 | 8 | 1 |
1 | 1 | 25 |
from sklearn.cross_validation import cross_val_score

# Three-fold cross-validated precision of a shallower, 50-tree forest using
# the same class weights, evaluated on the unscaled predictors.
cv_forest = RandomForestClassifier(n_estimators=50, max_depth=5,
                                   class_weight={0: w.max(), 1: w.min()})
cross_val_score(cv_forest, X, y, scoring='precision', cv=3)
array([ 0.91304348, 0.83333333, 0.91304348])
# Label each feature importance of the fitted forest with its predictor name
# and show the 25 largest.
importance = pd.Series(data=rf.feature_importances_, index=X.columns)
importance.sort_values(ascending=False).head(25)
kd_therapy___1 0.319578 hd_num_days 0.132306 age 0.085432 kd_therapy___9 0.050139 abnormality___5 0.044890 abnormality___3 0.039355 lab_criteria___8 0.038673 num_echo_post_eval 0.038341 illness_day_at_rx 0.036826 lab_criteria___7 0.023252 subsequent_diagnosis 0.018851 abnormality___1 0.018535 abnormality___8 0.016046 lab_criteria___5 0.012319 lab_criteria___2 0.012073 clinical_criteria___3 0.011801 clinical_criteria___5 0.009152 clinical_criteria___1 0.008939 picu_admission 0.008713 lab_criteria___6 0.008495 clinical_criteria___4 0.008262 subsequent_diagnosis_of_cv___2 0.007222 lab_criteria___1 0.007044 type_of_subsequent_diagnos___4 0.005527 sex 0.005014 dtype: float64