#!/usr/bin/env python
# coding: utf-8
"""Predict therapy response for Black hospitalized patients from REDCap data.

Exported from a Jupyter notebook and cleaned up for Python 3 and modern
scikit-learn (>= 0.20):

* ``print`` statement -> ``print()`` function (Python 3).
* ``sklearn.cross_validation`` / ``sklearn.grid_search`` -> ``sklearn.model_selection``.
* the removed private helper ``sklearn.preprocessing._weights._balance_weights``
  -> the supported ``class_weight='balanced'`` option.
* deprecated timedelta-to-int nanosecond cast -> ``.dt.days``.
* the IPython ``%matplotlib inline`` magic is dropped (it only works inside
  a notebook; plotting setup via seaborn is kept).
"""

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from redcap import Project
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

sns.set_context('notebook')

# --- Pull raw records from REDCap ----------------------------------------
api_url = 'https://redcap.vanderbilt.edu/api/'
# Context manager so the token file handle is closed promptly.
with open("token.txt") as fh:
    hospitalized_key = fh.read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized_raw = hospitalized_proj.export_records(
    format='df',
    df_kwargs={'index_col': hospitalized_proj.field_names[0]})

# --- Restrict to patients with race code 3 --------------------------------
black_subset = hospitalized_raw[hospitalized_raw.race == 3].copy()

# Binary outcome: therapy_response___0 == 1 -> 1, therapy_response___1 == 1 -> 0.
# NOTE(review): the 0/1 coding of the checkbox fields vs. the response label
# is taken verbatim from the original notebook — confirm against the REDCap
# data dictionary.
black_subset['response'] = None
black_subset.loc[black_subset.therapy_response___0 == 1, 'response'] = 1
black_subset.loc[black_subset.therapy_response___1 == 1, 'response'] = 0
black_subset['response'] = black_subset['response'].astype(float)

print(black_subset.response.value_counts())

# Age at admission in years. `.dt.days / 365.25` replaces the deprecated
# `.astype(int) / (365.25 * 8.64e13)` nanosecond trick (same result).
dob = pd.to_datetime(black_subset.dob)
adm = pd.to_datetime(black_subset.date_admission)
black_subset['age'] = (adm - dob).dt.days / 365.25

# Drop outcome-derived, identifying, and echo columns from the predictors
# (the therapy_response___* columns would leak the label).
response_cols = black_subset.columns[
    black_subset.columns.str.startswith('therapy_response')]
echo_cols = black_subset.columns[black_subset.columns.str.startswith('echo_')]
meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']
drop_cols = np.concatenate([response_cols.values, meaningless_cols, echo_cols])
black_subset = (black_subset.dropna(subset=['response'])
                            .drop(drop_cols, axis=1))

# Keep numeric columns with fewer than 5 missing values; mean-impute the rest.
low_missing_mask = ((black_subset.isnull().sum(0) < 5)
                    & (black_subset.dtypes != object))
black_subset_low_missing = black_subset.loc[:, low_missing_mask]
black_subset_complete = black_subset_low_missing.apply(
    lambda col: col.fillna(col.mean()))
print(black_subset_complete.shape)

X = black_subset_complete.copy()
y = X.pop('response')
X_scaled = preprocessing.scale(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.4, random_state=0)

# `class_weight='balanced'` reweights classes inversely proportional to
# their frequencies — the supported replacement for the removed private
# `_balance_weights` helper the notebook originally used.
rfc = RandomForestClassifier(n_jobs=4, class_weight='balanced')
grid = GridSearchCV(rfc,
                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},
                    scoring='precision', cv=5)
# NOTE(review): the original fit the grid search on the *unscaled* X while
# the train/test split used X_scaled. Scaling is irrelevant for trees, so
# the unscaled X is kept here, as in the original.
grid.fit(X, y)
print("best parameter choice:", grid.best_params_)

rf = RandomForestClassifier(n_jobs=4, max_depth=25, class_weight='balanced')
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print(pd.crosstab(y_test, preds, rownames=['actual'], colnames=['prediction']))

# 3-fold cross-validated precision for a smaller, regularized forest.
print(cross_val_score(
    RandomForestClassifier(class_weight='balanced',
                           n_estimators=50, max_depth=5),
    X, y, cv=3, scoring='precision'))

# Top-25 predictors by impurity-based importance from the held-out-test model.
importance = pd.Series(rf.feature_importances_, index=X.columns)
print(importance.sort_values(ascending=False)[:25])