#!/usr/bin/env python
# coding: utf-8

# # NIH Seizure Prediction using Bayesian Logistic Regression and PyMC3
#
# Code and documentation for my solution (51st place) for the Kaggle Melbourne
# University AES/MathWorks/NIH Seizure Prediction challenge:
# https://www.kaggle.com/solomonk
#
# ### A 2016 Kaggle competition.
# https://www.kaggle.com/c/melbourne-university-seizure-prediction

# In[1]:

get_ipython().run_line_magic('reset', '-f')
get_ipython().run_line_magic('matplotlib', 'inline')

import os
import sys
import inspect

import numpy as np
import pandas
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
import theano.tensor as tt

# Make the sibling `features/` package importable regardless of the cwd
# this notebook is launched from.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
dirToInclude = parentdir + '/features/'
sys.path.insert(0, dirToInclude)

import IeegConsts
from IeegConsts import *
from IeegFeatures import *

from sklearn import cross_validation
from sklearn import metrics
# FIX: LogisticRegression is used in cell 2 below but was never imported.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, roc_curve, auc
from sklearn.cross_validation import StratifiedKFold, ShuffleSplit, cross_val_score, train_test_split

get_ipython().run_line_magic('matplotlib', 'inline')

# Wide, high-precision console output for inspecting the feature frame.
np.set_printoptions(precision=4, threshold=10000, linewidth=100, edgeitems=999, suppress=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)
pd.set_option('expand_frame_repr', False)
pd.set_option('precision', 6)

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

train_dir = TRAIN_DATA_FOLDER_IN_ALL
test_dir = TEST_DATA_FOLDER_IN_ALL

ieegFeatures = IeegFeatures(train_dir, True)
df_cols_train = ieegFeatures.ieegGenCols()
print(len(df_cols_train))

# Load the pre-computed training features from HDF5.
X_df_train = pd.read_hdf(TRAIN_FEAT_BASE + TRAIN_PREFIX_ALL + 'X_df_train.hdf',
                         'data', format='fixed', complib='blosc', complevel=9)

# Ordered list of the feature columns actually fed to the model:
# PSD bands, per-channel variance / kurtosis / Hurst exponent
# (n = 16 EEG channels), and channel cross-correlation coefficients.
# n_psd / n_corr_coeff come from the IeegConsts star import.
n = 16
last_cols = []
last_cols += ['psd_{}'.format(i) for i in range(1, n_psd + 1)]
last_cols += ['var_{}'.format(i) for i in range(1, n + 1)]
last_cols += ['kurt_{}'.format(i) for i in range(1, n + 1)]
last_cols += ['corcoef_{}'.format(i) for i in range(1, n_corr_coeff + 1)]
last_cols += ['hurst_{}'.format(i) for i in range(1, n + 1)]

# Drop bookkeeping columns, discard truncated recordings
# (file_size <= 100000 bytes), then split off the response variable.
X_df_train_SINGLE = X_df_train
for col in ('id', 'file', 'patient_id'):
    X_df_train_SINGLE.drop(col, axis=1, inplace=True)
X_df_train_SINGLE = X_df_train_SINGLE.loc[X_df_train_SINGLE['file_size'] > 100000]
for col in ('file_size', 'sequence_id', 'segment'):
    X_df_train_SINGLE.drop(col, axis=1, inplace=True)

answers_1_SINGLE = list(X_df_train_SINGLE[singleResponseVariable].values)
X_df_train_SINGLE = X_df_train_SINGLE.drop(singleResponseVariable, axis=1)
X_df_train_SINGLE = X_df_train_SINGLE[last_cols]
X_df_train_SINGLE = X_df_train_SINGLE.apply(lambda x: pandas.to_numeric(x, errors='ignore'))
X_df_train_SINGLE.head(5)

# In[2]:

y = answers_1_SINGLE
X = X_df_train_SINGLE

# Standardize features (zero mean, unit variance) so the Laplace prior
# scale used below is comparable across coefficients.
X_norm = (X - X.mean()) / X.std()

# Frequentist baseline: plain L2 logistic regression with pre-tuned
# hyper-parameters; its coefficients are kept for comparison against
# the Bayesian posterior means.
lr_best_params = {'penalty': 'l2', 'C': 100, 'solver': 'newton-cg', 'fit_intercept': False}
lr = LogisticRegression(**lr_best_params)
lr.fit(X_norm, y)
lr_coeefs = lr.coef_

k = X_df_train_SINGLE.shape[1]  # number of feature columns

# In[3]:

# Bayesian logistic regression: Normal prior on the intercept and a
# sparsity-inducing Laplace prior on the k coefficients.
with pm.Model() as logistic_model:
    μ = pm.Normal('μ', 0, sd=10)
    b = pm.Laplace('b', 0.0, b=0.1, shape=k)
    p = pm.math.invlogit(μ + tt.dot(X_norm, b))
    likelihood = pm.Bernoulli('likelihood', p, observed=y)

# In[4]:

niter = 3000
with logistic_model:
    trace_logistic_model = pm.sample(niter, n_init=50000)

# In[5]:

# Trace plot over the last 1000 posterior samples, with the posterior
# mean of each variable overlaid as a reference line.
ax = pm.traceplot(trace_logistic_model[-1000:],
                  figsize=(12, len(trace_logistic_model.varnames) * 1.5),
                  lines={k: v['mean']
                         for k, v in pm.df_summary(trace_logistic_model[-1000:]).iterrows()})