#!/usr/bin/env python
# coding: utf-8

# # Generalized Linear Mixed‐effects Model Playground
# # or the many ways to perform GLMM in python
# # A comparison among:
# - [StatsModels](https://github.com/statsmodels/statsmodels)
# - [Theano](https://github.com/Theano/Theano)
# - [PyMC3](https://github.com/pymc-devs/pymc3) (based on Theano)
# - [TensorFlow](https://github.com/tensorflow/tensorflow)
# - [Stan](https://github.com/stan-dev/stan) and [pyStan](https://github.com/stan-dev/pystan)
# - [Keras](https://github.com/fchollet/keras) with TensorFlow backend

# In[1]:


import numpy as np
#import pandas as pd
import seaborn as sns
from theano import tensor as tt

get_ipython().run_line_magic('pylab', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
get_ipython().run_line_magic('qtconsole', '--colors=linux')
get_ipython().run_line_magic('load_ext', 'watermark')


# In[2]:


get_ipython().run_line_magic('watermark', '-v -m -p numpy,pandas,seaborn,statsmodels,theano,pymc3,tensorflow,keras,matplotlib')


# # Prepare data and quick visualization

# In[3]:


# load data
import pandas as pd

Tbl_beh = pd.read_csv('./behavioral_data.txt', delimiter='\t')
Tbl_beh["subj"] = Tbl_beh["subj"].astype('category')

#%% visualize data
# Draw a nested violin plot and split the violins for easier comparison
sns.set(style="ticks", palette="muted", color_codes=True)
plt.figure()
Tbl_beh['cbcond'] = pd.Series(Tbl_beh['chimera'] + '-' + Tbl_beh['identity'],
                              index=Tbl_beh.index)
sns.violinplot(y="cbcond", x="rt", hue="group", data=Tbl_beh,
               split=True, inner="quart", palette={"cp": "b", "cg": "y"})
sns.despine(left=True)


# `Tbl_beh` is the raw data table.

# In[4]:


Tbl_beh.head(5)


# In[5]:


#%% Compute the conditional means of the dataset
condi_sel = ['subj', 'chimera', 'identity', 'orientation']
tblmean = pd.DataFrame({'Nt': Tbl_beh.groupby(condi_sel).size()}).reset_index()
tblmean["subj"] = tblmean["subj"].astype('category')
tblmean['rt'] = np.asarray(Tbl_beh.groupby(condi_sel)['rt'].mean())
tblmean['ACC'] = np.asarray(Tbl_beh.groupby(condi_sel)['acc'].mean())
tblmean['group'] = np.asarray(Tbl_beh.groupby(condi_sel)['group'].all())


# `tblmean` is the summary data table.
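
# The modeling sections below all fit the same random-intercept model to `rt` in each
# framework. As a worked equation (the formula itself appears in the StatsModels and brms
# calls further down):
#
# $$ rt_i = \mathbf{x}_i^\top \boldsymbol{\beta} + b_{subj(i)} + \epsilon_i, \qquad
#    b_{subj} \sim \mathcal{N}(0, \sigma_b^2), \qquad \epsilon_i \sim \mathcal{N}(0, \sigma^2) $$
#
# with fixed effects for `group*orientation*identity` (all main effects and interactions)
# and a random intercept for each `subj`.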
# In[6]:


tblmean['cbcond'] = pd.Series(tblmean['chimera'] + '-' + tblmean['identity'],
                              index=tblmean.index)

## boxplot + scatter plot to show accuracy
#ax = sns.boxplot(y="cbcond", x="acc", data=tbltest,
#                 whis=np.inf, color="c")
## Add in points to show each observation
#sns.stripplot(y="cbcond", x="acc", data=tbltest,
#              jitter=True, size=3, color=".3", linewidth=0)

plt.figure()
g1 = sns.violinplot(y="cbcond", x="rt", hue="group", data=tblmean,
                    split=True, inner="quart", palette={"cp": "b", "cg": "y"})
g1.set(xlim=(0, 3000))  # limit the reaction-time axis
sns.despine(trim=True)


# In[7]:


tblmean.head(5)


# Here I use the mean table as the data: it is smaller and therefore much faster to fit.

# In[8]:


tbltest = tblmean
tbltest.shape


# # Using StatsModels to perform a linear mixed model of reaction time

# In[9]:


import statsmodels.formula.api as smf
from patsy import dmatrices

formula = "rt ~ group*orientation*identity"
#formula = "rt ~ -1 + cbcond"
md = smf.mixedlm(formula, tbltest, groups=tbltest["subj"])
mdf = md.fit()
print(mdf.summary())

fe_params = pd.DataFrame(mdf.fe_params, columns=['LMM'])
random_effects = pd.DataFrame(mdf.random_effects)
random_effects = random_effects.transpose()
random_effects = random_effects.rename(index=str, columns={'groups': 'LMM'})

#%% Generate design matrices for later use
Y, X = dmatrices(formula, data=tbltest, return_type='matrix')
Terms = X.design_info.column_names
_, Z = dmatrices('rt ~ -1+subj', data=tbltest, return_type='matrix')
X = np.asarray(X)  # fixed-effect design matrix
Z = np.asarray(Z)  # random-effect design matrix
Y = np.asarray(Y).flatten()
nfixed = np.shape(X)
nrandm = np.shape(Z)


# A helper function for plotting

# In[10]:


#%% plotting function
def plotfitted(fe_params=fe_params, random_effects=random_effects, X=X, Z=Z, Y=Y):
    plt.figure(figsize=(18, 9))
    ax1 = plt.subplot2grid((2, 2), (0, 0))
    ax2 = plt.subplot2grid((2, 2), (0, 1))
    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=2)

    fe_params.plot(ax=ax1)
    random_effects.plot(ax=ax2)
    ax3.plot(Y.flatten(), 'o', color='k', label='Observed', alpha=.25)
    for iname in fe_params.columns:
        fitted = np.dot(X, fe_params[iname]) + np.dot(Z, random_effects[iname]).flatten()
        print("The MSE of " + iname + " is " + str(np.mean(np.square(Y.flatten() - fitted))))
        ax3.plot(fitted, lw=1, label=iname, alpha=.5)
    ax3.legend(loc=0)
    #plt.ylim([0,5])
    plt.show()

plotfitted(fe_params=fe_params, random_effects=random_effects, X=X, Z=Z, Y=Y)


# # Using PyMC3

# In[12]:


beta0 = np.linalg.lstsq(X, Y)  # ordinary least-squares solution (not used below)
fixedpred = np.argmax(X, axis=1)
randmpred = np.argmax(Z, axis=1)
con = tt.constant(fixedpred)
sbj = tt.constant(randmpred)

import pymc3 as pm

with pm.Model() as glmm1:
    # Fixed effects
    beta = pm.Normal('beta', mu=0., sd=100., shape=(nfixed[1]))
    # Random effects
    s = pm.HalfCauchy('s', 10.)
    b = pm.Normal('b', mu=0., sd=s, shape=(nrandm[1]))
    eps = pm.HalfCauchy('eps', 5.)
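
    # The linear predictor below combines the patsy design matrices directly,
    # mu_est = X*beta + Z*b, i.e. the same fixed-effect plus random-intercept structure
    # as the StatsModels fit above; `eps` is the residual SD of the Normal likelihood on rt.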
    #mu_est = pm.Deterministic('mu_est', beta[con] + b[sbj])
    mu_est = pm.Deterministic('mu_est', tt.dot(X, beta) + tt.dot(Z, b))
    RT = pm.Normal('RT', mu_est, eps, observed=Y)

    trace = pm.sample(1000, tune=1000)

pm.traceplot(trace, varnames=['beta', 'b', 's'])
# plt.show()

fixed_pymc = pm.summary(trace, varnames=['beta'])
randm_pymc = pm.summary(trace, varnames=['b'])
fe_params['PyMC'] = pd.Series(np.asarray(fixed_pymc['mean']), index=fe_params.index)
random_effects['PyMC'] = pd.Series(np.asarray(randm_pymc['mean']), index=random_effects.index)
print(fe_params)


# # Using Theano

# This is a non-probabilistic model; nonetheless, uncertainty about the parameter estimates
# could be obtained using dropout (http://mlg.eng.cam.ac.uk/yarin/blog_3d801aa532c1ce.html).

# In[13]:


Y, X = dmatrices(formula, data=tbltest, return_type='matrix')
Terms = X.design_info.column_names
_, Z = dmatrices('rt ~ -1+subj', data=tbltest, return_type='matrix')
X = np.asarray(X)  # fixed-effect design matrix
Z = np.asarray(Z)  # random-effect design matrix
Y = np.asarray(Y).flatten()
nfixed = np.shape(X)
nrandm = np.shape(Z)

X = X.astype(np.float32)
Z = Z.astype(np.float32)
Y = Y.astype(np.float32)

import theano
theano.config.compute_test_value = 'ignore'


def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)


def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))


def model(X, beta, Z, b):
    return tt.sum(tt.dot(X, beta) + tt.dot(Z, b), axis=1)


def sgd(cost, params, lr=0.001):
    grads = tt.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        updates.append([p, p - g * lr])
    return updates


Xtf = tt.fmatrix()
Ztf = tt.fmatrix()
y = tt.vector()

tbeta = init_weights([nfixed[1], 1])
tb = init_weights([nrandm[1], 1])
eps = init_weights([0])  # not used

y_ = model(Xtf, tbeta, Ztf, tb)

cost = tt.mean(tt.sqr(y - y_))
params = [tbeta, tb]
updates = sgd(cost, params)

train = theano.function(inputs=[Xtf, Ztf, y], outputs=cost, updates=updates,
                        allow_input_downcast=True)

for i in range(100000):
    # mini-batch SGD: train on a random half of the rows at each step
    sel = np.random.randint(0, nfixed[0], size=int(nfixed[0] / 2))
    batch_xs, batch_zs, batch_ys = X[sel, :], Z[sel, :], Y[sel]
    train(batch_xs, batch_zs, batch_ys)

outdf = pd.DataFrame(data=tbeta.get_value(), columns=['beta'], index=Terms)
fe_params['theano'] = pd.Series(tbeta.get_value().flatten(), index=fe_params.index)
random_effects['theano'] = pd.Series(tb.get_value().flatten(), index=random_effects.index)
print(fe_params)


# # Using TensorFlow

# In[14]:


Y, X = dmatrices(formula, data=tbltest, return_type='matrix')
Terms = X.design_info.column_names
_, Z = dmatrices('rt ~ -1+subj', data=tbltest, return_type='matrix')
X = np.asarray(X)  # fixed-effect design matrix
Z = np.asarray(Z)  # random-effect design matrix
Y = np.asarray(Y)
nfixed = np.shape(X)
nrandm = np.shape(Z)

X = X.astype(np.float32)
Z = Z.astype(np.float32)
Y = Y.astype(np.float32)

import tensorflow as tf


def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))


def model(X, beta, Z, b):
    y_pred = tf.matmul(X, beta) + tf.matmul(Z, b)
    #randcoef = tf.matmul(Z, b)
    #Xnew = tf.transpose(X) * tf.transpose(randcoef)
    #y_pred = tf.matmul(tf.transpose(Xnew), beta)
    return y_pred

# notice we use the same linear model as in the Theano section above;
# the cost below is simply the squared error between prediction and observed rt
Xtf = tf.placeholder("float32", [None, nfixed[1]])  # create symbolic variables
Ztf = tf.placeholder("float32", [None, nrandm[1]])
y = tf.placeholder("float32", [None, 1])

beta = init_weights([nfixed[1], 1])
b = init_weights([nrandm[1], 1])
eps = init_weights([0])  # only referenced in the commented-out variant below
y_ = model(Xtf, beta, Ztf, b)
# y_ = tf.nn.softmax(model(Xtf, beta) + model(Ztf, b) + eps)
# cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(y_), reduction_indices=[1]))
cost = tf.square(y - y_)  # use squared error for the cost function
train_step = tf.train.GradientDescentOptimizer(0.0001).minimize(cost)

init = tf.global_variables_initializer()
# Add ops to save and restore all the variables.
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    for i in range(2000):
        sel = np.random.randint(0, nfixed[0], size=int(nfixed[0] / 2))
        batch_xs, batch_zs, batch_ys = X[sel, :], Z[sel, :], Y[sel]
        sess.run(train_step, feed_dict={Xtf: batch_xs, Ztf: batch_zs, y: batch_ys})
        accuracy = tf.reduce_mean(tf.cast(cost, tf.float32))  # despite the name, this is the MSE
        if i % 1000 == 0:
            print(i, sess.run(accuracy, feed_dict={Xtf: X, Ztf: Z, y: Y}))

    betaend = sess.run(beta)
    bend = sess.run(b)

fe_params['tf'] = pd.Series(betaend.flatten(), index=fe_params.index)
random_effects['tf'] = pd.Series(bend.flatten(), index=random_effects.index)
print(fe_params)


# # PyMC3 again
# a model parameterized similarly to brms

# In[21]:


Y, X = dmatrices(formula, data=tbltest, return_type='matrix')
Terms = X.design_info.column_names
_, Z = dmatrices('rt ~ -1+subj', data=tbltest, return_type='matrix')
X = np.asarray(X)  # fixed-effect design matrix
Z = np.asarray(Z)  # random-effect design matrix
Y = np.asarray(Y).flatten()
nfixed = np.shape(X)
nrandm = np.shape(Z)

beta0 = np.linalg.lstsq(X, Y)  # ordinary least-squares solution (not used below)
fixedpred = np.argmax(X, axis=1)
randmpred = np.argmax(Z, axis=1)
con = tt.constant(fixedpred)
sbj = tt.constant(randmpred)

X1 = X[0:, 1:]
X_mean = np.mean(X1, axis=0)  # column means of X before centering
X_cent = X1 - X_mean

with pm.Model() as glmm2:
    # temporary intercept (on the centered design matrix)
    temp_Intercept = pm.StudentT('temp_Intercept', nu=3, mu=0, sd=186)
    # Fixed effects
    beta = pm.Normal('beta', mu=0, sd=100, shape=(nfixed[1] - 1))
    # Random effects
    # group-specific standard deviation
    s = pm.HalfStudentT('sd_1', nu=3, sd=186)
    b = pm.Normal('b', mu=0, sd=1, shape=(nrandm[1]))
    r_1 = pm.Deterministic('r_1', s * b)

    sigma = pm.HalfStudentT('sigma', nu=3, sd=186)
    # compute linear predictor
    mu_est = tt.dot(X_cent, beta) + temp_Intercept + r_1[sbj]
    RT = pm.Normal('RT', mu_est, sigma, observed=Y)

    b_Intercept = pm.Deterministic("b_Intercept", temp_Intercept - tt.sum(X_mean * beta))

    trace2 = pm.sample(1000, tune=1000)

pm.traceplot(trace2, varnames=['beta', 'b_Intercept', 'r_1'])
# plt.show()


# In[23]:


df_summary = pm.summary(trace2, varnames=['beta'])
df_stmp = pm.summary(trace2, varnames=['b_Intercept'])  #['temp_Intercept']
df_new = df_stmp.append(df_summary, ignore_index=True)
df_new.index = Terms

randm_pymc = pm.summary(trace2, varnames=['r_1'])
fe_params['PyMC2'] = pd.Series(np.asarray(df_new['mean']), index=fe_params.index)
random_effects['PyMC2'] = pd.Series(np.asarray(randm_pymc['mean']), index=random_effects.index)
print(fe_params)


# # Using PyStan (model constructed with brm in R)
#
# ```r
# library(lme4)
# library(brms)
# ```
# ```
# Tbl_beh <- read.csv("behavioral_data.txt", sep = "\t")
# Tbl_beh$subj <- factor(Tbl_beh$subj)
# Tbl_beh$trial <- factor(Tbl_beh$trial)
# stanmodel <- make_stancode(rt ~ group*orientation*identity + (1|subj),
#                            data = Tbl_beh, family = "normal")
# standata <- make_standata(rt ~ group*orientation*identity + (1|subj),
#                           data = Tbl_beh, family = "normal")
# bayesfit <- brm(rt ~ group*orientation*identity + (1|subj),
#                 data = Tbl_beh, family = "normal")
# bayesfit$model
# summary(bayesfit, waic = TRUE)
# ```
# ```
# lmefit <- lmer(rt ~ group*orientation*identity + (1|subj),
#                data = Tbl_beh)
# summary(lmefit)
# ```

# In[30]:


stan_datadict = {}
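
# The dictionary entries below match the `data` block of the brms-generated Stan model
# further down: N/Y are the observations, K/X the centered fixed-effect design matrix
# (intercept column dropped), X_means the column means needed to recover the intercept,
# J_1 the 1-based subject index, N_1 the number of subjects, and Z_1 a vector of ones
# for the random intercept.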
stan_datadict['N'] = Y.shape[0]
stan_datadict['Y'] = Y
stan_datadict['K'] = X1.shape[1]
stan_datadict['X'] = X_cent
stan_datadict['X_means'] = X_mean
stan_datadict['J_1'] = randmpred + 1
stan_datadict['N_1'] = max(randmpred + 1)
stan_datadict['K_1'] = 1
stan_datadict['Z_1'] = np.ones(randmpred.shape)
stan_datadict['prior_only'] = 0
#stan_datadict.items()


# In[31]:


stan_mdlspec_lmm = """
// This Stan code was generated with the R package 'brms'.
// We recommend generating the data with the 'make_standata' function.
functions {
}
data {
  int<lower=1> N;        // total number of observations
  vector[N] Y;           // response variable
  int<lower=1> K;        // number of population-level effects
  matrix[N, K] X;        // centered population-level design matrix
  vector[K] X_means;     // column means of X before centering
  // data for group-specific effects of subj
  int<lower=1> J_1[N];
  int<lower=1> N_1;
  int<lower=1> K_1;
  vector[N] Z_1;
  int prior_only;        // should the likelihood be ignored?
}
transformed data {
}
parameters {
  vector[K] b;           // population-level effects
  real temp_Intercept;   // temporary Intercept
  real<lower=0> sd_1;    // group-specific standard deviation
  vector[N_1] z_1;       // unscaled group-specific effects
  real<lower=0> sigma;   // residual SD
}
transformed parameters {
  // group-specific effects
  vector[N_1] r_1;
  vector[N] eta;         // linear predictor
  r_1 <- sd_1 * (z_1);
  // compute linear predictor
  eta <- X * b + temp_Intercept;
  for (n in 1:N) {
    eta[n] <- eta[n] + r_1[J_1[n]] * Z_1[n];
  }
}
model {
  // prior specifications
  sd_1 ~ student_t(3, 0, 186);
  z_1 ~ normal(0, 1);
  sigma ~ student_t(3, 0, 186);
  // likelihood contribution
  if (!prior_only) {
    Y ~ normal(eta, sigma);
  }
}
generated quantities {
  real b_Intercept;      // population-level intercept
  b_Intercept <- temp_Intercept - dot_product(X_means, b);
}
"""


# In[32]:


import pystan

stan_fit_lmm = pystan.stan(model_code=stan_mdlspec_lmm, data=stan_datadict,
                           iter=3000, warmup=1000, chains=4, n_jobs=2, verbose=False)
stan_fit_lmm.plot(pars=['b_Intercept', 'b', 'z_1'])
plt.show()


# In[33]:


Intercept = stan_fit_lmm.extract(permuted=True)['b_Intercept']
betatemp = stan_fit_lmm.extract(permuted=True)['b']
btemp = stan_fit_lmm.extract(permuted=True)['z_1']
sdtemp = stan_fit_lmm.extract(permuted=True)['sd_1']
betastan = np.hstack([Intercept.mean(axis=0), betatemp.mean(axis=0)])
bstan = btemp.mean(axis=0) * sdtemp.mean(axis=0)

fe_params['Stan'] = pd.Series(betastan, index=fe_params.index)
random_effects['Stan'] = pd.Series(bstan, index=random_effects.index)
print(fe_params)


# # Using Keras

# In[34]:


import keras
from keras.models import Model
from keras.layers import Input, Add, Dense
from keras import backend as K

K.clear_session()

nb_epoch = 1000

fixedpred = np.argmax(X, axis=1)
randmpred = np.argmax(Z, axis=1)

Xinput = Input(batch_shape=(None, nfixed[1] - 1))
fixed_keras = Dense(1, input_dim=nfixed[1] - 1, name='fixedEffect')(Xinput)
Zinput = Input(batch_shape=(None, nrandm[1]))
randm_keras = Dense(1, input_dim=nrandm[1], use_bias=False, name='randomEffect')(Zinput)
merged = keras.layers.add([fixed_keras, randm_keras])

model = Model([Xinput, Zinput], merged)
model.compile(loss='mean_squared_error', optimizer='sgd')

from keras.callbacks import TensorBoard
# train the model
model.fit([X[:, 1:], Z], Y.flatten(),
          epochs=nb_epoch,
          batch_size=100,
          verbose=0,
          shuffle=True,
          )  # callbacks=[TensorBoard(log_dir='/tmp/xinyi_spot_invert')])

Ypredict = model.predict([X[:, 1:], Z])

betakeras = np.hstack((model.get_weights()[1], model.get_weights()[0].flatten()))
bkeras = model.get_weights()[2].flatten()

fe_params['Keras'] = pd.Series(betakeras, index=fe_params.index)
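# In the hstack above, model.get_weights()[1] is the bias of the fixedEffect layer (playing
# the role of the intercept) and model.get_weights()[0] is its kernel (the remaining
# fixed-effect coefficients); model.get_weights()[2], already extracted as bkeras, holds the
# per-subject random intercepts stored next.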
random_effects['Keras'] = pd.Series(bkeras, index=random_effects.index)
print(fe_params)


# # Display estimation

# In[35]:


plotfitted(fe_params=fe_params, random_effects=random_effects, X=X, Z=Z, Y=Y)


# It is important to note that, although the fitted parameters (i.e., the regression
# coefficients) are not identical across the different approaches, the model predictions
# are highly similar (they at least pass the eyeball test).
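
# As a quick numeric complement to the eyeball test above (a minimal sketch using only the
# objects already defined in this notebook), compare each implementation against the
# StatsModels LMM fit: the correlation of the fixed-effect estimates and the MSE of the
# fitted values.

# In[ ]:


for iname in fe_params.columns:
    # fitted values from this implementation's fixed and random effects
    fitted = np.dot(X, fe_params[iname]) + np.dot(Z, random_effects[iname])
    # agreement of the fixed-effect estimates with the StatsModels LMM reference
    corr_beta = np.corrcoef(fe_params['LMM'], fe_params[iname])[0, 1]
    mse = np.mean(np.square(Y.flatten() - np.asarray(fitted).flatten()))
    print("%8s  corr(beta, LMM) = %.3f   MSE = %.1f" % (iname, corr_beta, mse))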