#!/usr/bin/env python
# coding: utf-8

# In[5]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pystan as ps
import pylab as pl


# In[4]:

schools_code = """
data {
    int<lower=0> J;          // number of schools
    real y[J];               // estimated treatment effects
    real<lower=0> sigma[J];  // s.e. of effect estimates
}
parameters {
    real mu;
    real<lower=0> tau;
    real eta[J];
}
transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] <- mu + tau * eta[j];
}
model {
    eta ~ normal(0, 1);
    y ~ normal(theta, sigma);
}
"""

schools_dat = {'J': 8,
               'y': [28, 8, -3, 7, -1, 1, 18, 12],
               'sigma': [15, 10, 16, 11, 9, 11, 10, 18]}

fit = ps.stan(model_code=schools_code, data=schools_dat, iter=1000, chains=4)


# In[7]:

mus = fit.extract()['mu']
pl.hist(mus, color='#bbbbee')


# ## Ridge regression
# 
# Birkes and Dodge (1993) apply different regression models to the much-analysed stack-loss data of Brownlee (1965). This features 21 daily responses of stack loss $y$, the amount of ammonia escaping, with covariates being air flow ($x_1$), temperature ($x_2$) and acid concentration ($x_3$).
# 
# We first assume a linear regression on the expectation of $y$, with a variety of different error structures. For example, the double exponential:
# 
# $$m_i = b_0 + b_1 z_{1i} + b_2 z_{2i} + b_3 z_{3i}$$
# 
# $$y_i \sim \text{Double exp}(m_i, \tau)$$
# 
# where $z_{ij} = (x_{ij} - \bar{x}_j) / \text{sd}(x_j)$ are covariates standardised to have zero mean and unit variance. $b_1, b_2, b_3$ are initially given independent "noninformative" priors.
# 
# Maximum likelihood estimates for the double exponential (Laplace) distribution are essentially equivalent to minimising the sum of absolute deviations (LAD), while the other options are alternative heavy-tailed distributions.
# 
# We also consider the use of 'ridge regression', intended to avoid the instability due to correlated covariates. This has been shown by Lindley and Smith (1972) to be equivalent to assuming the regression coefficients of the standardised covariates to be exchangeable, so that
# 
# $$b_j \sim \text{Normal}(0, \phi), \quad j = 1, 2, 3$$
# 
# In the following example we extend the work of Birkes and Dodge (1993) by applying this ridge technique to each of the possible error distributions.
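# As a quick side check (not part of the original analysis), the standardisation $z_{ij} = (x_{ij} - \bar{x}_j)/\text{sd}(x_j)$ can be reproduced directly in NumPy; the Stan models below carry out the same transformation in their `transformed data` blocks. The small `x_demo` array in the next cell is made up purely for illustration.

# In[ ]:

# Illustrative covariate matrix (made-up rows in the style of the stack-loss data).
x_demo = np.array([[80., 27., 89.],
                   [62., 24., 87.],
                   [50., 18., 89.]])
# Standardise each column to zero mean and unit variance; ddof=1 matches Stan's sd().
z_demo = (x_demo - x_demo.mean(axis=0)) / x_demo.std(axis=0, ddof=1)
print(z_demo.mean(axis=0))         # approximately zero for each column
print(z_demo.std(axis=0, ddof=1))  # one for each column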
# In[13]:

stacks_data = {'p': 3,
               'N': 21,
               'Y': (43, 37, 37, 28, 18, 18, 19, 20, 15, 14, 14, 13, 11, 12, 8, 7, 8, 8, 9, 15, 15),
               # covariates are listed variable-by-variable (21 air-flow, 21 temperature,
               # 21 acid-concentration values), so build a (3, 21) array and transpose to N x p
               'x': np.resize((80, 80, 75, 62, 62, 62, 62, 62, 59, 58, 58, 58, 58, 58, 50, 50, 50, 50, 50, 56, 70,
                               27, 27, 25, 24, 22, 23, 24, 24, 23, 18, 18, 17, 18, 19, 18, 18, 19, 19, 20, 20, 20,
                               89, 88, 90, 87, 87, 87, 93, 93, 87, 80, 89, 88, 82, 93, 89, 86, 72, 79, 80, 82, 91),
                              (3, 21)).T}


# In[11]:

# Double-exponential (Laplace) errors with a ridge (exchangeable normal) prior on the standardised coefficients.
stacks_lasso = '''
data {
    int<lower=0> N;
    int<lower=0> p;
    real Y[N];
    matrix[N,p] x;
}

// to standardize the x's
transformed data {
    real z[N,p];
    real mean_x[p];
    real sd_x[p];
    for (j in 1:p) {
        mean_x[j] <- mean(col(x,j));
        sd_x[j] <- sd(col(x,j));
        for (i in 1:N)
            z[i,j] <- (x[i,j] - mean_x[j]) / sd_x[j];
    }
}

parameters {
    real beta0;
    real beta[p];
    real<lower=0> sigmasq;
    real<lower=0> phi;
}

transformed parameters {
    real sigma;
    real mu[N];

    sigma <- sqrt(2) * sigmasq;
    for (n in 1:N)
        mu[n] <- beta0 + beta[1] * z[n, 1] + beta[2] * z[n, 2] + beta[3] * z[n, 3];
}

model {
    beta0 ~ normal(0, 316);
    phi ~ gamma(0.01, 0.01);
    beta ~ normal(0, sqrt(phi));
    sigmasq ~ inv_gamma(.001, .001);
    for (n in 1:N)
        Y[n] ~ double_exponential(mu[n], sigmasq);
}

generated quantities {
    real b0;
    real b[p];
    real outlier_1;
    real outlier_3;
    real outlier_4;
    real outlier_21;

    for (j in 1:p)
        b[j] <- beta[j] / sd_x[j];
    b0 <- beta0 - b[1] * mean_x[1] - b[2] * mean_x[2] - b[3] * mean_x[3];

    outlier_1 <- step(fabs((Y[1] - mu[1]) / sigma) - 2.5);
    outlier_3 <- step(fabs((Y[3] - mu[3]) / sigma) - 2.5);
    outlier_4 <- step(fabs((Y[4] - mu[4]) / sigma) - 2.5);
    outlier_21 <- step(fabs((Y[21] - mu[21]) / sigma) - 2.5);
}
'''


# In[16]:

stacks_init = {'beta0': 10, 'beta': [0, 0, 0], 'sigmasq': 10, 'phi': 0.316}


# In[18]:

fit_lasso = ps.stan(model_code=stacks_lasso, data=stacks_data, iter=1000, chains=4)


# In[26]:

import seaborn as sb

traces = fit_lasso.extract(permuted=True)


# In[54]:

import pandas as pd

betas = pd.DataFrame(traces['beta'], columns=['beta1', 'beta2', 'beta3']).stack().reset_index()
betas.columns = 'obs', 'parameter', 'value'
g = sb.FacetGrid(betas, col='parameter')
g.map(pl.hist, 'value', color="steelblue", lw=0)


# In[55]:

# Student-t (4 degrees of freedom) errors with the same ridge prior on the standardised coefficients.
stacks_ridge = '''
data {
    int<lower=0> N;
    int<lower=0> p;
    real Y[N];
    matrix[N,p] x;
}

// to standardize the x's
transformed data {
    matrix[N,p] z;
    row_vector[p] mean_x;
    real sd_x[p];
    real d;
    for (j in 1:p) {
        mean_x[j] <- mean(col(x,j));
        sd_x[j] <- sd(col(x,j));
        for (i in 1:N)
            z[i,j] <- (x[i,j] - mean_x[j]) / sd_x[j];
    }
    d <- 4; // degrees of freedom for t
}

parameters {
    real beta0;
    vector[p] beta;
    real<lower=0> sigmasq;
    real<lower=0> phi;
}

transformed parameters {
    real sigma;
    vector[N] mu;

    sigma <- sqrt(d * sigmasq / (d - 2)); // t errors on d degrees of freedom
    mu <- beta0 + z * beta;
}

model {
    beta0 ~ normal(0, 316);
    phi ~ gamma(0.01, 0.01);
    beta ~ normal(0, sqrt(phi));
    sigmasq ~ inv_gamma(.001, .001);
    Y ~ student_t(d, mu, sqrt(sigmasq));
}

generated quantities {
    real b0;
    vector[p] b;
    real outlier_1;
    real outlier_3;
    real outlier_4;
    real outlier_21;

    for (j in 1:p)
        b[j] <- beta[j] / sd_x[j];
    b0 <- beta0 - mean_x * b;

    outlier_1 <- step(fabs((Y[1] - mu[1]) / sigma) - 2.5);
    outlier_3 <- step(fabs((Y[3] - mu[3]) / sigma) - 2.5);
    outlier_4 <- step(fabs((Y[4] - mu[4]) / sigma) - 2.5);
    outlier_21 <- step(fabs((Y[21] - mu[21]) / sigma) - 2.5);
}
'''


# In[61]:

fit_ridge = ps.stan(model_code=stacks_ridge, data=stacks_data, init=[stacks_init] * 4,  # one initial-value dict per chain
                    iter=10000, chains=4)


# In[57]:

ridge_traces = fit_ridge.extract(permuted=True)

ridge_betas = pd.DataFrame(ridge_traces['beta'], columns=['beta1', 'beta2', 'beta3']).stack().reset_index()
ridge_betas.columns = 'obs', 'parameter', 'value'
g = sb.FacetGrid(ridge_betas, col='parameter')
g.map(pl.hist, 'value', color="steelblue", lw=0)
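# The `outlier_*` variables in the `generated quantities` blocks indicate, draw by draw, whether the standardised residual of the corresponding observation exceeds 2.5 in absolute value. Averaging them over the posterior draws gives posterior outlier probabilities; a minimal sketch (assuming both fits above have converged) is shown in the next cell.

# In[ ]:

# Posterior probability that each flagged observation is an outlier,
# under the double-exponential and Student-t error models.
for label, tr in [('double-exponential errors', traces), ('t4 errors (ridge)', ridge_traces)]:
    probs = {k: float(tr[k].mean()) for k in ('outlier_1', 'outlier_3', 'outlier_4', 'outlier_21')}
    print(label, probs)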
# In[60]:

pl.plot(ridge_traces['beta'])


# In[ ]:

basket_code = '''
data {
    int<lower=0> K;
    int y[K];
    int N[K];
}
parameters {
    real theta[K];
    real mu;
    real<lower=0> s2;
}
transformed parameters {
    real p[K];
    for (i in 1:K)
        p[i] <- inv_logit(theta[i]);
}
model {
    mu ~ normal(-1.34, 100);
    s2 ~ inv_gamma(0.00005, 0.000005);
    for (i in 1:K) {
        theta[i] ~ normal(mu, sqrt(s2));
        y[i] ~ binomial(N[i], p[i]);
    }
}
'''


# In[ ]:

basket_indep_code = '''
data {
    int<lower=0> K;
    int y[K];
    int N[K];
}
parameters {
    real theta[K];
}
transformed parameters {
    real p[K];
    for (i in 1:K)
        p[i] <- inv_logit(theta[i]);
}
model {
    for (i in 1:K) {
        theta[i] ~ normal(-1.34, 100);
        y[i] ~ binomial(N[i], p[i]);
    }
}
'''
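# The two basket-trial models above are written but never fitted in this notebook. A minimal sketch of how they could be run is given below; the `basket_dat` counts (responders `y` out of `N` patients in each of `K` baskets) are made-up illustrative values, not data from any real trial.

# In[ ]:

# Hypothetical data: four baskets with made-up response counts.
basket_dat = {'K': 4,
              'y': [2, 5, 1, 9],
              'N': [20, 25, 18, 30]}

fit_basket = ps.stan(model_code=basket_code, data=basket_dat, iter=1000, chains=4)
fit_basket_indep = ps.stan(model_code=basket_indep_code, data=basket_dat, iter=1000, chains=4)

# Compare the hierarchical (shrunken) and independent estimates of the per-basket response rates.
print(fit_basket.extract()['p'].mean(axis=0))
print(fit_basket_indep.extract()['p'].mean(axis=0))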