In [85]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pystan
import statsmodels.api as sm
import statsmodels.formula.api as smf

np.random.seed(1)
In [86]:
n = 200
treat = np.r_[np.ones(n // 2), np.zeros(n // 2)]  # first half treated, second half control
x = np.random.normal(100, 15, size=n)             # continuous covariate
y1 = 5*(treat == 1) + 0.1*x + np.random.normal(0, 5, size=n)  # true treatment effect = 5
data = pd.DataFrame(dict(x=x, y1=y1, treat=treat))
data.plot.scatter('x', 'y1', c='treat', cmap='plasma')
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x118957b00>
[Figure: scatter of y1 against x, colored by treatment group]
In [87]:
# Binary outcome: treatment shifts the log-odds by 0.7
L2 = 0.7 * (treat == 1) + 0.05 * (x - 100)
y2 = (np.random.rand(n) <= 1 / (1 + np.exp(-L2))).astype(int)
data['y2'] = y2  # make y2 available to the formula interface below
In [88]:
fit1 = smf.ols(formula='y1 ~ x + treat', data=data).fit()
fit1.summary()
Out[88]:
OLS Regression Results
Dep. Variable: y1 R-squared: 0.220
Model: OLS Adj. R-squared: 0.212
Method: Least Squares F-statistic: 27.74
Date: Thu, 02 Jun 2016 Prob (F-statistic): 2.43e-11
Time: 13:50:56 Log-Likelihood: -612.35
No. Observations: 200 AIC: 1231.
Df Residuals: 197 BIC: 1241.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 1.7556 2.812 0.624 0.533 -3.790 7.302
x 0.0818 0.027 3.029 0.003 0.029 0.135
treat 5.1258 0.738 6.949 0.000 3.671 6.580
Omnibus: 0.216 Durbin-Watson: 2.083
Prob(Omnibus): 0.898 Jarque-Bera (JB): 0.108
Skew: 0.055 Prob(JB): 0.948
Kurtosis: 3.030 Cond. No. 784.
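
As a quick check (a hypothetical follow-up cell, not in the original run), the OLS treatment estimate can be compared with the simulated truth of 5:

print(fit1.params['treat'])                 # point estimate, ~5.13 in the table above
print(fit1.conf_int().loc['treat'].values)  # 95% CI [3.671, 6.580] covers the true effect 5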
In [89]:
fit2 = smf.glm(formula='y2 ~ x + treat', data=data, family=sm.families.Binomial()).fit()
fit2.summary()
Out[89]:
Generalized Linear Model Regression Results
Dep. Variable: y2 No. Observations: 200
Model: GLM Df Residuals: 197
Model Family: Binomial Df Model: 2
Link Function: logit Scale: 1.0
Method: IRLS Log-Likelihood: -122.91
Date: Thu, 02 Jun 2016 Deviance: 245.83
Time: 13:50:58 Pearson chi2: 200.
No. Iterations: 6
coef std err z P>|z| [95.0% Conf. Int.]
Intercept -4.7577 1.274 -3.733 0.000 -7.255 -2.260
x 0.0482 0.012 3.908 0.000 0.024 0.072
treat 0.7157 0.310 2.308 0.021 0.108 1.324
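
The treat coefficient is on the log-odds scale; a hypothetical follow-up cell (not in the original run) converts it to an odds ratio:

np.exp(fit2.params['treat'])  # exp(0.7157) ≈ 2.05: treatment roughly doubles the odds of y2 = 1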

Specify the Stan model: a joint model that fits the linear regression for y1 and the logistic regression for y2 simultaneously, giving the two treatment effects (beta) a correlated multivariate normal prior with an LKJ prior on the correlation matrix.

In [90]:
model = """
data {
    int n;
    vector[n] x;
    real y1[n];
    int y2[n];
    vector[n] treat;
    vector<lower=0>[2] sigma_b;
    vector[2] Zero;
}

parameters {
    vector[2] alpha;
    vector[2] beta;
    vector[2] mu;
    real<lower=0> sigma;
    corr_matrix[2] Omega_b; 
}


transformed parameters {
    vector[n] theta1;
    vector[n] theta2;
    cov_matrix[2] Sigma; 
    
    Sigma <- quad_form_diag(Omega_b, sigma_b); 
    
    theta1 <- mu[1] + alpha[1]*x + beta[1]*treat;
    theta2 <- mu[2] + alpha[2]*x + beta[2]*treat;
}

model {
    beta ~ multi_normal(Zero, Sigma); 
    Omega_b ~ lkj_corr(1);

    sigma ~ cauchy(0, 5);
    
    y1 ~ normal(theta1, sigma);
    y2 ~ bernoulli_logit(theta2);
}

generated quantities {
    real A;
    real B;
    real A_or_B;
    real A_and_B;

    A <- if_else(beta[1] > 2, 1, 0);
    B <- if_else(exp(beta[2]) > 1.25, 1, 0);
    
    A_or_B <- if_else(A || B, 1, 0);
    A_and_B <- if_else(A && B, 1, 0);
}
"""
In [91]:
# Compile and sample with pystan defaults: 4 chains, 2000 iterations each (half warmup)
fit = pystan.stan(model_code=model,
                  data=dict(x=x, y1=y1, y2=y2, treat=treat, n=n, sigma_b=(2, 1.5), Zero=(0, 0)))
In [92]:
fit['beta'].mean(0)  # posterior means of the two treatment effects (true values: 5 and 0.7)
Out[92]:
array([ 4.40912301,  0.72588719])
In [93]:
np.exp(fit['beta'][:, 1]).mean()  # posterior mean odds ratio for treatment on y2
Out[93]:
2.1757017242076957
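
This is the posterior mean of the odds ratio exp(beta[2]); by Jensen's inequality it is larger than the odds ratio evaluated at the posterior mean of beta[2]. A hypothetical comparison cell:

np.exp(fit['beta'][:, 1].mean())  # exp(0.726) ≈ 2.07, slightly below the 2.18 above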
In [94]:
fit.plot(pars=['beta']);
[Figure: trace plots and posterior densities for beta]

The posterior probability that A or B is true is essentially one, while the probability that both are true is about 95%.

In [95]:
sample = fit.extract(pars=['A_or_B', 'A_and_B'])
In [96]:
[sample[s].mean(0) for s in sample]  # P(A or B), P(A and B)
Out[96]:
[1.0, 0.94825000000000004]
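
Averaging the 0/1 indicators over the $S$ posterior draws is a Monte Carlo estimate of the corresponding posterior probability, e.g.

$$\Pr(A \cup B \mid \text{data}) \approx \frac{1}{S}\sum_{s=1}^{S} \mathbf{1}\left\{\beta_1^{(s)} > 2 \ \text{or}\ \exp\big(\beta_2^{(s)}\big) > 1.25\right\}$$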

Posterior correlation between the two treatment effects (betas)

In [97]:
fit['Omega_b'].mean(0)
Out[97]:
array([[ 1.        ,  0.16257258],
       [ 0.16257258,  1.        ]])