%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pystan
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Simulate a two-arm study: first half of subjects treated, second half control.
np.random.seed(1)
n = 200
# Treatment indicator: 1 for the first n//2 rows, 0 for the remaining n//2.
treat = np.concatenate([np.ones(n // 2), np.zeros(n // 2)])
# Covariate on an IQ-like scale: mean 100, sd 15.
x = np.random.normal(100, 15, size=n)
# Continuous outcome: treatment effect 5, slope 0.1 on x, Gaussian noise sd 5.
y1 = 5 * (treat == 1) + 0.1 * x + np.random.normal(0, 5, size=n)
data = pd.DataFrame({'x': x, 'y1': y1, 'treat': treat})
data.plot.scatter('x', 'y1', c='treat', cmap='plasma')
<matplotlib.axes._subplots.AxesSubplot at 0x118957b00>
# Binary outcome: logistic model with treatment log-odds 0.7 and slope .05 on centered x.
L2 = 0.7 * (treat == 1) + .05 * (x - 100)
# Draw Bernoulli outcomes with success probability inverse-logit(L2).
y2 = (np.random.rand(n) <= 1 / (1 + np.exp(-L2))).astype(int)
# Fix: store y2 as a column so later formula fits ('y2 ~ ...', data=data) resolve
# it from the DataFrame instead of relying on patsy's caller-namespace fallback.
data['y2'] = y2
# OLS of the continuous outcome on the covariate and treatment indicator;
# patsy builds the design matrix (with intercept) from the formula.
fit1 = smf.ols(formula='y1 ~ x + treat', data=data).fit()
# NOTE(review): a bare .summary() only renders in a notebook cell; wrap in
# print(...) if this is ever run as a script.
fit1.summary()
Dep. Variable: | y1 | R-squared: | 0.220 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.212 |
Method: | Least Squares | F-statistic: | 27.74 |
Date: | Thu, 02 Jun 2016 | Prob (F-statistic): | 2.43e-11 |
Time: | 13:50:56 | Log-Likelihood: | -612.35 |
No. Observations: | 200 | AIC: | 1231. |
Df Residuals: | 197 | BIC: | 1241. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 1.7556 | 2.812 | 0.624 | 0.533 | -3.790 7.302 |
x | 0.0818 | 0.027 | 3.029 | 0.003 | 0.029 0.135 |
treat | 5.1258 | 0.738 | 6.949 | 0.000 | 3.671 6.580 |
Omnibus: | 0.216 | Durbin-Watson: | 2.083 |
---|---|---|---|
Prob(Omnibus): | 0.898 | Jarque-Bera (JB): | 0.108 |
Skew: | 0.055 | Prob(JB): | 0.948 |
Kurtosis: | 3.030 | Cond. No. | 784. |
# Logistic regression (binomial GLM, default logit link) of y2 on x and treat.
# patsy resolves 'y2' from `data` if it is a column there, otherwise it falls
# back to the calling environment's variable of that name.
fit2 = smf.glm(formula='y2 ~ x + treat', data=data, family=sm.families.Binomial()).fit()
fit2.summary()
Dep. Variable: | y2 | No. Observations: | 200 |
---|---|---|---|
Model: | GLM | Df Residuals: | 197 |
Model Family: | Binomial | Df Model: | 2 |
Link Function: | logit | Scale: | 1.0 |
Method: | IRLS | Log-Likelihood: | -122.91 |
Date: | Thu, 02 Jun 2016 | Deviance: | 245.83 |
Time: | 13:50:58 | Pearson chi2: | 200. |
No. Iterations: | 6 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -4.7577 | 1.274 | -3.733 | 0.000 | -7.255 -2.260 |
x | 0.0482 | 0.012 | 3.908 | 0.000 | 0.024 0.072 |
treat | 0.7157 | 0.310 | 2.308 | 0.021 | 0.108 1.324 |
Specify the Stan model: a joint linear regression for y1 and logistic regression for y2, with the two treatment effects given a correlated multivariate-normal prior.
# Stan program: jointly fits a Gaussian linear model for y1 and a Bernoulli-logit
# model for y2. The two treatment effects beta[1], beta[2] share a multivariate
# normal prior with zero mean, scales sigma_b (passed in as data), and an
# LKJ(1)-distributed correlation Omega_b. Generated quantities record the
# posterior indicator events A (beta[1] > 2), B (odds ratio exp(beta[2]) > 1.25),
# and their disjunction/conjunction.
# NOTE(review): uses deprecated Stan syntax ('<-' assignment, if_else) — accepted
# by pystan 2.x, but modern Stan requires '=' and ternary expressions.
model = """
data {
int n;
vector[n] x;
real y1[n];
int y2[n];
vector[n] treat;
vector<lower=0>[2] sigma_b;
vector[2] Zero;
}
parameters {
vector[2] alpha;
vector[2] beta;
vector[2] mu;
real<lower=0> sigma;
corr_matrix[2] Omega_b;
}
transformed parameters {
vector[n] theta1;
vector[n] theta2;
cov_matrix[2] Sigma;
Sigma <- quad_form_diag(Omega_b, sigma_b);
theta1 <- mu[1] + alpha[1]*x + beta[1]*treat;
theta2 <- mu[2] + alpha[2]*x + beta[2]*treat;
}
model {
beta ~ multi_normal(Zero, Sigma);
Omega_b ~ lkj_corr(1);
sigma ~ cauchy(0, 5);
y1 ~ normal(theta1, sigma);
y2 ~ bernoulli_logit(theta2);
}
generated quantities {
real A;
real B;
real A_or_B;
real A_and_B;
A <- if_else(beta[1] > 2, 1, 0);
B <- if_else(exp(beta[2]) > 1.25, 1, 0);
A_or_B <- if_else(A || B, 1, 0);
A_and_B <- if_else(A && B, 1, 0);
}
"""
# Compile the model and sample with pystan defaults (4 chains); the prior scales
# sigma_b and the zero prior mean for beta are supplied as data.
fit = pystan.stan(model_code=model,
data=dict(x=x, y1=y1, y2=y2, treat=treat, n=n, sigma_b=(2, 1.5), Zero=(0,0)))
fit['beta'].mean(0)
array([ 4.40912301, 0.72588719])
np.exp(fit['beta'][:,1]).mean()
2.1757017242076957
fit.plot(pars=['beta']);
Probability of A or B being true is essentially one, while the probability of both is about 95%.
# Posterior probabilities of the indicator events defined in generated quantities
# (means of the 0/1 draws for A_or_B and A_and_B).
sample = fit.extract(pars=['A_or_B', 'A_and_B'])
[sample[s].mean(0) for s in sample]
[1.0, 0.94825000000000004]
Correlation between betas
fit['Omega_b'].mean(0)
array([[ 1. , 0.16257258], [ 0.16257258, 1. ]])