Ordinary Linear Regression VS Bayesian Linear Regression¶

The Economic Freedom Index is poised to help readers track over two decades of the advancement in economic freedom, prosperity, and opportunity and promote these ideas in their homes, schools, and communities. The Index covers 12 freedoms – from property rights to financial freedom – in 186 countries. The measurement of economic freedom is based on 12 quantitative and qualitative factors, grouped into four broad categories, or pillars, of economic freedom:

Rule of Law (property rights, government integrity, judicial effectiveness)
Government Size (government spending, tax burden, fiscal health)
Regulatory Efficiency (business freedom, labor freedom, monetary freedom)
Open Markets (trade freedom, investment freedom, financial freedom)

In [1]:

#hide
import pandas as pd
import numpy as np

import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

Let's see if business freedom has the same impact on GDP for countries in North America and Europe and outside.

In [227]:

data = pd.read_csv("/Users/jonny/Downloads/economic_freedom_index2019_data.csv", 
                   encoding='ISO-8859-1')[["Region", "Business Freedom", "GDP per Capita (PPP)"]]

# drop null rows
data = data.dropna(axis = 0, how = "any")
data.head()

Out[227]:

	Region	Business Freedom	GDP per Capita (PPP)
0	Asia-Pacific	49.2	$1,958
1	Europe	69.3	$12,507
2	Middle East and North Africa	61.6	$15,237
3	Sub-Saharan Africa	55.7	$6,753
4	Americas	56.4	$20,876

In [228]:

# GDP is a string, chnage it to int:
data["gdppc"] = list(map(lambda x: int(x.strip('$').replace(',', "").split(' ')[0]), data["GDP per Capita (PPP)"]))
data["log_gdppc"] = np.log(data["gdppc"])

# add 2 new features:
data["is_europe"] = data.Region == "Europe"
data["is_europe"] = data.is_europe.astype(int)
data["business_freedom_x_region"] = data.is_europe * data["Business Freedom"]

# subset and rename the columns that we need
df = data[["is_europe", "Business Freedom", "business_freedom_x_region", "log_gdppc"]].copy()
df.columns = ["is_europe", "business_freedom", "business_freedom_x_region", "log_gdppc"]
df.head()

Out[228]:

	is_europe	business_freedom	business_freedom_x_region	log_gdppc
0	0	49.2	0.0	7.579679
1	1	69.3	69.3	9.434044
2	0	61.6	0.0	9.631482
3	0	55.7	0.0	8.817742
4	0	56.4	0.0	9.946355

In [303]:

inside_europe = df[df.is_europe == 1]
outside_europe = df[df.is_europe == 0]

x = "business_freedom"
y = "log_gdppc"
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), sharey=True)
ax[0].scatter(x = inside_europe[x], y = inside_europe[y])
ax[0].set(title = "Europe")
ax[1].scatter(x = outside_europe[x], y = outside_europe[y])
ax[1].set(title = "Outside of Europe");

Ordinary Linear Regression¶

In [230]:

features = ["business_freedom", "business_freedom_x_region", "is_europe"]
x = df[features]
y = df["log_gdppc"]

reg = LinearRegression()
_ = reg.fit(x, y)

coef = dict([i for i in zip(list(x.columns), reg.coef_)]); coef
print("Intercept: %f" % reg.intercept_)
print("Coefficient of Determination: %f" % reg.score(x, y))

Out[230]:

{'business_freedom': 0.046021183174684115,
 'business_freedom_x_region': -0.02015001509198234,
 'is_europe': 2.0992420204524835}

Intercept: 6.247260
Coefficient of Determination: 0.478465

In [304]:

# predict log GPD using region and business freedom
europe_gdp = reg.predict(inside_europe[features])
non_europe_gdp = reg.predict(outside_europe[features])

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5), sharey=True)

ax[0].plot(outside_europe["business_freedom"],
           non_europe_gdp,
           color = "orange",
           label = "Regression line")
ax[0].scatter(outside_europe["business_freedom"],
              outside_europe["log_gdppc"],
              label="non-European Nations",
              s = 15)
ax[0].set(xlabel="business_freedom",
          ylabel="log GDP Per Capita",
          title="Non European Nations")

ax[1].plot(inside_europe["business_freedom"],
           europe_gdp,
           color = "orange",
           label = "Regression line")
ax[1].scatter(inside_europe["business_freedom"],
              inside_europe["log_gdppc"],
              color = "red",
              label="European Nations",
              s = 15)
ax[1].set(xlabel="business_freedom",
          ylabel="log GDP Per Capita",
          title="European Nations")
ax[0].legend()
ax[1].legend();

In [305]:

# backout the slopes of the regression lines for nations in and out of Europe
print("Slope for European Nations: ", 
      round(coef["business_freedom"] + coef["business_freedom_x_region"], 3))
print("Slope for non-European Nations: ", round(coef["business_freedom"], 3))

Slope for European Nations:  0.026
Slope for non-European Nations:  0.046

Bayesian Regression¶

In [247]:

#hide
from functools import partial
import torch

import pyro
import pyro.distributions as dist
from torch import nn
from pyro.nn import PyroModule

from pyro.nn import PyroSample
from pyro.infer.autoguide import AutoDiagonalNormal
from pyro.infer import SVI, Trace_ELBO
from pyro.infer import Predictive

In [252]:

class BayesianRegression(PyroModule):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = PyroModule[nn.Linear](in_features, out_features)
        # PyroSample used to declare priors:
        self.linear.weight = PyroSample(dist.Normal(0., 5.).expand([out_features, in_features]).to_event(2))
        self.linear.bias = PyroSample(dist.Normal(0., 5.).expand([out_features]).to_event(1))

    def forward(self, x, y=None):
        sigma = pyro.sample("sigma", dist.Uniform(0., 10.))
        mean = self.linear(x).squeeze(-1)
        # sample from the posterior
        obs = pyro.sample("obs", dist.Normal(mean, sigma), obs=y)
        return mean

In [253]:

tensor_data = torch.tensor(df.values, dtype=torch.float)
x_data = tensor_data[:, [0, 1, 2]]
y_data = tensor_data[:, 3] 

In [254]:

model = BayesianRegression(3, 1)
auto_guide = AutoDiagonalNormal(model)

svi = SVI(model = model, # bayesian regression class  
          guide = auto_guide, # using auto guide
          optim = pyro.optim.Adam({"lr": 0.05}),
          loss=Trace_ELBO())

In [255]:

num_iterations = 2500

# param_store is where pyro stores param estimates
pyro.clear_param_store()

for j in range(num_iterations):
    # calculate the loss and take a gradient step
    loss = svi.step(x_data, y_data)
    if j % 250 == 0:
        print("[iteration %04d] loss: %.4f" % (j + 1, loss / len(data)))

[iteration 0001] loss: 67.0926
[iteration 0251] loss: 2.7928
[iteration 0501] loss: 2.4033
[iteration 0751] loss: 1.8199
[iteration 1001] loss: 1.5908
[iteration 1251] loss: 1.4756
[iteration 1501] loss: 1.4575
[iteration 1751] loss: 1.5769
[iteration 2001] loss: 1.4664
[iteration 2251] loss: 1.5560

In [256]:

auto_guide.requires_grad_(False)

for name, value in pyro.get_param_store().items():
    print(name, pyro.param(name))

Out[256]:

AutoDiagonalNormal()

AutoDiagonalNormal.loc Parameter containing:
tensor([-2.2944,  1.5998,  0.0407, -0.0170,  6.2517])
AutoDiagonalNormal.scale tensor([0.0579, 0.1402, 0.0012, 0.0022, 0.0457])

In [257]:

num_samples = 1000
predictive = Predictive(model = model, 
                        guide = auto_guide, 
                        num_samples = num_samples,
                        return_sites=("linear.weight", "linear.bias", "obs", "_RETURN"))
pred = predictive(x_data)
weight = pred["linear.weight"]
weight = weight.reshape(weight.shape[0], 3)
bias = pred["linear.bias"]

In [258]:

#random
# columns of x_data: is_europe, business_freedom, business_freedom_x_region
print("Mean for posterior distributions: ", torch.mean(weight, 0))
print("97.5 percentile: ", weight.kthvalue(int(num_samples * 0.975), dim = 0)[0]) # find the 97.5 percentile value
print("2.5 percentile: ", weight.kthvalue(int(num_samples * 0.025), dim = 0)[0]) # find the 2.5 percentile value

columns = ["is_europe", "business_freedom", "business_freedom_x_region"]
bayes_coef = dict(zip(columns, torch.mean(weight, 0).numpy()))

Mean for posterior distributions:  tensor([ 1.6074,  0.0407, -0.0169])
97.5 percentile:  tensor([ 1.8948,  0.0431, -0.0128])
2.5 percentile:  tensor([ 1.3426,  0.0385, -0.0213])

In [301]:

fig = plt.figure(figsize = (10, 5))
sns.distplot(weight[:, 1], 
             kde_kws = {"label": "business_freedom Posterior Samples"}, 
             color = "teal",
             norm_hist = True,
             kde = True)
sns.distplot(weight[:, 2], 
             kde_kws = {"label": "Interaction Term Posterior Samples"}, 
             color = "red",
             norm_hist = True,
             kde = True)

fig.suptitle("Posterior Distributions");

In [319]:

slope_inside_europe = weight[:, 1] + weight[:, 2] # business_freedom + business_freedom_x_region
slope_outside_europe = weight[:, 1] # business_freedom

print("Slope for European nations: ", torch.mean(slope_inside_europe).numpy()) # business_freedom + interaction
print("Slope for non-European nations: ", torch.mean(slope_outside_europe).numpy()) # business_freedom

fig = plt.figure(figsize=(10, 5))
sns.distplot(slope_inside_europe, kde_kws = {"label": "European nations"})
sns.distplot(slope_outside_europe, kde_kws={"label": "Non-European nations"})
fig.suptitle("log(GDP Per Capita) vs Business Freedom");

Slope for European nations:  0.023792317
Slope for non-European nations:  0.040710554

Not Included in Blog¶

In [ ]:

def summary(samples):
    site_stats = {}
    for k, v in samples.items():
        site_stats[k] = {
            "mean": torch.mean(v, 0),
            "std": torch.std(v, 0),
            "5%": v.kthvalue(int(len(v) * 0.025), dim=0)[0],
            "95%": v.kthvalue(int(len(v) * 0.975), dim=0)[0],
        }
    return site_stats

samples = predictive(x_data)
pred_summary = summary(samples)

mu = pred_summary["_RETURN"]
y = pred_summary["obs"]
predictions = pd.DataFrame({
    "cont_africa": x_data[:, 0],
    "rugged": x_data[:, 1],
    "mu_mean": mu["mean"],
    "mu_perc_5": mu["5%"],
    "mu_perc_95": mu["95%"],
    "y_mean": y["mean"],
    "y_perc_5": y["5%"],
    "y_perc_95": y["95%"],
    "true_gdp": y_data,
})
predictions.head()

In [ ]:

african_nations = predictions[predictions["cont_africa"] == 1]
non_african_nations = predictions[predictions["cont_africa"] == 0]
african_nations = african_nations.sort_values(by=["rugged"])
non_african_nations = non_african_nations.sort_values(by=["rugged"])

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5), sharey=True)
fig.suptitle("Regression line with 95% CI", fontsize=16)
ax[0].plot(non_african_nations["rugged"],
           non_african_nations["mu_mean"])
ax[0].fill_between(non_african_nations["rugged"],
                   non_african_nations["mu_perc_5"],
                   non_african_nations["mu_perc_95"],
                   alpha=0.5)
ax[0].plot(non_african_nations["rugged"],
           non_african_nations["true_gdp"],
           "o")
ax[0].set(xlabel="Terrain Ruggedness Index",
          ylabel="log GDP (2000)",
          title="Non African Nations")
idx = np.argsort(african_nations["rugged"])
ax[1].plot(african_nations["rugged"],
           african_nations["mu_mean"])
ax[1].fill_between(african_nations["rugged"],
                   african_nations["mu_perc_5"],
                   african_nations["mu_perc_95"],
                   alpha=0.5)
ax[1].plot(african_nations["rugged"],
           african_nations["true_gdp"],
           "o")
ax[1].set(xlabel="Terrain Ruggedness Index",
          ylabel="log GDP (2000)",
          title="African Nations");