import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.linear_model import LinearRegression
from scipy import stats
import statsmodels.api as sm
import pylab
# from google.colab import files
# from io import StringIO
# uploaded = files.upload()
# Source CSV with the per-country COVID-19 counts used throughout the notebook.
url = 'https://raw.githubusercontent.com/assemzh/ProbProg-COVID-19/master/full_grouped.csv'
data = pd.read_csv(url)
# Parse the Date column into timestamps so later date comparisons work on datetimes.
data["Date"] = pd.to_datetime(data["Date"])
# for fancy python printing
from IPython.display import Markdown, display
def printmd(string):
    """Display *string* in the notebook output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 250
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing as tm
# function to make the time series of confirmed and daily confirmed cases for a specific country
def create_country(country, end_date, state=False,
                   save_path='/content/sample_data/kr_daily.pdf'):
    """Build, plot and return the case time series for one country or state.

    Reads the module-level ``data`` frame, keeps the rows whose region column
    equals ``country``, sums the counts per date, derives the daily new
    confirmed cases and draws them as a line plot.

    Parameters
    ----------
    country : str
        Value to match in "Country/Region" (or in "Province/State" when
        ``state`` is true).
    end_date : str or datetime-like
        Last date (inclusive) kept in the returned series.
    state : bool, default False
        Match against "Province/State" instead of "Country/Region".
    save_path : str or None, default '/content/sample_data/kr_daily.pdf'
        Where the figure is written; pass ``None`` to skip saving.

    Returns
    -------
    pandas.DataFrame
        Columns: country, date, confirmed, deaths, recovered, date_only,
        daily_confirmed; sorted by date.
    """
    region_col = "Province/State" if state else "Country/Region"
    df = data.loc[data[region_col] == country,
                  [region_col, "Date", "Confirmed", "Deaths", "Recovered"]]
    df.columns = ["country", "date", "confirmed", "deaths", "recovered"]

    # A region can appear on several rows per date, so sum them up.
    # The columns are selected with a list: tuple-style selection on a
    # GroupBy is deprecated/removed in current pandas.
    df = (df.groupby(['country', 'date'])[['confirmed', 'deaths', 'recovered']]
            .sum()
            .reset_index())

    df.date = pd.to_datetime(df.date)
    # "MM-DD" slice of the timestamp's string form.
    df['date_only'] = df.date.apply(lambda stamp: str(stamp)[5:10])
    df = df.sort_values(by="date")
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered view.
    df = df[df.date <= end_date].copy()

    # New cases per day: difference of the cumulative series, with the first
    # day compared against zero.
    df["daily_confirmed"] = np.diff(np.array(df.confirmed), prepend=0)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 6))
    sns.lineplot(x=df.date, y=df.daily_confirmed, ax=ax)
    ax.axvline(pd.to_datetime('2020-02-25'),
               linestyle='--', linewidth=1.5,
               label="Start of Contact tracing and Quarantine: Feb 25, 2020",
               color="red")
    ax.xaxis.get_label().set_fontsize(16)
    ax.yaxis.get_label().set_fontsize(16)
    ax.title.set_fontsize(20)
    ax.tick_params(labelsize=16)
    # NOTE: '%-d' (day without zero padding) is a platform-specific strftime
    # extension and is not available on Windows.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %-d'))
    ax.set(ylabel='Daily Confirmed Cases', xlabel='')
    # "lower right" is the valid matplotlib spelling ("bottom right" is not).
    ax.legend(loc="lower right", fontsize=12.8)
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))  # one tick per month
    sns.set_style("ticks")
    plt.tight_layout()
    sns.despine()
    if save_path:
        plt.savefig(save_path)
    print(df.tail())
    return df
def summary(samples):
    """Summarise posterior draws per sample site.

    Parameters
    ----------
    samples : dict[str, torch.Tensor]
        Maps a site name to a tensor whose first dimension indexes the draws.

    Returns
    -------
    dict[str, dict[str, torch.Tensor]]
        For every site the "mean", "std", "5%" and "95%" statistics, each
        reduced over the first (draw) dimension.
    """
    site_stats = {}
    for name, draws in samples.items():
        n_draws = len(draws)
        # kthvalue is 1-based; with fewer than 20 draws int(n * 0.05) would
        # be 0 and raise, so fall back to the smallest available draw.
        k_low = max(1, int(n_draws * 0.05))
        k_high = max(1, int(n_draws * 0.95))
        site_stats[name] = {
            "mean": torch.mean(draws, 0),
            "std": torch.std(draws, 0),
            "5%": draws.kthvalue(k_low, dim=0)[0],
            "95%": draws.kthvalue(k_high, dim=0)[0],
        }
    return site_stats
# Build (and plot) the South Korea series, keeping dates up to and including 2020-05-31.
cad = create_country("South Korea", end_date = "2020-05-31")
country date confirmed ... recovered date_only daily_confirmed 126 South Korea 2020-05-27 11344 ... 10340 05-27 79 127 South Korea 2020-05-28 11402 ... 10363 05-28 58 128 South Korea 2020-05-29 11441 ... 10398 05-29 39 129 South Korea 2020-05-30 11468 ... 10405 05-30 27 130 South Korea 2020-05-31 11503 ... 10422 05-31 35 [5 rows x 7 columns]
# First date kept in the analysis window.
cad_start = "2020-02-20"
cad = cad.loc[cad["date"] >= cad_start].reset_index(drop=True)
# 1-based day counter: the first retained row is day 1.
cad["days_since_start"] = np.arange(1, cad.shape[0] + 1)
cad.shape
# Restrict the regression data to dates strictly before April 15th.
cad_tmp = cad.loc[cad["date"] < "2020-04-15"]
cad_tmp.shape
(55, 8)
# Single place to swap the data set used by the model and the plots below.
# country_ is the label interpolated into printed messages and plot titles.
country_ = "South Korea (Before April 15th)"
# Work on a copy so later column additions do not touch cad_tmp.
reg_data = cad_tmp.copy()
reg_data.head()
country | date | confirmed | deaths | recovered | date_only | daily_confirmed | days_since_start | |
---|---|---|---|---|---|---|---|---|
0 | South Korea | 2020-02-20 | 104 | 1 | 16 | 02-20 | 73 | 1 |
1 | South Korea | 2020-02-21 | 204 | 2 | 16 | 02-21 | 100 | 2 |
2 | South Korea | 2020-02-22 | 433 | 2 | 16 | 02-22 | 229 | 3 |
3 | South Korea | 2020-02-23 | 602 | 6 | 18 | 02-23 | 169 | 4 |
4 | South Korea | 2020-02-24 | 833 | 8 | 18 | 02-24 | 231 | 5 |
!pip install pyro-ppl
!pip install numpyro
Collecting pyro-ppl Downloading https://files.pythonhosted.org/packages/aa/7a/fbab572fd385154a0c07b0fa138683aa52e14603bb83d37b198e5f9269b1/pyro_ppl-1.6.0-py3-none-any.whl (634kB) |████████████████████████████████| 634kB 10.8MB/s Requirement already satisfied: torch>=1.8.0 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (1.8.1+cu101) Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (3.3.0) Requirement already satisfied: tqdm>=4.36 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (4.41.1) Collecting pyro-api>=0.1.1 Downloading https://files.pythonhosted.org/packages/fc/81/957ae78e6398460a7230b0eb9b8f1cb954c5e913e868e48d89324c68cec7/pyro_api-0.1.2-py3-none-any.whl Requirement already satisfied: numpy>=1.7 in /usr/local/lib/python3.7/dist-packages (from pyro-ppl) (1.19.5) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=1.8.0->pyro-ppl) (3.7.4.3) Installing collected packages: pyro-api, pyro-ppl Successfully installed pyro-api-0.1.2 pyro-ppl-1.6.0 Collecting numpyro Downloading https://files.pythonhosted.org/packages/00/a6/064eedcec968207259acf06cf156c0ea9a6534328bdf7da0e768cfdb3239/numpyro-0.6.0-py3-none-any.whl (218kB) |████████████████████████████████| 225kB 11.2MB/s Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from numpyro) (4.41.1) Collecting jax==0.2.10 Downloading https://files.pythonhosted.org/packages/88/9d/2862825b5eddd0df64c78b22cc0b897f0128b1c6494bf39e4849e9e0fade/jax-0.2.10.tar.gz (589kB) |████████████████████████████████| 593kB 18.6MB/s Collecting jaxlib==0.1.62 Downloading https://files.pythonhosted.org/packages/7e/75/30f1c643b7edb1309b6d748809042241737fe43127cb41754266eca79250/jaxlib-0.1.62-cp37-none-manylinux2010_x86_64.whl (35.7MB) |████████████████████████████████| 35.7MB 88kB/s Requirement already satisfied: numpy>=1.12 in /usr/local/lib/python3.7/dist-packages (from 
jax==0.2.10->numpyro) (1.19.5) Requirement already satisfied: absl-py in /usr/local/lib/python3.7/dist-packages (from jax==0.2.10->numpyro) (0.12.0) Requirement already satisfied: opt_einsum in /usr/local/lib/python3.7/dist-packages (from jax==0.2.10->numpyro) (3.3.0) Requirement already satisfied: flatbuffers in /usr/local/lib/python3.7/dist-packages (from jaxlib==0.1.62->numpyro) (1.12) Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from jaxlib==0.1.62->numpyro) (1.4.1) Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from absl-py->jax==0.2.10->numpyro) (1.15.0) Building wheels for collected packages: jax Building wheel for jax (setup.py) ... done Created wheel for jax: filename=jax-0.2.10-cp37-none-any.whl size=679776 sha256=511b156e79c1476901deca3c1aa27170da2422b9715e25ff6ec768c9e6c1f136 Stored in directory: /root/.cache/pip/wheels/44/ea/ac/3be3bc19ee3b62f6fe1561eb6df1199284bb6bab819c1befa4 Successfully built jax Installing collected packages: jax, jaxlib, numpyro Found existing installation: jax 0.2.12 Uninstalling jax-0.2.12: Successfully uninstalled jax-0.2.12 Found existing installation: jaxlib 0.1.65+cuda110 Uninstalling jaxlib-0.1.65+cuda110: Successfully uninstalled jaxlib-0.1.65+cuda110 Successfully installed jax-0.2.10 jaxlib-0.1.62 numpyro-0.6.0
import torch
import pyro
import pyro.distributions as dist
from torch import nn
from pyro.nn import PyroModule, PyroSample
from pyro.infer import MCMC, NUTS, HMC
from pyro.infer.autoguide import AutoGuide, AutoDiagonalNormal
from pyro.infer import SVI, Trace_ELBO
from pyro.infer import Predictive
# The prior mean for the bias term of the 2nd regression line can be estimated
# empirically, e.g. b = log(max(daily_confirmed)).
# Alternative design: keep a single regression model and augment the data with
# an interaction term that is only active for t > tau.
class COVID_change(PyroModule):
    """Two-segment (change-point) linear regression with sampled parameters.

    Two ``nn.Linear`` layers model the response before and after a change
    point. Their weights and biases are Pyro sample sites with Normal priors;
    the change point is a sampled fraction ``tau`` of the series length.

    Parameters
    ----------
    in_features, out_features : int
        Sizes passed to both ``nn.Linear`` layers; they also set the shape of
        the weight priors.
    b1_mu : float
        Prior mean of the first segment's bias (prior scale 1).
    b2_mu : float
        Prior mean of the second segment's bias; the prior scale is
        ``b2_mu / 4``, so this value has to be positive.
    """

    def __init__(self, in_features, out_features, b1_mu, b2_mu):
        super().__init__()
        # nn.Linear stores its weight as (out_features, in_features).
        weight_shape = [out_features, in_features]

        self.linear1 = PyroModule[nn.Linear](in_features, out_features, bias=False)
        self.linear1.weight = PyroSample(
            dist.Normal(0.5, 0.25).expand(weight_shape).to_event(1))
        self.linear1.bias = PyroSample(dist.Normal(b1_mu, 1.))

        # could possibly have stronger priors for the 2nd regression line,
        # because we won't have as much data
        self.linear2 = PyroModule[nn.Linear](in_features, out_features, bias=False)
        self.linear2.weight = PyroSample(
            dist.Normal(0., 0.25).expand(weight_shape))  # .to_event(1))
        self.linear2.bias = PyroSample(dist.Normal(b2_mu, b2_mu / 4))

    def forward(self, x, y=None):
        """Return the piecewise-linear mean for ``x`` and observe ``y``.

        The first ``ceil(tau * len(x))`` rows of ``x`` go through ``linear1``
        and the remaining rows through ``linear2``. ``y`` (if given) is
        observed under a Student-t likelihood with 2 degrees of freedom
        around that mean.
        """
        tau = pyro.sample("tau", dist.Beta(4, 3))
        sigma = pyro.sample("sigma", dist.Uniform(0., 3.))

        # Split index derived from tau. The value is detached and turned into
        # a Python int, so no gradient reaches tau through the split.
        # NOTE(review): gradient-based samplers may therefore explore tau
        # poorly - worth confirming against the sampler diagnostics.
        sep = int(torch.ceil(tau.detach() * len(x)))

        mean1 = self.linear1(x[:sep]).squeeze(-1)
        mean2 = self.linear2(x[sep:]).squeeze(-1)
        mean = torch.cat((mean1, mean2))
        pyro.sample("obs", dist.StudentT(2, mean, sigma), obs=y)
        return mean
# Columns: 0 = cumulative confirmed, 1 = days since start, 2 = daily confirmed.
tensor_data = torch.tensor(reg_data[["confirmed", "days_since_start", "daily_confirmed"]].values, dtype=torch.float)
x_data = tensor_data[:, 1].unsqueeze(1)
# Use torch ops on tensors so the results are guaranteed to stay tensors.
y_data = torch.log(tensor_data[:, 0])
# NOTE: days with zero (or negative) daily counts give -inf (or nan) here.
y_data_daily = torch.log(tensor_data[:, 2])

# prior hyper params
log_confirmed = y_data.numpy()
# mean of the log cumulative counts that fall in the lowest quartile
# -> prior mean for the bias of the 1st regression line
q1 = np.quantile(log_confirmed, q=0.25)
bias_1_mean = np.mean(log_confirmed[log_confirmed <= q1])
print("Prior mean for Bias 1: ", bias_1_mean)
# mean of the log cumulative counts that fall in the highest quartile
# -> prior mean for the bias of the 2nd regression line
q4 = np.quantile(log_confirmed, q=0.75)
bias_2_mean = np.mean(log_confirmed[log_confirmed >= q4])
print("Prior mean for Bias 2: ", bias_2_mean)
Prior mean for Bias 1: 7.161369 Prior mean for Bias 2: 9.24027
# Change-point model with one input and one output feature; the bias priors
# are centred on the empirical estimates computed above.
model = COVID_change(1, 1, b1_mu=bias_1_mean, b2_mu=bias_2_mean)

# need more than 400 samples/chain if we want to use a flat prior on b_2 and w_2
num_samples = 400

# Draw from the posterior with a single NUTS chain (200 warm-up steps).
nuts_kernel = NUTS(model)
mcmc = MCMC(
    nuts_kernel,
    num_samples=num_samples,
    warmup_steps=200,
    num_chains=1,
)
mcmc.run(x_data, y_data)
samples = mcmc.get_samples()
Sample: 100%|██████████| 600/600 [33:58, 3.40s/it, step size=9.46e-05, acc. prob=0.860]
# Save the fitted sampler so the run does not have to be repeated.
import dill
with open('korea.pkl', 'wb') as f:
    dill.dump(mcmc, f)
# To restore a saved run instead of re-sampling (only unpickle files you trust):
# with open('korea.pkl', 'rb') as f:
#     mcmc = dill.load(f)
samples = mcmc.get_samples()

# extract individual posteriors
weight_1_post = samples["linear1.weight"].detach().numpy()
weight_2_post = samples["linear2.weight"].detach().numpy()
bias_1_post = samples["linear1.bias"].detach().numpy()
bias_2_post = samples["linear2.bias"].detach().numpy()
tau_post = samples["tau"].detach().numpy()
sigma_post = samples["sigma"].detach().numpy()

# build likelihood distribution:
n_obs = len(x_data)
# Per-draw split index, computed the same way as in the model's forward pass.
tau_days = [int(day) for day in np.ceil(tau_post * n_obs)]
mean_ = torch.zeros(len(tau_days), n_obs)
obs_ = torch.zeros(len(tau_days), n_obs)
for i, split in enumerate(tau_days):
    before = x_data[:split] * weight_1_post[i] + bias_1_post[i]
    after = x_data[split:] * weight_2_post[i] + bias_2_post[i]
    mean_[i, :] = torch.cat((before, after)).reshape(n_obs)
    # Draw the predictive noise from the same Student-t (df = 2) likelihood
    # the model observes the data under, rather than from a Normal.
    obs_[i, :] = dist.StudentT(2, mean_[i, :], float(sigma_post[i])).sample()
samples["_RETURN"] = mean_
samples["obs"] = obs_
pred_summary = summary(samples)
mu = pred_summary["_RETURN"]  # mean
y = pred_summary["obs"]  # samples from likelihood: mu + noise

# Daily increments implied by the mean prediction, back on the count scale.
# The first entry is the first value minus itself, i.e. 0 by construction.
y_mean_cases = torch.exp(y["mean"])
y_shift = y_mean_cases - torch.cat((y_mean_cases[0:1], y_mean_cases[:-1]))
print(y_shift)

# Hand plain arrays to pandas so every column gets a numeric dtype instead of
# depending on how pandas interprets tensor objects.
predictions = pd.DataFrame({
    "days_since_start": np.asarray(x_data[:, 0]),
    "mu_mean": np.asarray(mu["mean"]),  # mean of likelihood
    "mu_perc_5": np.asarray(mu["5%"]),
    "mu_perc_95": np.asarray(mu["95%"]),
    "y_mean": np.asarray(y["mean"]),  # mean of likelihood + noise
    "y_perc_5": np.asarray(y["5%"]),
    "y_perc_95": np.asarray(y["95%"]),
    "true_confirmed": np.asarray(y_data),
    "true_daily_confirmed": np.asarray(y_data_daily),
    "y_daily_mean": np.asarray(y_shift),
})

w1_ = pred_summary["linear1.weight"]
w2_ = pred_summary["linear2.weight"]
b1_ = pred_summary["linear1.bias"]
b2_ = pred_summary["linear2.bias"]
tau_ = pred_summary["tau"]
sigma_ = pred_summary["sigma"]

# Split index implied by the posterior mean of tau: the first `ind` rows
# belong to the first regime, row `ind` is the first row of the second one.
ind = int(torch.ceil(tau_["mean"] * len(x_data)))
tensor([ 0.0000, 82.0016, 102.8954, 138.0626, 177.0313, 233.4354, 292.8321, 382.7584, 494.3716, 653.0012, 845.9763, 1071.9526, 1464.6138, 1130.1460, -6.6099, 61.6411, 75.4199, 73.4316, 56.1016, 97.3462, 101.0249, 61.6816, 97.2871, 83.4458, 64.8203, 88.0947, 114.5322, 62.6924, 112.9365, 89.4297, 80.8291, 102.3701, 85.1367, 107.2764, 59.3027, 139.7627, 96.3291, 81.1426, 90.6025, 138.5977, 63.8389, 111.0518, 109.2520, 112.5029, 100.8203, 86.5762, 126.0703, 100.6436, 118.2803, 117.6064, 111.0918, 96.0430, 121.1953, 112.4150, 144.9268])
# Print the sampler's per-site summary table and keep the diagnostics object for inspection.
# NOTE(review): the recorded output of this cell shows r_hat up to 1.28 and
# n_eff below 10 for several sites, which suggests the single chain has not
# converged - confirm before relying on the estimates.
mcmc.summary()
diag = mcmc.diagnostics()
mean std median 5.0% 95.0% n_eff r_hat tau 0.23 0.01 0.23 0.21 0.25 23.21 1.02 sigma 0.03 0.00 0.03 0.03 0.03 7.24 1.10 linear1.weight[0,0] 0.26 0.01 0.26 0.24 0.27 8.12 1.28 linear1.bias 5.36 0.06 5.36 5.26 5.44 9.64 1.23 linear2.weight[0,0] 0.01 0.00 0.01 0.01 0.01 359.67 1.00 linear2.bias 8.74 0.02 8.74 8.71 8.77 327.63 1.00 Number of divergences: 0
print(ind)
print(reg_data.date[ind])


def _draw_weight_posterior(draws, mean_value, colour, density_label, mean_label):
    """Draw one slope posterior as a histogram/KDE plus a dashed line at its mean."""
    sns.distplot(draws,
                 kde_kws={"label": density_label},
                 color=colour,
                 norm_hist=True,
                 kde=True)
    plt.axvline(x=mean_value, linestyle='--', label=mean_label, color=colour)


# Slope posteriors before and after the change point (CP), drawn on shared axes.
_draw_weight_posterior(weight_1_post, w1_["mean"], "red",
                       "Weight posterior before CP", "Mean weight before CP")
_draw_weight_posterior(weight_2_post, w2_["mean"], "teal",
                       "Weight posterior after CP", "Mean weight after CP")

legend = plt.legend(loc='upper right')
legend.get_frame().set_alpha(1)  # opaque legend box
sns.set_style("ticks")
plt.tight_layout()
sns.despine()
plt.savefig('/content/sample_data/kr_weights.pdf')
13 2020-03-04 00:00:00
start_date_ = str(reg_data.date[0]).split(' ')[0]
change_date_ = str(reg_data.date[ind]).split(' ')[0]
print("Date of change for {}: {}".format(country_, change_date_))
import seaborn as sns


def _day_number(date_like):
    """Return the 1-based days_since_start value of a calendar date."""
    return (pd.Timestamp(date_like) - pd.Timestamp(reg_data.date[0])).days + 1


def _short_label(date_like):
    """Format a date as e.g. 'Mar 4' without platform-specific strftime flags."""
    stamp = pd.Timestamp(date_like)
    return "{} {}".format(stamp.strftime("%b"), stamp.day)


# The x axis is days_since_start, which is 1-based (the first date is day 1).
# Marker and tick positions are derived from dates so they line up with the
# plotted points instead of sitting one day early.
policy_date = pd.Timestamp("2020-02-25")
change_stamp = pd.Timestamp(reg_data.date[ind])
policy_day = _day_number(policy_date)
change_day = _day_number(change_stamp)

# plot data:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 5))
ax = [ax]
# log regression model: observations before / after the estimated change point
ax[0].scatter(y=np.exp(np.asarray(y_data[:ind])),
              x=np.asarray(x_data[:ind]).ravel(), s=15)
ax[0].scatter(y=np.exp(np.asarray(y_data[ind:])),
              x=np.asarray(x_data[ind:]).ravel(), s=15, color="red")
ax[0].plot(predictions["days_since_start"],
           np.exp(predictions["y_mean"]),
           color="green",
           label="Fitted line by MCMC-NUTS model")
ax[0].axvline(policy_day,
              linestyle='--', linewidth=1.5,
              label="Start of Policy: {}, {}".format(_short_label(policy_date),
                                                     policy_date.year),
              color="red")
ax[0].axvline(change_day,
              linestyle='--', linewidth=1.5,
              label="Date of Change: {}, {}".format(_short_label(change_stamp),
                                                    change_stamp.year),
              color="black")
ax[0].fill_between(predictions["days_since_start"],
                   np.exp(predictions["y_perc_5"]),
                   np.exp(predictions["y_perc_95"]),
                   alpha=0.25,
                   label="90% CI of predictions",
                   color="teal")
# tau * len(x_data) is a split index; + 1 moves it onto the 1-based day axis
# so the band is in the same coordinates as the change-date line.
ax[0].fill_betweenx([0, 1],
                    float(tau_["5%"]) * len(x_data) + 1,
                    float(tau_["95%"]) * len(x_data) + 1,
                    alpha=0.25,
                    label="90% CI of changing point",
                    color="lightcoral",
                    transform=ax[0].get_xaxis_transform())
ax[0].set(ylabel="Total Cases",)
ax[0].legend(loc="lower right", fontsize=12.8)
ax[0].set_ylim([100, 150000])
ax[0].xaxis.get_label().set_fontsize(16)
ax[0].yaxis.get_label().set_fontsize(16)
ax[0].title.set_fontsize(20)
ax[0].tick_params(labelsize=16)
tick_dates = [policy_date, change_stamp,
              pd.Timestamp("2020-03-25"), pd.Timestamp("2020-04-10")]
plt.xticks(ticks=[_day_number(d) for d in tick_dates],
           labels=[_short_label(d) for d in tick_dates],
           fontsize=15)
ax[0].set_yscale('log')
plt.setp(ax[0].get_xticklabels(), rotation=0, horizontalalignment='center')
print(reg_data.columns)
sns.set_style("ticks")
sns.despine()
plt.savefig('/content/sample_data/kr_cp.pdf')
Date of change for South Korea (Before April 15th): 2020-03-04 Index(['country', 'date', 'confirmed', 'deaths', 'recovered', 'date_only', 'daily_confirmed', 'days_since_start'], dtype='object')
def _scatter_with_fit(axis, observed, fitted, change_day, ylabel, title):
    """Scatter observations split at the change point and overlay the fitted mean."""
    axis.scatter(y=observed[:ind], x=x_data[:ind], s=15)
    axis.scatter(y=observed[ind:], x=x_data[ind:], s=15, color="red")
    axis.plot(predictions["days_since_start"], fitted,
              color="green", label="Mean")
    axis.axvline(change_day, linestyle='--', linewidth=1, label="Day of Change")
    axis.legend(loc="upper left")
    axis.set(ylabel=ylabel,
             xlabel="Days since %s" % start_date_,
             title=title)


# days_since_start value of the first row after the change point. The day
# axis is 1-based, so this is ind + 1 for consecutive days, not ind.
change_day_ = int(reg_data.days_since_start[ind])

fig, ax = plt.subplots(1, 3, figsize=(15, 6))
# plt.figure(figsize=(11, 5))

# Panel 0: cumulative confirmed cases on a calendar-date axis.
sns.lineplot(x="date",
             y="confirmed",
             data=reg_data,
             ax=ax[0]
             ).set_title("Confirmed COVID-19 Cases in %s" % country_)
ax[0].axvline(reg_data.date[ind], color="red", linestyle="--")

# Panel 1: cumulative confirmed cases against the day counter.
_scatter_with_fit(ax[1], reg_data.confirmed,
                  np.exp(predictions["y_mean"]), change_day_,
                  "Confirmed Cases",
                  "Confirmed Cases: %s" % country_)

# Panel 2: daily confirmed cases against the day counter.
_scatter_with_fit(ax[2], reg_data.daily_confirmed,
                  predictions["y_daily_mean"], change_day_,
                  "Daily Confirmed Cases",
                  "Daily Confirmed Cases: %s" % country_)

printmd("**Date of change for {}: {}**".format(country_, change_date_));
import matplotlib.dates as mdates
myFmt = mdates.DateFormatter('%m-%d')
ax[0].xaxis.set_major_formatter(myFmt)
# ax[0].set_xticklabels(reg_data.date, rotation = 45, fontsize="10", va="center")
plt.setp(ax[0].get_xticklabels(), rotation=30, horizontalalignment='right')
ax[0].set(ylabel='Confirmed Cases', xlabel='Date');
plt.tight_layout()
plt.savefig('/content/sample_data/kr_mean.pdf')
Date of change for South Korea (Before April 15th): 2020-03-04