Estimating the number of infected people by country based on the number of deaths and case fatality rate.
Note: This dashboard contains the results of a predictive model. The author has tried to make it as accurate as possible. But the COVID-19 situation is changing quickly, and these models inevitably include some level of speculation.
#hide
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from datetime import timedelta, datetime, date
%config InlineBackend.figure_format = 'retina'
# Pixel dimensions shared by every Altair chart built in this notebook.
chart_width, chart_height = 550, 400
#hide
def plot(data, type1, levels):
    """Build an interactive Altair chart from per-country estimate frames.

    Parameters
    ----------
    data : list of pandas.DataFrame
        One frame per country. Every frame must contain ``location`` and
        ``total_infected``. Line charts additionally read
        ``total_infected_lower`` and ``total_infected_upper``; scatter
        charts additionally read ``total_cases``.
    type1 : str
        ``"line"`` draws the estimate over the row position within each
        country's frame, with a shaded lower/upper band. ``"scatter"``
        draws estimated infected against confirmed cases together with a
        dashed 45-degree reference line.
    levels : bool
        Only affects the axis titles: truthy labels them as totals, falsy
        as per-million values.

    Returns
    -------
    altair layered chart, or ``None`` when ``type1`` is neither
    ``"line"`` nor ``"scatter"``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``data`` is empty.
    """
    # Iterate over the frames that were actually passed in rather than over
    # the module-level ``countries`` list, so the function cannot index past
    # the end of ``data`` (or skip frames) when the two differ in length.
    frames = []
    for country_frame in data:
        # reset_index() returns a new frame, so the caller's data is untouched.
        frame = country_frame.reset_index()
        # Row position within the country's series, used as the x axis of
        # the line chart ("Days since outbreak").
        frame['n_days'] = frame.index
        if type1 == "scatter":
            frame['cases'] = frame["total_cases"]
        frame['infected'] = frame["total_infected"]
        frames.append(frame)

    # A single concat instead of growing the frame one country at a time.
    data_plot = pd.concat(frames, axis=0)
    if type1 == "scatter":
        # y == x reference: points on this line have infected == cases.
        data_plot["45_line"] = data_plot["cases"]

    # Zoom/pan bound to the scales, and legend-driven country highlighting.
    scales = alt.selection_interval(bind='scales')
    selection = alt.selection_multi(fields=['location'], bind='legend')
    country_color = alt.Color(
        'location:N',
        legend=alt.Legend(title="Country", labelFontSize=15, titleFontSize=17),
        scale=alt.Scale(scheme='tableau20'),
    )

    if type1 == "line":
        ylabel = "Total" if levels else "Per Million"
        base = alt.Chart(source := data_plot, title="Estimated Infected Population By Country").encode(
            x=alt.X('n_days:Q', title="Days since outbreak"),
            y=alt.Y("infected:Q", title=ylabel),
            color=country_color,
        ) if False else alt.Chart(data_plot, title="Estimated Infected Population By Country").encode(
            x=alt.X('n_days:Q', title="Days since outbreak"),
            y=alt.Y("infected:Q", title=ylabel),
            color=country_color,
        )
        # Band between the lower and upper bound estimates.
        shades = base.mark_area().encode(
            x='n_days:Q',
            y='total_infected_lower:Q',
            y2='total_infected_upper:Q',
            opacity=alt.condition(selection, alt.value(0.2), alt.value(0.05)),
        )
        lines = base.mark_line().encode(
            opacity=alt.condition(selection, alt.value(1), alt.value(0.1)),
        ).add_selection(
            scales
        ).add_selection(
            selection
        ).properties(
            width=chart_width,
            height=chart_height,
        )
        return (
            (lines + shades)
            .configure_title(fontSize=20)
            .configure_axis(labelFontSize=15, titleFontSize=18)
        )

    if type1 == "scatter":
        if levels:
            ylabel = "Infected"
            xlabel = "Cases"
        else:
            ylabel = "Per Million Infected"
            xlabel = "Per Million Cases"
        base = alt.Chart(data_plot, title="COVID-19 Cases VS Infected").encode(
            x=alt.X('cases:Q', title=xlabel),
            y=alt.Y("infected:Q", title=ylabel),
            color=country_color,
            opacity=alt.condition(selection, alt.value(1), alt.value(0.1)),
        )
        scatter = base.mark_point().add_selection(
            scales
        ).add_selection(
            selection
        ).properties(
            width=chart_width,
            height=chart_height,
        )
        # Series.max() skips NaN, unlike the builtin max() whose result
        # depends on where a NaN happens to sit in the column.
        infected_max = data_plot["infected"].max()
        line_45 = alt.Chart(data_plot).encode(
            x="cases:Q",
            y=alt.Y("45_line:Q", scale=alt.Scale(domain=(0, infected_max))),
        ).mark_line(color="grey", strokeDash=[3, 3])
        return (
            (scatter + line_45)
            .configure_title(fontSize=20)
            .configure_axis(labelFontSize=15, titleFontSize=18)
        )

    # Unknown chart type: nothing to draw.
    return None
#hide
# Get data on deaths D_t: cumulative deaths per country and day, in long format.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 (superseded by `on_bad_lines`) - confirm the pinned pandas version.
data = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                   error_bad_lines=False)
data = (
    data.drop(columns=["Lat", "Long"])
    # One row per (province, country, date column).
    .melt(id_vars=["Province/State", "Country/Region"])
    # Collapse provinces into a single country total per date.
    .groupby(['Country/Region', "variable"])
    .sum()
    .reset_index()
    .rename(columns={"Country/Region": "location", "variable": "date", "value": "total_deaths"})
)
data['date'] = pd.to_datetime(data.date)
data = data.sort_values(by="date")
# Use the display names that the country lists in this notebook expect.
for source_name, display_name in (("US", "United States"), ("Korea, South", "South Korea")):
    data.loc[data.location == source_name, "location"] = display_name
#hide
# Get data and clean it: cumulative confirmed cases per country and day, in long format.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 (superseded by `on_bad_lines`) - confirm the pinned pandas version.
data_cases = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv", error_bad_lines=False)
data_cases = (
    data_cases.drop(columns=["Lat", "Long"])
    # One row per (province, country, date column).
    .melt(id_vars=["Province/State", "Country/Region"])
    # Collapse provinces into a single country total per date.
    .groupby(['Country/Region', "variable"])
    .sum()
    .reset_index()
    .rename(columns={"Country/Region": "location", "variable": "date", "value": "total_cases"})
)
data_cases['date'] = pd.to_datetime(data_cases.date)
data_cases = data_cases.sort_values(by="date")
# Use the display names that the country lists in this notebook expect.
for source_name, display_name in (("US", "United States"), ("Korea, South", "South Korea")):
    data_cases.loc[data_cases.location == source_name, "location"] = display_name
# Add countries
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany",
             "Portugal", "United States", "Singapore", "South Korea", "Japan",
             "Brazil", "Iran", "India", "Switzerland", "Canada", "Australia"]


def _estimate_infected(frame, column, cfr_scale, locations, lag=8, window=4):
    """Fill ``frame[column]`` with the death-based estimate of infected people.

    For every location in ``locations``:

    1. Dates that have an observation ``lag`` days later get
       ``total_deaths / (CFR * cfr_scale)`` evaluated at that later date.
    2. ``infected_g`` is (re)computed as the first difference of
       ``log(frame[column])`` over the whole sorted frame.
    3. The final ``lag`` dates are filled forward one day at a time by
       multiplying the previous day's value by ``1 + g``, where ``g`` is
       the mean of ``infected_g`` over the ``window`` rows that precede
       the final ``lag`` rows of the location.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs ``location``, ``date``, ``total_deaths`` and ``CFR`` with one
        row per (location, date). ``column`` and ``infected_g`` are
        (over)written.
    column : str
        Name of the output column.
    cfr_scale : float
        Multiplier applied to ``CFR`` (1.0 for the point estimate; values
        below/above 1 give the upper/lower bound).
    locations : iterable of str
        Locations to estimate; all other rows stay NaN.
    lag : int, default 8
        Days between the estimated date and the deaths used for it; must
        be at least 1.
    window : int, default 4
        Number of rows averaged to obtain the growth rate.

    Returns
    -------
    pandas.DataFrame
        The frame sorted by ``location`` and ``date`` with a fresh index.
    """
    # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
    frame[column] = np.nan
    frame = frame.sort_values(by=['location', 'date']).reset_index(drop=True)

    lag_delta = pd.Timedelta(days=lag)
    for name in locations:
        rows = frame.location == name
        country = frame.loc[rows]
        # Series division yields NaN for 0/0 without emitting the scalar
        # "invalid value encountered" RuntimeWarning.
        implied = country["total_deaths"] / (country["CFR"] * cfr_scale)
        by_date = pd.Series(implied.values, index=country["date"].values)
        # Date-aligned lookup of the value `lag` days ahead; dates with no
        # observation `lag` days later (the final `lag` days) map to NaN.
        frame.loc[rows, column] = (country["date"] + lag_delta).map(by_date).values

    # Estimate growth rate of infected, g
    frame['infected_g'] = np.log(frame[column]).diff()

    # Estimate number of infected given g
    last_date = frame.date.iloc[-1]
    for name in locations:
        rows = frame.location == name
        growth = frame.loc[rows, "infected_g"].iloc[-(lag + window):-lag].mean()
        for days_back in range(lag - 1, -1, -1):
            # Re-read inside the loop: each day builds on the value that the
            # previous iteration has just written.
            previous = frame.loc[rows, column].iloc[-days_back - 2]
            target = rows & (frame.date == last_date - pd.Timedelta(days=days_back))
            frame.loc[target, column] = previous * (1 + growth)
    return frame


data_final = pd.merge(data,
                      data_cases
                      )
data_final["CFR"] = data_final["total_deaths"] / data_final["total_cases"]

# Point estimate, then the upper bound (CFR scaled by 0.7) and the lower
# bound (CFR scaled by 1.3).
data_final = _estimate_infected(data_final, "total_infected", 1.0, countries)
# Upper Bound
data_final = _estimate_infected(data_final, "total_infected_upper", 0.7, countries)
# Lower Bound
data_final = _estimate_infected(data_final, "total_infected_lower", 1.3, countries)
today = data_final.date.iloc[-1]

# The lower bound can never be below the number of confirmed cases.
below_cases = data_final.total_infected_lower < data_final.total_cases
data_final.loc[below_cases, "total_infected_lower"] = data_final.loc[below_cases, "total_cases"]
# Columns needed by the line chart and by the latest-estimates table.
data_pc = data_final[['location', 'date', 'total_infected', 'total_infected_lower', 'total_infected_upper']].copy()
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany",
             "Portugal", "United States", "Singapore", "South Korea", "Japan",
             "Brazil", "Iran"]
data_countries = []
# Get each country time series, keeping only rows whose estimate exceeds one
# infected person. (The former loop that assigned `total_infected` to itself
# had no effect and has been dropped.)
filter1 = data_pc["total_infected"] > 1
data_countries_pc = [
    data_pc[(data_pc["location"] == country) & filter1]
    for country in countries
]
/opt/hostedtoolcache/Python/3.6.10/x64/lib/python3.6/site-packages/ipykernel_launcher.py:46: RuntimeWarning: invalid value encountered in true_divide /opt/hostedtoolcache/Python/3.6.10/x64/lib/python3.6/site-packages/ipykernel_launcher.py:67: RuntimeWarning: invalid value encountered in true_divide /opt/hostedtoolcache/Python/3.6.10/x64/lib/python3.6/site-packages/ipykernel_launcher.py:83: RuntimeWarning: invalid value encountered in true_divide
with respect to days since outbreak
Tip: Click (Shift+ for multiple) on countries in the legend to filter the visualization.
#hide_input
# Plot estimated absolute number of infected
# "line" selects the time-series chart; True labels the y axis "Total".
plot1 = plot(data_countries_pc, "line", True)
# Export the chart next to the site's other images.
# NOTE(review): the output format is presumably inferred from the ".png"
# extension and may need an extra Altair export backend - confirm.
plot1.save("../images/covid-estimate-infections.png")
# Bare expression so the notebook renders the chart as the cell output.
plot1
Latest Country Estimates
#hide_input
# Latest estimate per country, formatted for display.
label = 'Estimated Infected'
metric_name = f'{label}'
temp = pd.concat(data_countries_pc)
# Only rows from 1 March 2020 onwards; copy so the column edits below act on
# an independent frame.
temp = temp[temp.date >= '3/1/2020'].copy()
temp.columns = ['Country', 'Date', metric_name, "Lower Bound Estimates", "Upper Bound Estimates"]
# Replace each numeric column with thousands-separated strings. Plain column
# assignment swaps the whole column, so it does not rely on `.loc[:, col]`
# upcasting a float column to strings in place (deprecated in recent pandas).
for column in (metric_name, "Lower Bound Estimates", "Upper Bound Estimates"):
    temp[column] = temp[column].round(0).map('{:,.0f}'.format)
# Last available row per country.
temp.groupby('Country').last()
Date | Estimated Infected | Lower Bound Estimates | Upper Bound Estimates | |
---|---|---|---|---|
Country | ||||
Brazil | 2020-03-27 | 9,996 | 7,689 | 14,279 |
China | 2020-03-27 | 82,700 | 81,897 | 118,143 |
France | 2020-03-27 | 86,737 | 66,720 | 123,909 |
Germany | 2020-03-27 | 145,131 | 111,639 | 207,330 |
Iran | 2020-03-27 | 61,918 | 47,629 | 88,454 |
Italy | 2020-03-27 | 154,952 | 119,193 | 221,359 |
Japan | 2020-03-27 | 2,445 | 1,881 | 3,493 |
Portugal | 2020-03-27 | 16,274 | 12,518 | 23,248 |
Singapore | 2020-03-27 | 1,467 | 1,129 | 2,096 |
South Korea | 2020-03-27 | 10,117 | 9,332 | 14,452 |
Spain | 2020-03-27 | 210,367 | 161,821 | 300,524 |
United Kingdom | 2020-03-27 | 61,825 | 47,558 | 88,322 |
United States | 2020-03-27 | 467,805 | 359,850 | 668,293 |
This chart lets you compare how well countries have been tracking the true number of infected people. The smaller the deviation from the dashed 45-degree line, the more closely a country's confirmed cases track the estimated true number of infected people.
Tip: Click (Shift+ for multiple) on countries in the legend to filter the visualization.
#hide_input
# Plot it using Altair
# Columns needed by the scatter chart and by the observed-vs-estimated table.
data_pc = data_final[['location', 'date', 'total_cases', 'total_infected']].copy()
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany",
             "Portugal", "United States", "Singapore", "South Korea", "Japan",
             "Brazil", "Iran"]
data_countries = []
# Get each country time series, keeping only rows whose estimate exceeds one
# infected person. (The former loop that assigned `total_infected` and
# `total_cases` to themselves had no effect and has been dropped.)
filter1 = data_pc["total_infected"] > 1
data_countries_pc = [
    data_pc[(data_pc["location"] == country) & filter1]
    for country in countries
]
# Bare expression so the notebook renders the chart as the cell output.
plot(data_countries_pc, "scatter", True)
Latest Observed vs. Estimate of Infected Cases
#hide_input
# Latest observed cases next to the latest estimate, formatted for display.
label1 = 'Observed Cases'
label2 = 'Estimated Infected'
metric_name1 = f'{label1}'
metric_name2 = f'{label2}'
temp = pd.concat(data_countries_pc)
# Only rows from 1 March 2020 onwards; copy so the column edits below act on
# an independent frame.
temp = temp[temp.date >= '3/1/2020'].copy()
temp.columns = ['Country', 'Date', metric_name1, metric_name2]
# Replace each numeric column with thousands-separated strings. Plain column
# assignment swaps the whole column, so it does not rely on `.loc[:, col]`
# upcasting a numeric column to strings in place (deprecated in recent pandas).
for column in (metric_name1, metric_name2):
    temp[column] = temp[column].round(0).map('{:,.0f}'.format)
# Last available row per country.
temp.groupby('Country').last()
Date | Observed Cases | Estimated Infected | |
---|---|---|---|
Country | |||
Brazil | 2020-03-27 | 3,417 | 9,996 |
China | 2020-03-27 | 81,897 | 82,700 |
France | 2020-03-27 | 33,402 | 86,737 |
Germany | 2020-03-27 | 50,871 | 145,131 |
Iran | 2020-03-27 | 32,332 | 61,918 |
Italy | 2020-03-27 | 86,498 | 154,952 |
Japan | 2020-03-27 | 1,468 | 2,445 |
Portugal | 2020-03-27 | 4,268 | 16,274 |
Singapore | 2020-03-27 | 732 | 1,467 |
South Korea | 2020-03-27 | 9,332 | 10,117 |
Spain | 2020-03-27 | 65,719 | 210,367 |
United Kingdom | 2020-03-27 | 14,745 | 61,825 |
United States | 2020-03-27 | 101,657 | 467,805 |
We argue that the number of infected in the past can be inferred using today's number of deaths and average fatality rate from confirmed cases in the following way:
{%raw%}$$I_{t-j} = \frac{D_t}{{CFR}_t}$${%endraw%}
where {% raw %}$I_t${% endraw %} = number of infected, {% raw %}$D_t${% endraw %} = number of deaths, and {% raw %}${CFR}_t${% endraw %} = case fatality rate = {% raw %}$\frac{D_t}{C_t}${% endraw %}, with {% raw %}$C_t${% endraw %} the number of confirmed cases. The lag {% raw %}$j${% endraw %} is the average number of days between a COVID-19 patient's first symptoms and death.
Assumption 1: The case fatality rate is a good proxy for the fatality rate of the infected population
Then, in order to estimate the current number of infected {% raw %}$I_t${% endraw %} we need to estimate its growth rate from {% raw %}$t-j${% endraw %} to {% raw %}$t${% endraw %}.
{% raw %}$$I_t = (1+\hat{g})^j I_{t-j}$${% endraw %}
Assumption 2: The growth rate of infected $\hat{g}$ is an unbiased estimate of $g$ .
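For now we estimate $g$ as the average daily growth rate of the estimated number of infected over the four most recent days for which the lagged estimate {% raw %}$I_{t-j}${% endraw %} is available.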
Assumption 3: It takes on average 8 days to die after having the first symptoms.
This analysis was conducted by Joao B. Duarte. Relevant sources are listed below: