Estimating the Date of COVID-19 Changes

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

from scipy import stats
import statsmodels.api as sm
import pylab

# for fancy python printing
from IPython.display import Markdown, display
def printmd(string):
    """Render `string` as Markdown in the notebook's output area."""
    rendered = Markdown(string)
    display(rendered)
    
import warnings
warnings.filterwarnings('ignore')

import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 250
In [2]:
# NOTE(review): absolute, machine-specific path -- consider a configurable,
# relative data directory so the notebook runs on other machines.
data = pd.read_csv("/Users/jonny/Desktop/Dataset/covid_19_counts_may2.csv")
data["Date"] = pd.to_datetime(data["Date"])

# use only canada for now
# One row per (country, date): sum the counts of all rows sharing that date.
# The columns are selected with a list; selecting them with a bare tuple
# (`[...]['a', 'b']`) is deprecated and rejected by current pandas releases.
cad = (
    data.loc[data["Country/Region"] == "Canada",
             ["Country/Region", "Date", "Confirmed", "Deaths", "Recovered"]]
    .rename(columns={
        "Country/Region": "country",
        "Date": "date",
        "Confirmed": "confirmed",
        "Deaths": "deaths",
        "Recovered": "recovered",
    })
    .groupby(["country", "date"])[["confirmed", "deaths", "recovered"]]
    .sum()
    .reset_index()
    # `date` is already datetime (converted on `data` above), so only the
    # chronological ordering is needed here.
    .sort_values(by="date")
)
cad.tail()
Out[2]:
country date confirmed deaths recovered
95 Canada 2020-04-26 48033 2687 0
96 Canada 2020-04-27 49616 2841 0
97 Canada 2020-04-28 51150 2983 0
98 Canada 2020-04-29 52865 3155 0
99 Canada 2020-04-30 54457 3310 0
In [4]:
# countries with the most cases:
# rows dated 2020-04-30, taken as they appear in `data` (no aggregation here),
# ordered by confirmed count with the seven largest shown
(
    data.loc[data["Date"] == "2020-04-30", ["Country/Region", "Confirmed"]]
    .sort_values(by="Confirmed", ascending=False)
    .head(7)
)
Out[4]:
Country/Region Confirmed
26361 US 1069424
26337 Spain 213435
26273 Italy 205463
26359 United Kingdom 171253
26252 France 165764
26256 Germany 163009
26349 Turkey 120204
In [5]:
# function to make the time series of confirmed and daily confirmed cases for a specific country
def create_country(country, end_date, state=False):
    """Build and plot the confirmed-case time series for one country or state.

    Reads the module-level `data` frame, keeps the rows whose
    "Country/Region" (or "Province/State" when `state` is true) equals
    `country`, sums the counts per date, and derives daily new cases and
    their 4-day rolling mean. A two-panel figure is drawn as a side effect:
    cumulative confirmed cases on the left, daily new cases on the right.

    Parameters
    ----------
    country : str
        Value to match in the selected region column; also used in plot titles.
    end_date : str or datetime-like
        Last date (inclusive) to keep in the series.
    state : bool, default False
        Match on "Province/State" instead of "Country/Region".

    Returns
    -------
    pandas.DataFrame
        Columns: country, date, confirmed, deaths, recovered,
        daily_confirmed, moving_avg; ordered by date.
    """
    region_col = "Province/State" if state else "Country/Region"
    count_cols = ["confirmed", "deaths", "recovered"]

    # group by region and date because a region can span several rows per day.
    # The count columns are selected with a list: the bare-tuple form
    # (`[...]['a', 'b']`) is deprecated and rejected by current pandas releases.
    df = (
        data.loc[data[region_col] == country,
                 [region_col, "Date", "Confirmed", "Deaths", "Recovered"]]
        .rename(columns={
            region_col: "country",
            "Date": "date",
            "Confirmed": "confirmed",
            "Deaths": "deaths",
            "Recovered": "recovered",
        })
        .groupby(["country", "date"])[count_cols]
        .sum()
        .reset_index()
    )

    # convert date string to datetime, order chronologically, truncate.
    # `.copy()` makes the filtered frame independent, so the column
    # assignments below do not write into a slice of another frame.
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values(by="date")
    df = df[df["date"] <= end_date].copy()

    # new confirmed cases per day: difference of the cumulative count, with
    # the first day measured against zero.
    df["daily_confirmed"] = np.diff(df["confirmed"].to_numpy(), prepend=0)

    # moving average for daily confirmed cases (first three rows are NaN)
    df["moving_avg"] = df["daily_confirmed"].rolling(window=4).mean()

    fig, ax = plt.subplots(1, 2, figsize=(15, 6))

    # left panel: cumulative confirmed cases
    sns.lineplot(x="date", y="confirmed", data=df, ax=ax[0])
    ax[0].set(ylabel="Total Confirmed Cases",
              xlabel="Date",
              title="Total Confirmed COVID-19 Cases in %s" % country)

    # right panel: daily new confirmed cases
    sns.lineplot(x="date", y="daily_confirmed", data=df, ax=ax[1])
    ax[1].set(ylabel="Daily Confirmed Cases",
              xlabel="Date",
              title="Daily New Confirmed COVID-19 Cases in %s" % country)

    return df


def summary(samples):
    """Summarise each entry of `samples` along its first dimension.

    Parameters
    ----------
    samples : dict
        Maps a name to a torch tensor. All statistics reduce over dim 0, so
        dim 0 is presumably the sample axis -- confirm against the caller.

    Returns
    -------
    dict
        For every key of `samples`, a dict with "mean", "std", "5%" and
        "95%" entries (the last two are order statistics via `kthvalue`).
    """
    # NOTE(review): `torch` is not imported in the notebook's visible import
    # cell -- confirm it is imported before this function is called.
    site_stats = {}
    for name, draws in samples.items():
        n_draws = len(draws)
        # `kthvalue` ranks are 1-based; with fewer than 20 draws
        # int(n * 0.05) is 0 and would raise, so clamp the rank to 1.
        k_low = max(1, int(n_draws * 0.05))
        k_high = max(1, int(n_draws * 0.95))
        site_stats[name] = {
            "mean": torch.mean(draws, 0),
            "std": torch.std(draws, 0),
            "5%": draws.kthvalue(k_low, dim=0)[0],
            "95%": draws.kthvalue(k_high, dim=0)[0],
        }
    return site_stats

US

In [6]:
# Build the US series up to 2020-04-30; the call also draws the cumulative and daily-case plots.
us = create_country("US", end_date = "2020-04-30")
In [7]:
# first date kept in the US series (original note: 51 confirmed cases)
us_start = "2020-02-26"
on_or_after_start = us.date >= us_start
us = us.loc[on_or_after_start].reset_index(drop=True)
# 1-based day counter over the remaining rows
us["days_since_start"] = np.arange(1, len(us) + 1)

Canada

In [8]:
# Build the Canada series up to 2020-04-30 (rebinds `cad`, replacing the frame assigned in the earlier cell).
cad = create_country("Canada", end_date = "2020-04-30")