In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from urllib.request import Request, urlopen
from IPython.display import Markdown as md

%matplotlib inline

General¶

(c) Junqui Liu and Carlos Contreras, January 2021

In [ ]:

In [2]:

# Case data: download Alberta's public COVID-19 case listing as a DataFrame.
req = Request('https://www.alberta.ca/data/stats/covid-19-alberta-statistics-data.csv')
# A browser-like User-Agent is sent with the request
# (presumably the server rejects urllib's default agent -- TODO confirm).
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
# Open the response as a context manager so the HTTP connection is closed as
# soon as the CSV has been parsed, even if parsing raises.
with urlopen(req) as content:
    data = pd.read_csv(content)

In [3]:

# Convert the 'Date reported' strings (formatted YYYY-MM-DD) to datetime64 in place,
# so the column can be sorted, compared and used as a time index below.
data['Date reported'] = pd.to_datetime(data['Date reported'], format='%Y-%m-%d')

In [4]:

# Preview the first rows of the raw case listing.
data.head()

Out[4]:

	Unnamed: 0	Date reported	Alberta Health Services Zone	Gender	Age group	Case status	Case type
0	1	2020-11-13	Calgary Zone	Female	1-4 years	Recovered	Confirmed
1	2	2021-04-21	Edmonton Zone	Male	30-39 years	Recovered	Confirmed
2	3	2021-05-17	North Zone	Male	10-19 years	Recovered	Confirmed
3	4	2020-12-13	Edmonton Zone	Male	5-9 years	Recovered	Confirmed
4	5	2021-01-05	Central Zone	Male	50-59 years	Recovered	Confirmed

In [5]:

# Size of the raw data set and the most recent report date it contains.
print(data.shape)
NUM_PATIENTS = data.shape[0]
# Series.max() is vectorised; the builtin max() walks the column element by
# element, and the original evaluated it twice.
LAST_DATE = data['Date reported'].max()
print(LAST_DATE)
# Derive the feature count from the frame instead of hard-coding it. The first
# column ('Unnamed: 0') looks like the CSV's row counter rather than a feature,
# so it is excluded.
NUM_FEATURES = data.shape[1] - 1
md("This data set contains {} patients and {} features (as of {})".format(NUM_PATIENTS, NUM_FEATURES, LAST_DATE.strftime('%B %d, %Y')))

(264564, 7)
2021-09-08 00:00:00

Out[5]:

This data set contains 264564 patients and 6 features (as of September 08, 2021)

In [6]:

# Column names, non-null counts, dtypes and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264564 entries, 0 to 264563
Data columns (total 7 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   Unnamed: 0                    264564 non-null  int64         
 1   Date reported                 264564 non-null  datetime64[ns]
 2   Alberta Health Services Zone  264564 non-null  object        
 3   Gender                        264564 non-null  object        
 4   Age group                     264564 non-null  object        
 5   Case status                   264564 non-null  object        
 6   Case type                     264564 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 14.1+ MB

In [7]:

# Summary (count / unique / top / freq) of the text-valued columns only.
data.describe(include='object')

Out[7]:

	Alberta Health Services Zone	Gender	Age group	Case status	Case type
count	264564	264564	264564	264564	264564
unique	6	3	12	3	2
top	Calgary Zone	Male	30-39 years	Recovered	Confirmed
freq	104878	133898	50714	246153	261726

In [8]:

# Per-column dtypes; 'Date reported' is datetime64 after the conversion above.
data.dtypes

Out[8]:

Unnamed: 0                               int64
Date reported                   datetime64[ns]
Alberta Health Services Zone            object
Gender                                  object
Age group                               object
Case status                             object
Case type                               object
dtype: object

The following are the possible values for each feature

In [9]:

# Distinct health-zone labels, in order of first appearance.
data['Alberta Health Services Zone'].unique()

Out[9]:

array(['Calgary Zone', 'Edmonton Zone', 'North Zone', 'Central Zone',
       'South Zone', 'Unknown'], dtype=object)

In [10]:

# Distinct age-group labels, in order of first appearance (not in age order).
data['Age group'].unique()

Out[10]:

array(['1-4 years', '30-39 years', '10-19 years', '5-9 years',
       '50-59 years', '60-69 years', 'Under 1 year', '20-29 years',
       '40-49 years', '70-79 years', '80+ years', 'Unknown'], dtype=object)

In [11]:

# Distinct case outcomes present in the data.
data['Case status'].unique()

Out[11]:

array(['Recovered', 'Active', 'Died'], dtype=object)

In [12]:

# Distinct case types present in the data.
data['Case type'].unique()

Out[12]:

array(['Confirmed', 'Probable'], dtype=object)

From here on, only the confirmed cases are used (120928 cases as of January 26).

In [13]:

# Tally the rows per case type once, draw the tally as a bar chart,
# and show the same tally as the cell output.
case_type_counts = data['Case type'].value_counts()
case_type_counts.plot.bar()
case_type_counts

Out[13]:

Confirmed    261726
Probable       2838
Name: Case type, dtype: int64

In [14]:

# Working frame for the rest of the notebook: only rows whose case type is 'Confirmed'.
df = data.loc[data['Case type'].eq('Confirmed')]

In [15]:

# Preview the first eight confirmed cases.
df.head(8)

Out[15]:

	Unnamed: 0	Date reported	Alberta Health Services Zone	Gender	Age group	Case status	Case type
0	1	2020-11-13	Calgary Zone	Female	1-4 years	Recovered	Confirmed
1	2	2021-04-21	Edmonton Zone	Male	30-39 years	Recovered	Confirmed
2	3	2021-05-17	North Zone	Male	10-19 years	Recovered	Confirmed
3	4	2020-12-13	Edmonton Zone	Male	5-9 years	Recovered	Confirmed
4	5	2021-01-05	Central Zone	Male	50-59 years	Recovered	Confirmed
5	6	2021-05-11	Edmonton Zone	Male	60-69 years	Recovered	Confirmed
6	7	2021-01-30	Calgary Zone	Female	1-4 years	Recovered	Confirmed
7	8	2021-01-20	Edmonton Zone	Female	Under 1 year	Recovered	Confirmed

In [16]:

# The five most recently reported confirmed cases (tail() defaults to 5 rows).
df.sort_values(by='Date reported').tail()

Out[16]:

	Unnamed: 0	Date reported	Alberta Health Services Zone	Gender	Age group	Case status	Case type
6603	6604	2021-09-08	Central Zone	Male	1-4 years	Active	Confirmed
87152	87153	2021-09-08	Edmonton Zone	Female	20-29 years	Active	Confirmed
120571	120572	2021-09-08	Calgary Zone	Male	5-9 years	Active	Confirmed
71693	71694	2021-09-08	Calgary Zone	Female	30-39 years	Active	Confirmed
33700	33701	2021-09-08	North Zone	Male	10-19 years	Active	Confirmed

In [17]:

# Summary (count / unique / top / freq) of the text-valued columns for confirmed cases only.
df.describe(include='object')

Out[17]:

	Alberta Health Services Zone	Gender	Age group	Case status	Case type
count	261726	261726	261726	261726	261726
unique	6	3	12	3	1
top	Calgary Zone	Male	30-39 years	Recovered	Confirmed
freq	103854	132498	50266	243459	261726

In [18]:

# Vaccine data: download Alberta's vaccination-coverage table by local geographic area.
req = Request('https://www.alberta.ca/data/stats/lga-coverage.csv')
# A browser-like User-Agent is sent with the request
# (presumably the server rejects urllib's default agent -- TODO confirm).
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
# Open the response as a context manager so the HTTP connection is closed as
# soon as the CSV has been parsed, even if parsing raises.
with urlopen(req) as content:
    data_vac = pd.read_csv(content)

In [19]:

# Population figure intended for per-capita calculations.
# NOTE(review): Alberta's population is roughly 4.4 million, so this value looks
# about twice too large -- confirm where it came from (possibly a column sum over
# lga-coverage.csv that double-counts rows) before using it as a denominator.
Alta_pop = 8842943;

Charts of cases to date¶

In [20]:

# Headline for the charts below: date of the most recent confirmed-case report.
latest_report = df['Date reported'].max()
print('Cases to date as of ' + latest_report.strftime('%B %d, %Y'))

Cases to date as of September 08, 2021

In [21]:

# Case outcomes: bar chart plus a count / percentage table
# (percentages are relative to all confirmed cases).
temp = df['Case status'].value_counts()
ax = temp.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Status of cases", fontsize=16)
pd.DataFrame({
    'Count': temp,
    'Percentage': temp.div(len(df)).mul(100).round(2),
})

Out[21]:

	Count	Percentage
Recovered	243459	93.02
Active	15839	6.05
Died	2428	0.93

In [22]:

# Gender split: bar chart plus a count / percentage table
# (percentages are relative to all confirmed cases).
temp = df['Gender'].value_counts()
ax = temp.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Gender of patients", fontsize=16)
pd.DataFrame({
    'Count': temp,
    'Percentage': temp.div(len(df)).mul(100).round(2),
})

Out[22]:

	Count	Percentage
Male	132498	50.62
Female	129152	49.35
Unknown	76	0.03

In [23]:

# Cases per health zone: bar chart plus a count / percentage table
# (percentages are relative to all confirmed cases).
temp = df['Alberta Health Services Zone'].value_counts()
ax = temp.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Location of cases", fontsize=16)
pd.DataFrame({
    'Count': temp,
    'Percentage': temp.div(len(df)).mul(100).round(2),
})

Out[23]:

	Count	Percentage
Calgary Zone	103854	39.68
Edmonton Zone	86445	33.03
North Zone	31655	12.09
Central Zone	24325	9.29
South Zone	15432	5.90
Unknown	15	0.01

In [24]:

# Age-group labels in ascending age order ('Unknown' last). Used to reindex
# tables and charts so they follow age order rather than count order.
ageix = [
    'Under 1 year',
    '1-4 years',
    '5-9 years',
    '10-19 years',
    '20-29 years',
    '30-39 years',
    '40-49 years',
    '50-59 years',
    '60-69 years',
    '70-79 years',
    '80+ years',
    'Unknown',
]

In [25]:

# Cases per age group in age order, with the 'Unknown' bucket removed.
# Percentages are still relative to all confirmed cases, 'Unknown' included.
temp = (
    df['Age group']
    .value_counts()
    .reindex(index=ageix)
    .drop(['Unknown'], axis=0)
)
ax = temp.plot.bar()
ax.set_ylabel("Number of cases")
ax.set_title("Age distribution")
pd.DataFrame({
    'Count': temp,
    'Percentage': temp.div(len(df)).mul(100).round(2),
})

Out[25]:

	Count	Percentage
Under 1 year	1539	0.59
1-4 years	9738	3.72
5-9 years	13829	5.28
10-19 years	35175	13.44
20-29 years	49382	18.87
30-39 years	50266	19.21
40-49 years	40461	15.46
50-59 years	29677	11.34
60-69 years	17197	6.57
70-79 years	7508	2.87
80+ years	6947	2.65

Age distribution is skewed to the right.

In [26]:

# Age-group counts split by gender (rows: age groups in age order, columns: gender).
# The 'Unknown' age row is removed; the 'Unknown' gender column is kept.
temp = (
    df.groupby('Gender')['Age group']
    .value_counts()
    .unstack(0)
    .reindex(index=ageix)
    .drop(['Unknown'], axis=0)
)
ax = temp.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Age distribution by gender", fontsize=16)
temp

Out[26]:

Gender	Female	Male	Unknown
Age group
Under 1 year	723.0	816.0	NaN
1-4 years	4699.0	5038.0	1.0
5-9 years	6577.0	7250.0	2.0
10-19 years	17159.0	17994.0	22.0
20-29 years	24332.0	25023.0	27.0
30-39 years	24837.0	25416.0	13.0
40-49 years	20098.0	20357.0	6.0
50-59 years	14486.0	15190.0	1.0
60-69 years	8221.0	8974.0	2.0
70-79 years	3805.0	3703.0	NaN
80+ years	4214.0	2733.0	NaN

There is an important gender difference in the number of cases among elderly people. More males than females are infected in every age group below 70 years, but there are more female cases above 70 years. It would be interesting to compare this with the population age distribution by gender.

In [27]:

# Case outcomes split by gender (rows: case status, columns: gender),
# with the 'Unknown' gender column removed.
temp = (
    df.groupby('Gender')['Case status']
    .value_counts()
    .unstack(0)
    .drop(['Unknown'], axis=1)
)
ax = temp.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Status of cases by gender", fontsize=16)
temp

Out[27]:

Gender	Female	Male
Case status
Active	8046.0	7778.0
Died	1083.0	1345.0
Recovered	120023.0	123375.0

In [28]:

# Age distribution within each health zone: counts per (age group, zone), with
# the 'Unknown' age row and 'Unknown' zone column removed, then each zone's
# column divided by its own total so the columns are comparable fractions.
zone_age_counts = (
    df.groupby('Alberta Health Services Zone')['Age group']
    .value_counts()
    .unstack(0)
    .reindex(index=ageix)
    .drop(index=['Unknown'], columns=['Unknown'])
)
temp = zone_age_counts.div(zone_age_counts.sum(), axis=1)
ax = temp.plot.bar()
ax.set_ylabel("Fraction of cases within zone", fontsize=15)
ax.set_title("Age distribution by location", fontsize=16)
temp

Out[28]:

Alberta Health Services Zone	Calgary Zone	Central Zone	Edmonton Zone	North Zone	South Zone
Age group
Under 1 year	0.005720	0.005509	0.005437	0.007424	0.006869
1-4 years	0.038266	0.032560	0.035573	0.042743	0.035251
5-9 years	0.054125	0.053815	0.048669	0.061920	0.047434
10-19 years	0.131262	0.170613	0.118566	0.151829	0.151503
20-29 years	0.190625	0.175382	0.195197	0.175270	0.187597
30-39 years	0.197645	0.162679	0.198077	0.190308	0.170490
40-49 years	0.162037	0.147591	0.150750	0.147438	0.151957
50-59 years	0.112630	0.117785	0.113349	0.113919	0.110679
60-69 years	0.061472	0.074700	0.067352	0.066058	0.070049
70-79 years	0.025536	0.031697	0.031617	0.025305	0.035640
80+ years	0.020683	0.027668	0.035411	0.017786	0.032530

Age distribution is very similar across zones, but there are a few interesting peaks:

Middle age (20-49 years) in Calgary and Edmonton (large cities)
Kids and teenagers (5-19 years) in North and Central Alberta (countryside).
People older than 80 years in Edmonton.

It would be interesting to know why those peaks occur. Possible explanations are more contact within those groups in their zone, and more long-term care facilities in Edmonton.

In [29]:

# Deaths per age group split by gender (rows: age groups in age order,
# columns: gender), with the 'Unknown' age row removed.
died = df.loc[df['Case status'].eq('Died')]
temp = (
    died.groupby('Gender')['Age group']
    .value_counts()
    .unstack(0)
    .reindex(index=ageix)
    .drop(['Unknown'], axis=0)
)
ax = temp.plot.bar()
ax.set_ylabel("Number of deaths", fontsize=15)
ax.set_title("Age distribution", fontsize=16)
temp

Out[29]:

Gender	Female	Male
Age group
Under 1 year	NaN	NaN
1-4 years	NaN	NaN
5-9 years	NaN	NaN
10-19 years	NaN	NaN
20-29 years	6.0	4.0
30-39 years	8.0	8.0
40-49 years	16.0	33.0
50-59 years	47.0	77.0
60-69 years	121.0	192.0
70-79 years	197.0	317.0
80+ years	688.0	713.0

There is a significant age and sex difference in the number of deaths. Older men are at higher risk.

Time series¶

In [30]:

# Daily number of confirmed cases by report date.
# value_counts() orders by frequency, so sort by date explicitly: the series is
# then chronological (like the other time-series cells) and the line plot does
# not rely on the plotting layer reordering the index.
temp = df['Date reported'].value_counts().sort_index()
temp.plot()
plt.ylabel('Cases per day', fontsize=15);

The number of cases per day is not a smooth curve; use the cumulative number of cases instead.

In [31]:

# Cumulative number of confirmed cases by report date.
temp = df['Date reported'].value_counts().sort_index().cumsum()
temp.plot.area()
plt.ylabel('Cases to date', fontsize=15)
# The original line was a bare `plt.title;`, which only references the function
# and never calls it, so no title was drawn.
plt.title('Cumulative confirmed cases', fontsize=16);

In [32]:

# Cumulative confirmed cases by gender (rows: report dates, columns: gender).
temp = df.groupby('Gender')['Date reported']
# unstack() leaves NaN on dates where a gender has no cases. Treat those days as
# zero new cases *before* accumulating; otherwise the NaN survives cumsum() and
# the area plot shows that day as a drop to zero.
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
temp = temp.drop(['Unknown'], axis=1)
temp.plot.area(stacked=False)
plt.ylabel('Cases to date (gender)', fontsize=15);

The gender difference is not evident in the number of cases.

In [33]:

# Cumulative confirmed cases by age group, stacked in age order.
temp = df.groupby('Age group')['Date reported']
# Days on which a group has no cases come out of unstack() as NaN; count them as
# zero before accumulating. This replaces the deprecated fillna(method="pad")
# that was applied after cumsum().
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
temp = temp.reindex(columns=ageix)
temp = temp.drop(['Unknown'], axis=1)
temp.plot.area(colormap=plt.get_cmap('twilight'))
plt.legend(loc=2)
plt.ylabel('Cases to date (age groups)', fontsize=15);

In [34]:

# Cumulative confirmed cases by health zone, stacked.
temp = df.groupby('Alberta Health Services Zone')['Date reported']
# Days on which a zone has no cases come out of unstack() as NaN; count them as
# zero before accumulating. This replaces the deprecated fillna(method="pad")
# that was applied after cumsum().
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
temp = temp.drop(['Unknown'], axis=1)
temp.plot.area()
plt.legend(loc=2)
plt.ylabel('Cases to date (Alberta zones)', fontsize=15);

In [35]:

# Cumulative deaths among confirmed cases.
# NOTE(review): the index is 'Date reported', which appears to be the date the
# case was reported rather than the date of death -- confirm before reading the
# curve as deaths over time.
temp = df[df['Case status']=='Died']['Date reported']
# value_counts() on a single column never yields NaN, so the former
# fillna(method="pad") (deprecated in recent pandas) was a no-op and is dropped.
temp = temp.value_counts().sort_index().cumsum()
temp.plot.area()
plt.ylabel('Deaths to date', fontsize=15);

In [36]:

# Cumulative deaths among confirmed cases, by gender (not stacked).
temp = df[df['Case status']=='Died'].groupby('Gender')['Date reported']
# fillna(0) before cumsum() already removes every NaN, so the trailing
# fillna(method="pad") (deprecated in recent pandas) was redundant and is dropped.
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
temp.plot.area(stacked=False)
plt.ylabel('Deaths to date (gender)', fontsize=15);

The gender difference is evident in the number of deaths.

In [37]:

# Cumulative deaths among confirmed cases, by age group, stacked in age order.
temp = df[df['Case status']=='Died'].groupby('Age group')['Date reported']
# fillna(0) before cumsum() already removes every NaN, so the trailing
# fillna(method="pad") (deprecated in recent pandas) was redundant and is dropped.
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
# Reindexing adds all-NaN columns for age groups with no recorded deaths
# (and for 'Unknown', which is then removed).
temp = temp.reindex(columns=ageix)
temp = temp.drop(['Unknown'], axis=1)
temp.plot.area(colormap=plt.get_cmap('twilight'))
plt.legend(loc=2)
plt.ylabel('Deaths to date (age groups)', fontsize=15);

The risk to elderly people stands out.

In [38]:

# Cumulative deaths among confirmed cases, by health zone, stacked.
temp = df[df['Case status']=='Died'].groupby('Alberta Health Services Zone')['Date reported']
# fillna(0) before cumsum() already removes every NaN, so the trailing
# fillna(method="pad") (deprecated in recent pandas) was redundant and is dropped.
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
temp.plot.area()
plt.legend(loc=2)
plt.ylabel('Deaths to date (Alberta zones)', fontsize=15);

In [39]:

# Seven-day rolling means of daily confirmed cases (left axis) and of deaths
# among those cases (right axis, red), with shaded date ranges.
fig, ax = plt.subplots(figsize=(15, 10))

# value_counts() only lists dates that have at least one row, so days without
# cases/deaths are *absent*, not zero (the former fillna(0) calls had nothing to
# fill). Put both series on a complete daily calendar so that rolling(7) spans
# seven calendar days rather than seven non-empty rows.
daily_cases = df['Date reported'].value_counts().sort_index().asfreq('D', fill_value=0)
daily_deaths = (
    df.loc[df['Case status'] == 'Died', 'Date reported']
    .value_counts()
    .reindex(daily_cases.index, fill_value=0)
)
# NOTE(review): deaths are indexed by 'Date reported', which appears to be the
# case report date rather than the date of death -- confirm.

cases_7d = daily_cases.rolling(7).mean()
deaths_7d = daily_deaths.rolling(7).mean()

# x_compat=True keeps plain matplotlib date units on the x-axis, so the spans,
# the reference line and the twin axis all share one coordinate system.
cases_7d.plot(ax=ax, x_compat=True)

# Shaded date ranges (start, end, colour); presumably stages of public-health
# restrictions taken from the timeline linked at the end of the cell -- TODO confirm.
shaded_periods = [
    ('2020-03-16', '2020-05-13', 'darkgreen'),
    ('2020-05-13', '2020-11-24', 'limegreen'),
    ('2020-11-24', '2021-06-01', 'darkgreen'),
    ('2021-06-01', '2021-06-10', 'forestgreen'),
    ('2021-06-10', '2021-07-01', 'limegreen'),
]
for start, end, colour in shaded_periods:
    # Pass real timestamps instead of strings so the date axis converts them unambiguously.
    ax.axvspan(pd.Timestamp(start), pd.Timestamp(end), facecolor=colour, alpha=0.3)

# Dashed reference level at 500 cases per day (NOTE(review): rationale for 500 is not stated).
ax.hlines(500, cases_7d.index.values.min(), cases_7d.index.values.max(), linestyle='dashed', color='black')
ax.set_xlabel('Date reported', fontsize=15)
ax.set_ylabel('Cases per day', fontsize=15)

# Deaths on a secondary y-axis sharing the same date axis.
ax2 = ax.twinx()
deaths_7d.plot(color='red', ax=ax2, x_compat=True)
ax2.set_xlabel('Date reported', fontsize=15)
ax2.set_ylabel('Deaths per day', fontsize=15);
# https://calgaryherald.com/news/local-news/covid-19-at-one-year-a-timeline-of-the-pandemic-in-alberta

In [ ]: