import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from urllib.request import Request, urlopen
from IPython.display import Markdown as md
%matplotlib inline
(c) Junqui Liu and Carlos Contreras, January 2021
# Case data
# Download the Alberta COVID-19 case-level CSV. The request carries a
# browser-like User-Agent header (presumably because the server rejects the
# default urllib agent -- TODO confirm).
req = Request('https://www.alberta.ca/data/stats/covid-19-alberta-statistics-data.csv')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
# The context manager closes the HTTP response as soon as the CSV is parsed.
with urlopen(req) as content:
    data = pd.read_csv(content)
# Parse the report date so it can be sorted and used as a time axis later on.
data['Date reported'] = pd.to_datetime(data['Date reported'], format='%Y-%m-%d')
data.head()
Unnamed: 0 | Date reported | Alberta Health Services Zone | Gender | Age group | Case status | Case type | |
---|---|---|---|---|---|---|---|
0 | 1 | 2020-11-13 | Calgary Zone | Female | 1-4 years | Recovered | Confirmed |
1 | 2 | 2021-04-21 | Edmonton Zone | Male | 30-39 years | Recovered | Confirmed |
2 | 3 | 2021-05-17 | North Zone | Male | 10-19 years | Recovered | Confirmed |
3 | 4 | 2020-12-13 | Edmonton Zone | Male | 5-9 years | Recovered | Confirmed |
4 | 5 | 2021-01-05 | Central Zone | Male | 50-59 years | Recovered | Confirmed |
# Report the size of the data set and the most recent report date.
print(data.shape)
NUM_PATIENTS = data.shape[0]
# Series.max() is vectorised and skips NaT, unlike the builtin max(); it is
# also evaluated only once here.
LAST_DATE = data['Date reported'].max()
print(LAST_DATE)
# NOTE(review): "6 features" is hard-coded in the message below; the frame has
# 7 columns when the unnamed row-number column is included.
md("This data set contains {} patients and 6 features (as of {})".format(NUM_PATIENTS,LAST_DATE.strftime('%B %d, %Y')))
(264564, 7) 2021-09-08 00:00:00
This data set contains 264564 patients and 6 features (as of September 08, 2021)
# Print the column names, non-null counts, dtypes and memory usage of `data`.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 264564 entries, 0 to 264563 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 264564 non-null int64 1 Date reported 264564 non-null datetime64[ns] 2 Alberta Health Services Zone 264564 non-null object 3 Gender 264564 non-null object 4 Age group 264564 non-null object 5 Case status 264564 non-null object 6 Case type 264564 non-null object dtypes: datetime64[ns](1), int64(1), object(5) memory usage: 14.1+ MB
# Summarise the object-dtype columns only: count, number of distinct values,
# most frequent value and its frequency.
data.describe(include = [object])
Alberta Health Services Zone | Gender | Age group | Case status | Case type | |
---|---|---|---|---|---|
count | 264564 | 264564 | 264564 | 264564 | 264564 |
unique | 6 | 3 | 12 | 3 | 2 |
top | Calgary Zone | Male | 30-39 years | Recovered | Confirmed |
freq | 104878 | 133898 | 50714 | 246153 | 261726 |
# Show the dtype of every column.
data.dtypes
Unnamed: 0 int64 Date reported datetime64[ns] Alberta Health Services Zone object Gender object Age group object Case status object Case type object dtype: object
The following are the possible values for each feature
# Distinct health-zone labels present in the data.
data['Alberta Health Services Zone'].unique()
array(['Calgary Zone', 'Edmonton Zone', 'North Zone', 'Central Zone', 'South Zone', 'Unknown'], dtype=object)
# Distinct age-group labels, in order of first appearance (not sorted by age).
data['Age group'].unique()
array(['1-4 years', '30-39 years', '10-19 years', '5-9 years', '50-59 years', '60-69 years', 'Under 1 year', '20-29 years', '40-49 years', '70-79 years', '80+ years', 'Unknown'], dtype=object)
# Distinct case-status labels present in the data.
data['Case status'].unique()
array(['Recovered', 'Active', 'Died'], dtype=object)
# Distinct case-type labels present in the data.
data['Case type'].unique()
array(['Confirmed', 'Probable'], dtype=object)
Using only the confirmed cases, 120928 cases as of January 26.
# Count the rows per case type once, draw the bar chart, then show the counts.
case_type_counts = data['Case type'].value_counts()
case_type_counts.plot.bar()
case_type_counts
Confirmed 261726 Probable 2838 Name: Case type, dtype: int64
# Keep only the rows whose case type is 'Confirmed'; every later cell works on
# this subset. Show its first eight rows.
is_confirmed = data['Case type'] == 'Confirmed'
df = data[is_confirmed]
df.head(8)
Unnamed: 0 | Date reported | Alberta Health Services Zone | Gender | Age group | Case status | Case type | |
---|---|---|---|---|---|---|---|
0 | 1 | 2020-11-13 | Calgary Zone | Female | 1-4 years | Recovered | Confirmed |
1 | 2 | 2021-04-21 | Edmonton Zone | Male | 30-39 years | Recovered | Confirmed |
2 | 3 | 2021-05-17 | North Zone | Male | 10-19 years | Recovered | Confirmed |
3 | 4 | 2020-12-13 | Edmonton Zone | Male | 5-9 years | Recovered | Confirmed |
4 | 5 | 2021-01-05 | Central Zone | Male | 50-59 years | Recovered | Confirmed |
5 | 6 | 2021-05-11 | Edmonton Zone | Male | 60-69 years | Recovered | Confirmed |
6 | 7 | 2021-01-30 | Calgary Zone | Female | 1-4 years | Recovered | Confirmed |
7 | 8 | 2021-01-20 | Edmonton Zone | Female | Under 1 year | Recovered | Confirmed |
# Sort the confirmed cases by report date (ascending) and show the last five
# rows, i.e. five rows carrying the most recent report dates.
df.sort_values('Date reported').tail(5)
Unnamed: 0 | Date reported | Alberta Health Services Zone | Gender | Age group | Case status | Case type | |
---|---|---|---|---|---|---|---|
6603 | 6604 | 2021-09-08 | Central Zone | Male | 1-4 years | Active | Confirmed |
87152 | 87153 | 2021-09-08 | Edmonton Zone | Female | 20-29 years | Active | Confirmed |
120571 | 120572 | 2021-09-08 | Calgary Zone | Male | 5-9 years | Active | Confirmed |
71693 | 71694 | 2021-09-08 | Calgary Zone | Female | 30-39 years | Active | Confirmed |
33700 | 33701 | 2021-09-08 | North Zone | Male | 10-19 years | Active | Confirmed |
# Summarise the object-dtype columns of the confirmed-case subset `df`.
df.describe(include='object')
Alberta Health Services Zone | Gender | Age group | Case status | Case type | |
---|---|---|---|---|---|
count | 261726 | 261726 | 261726 | 261726 | 261726 |
unique | 6 | 3 | 12 | 3 | 1 |
top | Calgary Zone | Male | 30-39 years | Recovered | Confirmed |
freq | 103854 | 132498 | 50266 | 243459 | 261726 |
# Vaccine data
# Download the vaccine-coverage CSV (file name suggests coverage per local
# geographic area -- TODO confirm), using the same browser-like User-Agent.
req = Request('https://www.alberta.ca/data/stats/lga-coverage.csv')
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0')
# The context manager closes the HTTP response as soon as the CSV is parsed.
with urlopen(req) as content:
    data_vac = pd.read_csv(content)
# NOTE(review): this constant is not used in the visible part of the notebook;
# the name suggests a population figure -- confirm the value against its source.
Alta_pop = 8842943
print('Cases to date as of ' + df['Date reported'].max().strftime('%B %d, %Y'))
Cases to date as of September 08, 2021
# Confirmed cases by case status: bar chart plus a count / percentage table.
status_counts = df['Case status'].value_counts()
ax = status_counts.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Status of cases", fontsize=16)
# Percentages are relative to all confirmed cases, rounded to two decimals.
status_share = (status_counts / len(df) * 100).round(2)
pd.DataFrame({'Count': status_counts, 'Percentage': status_share})
Count | Percentage | |
---|---|---|
Recovered | 243459 | 93.02 |
Active | 15839 | 6.05 |
Died | 2428 | 0.93 |
# Confirmed cases by gender: bar chart plus a count / percentage table.
gender_counts = df['Gender'].value_counts()
ax = gender_counts.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Gender of patients", fontsize=16)
# Percentages are relative to all confirmed cases, rounded to two decimals.
gender_share = (gender_counts / len(df) * 100).round(2)
pd.DataFrame({'Count': gender_counts, 'Percentage': gender_share})
Count | Percentage | |
---|---|---|
Male | 132498 | 50.62 |
Female | 129152 | 49.35 |
Unknown | 76 | 0.03 |
# Confirmed cases by health zone: bar chart plus a count / percentage table.
zone_counts = df['Alberta Health Services Zone'].value_counts()
ax = zone_counts.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Location of cases", fontsize=16)
# Percentages are relative to all confirmed cases, rounded to two decimals.
zone_share = (zone_counts / len(df) * 100).round(2)
pd.DataFrame({'Count': zone_counts, 'Percentage': zone_share})
Count | Percentage | |
---|---|---|
Calgary Zone | 103854 | 39.68 |
Edmonton Zone | 86445 | 33.03 |
North Zone | 31655 | 12.09 |
Central Zone | 24325 | 9.29 |
South Zone | 15432 | 5.90 |
Unknown | 15 | 0.01 |
# Age-group labels from youngest to oldest; value_counts() alone would order
# them by frequency, so later cells reindex with this list.
ageix = [
    'Under 1 year', '1-4 years', '5-9 years', '10-19 years',
    '20-29 years', '30-39 years', '40-49 years', '50-59 years',
    '60-69 years', '70-79 years', '80+ years', 'Unknown',
]
# Confirmed cases per age group in age order, without the 'Unknown' bucket.
age_counts = df['Age group'].value_counts().reindex(ageix).drop('Unknown')
ax = age_counts.plot.bar()
ax.set_ylabel("Number of cases")
ax.set_title("Age distribution")
# Percentages are relative to all confirmed cases (including 'Unknown' ages).
age_share = (age_counts / len(df) * 100).round(2)
pd.DataFrame({'Count': age_counts, 'Percentage': age_share})
Count | Percentage | |
---|---|---|
Under 1 year | 1539 | 0.59 |
1-4 years | 9738 | 3.72 |
5-9 years | 13829 | 5.28 |
10-19 years | 35175 | 13.44 |
20-29 years | 49382 | 18.87 |
30-39 years | 50266 | 19.21 |
40-49 years | 40461 | 15.46 |
50-59 years | 29677 | 11.34 |
60-69 years | 17197 | 6.57 |
70-79 years | 7508 | 2.87 |
80+ years | 6947 | 2.65 |
Age distribution is skewed to the right.
# Cross-tabulate age group (rows, in age order) against gender (columns) and
# draw grouped bars. Combinations that never occur stay NaN in the table.
age_by_gender = (
    df.groupby('Gender')['Age group']
    .value_counts()
    .unstack(0)
    .reindex(index=ageix)
    .drop(index=['Unknown'])
)
ax = age_by_gender.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Age distribution by gender", fontsize=16)
age_by_gender
Gender | Female | Male | Unknown |
---|---|---|---|
Age group | |||
Under 1 year | 723.0 | 816.0 | NaN |
1-4 years | 4699.0 | 5038.0 | 1.0 |
5-9 years | 6577.0 | 7250.0 | 2.0 |
10-19 years | 17159.0 | 17994.0 | 22.0 |
20-29 years | 24332.0 | 25023.0 | 27.0 |
30-39 years | 24837.0 | 25416.0 | 13.0 |
40-49 years | 20098.0 | 20357.0 | 6.0 |
50-59 years | 14486.0 | 15190.0 | 1.0 |
60-69 years | 8221.0 | 8974.0 | 2.0 |
70-79 years | 3805.0 | 3703.0 | NaN |
80+ years | 4214.0 | 2733.0 | NaN |
There is an important gender difference in the number of cases among elderly people. In general, more males than females are infected below 70 years of age, but there are more female cases above 70 years. It would be interesting to compare this with the population age distribution by gender.
# Cross-tabulate case status (rows) against gender (columns), leaving out the
# 'Unknown' gender column, and draw grouped bars.
status_by_gender = df.groupby('Gender')['Case status'].value_counts().unstack(0)
status_by_gender = status_by_gender.drop(columns=['Unknown'])
ax = status_by_gender.plot.bar()
ax.set_ylabel("Number of cases", fontsize=15)
ax.set_title("Status of cases by gender", fontsize=16)
status_by_gender
Gender | Female | Male |
---|---|---|
Case status | ||
Active | 8046.0 | 7778.0 |
Died | 1083.0 | 1345.0 |
Recovered | 120023.0 | 123375.0 |
# Age-group counts per health zone, with the 'Unknown' age row and 'Unknown'
# zone column removed.
zone_age = (
    df.groupby('Alberta Health Services Zone')['Age group']
    .value_counts()
    .unstack(0)
    .reindex(index=ageix)
    .drop(index=['Unknown'])
    .drop(columns=['Unknown'])
)
# Divide every zone column by its own total so each column sums to 1 and the
# zones can be compared regardless of their size.
zone_totals = zone_age.sum()
zone_age = zone_age.div(zone_totals, axis=1)
ax = zone_age.plot.bar()
ax.set_ylabel("Fraction of cases within zone", fontsize=15)
ax.set_title("Age distribution by location", fontsize=16)
zone_age
Alberta Health Services Zone | Calgary Zone | Central Zone | Edmonton Zone | North Zone | South Zone |
---|---|---|---|---|---|
Age group | |||||
Under 1 year | 0.005720 | 0.005509 | 0.005437 | 0.007424 | 0.006869 |
1-4 years | 0.038266 | 0.032560 | 0.035573 | 0.042743 | 0.035251 |
5-9 years | 0.054125 | 0.053815 | 0.048669 | 0.061920 | 0.047434 |
10-19 years | 0.131262 | 0.170613 | 0.118566 | 0.151829 | 0.151503 |
20-29 years | 0.190625 | 0.175382 | 0.195197 | 0.175270 | 0.187597 |
30-39 years | 0.197645 | 0.162679 | 0.198077 | 0.190308 | 0.170490 |
40-49 years | 0.162037 | 0.147591 | 0.150750 | 0.147438 | 0.151957 |
50-59 years | 0.112630 | 0.117785 | 0.113349 | 0.113919 | 0.110679 |
60-69 years | 0.061472 | 0.074700 | 0.067352 | 0.066058 | 0.070049 |
70-79 years | 0.025536 | 0.031697 | 0.031617 | 0.025305 | 0.035640 |
80+ years | 0.020683 | 0.027668 | 0.035411 | 0.017786 | 0.032530 |
Age distribution is very similar across zones, but there are a few interesting peaks:
It would be interesting to know what causes those peaks. Perhaps there is more contact within those groups in their zone, and there are more long-term care facilities in Edmonton.
# Deaths cross-tabulated by age group (rows, in age order) and gender
# (columns); age groups without any recorded death stay NaN.
died = df[df['Case status'] == 'Died']
deaths_by_age = died.groupby('Gender')['Age group'].value_counts().unstack(0)
deaths_by_age = deaths_by_age.reindex(index=ageix).drop(index=['Unknown'])
ax = deaths_by_age.plot.bar()
ax.set_ylabel("Number of deaths", fontsize=15)
ax.set_title("Age distribution", fontsize=16)
deaths_by_age
Gender | Female | Male |
---|---|---|
Age group | ||
Under 1 year | NaN | NaN |
1-4 years | NaN | NaN |
5-9 years | NaN | NaN |
10-19 years | NaN | NaN |
20-29 years | 6.0 | 4.0 |
30-39 years | 8.0 | 8.0 |
40-49 years | 16.0 | 33.0 |
50-59 years | 47.0 | 77.0 |
60-69 years | 121.0 | 192.0 |
70-79 years | 197.0 | 317.0 |
80+ years | 688.0 | 713.0 |
There is a significant age and sex difference in the number of deaths. Older men are at higher risk.
# Number of confirmed cases per report date, drawn as a line.
daily_cases = df['Date reported'].value_counts()
ax = daily_cases.plot()
# Trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_ylabel('Cases per day', fontsize=15);
The number of cases per day is not a smooth graph; use the cumulative number of cases instead.
# Running total of confirmed cases by report date, drawn as an area chart.
temp = df['Date reported'].value_counts().sort_index().cumsum()
temp.plot.area()
# The trailing semicolon suppresses the notebook echo of the returned Text
# object; this replaces a dangling `plt.title;` expression that did nothing
# except serve that purpose.
plt.ylabel('Cases to date', fontsize=15);
# Running total of confirmed cases per gender, by report date.
temp = (
    df.groupby('Gender')['Date reported']
    .value_counts()
    .unstack(0)
    .sort_index()
    # A date on which a gender has no reported case is NaN after unstack();
    # count it as zero so the running total carries on instead of leaving a
    # hole in the cumulative curve on that date.
    .fillna(0)
    .cumsum()
)
temp = temp.drop(['Unknown'], axis=1)
# Overlaid (not stacked) areas so the two genders can be compared directly.
temp.plot.area(stacked=False)
plt.ylabel('Cases to date (gender)', fontsize=15);
The age difference is not evident in the number of cases.
# Running total of confirmed cases per age group, by report date, as a stacked
# area chart with the groups ordered from youngest to oldest.
temp = (
    df.groupby('Age group')['Date reported']
    .value_counts()
    .unstack(0)
    .sort_index()
    # Dates without a case for an age group count as zero. Filling before
    # cumsum() avoids the deprecated fillna(method="pad") call.
    .fillna(0)
    .cumsum()
)
temp = temp.reindex(columns=ageix)
temp = temp.drop(['Unknown'], axis=1)
temp.plot.area(colormap=plt.get_cmap('twilight'))
plt.legend(loc=2)
plt.ylabel('Cases to date (age groups)', fontsize=15);
# Running total of confirmed cases per health zone, by report date, as a
# stacked area chart (the 'Unknown' zone is left out).
temp = (
    df.groupby('Alberta Health Services Zone')['Date reported']
    .value_counts()
    .unstack(0)
    .sort_index()
    # Dates without a case for a zone count as zero. Filling before cumsum()
    # avoids the deprecated fillna(method="pad") call.
    .fillna(0)
    .cumsum()
)
temp = temp.drop(['Unknown'], axis=1)
temp.plot.area()
plt.legend(loc=2)
plt.ylabel('Cases to date (Alberta zones)', fontsize=15);
# Running total of deaths, keyed by the report date of the case.
temp = df[df['Case status']=='Died']['Date reported']
# value_counts() never yields NaN, so nothing needs filling after cumsum();
# the former fillna(method="pad") was a deprecated no-op.
temp = temp.value_counts().sort_index().cumsum()
temp.plot.area()
plt.ylabel('Deaths to date', fontsize=15);
# Running total of deaths per gender, keyed by the report date of the case.
temp = df[df['Case status']=='Died'].groupby('Gender')['Date reported']
# Missing date/gender combinations become zero before cumulating, so the
# result has no NaN and the former fillna(method="pad") (deprecated) is dropped.
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
# Overlaid (not stacked) areas so the genders can be compared directly.
temp.plot.area(stacked=False)
plt.ylabel('Deaths to date (gender)', fontsize=15);
The age difference is evident in the number of deaths.
# Running total of deaths per age group, keyed by the report date of the case,
# as a stacked area chart ordered from youngest to oldest.
temp = df[df['Case status']=='Died'].groupby('Age group')['Date reported']
# Missing date/age-group combinations become zero before cumulating, so the
# result has no NaN and the former fillna(method="pad") (deprecated) is dropped.
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
# Reindexing adds an all-NaN column for any age group without deaths.
temp = temp.reindex(columns=ageix)
temp = temp.drop(['Unknown'], axis=1)
temp.plot.area(colormap=plt.get_cmap('twilight'))
plt.legend(loc=2)
plt.ylabel('Deaths to date (age groups)', fontsize=15);
The elevated risk to elderly people stands out.
# Running total of deaths per health zone, keyed by the report date of the
# case, as a stacked area chart.
temp = df[df['Case status']=='Died'].groupby('Alberta Health Services Zone')['Date reported']
# Missing date/zone combinations become zero before cumulating, so the result
# has no NaN and the former fillna(method="pad") (deprecated) is dropped.
temp = temp.value_counts().unstack(0).sort_index().fillna(0).cumsum()
temp.plot.area()
plt.legend(loc=2)
plt.ylabel('Deaths to date (Alberta zones)', fontsize=15);
# Seven-day rolling averages of daily confirmed cases (left axis) and of daily
# deaths (right axis, red), with shaded date ranges and a dashed reference
# line at 500 cases per day. Deaths are keyed by the report date of the case.
fig, ax = plt.subplots(figsize=(15, 10))

# value_counts() has no entry for dates without any report, so the series is
# first expanded to a continuous daily index with zeros. Without this,
# rolling(7) averages seven *rows*, which can span more than one calendar week
# and overstates the per-day figure.
daily_cases = df['Date reported'].value_counts().sort_index().asfreq('D', fill_value=0)
temp = daily_cases.rolling(7).mean()
# x_compat=True keeps ordinary matplotlib date units on the x axis even though
# the index now has a fixed daily frequency.
temp.plot(ax=ax, x_compat=True)

# Shaded periods as (start, end, colour).
# NOTE(review): presumably public-health restriction phases taken from the
# timeline article linked at the end of this cell -- confirm.
shaded_periods = [
    ('2020-03-16', '2020-05-13', 'darkgreen'),
    ('2020-05-13', '2020-11-24', 'limegreen'),
    ('2020-11-24', '2021-06-01', 'darkgreen'),
    ('2021-06-01', '2021-06-10', 'forestgreen'),
    ('2021-06-10', '2021-07-01', 'limegreen'),
]
for start, end, colour in shaded_periods:
    # Explicit timestamps avoid relying on string-to-date axis conversion.
    ax.axvspan(pd.Timestamp(start), pd.Timestamp(end), facecolor=colour, alpha=0.3)

ax.hlines(500, temp.index.min(), temp.index.max(), linestyle='dashed', color='black')
ax.set_xlabel('Date reported', fontsize=15)
ax.set_ylabel('Cases per day', fontsize=15)

# Same treatment for deaths: fill the days without a death with zero before
# taking the seven-day mean.
daily_deaths = (
    df[df['Case status']=='Died']['Date reported']
    .value_counts()
    .sort_index()
    .asfreq('D', fill_value=0)
)
temp = daily_deaths.rolling(7).mean()
ax2 = ax.twinx()
temp.plot(color='red', ax=ax2, x_compat=True)
ax2.set_xlabel('Date reported', fontsize=15)
ax2.set_ylabel('Deaths per day', fontsize=15);
# https://calgaryherald.com/news/local-news/covid-19-at-one-year-a-timeline-of-the-pandemic-in-alberta