import pandas as pd
import altair as alt
from github import Github
import configparser
config = configparser.ConfigParser()
config.read('secrets.ini');
g = Github(config['corona']['api_key'])
START_CASES_INFECTION = 100
START_CASES_DEATH = 10
URL = "https://covid.ourworldindata.org/data/ecdc/full_data.csv"
df = pd.read_csv(URL)
df.date = pd.to_datetime(df.date)
df = df.sort_values(['location','date'], ascending=True)
df.shape[0]
7213
df.head()
date | location | new_cases | new_deaths | total_cases | total_deaths | |
---|---|---|---|---|---|---|
0 | 2019-12-31 | Afghanistan | 0 | 0 | 0 | 0 |
1 | 2020-01-01 | Afghanistan | 0 | 0 | 0 | 0 |
2 | 2020-01-02 | Afghanistan | 0 | 0 | 0 | 0 |
3 | 2020-01-03 | Afghanistan | 0 | 0 | 0 | 0 |
4 | 2020-01-04 | Afghanistan | 0 | 0 | 0 | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7213 entries, 0 to 7212 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 7213 non-null datetime64[ns] 1 location 7213 non-null object 2 new_cases 7213 non-null int64 3 new_deaths 7213 non-null int64 4 total_cases 7213 non-null int64 5 total_deaths 7213 non-null int64 dtypes: datetime64[ns](1), int64(4), object(1) memory usage: 394.5+ KB
df.date.max()
Timestamp('2020-03-27 00:00:00')
df = df.drop(columns=['new_cases', 'new_deaths'])
df_infection = df[df.total_cases >= START_CASES_INFECTION].copy()
df_death = df[df.total_deaths >= START_CASES_DEATH].copy()
df_infection.head()
date | location | total_cases | total_deaths | |
---|---|---|---|---|
93 | 2020-03-24 | Albania | 100 | 4 |
94 | 2020-03-25 | Albania | 123 | 5 |
95 | 2020-03-26 | Albania | 146 | 5 |
96 | 2020-03-27 | Albania | 174 | 6 |
175 | 2020-03-23 | Algeria | 102 | 15 |
df_infection[df_infection.location == 'Austria'].tail(3)
date | location | total_cases | total_deaths | |
---|---|---|---|---|
488 | 2020-03-25 | Austria | 5282 | 30 |
489 | 2020-03-26 | Austria | 5888 | 34 |
490 | 2020-03-27 | Austria | 7029 | 49 |
df_death.head()
date | location | total_cases | total_deaths | |
---|---|---|---|---|
173 | 2020-03-21 | Algeria | 94 | 10 |
174 | 2020-03-22 | Algeria | 94 | 10 |
175 | 2020-03-23 | Algeria | 102 | 15 |
176 | 2020-03-24 | Algeria | 189 | 17 |
177 | 2020-03-25 | Algeria | 231 | 17 |
df_infection['total_cases_normalized'] = df_infection.groupby(
"location")[['total_cases']].transform(lambda x: x / x.min() * START_CASES_INFECTION)
df_infection.total_cases_normalized = df_infection.total_cases_normalized.astype("int")
df_death['total_deaths_normalized'] = df_death.groupby(
"location")[['total_deaths']].transform(lambda x: x / x.min() * START_CASES_DEATH)
df_death.total_deaths_normalized = df_death.total_deaths_normalized.astype("int")
df_infection["days_after"] = df_infection.groupby("location").cumcount()
df_death["days_after"] = df_death.groupby("location").cumcount()
df_infection.head()
date | location | total_cases | total_deaths | total_cases_normalized | days_after | |
---|---|---|---|---|---|---|
93 | 2020-03-24 | Albania | 100 | 4 | 100 | 0 |
94 | 2020-03-25 | Albania | 123 | 5 | 123 | 1 |
95 | 2020-03-26 | Albania | 146 | 5 | 146 | 2 |
96 | 2020-03-27 | Albania | 174 | 6 | 174 | 3 |
175 | 2020-03-23 | Algeria | 102 | 15 | 100 | 0 |
df_death.head()
date | location | total_cases | total_deaths | total_deaths_normalized | days_after | |
---|---|---|---|---|---|---|
173 | 2020-03-21 | Algeria | 94 | 10 | 10 | 0 |
174 | 2020-03-22 | Algeria | 94 | 10 | 10 | 1 |
175 | 2020-03-23 | Algeria | 102 | 15 | 15 | 2 |
176 | 2020-03-24 | Algeria | 189 | 17 | 17 | 3 |
177 | 2020-03-25 | Algeria | 231 | 17 | 17 | 4 |
df_summary = df[df.date == df.date.max()].drop(columns='date')
df_summary['cases'] = df_summary.total_cases
df_summary = df_summary.drop(['total_cases', 'total_deaths'], axis=1)
df_summary.head()
location | cases | |
---|---|---|
77 | Afghanistan | 75 |
96 | Albania | 174 |
179 | Algeria | 305 |
193 | Andorra | 224 |
199 | Angola | 3 |
df_infection.location.unique()
array(['Albania', 'Algeria', 'Andorra', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Belgium', 'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia', 'Faeroe Islands', 'Finland', 'France', 'Germany', 'Ghana', 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia', 'International', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lebanon', 'Lithuania', 'Luxembourg', 'Macedonia', 'Malaysia', 'Malta', 'Mexico', 'Moldova', 'Morocco', 'Netherlands', 'New Zealand', 'Norway', 'Oman', 'Pakistan', 'Panama', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'San Marino', 'Saudi Arabia', 'Senegal', 'Serbia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Tunisia', 'Turkey', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Venezuela', 'Vietnam', 'World'], dtype=object)
df_death.location.unique()
array(['Algeria', 'Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil', 'Canada', 'China', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'France', 'Germany', 'Greece', 'Hungary', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy', 'Japan', 'Malaysia', 'Morocco', 'Netherlands', 'Norway', 'Philippines', 'Poland', 'Portugal', 'Romania', 'San Marino', 'South Korea', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'United Kingdom', 'United States', 'World'], dtype=object)
COUNTRIES = set(df[(df.date == df.date.max())].nlargest(columns='total_cases', n=34).sort_values('location').location)
COUNTRIES.remove('World')
url_infection = 'data_infection.json'
url_death = 'data_death.json'
url_summary = 'data_summary.json'
df_infection[df_infection.location.isin(COUNTRIES)].to_json(url_infection, orient='records')
df_death[df_death.location.isin(COUNTRIES)].to_json(url_death, orient='records')
df_summary[df_summary.location.isin(COUNTRIES)].to_json(url_summary, orient='records')
repo = g.get_repo("Datenspieler/notebooks_for_blog")
FILELIST = ['data_infection.json', 'data_death.json', 'data_summary.json']
for file_for_upload in FILELIST:
print('Uploading', file_for_upload, end=' - ')
with open(file_for_upload) as f:
data = f.read()
contents = repo.get_contents("2020-corona/" + file_for_upload)
repo.update_file(contents.path, "Update data as of " + df.date.max().strftime('%Y-%m-%d'), data,
contents.sha, branch="master")
contents = repo.get_contents("2020-corona/" + file_for_upload)
print(contents.last_modified)
Uploading data_infection.json - Sat, 28 Mar 2020 04:46:56 GMT Uploading data_death.json - Sat, 28 Mar 2020 04:46:59 GMT Uploading data_summary.json - Sat, 28 Mar 2020 04:47:00 GMT
# In future version files should be pushed to git automatically, this time I uploaded them
BASEURL = 'https://raw.githubusercontent.com/Datenspieler/notebooks_for_blog/master/2020-corona/'
url_infection = BASEURL + url_infection
url_death = BASEURL + url_death
url_summary = BASEURL + url_summary
url_infection = df_infection[df_infection.location.isin(COUNTRIES)]
url_death = df_death[df_death.location.isin(COUNTRIES)]
url_summary = df_summary[df_summary.location.isin(COUNTRIES)]
highlight = alt.selection(type='single', on='mouseover',
fields=['location'], nearest=True)
base_infection = alt.Chart(url_infection).encode(
alt.X('days_after', type='quantitative', scale=alt.Scale(domain=[0,30], type='ordinal'),
title='Days since the 100th confirmed infection'),
alt.Y('total_cases_normalized', type='quantitative', scale=alt.Scale(type='log', base=10),
title='Total confirmed infections of COVID-19, normalized'),
alt.Color('location:N', title="Country", legend=None),
alt.Tooltip(['location:N', 'total_cases:Q', 'date:T'])
).properties(
title='Development of Corona infections',
)
base_death = alt.Chart(url_death).encode(
alt.X('days_after', type='quantitative', scale=alt.Scale(domain=[0,30], type='ordinal'),
title='Days since the 10th confirmed death'),
alt.Y('total_deaths_normalized', type='quantitative', scale=alt.Scale(type='log', base=10),
title='Total confirmed deaths of COVID-19, normalized'),
alt.Color('location:N', title="Country", legend=None),
alt.Tooltip(['location:N', 'total_deaths:Q', 'date:T'])
).properties(
title='Development of Corona deaths',
)
base_summary = alt.Chart(url_summary).mark_bar().encode(
x = alt.X('cases', type='quantitative', title='Confirmed cases', scale=alt.Scale(type='linear')),
y = alt.Y('location:N', title='Country'),
color = alt.condition(highlight, alt.Color('location:N', title="Country", legend=None), alt.ColorValue("grey")),
tooltip = alt.Tooltip(['location:N', 'cases:Q'])
).add_selection(
highlight
).properties(
width=200,
title='Corona cases by country'
)
points_infection = base_infection.mark_circle().encode(
opacity=alt.value(0)
).add_selection(
highlight
).properties(
width=600
).interactive(
bind_y = False
)
points_death = base_death.mark_circle().encode(
opacity=alt.value(0)
).add_selection(
highlight
).properties(
width=600
).interactive(
bind_y = False
)
lines_infection = base_infection.mark_line().encode(
size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)
)
lines_death = base_death.mark_line().encode(
size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)
)
chart = alt.vconcat(points_infection + lines_infection, points_death + lines_death)
chart = alt.hconcat(base_summary, chart)
#chart.save('corona.html')
chart.save('corona.json')
chart