Loading libraries and data

In [124]:
import pandas as pd
import altair as alt

from github import Github
import configparser
In [176]:
config = configparser.ConfigParser()
config.read('secrets.ini');
In [177]:
g = Github(config['corona']['api_key'])
In [209]:
START_CASES_INFECTION = 100
START_CASES_DEATH = 10
In [210]:
URL = "https://covid.ourworldindata.org/data/ecdc/full_data.csv"
In [211]:
df = pd.read_csv(URL)
In [212]:
df.date = pd.to_datetime(df.date)
In [213]:
df = df.sort_values(['location','date'], ascending=True)
In [214]:
df.shape[0]
Out[214]:
7213
In [215]:
df.head()
Out[215]:
date location new_cases new_deaths total_cases total_deaths
0 2019-12-31 Afghanistan 0 0 0 0
1 2020-01-01 Afghanistan 0 0 0 0
2 2020-01-02 Afghanistan 0 0 0 0
3 2020-01-03 Afghanistan 0 0 0 0
4 2020-01-04 Afghanistan 0 0 0 0
In [216]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7213 entries, 0 to 7212
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          7213 non-null   datetime64[ns]
 1   location      7213 non-null   object        
 2   new_cases     7213 non-null   int64         
 3   new_deaths    7213 non-null   int64         
 4   total_cases   7213 non-null   int64         
 5   total_deaths  7213 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 394.5+ KB

Date of data

In [217]:
df.date.max()
Out[217]:
Timestamp('2020-03-27 00:00:00')

Creating data for graphs

In [218]:
df = df.drop(columns=['new_cases', 'new_deaths'])
In [219]:
df_infection = df[df.total_cases >= START_CASES_INFECTION].copy()
df_death = df[df.total_deaths >= START_CASES_DEATH].copy()
In [220]:
df_infection.head()
Out[220]:
date location total_cases total_deaths
93 2020-03-24 Albania 100 4
94 2020-03-25 Albania 123 5
95 2020-03-26 Albania 146 5
96 2020-03-27 Albania 174 6
175 2020-03-23 Algeria 102 15
In [221]:
df_infection[df_infection.location == 'Austria'].tail(3)
Out[221]:
date location total_cases total_deaths
488 2020-03-25 Austria 5282 30
489 2020-03-26 Austria 5888 34
490 2020-03-27 Austria 7029 49
In [222]:
df_death.head()
Out[222]:
date location total_cases total_deaths
173 2020-03-21 Algeria 94 10
174 2020-03-22 Algeria 94 10
175 2020-03-23 Algeria 102 15
176 2020-03-24 Algeria 189 17
177 2020-03-25 Algeria 231 17
In [223]:
df_infection['total_cases_normalized'] = df_infection.groupby(
    "location")[['total_cases']].transform(lambda x: x / x.min() * START_CASES_INFECTION)
df_infection.total_cases_normalized = df_infection.total_cases_normalized.astype("int")

df_death['total_deaths_normalized'] = df_death.groupby(
    "location")[['total_deaths']].transform(lambda x: x / x.min() * START_CASES_DEATH)
df_death.total_deaths_normalized = df_death.total_deaths_normalized.astype("int")
In [224]:
df_infection["days_after"] = df_infection.groupby("location").cumcount()
df_death["days_after"] = df_death.groupby("location").cumcount()
In [225]:
df_infection.head()
Out[225]:
date location total_cases total_deaths total_cases_normalized days_after
93 2020-03-24 Albania 100 4 100 0
94 2020-03-25 Albania 123 5 123 1
95 2020-03-26 Albania 146 5 146 2
96 2020-03-27 Albania 174 6 174 3
175 2020-03-23 Algeria 102 15 100 0
In [226]:
df_death.head()
Out[226]:
date location total_cases total_deaths total_deaths_normalized days_after
173 2020-03-21 Algeria 94 10 10 0
174 2020-03-22 Algeria 94 10 10 1
175 2020-03-23 Algeria 102 15 15 2
176 2020-03-24 Algeria 189 17 17 3
177 2020-03-25 Algeria 231 17 17 4
In [227]:
df_summary = df[df.date == df.date.max()].drop(columns='date')
df_summary['cases'] = df_summary.total_cases
df_summary = df_summary.drop(['total_cases', 'total_deaths'], axis=1)
In [228]:
df_summary.head()
Out[228]:
location cases
77 Afghanistan 75
96 Albania 174
179 Algeria 305
193 Andorra 224
199 Angola 3
In [229]:
df_infection.location.unique()
Out[229]:
array(['Albania', 'Algeria', 'Andorra', 'Argentina', 'Armenia',
       'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Belgium',
       'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Canada', 'Chile', 'China', 'Colombia',
       'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia',
       'Faeroe Islands', 'Finland', 'France', 'Germany', 'Ghana',
       'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia',
       'International', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lebanon',
       'Lithuania', 'Luxembourg', 'Macedonia', 'Malaysia', 'Malta',
       'Mexico', 'Moldova', 'Morocco', 'Netherlands', 'New Zealand',
       'Norway', 'Oman', 'Pakistan', 'Panama', 'Peru', 'Philippines',
       'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'San Marino',
       'Saudi Arabia', 'Senegal', 'Serbia', 'Singapore', 'Slovakia',
       'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka',
       'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Tunisia', 'Turkey',
       'Ukraine', 'United Arab Emirates', 'United Kingdom',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam', 'World'],
      dtype=object)
In [230]:
df_death.location.unique()
Out[230]:
array(['Algeria', 'Argentina', 'Australia', 'Austria', 'Belgium',
       'Brazil', 'Canada', 'China', 'Denmark', 'Dominican Republic',
       'Ecuador', 'Egypt', 'France', 'Germany', 'Greece', 'Hungary',
       'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy', 'Japan',
       'Malaysia', 'Morocco', 'Netherlands', 'Norway', 'Philippines',
       'Poland', 'Portugal', 'Romania', 'San Marino', 'South Korea',
       'Spain', 'Sweden', 'Switzerland', 'Turkey', 'United Kingdom',
       'United States', 'World'], dtype=object)
In [231]:
COUNTRIES = set(df[(df.date == df.date.max())].nlargest(columns='total_cases', n=34).sort_values('location').location)
COUNTRIES.remove('World')

Altair

Storing data locally

In [232]:
url_infection = 'data_infection.json'
url_death = 'data_death.json'
url_summary = 'data_summary.json'

df_infection[df_infection.location.isin(COUNTRIES)].to_json(url_infection, orient='records')
df_death[df_death.location.isin(COUNTRIES)].to_json(url_death, orient='records')
df_summary[df_summary.location.isin(COUNTRIES)].to_json(url_summary, orient='records')

Uploading data to github

In [202]:
repo = g.get_repo("Datenspieler/notebooks_for_blog")
In [203]:
FILELIST = ['data_infection.json', 'data_death.json', 'data_summary.json']
In [236]:
for file_for_upload in FILELIST:
    print('Uploading', file_for_upload, end=' - ')
    with open(file_for_upload) as f:
        data = f.read()
    contents = repo.get_contents("2020-corona/" + file_for_upload)
    repo.update_file(contents.path, "Update data as of " + df.date.max().strftime('%Y-%m-%d'), data, 
                     contents.sha, branch="master")
    
    contents = repo.get_contents("2020-corona/" + file_for_upload)

    print(contents.last_modified)
Uploading data_infection.json - Sat, 28 Mar 2020 04:46:56 GMT
Uploading data_death.json - Sat, 28 Mar 2020 04:46:59 GMT
Uploading data_summary.json - Sat, 28 Mar 2020 04:47:00 GMT
In [60]:
# In future version files should be pushed to git automatically, this time I uploaded them 

BASEURL = 'https://raw.githubusercontent.com/Datenspieler/notebooks_for_blog/master/2020-corona/'

url_infection = BASEURL + url_infection
url_death = BASEURL + url_death
url_summary = BASEURL + url_summary

Alternatively local version

In [233]:
url_infection = df_infection[df_infection.location.isin(COUNTRIES)]
url_death = df_death[df_death.location.isin(COUNTRIES)]
url_summary = df_summary[df_summary.location.isin(COUNTRIES)]

Plotting data

In [235]:
highlight = alt.selection(type='single', on='mouseover',
                          fields=['location'], nearest=True)

base_infection = alt.Chart(url_infection).encode(
    alt.X('days_after', type='quantitative', scale=alt.Scale(domain=[0,30], type='ordinal'),
         title='Days since the 100th confirmed infection'),
    alt.Y('total_cases_normalized', type='quantitative', scale=alt.Scale(type='log', base=10), 
          title='Total confirmed infections of COVID-19, normalized'),
    alt.Color('location:N', title="Country", legend=None),
    alt.Tooltip(['location:N', 'total_cases:Q', 'date:T'])
).properties(
    title='Development of Corona infections',
)

base_death = alt.Chart(url_death).encode(
    alt.X('days_after', type='quantitative',  scale=alt.Scale(domain=[0,30], type='ordinal'),
         title='Days since the 10th confirmed death'),
    alt.Y('total_deaths_normalized', type='quantitative', scale=alt.Scale(type='log', base=10), 
          title='Total confirmed deaths of COVID-19, normalized'),
    alt.Color('location:N', title="Country", legend=None),
    alt.Tooltip(['location:N', 'total_deaths:Q', 'date:T'])
).properties(
    title='Development of Corona deaths',
)

base_summary = alt.Chart(url_summary).mark_bar().encode(
    x = alt.X('cases', type='quantitative', title='Confirmed cases', scale=alt.Scale(type='linear')),
    y = alt.Y('location:N', title='Country'),
    color = alt.condition(highlight, alt.Color('location:N', title="Country", legend=None), alt.ColorValue("grey")),
    tooltip = alt.Tooltip(['location:N', 'cases:Q'])
).add_selection(
    highlight
).properties(
    width=200,
    title='Corona cases by country'
)


points_infection = base_infection.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(
    width=600
).interactive(
    bind_y = False
)

points_death = base_death.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(
    width=600
).interactive(
    bind_y = False
)


lines_infection = base_infection.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)
)

lines_death = base_death.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)
)

chart = alt.vconcat(points_infection + lines_infection, points_death + lines_death)

chart = alt.hconcat(base_summary, chart)


#chart.save('corona.html')
chart.save('corona.json')

chart
Out[235]:
In [ ]: