ver. A.L. 20200512, 20210519
Slightly modified from Greg Rafferty's https://github.com/raffg/covid-19;
see also his dashboard for monitoring the COVID-19 pandemic, https://covid-19-raffg.herokuapp.com, and his portfolio.
Requires:
pandas
plotly: https://plotly.com/python (conda install plotly)
cufflinks: https://plotly.com/python/v3/ipython-notebooks/cufflinks (pip install cufflinks --upgrade)
#import sys
#!{sys.executable} -m pip install plotly==4.14.3
#!{sys.executable} -m pip install cufflinks --upgrade
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import re
from datetime import date, timedelta
import io
import requests
import plotly
print('plotly:', plotly.__version__)
# Standard plotly imports
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
print('cufflinks:', cufflinks.__version__)
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
plotly: 4.14.3
cufflinks: 0.17.3
# # Load files from folder
# path = 'COVID-19/csse_covid_19_data/csse_covid_19_daily_reports'
# all_files = glob.glob(path + "/*.csv")
# files = []
# for filename in all_files:
#     file = re.search(r'([0-9]{2}\-[0-9]{2}\-[0-9]{4})', filename)[0]
#     df = pd.read_csv(filename, index_col=None, header=0)
#     df['date'] = pd.to_datetime(file)
#     files.append(df)
# df = pd.concat(files, axis=0, ignore_index=True, sort=False)
# Load files from web
file_date = date(2021, 4, 1)
dates = []
while file_date <= date.today():
    dates.append(file_date)
    file_date += timedelta(days=1)

files = []
for file in dates:
    file = file.strftime("%m-%d-%Y")
    print(file)
    url = r'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{}.csv'.format(file)
    raw_string = requests.get(url).content
    dff = pd.read_csv(io.StringIO(raw_string.decode('utf-8')))
    dff['date'] = pd.to_datetime(file)
    dff.rename(columns={'Country_Region': 'Country/Region'}, inplace=True)
    files.append(dff)

dff = pd.concat(files, axis=0, ignore_index=True, sort=False)
04-01-2021
04-02-2021
...
05-18-2021
05-19-2021
(one printed line per requested daily-report date, 04-01-2021 through 05-19-2021)
dff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191184 entries, 0 to 191183
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   FIPS                 156768 non-null  float64
 1   Admin2               157008 non-null  object
 2   Province_State       182976 non-null  object
 3   Country/Region       191184 non-null  object
 4   Last_Update          191184 non-null  object
 5   Lat                  186960 non-null  float64
 6   Long_                186960 non-null  float64
 7   Confirmed            191184 non-null  float64
 8   Deaths               191184 non-null  float64
 9   Recovered            33890 non-null   float64
 10  Active               33826 non-null   float64
 11  Combined_Key         191184 non-null  object
 12  Incident_Rate        186958 non-null  float64
 13  Case_Fatality_Ratio  189206 non-null  float64
 14  date                 191184 non-null  datetime64[ns]
 15  404: Not Found       0 non-null       object
dtypes: datetime64[ns](1), float64(9), object(6)
memory usage: 23.3+ MB
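The spurious `404: Not Found` column appears because the download loop runs up to and including date.today(), and GitHub serves a plain-text "404: Not Found" page for any daily report that has not been published yet; pandas then parses that text as a one-column CSV. A minimal sketch of a guard inside the loop, assuming the same URL pattern as above:

# Sketch: skip report dates whose CSV is not (yet) available on GitHub
resp = requests.get(url)
if resp.status_code == 200:
    dff = pd.read_csv(io.StringIO(resp.content.decode('utf-8')))
    dff['date'] = pd.to_datetime(file)
    dff.rename(columns={'Country_Region': 'Country/Region'}, inplace=True)
    files.append(dff)
else:
    print('no report for', file, '- skipped (HTTP status', resp.status_code, ')')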
For comparison, the output below was retained from the earlier (20200512) run of this notebook, which loaded daily reports going back to early 2020 and therefore shows a mix of the old and new column schemas:
dff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165400 entries, 0 to 165399
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Province/State 4358 non-null object
1 Country/Region 165400 non-null object
2 Last Update 7617 non-null object
3 Confirmed 165381 non-null float64
4 Deaths 164959 non-null float64
5 Recovered 165012 non-null float64
6 date 165400 non-null datetime64[ns]
7 Latitude 4799 non-null float64
8 Longitude 4799 non-null float64
9 FIPS 143674 non-null float64
10 Admin2 144198 non-null object
11 Province_State 148664 non-null object
12 Last_Update 157783 non-null object
13 Lat 155291 non-null float64
14 Long_ 155291 non-null float64
15 Active 157783 non-null float64
16 Combined_Key 157783 non-null object
17 404: Not Found 0 non-null object
dtypes: datetime64[ns](1), float64(9), object(8)
memory usage: 22.7+ MB
# Save to disk (overwrite previous version)
dff.to_csv('./data/csse_covid_19_daily_reports.csv', encoding='utf-8', index=False)
tmp = pd.read_csv('./data/csse_covid_19_daily_reports.csv')
df = tmp
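Reading the saved CSV back loses the datetime dtype (date comes back as object, as the later info() output shows); a minimal alternative that preserves it, assuming the same file path:

# Sketch: parse the date column back to datetime when reloading
tmp = pd.read_csv('./data/csse_covid_19_daily_reports.csv', parse_dates=['date'])
df = tmp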
# Harmonize country names that appear under more than one spelling in the daily reports
df['Country/Region'].replace('Mainland China', 'China', inplace=True)
df['Country/Region'].replace('Hong Kong SAR', 'Hong Kong', inplace=True)
df['Country/Region'].replace(' Azerbaijan', 'Azerbaijan', inplace=True)
df['Country/Region'].replace('Holy See', 'Vatican City', inplace=True)
df['Country/Region'].replace('Iran (Islamic Republic of)', 'Iran', inplace=True)
df['Country/Region'].replace('Taiwan*', 'Taiwan', inplace=True)
df['Country/Region'].replace('Korea, South', 'South Korea', inplace=True)
df['Country/Region'].replace('Viet Nam', 'Vietnam', inplace=True)
df['Country/Region'].replace('Macao SAR', 'Macau', inplace=True)
df['Country/Region'].replace('Russian Federation', 'Russia', inplace=True)
df['Country/Region'].replace('Republic of Moldova', 'Moldova', inplace=True)
df['Country/Region'].replace('Czechia', 'Czech Republic', inplace=True)
df['Country/Region'].replace('Congo (Kinshasa)', 'Congo', inplace=True)
df['Country/Region'].replace('Northern Ireland', 'United Kingdom', inplace=True)
df['Country/Region'].replace('Republic of Korea', 'North Korea', inplace=True)  # note: 'Republic of Korea' is actually South Korea; the affected March 10, 2020 rows are re-labelled below
df['Country/Region'].replace('Congo (Brazzaville)', 'Congo', inplace=True)
df['Country/Region'].replace('Taipei and environs', 'Taiwan', inplace=True)
df['Country/Region'].replace('Others', 'Cruise Ship', inplace=True)
df['Province_State'].replace('Cruise Ship', 'Diamond Princess cruise ship', inplace=True)
df['Province_State'].replace('From Diamond Princess', 'Diamond Princess cruise ship', inplace=True)
# Replace old reporting conventions (early county-level entries) with state names
df['Province_State'].replace('Chicago', 'Illinois', inplace=True)
df['Province_State'].replace('Chicago, IL', 'Illinois', inplace=True)
df['Province_State'].replace('Cook County, IL', 'Illinois', inplace=True)
df['Province_State'].replace('Boston, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace(' Norfolk County, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Suffolk County, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Middlesex County, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Norwell County, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Plymouth County, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Norfolk County, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Berkshire County, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Unknown Location, MA', 'Massachusetts', inplace=True)
df['Province_State'].replace('Los Angeles, CA', 'California', inplace=True)
df['Province_State'].replace('Orange, CA', 'California', inplace=True)
df['Province_State'].replace('Santa Clara, CA', 'California', inplace=True)
df['Province_State'].replace('San Benito, CA', 'California', inplace=True)
df['Province_State'].replace('Humboldt County, CA', 'California', inplace=True)
df['Province_State'].replace('Sacramento County, CA', 'California', inplace=True)
df['Province_State'].replace('Travis, CA (From Diamond Princess)', 'California', inplace=True)
df['Province_State'].replace('Placer County, CA', 'California', inplace=True)
df['Province_State'].replace('San Mateo, CA', 'California', inplace=True)
df['Province_State'].replace('Sonoma County, CA', 'California', inplace=True)
df['Province_State'].replace('Berkeley, CA', 'California', inplace=True)
df['Province_State'].replace('Orange County, CA', 'California', inplace=True)
df['Province_State'].replace('Contra Costa County, CA', 'California', inplace=True)
df['Province_State'].replace('San Francisco County, CA', 'California', inplace=True)
df['Province_State'].replace('Yolo County, CA', 'California', inplace=True)
df['Province_State'].replace('Santa Clara County, CA', 'California', inplace=True)
df['Province_State'].replace('San Diego County, CA', 'California', inplace=True)
df['Province_State'].replace('Travis, CA', 'California', inplace=True)
df['Province_State'].replace('Alameda County, CA', 'California', inplace=True)
df['Province_State'].replace('Madera County, CA', 'California', inplace=True)
df['Province_State'].replace('Santa Cruz County, CA', 'California', inplace=True)
df['Province_State'].replace('Fresno County, CA', 'California', inplace=True)
df['Province_State'].replace('Riverside County, CA', 'California', inplace=True)
df['Province_State'].replace('Shasta County, CA', 'California', inplace=True)
df['Province_State'].replace('Seattle, WA', 'Washington', inplace=True)
df['Province_State'].replace('Snohomish County, WA', 'Washington', inplace=True)
df['Province_State'].replace('King County, WA', 'Washington', inplace=True)
df['Province_State'].replace('Unassigned Location, WA', 'Washington', inplace=True)
df['Province_State'].replace('Clark County, WA', 'Washington', inplace=True)
df['Province_State'].replace('Jefferson County, WA', 'Washington', inplace=True)
df['Province_State'].replace('Pierce County, WA', 'Washington', inplace=True)
df['Province_State'].replace('Kittitas County, WA', 'Washington', inplace=True)
df['Province_State'].replace('Grant County, WA', 'Washington', inplace=True)
df['Province_State'].replace('Spokane County, WA', 'Washington', inplace=True)
df['Province_State'].replace('Tempe, AZ', 'Arizona', inplace=True)
df['Province_State'].replace('Maricopa County, AZ', 'Arizona', inplace=True)
df['Province_State'].replace('Pinal County, AZ', 'Arizona', inplace=True)
df['Province_State'].replace('Madison, WI', 'Wisconsin', inplace=True)
df['Province_State'].replace('San Antonio, TX', 'Texas', inplace=True)
df['Province_State'].replace('Lackland, TX', 'Texas', inplace=True)
df['Province_State'].replace('Lackland, TX (From Diamond Princess)', 'Texas', inplace=True)
df['Province_State'].replace('Harris County, TX', 'Texas', inplace=True)
df['Province_State'].replace('Fort Bend County, TX', 'Texas', inplace=True)
df['Province_State'].replace('Montgomery County, TX', 'Texas', inplace=True)
df['Province_State'].replace('Collin County, TX', 'Texas', inplace=True)
df['Province_State'].replace('Ashland, NE', 'Nebraska', inplace=True)
df['Province_State'].replace('Omaha, NE (From Diamond Princess)', 'Nebraska', inplace=True)
df['Province_State'].replace('Douglas County, NE', 'Nebraska', inplace=True)
df['Province_State'].replace('Portland, OR', 'Oregon', inplace=True)
df['Province_State'].replace('Umatilla, OR', 'Oregon', inplace=True)
df['Province_State'].replace('Klamath County, OR', 'Oregon', inplace=True)
df['Province_State'].replace('Douglas County, OR', 'Oregon', inplace=True)
df['Province_State'].replace('Marion County, OR', 'Oregon', inplace=True)
df['Province_State'].replace('Jackson County, OR ', 'Oregon', inplace=True)
df['Province_State'].replace('Washington County, OR', 'Oregon', inplace=True)
df['Province_State'].replace('Providence, RI', 'Rhode Island', inplace=True)
df['Province_State'].replace('Providence County, RI', 'Rhode Island', inplace=True)
df['Province_State'].replace('Grafton County, NH', 'New Hampshire', inplace=True)
df['Province_State'].replace('Rockingham County, NH', 'New Hampshire', inplace=True)
df['Province_State'].replace('Hillsborough, FL', 'Florida', inplace=True)
df['Province_State'].replace('Sarasota, FL', 'Florida', inplace=True)
df['Province_State'].replace('Santa Rosa County, FL', 'Florida', inplace=True)
df['Province_State'].replace('Broward County, FL', 'Florida', inplace=True)
df['Province_State'].replace('Lee County, FL', 'Florida', inplace=True)
df['Province_State'].replace('Volusia County, FL', 'Florida', inplace=True)
df['Province_State'].replace('Manatee County, FL', 'Florida', inplace=True)
df['Province_State'].replace('Okaloosa County, FL', 'Florida', inplace=True)
df['Province_State'].replace('Charlotte County, FL', 'Florida', inplace=True)
df['Province_State'].replace('New York City, NY', 'New York', inplace=True)
df['Province_State'].replace('Westchester County, NY', 'New York', inplace=True)
df['Province_State'].replace('Queens County, NY', 'New York', inplace=True)
df['Province_State'].replace('New York County, NY', 'New York', inplace=True)
df['Province_State'].replace('Nassau, NY', 'New York', inplace=True)
df['Province_State'].replace('Nassau County, NY', 'New York', inplace=True)
df['Province_State'].replace('Rockland County, NY', 'New York', inplace=True)
df['Province_State'].replace('Saratoga County, NY', 'New York', inplace=True)
df['Province_State'].replace('Suffolk County, NY', 'New York', inplace=True)
df['Province_State'].replace('Ulster County, NY', 'New York', inplace=True)
df['Province_State'].replace('Fulton County, GA', 'Georgia', inplace=True)
df['Province_State'].replace('Floyd County, GA', 'Georgia', inplace=True)
df['Province_State'].replace('Polk County, GA', 'Georgia', inplace=True)
df['Province_State'].replace('Cherokee County, GA', 'Georgia', inplace=True)
df['Province_State'].replace('Cobb County, GA', 'Georgia', inplace=True)
df['Province_State'].replace('Wake County, NC', 'North Carolina', inplace=True)
df['Province_State'].replace('Chatham County, NC', 'North Carolina', inplace=True)
df['Province_State'].replace('Bergen County, NJ', 'New Jersey', inplace=True)
df['Province_State'].replace('Hudson County, NJ', 'New Jersey', inplace=True)
df['Province_State'].replace('Clark County, NV', 'Nevada', inplace=True)
df['Province_State'].replace('Washoe County, NV', 'Nevada', inplace=True)
df['Province_State'].replace('Williamson County, TN', 'Tennessee', inplace=True)
df['Province_State'].replace('Davidson County, TN', 'Tennessee', inplace=True)
df['Province_State'].replace('Shelby County, TN', 'Tennessee', inplace=True)
df['Province_State'].replace('Montgomery County, MD', 'Maryland', inplace=True)
df['Province_State'].replace('Harford County, MD', 'Maryland', inplace=True)
df['Province_State'].replace('Denver County, CO', 'Colorado', inplace=True)
df['Province_State'].replace('Summit County, CO', 'Colorado', inplace=True)
df['Province_State'].replace('Douglas County, CO', 'Colorado', inplace=True)
df['Province_State'].replace('El Paso County, CO', 'Colorado', inplace=True)
df['Province_State'].replace('Delaware County, PA', 'Pennsylvania', inplace=True)
df['Province_State'].replace('Wayne County, PA', 'Pennsylvania', inplace=True)
df['Province_State'].replace('Montgomery County, PA', 'Pennsylvania', inplace=True)
df['Province_State'].replace('Fayette County, KY', 'Kentucky', inplace=True)
df['Province_State'].replace('Jefferson County, KY', 'Kentucky', inplace=True)
df['Province_State'].replace('Harrison County, KY', 'Kentucky', inplace=True)
df['Province_State'].replace('Marion County, IN', 'Indiana', inplace=True)
df['Province_State'].replace('Hendricks County, IN', 'Indiana', inplace=True)
df['Province_State'].replace('Ramsey County, MN', 'Minnesota', inplace=True)
df['Province_State'].replace('Carver County, MN', 'Minnesota', inplace=True)
df['Province_State'].replace('Fairfield County, CT', 'Connecticut', inplace=True)
df['Province_State'].replace('Charleston County, SC', 'South Carolina', inplace=True)
df['Province_State'].replace('Spartanburg County, SC', 'South Carolina', inplace=True)
df['Province_State'].replace('Kershaw County, SC', 'South Carolina', inplace=True)
df['Province_State'].replace('Davis County, UT', 'Utah', inplace=True)
df['Province_State'].replace('Honolulu County, HI', 'Hawaii', inplace=True)
df['Province_State'].replace('Tulsa County, OK', 'Oklahoma', inplace=True)
df['Province_State'].replace('Fairfax County, VA', 'Virginia', inplace=True)
df['Province_State'].replace('St. Louis County, MO', 'Missouri', inplace=True)
df['Province_State'].replace('Unassigned Location, VT', 'Vermont', inplace=True)
df['Province_State'].replace('Bennington County, VT', 'Vermont', inplace=True)
df['Province_State'].replace('Johnson County, IA', 'Iowa', inplace=True)
df['Province_State'].replace('Jefferson Parish, LA', 'Louisiana', inplace=True)
df['Province_State'].replace('Johnson County, KS', 'Kansas', inplace=True)
df['Province_State'].replace('Washington, D.C.', 'District of Columbia', inplace=True)
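The long chain of single-value replacements above could equally be expressed as one dictionary-based call per column, which is easier to extend; a sketch of that pattern (the mappings shown are just a few of the ones listed above):

# Sketch: the same clean-up expressed as dictionary-based replacements
country_fixes = {
    'Mainland China': 'China',
    'Korea, South': 'South Korea',
    'Taiwan*': 'Taiwan',
    # ... remaining country renames from above ...
}
state_fixes = {
    'Chicago, IL': 'Illinois',
    'Seattle, WA': 'Washington',
    'New York City, NY': 'New York',
    # ... remaining "County, XX" entries from above ...
}
df['Country/Region'] = df['Country/Region'].replace(country_fixes)
df['Province_State'] = df['Province_State'].replace(state_fixes)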
# Interpolate values for missing South Korea data on March 11
# (we skip this, but see the original https://github.com/raffg/covid-19/blob/master/eda.ipynb)
# South Korea data on March 10 seems to be mislabeled as North Korea
df.loc[(df['Country/Region'] == 'North Korea') & (df['date'] == '03-10-2020'), 'Country/Region'] = 'South Korea'
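For completeness, a hypothetical sketch of the interpolation step skipped above, here done by simply averaging the neighbouring days (the 2020 dates are outside the April-May 2021 window loaded in this run, so the guard does nothing here):

# Hypothetical sketch: fill a missing daily total by averaging the neighbouring days
kr = df[df['Country/Region'] == 'South Korea'].groupby('date')[['Confirmed', 'Deaths', 'Recovered']].sum()
if {'2020-03-10', '2020-03-12'} <= set(kr.index) and '2020-03-11' not in kr.index:
    kr.loc['2020-03-11'] = (kr.loc['2020-03-10'] + kr.loc['2020-03-12']) / 2
    kr = kr.sort_index()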
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191184 entries, 0 to 191183
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   FIPS                 156768 non-null  float64
 1   Admin2               157008 non-null  object
 2   Province_State       182976 non-null  object
 3   Country/Region       191184 non-null  object
 4   Last_Update          191184 non-null  object
 5   Lat                  186960 non-null  float64
 6   Long_                186960 non-null  float64
 7   Confirmed            191184 non-null  float64
 8   Deaths               191184 non-null  float64
 9   Recovered            33890 non-null   float64
 10  Active               33826 non-null   float64
 11  Combined_Key         191184 non-null  object
 12  Incident_Rate        186958 non-null  float64
 13  Case_Fatality_Ratio  189206 non-null  float64
 14  date                 191184 non-null  object
 15  404: Not Found       0 non-null       float64
dtypes: float64(10), object(6)
memory usage: 23.3+ MB
df
 | FIPS | Admin2 | Province_State | Country/Region | Last_Update | Lat | Long_ | Confirmed | Deaths | Recovered | Active | Combined_Key | Incident_Rate | Case_Fatality_Ratio | date | 404: Not Found
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | NaN | NaN | NaN | Afghanistan | 2021-04-02 04:20:36 | 33.939110 | 67.709953 | 56517.0 | 2489.0 | 51788.0 | 2240.0 | Afghanistan | 145.182144 | 4.403985 | 2021-04-01 | NaN |
1 | NaN | NaN | NaN | Albania | 2021-04-02 04:20:36 | 41.153300 | 20.168300 | 125506.0 | 2241.0 | 91875.0 | 31390.0 | Albania | 4361.178678 | 1.785572 | 2021-04-01 | NaN |
2 | NaN | NaN | NaN | Algeria | 2021-04-02 04:20:36 | 28.033900 | 1.659600 | 117304.0 | 3096.0 | 81632.0 | 32576.0 | Algeria | 267.505610 | 2.639296 | 2021-04-01 | NaN |
3 | NaN | NaN | NaN | Andorra | 2021-04-02 04:20:36 | 42.506300 | 1.521800 | 12053.0 | 115.0 | 11365.0 | 573.0 | Andorra | 15599.559956 | 0.954119 | 2021-04-01 | NaN |
4 | NaN | NaN | NaN | Angola | 2021-04-02 04:20:36 | -11.202700 | 17.873900 | 22399.0 | 538.0 | 20508.0 | 1353.0 | Angola | 68.151942 | 2.401893 | 2021-04-01 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
191179 | NaN | NaN | NaN | Vietnam | 2021-05-19 04:20:42 | 14.058324 | 108.277199 | 4512.0 | 37.0 | 2687.0 | 1788.0 | Vietnam | 4.635366 | 0.820035 | 2021-05-18 | NaN |
191180 | NaN | NaN | NaN | West Bank and Gaza | 2021-05-19 04:20:42 | 31.952200 | 35.233200 | 304074.0 | 3437.0 | 293808.0 | 6829.0 | West Bank and Gaza | 5960.580357 | 1.130317 | 2021-05-18 | NaN |
191181 | NaN | NaN | NaN | Yemen | 2021-05-19 04:20:42 | 15.552727 | 48.516388 | 6586.0 | 1297.0 | 3072.0 | 2217.0 | Yemen | 22.081429 | 19.693289 | 2021-05-18 | NaN |
191182 | NaN | NaN | NaN | Zambia | 2021-05-19 04:20:42 | -13.133897 | 27.849332 | 92520.0 | 1263.0 | 90892.0 | 365.0 | Zambia | 503.264912 | 1.365110 | 2021-05-18 | NaN |
191183 | NaN | NaN | NaN | Zimbabwe | 2021-05-19 04:20:42 | -19.015438 | 29.154857 | 38595.0 | 1583.0 | 36371.0 | 641.0 | Zimbabwe | 259.672943 | 4.101568 | 2021-05-18 | NaN |
191184 rows × 16 columns
# Re-order the columns for readability
df = df[['date',
'Country/Region',
'Province_State',
'Confirmed',
'Deaths',
'Recovered',
'Lat',
'Long_']]
# Fill missing values as 0; create Active cases column
# (note: the US daily reports in this window carry no Recovered counts, so US Active is effectively Confirmed - Deaths)
df['Confirmed'] = df['Confirmed'].fillna(0).astype(int)
df['Deaths'] = df['Deaths'].fillna(0).astype(int)
df['Recovered'] = df['Recovered'].fillna(0).astype(int)
df['Active'] = df['Confirmed'] - (df['Deaths'] + df['Recovered'])
# Replace missing values for latitude and longitude
df['Latitude'] = df['Lat'].fillna(df.groupby('Province_State')['Lat'].transform('mean'))
df['Longitude'] = df['Long_'].fillna(df.groupby('Province_State')['Long_'].transform('mean'))
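The groupby fill above only helps rows that carry a Province_State, which is why Latitude/Longitude still show 190176 non-null values out of 191184 in the info() below. A sketch of an additional country-level fallback, under the same assumptions:

# Sketch: also fall back to the per-country mean coordinates for rows without a Province_State
df['Latitude'] = df['Latitude'].fillna(df.groupby('Country/Region')['Lat'].transform('mean'))
df['Longitude'] = df['Longitude'].fillna(df.groupby('Country/Region')['Long_'].transform('mean'))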
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191184 entries, 0 to 191183
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   date            191184 non-null  object
 1   Country/Region  191184 non-null  object
 2   Province_State  182976 non-null  object
 3   Confirmed       191184 non-null  int64
 4   Deaths          191184 non-null  int64
 5   Recovered       191184 non-null  int64
 6   Lat             186960 non-null  float64
 7   Long_           186960 non-null  float64
 8   Active          191184 non-null  int64
 9   Latitude        190176 non-null  float64
 10  Longitude       190176 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 16.0+ MB
n_reg = len(df['Country/Region'].unique())
print('Number of unique Country/Region:', n_reg)
Number of unique Country/Region: 191
df[df['Country/Region'] == 'US'].groupby(['date', 'Province_State'])[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum()
date | Province_State | Confirmed | Deaths | Recovered | Active
---|---|---|---|---|---
2021-04-01 | Alabama | 515866 | 10553 | 0 | 505313
 | Alaska | 63298 | 313 | 0 | 62985
 | Arizona | 842192 | 16977 | 0 | 825215
 | Arkansas | 330609 | 5636 | 0 | 324973
 | California | 3671349 | 59411 | 0 | 3611938
... | ... | ... | ... | ... | ...
2021-05-18 | Virginia | 670834 | 11042 | 0 | 659792
 | Washington | 424050 | 5653 | 0 | 418397
 | West Virginia | 158888 | 2763 | 0 | 156125
 | Wisconsin | 670362 | 7726 | 0 | 662636
 | Wyoming | 59321 | 713 | 0 | 58608
2784 rows × 4 columns
df[df['Country/Region'] == 'US'].groupby('date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum()
date | Confirmed | Deaths | Recovered | Active
---|---|---|---|---
2021-04-01 | 30541255 | 553733 | 0 | 29987522 |
2021-04-02 | 30611086 | 554686 | 0 | 30056400 |
2021-04-03 | 30674153 | 555423 | 0 | 30118730 |
2021-04-04 | 30709125 | 555717 | 0 | 30153408 |
2021-04-05 | 30786804 | 556236 | 0 | 30230568 |
2021-04-06 | 30847348 | 557097 | 0 | 30290251 |
2021-04-07 | 30922386 | 559690 | 0 | 30362696 |
2021-04-08 | 31002264 | 560711 | 0 | 30441553 |
2021-04-09 | 31084962 | 561600 | 0 | 30523362 |
2021-04-10 | 31151497 | 562316 | 0 | 30589181 |
2021-04-11 | 31197877 | 562616 | 0 | 30635261 |
2021-04-12 | 31268107 | 563092 | 0 | 30705015 |
2021-04-13 | 31345985 | 563924 | 0 | 30782061 |
2021-04-14 | 31421360 | 564896 | 0 | 30856464 |
2021-04-15 | 31495649 | 565795 | 0 | 30929854 |
2021-04-16 | 31575640 | 566667 | 0 | 31008973 |
2021-04-17 | 31628013 | 567359 | 0 | 31060654 |
2021-04-18 | 31670031 | 567692 | 0 | 31102339 |
2021-04-19 | 31737964 | 568191 | 0 | 31169773 |
2021-04-20 | 31799237 | 569027 | 0 | 31230210 |
2021-04-21 | 31862094 | 569886 | 0 | 31292208 |
2021-04-22 | 31929351 | 570844 | 0 | 31358507 |
2021-04-23 | 31991750 | 571621 | 0 | 31420129 |
2021-04-24 | 32045113 | 572356 | 0 | 31472757 |
2021-04-25 | 32077178 | 572645 | 0 | 31504533 |
2021-04-26 | 32124869 | 573132 | 0 | 31551737 |
2021-04-27 | 32175725 | 573782 | 0 | 31601943 |
2021-04-28 | 32230850 | 574751 | 0 | 31656099 |
2021-04-29 | 32289049 | 575616 | 0 | 31713433 |
2021-04-30 | 32346971 | 576354 | 0 | 31770617 |
2021-05-01 | 32392274 | 577069 | 0 | 31815205 |
2021-05-02 | 32421641 | 577401 | 0 | 31844240 |
2021-05-03 | 32472201 | 577887 | 0 | 31894314 |
2021-05-04 | 32512934 | 578763 | 0 | 31934171 |
2021-05-05 | 32557669 | 579541 | 0 | 31978128 |
2021-05-06 | 32605183 | 580334 | 0 | 32024849 |
2021-05-07 | 32652472 | 581088 | 0 | 32071384 |
2021-05-08 | 32686965 | 581705 | 0 | 32105260 |
2021-05-09 | 32708357 | 581947 | 0 | 32126410 |
2021-05-10 | 32745255 | 582347 | 0 | 32162908 |
2021-05-11 | 32778906 | 583023 | 0 | 32195883 |
2021-05-12 | 32814784 | 583872 | 0 | 32230912 |
2021-05-13 | 32852871 | 584674 | 0 | 32268197 |
2021-05-14 | 32895169 | 585332 | 0 | 32309837 |
2021-05-15 | 32923982 | 585808 | 0 | 32338174 |
2021-05-16 | 32940846 | 586070 | 0 | 32354776 |
2021-05-17 | 32969512 | 586459 | 0 | 32383053 |
2021-05-18 | 32997331 | 587219 | 0 | 32410112 |
# Overall case fatality rate (cumulative Deaths / cumulative Confirmed) on the latest report date
'{:.2f}%'.format(100 *
df[df['date'] == df['date'].iloc[-1]]['Deaths'].sum() /
df[df['date'] == df['date'].iloc[-1]]['Confirmed'].sum())
'2.07%'
fig = go.Figure([go.Scatter(x=df[df['Country/Region'] == 'US'].groupby('date')['date'].first(),
y=df[df['Country/Region'] == 'US'].groupby('date')['Active'].sum())])
fig.update_layout(
title="US: Active COVID-19",
xaxis_title="Date",
yaxis_title="Active infected",
font=dict(
family="Courier New, monospace",
size=16,
color="#7f7f7f"
)
)
fig.show()
geo_us = df[(df['date'] == '2021-05-17') &
(df['Country/Region'] == 'US')].groupby('Province_State',
as_index=False).agg({'Longitude': 'mean',
'Latitude': 'mean'})
temp2 = pd.read_csv('./data/csse_covid_19_daily_reports.csv')
df4 = temp2[temp2['Country/Region'] == 'US'].groupby('Province_State', as_index=False).agg({'Confirmed': 'sum'})
df4 = df4.merge(geo_us, left_on='Province_State', right_on='Province_State')
fig = go.Figure(data=go.Scattergeo(
lon = df4['Longitude'],
lat = df4['Latitude'],
text = df4['Province_State'] + ': ' + df4['Confirmed'].astype(str),
mode = 'markers',
marker_size = (200 * df4['Confirmed'] / df4['Confirmed'].max()),
marker = dict(reversescale = False,
autocolorscale = False,
symbol = 'circle',
line = dict(width=1, color='rgba(102, 102, 102)'),
colorscale = 'Reds',
cmin = 0,
color = df4['Confirmed'],
cmax = df4['Confirmed'].max(),
colorbar_title="Confirmed Cases")))
fig.update_layout(title = 'Number of cumulative confirmed cases in the US by state ',
geo=dict(scope='usa',
projection_type='albers usa',
showland = True,
landcolor = "rgb(100, 125, 100)",
showocean = True,
oceancolor = "rgb(150, 150, 250)",
showcountries=True,
showsubunits=True,
showlakes=True,))
fig.show()
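Caveat: Confirmed in the daily reports is already a cumulative count, so the groupby sum over all loaded dates in df4 above counts each state's total once per day rather than giving a true cumulative figure. A sketch of a single-date snapshot instead (using 2021-05-17 to match geo_us above):

# Sketch: per-state confirmed cases on one report date instead of summed over all dates
snapshot = temp2[(temp2['Country/Region'] == 'US') & (temp2['date'] == '2021-05-17')]
df4 = snapshot.groupby('Province_State', as_index=False).agg({'Confirmed': 'sum'})
df4 = df4.merge(geo_us, on='Province_State')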
eu = ['Albania', 'Andorra', 'Armenia', 'Austria', 'Azerbaijan', 'Belarus', 'Belgium', 'Bosnia and Herzegovina',
'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Georgia',
'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Kazakhstan', 'Kosovo', 'Latvia', 'Liechtenstein',
'Lithuania', 'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Montenegro', 'Netherlands', 'North Macedonia', 'Norway',
'Poland', 'Portugal', 'Romania', 'Russia', 'San Marino', 'Serbia', 'Slovakia', 'Slovenia', 'Spain', 'Sweden',
'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom', 'Vatican City']
df3 = df[df['Country/Region'].isin(eu)]
data = df3[df3['date'] == df3['date'].iloc[-1]].groupby('Country/Region').agg({'Active': 'sum',
'Longitude': 'mean',
'Latitude': 'mean',
'Country/Region': 'first',
'Province_State': 'first'})
# Use mainland coordinates for countries whose coordinate means are skewed by overseas territories
data.loc[data['Country/Region'] == 'France', 'Latitude'] = 46.2276
data.loc[data['Country/Region'] == 'France', 'Longitude'] = 2.2137
data.loc[data['Country/Region'] == 'United Kingdom', 'Latitude'] = 55.3781
data.loc[data['Country/Region'] == 'United Kingdom', 'Longitude'] = -3.4360
data.loc[data['Country/Region'] == 'Denmark', 'Latitude'] = 56.2639
data.loc[data['Country/Region'] == 'Denmark', 'Longitude'] = 9.5018
data.loc[data['Country/Region'] == 'Netherlands', 'Latitude'] = 52.1326
data.loc[data['Country/Region'] == 'Netherlands', 'Longitude'] = 5.2913
fig = go.Figure(data=go.Scattergeo(
lon = data['Longitude'],
lat = data['Latitude'],
text = data['Country/Region'] + ': ' + data['Active'].astype(str),
mode = 'markers',
marker_size = (100 * data['Active'] / data['Active'].max()),
marker = dict(reversescale = False,
autocolorscale = False,
symbol = 'circle',
line = dict(width=1, color='rgba(102, 102, 102)'),
colorscale = 'Reds',
cmin = 0,
color = data['Active'],
cmax = data['Active'].max(),
colorbar_title="Active Cases")))
fig.update_layout(title = 'Number of active cases by European country ',
geo=dict(scope='europe',
projection_type="natural earth",
showland = True,
landcolor = "rgb(100, 125, 100)",
showocean = True,
oceancolor = "rgb(150, 150, 250)",
showcountries=True,
showsubunits=True,
showlakes=False,))
fig.show()
from IPython.display import Image
Image('./assets/active_cases_eu.png', width=600)
df0 = df[df['Country/Region'] == 'Norway']
df0.head()
 | date | Country/Region | Province_State | Confirmed | Deaths | Recovered | Lat | Long_ | Active | Latitude | Longitude
---|---|---|---|---|---|---|---|---|---|---|---
452 | 2021-04-01 | Norway | NaN | 96770 | 673 | 17998 | 60.472 | 8.4689 | 78099 | 60.472 | 8.4689 |
4435 | 2021-04-02 | Norway | NaN | 97410 | 673 | 17998 | 60.472 | 8.4689 | 78739 | 60.472 | 8.4689 |
8418 | 2021-04-03 | Norway | NaN | 98102 | 673 | 17998 | 60.472 | 8.4689 | 79431 | 60.472 | 8.4689 |
12401 | 2021-04-04 | Norway | NaN | 98676 | 673 | 17998 | 60.472 | 8.4689 | 80005 | 60.472 | 8.4689 |
16384 | 2021-04-05 | Norway | NaN | 99249 | 676 | 17998 | 60.472 | 8.4689 | 80575 | 60.472 | 8.4689 |
df0.tail()
 | date | Country/Region | Province_State | Confirmed | Deaths | Recovered | Lat | Long_ | Active | Latitude | Longitude
---|---|---|---|---|---|---|---|---|---|---|---
171721 | 2021-05-14 | Norway | NaN | 118740 | 774 | 17998 | 60.472 | 8.4689 | 99968 | 60.472 | 8.4689 |
175704 | 2021-05-15 | Norway | NaN | 119052 | 774 | 17998 | 60.472 | 8.4689 | 100280 | 60.472 | 8.4689 |
179687 | 2021-05-16 | Norway | NaN | 119299 | 774 | 17998 | 60.472 | 8.4689 | 100527 | 60.472 | 8.4689 |
183670 | 2021-05-17 | Norway | NaN | 119500 | 774 | 17998 | 60.472 | 8.4689 | 100728 | 60.472 | 8.4689 |
187653 | 2021-05-18 | Norway | NaN | 119814 | 774 | 17998 | 60.472 | 8.4689 | 101042 | 60.472 | 8.4689 |
df1 = df[df['Country/Region'] == 'Norway'].groupby('date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum()
df1.head()
date | Confirmed | Deaths | Recovered | Active
---|---|---|---|---
2021-04-01 | 96770 | 673 | 17998 | 78099 |
2021-04-02 | 97410 | 673 | 17998 | 78739 |
2021-04-03 | 98102 | 673 | 17998 | 79431 |
2021-04-04 | 98676 | 673 | 17998 | 80005 |
2021-04-05 | 99249 | 676 | 17998 | 80575 |
df1.tail()
date | Confirmed | Deaths | Recovered | Active
---|---|---|---|---
2021-05-14 | 118740 | 774 | 17998 | 99968 |
2021-05-15 | 119052 | 774 | 17998 | 100280 |
2021-05-16 | 119299 | 774 | 17998 | 100527 |
2021-05-17 | 119500 | 774 | 17998 | 100728 |
2021-05-18 | 119814 | 774 | 17998 | 101042 |
def fatality_rate_given_country(csse_daily_df, country):
    """Return (last report date, case fatality rate, active, confirmed) for the given country."""
    dfc = csse_daily_df[csse_daily_df['Country/Region'] == country]
    last = dfc['date'].iloc[-1]
    cfr = dfc[dfc['date'] == last]['Deaths'].sum() / dfc[dfc['date'] == last]['Confirmed'].sum()
    active = dfc[dfc['date'] == last]['Active'].sum()
    confirmed = dfc[dfc['date'] == last]['Confirmed'].sum()
    return last, cfr, active, confirmed
countrylist = ['Norway', 'Sweden', 'Denmark', 'Iceland', 'China', 'Italy', 'US']
print('Case fatality rate (accumulated Deaths/accumulated Confirmed) for given country:\n')
for c in countrylist:
    last, cfr, active, confirmed = fatality_rate_given_country(df, c)
    print('%s (up to %s) = %.2f%% (confirmed=%d, active=%d)' % (c, last, cfr*100, confirmed, active))
Case fatality rate (accumulated Deaths/accumulated Confirmed) for given country:

Norway (up to 2021-05-18) = 0.65% (confirmed=119814, active=101042)
Sweden (up to 2021-05-18) = 1.37% (confirmed=1047143, active=1032842)
Denmark (up to 2021-05-18) = 0.93% (confirmed=268956, active=13068)
Iceland (up to 2021-05-18) = 0.44% (confirmed=6549, active=58)
China (up to 2021-05-18) = 4.71% (confirmed=102784, active=378)
Italy (up to 2021-05-18) = 2.99% (confirmed=4167025, active=315308)
US (up to 2021-05-18) = 1.78% (confirmed=32997331, active=32410112)
last, cfr, active, confirmed = fatality_rate_given_country(df, 'Norway')
fig = go.Figure([go.Scatter(x=df[df['Country/Region'] == 'Norway'].groupby('date')['date'].first(),
y=df[df['Country/Region'] == 'Norway'].groupby('date')['Active'].sum())])
fig.update_layout(
title="NORWAY: Active COVID-19 (CFR=%.2f%%)" % (cfr*100),
xaxis_title="Date",
yaxis_title="Active infected",
font=dict(
family="Courier New, monospace",
size=16,
color="#7f7f7f"
)
)
fig.show()
from IPython.display import Image
Image('./assets/active_cases_cfr_norway.png', width=600)
region = 'Norway'
fig = go.Figure()
fig.add_trace(go.Scatter(
x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
name="Active cases"))
fig.add_trace(go.Scatter(
x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
y=df[df['Country/Region'] == region].groupby('date')['Confirmed'].sum(),
name="Total Confirmed"))
fig.add_trace(go.Scatter(
x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
y=df[df['Country/Region'] == region].groupby('date')['Deaths'].sum(),
name="Deaths"))
fig.add_trace(go.Scatter(
x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
y=df[df['Country/Region'] == region].groupby('date')['Recovered'].sum(),
name="Recovered"))
fig.update_layout(title="COVID-19 infections in {}".format(region),
xaxis_title="Date",
yaxis_title="Number of Individuals")
fig.show()
fig = go.Figure()
countries = ['China', 'Italy', 'South Korea', 'US', 'Spain', 'France', 'Germany', 'Norway']
for country in countries:
    fig.add_trace(go.Scatter(
        x=df[df['Country/Region'] == country].groupby('date')['date'].first(),
        y=df[df['Country/Region'] == country].groupby('date')['Active'].sum(),
        name=country,
        opacity=0.8))
fig.update_layout(title="Active COVID-19 cases",
xaxis_title="Date",
yaxis_title="Number of Individuals")
fig.show()
from IPython.display import Image
Image('./assets/active_cases_selected_countries.png', width=600)
fig = go.Figure()
for region in ['China', 'Italy', 'US', 'Spain', 'France', 'Germany', 'South Korea', 'Norway']:
    fig.add_trace(go.Scatter(
        x=df[df['Country/Region'] == region].groupby('date')['date'].first(),
        y=df[df['Country/Region'] == region].groupby('date')['Active'].sum(),
        name=region,
        hoverinfo='x+y+z+text+name',
        stackgroup='one'))
fig.update_layout(title="COVID-19 Active Cases Worldwide",
xaxis_title="Date",
yaxis_title="Number of Individuals")
fig.show()