# Quick sanity print to confirm the notebook kernel is running.
print('lets go...')
lets go...
# sync from origin
# !git remote add upstream https://github.com/CSSEGISandData/2019-nCoV
# IPython shell magics: fetch the latest upstream JHU CSSE data and merge
# it into the local master branch (the remote was added once, above).
!git fetch upstream
!git checkout master
!git merge upstream/master
From https://github.com/CSSEGISandData/2019-nCoV 05e55a9..37fc591 master -> upstream/master 23a786b..6f4a50a web-data -> upstream/web-data
Your branch is ahead of 'origin/master' by 167 commits. (use "git push" to publish your local commits)
Already on 'master'
Merge made by the 'recursive' strategy. .../csse_covid_19_daily_reports/03-04-2020.csv | 161 +++++++++++ .../time_series_19-covid-Confirmed.csv | 313 +++++++++++---------- .../time_series_19-covid-Deaths.csv | 313 +++++++++++---------- .../time_series_19-covid-Recovered.csv | 313 +++++++++++---------- .../20200225-sitrep-36-covid-19.pdf | Bin 0 -> 1040861 bytes .../20200226-sitrep-37-covid-19.pdf | Bin 0 -> 864348 bytes .../20200227-sitrep-38-covid-19.pdf | Bin 0 -> 978463 bytes .../20200228-sitrep-39-covid-19.pdf | Bin 0 -> 1079326 bytes .../20200229-sitrep-40-covid-19.pdf | Bin 0 -> 1018078 bytes .../20200301-sitrep-41-covid-19.pdf | Bin 0 -> 879065 bytes .../20200302-sitrep-42-covid-19.pdf | Bin 0 -> 923008 bytes .../20200303-sitrep-43-covid-19.pdf | Bin 0 -> 1170956 bytes .../who_covid_19_sit_rep_time_series.csv | 186 +++++++----- 13 files changed, 759 insertions(+), 527 deletions(-) create mode 100644 csse_covid_19_data/csse_covid_19_daily_reports/03-04-2020.csv create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200225-sitrep-36-covid-19.pdf create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200226-sitrep-37-covid-19.pdf create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200227-sitrep-38-covid-19.pdf create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200228-sitrep-39-covid-19.pdf create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200229-sitrep-40-covid-19.pdf create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200301-sitrep-41-covid-19.pdf create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200302-sitrep-42-covid-19.pdf create mode 100644 who_covid_19_situation_reports/who_covid_19_sit_rep_pdfs/20200303-sitrep-43-covid-19.pdf
# imports
import pandas as pd
from datetime import datetime
from datetime import timedelta
from glob import glob
import matplotlib.pyplot as plt
import numpy
from scipy.special import expit
from scipy.optimize import curve_fit
import math
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras import layers
from sklearn.metrics import r2_score
import math
# read the raw data + basic pre-processing
# Timestamp formats observed across the CSSE daily-report CSVs, tried in order.
_DATE_FORMATS = (
    '%Y-%m-%dT%H:%M:%S',
    '%m/%d/%Y %H%p',
    '%m/%d/%y %H:%M',
    '%m/%d/%Y %H:%M',
    '%m/%d/%Y %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
)

def parse_date_str(d):
    """Parse a 'Last Update' timestamp string into a datetime.

    The daily-report CSVs use several inconsistent timestamp formats;
    each known format is tried in turn (same order as the original
    nested try/except chain).

    Parameters:
        d: timestamp string from the 'Last Update' column.

    Returns:
        datetime.datetime parsed from `d`.

    Raises:
        ValueError: if `d` matches none of the known formats.
    """
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(d, fmt)
        except ValueError:
            # Not this format -- try the next one.
            continue
    raise ValueError('unrecognized date format: %r' % (d,))
# Use forward slashes so the glob matches on Windows *and* POSIX
# (the original mixed a backslash separator, which only works on Windows).
path = 'csse_covid_19_data/csse_covid_19_daily_reports/*.csv'
all_cols = ['Last Update', 'Country/Region', 'Province/State', 'Confirmed', 'Deaths', 'Recovered']
# One frame per daily report file, restricted to the columns we use.
l = [pd.read_csv(filename)[all_cols] for filename in glob(path)]
df = pd.concat(l, axis=0).reset_index().fillna(0)
df = df.rename(columns = { 'Country/Region': 'Country', 'Province/State': 'Province'})
# Normalize the inconsistent timestamp strings, then derive a calendar date
# and the lag (seconds since the earliest report) for each row.
df['Last Update'] = df['Last Update'].apply(parse_date_str)
df['Date'] = df['Last Update'].apply(datetime.date)
first_update = min(df['Last Update'])
df['Lag'] = df['Last Update'].apply(lambda x: (x - first_update).total_seconds())
df[df['Country'] == 'Mainland China'].tail(10)
index | Last Update | Country | Province | Confirmed | Deaths | Recovered | Date | Lag | |
---|---|---|---|---|---|---|---|---|---|
3264 | 29 | 2020-03-03 11:43:02 | Mainland China | Tianjin | 136.0 | 3.0 | 124.0 | 2020-03-03 | 3523382.0 |
3265 | 30 | 2020-03-03 23:13:05 | Mainland China | Shanxi | 133.0 | 0.0 | 124.0 | 2020-03-03 | 3564785.0 |
3266 | 31 | 2020-03-03 14:33:03 | Mainland China | Liaoning | 125.0 | 1.0 | 106.0 | 2020-03-03 | 3533583.0 |
3269 | 34 | 2020-03-04 10:03:19 | Mainland China | Jilin | 93.0 | 1.0 | 86.0 | 2020-03-04 | 3603799.0 |
3270 | 35 | 2020-03-04 15:43:03 | Mainland China | Gansu | 91.0 | 2.0 | 87.0 | 2020-03-04 | 3624183.0 |
3273 | 38 | 2020-03-04 15:43:03 | Mainland China | Xinjiang | 76.0 | 3.0 | 69.0 | 2020-03-04 | 3624183.0 |
3274 | 39 | 2020-03-04 15:43:03 | Mainland China | Inner Mongolia | 75.0 | 1.0 | 63.0 | 2020-03-04 | 3624183.0 |
3275 | 40 | 2020-03-04 01:33:07 | Mainland China | Ningxia | 75.0 | 0.0 | 69.0 | 2020-03-04 | 3573187.0 |
3294 | 59 | 2020-02-21 04:43:02 | Mainland China | Qinghai | 18.0 | 0.0 | 18.0 | 2020-02-21 | 2547782.0 |
3366 | 131 | 2020-02-12 06:43:02 | Mainland China | Tibet | 1.0 | 0.0 | 1.0 | 2020-02-12 | 1777382.0 |
# summary
# Per (date, country, province) take the day's max report, then sum across
# regions to one world-wide row per date.
# Index with a *list* of column labels: the original tuple form raises a
# pandas FutureWarning ("Indexing with multiple keys ... use a list instead")
# as seen in this cell's output.
summary = df.groupby(['Date', 'Country', 'Province'])[['Confirmed', 'Deaths', 'Recovered']].max().reset_index()
summary = summary.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
# summary = summary[summary['Date'] != datetime.date(datetime.now())] # ignore today
summary['Confirmed Change'] = summary['Confirmed'].pct_change()
summary['Recovered Change'] = summary['Recovered'].pct_change()
summary['Deaths Change'] = summary['Deaths'].pct_change()
# One panel per cumulative series: solid curve on the primary axis,
# daily percent-change bars on a secondary axis.
panels = [
    ('Confirmed', 'blue', 'lightblue'),
    ('Deaths', 'red', 'pink'),
    ('Recovered', 'green', 'lightgreen'),
]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
for ax, (col, line_color, bar_color) in zip(axes, panels):
    summary[col].plot(ax=ax, color=line_color, label=col, title='%s curve' % col, legend=True)
    summary['%s Change' % col].plot.bar(ax=ax, secondary_y=True, color=bar_color, label='Change', legend=True, mark_right=False)
c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. This is separate from the ipykernel package so we can avoid doing imports until
<matplotlib.axes._subplots.AxesSubplot at 0x18da58dc470>
# basic fitting
# Candidate trend models for scipy.optimize.curve_fit.
def exp_curve(x, a, b, c):
    """Exponential trend: a * exp(-c * x) + b."""
    return b + a * numpy.exp(-c * x)

def linear_curve(x, a, b):
    """Straight line: a * x + b."""
    return b + a * x

def logistic_curve(x, a, b, c):
    """Logistic-style curve: a / (expit(-c * x) + b)."""
    return a / (b + expit(-c * x))
# Fit each candidate curve to Confirmed/Deaths and extrapolate forward.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
fig.tight_layout(h_pad=7, w_pad=3)
x = summary.index
# Predict 30 days past the last observation. t must be an ndarray: the
# curve functions do float arithmetic (e.g. -c * x), which raises a
# TypeError on a plain range object -- previously that error was swallowed
# by a bare `except:` and no fitted curves were ever drawn.
t = numpy.arange(0, max(summary.index) + 30)
for i, curve_func in enumerate([linear_curve, exp_curve, logistic_curve]):
    for j, y_col in enumerate(['Confirmed', 'Deaths']):
        y = summary[y_col]
        try:
            popt, pcov = curve_fit(curve_func, x, y)
            pred = curve_func(t, *popt)
            y_pred = curve_func(x, *popt)
            res = pd.DataFrame()
            res['date'] = pd.Series(t).apply(lambda day: first_update + timedelta(days=int(day)))
            res['Predicted'] = pred
            res[y_col] = y
            if curve_func == logistic_curve:
                # Mark the point where predicted growth flattens (<0.1%/day).
                res['The End?'] = res['Predicted'].mul(res['Predicted'].pct_change() < 0.001).apply(lambda v: None if v == 0 else v)
            rsq = str(round(r2_score(y, y_pred), 4))
            title = '%s / %s (rsq=%s)' % (y_col, curve_func.__name__, rsq)
            res.plot(x='date', ax=axes[j, i], title=title)
        except (RuntimeError, TypeError, ValueError) as err:
            # curve_fit raises RuntimeError/ValueError when it cannot fit;
            # report the failure instead of silently hiding it.
            print('fit failed for %s / %s: %s' % (y_col, curve_func.__name__, err))
c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\scipy\optimize\minpack.py:808: OptimizeWarning: Covariance of the parameters could not be estimated category=OptimizeWarning)
# using NN on the summary data frame to emulate a logistic curve
# Small fully-connected net: input is the day index, target is the
# cumulative Confirmed count; sigmoid -> exponential layers give it a
# logistic-like response shape.
model = keras.Sequential([
    layers.Dense(64, activation='sigmoid'),
    layers.Dense(64, activation='exponential'),
    layers.Dense(1),
])
model.compile(loss='mse', metrics=['mse', 'mae', 'accuracy'])
x = numpy.array(x)
y = summary['Confirmed'].values
hist = model.fit(x, y, epochs=1000, verbose=0)
# Predict over the extended horizon t as well as the observed range x.
t = numpy.array(t)
pred = model.predict(t).flatten()
y_pred = model.predict(x).flatten()
# Columns are index-aligned: Confirmed is shorter than Predicted, so the
# forecast tail is NaN-padded in the Confirmed column.
res = pd.DataFrame({'Predicted': pd.Series(pred), 'Confirmed': pd.Series(y)})
res['date'] = pd.Series(t).apply(lambda day: first_update + timedelta(days=day))
rsq = str(round(r2_score(y, y_pred), 3))
title = 'Confirmed / NN (rsq=%s)' % (rsq)
res.plot(x='date', title=title)
<matplotlib.axes._subplots.AxesSubplot at 0x18dad827f98>
# fitting using NN on the entire dataset
# Two-input model: a scalar "Lag" (seconds since the first report) on the
# left branch, and a one-hot country/province encoding on the right branch.
countries_and_provinces = pd.get_dummies(df[['Country', 'Province']])
lags = df['Lag']
# Left branch: dense projection of the scalar lag, then batch-norm.
left_input = layers.Input(shape=(1, ))
left_branch = layers.Dense(32)(left_input)
left_branch = layers.BatchNormalization()(left_branch)
right_len = len(countries_and_provinces.columns)
right_input = layers.Input(shape=(right_len, ))
# NOTE(review): Embedding normally expects integer token indices, but here it
# is fed the 0/1 one-hot vector itself (values are < input_dim=32, so it runs,
# embedding each flag separately) -- confirm this is intentional rather than
# an integer-index lookup on a single categorical id.
right_branch = layers.Embedding(32, 32, input_length=right_len)(right_input)
right_branch = layers.Flatten()(right_branch)
right_branch = layers.Dense(32)(right_branch)
right_branch = layers.BatchNormalization()(right_branch)
# Merge both branches; sigmoid -> exponential mirrors the logistic-curve
# emulation used in the earlier summary-level NN cell.
merged = layers.concatenate([left_branch, right_branch])
merged = layers.Dense(32, activation='sigmoid')(merged)
merged = layers.Dense(32, activation='exponential')(merged)
out = layers.Dense(1)(merged)
model = keras.Model(inputs=[left_input, right_input], outputs=out)
# No optimizer argument: Keras falls back to its default ('rmsprop').
model.compile(loss='mse', metrics=['mse', 'mae', 'accuracy'])
x = [lags.values, countries_and_provinces.values]
y = df['Confirmed'].values
hist = model.fit(x, y, epochs=100, verbose=0)
# `hist` is rebound from the Keras History object to its per-epoch metrics.
hist = pd.DataFrame(hist.history)
hist['mse'].plot()
y_pred = model.predict(x)
print ('r2 score', r2_score(y, y_pred))
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-8-cdf91c2c5acf> in <module> 31 32 ---> 33 y_pred = model.predict(x) 34 print ('r2 score', r2_score(y, y_pred)) c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\training.py in predict(self, x, batch_size, verbose, steps, callbacks, max_queue_size, workers, use_multiprocessing) 1011 max_queue_size=max_queue_size, 1012 workers=workers, -> 1013 use_multiprocessing=use_multiprocessing) 1014 1015 def reset_metrics(self): c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in predict(self, model, x, batch_size, verbose, steps, callbacks, max_queue_size, workers, use_multiprocessing, **kwargs) 496 model, ModeKeys.PREDICT, x=x, batch_size=batch_size, verbose=verbose, 497 steps=steps, callbacks=callbacks, max_queue_size=max_queue_size, --> 498 workers=workers, use_multiprocessing=use_multiprocessing, **kwargs) 499 500 c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in _model_iteration(self, model, mode, x, y, batch_size, verbose, sample_weight, steps, callbacks, max_queue_size, workers, use_multiprocessing, **kwargs) 444 model, mode) 445 --> 446 data_iterator = iter(dataset) 447 448 callbacks = cbks.configure_callbacks( c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\tensorflow_core\python\data\ops\dataset_ops.py in __iter__(self) 416 if (context.executing_eagerly() 417 or ops.get_default_graph()._building_function): # pylint: disable=protected-access --> 418 return iterator_ops.OwnedIterator(self) 419 else: 420 raise RuntimeError("__iter__() is only supported inside of tf.function " c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\tensorflow_core\python\data\ops\iterator_ops.py in __init__(self, 
dataset, components, element_spec) 592 context.context().device_spec.device_type != "CPU"): 593 with ops.device("/cpu:0"): --> 594 self._create_iterator(dataset) 595 else: 596 self._create_iterator(dataset) c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\tensorflow_core\python\data\ops\iterator_ops.py in _create_iterator(self, dataset) 617 output_types=self._flat_output_types, 618 output_shapes=self._flat_output_shapes)) --> 619 gen_dataset_ops.make_iterator(ds_variant, self._iterator_resource) 620 # Delete the resource when this object is deleted 621 self._resource_deleter = IteratorResourceDeleter( c:\users\haggais\appdata\local\continuum\miniconda3\lib\site-packages\tensorflow_core\python\ops\gen_dataset_ops.py in make_iterator(dataset, iterator, name) 2694 _result = _pywrap_tensorflow.TFE_Py_FastPathExecute( 2695 _ctx._context_handle, tld.device_name, "MakeIterator", name, -> 2696 tld.op_callbacks, dataset, iterator) 2697 return _result 2698 except _core._FallbackException: KeyboardInterrupt:
# Plot predicted vs. actual Confirmed counts for three provinces.
print (df['Province'].unique())
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
fig.tight_layout(h_pad=7, w_pad=3)
for i, province in enumerate(['Province_Hubei', 'Province_Beijing', 'Province_Hong Kong']):
    # Rows of the one-hot frame belonging to this province.
    province_df = countries_and_provinces[countries_and_provinces[province] == 1]
    # Join once and reuse (the original joined the same frames twice).
    joined = province_df.join(df)
    Lag = joined['Lag']
    y = joined['Confirmed']
    pred = model.predict([Lag.values, province_df.values])
    p = pd.DataFrame()
    p['Predicted'] = pred.flatten()
    p['Confirmed'] = y.values
    p.plot(ax=axes[i], title=province)