#!/usr/bin/env python
# coding: utf-8

# Useful links
#
# - https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/
# - http://karpathy.github.io/2015/05/21/rnn-effectiveness/
# - https://stackabuse.com/time-series-prediction-using-lstm-with-pytorch-in-python/
# - https://towardsdatascience.com/pytorch-basics-how-to-train-your-neural-net-intro-to-rnn-cb6ebc594677
# - https://towardsdatascience.com/time-series-forecasting-with-rnns-ff22683bbbb0
# - https://towardsdatascience.com/time-series-forecasting-with-recurrent-neural-networks-74674e289816
# - https://towardsdatascience.com/analyzing-time-series-data-in-pandas-be3887fdd621
# - http://www.wildml.com/2015/10/recurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients/
# - https://colah.github.io/posts/2015-09-NN-Types-FP/
# - https://pytorch.org/docs/stable/nn.html#rnn

# In[1]:

import datetime as dt
import traceback as tb

import numpy as np
import pandas as pd
import requests as rq
import torch
from torch.utils import data as tdt

# Short aliases used throughout the script.
tnn = torch.nn
top = torch.optim


# ### Download India's data

# In[2]:

# Get the daily time series data from covid19india.
# A timeout keeps the script from hanging forever on a dead endpoint, and
# raise_for_status() surfaces HTTP errors before the JSON is parsed.
resp = rq.get("https://api.covid19india.org/data.json", timeout=30)
resp.raise_for_status()
ts = resp.json()['cases_time_series']

r = {
    "date": [],
    "confirmed": [],
    "deceased": [],
    "recovered": [],
}

for d in ts:
    # Parse every field BEFORE appending anything: if one field is malformed
    # the whole record is skipped, so the four column lists always keep the
    # same length (pd.DataFrame requires that).
    try:
        # The year is not taken from the feed; 2020 is hard-coded here.
        date = dt.datetime.strptime(d['date'].strip() + " 2020", '%d %B %Y')
        confirmed = np.int64(d['dailyconfirmed'])
        deceased = np.int64(d['dailydeceased'])
        recovered = np.int64(d['dailyrecovered'])
    except Exception:
        # Best effort: report the offending record and carry on.
        print(d.get('date'))
        tb.print_exc()
        continue
    r['date'].append(date)
    r['confirmed'].append(confirmed)
    r['deceased'].append(deceased)
    r['recovered'].append(recovered)

df = pd.DataFrame(r)
df.sort_values('date', inplace=True)
# Bare expression: the result is discarded when this runs as a plain script.
df.sample()


# ### Italy's and Spain's data

# In[3]:

it_df = pd.read_csv('csv/italy.csv')
print(it_df.sample())
es_df = pd.read_csv('csv/spain.csv')
print(es_df.sample())


# ### Prep data

# In[5]:

def get_rnn_dataset(series, seq_len):
    """Build a next-step-prediction dataset from a 1-D series.

    Each sample is a pair of windows of length ``seq_len``: the input window
    ``series[i : i + seq_len]`` and the target window shifted one step ahead,
    ``series[i + 1 : i + seq_len + 1]``.

    Args:
        series: 1-D sequence of numbers (converted to float32).
        seq_len: Window length; must be at least 1.

    Returns:
        A ``TensorDataset`` of ``len(series) - seq_len`` (input, target)
        pairs, each tensor of shape ``(seq_len,)``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1 or the series has fewer
            than ``seq_len + 1`` values (no complete window pair exists).
    """
    values = np.asarray(series, dtype=np.float32)
    num_windows = len(values) - seq_len
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    if num_windows < 1:
        raise ValueError(
            f"series of length {len(values)} is too short for seq_len={seq_len}"
        )

    ips = np.stack([values[i:i + seq_len] for i in range(num_windows)])
    ops = np.stack([values[i + 1:i + seq_len + 1] for i in range(num_windows)])

    # convert from numpy to torch
    return tdt.TensorDataset(torch.from_numpy(ips), torch.from_numpy(ops))


SEQ_LEN = 5
VAL_SPLIT = 0.5
# Population factors: population / 1000 ("Cr" = crore = 10 million).
IN_POP_FCT = 1300000  # 130Cr / 1000
IT_POP_FCT = 60000  # 6Cr / 1000
ES_POP_FCT = 47000  # 4.7Cr / 1000

# Preprocess data: centred 6-day moving average, then divide by the
# population factor (i.e. daily cases per 1000 inhabitants).
# NOTE(review): the first 37 rows of the India series are skipped; the reason
# is not visible in this file - confirm.
in_cnf = np.array(df['confirmed'][37:].rolling(6, center=True, min_periods=1).mean()) / IN_POP_FCT
it_cnf = np.array(it_df['daily'].rolling(6, center=True, min_periods=1).mean()) / IT_POP_FCT
es_cnf = np.array(es_df['daily'].rolling(6, center=True, min_periods=1).mean()) / ES_POP_FCT

# Choose among India or Italy or Spain here
cnf = es_cnf  # np.append(it_cnf, es_cnf)
dataset = get_rnn_dataset(cnf, SEQ_LEN)
val_len = int(VAL_SPLIT * len(dataset))
train_len = len(dataset) - val_len
train_set, val_set = tdt.random_split(dataset, (train_len, val_len))
trn_loader = tdt.DataLoader(train_set, shuffle=True, batch_size=1)
val_loader = tdt.DataLoader(val_set, shuffle=True, batch_size=1)
all_loader = tdt.DataLoader(dataset, shuffle=False, batch_size=1)


# ### Define NN and train

# In[13]:

class Forecaster(tnn.Module):
    """RNN followed by a linear layer and a sigmoid, applied per time step.

    The model handles one sequence at a time: inputs are viewed as
    ``(seq_len, 1, 1)`` and outputs as ``(seq_len,)``.
    """

    def __init__(self, seq_len=1, hidden_size=1, num_layers=1):
        """Create the layers.

        Args:
            seq_len: Number of time steps in every input/output window.
            hidden_size: Size of the RNN hidden state.
            num_layers: Number of stacked RNN layers.
        """
        super(Forecaster, self).__init__()
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = tnn.RNN(input_size=1, hidden_size=self.hidden_size, num_layers=self.num_layers)
        # self.gru = tnn.GRU(input_size=1, hidden_size=self.hidden_size, num_layers=self.num_layers)
        # self.lstm = tnn.LSTM(input_size=1, hidden_size=self.hidden_size, num_layers=self.num_layers)
        self.linear = tnn.Linear(self.hidden_size, 1)
        self.sigmoid = tnn.Sigmoid()

    def forward(self, ip, h=None):
        """Run one window through the network.

        Args:
            ip: Input tensor of shape ``(seq_len, 1, 1)``.
            h: Optional initial hidden state passed straight to the RNN.

        Returns:
            A tuple ``(out, h_n)`` where ``out`` has shape ``(seq_len,)``
            (one sigmoid output per time step) and ``h_n`` is the RNN's
            final hidden state.
        """
        rnn_out, rnn_h = self.rnn(ip, h)
        # Linear + sigmoid on all time steps in one batched call; only the
        # first seq_len steps are used.
        final_out = self.sigmoid(self.linear(rnn_out[:self.seq_len]))
        return final_out.view(self.seq_len), rnn_h

    def predict(self, ip, num_predictions=1):
        """Roll the model forward without tracking gradients.

        Each step feeds the previous output window (and hidden state) back
        into the model and keeps the last element of the new window.

        Args:
            ip: Seed window with ``seq_len`` elements.
            num_predictions: Number of steps to predict; must be >= 1.

        Returns:
            A 1-D tensor with ``num_predictions`` values.

        Raises:
            ValueError: If ``num_predictions`` is smaller than 1.
        """
        if num_predictions < 1:
            raise ValueError(
                f"num_predictions must be >= 1, got {num_predictions}"
            )
        preds = []
        with torch.no_grad():
            pred, h = ip, None
            for _ in range(num_predictions):
                pred, h = self.forward(pred.view(self.seq_len, 1, 1), h)
                preds.append(pred.view(self.seq_len)[-1])
        return torch.stack(preds)


HIDDEN_SIZE = 2
NUM_LAYERS = 1
LEARNING_RATE = 0.001
NUM_EPOCHS = 5000

model = Forecaster(seq_len=SEQ_LEN, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS)
loss_fn = tnn.MSELoss()
optimizer = top.Adam(model.parameters(), lr=LEARNING_RATE)

# TRAIN
trn_loss_vals = []
val_loss_vals = []
for e in range(NUM_EPOCHS):
    model.train()
    trn_losses = []
    for ip, op in trn_loader:
        optimizer.zero_grad()  # set grads to 0
        pred, _ = model(ip.view(SEQ_LEN, 1, 1))  # predict
        loss = loss_fn(pred, op.view(SEQ_LEN))  # calc loss
        loss.backward()  # calc and assign grads
        optimizer.step()  # update weights
        # Detach before storing: only the value is needed for logging, not
        # the autograd history.
        trn_losses.append(loss.detach())
    avg_trn_loss = torch.stack(trn_losses).mean().item()
    trn_loss_vals.append(avg_trn_loss)

    model.eval()
    with torch.no_grad():
        val_losses = []
        for ip, op in val_loader:
            pred, _ = model(ip.view(SEQ_LEN, 1, 1))
            loss = loss_fn(pred, op.view(SEQ_LEN))
            val_losses.append(loss)
        avg_val_loss = torch.stack(val_losses).mean().item()
        val_loss_vals.append(avg_val_loss)

    if e % 100 == 0:
        print("epoch:", f"{e:3}", "avg_val_loss:", f"{avg_val_loss: .5f}", "avg_trn_loss:", f"{avg_trn_loss: .5f}")

df_trn_loss = pd.DataFrame({
    'trn_loss': trn_loss_vals,
    'val_loss': val_loss_vals,
})
_ = df_trn_loss.plot(
    y=['trn_loss', 'val_loss'],
    title=['Training loss per epoch', 'Validation loss per epoch'],
    subplots=True,
    figsize=(5, 8),
    sharex=False,
    logy=True,
)


# ### Test
# - future values on any country's data

# In[15]:

# CHOOSE FROM IN, IT OR ES HERE (keep the series and its population factor
# in sync so the values are scaled back correctly)
df_cnf = in_cnf
test_pop_fct = IN_POP_FCT
# df_cnf = np.array([8000, 7600, 7300, 6900, 6650], dtype=np.float32) / 10000
test_in = df_cnf[-SEQ_LEN:]
t = torch.tensor(
    test_in.reshape(SEQ_LEN, 1, 1),
    dtype=torch.float32,
)
print("IN:", t.view(SEQ_LEN) * test_pop_fct)
out = model.predict(t, num_predictions=30)
print("OUT:", out * test_pop_fct)
orig_df = pd.DataFrame({
    'actual': df_cnf * test_pop_fct,
})
fut_df = pd.DataFrame({
    'predicted': (out.numpy() * test_pop_fct),
})
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
orig_df = pd.concat([orig_df, fut_df], ignore_index=True, sort=False)
_ = orig_df.plot()


# - on training data

# In[16]:

# change this according to country selected for training
pop_fct = ES_POP_FCT
pred_vals = []
out_vals = []
for ip, op in all_loader:
    pred = model.predict(ip.view(SEQ_LEN, 1, 1))
    pred_vals.append(pred.item() * pop_fct)
    out_vals.append(op.view(SEQ_LEN)[-1].item() * pop_fct)

cmp_df = pd.DataFrame({
    'predicted cases': pred_vals,
    'actual cases': out_vals,
})
_ = cmp_df.plot()


# In[13]:

# Dump the learned parameters.
for k, v in model.state_dict().items():
    print(k, "=", v)