"""Explore the OWID COVID-19 dataset and forecast the next day's deaths.

The script loads ``owid-covid-data.csv`` from Google Drive, draws exploratory
plots, builds a per-country comparison table of new cases/deaths per million,
trains a ``HistGradientBoostingRegressor`` on ``new_deaths`` and predicts
today's value for every location present on the most recent day of data.

The pure-pandas helpers are defined at module level; everything that needs
Colab, matplotlib, seaborn or scikit-learn runs under the ``__main__`` guard.
"""
import datetime
from datetime import date
from datetime import timedelta
from functools import reduce

import numpy as np
import pandas as pd

DRIVE_MOUNT_POINT = '/content/drive'
DATA_PATH = '/content/drive/MyDrive/owid-covid-data.csv'

# Text columns: missing values become 'Not Listed', then they are
# ordinal-encoded before modelling.
CATEGORICAL_COLUMNS = ('iso_code', 'continent', 'location', 'tests_units')

# Columns removed from the feature matrix (the target plus the case/death
# columns listed by the original script, and iso_code).
EXCLUDED_FEATURES = [
    'iso_code', 'new_deaths', 'total_cases', 'new_cases_smoothed',
    'total_deaths', 'new_deaths_smoothed', 'total_cases_per_million',
    'new_cases_per_million', 'new_cases_smoothed_per_million',
    'total_deaths_per_million', 'new_deaths_per_million',
    'new_deaths_smoothed_per_million',
]

# (key, location value in the dataset, y-axis label prefix, leading rows to drop)
# NOTE(review): the row offsets are copied verbatim from the original script;
# presumably they aligned the series of one particular data snapshot -- confirm
# they are still wanted.
COUNTRIES = (
    ("uk", "United Kingdom", "uk", 28),
    ("us", "United States", "us", 37),
    ("fr", "France", "france", 35),
    ("it", "Italy", "italy", 28),
    ("br", "Brazil", "brazil", 2),
    ("p", "Portugal", "portugal", 2),
    ("sp", "Spain", "spain", 27),
    ("i", "India", "india", 29),
    ("c", "China", "china", 37),
    ("t", "Taiwan", "taiwan", 43),
    ("j", "Japan", "japan", 37),
    ("nz", "New Zealand", "new_zealand", 0),
    ("a", "Australia", "australia", 33),
    ("sa", "South Africa", "sa", 21),
    ("n", "Nigeria", "n", 0),
    ("s", "Sweden", "sweden", 27),
    ("d", "Denmark", "denmark", 26),
)

# Order in which the per-country tables are merged; it fixes the column order
# of ``compare``.
MERGE_ORDER = ("uk", "us", "c", "sa", "s", "t", "d", "n", "j", "i", "nz", "a",
               "sp", "br", "p", "it", "fr")


def unique_in_order(values):
    """Return the distinct items of *values*, keeping first-seen order."""
    return list(dict.fromkeys(values))


def latest_complete_day(last_seen, today):
    """Return *last_seen* capped at the day before *today*.

    Args:
        last_seen: ``pd.Timestamp`` of the newest date in the data.
        today: ``datetime.date`` used as the reference day.

    Returns:
        The earlier of *last_seen* and yesterday, as a ``pd.Timestamp``.
    """
    yesterday = pd.Timestamp(today - timedelta(days=1))
    return min(last_seen, yesterday)


def rows_for_location(frame, location):
    """Return the rows of *frame* whose ``location`` column equals *location*."""
    return frame[frame['location'] == location]


def per_million_table(frame, prefix):
    """Build the date / new-cases / new-deaths per-million table for one country.

    The ``date`` column is taken from *frame* itself, so every row keeps the
    date it was recorded on. The result keeps *frame*'s index.

    Args:
        frame: rows of a single country.
        prefix: short key used to name the two value columns.
    """
    return pd.DataFrame(
        {
            'date': frame['date'].values,
            f'{prefix}_new_cse_pr_million': frame['new_cases_per_million'].values,
            f'{prefix}_new_dth_pr_million': frame['new_deaths_per_million'].values,
        },
        index=frame.index,
    )


def merge_on_date(tables):
    """Inner-join *tables* on ``date``, left to right.

    *tables* must contain at least one DataFrame.
    """
    return reduce(lambda left, right: left.merge(right, on='date'), tables)


def fill_missing(frame):
    """Return a copy of *frame* with no missing values.

    The columns in ``CATEGORICAL_COLUMNS`` get ``'Not Listed'``; every other
    missing value becomes ``0``.
    """
    filled = frame.copy()
    for column in CATEGORICAL_COLUMNS:
        filled[column] = filled[column].fillna('Not Listed')
    return filled.fillna(0)


def to_death_counts(raw_predictions):
    """Truncate predictions to integers and clip negative values to zero."""
    return np.clip(np.asarray(raw_predictions).astype(int), 0, None)


def plot_per_million(frame, label_prefix):
    """Plot new cases and new deaths per million over time, one figure each."""
    import matplotlib.pyplot as plt

    for measure in ('new_cases_per_million', 'new_deaths_per_million'):
        # A fresh figure per series: Series.plot() would otherwise draw onto
        # whatever axes happen to be current.
        _, ax = plt.subplots(figsize=(16, 8))
        frame.set_index('date')[measure].plot(ax=ax)
        ax.set_title('Time Series')
        ax.set_xlabel("date")
        ax.set_ylabel(f"{label_prefix}_{measure}")


if __name__ == "__main__":
    # Imported here so the helpers above can be imported without Colab,
    # plotting libraries or scikit-learn installed.
    from google.colab import drive
    from matplotlib import pyplot
    from matplotlib import pyplot as plt
    import seaborn as sns
    from sklearn import preprocessing
    from sklearn.preprocessing import OrdinalEncoder
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score
    try:
        # Only required by scikit-learn releases in which the estimator was
        # still experimental; tolerate releases that no longer ship the shim.
        from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    except ImportError:
        pass
    from sklearn.ensemble import HistGradientBoostingRegressor

    # ---- load dataset (Drive must be mounted before the file can be read)
    drive.mount(DRIVE_MOUNT_POINT)
    train = pd.read_csv(DATA_PATH)
    train.info()

    train['date'] = pd.to_datetime(train['date'], errors='coerce')
    train['date_num'] = train['date'].dt.strftime('%d%m%Y')

    list_country = unique_in_order(train['location'])
    print(len(list_country))

    # ---- pick the most recent day that is not later than yesterday
    today = date.today()
    print("Today is: ", today)
    yesterday = today - timedelta(days=1)
    print("Yesterday was: ", yesterday)
    newest_day = train['date'].max()
    print(type(newest_day))
    yesterday = pd.Timestamp(yesterday)
    print(type(yesterday))
    newest_day = latest_complete_day(newest_day, today)
    # The name ``last_date`` is kept from the original script: it holds the
    # rows recorded on that day, not the date itself.
    last_date = train[train['date'] == newest_day]

    # ---- exploratory plots
    for column in ('new_cases', 'new_deaths'):
        plt.figure()
        train[column].hist()

    for column, upper in (('new_cases', 1000000), ('new_deaths', 30000)):
        train[[column, 'location']].plot.scatter(
            x='location', y=column, ylim=(0, upper), s=32)

    for column, upper in (('new_cases', 500000), ('new_deaths', 20000)):
        _, ax = plt.subplots(figsize=(14, 8))
        sns.boxplot(x='continent', y=column,
                    data=train[[column, 'continent']], ax=ax)
        ax.set_ylim(0, upper)

    # Correlation is only defined for numeric columns; selecting them
    # explicitly works on every pandas version.
    corrmat = train.select_dtypes(include='number').corr()
    _, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)

    # ---- per-country series
    country_frames = {}   # key -> all rows of the country
    revised_frames = {}   # key -> rows left after dropping the leading offset
    compare_frames = {}   # key -> date + per-million table
    for key, location, label, offset in COUNTRIES:
        country_frames[key] = rows_for_location(train, location)
        plot_per_million(country_frames[key], label)
        revised_frames[key] = country_frames[key][offset:]
        compare_frames[key] = per_million_table(revised_frames[key], key)

    # Republish the original flat variable names so interactive code written
    # against them keeps working. The never-read ``exclude_*`` frames (each a
    # near-complete copy of the dataset) are intentionally not rebuilt.
    for key in country_frames:
        globals().update({
            f"include_{key}": country_frames[key],
            f"{key}_cov19": country_frames[key],
            f"revised_{key}_cov19": revised_frames[key],
            f"{key}_new_case_per_million":
                revised_frames[key]['new_cases_per_million'],
            f"{key}_new_death_per_million":
                revised_frames[key]['new_deaths_per_million'],
            f"{key}_compare": compare_frames[key],
        })

    per_million = ["new_cases_per_million", "new_deaths_per_million"]
    train.plot(x="date_num", y=per_million)
    country_frames["uk"].plot(x="date_num", y=per_million)

    compare = merge_on_date([compare_frames[key] for key in MERGE_ORDER])
    compare = compare.fillna(0)
    compare.plot(x="date", y=["uk_new_cse_pr_million", "i_new_cse_pr_million"])
    compare.plot(x="date", y=["uk_new_dth_pr_million", "i_new_dth_pr_million"])

    # ---- feature preparation
    train = fill_missing(train.drop(columns=['date']))

    # One encoder per column, fitted on the training data only, so the same
    # category maps to the same code at prediction time.
    encoders = {}
    for column in CATEGORICAL_COLUMNS:
        encoders[column] = OrdinalEncoder()
        train[column] = encoders[column].fit_transform(train[[column]]).ravel()

    y = train[['new_deaths']]
    X = train.drop(columns=EXCLUDED_FEATURES)

    # NOTE(review): ``date_num`` (a ddmmyyyy string) is still among the
    # features here, as in the original; read as a number it is not
    # chronological -- confirm this is intended.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # ---- hold out 10% of the rows for validation
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.10, random_state=1, shuffle=True)

    model = HistGradientBoostingRegressor(max_iter=2000, random_state=1)
    model.fit(X_train, y_train.values.ravel())
    print(model.score(X_train, y_train))

    y_pred = to_death_counts(model.predict(X_val))
    print(model.score(X_val, y_val))
    # Score the clipped integer predictions against the true values.
    print(r2_score(y_val, y_pred))

    # ---- plot training targets, then validation truth vs. prediction
    padding = np.full(len(y_train), np.nan)
    plt.figure()
    plt.plot(y_train.values.ravel())
    plt.plot(np.concatenate([padding, y_val.values.ravel()]))
    plt.plot(np.concatenate([padding, y_pred]))
    plt.show()

    df_val = pd.DataFrame({'actual_deaths': y_val.values.ravel(),
                           'predicted_deaths': y_pred})
    df_val.reset_index(drop=True, inplace=True)

    # ---- predict today's deaths from the most recent day's rows
    test = last_date.copy()
    test['date'] = pd.Timestamp(today)
    test['date_num'] = test['date'].dt.strftime('%d%m%Y')
    predict_date = test['date']

    test = fill_missing(test.drop(columns=['date']))
    test_country = test['location']  # keep the readable names before encoding
    for column in CATEGORICAL_COLUMNS:
        test[column] = encoders[column].transform(test[[column]]).ravel()

    X_test = scaler.transform(test.drop(columns=EXCLUDED_FEATURES))
    prediction = to_death_counts(model.predict(X_test))

    df_pred = pd.DataFrame({'country': test_country,
                            'predicted_date': predict_date,
                            'predicted_deaths': prediction})
    df_pred.reset_index(drop=True, inplace=True)
    # A bare ``df_pred`` only renders inside a notebook cell; print it so a
    # script run shows the result as well.
    print(df_pred)