#!/usr/bin/env python # coding: utf-8 # # ECON 323 Project # # #### The primary goal of this analysis is to find the factors that impact transmission rate of COVID-19. To be precise, the question we are trying to answer is: # # ### What role does temperature and relative humidity play in the transmission of COVID-19 ? # ### Background # # Coronavirus disease (COVID-19) is an infectious disease caused by a new virus. # The disease causes respiratory illness (like the flu) with symptoms such as a cough, fever, and in more severe cases, difficulty in breathing. You can protect yourself by washing your hands frequently, avoiding touching your face, and avoiding close contact (1 meter or 3 feet) with people who are unwell. # # ##### HOW IT SPREADS # # Coronavirus disease spreads primarily through contact with an infected person when they cough or sneeze. It also spreads when a person touches a surface or object that has the virus on it, then touches their eyes, nose, or mouth. # ### Dataset # # Our aim is to test this claim with the current data that dates from 22 January, 2020 to 22 April, 2020. We selected this specific timeline for two reasons: # # Firstly, our study focuses on the 8 countries that have been affected the most by the virus (we define "most affected" by the "most number of confirmed cases"). Based on the John Hopkins dataset, which was last updated on April 22, the countries with the most confirmed cases include: USA, Spain, Italy, France, Germany, United Kingdom, Turkey, China. # We selected the first data to be January 22, 2020 since that is the date when the first case of the virus was confirmed in the USA. # # Secondly, due to temperature and humitdity data constraints, 22 April 2020 was the last date for which we could collect data on all the countries. # # NOTE: We use "confirmed cases" instead of "number of deaths" to represent the most affected countries because almost all the countries have used different strategies to tackle the spread of the virus. While some countries, such as India and Iran have issue a national lockdown, Japan and South Korea have only issued national "recommendations" regarding social distancing. Furthermore, all the countries may also vary in terms of the population and literacy rate which might play a key role in the amount of interaction the people are exposed to. Number of deaths may be the outcome of the severity of the lockdown or the population or some other factor, therefore, not being the best indicator of the spread of the virus due to temperature-related conditions. # # # ### Methodology # # 1) General visualizations to show the spread of the virus across the world (including confirmed cases, death and recoveries by country) # # 2) Visualization of the trend between in the number of cases for each country over the selected time peiod (for the 8 countries with highest number of confirmed cases)- Formulate Hypothesis based on these visualizations # # 3) Presentation of estimation results from 2 different regression specifications: Ordinary Least Squares Regression and Random Forests Regression # # 4) Discussion of the limitations of our study and areas for further research. # # # # https://www.bbc.com/future/article/20200323-coronavirus-will-hot-weather-kill-covid-19 # # https://www.bbc.com/news/world-52103747 # # ![alt text](image/covid19_map.png "Title") # In[114]: import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # import seaborn as sns import matplotlib.pyplot as plt import plotly_express as px # ## Datasets # ### Display Lastest counts for all Countries # In[115]: # Load all datasets confirmed_orig = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv') deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv') recoveries = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv') confirmed_orig.head(10) # In[116]: # melt datasets to get dates into 1 column ids = ["Province/State","Country/Region", "Lat","Long"] confirmed=confirmed_orig.melt(id_vars=ids, var_name="Date", value_name="cases") deaths=deaths.melt(id_vars=ids,var_name="Date", value_name="deaths") recoveries=recoveries.melt(id_vars=ids,var_name="Date", value_name="recoveries") # In[117]: # get latest data max_date = recoveries.iloc[[-1]]["Date"].iloc[0] latest_data_recoveries = recoveries.loc[recoveries['Date'] == max_date] latest_data_confirmed = confirmed.loc[confirmed['Date'] == max_date] latest_data_deaths = deaths.loc[deaths['Date'] == max_date] latest_data_confirmed.loc[latest_data_confirmed['Country/Region'] == "Canada"] latest_data_confirmed_countries = latest_data_confirmed.groupby('Country/Region').agg('sum') latest_data_deaths_countries = latest_data_deaths.groupby('Country/Region').agg('sum') latest_data_recoveries = latest_data_recoveries.drop(columns=['Province/State', 'Date']).groupby('Country/Region').agg('sum') latest_data_recoveries.head(10) # ### Country wise total defined cases # In[118]: import warnings warnings.filterwarnings('ignore') covid = latest_data_confirmed_countries.join(latest_data_deaths_countries["deaths"]).join(latest_data_recoveries["recoveries"]); covid = covid.rename(columns={"cases": "Confirmed", "deaths": "Deaths", "recoveries": "Recovered"}); latest_data_complete = covid[["Confirmed", "Deaths","Recovered" ]]; latest_data_complete['Active'] = latest_data_complete['Confirmed'] - latest_data_complete['Deaths'] - latest_data_complete['Recovered']; latest_data_complete.sort_values(by="Confirmed", ascending=False).head(10) # In[119]: # latest_data_complete[:15].plot.bar() colors = [ (0.902, 0.902, 0.997), (0.695, 0.695, 0.993), (0.488, 0.488, 0.989), (0.282, 0.282, 0.985), (0.078, 0.078, 0.980) ] fig, ax = plt.subplots(figsize=(15,15)) latest_data_complete.sort_values(by="Confirmed", ascending=True).tail(10).plot(kind="barh", ax=ax, color=colors) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.set_title("Top 10 countries with the Highest number of confirmed cases") # ## Data Visualizations # # #### Load Dataset created with above modifications # In[122]: complete_time_series = pd.read_csv('data/covid_19_clean_complete.csv', parse_dates=['Date']) complete_time_series.tail(5) # In[123]: # Preparing Data for visualizations # Defining COVID-19 cases as per classifications cases = ['Confirmed', 'Deaths', 'Recovered', 'Active'] # Defining Active Case: Active Case = confirmed - deaths - recovered complete_time_series['Active'] = complete_time_series['Confirmed'] - complete_time_series['Deaths'] - complete_time_series['Recovered'] # Renaming Mainland china as China in the data table complete_time_series['Country/Region'] = complete_time_series['Country/Region'].replace('Mainland China', 'China') # filling missing values complete_time_series[['Province/State']] = complete_time_series[['Province/State']].fillna('') complete_time_series[cases] = complete_time_series[cases].fillna(0) # In[124]: complete_time_series[complete_time_series["Country/Region"] == "US"] # In[126]: complete_time_series["ln(confirmed)"] = np.log1p(complete_time_series["Confirmed"] ) px.line(complete_time_series.sort_values(by="Confirmed", ascending=False), x='Date', y='Confirmed', color='Country/Region',title='COVID19 Total Confrimed Cases growth for top 10 worst affected countries'); # In[127]: px.line(complete_time_series, x='Date', y='Deaths', color='Country/Region', title='COVID19 Total Deaths growth for top 10 worst affected countries') # ***A Time-series graph of the confirmed and recovered cases of COVID-19*** # In[128]: import plotly as py import plotly.graph_objects as go import pandas as pd from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot init_notebook_mode(connected=True) fig = go.Figure() fig.add_trace(go.Scatter( x=complete_time_series.Date, y=complete_time_series['Confirmed'], name="Confirmed", line_color='deepskyblue', opacity=0.8)) fig.add_trace(go.Scatter( x=complete_time_series.Date, y=complete_time_series['Recovered'], name="Recovered", line_color='green', opacity=0.6)) fig.update_layout(title_text='Progression of cases since the beginning', xaxis_rangeslider_visible=True) py.offline.iplot(fig) # 1. Confirmed cases increasing exponentially between 1st week to 3rd week of Feb # 1. Recovered cases increasing exponentially from 4th week of March # 1. Growth rate of confirmed cases significantly drops from March 1st week # ***Interactive globe showing the growth of the pandemic with respect to confirmed cases of COVID19*** # In[129]: latest_data = latest_data_complete.reset_index() # In[130]: import plotly.offline as py py.init_notebook_mode(connected=True) # Calculating the count of confirmed cases by country countries = np.unique(latest_data['Country/Region']) confirmed_mean = [] for country in countries: confirmed_mean.append(latest_data[latest_data['Country/Region'] == country]['Confirmed'].sum()) # Building the dataframe data = [ dict( type = 'choropleth', locations = countries, z = confirmed_mean, locationmode = 'country names', text = countries, marker = dict( line = dict(color = 'rgb(0,0,0)', width = 1)), colorbar = dict(autotick = True, tickprefix = '', title = 'Count') ) ] # Building the visual layout = dict( title = 'COVID-19 Confirmed Cases', geo = dict( showframe = False, showocean = True, oceancolor = 'rgb(0,255,255)', projection = dict( type = 'orthographic', rotation = dict( lon = 60, lat = 10), ), lonaxis = dict( showgrid = True, gridcolor = 'rgb(102, 102, 102)' ), lataxis = dict( showgrid = True, gridcolor = 'rgb(102, 102, 102)' ) ), ) fig = dict(data=data, layout=layout) py.iplot(fig, validate=False, filename='worldmap') # **Interactive Globe displaying the average land temperatures of the planet** # In[131]: temp_country = pd.read_csv("data/GlobalLandTemperaturesByCountry.csv") # In[132]: # temp_country[temp_country["Country"] == "Spain"] py.init_notebook_mode(connected=True) ## Removing the duplicates temp_country_clear = temp_country[~temp_country['Country'].isin( ['Denmark', 'Antarctica', 'France', 'Europe', 'Netherlands', 'United Kingdom', 'Africa', 'South America'])] temp_country_clear = temp_country_clear.replace( ['Denmark (Europe)', 'France (Europe)', 'Netherlands (Europe)', 'United Kingdom (Europe)'], ['Denmark', 'France', 'Netherlands', 'United Kingdom']) #Calculating average temperature by country countries = np.unique(temp_country_clear['Country']) mean_temp = [] for country in countries: mean_temp.append(temp_country_clear[temp_country_clear['Country'] == country]['AverageTemperature'].mean()) # Building the data frame data = [ dict( type = 'choropleth', locations = countries, z = mean_temp, locationmode = 'country names', text = countries, marker = dict( line = dict(color = 'rgb(0,0,0)', width = 1)), colorbar = dict(autotick = True, tickprefix = '', title = '# Average\nTemperature,\n°C') ) ] # Building the visual layout = dict( title = 'GLOBAL AVERAGE LAND TEMPERATURES', geo = dict( showframe = False, showocean = True, oceancolor = 'rgb(0,255,255)', projection = dict( type = 'orthographic', rotation = dict( lon = 60, lat = 10), ), lonaxis = dict( showgrid = True, gridcolor = 'rgb(102, 102, 102)' ), lataxis = dict( showgrid = True, gridcolor = 'rgb(102, 102, 102)' ) ), ) fig = dict(data=data, layout=layout) py.iplot(fig, validate=False, filename='worldmap') # **Display of progression of spread of COVID19 across the World** # In[159]: # import plotly.express as px # import plotly.offline as py py.init_notebook_mode(connected=True) complete_time_series_globe = complete_time_series.groupby(['Date', 'Country/Region'])['Confirmed', 'Deaths', 'Recovered'].max() complete_time_series_globe = complete_time_series_globe.reset_index() complete_time_series_globe['Date'] = pd.to_datetime(complete_time_series_globe['Date']) complete_time_series_globe['Date'] = complete_time_series_globe['Date'].dt.strftime('%m/%d/%Y') complete_time_series_globe['size'] = complete_time_series_globe['Confirmed'].pow(0.2) fig = px.scatter_geo(complete_time_series_globe, locations="Country/Region", locationmode='country names', color="Confirmed", size='size', hover_name="Country/Region", range_color= [0, max(complete_time_series_globe['Confirmed'])+2], projection="natural earth", animation_frame="Date", title='Progression of spread of COVID-19') fig.update(layout_coloraxis_showscale=False) py.offline.iplot(fig) # The spread is much more than in the temperate zone of the planet compared to tropical zones. # Moreover, drier parts of the world has lesser transmission. # You , can play this multiple times to get a deeper understanding # ### Analysis # According to AccuWeather, these are some of the general findings regarding the spread of viruses and it's relationship to average temperature # # 1. Temperature plays an important role in the transmission of the virus. # 1. The virus would not be be able to survive for long in warmer climates or surroundings # 1. There is a higher rate of transmission in the temperate zone of the planet, where the average temp is about 5-10 degrees Celsius # 1. However, there is a relatively low transmission rate in the tropical zone, where the avg. temp is about 25 degrees celsius or higher. # # # # # - https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3551767 # # # We aim to test these general findings by focusing our study on the most affected regions in the world. # ### Key Assumption: # We acknowledge that our findings would be inaccurate if we use a national average for temperature and humidity (for all the countries we are going to examine) since some of the countries (such as USA and China) are spread out over a huge land mass and different parts of the country experience quite varied average temperatures. In order to deal with this issue, we only look at the specific area in each country which has been affected the most. # # In USA, we only look at New York and in China, we only take into account Hubei. These two provinces/states have the highest (and the majority proportion) of confirmed cases in the country. Firstly, for USA, New York has more confirmed cases than all the other countries (New York has 258,222 number of cases, while Spain, which has the second highest number of cases has 208,839 confirmed cases). As for Hubei, it has 68,128 confirmed cases (being the 8th in order still behind Turkey, which has 98,674 cases). Thus these "regions" still follow the same order and their number of confirmed cases are amongst the top 8 across the world. # # # NOTE: The ranking is based on the number of confirmed cases as of 22 April, 2020. # # Data collection for individual Regions of high impact # ### New York # In[137]: # New york data - Explain why New York #TODO confirmed_US = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') # Drop unwanted Columns confirmed_US = confirmed_US.drop(columns=["UID", "iso2", "iso3", "code3", "FIPS", "Admin2", "Lat", "Long_","Combined_Key", "Country_Region" ]) confirmed_US # Get names of indexes for which Province_state is not New York indexNames = confirmed_US[ confirmed_US['Province_State'] != "New York" ].index # Delete these row indexes from dataFrame confirmed_US.drop(indexNames , inplace=True) confirmed_US # aggregate different counties into one row new_york = confirmed_US.groupby('Province_State').agg('sum') new_york new_york= new_york.reset_index() # Melt date into one column new_york = new_york.melt(id_vars="Province_State", var_name="Date", value_name="cases") new_york = new_york.rename(columns={"Province_State": "Region"}) new_york.head(8) # ### CHINA (Hubei) # In[138]: china = confirmed[confirmed["Country/Region"] == "China"] hubei = china[china["Province/State"] == "Hubei"] # confirmed hubei = hubei.drop(columns=["Country/Region", "Lat", "Long"]) hubei = hubei.rename(columns={"Province/State": "Region"}) hubei.tail(8) # ## Others # # ### Spain, Italy, France, Germany, Turkey, United Kingdom # # # In[139]: regions = ["Spain", "Italy", "France", "Germany", "Turkey", "United Kingdom"] Others = confirmed_orig[confirmed_orig['Country/Region'].isin(regions)] # remove provincial data and keep country data Others = Others[Others['Province/State'].isna()] Others = Others.drop(columns=["Province/State", "Lat", "Long"]).rename(columns={"Country/Region": "Region"}) Others = Others.melt(id_vars="Region", var_name="Date", value_name="cases") Others.head(8) # ## Combine all datasets (Region, Dates, Cases) # In[140]: frames = [new_york, hubei, Others] all_regions = pd.concat(frames) regions += ["Hubei", "New York"] # Change Date type to Date Object all_regions['Date'] = pd.to_datetime(all_regions['Date']) all_regions['Date'] = all_regions['Date'].dt.strftime('%m/%d/%Y') all_regions = all_regions.sort_values(by=[ "Date"], ascending=True) all_regions.head(8) # ### Get Temperature Humidity data # # Temperature and Humidity data was scraped from World Weather Online API using a Python Script as shown below. # In[1]: # Temperature_Humidity_data/WWO.py # import requests # import csv # regions = ["Hubei", "Spain", "Italy", "France", # "Germany", "Turkey", "UK", "New York"] # columns = [["Date", "Region", "AvgTemp", "Humidity"]] # # print(list_of_dates) # for region in regions: # dates = ["2020-01-22", "2020-02-26", "2020-04-01"] # for st_date in dates: # url = 'http://api.worldweatheronline.com/premium/v1/past-weather.ashx?key=fe87d912d7e045e6b9a91044202304&q=' + \ # region+'&format=json&date=' + st_date + '&enddate=2020-04-22' # resp = requests.get(url=url) # data = resp.json() # if region == "UK": # region = "United Kingdom" # list_of_dates = data["data"]["weather"] # for date in list_of_dates: # first_date = date # # print(first_date["date"]) # day = first_date["date"] # avg_temp = first_date["avgtempC"] # avg_humidity = 0 # count = 0 # for data in first_date["hourly"]: # humidity = float(data["humidity"]) # # print(humidity, " avg - ", avg_humidity) # avg_humidity = avg_humidity + humidity # count += 1 # avg_humidity = avg_humidity/count # # print(first_date["date"], avg_temp, humidity) # columns.append([day, region, avg_temp, avg_humidity]) # with open('Avg_Temperature_Humidity_data.csv', 'w', newline='') as file: # writer = csv.writer(file) # writer.writerows(columns) # In[141]: avg_temp_hum = pd.read_csv('Temperature_Humidity_data/Avg_Temperature_Humidity_data.csv', parse_dates = True) avg_temp_hum.info() # Change Date type to Date Object avg_temp_hum['Date'] = pd.to_datetime(avg_temp_hum['Date']) avg_temp_hum['Date'] = avg_temp_hum['Date'].dt.strftime('%m/%d/%Y') avg_temp_hum.sort_values(by=[ "Date"], ascending=True) # ## Merge the two datasets, by adding AvgTemp and Humidity to the confirmed cases timeseries dataset (all_regions) # In[142]: all_regions.head(8) # In[143]: all_regions.info() avg_temp_hum.info() # In[144]: # Since we only have data u to 22nd April (Limited access to API) split_date = '04/22/2020' all_regions[all_regions["Date"] <= split_date].sort_values(by=["Region","Date"], ascending=True).tail(10) temp_hum_cases = pd.merge(avg_temp_hum, all_regions, how='left', left_on=["Date", "Region"], right_on = ["Date", "Region"]) temp_hum_cases.tail(8) # #### Using Excel to add new column for the increase in number of cases per day # In[145]: temp_hum_cases.to_csv("data/temp_hum_cases.csv") # In[146]: temp_hum_cases_daily = pd.read_csv('data/temp_hum_new_cases.csv', parse_dates = True, index_col=0) temp_hum_cases_daily['Date'] = pd.to_datetime(temp_hum_cases_daily['Date']) temp_hum_cases_daily['Date'] = temp_hum_cases_daily['Date'].dt.strftime('%m/%d/%Y') temp_hum_cases_daily.sort_values(by="Date", ascending=True) # #### Visualizations for correlation between Temperature, Humidity and Cases_per_day # In[147]: from IPython.display import Markdown, display import seaborn as sns # Use seaborn style defaults and set the default figure size sns.set(rc={'figure.figsize':(15, 4)}) def printmd(string): display(Markdown(string)) # In[150]: printmd('###