# for calendar map
# ! pip install calmap
# to get continent name from country name
# ! pip install pycountry_convert
# to get access to interactive plots
# ! pip install plotly
# datetime operations
from datetime import timedelta
# for numerical analysis
import numpy as np
# to access and use dataframes
import pandas as pd
# basic visualization package
import matplotlib.pyplot as plt
# advanced plotting
import seaborn as sns
# interactive visualization
import plotly.express as px
import plotly.graph_objs as go
# import plotly.figure_factory as ff
from plotly.subplots import make_subplots
# for offline plotting
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
Create a color palette to give all the graphs a uniform theme and to reduce redundancy.
# Color palette shared by the charts below; the four values are bound to
# cnf, dth, rec and act in that order.
cnf = '#ef476f'
dth = '#ffd166'
rec = '#06d6a0'
act = '#118ab2'

# Full data
full_table = pd.read_csv('input/covid_19_clean_complete.csv')

# Grouped by day, country ('Date' is converted from text to datetime)
full_grouped = pd.read_csv('input/full_grouped.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Day wise ('Date' is converted from text to datetime)
day_wise = pd.read_csv('input/day_wise.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Country wise: empty strings and missing values both become 0
country_wise = (
    pd.read_csv('input/country_wise_latest.csv')
    .replace('', np.nan)
    .fillna(0)
)

# Worldometer data: empty strings and missing values both become 0
worldometer_data = (
    pd.read_csv('input/worldometer_data.csv')
    .replace('', np.nan)
    .fillna(0)
)
# Treemap of the Active / Deaths / Recovered values taken from the last row
# of day_wise.
last_row = day_wise[['Date', 'Deaths', 'Recovered', 'Active']].tail(1)
temp = last_row.melt(id_vars="Date", value_vars=['Active', 'Deaths', 'Recovered'])
fig = px.treemap(
    temp,
    path=["variable"],
    values="value",
    height=225,
    color_discrete_sequence=[act, rec, dth],
)
# Show the label and the value inside each tile.
fig.data[0].textinfo = 'label+text+value'
fig.show()
def plot_map(df, col, pal):
    """Show a choropleth world map coloured by one column.

    Rows whose `col` value is not strictly positive are dropped first.

    df:  DataFrame with a "Country/Region" column and the column `col`.
    col: column used for colour, hover data and the chart title.
    pal: continuous colour scale handed to plotly.
    """
    positive_rows = df[df[col] > 0]
    world_map = px.choropleth(
        positive_rows,
        locations="Country/Region",
        locationmode='country names',
        color=col,
        hover_name="Country/Region",
        hover_data=[col],
        title=col,
        color_continuous_scale=pal,
    )
    world_map.show()
Map plots are well suited to comparing how data accumulates in different parts of a map, and the map itself can be customized in terms of its visible areas. The function used to create the map plot in this example relies on a dedicated library and is interactive because it uses plotly. The map may not be visible on third-party websites that do not have this package installed. There are also other variations of this kind of plot that can mark selected spots on a map, much like Google Maps.
# World map of confirmed cases per country.
plot_map(df=country_wise, col='Confirmed', pal='matter')
def plot_daywise(col, hue):
    """Show a bar chart of one `day_wise` column against "Date".

    col: name of the `day_wise` column to plot; also used as the title.
    hue: the single colour used for the bars.
    """
    bars = px.bar(
        day_wise,
        x="Date",
        y=col,
        width=700,
        color_discrete_sequence=[hue],
    )
    # Axis titles are blanked; the chart title already names the column.
    layout_options = {'title': col, 'xaxis_title': "", 'yaxis_title': ""}
    bars.update_layout(**layout_options)
    bars.show()
def plot_daywise_line(col, hue):
    """Show a line chart of one `day_wise` column against "Date".

    col: name of the `day_wise` column to plot; also used as the title.
    hue: the single colour used for the line.
    """
    line_chart = px.line(
        day_wise,
        x="Date",
        y=col,
        width=700,
        color_discrete_sequence=[hue],
    )
    # Axis titles are blanked; the chart title already names the column.
    layout_options = {'title': col, 'xaxis_title': "", 'yaxis_title': ""}
    line_chart.update_layout(**layout_options)
    line_chart.show()
# Sum Recovered / Deaths / Active per date over all rows of full_grouped.
# The columns are selected with a list (double brackets): indexing a GroupBy
# with a bare tuple of names is not accepted by current pandas releases.
temp = full_grouped.groupby('Date')[['Recovered', 'Deaths', 'Active']].sum().reset_index()
# Reshape to long form: one row per (Date, Case) pair with its Count.
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
Area graphs are line graphs with the area below the line filled in with a certain colour or texture. Area graphs are drawn by first plotting data points on a Cartesian coordinate grid, joining the points with a line and finally filling in the space below the completed line. In this example, because of the cumulative behavior of the data, the graph is always ascending.
# Area chart of the long-form counts, one coloured band per case type.
fig = px.area(
    temp,
    x="Date",
    y="Count",
    color='Case',
    height=600,
    width=700,
    title='Cases over time',
    color_discrete_sequence=[rec, dth, act],
)
# Add a range slider under the x axis.
fig.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
fig.show()
Bar charts have a discrete domain of categories, and are usually scaled so that all the data can fit on the chart. When there is no natural ordering of the categories being compared, bars on the chart may be arranged in any order. Categorical data is a grouping of data into discrete groups, such as months of the year, age group, shoe sizes, and animals. These categories are usually qualitative. In a column bar chart, the categories appear along the horizontal axis; the height of the bar corresponds to the value of each category.
You can see several bar charts below. Some of them are cumulative, so their values always ascend, while others, such as the number of new cases per day, may differ from one day to the next. On the daily deaths graph, you can see how well or badly the world adapted to the new virus and controlled the cases. There is also a term called a 'wave': a wave implies a rising number of sick individuals, a defined peak, and then a decline.
# One bar chart per day-wise column, each with its bar colour.
for daily_column, bar_colour in (
    ('Confirmed', '#333333'),
    ('New cases', '#333333'),
    ('Deaths', dth),
    ('New deaths', dth),
    ('Recovered', rec),
    ('New recovered', rec),
):
    plot_daywise(daily_column, bar_colour)
Line graphs are used to track changes over short and long periods of time. When smaller changes exist, line graphs are better to use than bar graphs. Line graphs can also be used to compare changes over the same period of time for more than one group. As you can see, the X-axis of these graphs always shows time, while the Y-axis differs from one graph to another. For example, the graph below shows how often a person dies due to the virus; in other words, how deadly the virus was at that time.
# One line chart per day-wise ratio column, each with its line colour.
for ratio_column, line_colour in (
    ('Deaths / 100 Cases', dth),
    ('Deaths / 100 Recovered', dth),
    ('Recovered / 100 Cases', rec),
):
    plot_daywise_line(ratio_column, line_colour)
This is another line graph, comparing the number of active cases with the number of recovered ones.
# Long-form frame holding the Recovered and Active counts per date.
temp = day_wise[['Date', 'Recovered', 'Active']]
temp = temp.melt(id_vars='Date', value_vars=['Recovered', 'Active'],
                 var_name='Variable', value_name='Count')
# Keep the figure and show it explicitly: a bare px.line(...) expression is
# only rendered when it is the last expression of a notebook cell, so in a
# plain script the chart was built and then discarded.
fig = px.line(temp, x='Date', y='Count', color='Variable')
fig.show()
Bar plot showing the total number of countries affected by COVID-19. The count is capped at 189 because the dataset I'm using only covers 189 countries.
# Bar chart of the 'No. of countries' column over time.
plot_daywise(col='No. of countries', hue='#035aa6')
In this section, the plots show the top 15 countries for each way the virus has affected them: confirmed cases, deaths, active cases, etc. To represent the data as clearly as possible I used horizontal bar charts, which are pretty straightforward. I also sorted the data in descending order since, as the topic suggests, these graphs show only the top 15 countries.
def plot_hbar(df, col, n, hover_data=None):
    """Show a horizontal bar chart of the `n` rows with the largest `col`.

    df:         DataFrame with "Country/Region" and 'WHO Region' columns
                and the column `col`.
    col:        column to rank by; also used for bar length, bar text and
                the chart title.
    n:          number of rows kept after sorting by `col`.
    hover_data: optional list of extra columns shown on hover. Defaults to
                None (treated as an empty list) so that no mutable list is
                shared between calls.
    """
    if hover_data is None:
        hover_data = []
    fig = px.bar(df.sort_values(col).tail(n),
                 x=col, y="Country/Region", color='WHO Region',
                 text=col, orientation='h', width=700, hover_data=hover_data,
                 color_discrete_sequence=px.colors.qualitative.Dark2)
    # 'total ascending' orders the bars by value instead of by colour group.
    fig.update_layout(title=col, xaxis_title="", yaxis_title="",
                      yaxis_categoryorder='total ascending',
                      uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()
def _min_pop_label(min_pop):
    """Return a compact label for a population threshold, e.g. 1000000 -> '1M'."""
    if min_pop >= 1000000 and min_pop % 1000000 == 0:
        return '{}M'.format(int(min_pop // 1000000))
    return '{:,}'.format(min_pop)


def plot_hbar_wm(col, n, min_pop=1000000, sort='descending'):
    """Show a horizontal bar chart of the top `n` `worldometer_data` rows.

    Only rows whose 'Population' is strictly greater than `min_pop` are
    considered.

    col:     column to rank by; also used for bar length, bar text and title.
    n:       number of rows kept after sorting by `col`.
    min_pop: population threshold; the chart title now reflects this value
             instead of always saying "1M".
    sort:    accepted for backward compatibility; it is not used by this
             function.
    """
    df = worldometer_data[worldometer_data['Population'] > min_pop]
    df = df.sort_values(col, ascending=True).tail(n)
    fig = px.bar(df,
                 x=col, y="Country/Region", color='WHO Region',
                 text=col, orientation='h', width=700,
                 color_discrete_sequence=px.colors.qualitative.Dark2)
    # With the default threshold this yields the same title as before:
    # '<col> (Only countries with > 1M Pop)'.
    title = '{} (Only countries with > {} Pop)'.format(col, _min_pop_label(min_pop))
    fig.update_layout(title=title,
                      xaxis_title="", yaxis_title="",
                      yaxis_categoryorder='total ascending',
                      uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()
# Top-15 horizontal bar chart for each country-wise column, in this order.
for ranking_column in (
    'Confirmed',
    'Active',
    'New cases',
    'Deaths',
    'New deaths',
    'Deaths / 100 Cases',
    'Recovered',
    'New recovered',
    'Recovered / 100 Cases',
    '1 week change',
    '1 week % increase',
):
    plot_hbar(country_wise, ranking_column, 15)
# Top-15 Worldometer charts with an explicit population threshold.
for wm_column in ('Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests'):
    plot_hbar_wm(wm_column, 15, 1000000)
# This call relies on the function's default min_pop.
plot_hbar_wm('Tests/1M pop', 15)
A stacked bar graph (or stacked bar chart) is a chart that uses bars to show comparisons between categories of data, but with the ability to break down and compare parts of a whole. Each bar in the chart represents a whole, and segments in the bar represent different parts or categories of that whole.
In this example, the longer a country's segment is, the more people in that country were affected.
You may notice that I used only the 2000 reports with the highest values. This is simply to reduce lag and make the notebook a bit easier to load. All of these plots are interactive, so you can zoom in and investigate the data for yourself.
def plot_stacked(col):
    """Show a bar chart of `col` over "Date", coloured by country.

    Only the 2000 rows of `full_grouped` with the largest `col` values are
    plotted.

    col: column of `full_grouped` to plot; also used as the chart title.
    """
    largest_rows = full_grouped.sort_values(col).tail(2000)
    stacked = px.bar(
        largest_rows,
        x="Date",
        y=col,
        color='Country/Region',
        height=600,
        title=col,
        color_discrete_sequence=px.colors.cyclical.mygbm,
    )
    stacked.update_layout(showlegend=True)
    stacked.show()
# Stacked bar chart for each of these columns.
for stacked_column in ('Confirmed', 'Deaths', 'New cases'):
    plot_stacked(stacked_column)
A similar representation to the stacked bar plot; it is equivalent to connecting the tops of the bars in a bar plot. However, it covers all 189 countries available in this dataset rather than only the top 2000 reports.
def plot_line(col):
    """Show one line per country for `col` over "Date".

    Every row of `full_grouped` is plotted.

    col: column of `full_grouped` to plot; also used as the chart title.
    """
    lines = px.line(
        full_grouped,
        x="Date",
        y=col,
        color='Country/Region',
        height=600,
        title=col,
        color_discrete_sequence=px.colors.cyclical.mygbm,
    )
    lines.update_layout(showlegend=True)
    lines.show()
# Per-country line chart for each of these columns.
for line_column in ('Confirmed', 'Deaths', 'Active', 'New cases'):
    plot_line(line_column)
In this bar graph, each country's share of the total number of cases is compared. You may notice that at first over 99% of cases were from China (obviously, the source of the virus was there); then South Korea and Iran started to rise in numbers.
Also, in these interactive graphs you can double-click a specific country in the legend to show only that country, or single-click it to exclude that country from the chart.
# Join per-country counts with the day-wise totals on Date. After the merge
# the "_x" columns come from full_grouped (per country) and the "_y" columns
# from day_wise (all countries).
temp = pd.merge(full_grouped[['Date', 'Country/Region', 'Confirmed', 'Deaths']],
                day_wise[['Date', 'Confirmed', 'Deaths']], on='Date')
# Each country's share of the day's total, as a percentage.
temp['% Confirmed'] = round(temp['Confirmed_x']/temp['Confirmed_y'], 3)*100
temp['% Deaths'] = round(temp['Deaths_x']/temp['Deaths_y'], 3)*100

# Only the 2000 rows with the largest share are plotted.
temp_confirmed = temp.sort_values('% Confirmed').tail(2000)
fig = px.bar(temp_confirmed, x='Date', y='% Confirmed', color='Country/Region',
             range_y=(0, 100), title='% of Cases from each country',
             color_discrete_sequence=px.colors.qualitative.Prism)
fig.show()

temp_death = temp.sort_values('% Deaths').tail(2000)
# This chart plots '% Deaths', so its title says "Deaths"; it previously
# repeated the title of the cases chart above.
fig = px.bar(temp_death, x='Date', y='% Deaths', color='Country/Region',
             range_y=(0, 100), title='% of Deaths from each country',
             color_discrete_sequence=px.colors.qualitative.Prism)
fig.show()
A bubble chart (aka bubble plot) is an extension of the scatter plot used to look at relationships between three numeric variables. Each dot in a bubble chart corresponds with a single data point, and the variables’ values for each point are indicated by horizontal position, vertical position, and dot size.
You can change the displayed range by using the grips on the sides of the lower plot.
# Bubble chart for the 20 countries with the most deaths: confirmed cases on
# x, deaths on y, bubble size by confirmed cases, both axes logarithmic.
top_by_deaths = country_wise.sort_values('Deaths', ascending=False).head(20)
fig = px.scatter(
    top_by_deaths,
    x='Confirmed',
    y='Deaths',
    color='Country/Region',
    size='Confirmed',
    text='Country/Region',
    height=700,
    log_x=True,
    log_y=True,
    title='Deaths vs Confirmed (Scale is in log10)',
)
fig.update_traces(textposition='top center')
# Hide the legend and add a range slider under the x axis.
fig.update_layout(showlegend=False, xaxis_rangeslider_visible=True)
fig.show()
def plot_treemap(col):
    """Show a treemap of `country_wise` with one tile per country.

    col: column that sets the tile sizes; also used as the chart title.
    """
    tiles = px.treemap(
        country_wise,
        path=["Country/Region"],
        values=col,
        height=700,
        title=col,
        color_discrete_sequence=px.colors.qualitative.Dark2,
    )
    # Show the label and the value inside each tile.
    tiles.data[0].textinfo = 'label+text+value'
    tiles.show()
This is a more advanced example of the treemap that was introduced earlier.
# Country treemap for each of these columns.
for treemap_column in ('Confirmed', 'Deaths'):
    plot_treemap(treemap_column)
def plot_bubble(col, pal):
    """Show a date-by-country bubble chart sized and coloured by `col`.

    Only rows of `full_grouped` where `col` is strictly positive are used.

    col: column that sets both bubble size and bubble colour.
    pal: continuous colour scale handed to plotly.
    """
    positive_rows = full_grouped[full_grouped[col] > 0]
    ordered_rows = positive_rows.sort_values('Country/Region', ascending=False)
    bubbles = px.scatter(
        ordered_rows,
        x='Date',
        y='Country/Region',
        size=col,
        color=col,
        height=3000,
        color_continuous_scale=pal,
    )
    # dtick=1 puts a tick on every country row; the colour bar is hidden.
    bubbles.update_layout(yaxis=dict(dtick=1), coloraxis_showscale=False)
    bubbles.show()
Like the scatter plot, a bubble chart is primarily used to depict and show relationships between numeric variables. However, the addition of marker size as a dimension allows for the comparison between three variables rather than just two.
In this version of the bubble plot, every country has its own row, the horizontal axis represents the timeline, and the circles on each row represent the number of cases: the bigger the circle, the more cases at that point in time.
# Bubble chart for each of these columns, both using the Viridis scale.
for bubble_column in ('New cases', 'Active'):
    plot_bubble(bubble_column, 'Viridis')
A heat map is a data visualization technique that shows the magnitude of a phenomenon as color in two dimensions. The variation in color may be by hue or intensity, giving obvious visual cues to the reader about how the phenomenon is clustered or varies over space.
This is a special version of the heatmap, the binary heatmap. Rather than comparing the correlation between different features in a data frame, this heatmap simply indicates whether each country reported new COVID-19 cases on a given day: days with a report are marked as 1 and days with no report are marked as 0.
# Work on an explicit copy: adding a column to a bare column-subset of
# full_grouped triggers pandas' SettingWithCopyWarning and is not guaranteed
# to behave consistently.
temp = full_grouped[['Date', 'Country/Region', 'New cases']].copy()
# 1 where 'New cases' is non-zero for that country and date, otherwise 0.
temp['New cases reported ?'] = (temp['New cases'] != 0).astype(int)

fig = go.Figure(data=go.Heatmap(
    z=temp['New cases reported ?'],
    x=temp['Date'],
    y=temp['Country/Region'],
    colorscale='Emrld',
    showlegend=False,
    text=temp['New cases reported ?']))
# dtick=1 puts a tick on every country row.
fig.update_layout(yaxis=dict(dtick=1))
fig.update_layout(height=3000)
fig.show()
"For the COVID-19 data, we collect data from official reports, directly from Government's communication channels or indirectly, through local media sources when deemed reliable. We provide the source of each data update in the "Latest Updates" (News) section. Timely updates are made possible thanks to the participation of users around the world and to the dedication of a team of analysts and researchers who validate data from an ever-growing list of over 5,000 sources." - https://www.worldometers.info/about/
The World Health Organization divides the world into six WHO regions, for the purposes of reporting, analysis and administration.
# Keep only rows whose 'WHO Region' is not 0 (missing values in
# worldometer_data were filled with 0 when it was loaded).
has_who_region = worldometer_data['WHO Region'] != 0
temp = worldometer_data[has_who_region]
# Total cases against total deaths, log-log, coloured by WHO region.
fig = px.scatter(
    temp,
    x='TotalCases',
    y='TotalDeaths',
    color='WHO Region',
    hover_name='Country/Region',
    height=700,
    log_x=True,
    log_y=True,
    title='Confirmed vs Deaths',
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
This is another colored scatter plot that divides countries into the 6 "WHO regions" and compares the number of deaths to the number of confirmed cases.
Countries with a lower Y value than others at a similar X value seem to have had fewer casualties and handled the pandemic better than others.
# Finish and show the scatter figure currently held in `fig`.
fig.update_traces(textposition='top center')
fig.show()

# Population against total cases, log-log, coloured by WHO region.
fig = px.scatter(
    temp,
    x='Population',
    y='TotalCases',
    color='WHO Region',
    hover_name='Country/Region',
    height=700,
    log_x=True,
    log_y=True,
    title='Population vs Confirmed',
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig.update_traces(textposition='top center')
fig.show()
A box and whisker plot is a way of summarizing a set of data measured on an interval scale. It is often used in exploratory data analysis. This type of graph is used to show the shape of the distribution, its central value, and its variability.
This plot shows the distribution of the data across its four quartiles, along with the median and mean.
# Box plot of TotalCases per WHO region. The figure is kept and shown
# explicitly: a bare px.box(...) expression is only rendered when it is the
# last expression of a notebook cell, so in a plain script it was discarded.
fig = px.box(worldometer_data, x='WHO Region', y='TotalCases', color='WHO Region',
             title='Distribution of country wise no. of cases in different WHO Region')
fig.show()
def plot_pie_charts(x, y, title):
    """Draw a matplotlib pie chart of `y` with `x` as the legend.

    x:     legend entries, one per slice.
    y:     slice values; they are also printed as the slice labels.
    title: text shown above the chart.
    """
    # Muted colours, one per slice.
    muted_colors = [
        'lightcoral',
        'rosybrown',
        'sandybrown',
        'navajowhite',
        'gold',
        'khaki',
        'lightskyblue',
    ]
    plt.figure(figsize=(15, 10))
    plt.title(title, size=20)
    # autopct adds each slice's percentage with one decimal place.
    plt.pie(y, labels=y, colors=muted_colors, autopct='%1.1f%%', shadow=True)
    plt.legend(x, loc='best', fontsize=12)
    plt.show()
Pie charts are generally used to show percentage or proportional data and usually the percentage represented by each category is provided next to the corresponding slice of pie. Pie charts are good for displaying data for around 6 categories or fewer.
Here, we use pie charts to show the percentage of total cases and of total deaths in the 6 different WHO regions.
# Legend entries for the pie slices, written once and reused by both charts.
# NOTE(review): the order presumably matches the group order produced by
# groupby('WHO Region'), with the 0 placeholder for missing regions first
# ('Other') - confirm against the data.
who_region_labels = ['Other', 'Africa', 'America', 'Eastern Mediterranean',
                     'Europe', 'South-EastAsia', 'WesternPacific']
# The titles previously misspelled "Region" as "Rigion".
plot_pie_charts(who_region_labels,
                worldometer_data.groupby('WHO Region')['TotalCases'].sum(),
                'Covid-19 Total Cases per WHO Region')
plot_pie_charts(who_region_labels,
                worldometer_data.groupby('WHO Region')['TotalDeaths'].sum(),
                'Covid-19 Total Deaths per WHO Region')
In this time of great distress, it is very important to do what we can to help the human community fight this virus. If you can be of any assistance in this war against our biggest enemy, then please do your part!
Prepare, don’t panic. Look after the vulnerable people in your community. We’re all in this together. And we are in it for the long haul.
"We might have lost the battle, but we'll surely win the war"