source: Humanitarian Data Exchange (HDX)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')
from bokeh.charts import Histogram, Bar, BoxPlot, Scatter
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, output_file
from bokeh.palettes import Spectral11
# Load the pre-2014 outbreak table, then discard four columns (selected by
# position) that the rest of the analysis does not use.
# NOTE(review): positions 0, 1, 2 and 9 are not named anywhere in this
# file -- confirm against the CSV header that they are the intended ones.
csv_path = "data/out/ebola_outbreaks_before_2014-geometry_fixed.csv"
ebola_df = pd.read_csv(csv_path, encoding="utf-8", index_col=False)
unused_columns = ebola_df.columns[[0, 1, 2, 9]]
ebola_data = ebola_df.drop(unused_columns, axis=1)
# Preview the first three rows.
ebola_data.head(3)
country_code_iso_2_digits | country_name | duration_days | ebola_subtype | end_date | end_datetime | geometry | geometry_geojson | latitude | longitude | reported_number_of_deaths_among_cases | reported_number_of_human_cases | reported_of_deaths_among_cases | start_date | start_datetime | year_s | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RU | Russia | 365 | Zaire virus | 2004-12-31T00:00:00Z | 2004-12-31T00:00:00Z | (POLYGON ((132.448985 42.845404, 132.44988 42.... | {u'type': u'MultiPolygon', u'coordinates': [[[... | 64.686314 | 97.745306 | 1 | 1 | 1 | 2004-01-01T00:00:00Z | 2004-01-01T00:00:00Z | 2004 |
1 | PH | Philippines | 365 | Reston virus | 1996-12-31T00:00:00Z | 1996-12-31T00:00:00Z | (POLYGON ((119.849783 4.796861, 119.833995 4.7... | {u'type': u'MultiPolygon', u'coordinates': [[[... | 12.750349 | 122.731210 | 0 | 0 | 0 | 1996-01-01T00:00:00Z | 1996-01-01T00:00:00Z | 1996 |
2 | US | USA | 364 | Reston virus | 1990-12-31T00:00:00Z | 1990-12-31T00:00:00Z | (POLYGON ((-155.606519 20.137956, -155.586363 ... | {u'type': u'MultiPolygon', u'coordinates': [[[... | 39.783730 | -100.445882 | 0 | 4 | 0 | 1990-01-01T00:00:00Z | 1990-01-01T00:00:00Z | 1990 |
# Handle non-ASCII character: replace the accented spelling of the country
# name with a plain-ASCII one.
ebola_data = ebola_data.replace(u"C\xf4te d'Ivoire (Ivory Coast)",
                                u"Cote d'Ivoire (Ivory Coast)")
# Number of recorded outbreaks per country, sorted in ascending order.
# Series.order() was deprecated in pandas 0.17 and later removed;
# sort_values() is its direct replacement.
outbrks_data = (
    ebola_data.groupby(["country_name"])["country_name"]
    .count()
    .sort_values()
)
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(outbrks_data)
# Summary statistics of the per-country outbreak counts.
outbrks_data.describe()
country_name Cote d'Ivoire (Ivory Coast) 1 England 1 Italy 1 South Africa 1 Russia 2 Philippines 3 Sudan (South Sudan) 3 USA 3 Gabon 4 Uganda 5 Democratic Republic of the Congo 9 Name: country_name, dtype: int64
count 11.000000 mean 3.000000 std 2.408319 min 1.000000 25% 1.000000 50% 3.000000 75% 3.500000 max 9.000000 Name: country_name, dtype: float64
Prior to 2014, there had been Ebola outbreaks in 11 countries.
Those countries have had between 1 and 9 outbreaks.
# Share of all recorded outbreaks that occurred in each country, in percent.
# outbrks_data already holds the per-country counts, so summing it gives the
# overall number of outbreaks without grouping ebola_data a second time.
total_outbrks = outbrks_data.sum()
outbrks_percent = outbrks_data.mul(100).truediv(total_outbrks)
data = outbrks_percent.to_frame(name='proportions of outbreaks (%)')
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(data)

# Bar chart of the percentages; the same title is used for the chart and
# for the exported HTML page.
outbrks_title = "Proportion of ebola outbreaks per country in %"
bar_outbrks = Bar(data, values='proportions of outbreaks (%)', color='navy',
                  title=outbrks_title)
output_notebook()
output_file("outbreaks.html", title=outbrks_title)
show(bar_outbrks)
proportions of outbreaks (%) country_name Cote d'Ivoire (Ivory Coast) 3.030303 England 3.030303 Italy 3.030303 South Africa 3.030303 Russia 6.060606 Philippines 9.090909 Sudan (South Sudan) 9.090909 USA 9.090909 Gabon 12.121212 Uganda 15.151515 Democratic Republic of the Congo 27.272727
<Bokeh Notebook handle for In[6]>
# Total number of reported human cases for each country.
country_case_columns = ["country_name", "reported_number_of_human_cases"]
ebola_data[country_case_columns].groupby(["country_name"]).sum()
reported_number_of_human_cases | |
---|---|
country_name | |
Cote d'Ivoire (Ivory Coast) | 1 |
Democratic Republic of the Congo | 1201 |
England | 1 |
Gabon | 214 |
Italy | 0 |
Philippines | 9 |
Russia | 2 |
South Africa | 2 |
Sudan (South Sudan) | 335 |
USA | 4 |
Uganda | 592 |
# Keep only the country plus the case and death counts; this frame feeds
# the box plots of cases and of deaths.
victim_columns = [
    "country_name",
    "reported_number_of_human_cases",
    "reported_number_of_deaths_among_cases",
]
df_ebola_victims = ebola_data[victim_columns]

# Box plot of reported human cases, one box per country, outliers hidden.
victims_title = "Number of ebola victims per country"
box_ev = BoxPlot(
    df_ebola_victims,
    values="reported_number_of_human_cases",
    label='country_name',
    title=victims_title,
    outliers=False,
    whisker_color='country_name',
)
output_notebook()
output_file("victims.html", title=victims_title)
show(box_ev)
<Bokeh Notebook handle for In[10]>
# Total number of reported deaths among cases for each country.
country_death_columns = ["country_name", "reported_number_of_deaths_among_cases"]
ebola_data[country_death_columns].groupby(["country_name"]).sum()
reported_number_of_deaths_among_cases | |
---|---|
country_name | |
Cote d'Ivoire (Ivory Coast) | 0 |
Democratic Republic of the Congo | 946 |
England | 0 |
Gabon | 150 |
Italy | 0 |
Philippines | 0 |
Russia | 2 |
South Africa | 1 |
Sudan (South Sudan) | 180 |
USA | 0 |
Uganda | 269 |
# Box plot of reported deaths among cases, one box per country, outliers
# hidden. Reads the df_ebola_victims frame built earlier in the notebook.
deaths_title = "Number of deaths from ebola per country"
box_ed = BoxPlot(
    df_ebola_victims,
    values="reported_number_of_deaths_among_cases",
    label='country_name',
    title=deaths_title,
    outliers=False,
    whisker_color='country_name',
)
output_notebook()
output_file("deaths.html", title=deaths_title)
show(box_ed)
<Bokeh Notebook handle for In[12]>
The number of victims could be due to the country's health system or the moment in time when the outbreak occurred.
=> It would be interesting to plot the number of victims against the number of outbreaks and against a WHO value representing each country's health system score, and to look at the number of victims over time.
I'd like to look at:
# Total reported human cases for each Ebola virus subtype.
subtype_case_columns = ["ebola_subtype", "reported_number_of_human_cases"]
cases_by_subtype = (
    ebola_data[subtype_case_columns]
    .groupby(["ebola_subtype"])
    .sum()
)
cases_by_subtype
reported_number_of_human_cases | |
---|---|
ebola_subtype | |
Bundibugyo virus | 185 |
Reston virus | 13 |
Sudan virus | 779 |
Taï Forest virus | 1 |
Zaire virus | 1383 |
# Express each subtype's case count as a percentage of all reported cases.
# The column total is a Series indexed by column label, so the division
# lines up with the matching column of the scaled frame.
total_cases_by_subtype = cases_by_subtype.sum()
cases_times_100 = cases_by_subtype.mul(100)
cases_subtype_percent = cases_times_100.truediv(total_cases_by_subtype)
cases_subtype_percent
reported_number_of_human_cases | |
---|---|
ebola_subtype | |
Bundibugyo virus | 7.835663 |
Reston virus | 0.550614 |
Sudan virus | 32.994494 |
Taï Forest virus | 0.042355 |
Zaire virus | 58.576874 |
# Total reported deaths among cases for each Ebola virus subtype.
subtype_death_columns = ["ebola_subtype", "reported_number_of_deaths_among_cases"]
deaths_by_subtype = (
    ebola_data[subtype_death_columns]
    .groupby(["ebola_subtype"])
    .sum()
)
deaths_by_subtype
reported_number_of_deaths_among_cases | |
---|---|
ebola_subtype | |
Bundibugyo virus | 50 |
Reston virus | 0 |
Sudan virus | 412 |
Taï Forest virus | 0 |
Zaire virus | 1086 |
# Express each subtype's death count as a percentage of all reported deaths.
# The column total is a Series indexed by column label, so the division
# lines up with the matching column of the scaled frame.
total_deaths_by_subtype = deaths_by_subtype.sum()
deaths_times_100 = deaths_by_subtype.mul(100)
deaths_subtype_percent = deaths_times_100.truediv(total_deaths_by_subtype)
deaths_subtype_percent
reported_number_of_deaths_among_cases | |
---|---|
ebola_subtype | |
Bundibugyo virus | 3.229974 |
Reston virus | 0.000000 |
Sudan virus | 26.614987 |
Taï Forest virus | 0.000000 |
Zaire virus | 70.155039 |
# Count the outbreaks recorded for each virus subtype, then convert the
# counts into percentages of the overall number of outbreaks.
subtype_groups = ebola_data.groupby(["ebola_subtype"])
outbrks_by_subtype = subtype_groups["ebola_subtype"].count()
total_outbrks_by_subtype = outbrks_by_subtype.sum()
outbrks_times_100 = outbrks_by_subtype.mul(100)
outbrks_subtype_percent = outbrks_times_100.truediv(total_outbrks_by_subtype)
# One-column frame with a readable column label, for display.
data_percent = outbrks_subtype_percent.to_frame(name='Outbreaks by virus subtype (%)')
data_percent
Outbreaks by virus subtype (%) | |
---|---|
ebola_subtype | |
Bundibugyo virus | 6.060606 |
Reston virus | 21.212121 |
Sudan virus | 24.242424 |
Taï Forest virus | 3.030303 |
Zaire virus | 45.454545 |
# Gather the three per-subtype percentage tables computed earlier in the
# notebook into a single frame. The values are taken from those results
# rather than typed in by hand, so they cannot drift out of sync with the
# input data. Each table is re-indexed on the same subtype index so that
# row i of every column refers to the same subtype.
subtype_index = outbrks_subtype_percent.index
subtype_cases_pct = (
    cases_subtype_percent["reported_number_of_human_cases"]
    .reindex(subtype_index)
)
subtype_deaths_pct = (
    deaths_subtype_percent["reported_number_of_deaths_among_cases"]
    .reindex(subtype_index)
)
all_subtypes_proportions = {
    "Ebola subtypes": subtype_index.tolist(),
    "Outbreaks by virus subtype (%)": outbrks_subtype_percent.tolist(),
    "Reported human cases by virus subtype (%)": subtype_cases_pct.tolist(),
    "Reported deaths among cases by virus subtype (%)": subtype_deaths_pct.tolist(),
}
data_subtypes_proportions = pd.DataFrame(all_subtypes_proportions)
data_subtypes_proportions
Ebola subtypes | Outbreaks by virus subtype (%) | Reported deaths among cases by virus subtype (%) | Reported human cases by virus subtype (%) | |
---|---|---|---|---|
0 | Bundibugyo virus | 6.060606 | 3.229974 | 7.835663 |
1 | Reston virus | 21.212121 | 0.000000 | 0.550614 |
2 | Sudan virus | 24.242424 | 26.614987 | 32.994494 |
3 | Taï Forest virus | 3.030303 | 0.000000 | 0.042355 |
4 | Zaire virus | 45.454545 | 70.155039 | 58.576874 |
# Grouped bar chart: one group of bars per subtype, one bar per percentage
# column of data_subtypes_proportions.
data_subtypes_proportions.plot(
    x="Ebola subtypes",
    kind="bar",
    figsize=(10, 5),
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f1cfa2fbe50>
# Country together with the start and end date of each outbreak.
# 'end_date' is selected next to 'start_date'; listing 'start_date' twice
# would only yield two identical columns sharing one label.
outbreaks_dates_by_country = ebola_data[['country_name', 'start_date', 'end_date']]