import numpy as np
import pandas as pd
from bokeh.charts import Histogram, Bar, BoxPlot
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
# Load the pre-2014 Ebola outbreaks dataset. index_col=False tells pandas not
# to use the first CSV column as the row index.
ebola_df = pd.read_csv("data/out/ebola_outbreaks_before_2014-geometry_fixed.csv", encoding="utf-8", index_col=False)
# Work on a copy of the frame without the columns at positions 0, 1, 2 and 9.
ebola_data = ebola_df.drop(ebola_df.columns[[0, 1, 2, 9]], axis=1)
cols = list(ebola_data.columns)
# List the remaining columns with a 1-based position. enumerate() avoids the
# repeated cols.index() scan, which would also report the wrong position for
# duplicated column names. The single-argument print(...) form produces the
# same text under both Python 2 and Python 3.
for position, column in enumerate(cols, 1):
    print("column %d : %s" % (position, column))
column 1 : country_code_iso_2_digits column 2 : country_name column 3 : duration_days column 4 : ebola_subtype column 5 : end_date column 6 : end_datetime column 7 : geometry column 8 : geometry_geojson column 9 : latitude column 10 : longitude column 11 : reported_number_of_deaths_among_cases column 12 : reported_number_of_human_cases column 13 : reported_of_deaths_among_cases column 14 : start_date column 15 : start_datetime column 16 : year_s
=> The columns fall into three groups: location information, reported cases and deaths, and dates/times.
# Number of recorded outbreaks per country, in ascending order.
# Series.order() has been removed from pandas; sort_values() replaces it.
ebola_data.groupby(["country_name"])["country_name"].count().sort_values()
country_name Côte d'Ivoire (Ivory Coast) 1 England 1 Italy 1 South Africa 1 Russia 2 Philippines 3 Sudan (South Sudan) 3 USA 3 Gabon 4 Uganda 5 Democratic Republic of the Congo 9 Name: country_name, dtype: int64
# Bokeh can't seem to handle the circumflex-accented 'o' (u"\xf4") in the
# Ivory Coast's French name, so that single name is replaced by its plain
# ASCII spelling; every other country name is kept unchanged.
countries_list = [
    u"Cote d'Ivoire (Ivory Coast)" if name == u"C\xf4te d'Ivoire (Ivory Coast)" else name
    for name in ebola_data["country_name"]
]
# Every outbreak row carries a weight of 1, so summing the weights per country
# label (agg='sum') yields the number of outbreaks per country.
data_nb = {}
data_nb['countries'] = countries_list
data_nb['ebola outbreaks'] = [1] * len(countries_list)
bar_nb = Bar(
    data_nb,
    label='countries',
    values='ebola outbreaks',
    agg='sum',
    title="Number of ebola outbreak(s) per country",
    color="#3B6849",
    plot_width=600,
    plot_height=500
)
# Send Bokeh output to the notebook, then display the chart.
output_notebook()
show(bar_nb)
# Mean outbreak duration (days) per country, in ascending order.
# Series.order() has been removed from pandas; sort_values() replaces it.
ebola_data.groupby(["country_name"])["duration_days"].mean().sort_values()
country_name Uganda 206.800000 Gabon 219.750000 Democratic Republic of the Congo 235.444444 Côte d'Ivoire (Ivory Coast) 364.000000 USA 364.333333 Philippines 364.666667 Sudan (South Sudan) 364.666667 England 365.000000 Italy 365.000000 Russia 365.000000 South Africa 365.000000 Name: duration_days, dtype: float64
# Pair each outbreak's (ASCII-safe) country label with its duration; the chart
# then averages the durations per country (agg='mean').
data_dur = {}
data_dur['countries'] = countries_list
data_dur['outbreaks duration'] = list(ebola_data["duration_days"])
bar_dur = Bar(
    data_dur,
    label='countries',
    values='outbreaks duration',
    agg='mean',
    title="Durations of ebola outbreaks (days) per country",
    color="#586996",
    plot_width=600,
    plot_height=500
)
# Send Bokeh output to the notebook, then display the chart.
output_notebook()
show(bar_dur)
# Number of recorded outbreaks per Ebola virus subtype, in ascending order.
# Series.order() has been removed from pandas; sort_values() replaces it.
ebola_data.groupby(["ebola_subtype"])["ebola_subtype"].count().sort_values()
ebola_subtype Taï Forest virus 1 Bundibugyo virus 2 Reston virus 7 Sudan virus 8 Zaire virus 15 Name: ebola_subtype, dtype: int64
# Bokeh can't seem to handle the trema-accented 'i' (u"\xef") in the Tai Forest
# virus name, so that single subtype is replaced by its plain ASCII spelling;
# every other subtype name is kept unchanged.
eb_virus_types = [
    u"Tai Forest virus" if subtype == u"Ta\xef Forest virus" else subtype
    for subtype in ebola_data["ebola_subtype"]
]
# Every outbreak row carries a weight of 1, so summing the weights per subtype
# label (agg='sum') yields the number of outbreaks per virus subtype.
data_vir = {}
data_vir['ebola virus subtypes'] = eb_virus_types
data_vir['number of ebola outbreaks'] = [1] * len(eb_virus_types)
bar_vir = Bar(
    data_vir,
    label='ebola virus subtypes',
    values='number of ebola outbreaks',
    agg='sum',
    title="Number of ebola outbreak(s) per virus subtype",
    color="#E2AE7A",
    plot_width=600,
    plot_height=400,
    bar_width=0.5
)
# Send Bokeh output to the notebook, then display the chart.
output_notebook()
show(bar_vir)
# Virus subtype of every outbreak, listed alphabetically by country.
# DataFrame.sort() has been removed from pandas; sort_values(by=...) replaces it.
ebola_data[["country_name", "ebola_subtype"]].sort_values(by=["country_name"])
country_name | ebola_subtype | |
---|---|---|
30 | Côte d'Ivoire (Ivory Coast) | Taï Forest virus |
16 | Democratic Republic of the Congo | Zaire virus |
18 | Democratic Republic of the Congo | Zaire virus |
17 | Democratic Republic of the Congo | Zaire virus |
15 | Democratic Republic of the Congo | Bundibugyo virus |
5 | Democratic Republic of the Congo | Zaire virus |
6 | Democratic Republic of the Congo | Zaire virus |
12 | Democratic Republic of the Congo | Zaire virus |
8 | Democratic Republic of the Congo | Zaire virus |
10 | Democratic Republic of the Congo | Zaire virus |
32 | England | Sudan virus |
13 | Gabon | Zaire virus |
4 | Gabon | Zaire virus |
20 | Gabon | Zaire virus |
23 | Gabon | Zaire virus |
24 | Italy | Reston virus |
7 | Philippines | Reston virus |
14 | Philippines | Reston virus |
1 | Philippines | Reston virus |
0 | Russia | Zaire virus |
19 | Russia | Zaire virus |
27 | South Africa | Zaire virus |
31 | Sudan (South Sudan) | Sudan virus |
29 | Sudan (South Sudan) | Sudan virus |
28 | Sudan (South Sudan) | Sudan virus |
21 | USA | Reston virus |
2 | USA | Reston virus |
3 | USA | Reston virus |
26 | Uganda | Sudan virus |
22 | Uganda | Bundibugyo virus |
9 | Uganda | Sudan virus |
25 | Uganda | Sudan virus |
11 | Uganda | Sudan virus |
# Mean outbreak duration (days) per virus subtype, in ascending order.
# Series.order() has been removed from pandas; sort_values() replaces it.
ebola_data.groupby(["ebola_subtype"])["duration_days"].mean().sort_values()
ebola_subtype Bundibugyo virus 121.500000 Zaire virus 260.733333 Sudan virus 304.000000 Taï Forest virus 364.000000 Reston virus 364.571429 Name: duration_days, dtype: float64
=> Bundibugyo virus seems to be associated with shorter outbreaks, although this rests on only 2 recorded outbreaks, so the evidence is weak.
# Keep only the outbreaks recorded in the Democratic Republic of the Congo.
df_drc = ebola_data[ebola_data.country_name == "Democratic Republic of the Congo"]
# Narrow the view to subtype, dates and case/death counts.
drc = df_drc[["ebola_subtype", "start_date", "end_date", "reported_number_of_human_cases", "reported_number_of_deaths_among_cases"]]
# List the DRC outbreaks ordered by start date.
# DataFrame.sort() has been removed from pandas; sort_values(by=...) replaces it.
# NOTE(review): start_date looks like ISO-8601 text in the displayed output, which
# orders chronologically even when compared as strings -- confirm the dtype.
drc.sort_values(by=["start_date"])
ebola_subtype | start_date | end_date | reported_number_of_human_cases | reported_number_of_deaths_among_cases | |
---|---|---|---|---|---|
5 | Zaire virus | 1976-01-01T00:00:00Z | 1976-12-31T00:00:00Z | 318 | 280 |
8 | Zaire virus | 1977-01-01T00:00:00Z | 1977-12-31T00:00:00Z | 1 | 1 |
18 | Zaire virus | 1995-01-01T00:00:00Z | 1995-12-31T00:00:00Z | 315 | 250 |
16 | Zaire virus | 2001-10-01T00:00:00Z | 2002-03-31T00:00:00Z | 57 | 43 |
17 | Zaire virus | 2002-12-01T00:00:00Z | 2003-04-30T00:00:00Z | 143 | 128 |
10 | Zaire virus | 2003-11-01T00:00:00Z | 2003-12-31T00:00:00Z | 35 | 29 |
6 | Zaire virus | 2007-01-01T00:00:00Z | 2007-12-31T00:00:00Z | 264 | 187 |
12 | Zaire virus | 2008-12-01T00:00:00Z | 2009-02-28T00:00:00Z | 32 | 15 |
15 | Bundibugyo virus | 2012-06-01T00:00:00Z | 2012-11-30T00:00:00Z | 36 | 13 |