#!/usr/bin/env python
# coding: utf-8

# # Makeover Monday, 16 April 2018
#
# Analysis and visualisation of (simulated) malaria cases for [Makeover Monday](http://www.makeovermonday.co.uk).
#
# Data from [VisualizeNoMalaria](https://data.world/makeovermonday/2018w16-zambia-southern-province-confirmed-malaria-cases) via [Makeover Monday](http://www.makeovermonday.co.uk/data/).

# In[1]:


import collections
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import scipy.stats

# `get_ipython` is only injected by IPython/Jupyter. Look it up defensively so
# this exported script can also be run with a plain `python` interpreter
# instead of stopping with a NameError on the first cell.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')


# # Read the dataset
# Rename the columns while we're here.

# In[2]:


# The 'Disclaimer' column is dropped before the remaining seven columns are
# renamed positionally.
malaria_raw = pd.read_excel('Simulated VisualizeNoMalaria Counts.xlsx').drop('Disclaimer', axis=1)
malaria_raw.columns = ['country', 'province', 'district', 'ruralurban', 'date', 'report', 'cases']
malaria_raw.head()


# # Explore the data
# Just see how many items there are for each category

# In[3]:


malaria_raw.country.value_counts()


# In[4]:


malaria_raw.province.value_counts()


# In[5]:


malaria_raw.district.value_counts()


# In[6]:


malaria_raw.ruralurban.value_counts()


# In[7]:


malaria_raw.report.value_counts()


# Country and province don't mean anything.

# In[8]:


# Number of rows for each (district, rural/urban) combination.
malaria_raw.groupby(['district', 'ruralurban']).size()


# # Initial plots
# Just a quick few plots to see what the data looks like.
# In[9]:


# NOTE: every aggregation below selects the 'cases' column explicitly.
# Summing the whole frame relied on pandas silently discarding the text and
# datetime columns, which pandas 2.0 no longer does.
malaria_raw.groupby('date')[['cases']].sum().plot()


# In[10]:


malaria_raw.groupby(['date', 'report'])[['cases']].sum().unstack().plot()


# In[11]:


ax = malaria_raw.groupby(['date', 'district'])[['cases']].sum().unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[12]:


# Total cases per district, smallest first.
malaria_raw.groupby('district')[['cases']].sum().sort_values(by='cases')


# In[13]:


ax = malaria_raw.groupby(['date', 'ruralurban'])[['cases']].sum().unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[14]:


# Unstack the last two index levels (district, report) so that each
# district/report pair becomes its own line.
ax = (
    malaria_raw.groupby(['date', 'district', 'report'])[['cases']]
    .sum()
    .unstack([-2, -1])
    .plot(figsize=(15, 15))
)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# # Just 2014
# Atypical things are happening in 2014. Let's look at just this data.

# In[15]:


malaria_2014 = malaria_raw[malaria_raw.date.dt.year == 2014]
malaria_2014.head()


# In[16]:


malaria_2015p = malaria_raw[malaria_raw.date.dt.year >= 2015]
malaria_2015p.head()


# In[17]:


ax = malaria_2014.groupby(['date', 'district'])[['cases']].sum().unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[18]:


ax = (
    malaria_2014.groupby(['date', 'district', 'report'])[['cases']]
    .sum()
    .unstack([-2, -1])
    .plot()
)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[19]:


ax = malaria_2015p.groupby(['date', 'district'])[['cases']].sum().unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[20]:


ax = (
    malaria_2015p.groupby(['date', 'district', 'report'])[['cases']]
    .sum()
    .unstack([-2, -1])
    .plot()
)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# # Sinazongwe
# Sinazongwe is an outlier. Let's look at just that, and everything except Sinazongwe.
# In[21]:


malaria_sina = malaria_raw[malaria_raw.district == 'Sinazongwe']
malaria_sina.head()


# In[22]:


malaria_not_sina = malaria_raw[malaria_raw.district != 'Sinazongwe']
malaria_not_sina.head()


# In[23]:


# Only 'cases' is aggregated: summing the whole frame depended on pandas
# silently dropping the non-numeric columns, which pandas 2.0 stopped doing.
malaria_not_sina.groupby(['date', 'report'])[['cases']].sum().unstack().plot()


# In[24]:


malaria_sina.groupby(['date', 'report'])[['cases']].sum().unstack().plot()


# In[25]:


ax = (
    malaria_not_sina.groupby(['date', 'district', 'report'])[['cases']]
    .sum()
    .unstack([-2, -1])
    .plot()
)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# # Summarising districts

# In[26]:


# One row per date, one column per district, values are summed cases.
# Unstacking the 'cases' Series gives this shape directly; the previous
# transpose / reset_index / drop('level_0') round trip only worked while the
# aggregated frame held exactly one value column.
district_month = malaria_raw.groupby(['date', 'district'])['cases'].sum().unstack()
district_month.head()


# In[27]:


# Order the district columns by total caseload, largest first.
districts_by_total = district_month.sum().sort_values(ascending=False).index
district_month_sorted = district_month.reindex(districts_by_total, axis=1)
ax = district_month_sorted.plot.area(figsize=(10, 7))
# Reverse the legend entries so they read top-to-bottom in the same order as
# the stacked areas.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='center left', bbox_to_anchor=(1, 0.5), title='District')


# ## Focusing on the largest districts
# Can we simplify this by focusing on just the largest districts?
# In[28]:


# Keep the three named districts as their own columns and fold every other
# district into a single 'Others' column.
d2_month = pd.DataFrame({
    'Sinazongwe': district_month['Sinazongwe'],
    'Gwembe': district_month['Gwembe'],
    'Siavonga': district_month['Siavonga'],
    'Others': district_month_sorted.drop(['Sinazongwe', 'Gwembe', 'Siavonga'], axis=1).sum(axis=1),
})
d2_month.head()


# In[29]:


f, ax = plt.subplots(1, 1, figsize=(10, 7), facecolor='lemonchiffon')
# The separated series are districts (see the legend title), so the subtitle
# says "districts" rather than "provinces".
f.suptitle('Incidence of malaria cases in southern Zambia (simulated)\nThree districts with highest caseload separated', fontsize=20)
d2_month.plot.area(ax=ax, color=['firebrick', 'tomato', 'lightsalmon', 'darkgreen'])
ax.set_facecolor('lemonchiffon')
# Reverse the legend so it reads in the same vertical order as the stack.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], title='District', facecolor='lemonchiffon')


# In[30]:


f.savefig('malaria-districts.png', facecolor=f.get_facecolor(), transparent=True)


# In[31]:


ax = d2_month.plot(figsize=(10, 7))
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='center left', bbox_to_anchor=(1, 0.5), title='District')


# # Annual changes

# In[32]:


# One row per calendar year, one column per district. Only 'cases' is summed:
# aggregating the whole frame would also try to sum the text and datetime
# columns, which pandas 2.0 no longer discards silently.
district_year = malaria_raw.groupby([malaria_raw.date.dt.year, 'district'])['cases'].sum().unstack()
district_year


# In[34]:


ax = district_year.plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[35]:


ax = district_year.plot.area()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[36]:


district_year_sorted = district_year.reindex(district_year.sum().sort_values(ascending=False).index, axis=1)
ax = district_year_sorted.plot.area(figsize=(10, 7))
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='center left', bbox_to_anchor=(1, 0.5), title='District')


# In[37]:


ax = district_year.drop('Sinazongwe', axis=1).plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# ## Focusing on the largest districts
# Again, can we simplify this by focusing on just the largest districts?

# In[38]:


# Same grouping as the monthly version, with 'Others' first so that it ends
# up at the bottom of the stacked plots.
d2_year = pd.DataFrame({
    'Others': district_year.drop(['Sinazongwe', 'Gwembe', 'Siavonga'], axis=1).sum(axis=1),
    'Siavonga': district_year['Siavonga'],
    'Gwembe': district_year['Gwembe'],
    'Sinazongwe': district_year['Sinazongwe'],
})
d2_year


# In[39]:


d2_year.plot.bar(stacked=True)


# In[40]:


d2_year.plot.area(xticks=list(range(2014, 2019)))


# # Cyclic changes
# What if we plot how cases each month change over the five years?

# In[41]:


malaria_raw['month'] = malaria_raw['date'].dt.month
malaria_raw.head()


# In[42]:


# Cases per calendar month (summed over all years) for each district.
# The aggregation is named by string; passing the np.sum callable is
# deprecated in recent pandas.
malaria_month_year = pd.pivot_table(malaria_raw, index='month', columns='district', values='cases', aggfunc='sum')
malaria_month_year


# In[43]:


malaria_month_year.plot()


# Normalise the data, so that scores are fraction of that district's cases in each month.

# In[44]:


mmy_norm = malaria_month_year / malaria_month_year.sum()
mmy_norm


# In[45]:


ax = mmy_norm.plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# and a very clear seasonal pattern emerges.

# # Proportions
# What can we see if we look at the proportions of reports by different categories?

# ## Rural, urban, and health centre type
# First, let's look at the rural/urban split, and the health reporter split (whether a health worker or a health facility).

# In[60]:


report_month = malaria_raw.pivot_table(index='date', columns=['ruralurban', 'report'], values='cases', aggfunc='sum')
report_month.head()


# In[73]:


# Divide each row by its total so the stacked areas show shares per date.
ax = report_month.divide(report_month.sum(axis=1), axis=0).plot.area()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# ## District
# Another look at districts. Which districts have the largest proportion of cases each month?
# In[72]:


# Share of each date's cases contributed by every district.
cases_per_date = district_month_sorted.sum(axis=1)
ax = district_month_sorted.divide(cases_per_date, axis=0).plot.area()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# In[100]:


# The same view by calendar month: each row is scaled by its own total.
cases_per_month = malaria_month_year.sum(axis=1)
ax = malaria_month_year.divide(cases_per_month, axis=0).plot.area()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# Share of all cases that falls in each category.

# In[105]:


cases_by_ruralurban = malaria_raw.groupby('ruralurban').cases.sum()
cases_by_ruralurban / cases_by_ruralurban.sum()


# In[106]:


cases_by_report = malaria_raw.groupby('report').cases.sum()
cases_by_report / cases_by_report.sum()


# In[107]:


cases_by_district = malaria_raw.groupby('district').cases.sum()
cases_by_district / cases_by_district.sum()


# While these are interesting graphs, they don't tell us much on their own. In particular, what is the base population in each of these categories? For instance, that only 4% of cases are urban could just be a reflection that 4% of the population is urban. Similarly, if Sinazongwe has 34% of the population, that would neatly explain the 34% of cases.

# In[ ]: