Analysis and visualisation of (simulated) malaria cases for Makeover Monday.
Data from VisualizeNoMalaria via Makeover Monday.
import collections
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats
Load the spreadsheet, drop the disclaimer column, and rename the remaining columns while we're here.
# Read the workbook and discard its 'Disclaimer' column.
malaria_raw = pd.read_excel(
    'Simulated VisualizeNoMalaria Counts.xlsx'
).drop(columns='Disclaimer')
# Names are assigned positionally, so this list must follow the order of the
# columns that remain after the drop.
malaria_raw.columns = [
    'country',
    'province',
    'district',
    'ruralurban',
    'date',
    'report',
    'cases',
]
malaria_raw.head()
Count how many rows there are for each value of the categorical columns.
# Frequency table of the values in each categorical column, one column at a time.
malaria_raw['country'].value_counts()
malaria_raw['province'].value_counts()
malaria_raw['district'].value_counts()
malaria_raw['ruralurban'].value_counts()
malaria_raw['report'].value_counts()
Country and province don't add any useful information, so break the rows down by district and rural/urban setting instead.
# Number of rows for every (district, ruralurban) combination present in the data.
(
    malaria_raw
    .groupby(['district', 'ruralurban'])
    .size()
)
Just a quick few plots to see what the data looks like.
# Every aggregation below passes numeric_only=True explicitly. Since pandas 2.0
# GroupBy.sum() defaults to numeric_only=False and would try to sum the string
# (and date) columns as well, instead of silently dropping them as older
# versions did.

# Totals per date.
malaria_raw.groupby('date').sum(numeric_only=True).plot()
# Totals per date, one line per value of `report`.
malaria_raw.groupby(['date', 'report']).sum(numeric_only=True).unstack().plot()
# Totals per date, one line per district; the legend is anchored outside the
# axes, to the right, so it does not cover the lines.
ax = malaria_raw.groupby(['date', 'district']).sum(numeric_only=True).unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Districts ranked by total cases (ascending). Without numeric_only=True the
# datetime `date` column would also be passed to sum() here.
malaria_raw.groupby('district').sum(numeric_only=True).sort_values(by='cases')
# Totals per date, one line per value of `ruralurban`.
ax = malaria_raw.groupby(['date', 'ruralurban']).sum(numeric_only=True).unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Totals per date for every (district, report) pair: the last two index levels
# are moved into the columns, and a larger figure keeps the many lines legible.
ax = (
    malaria_raw
    .groupby(['date', 'district', 'report'])
    .sum(numeric_only=True)
    .unstack([-2, -1])
    .plot(figsize=(15, 15))
)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));
Atypical things are happening in 2014. Let's look at that year on its own, and at 2015 onwards separately.
# Split the data by calendar year of `date`: 2014 alone, and 2015 onwards.
malaria_2014 = malaria_raw[malaria_raw.date.dt.year == 2014]
malaria_2014.head()
malaria_2015p = malaria_raw[malaria_raw.date.dt.year >= 2015]
malaria_2015p.head()

# The sums below pass numeric_only=True explicitly. Since pandas 2.0
# GroupBy.sum() defaults to numeric_only=False and would otherwise try to sum
# the string columns too, instead of dropping them as older versions did.

# 2014: totals per date, one line per district, legend outside the axes.
ax = malaria_2014.groupby(['date', 'district']).sum(numeric_only=True).unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));
# 2014: totals per date, one line per (district, report) pair.
ax = (
    malaria_2014
    .groupby(['date', 'district', 'report'])
    .sum(numeric_only=True)
    .unstack([-2, -1])
    .plot()
)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));
# 2015 onwards: totals per date, one line per district.
ax = malaria_2015p.groupby(['date', 'district']).sum(numeric_only=True).unstack().plot()
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));
# 2015 onwards: totals per date, one line per (district, report) pair.
ax = (
    malaria_2015p
    .groupby(['date', 'district', 'report'])
    .sum(numeric_only=True)
    .unstack([-2, -1])
    .plot()
)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));
Sinazongwe is an outlier. Let's look at just that, and everything except Sinazongwe.
# Rows for the Sinazongwe district only.
malaria_sina = malaria_raw[malaria_raw.district == 'Sinazongwe']
malaria_sina.head()
# Rows for every other district.
malaria_not_sina = malaria_raw[malaria_raw.district != 'Sinazongwe']
malaria_not_sina.head()
# Totals per date for the non-Sinazongwe rows, one line per value of `report`.
# numeric_only=True is explicit because, since pandas 2.0, GroupBy.sum()
# defaults to numeric_only=False and would otherwise try to sum the string
# columns too.
malaria_not_sina.groupby(['date', 'report']).sum(numeric_only=True).unstack().plot()