#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

sns.set_context('notebook')

seed_numbers = 20090425, 19700903

from datetime import timedelta, datetime


# Import data

# In[2]:

rotavirus_data = pd.read_csv('ROTA.csv')
rotavirus_data.index.is_unique


# Rename columns and recode data

# In[3]:

race_cols = [race for race in rotavirus_data.columns if race.startswith('Race')]
(rotavirus_data[race_cols]=='Checked').sum()


# In[4]:

rotavirus_data['white'] = (rotavirus_data['Race (choice=White)']=='Checked').astype(int)
rotavirus_data['black'] = (rotavirus_data['Race (choice=Black/African American)']=='Checked').astype(int)

rotavirus_data = rotavirus_data[(rotavirus_data.white + rotavirus_data.black)==1].drop(race_cols, axis=1)


# In[5]:

rotavirus_data['hispanic'] = (rotavirus_data.Ethnicity=='Hispanic or Latino').astype(int)


# In[6]:

rotavirus_data['rotavirus'] = (rotavirus_data['Result of rotavirus testing'] == 'Positive').astype(int)
rotavirus_data['sapovirus'] = (rotavirus_data['Result of sapovirus testing'] == 'Positive').astype(int)
rotavirus_data['astrovirus'] = (rotavirus_data['Result of astrovirus testing'] == 'Positive').astype(int)
rotavirus_data['norovirus'] = (rotavirus_data['Result of Norovirus Testing'] == 'GII').astype(int)

rotavirus_data = rotavirus_data.drop([col for col in rotavirus_data.columns
                                      if col.startswith('Result')], axis=1)


# In[7]:

rotavirus_data['Enrollment Date'] = pd.to_datetime(rotavirus_data['Enrollment Date'])
rotavirus_data['Date of Birth'] = pd.to_datetime(rotavirus_data['Date of Birth'])


# Calculate age, and compare to given age.

# In[8]:

rotavirus_data['age_months'] = (rotavirus_data['Enrollment Date']
                                - rotavirus_data['Date of Birth']
                                + timedelta(1)).astype('timedelta64[M]')


# In[9]:

(np.abs(rotavirus_data['Age in months (auto calculated) ']
        - rotavirus_data['age_months']) > 1).sum()


# Group into ages under 5 and 5 or over

# In[10]:

rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int)


# Recode study year so that the first study year is zero

# In[11]:

rotavirus_data['year'] = rotavirus_data.Year - 2
rotavirus_data.year.value_counts()


# Recode sex and stool collection

# In[12]:

rotavirus_data['male'] = rotavirus_data['Sex updated']=='Male'
rotavirus_data['stool_collected'] = rotavirus_data['Was stool collected?']=='Yes'


# In[13]:

# Drop extraneous columns
rotavirus_data = rotavirus_data.drop(['Date of Birth', 'Age in months (auto calculated) ',
                                      'Sex', 'Sex updated', 'Was stool collected?',
                                      'Ethnicity', 'Year'], axis=1)


# In[14]:

rotavirus_data = pd.concat([rotavirus_data,
                            pd.get_dummies(rotavirus_data.Setting)], axis=1).drop('Setting', axis=1)


# In[15]:

rotavirus_data['setting'] = rotavirus_data.IP.astype(int)
rotavirus_data.loc[rotavirus_data.OP==1, 'setting'] = 2
rotavirus_data.setting.value_counts()


# In[16]:

rotavirus_data.shape


# In[17]:

rotavirus_data.head()


# In[18]:

rotavirus_data.isnull().sum()


# In[19]:

rotavirus_data.describe()


# Aggregate data by age group and setting

# In[20]:

rotavirus_summary = (rotavirus_data.drop('Enrollment Date', axis=1)
                     .groupby(['under_5', 'year', 'setting', 'black', 'hispanic'])['Record ID'].count()
                     .reset_index()
                     .rename(columns={'Record ID':'cases'}))
rotavirus_summary.head()
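# A quick sanity check on the aggregation above (illustrative only): the summed case counts
# should match the number of enrolled records remaining after recoding.

# In[ ]:

rotavirus_summary.cases.sum(), rotavirus_data.shape[0]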
# ### Census data
# 
# The key for AGEGRP is as follows:
# 
# + 0 = Total
# + 1 = Age 0 to 4 years
# + 2 = Age 5 to 9 years
# + 3 = Age 10 to 14 years
# + 4 = Age 15 to 19 years
# + 5 = Age 20 to 24 years
# + 6 = Age 25 to 29 years
# + 7 = Age 30 to 34 years
# + 8 = Age 35 to 39 years
# + 9 = Age 40 to 44 years
# + 10 = Age 45 to 49 years
# + 11 = Age 50 to 54 years
# + 12 = Age 55 to 59 years
# + 13 = Age 60 to 64 years
# + 14 = Age 65 to 69 years
# + 15 = Age 70 to 74 years
# + 16 = Age 75 to 79 years
# + 17 = Age 80 to 84 years
# + 18 = Age 85 years or older

# In[21]:

davidson_census = pd.read_csv('davidson_county.csv')
davidson_census.head()


# Identify racial/ethnic subgroup populations

# In[22]:

census_data_child = davidson_census.loc[(davidson_census.YEAR>2010)
                                        & davidson_census.AGEGRP.isin([1,2,3,4])].copy()

census_data_child['HISPANIC_WHITE'] = census_data_child[['HWA_MALE', 'HWA_FEMALE']].sum(axis=1)
census_data_child['HISPANIC_BLACK'] = census_data_child[['HBA_MALE', 'HBA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_WHITE'] = census_data_child[['NHWA_MALE', 'NHWA_FEMALE']].sum(axis=1)
census_data_child['NONHISPANIC_BLACK'] = census_data_child[['NHBA_MALE', 'NHBA_FEMALE']].sum(axis=1)


# In[23]:

census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
               .groupby(['under_5','YEAR'])[['HISPANIC_WHITE', 'HISPANIC_BLACK',
                                             'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK']]
               .sum()).reset_index()

census_data['YEAR'] = census_data.YEAR - 2011

census_data


# Reshape data

# In[24]:

melted_census = pd.melt(census_data, id_vars=['under_5', 'YEAR'],
                        value_vars=['HISPANIC_WHITE', 'HISPANIC_BLACK',
                                    'NONHISPANIC_WHITE', 'NONHISPANIC_BLACK'])

census_clean = (melted_census.assign(hispanic=melted_census.variable.apply(lambda x: x.split('_')[0]),
                                     black=melted_census.variable.apply(lambda x: x.split('_')[1]))
                .drop('variable', axis=1)
                .replace({'black':{'BLACK':1, 'WHITE':0},
                          'hispanic':{'HISPANIC':1, 'NONHISPANIC':0}})
                .rename(columns={'YEAR':'year', 'value':'n'}))

census_clean


# In[25]:

census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)])


# In[26]:

merged_data = (rotavirus_summary.merge(census_full,
                                       on=['under_5', 'year', 'black', 'hispanic', 'setting'],
                                       how='right')
               .fillna(0).astype(int))


# In[27]:

merged_data.head()


# VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health

# In[28]:

inpatient_market_share = 0.94, 0.90, 0.83
ed_market_share = 0.95, 0.94, 0.92

Y2012, Y2013, Y2014 = 0, 1, 2


# Will estimate 2014 market shares by extrapolating a linear fit to the previous years.
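# Before fitting, the slope of such a linear trend can be previewed with `np.polyfit`
# (a quick cross-check of the extrapolation under the same straight-line assumption;
# illustrative only).

# In[ ]:

slope, _ = np.polyfit(range(3), inpatient_market_share, 1)
slope, inpatient_market_share[-1] + slope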
# In[29]:

from sklearn import linear_model


# In[30]:

inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share))

inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.values.reshape(-1, 1),
                                                    y=inpatient_df.y)

inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1]
                                                             + inpatient_fit.coef_[0]]
inpatient_market_share


# In[31]:

ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share))

ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.values.reshape(-1, 1), y=ed_df.y)

ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]]
ed_market_share


# Enrollment days/week for ED, inpatient, outpatient

# In[32]:

monitoring_days = 4, 7, 4

# Indices to each value
ED, INPATIENT, OUTPATIENT = 0, 1, 2

monitoring_rate = np.array(monitoring_days) / 7
monitoring_rate


# In[33]:

rotavirus_data[(rotavirus_data.hispanic==1)
               & (rotavirus_data.white==1)
               & (rotavirus_data.setting==2)
               & (rotavirus_data.year==2)
               & (rotavirus_data.under_5==1)].shape


# In[34]:

setting_data = dict(list(merged_data.groupby('setting')))


# ## Inpatient Model

# In[35]:

from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential
from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform
from pymc3 import traceplot, forestplot, summary, df_summary
import theano.tensor as tt


# In[36]:

def generate_inpatient_model(inpatient_data, pool_years=False):

    with Model() as model:

        under_5, cases, n = inpatient_data[['under_5', 'cases', 'n']].values.T

        # Probability that a true case is captured, given enrollment days/week and market share
        weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year]

        # Latent total case counts, bounded below by the observed cases and above by the population
        weighted_cases = DiscreteUniform('weighted_cases', cases, n,
                                         shape=inpatient_data.shape[0],
                                         testval=(cases/weights).astype(int))

        # Observed cases arise as a binomial sample of the latent counts
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        if pool_years:

            assert not len(cases) % 3
            pooled_shape = int(len(cases)/3)

            pooled_cases = Deterministic('pooled_cases',
                                         tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1))

            θ = Normal('θ', 0, sd=1000, shape=pooled_shape)
            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs',
                                Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases))

        else:

            θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0])
            λ = Deterministic('λ', tt.exp(θ))

            AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases))

    return model


# ### Full (unpooled) model

# In[37]:

inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1))


# In[38]:

with inpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[39]:

plt.subplots(figsize=(8,12))
forestplot(inpatient_full_trace[1000:], varnames=['λ'])


# In[40]:

label_map = {'under_5':{0:'5+', 1:'<5'},
             'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'},
             'black':{0:'white', 1:'black'},
             'hispanic':{0:'non-hispanic', 1:'hispanic'},
             'setting':{0:'ED', 1:'inpatient', 2:'outpatient'}}


# In[41]:

inpatient_data_labels = (setting_data[INPATIENT][['under_5', 'year', 'black', 'hispanic']]
                         .reset_index(drop=True)
                         .replace(label_map)
                         .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'}))


# Rate factor for population (rates are reported per 10,000)

# In[42]:

rate_factor = 10000


# In[43]:

rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(),
                                          names=inpatient_data_labels.columns)


# In[44]:

inpatient_data_labels


# In[45]:

from pymc3 import hpd

def _hpd_df(x, alpha):
    # Endpoints of the 100*(1-alpha)% highest posterior density interval
    cnames = ['hpd_{0:g}'.format(100 * alpha/2),
              'hpd_{0:g}'.format(100 * (1 - alpha/2))]
    return pd.DataFrame(hpd(x, alpha), columns=cnames)
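# A minimal illustration of the helper above on simulated draws (for exposition only):
# with alpha=0.5 it returns the endpoints of the 50% HPD interval in columns named
# 'hpd_25' and 'hpd_75'.

# In[ ]:

_hpd_df(np.random.randn(1000, 2), 0.5)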
# In[46]:

estimates = (df_summary(inpatient_full_trace[1000:], varnames=['λ'],
                        stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                                    lambda x: _hpd_df(x, 0.5)],
                        extend=True) * rate_factor).round(3)

estimates.index = inpatient_data_labels.index

pd.concat([inpatient_data_labels, estimates], axis=1)


# In[47]:

trace, labels = inpatient_full_trace, inpatient_data_labels


# In[48]:

index=['age', 'year']
columns=['race', 'ethnicity']
varnames=['λ']


# In[52]:

def generate_table(trace, labels, index, columns, varnames=['λ']):

    rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(),
                                              names=labels.columns)

    rate_df = (df_summary(trace[1000:], varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']]
    rate_df.index = rate_df_index

    pivot_all = pd.pivot_table(rate_df.reset_index(),
                               index=index, columns=columns)

    value_iterator = zip(pivot_all['mean'].values.flatten(),
                         pivot_all['hpd_2.5'].values.flatten(),
                         pivot_all['hpd_97.5'].values.flatten())

    rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator],
                              pivot_all['mean'].shape)

    return pd.DataFrame(rate_strings,
                        index=pivot_all['mean'].index,
                        columns=pivot_all['mean'].columns)


# In[53]:

generate_table(inpatient_full_trace, inpatient_data_labels,
               index=['age', 'year'], columns=['race', 'ethnicity'])


# In[54]:

inpatient_rate_samples = pd.concat(
    [inpatient_data_labels, pd.DataFrame(inpatient_full_trace['λ'][-1000:]).T], axis=1)


# In[55]:

inpatient_rate_samples = pd.melt(inpatient_rate_samples,
                                 id_vars=['age', 'year', 'race', 'ethnicity'])

inpatient_rate_samples['race/ethnicity'] = inpatient_rate_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)
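# The combined race/ethnicity label can equivalently be built with vectorized string
# concatenation instead of a row-wise `apply` (an alternative sketch; either form yields
# the same labels).

# In[ ]:

inpatient_rate_samples['race/ethnicity'] = (inpatient_rate_samples.race
                                            .str.cat(inpatient_rate_samples.ethnicity, sep=', '))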
# Plot of rates broken down by year/age/race/ethnicity.

# In[56]:

sns.set(style="ticks")

rateplot = sns.factorplot(x="year", y="rate", hue="age", row='race', col='ethnicity',
                          data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*10000),
                          palette="gray", size=5, aspect=.8, kind='box',
                          linewidth=0.6, fliersize=0)
rateplot.despine(left=True)


# ### Pooled across race/ethnicity

# In[57]:

inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1)
                            .groupby(['under_5', 'year'])[['cases','n']]
                            .sum()
                            .reset_index())
inpatient_pool_race_data


# In[58]:

inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data)


# In[59]:

with inpatient_model_pool_race:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_race_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[60]:

inpatient_pool_race_data_labels = (inpatient_pool_race_data[['under_5', 'year']]
                                   .reset_index(drop=True)
                                   .replace(label_map)
                                   .rename(columns={'under_5':'age'}))


# In[61]:

generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels,
               index=['year'], columns=['age'])


# In[62]:

inpatient_pool_race_samples = pd.concat(
    [inpatient_pool_race_data[['under_5', 'year']].reset_index(drop=True),
     pd.DataFrame(inpatient_pool_race_trace['λ'][-1000:]).T], axis=1)


# In[63]:

inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, id_vars=['under_5', 'year'])
                               .replace({'under_5':{0:'5+', 1:'<5'},
                                         'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                               .rename(columns={'under_5':'age group'}))


# Plot of rates pooled across race/ethnicity

# In[64]:

sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)


# ### Pooled across years

# In[65]:

inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1)
                            .sort_values(by=['under_5', 'black', 'hispanic', 'year'],
                                         ascending=False)).reset_index(drop=True)
inpatient_pool_year_data


# In[66]:

inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True)


# In[67]:

with inpatient_model_pool_year:
    start = find_MAP()
    step = NUTS(scaling=start)
    inpatient_pool_year_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[68]:

inpatient_pool_year_labels = (inpatient_pool_year_data[['under_5', 'black', 'hispanic']]
                              .reset_index(drop=True)
                              .replace(label_map)
                              .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age'})
                              .drop_duplicates().reset_index(drop=True))


# In[69]:

estimates_pooled = (df_summary(inpatient_pool_year_trace[1000:], varnames=['λ'],
                               stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'),
                                           lambda x: _hpd_df(x, 0.5)],
                               extend=True) * rate_factor).round(3)

estimates_pooled.index = inpatient_pool_year_labels.index

pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1)


# In[70]:

generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels,
               index=['race','ethnicity'], columns=['age'])
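# The pooled model estimates one rate per age/race/ethnicity stratum, so the de-duplicated
# label frame should have one row per element of λ (a quick consistency check; illustrative
# only).

# In[ ]:

len(inpatient_pool_year_labels), inpatient_pool_year_trace['λ'].shape[1]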
# In[71]:

inpatient_pool_year_samples = pd.concat(
    [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['under_5', 'black', 'hispanic']],
     pd.DataFrame(inpatient_pool_year_trace['λ'][-1000:]).T], axis=1)


# In[72]:

inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['under_5', 'black', 'hispanic'])
                               .replace({'under_5':{0:'5+', 1:'<5'},
                                         'black':{0:'white', 1:'black'},
                                         'hispanic':{0:'non-hispanic', 1:'hispanic'}})
                               .rename(columns={'black':'race', 'hispanic':'ethnicity', 'under_5':'age group'}))

inpatient_pool_year_samples['race/ethnicity'] = inpatient_pool_year_samples.apply(lambda x: x.race + ', ' + x.ethnicity, axis=1)


# Plot of rates pooled by year

# In[73]:

sns.set_context(font_scale=1.5)
plt.subplots(figsize=(12,6))
sns.boxplot(x="race/ethnicity", y="rate", hue="age group",
            data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*10000),
            palette="gray_r", linewidth=0.6, fliersize=0)


# ## Outpatient Model
# 
# Import outpatient denominator information.

# In[74]:

from redcap import Project

api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read()


# In[75]:

outpatient_project = Project(api_url, api_key)


# In[76]:

outpatients = outpatient_project.export_records(format='df')


# In[77]:

outpatients.groupby('year').year.count()


# In[78]:

outpatients[(outpatients.ptage<5) & (outpatients.ptethnicity=='HL') & (outpatients.year==2012)].shape


# In[79]:

outpatients_kids = outpatients[(outpatients.ptage<=18)
                               & (outpatients.ptrace.isin(['B','W']))
                               & (outpatients.ptethnicity.isin(['NH', 'HL']))].copy()


# In[80]:

outpatients_kids.ptrace.value_counts()


# In[81]:

outpatients_kids.ptethnicity.value_counts()


# In[82]:

outpatients_kids['under_5'] = (outpatients_kids.ptage<5).astype(int)
outpatients_kids['black'] = (outpatients_kids.ptrace=='B').astype(int)
outpatients_kids['hispanic'] = (outpatients_kids.ptethnicity=='HL').astype(int)


# In[83]:

outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'})
                .groupby(['year', 'black', 'hispanic', 'under_5'])
                .n.count()
                .reset_index())
outpatient_n


# Interpolate outpatient denominators for 2013

# In[84]:

interp_2013 = (outpatient_n.groupby(['black','hispanic','under_5'])
               .apply(lambda x: np.interp(2013, x.year, x.n))
               .astype(int)
               .reset_index()).assign(year=2013).rename(columns={0:'n'})
interp_2013


# In[85]:

outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0)
outpatient_n = outpatient_n[outpatient_n.year<2015]
outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012)
outpatient_n


# In[86]:

outpatient_data_full = setting_data[OUTPATIENT].drop(['setting','n'], axis=1)


# In[87]:

outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['black','hispanic','under_5','year'])


# In[88]:

outpatient_merged


# In[89]:

def generate_outpatient_model(outpatient_data, pool_years=False):

    with Model() as model:

        cases, n = outpatient_data[['cases','n']].values.T

        # Enrollment-coverage adjustment
        ω = monitoring_rate[INPATIENT]

        shape = outpatient_data.shape[0]

        if pool_years:

            assert not len(cases) % 3
            shape = int(len(cases)/3)
            cases = cases.reshape((shape, 3)).sum(-1)

        # Per-visit positivity, thinned by the enrollment-coverage adjustment
        π = Beta('π', 1, 10, shape=shape)

        AGE_obs = Binomial('AGE_obs', n, π*ω, observed=cases)

    return model


# In[90]:

outpatient_model_full = generate_outpatient_model(outpatient_merged)


# In[91]:

with outpatient_model_full:
    start = find_MAP()
    step = NUTS(scaling=start)
    outpatient_full_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[92]:

plt.subplots(figsize=(8,12))
forestplot(outpatient_full_trace[1000:], varnames=['π'])


# In[93]:

outpatient_data_labels = inpatient_data_labels[inpatient_data_labels.year.isin(['2012/13','2013/14'])]


# In[94]:

generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['π'],
               index=['age', 'year'], columns=['race', 'ethnicity'])
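# The Beta(1, 10) prior on π in the outpatient model has prior mean 1/(1+10) ≈ 0.09,
# so it gently favours low per-visit positivity; a quick numerical check of that prior
# mean (illustrative only).

# In[ ]:

from scipy import stats

stats.beta(1, 10).mean()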
# ## ED Model

# In[95]:

ED2 = pd.read_excel('FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)
ED3 = pd.read_excel('Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0)

ED2.columns = ED3.columns


# In[96]:

ED2_total = ED2.sum()

ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])

ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])


# In[97]:

ED3_total = ED3.sum()

ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'],
                ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)',
                           'Total Eligible on Date (Age > 11 Years and < 18 Years)']])

ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'],
              ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)',
                         'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']])


# In[98]:

ED_all = pd.concat([ED2, ED3])

eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')]


# In[99]:

ED_counts = ED_all[eligible_cols].sum(1).astype(int)


# In[100]:

rotavirus_ed = rotavirus_data[rotavirus_data.ED==1]

surveillance_counts = rotavirus_data.groupby(['Enrollment Date'])['Record ID'].count()


# In[101]:

ED_counts.name = 'count24'
surveillance_counts.name = 'count8'


# In[102]:

surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna()

# Drop days where the 8-hour count exceeds the 24-hour count
surveillance = surveillance[surveillance.count8<=surveillance.count24]


# Here is what it looks like if you resample to weekly

# In[103]:

weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W').sum()

(weekly_surveillance.count8 / weekly_surveillance.count24).hist()


# We will just use the proportion of the pooled surveillance

# In[104]:

surveillance_total = surveillance.sum()
surveillance_total


# In[105]:

surveillance_proportion = surveillance_total.count8 / surveillance_total.count24
surveillance_proportion


# In[106]:

def generate_ed_model(ED_data):

    with Model() as model:

        data_len = ED_data.shape[0]

        cases, n = ED_data[['cases','n']].values.T

        # Probability that a true case is captured, given enrollment days/week,
        # ED market share, and the 8-hour/24-hour surveillance proportion
        weights = (np.array(monitoring_rate)[ED]
                   * np.array(ed_market_share[:-1])[ED_data.year]
                   * surveillance_proportion)

        # Latent total case counts, bounded below by the observed cases
        weighted_cases = DiscreteUniform('weighted_cases', cases, n, shape=data_len,
                                         testval=(cases/weights).astype(int))

        # Observed cases arise as a binomial sample of the latent counts
        weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases)

        θ = Normal('θ', 0, sd=1000, shape=data_len)
        λ = Deterministic('λ', tt.exp(θ))

        age_vanderbilt = Poisson('age_vanderbilt', λ * n, observed=weighted_cases)

    return model


# In[107]:

ed_pool_race_data = (setting_data[ED].drop('setting', axis=1)
                     .query('year<2')
                     .groupby(['under_5', 'year'])[['cases','n']]
                     .sum()
                     .reset_index())
ed_pool_race_data


# In[108]:

ed_model = generate_ed_model(ed_pool_race_data)


# In[109]:

with ed_model:
    start = find_MAP()
    step = NUTS(scaling=start)
    ed_trace = sample(2000, step, start, njobs=2, random_seed=seed_numbers)


# In[110]:

forestplot(ed_trace[1000:], varnames=['λ'])


# In[111]:

ed_data_labels = (ed_pool_race_data[['under_5', 'year']]
                  .reset_index(drop=True)
                  .replace(label_map)
                  .rename(columns={'under_5':'age'}))


# In[112]:

generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age'])


# Create data frame of MCMC samples for plotting

# In[120]:

ed_rate_samples = pd.concat([ed_pool_race_data[['under_5', 'year']],
                             pd.DataFrame(ed_trace['λ'][-1000:]).T], axis=1)


# In[121]:

ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['under_5', 'year'])
                   .replace({'under_5':{0:'5+', 1:'<5'},
                             'year':{0:'2012/13', 1:'2013/14'}})
                   .rename(columns={'under_5':'age group'}))
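# Posterior median ED rates per 10,000, summarised directly from the samples as a quick
# numerical companion to the table above (illustrative only).

# In[ ]:

(ed_rate_samples.assign(rate=ed_rate_samples.value * rate_factor)
 .groupby(['age group', 'year']).rate.median())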
# In[122]:

sns.set_context(font_scale=1.5)
sns.boxplot(x="year", y="rate", hue="age group",
            data=ed_rate_samples.assign(rate=ed_rate_samples.value*10000),
            palette="gray", linewidth=0.6, fliersize=0)


# ## Virus rates

# In[ ]:

census_data_virus = (census_data_child.assign(under_5=census_data_child.AGEGRP==1)
                     .groupby(['under_5','YEAR'])['TOT_POP'].sum()).reset_index()

census_data_virus['YEAR'] = census_data_virus.YEAR - 2011


# In[ ]:

census_virus_clean = census_data_virus.rename(columns={'YEAR':'year', 'TOT_POP':'n'})
census_virus_clean.head()


# In[ ]:

virus_summary = (rotavirus_data.groupby(['under_5', 'year', 'setting',
                                         'rotavirus', 'sapovirus', 'astrovirus', 'norovirus'])['Record ID'].count()
                 .reset_index()
                 .rename(columns={'Record ID':'cases'}))
virus_summary.head()


# In[ ]:

merged_virus_data = virus_summary.merge(census_virus_clean, on=['under_5', 'year'], how='left')
merged_virus_data.head(10)


# In[ ]:

rota_data = (merged_virus_data[merged_virus_data.rotavirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max}))

sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max})
             .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

astro_data = (merged_virus_data[merged_virus_data.astrovirus==1]
              .groupby(['under_5', 'year', 'setting'])
              .agg({'cases':sum, 'n':max})
              .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))

noro_data = (merged_virus_data[merged_virus_data.norovirus==1]
             .groupby(['under_5', 'year', 'setting'])
             .agg({'cases':sum, 'n':max})
             .reindex_like(rota_data).fillna({'cases':0}).fillna(method='pad'))


# In[ ]:

virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'),
                          noro_data.reset_index().assign(virus='norovirus'),
                          rota_data.reset_index().assign(virus='rotavirus'),
                          sapo_data.reset_index().assign(virus='sapovirus')])

virus_totals['cases'] = virus_totals.cases.astype(int)
virus_totals['n'] = virus_totals.n.astype(int)

virus_totals.head(10)


# In[ ]:

n_lookup = virus_totals.groupby(['under_5', 'year', 'virus']).agg({'n':max})
n_lookup


# In[ ]:

with Model() as virus_model:

    n_total = int(n_lookup.shape[0])

    # Probability that a stool sample was collected from an enrolled child
    p_enroll = Beta('p_enroll', 1, 1, testval=0.9)

    enrolled = Binomial('enrolled', n=rotavirus_data.shape[0], p=p_enroll,
                        observed=rotavirus_data.stool_collected.sum())

    # Probability that a true case is captured, by setting and year
    weights = (np.array(monitoring_rate)[virus_totals.setting]
               * np.array(market_share)[virus_totals.year])

    weighted_cases = DiscreteUniform('weighted_cases', virus_totals.cases, virus_totals.n,
                                     shape=virus_totals.shape[0],
                                     testval=np.maximum((virus_totals.cases.values/weights).astype(int), 1))

    p_weight = Deterministic('p_weight', weights*p_enroll)

    weighting = Binomial('weighting', n=weighted_cases, p=p_weight, observed=virus_totals.cases)

    # Sum the latent counts over the three settings within each stratum
    weighted_cases_total = Deterministic('weighted_cases_total',
                                         weighted_cases.reshape((n_total, 3)).sum(1))

    θ = Normal('θ', 0, sd=1000, shape=n_total, testval=np.ones(n_total))
    λ = Deterministic('λ', tt.exp(θ))

    # λ is a rate per 10,000 population, since the Poisson mean is λ·(n/10,000)
    virus_like = Potential('virus_like',
                           Poisson.dist(λ * (n_lookup.n/10000.)).logp(weighted_cases_total))


# In[ ]:

with virus_model:
    virus_trace = sample(20000, njobs=2, random_seed=seed_numbers)


# In[ ]:

traceplot(virus_trace[10000:], varnames=['p_enroll'])


# In[ ]:

forestplot(virus_trace[10000:], varnames=['weighted_cases_total'])


# In[ ]:

virus_samples = pd.concat([n_lookup.reset_index()[['under_5', 'year', 'virus']],
                           pd.DataFrame(virus_trace['λ'][-1000:]).T], axis=1)
# In[ ]:

virus_samples = (pd.melt(virus_samples, id_vars=['under_5', 'year', 'virus'])
                 .replace({'under_5':{0:'5+', 1:'<5'},
                           'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}})
                 .rename(columns={'under_5':'age group'}))


# In[ ]:

sns.set(style="ticks")

rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus',
                          data=virus_samples.rename(columns={'value':'rate'}),
                          palette="PRGn", size=6, aspect=.75, kind='box',
                          linewidth=0.6, col_wrap=2)
rateplot.despine(left=True)