#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import seaborn as sns import matplotlib.pylab as plt sns.set_context('notebook') seed_numbers = 20090425, 19700903 from datetime import timedelta, datetime # # Data import and cleanup # In[2]: # rotavirus_data = pd.read_csv('data/AGE Year 2-4 with Final Rota.csv', low_memory=False) rotavirus_data = pd.read_csv('data/AGE Cases and Healthy Controls Merged with Lab Results Year 2 to 4 v05.23.2017.csv') rotavirus_data.index.is_unique # In[3]: rotavirus_data.head() # Rename columns and recode data # In[4]: rotavirus_data.race.value_counts() # In[5]: rotavirus_data['white'] = rotavirus_data['race'] == 1 rotavirus_data['black'] = rotavirus_data['race'] == 2 # In[6]: rotavirus_data['astro'].value_counts().index # In[7]: # rotavirus_data['rotavirus'] = (rotavirus_data['rotacdc'] == 'Positive').astype(float) # rotavirus_data['sapovirus'] = (rotavirus_data['sapo'] == 'Positive').astype(float) # rotavirus_data['astrovirus'] = (rotavirus_data['astro'] == 'Positive').astype(float) # rotavirus_data['norovirus'] = (rotavirus_data['noro'] == 'Positive').astype(float) # rotavirus_data.loc[rotavirus_data['rotacdc'].isnull() | (rotavirus_data.rotacdc=='Inhibition') | (rotavirus_data.specimencol=='No'), # 'rotavirus'] = np.nan # rotavirus_data.loc[rotavirus_data['sapo'].isnull() | (rotavirus_data.sapo=='Inhibition') | (rotavirus_data.specimencol=='No'), # 'sapovirus'] = np.nan # rotavirus_data.loc[rotavirus_data['astro'].isnull() | (rotavirus_data.astro=='Inhibition') | (rotavirus_data.specimencol=='No'), # 'astrovirus'] = np.nan # rotavirus_data.loc[rotavirus_data['noro'].isnull() | (rotavirus_data.noro=='Inhibition') | (rotavirus_data.specimencol=='No'), # 'norovirus'] = np.nan # In[8]: rotavirus_data=rotavirus_data.rename(columns={'rotacdc':'rotavirus', 'sapo':'sapovirus', 'astro':'astrovirus', 'noro':'norovirus'}) # In[9]: rotavirus_data.rotavirus.value_counts() # In[10]: rotavirus_data.rotavu.value_counts() # In[11]: virus_cols = ['rotavirus','sapovirus','astrovirus','norovirus'] # Proportion of test results missing # In[12]: rotavirus_data[virus_cols].isnull().mean() # In[13]: rotavirus_data['scrdate'] = pd.to_datetime(rotavirus_data['scrdate']) rotavirus_data['dob'] = pd.to_datetime(rotavirus_data['dob']) # In[14]: astro_data = rotavirus_data[rotavirus_data.astrovirus==1].dropna(subset=['astro_ct']) astro_data.shape # In[15]: noro_data = rotavirus_data[rotavirus_data.norovirus==1].dropna(subset=['noro_ct']) noro_data.shape # In[16]: noro_data['coinfect'] = noro_data[['rotavirus', 'sapovirus', 'astrovirus']].sum(1) # In[17]: astro_data['coinfect'] = astro_data[['rotavirus', 'sapovirus', 'norovirus']].sum(1) # In[18]: with_coinf = noro_data.noro_ct[noro_data.coinfect==1].astype(float) no_coinf = noro_data.noro_ct[(noro_data.coinfect==0)].astype(float) # with_coinf = astro_data.astro_ct[astro_data.coinfect==1].astype(float) # no_coinf = astro_data.astro_ct[(astro_data.coinfect==0)].astype(float) # In[19]: from scipy.stats import ranksums ranksums(with_coinf, no_coinf) # In[20]: rotavirus_data.sapo_ct.dropna().astype(float).hist() # Calculate age, and compare to given age. # In[21]: rotavirus_data['age_months'] = (rotavirus_data['scrdate'] - rotavirus_data['dob'] + timedelta(1)).astype('timedelta64[M]') # In[22]: rotavirus_data.age_months.hist(); # Group into ages under 5 and 5 or over # In[23]: rotavirus_data['under_5'] = ((rotavirus_data.age_months / 12) < 5).astype(int) # In[24]: rotavirus_data['under_2'] = ((rotavirus_data.age_months / 12) < 2).astype(int) # In[25]: rotavirus_data.loc[rotavirus_data.caseid=='EN1I0141', ['under_5', 'under_2']] # In[26]: rotavirus_data['age_group'] = 2 rotavirus_data.loc[rotavirus_data.under_5==1, 'age_group'] = 1 rotavirus_data.loc[rotavirus_data.under_2==1, 'age_group'] = 0 # In[27]: rotavirus_data.age_group.value_counts() # In[28]: rotavirus_data.year.value_counts() # Calculate natural year # In[29]: rotavirus_data['year'] = rotavirus_data.year - 2 # Rename setting variable # In[30]: assert not rotavirus_data.settingnew.isnull().sum() # In[31]: rotavirus_data['setting'] = rotavirus_data.settingnew - 1 # In[32]: # Indices to each value INPATIENT, ED, OUTPATIENT, HC = 0, 1, 2, 3 # rotavirus_data['setting'] = rotavirus_data.provider.replace({1:INPATIENT, 2:ED, 3:OUTPATIENT, 4:HC}) # Recode sex and stool # In[33]: assert not rotavirus_data['specimencol'].isnull().sum() # In[34]: rotavirus_data['male'] = rotavirus_data['sex']==1 rotavirus_data['stool_collected'] = rotavirus_data['specimencol']==1 # Recode insurance # In[35]: rotavirus_data.insurance.value_counts() # In[36]: rotavirus_data['public_ins'] = rotavirus_data.insurance.isin({1, 3}) rotavirus_data['private_ins'] = rotavirus_data.insurance.isin({2, 3}) rotavirus_data.loc[rotavirus_data.insurance==8, ['public_ins', 'private_ins']] = np.nan # In[37]: # Drop extraneous columns rotavirus_data = rotavirus_data.drop(['intvwdate', 'dob', 'provider', 'coldat', 'tdat', 'insurance', 'sex', 'hispcr', 'mdegree_7', 'rvacver_7', 'rvacver_7r', 'race', 'race2_8'], axis=1) # Summary of counts by setting # In[38]: rotavirus_data.shape # Summary of missing values # In[39]: rotavirus_data.isnull().sum() # Aggregate data by age group and setting # In[40]: rotavirus_summary = (rotavirus_data.drop('scrdate', axis=1) .groupby(['age_group', 'year', 'setting', 'black'])['caseid'].count() .reset_index() .rename(columns={'caseid':'cases'})) rotavirus_summary.head() # ### Census data # # The key for AGEGRP is as follows: # # + 0 = Total # + 1 = Age 0 to 4 years # + 2 = Age 5 to 9 years # + 3 = Age 10 to 14 years # + 4 = Age 15 to 19 years # + 5 = Age 20 to 24 years # + 6 = Age 25 to 29 years # + 7 = Age 30 to 34 years # + 8 = Age 35 to 39 years # + 9 = Age 40 to 44 years # + 10 = Age 45 to 49 years # + 11 = Age 50 to 54 years # + 12 = Age 55 to 59 years # + 13 = Age 60 to 64 years # + 14 = Age 65 to 69 years # + 15 = Age 70 to 74 years # + 16 = Age 75 to 79 years # + 17 = Age 80 to 84 years # + 18 = Age 85 years or older # In[41]: davidson_census = pd.read_csv('data/davidson_county.csv') davidson_census.head() # Number under 18 and under 5 from [2010 census](http://www.census.gov/quickfacts/table/PST045215/47037,47): # In[42]: under_18_2010 = 136612 under_5_2010 = 44493 over_4_under_18 = under_18_2010 - under_5_2010 over_4_under_18 # Children 5-19 from census dataset: # In[43]: under_20 = davidson_census.loc[(davidson_census.YEAR==2010) & davidson_census.AGEGRP.isin([2,3,4]), 'TOT_POP'].sum() under_20 # Proportion over 4 but also under 18: # In[44]: pct_under_18 = over_4_under_18 / under_20 pct_under_18 # Identify racial/ethnic subgroup populations # In[45]: census_data_child = davidson_census.loc[(davidson_census.YEAR>2010) & davidson_census.AGEGRP.isin([1,2,3,4])].copy() census_data_child['WHITE'] = census_data_child[['WA_MALE', 'WA_FEMALE']].sum(axis=1) census_data_child['BLACK'] = census_data_child[['BA_MALE', 'BA_FEMALE']].sum(axis=1) # In[46]: census_data = (census_data_child.assign(under_5=census_data_child.AGEGRP==1) .groupby(['under_5','YEAR'])[['WHITE', 'BLACK']] .sum()).reset_index() census_data['year'] = census_data.YEAR - 2011 census_data # Correct over-4 counts # In[47]: census_data.loc[~census_data.under_5, ['WHITE', 'BLACK']] *= pct_under_18 # Add "under 2" category that is 40% of `under_5` # In[48]: census_data['under_2'] = False # In[49]: census_under_2 = census_data[census_data.under_5].copy() census_under_2.under_2 = True census_under_2.under_5 = False census_under_2.WHITE = (census_under_2.WHITE*0.4).astype(int) census_under_2.BLACK = (census_under_2.BLACK*0.4).astype(int) census_under_2 # Subtract under-2's from under-5's # In[50]: census_data.loc[census_data.under_5, ['WHITE', 'BLACK']] -= census_under_2[['WHITE', 'BLACK']] # In[51]: census_data = (pd.concat([census_data, census_under_2]) .reset_index(drop=True)[['under_2', 'under_5', 'year', 'WHITE', 'BLACK']] .rename(columns={'under_5':'2_to_4'})) census_data # In[52]: census_data[['WHITE', 'BLACK']] = census_data[['WHITE', 'BLACK']].astype(int) census_data # In[53]: census_data['age_group'] = 2 census_data.loc[census_data['under_2'], 'age_group'] = 0 census_data.loc[census_data['2_to_4'], 'age_group'] = 1 census_data # Reshape data # In[54]: melted_census = (pd.melt(census_data, id_vars=['age_group', 'year'], value_vars=['WHITE', 'BLACK'])) census_clean = (melted_census.assign(black=(melted_census.variable=='BLACK').astype(int)) .drop('variable', axis=1) .rename(columns={'value':'n'})) census_clean # In[55]: census_full = pd.concat([census_clean.assign(setting=i) for i in range(3)]) # In[56]: merged_data = (rotavirus_summary.merge(census_full, on=['age_group', 'year', 'black', 'setting'], how='right') .fillna(0).astype(int)) # VUMC market share for 2011-13, excluding Normal Newborns, Neonates, Women’s Health, and Behavioral Health # In[57]: inpatient_market_share = 0.94, 0.90, 0.83 ed_market_share = 0.60, 0.59, 0.62 Y2012, Y2013, Y2014 = 0, 1, 2 # Will estimate 2014 market shares by extrapolating a linear fit to the previous years. # In[58]: from sklearn import linear_model # In[59]: inpatient_df = pd.DataFrame(dict(x=range(3), y=inpatient_market_share)) inpatient_fit = linear_model.LinearRegression().fit(X=inpatient_df.x.values.reshape(-1, 1), y=inpatient_df.y) inpatient_market_share = list(inpatient_market_share[1:]) + [inpatient_market_share[-1] + inpatient_fit.coef_[0]] inpatient_market_share # In[60]: ed_df = pd.DataFrame(dict(x=range(3), y=ed_market_share)) ed_fit = linear_model.LinearRegression().fit(X=ed_df.x.values.reshape(-1, 1), y=ed_df.y) ed_market_share = list(ed_market_share[1:]) + [ed_market_share[-1] + ed_fit.coef_[0]] ed_market_share # Enrollment days/week for inpatient, outpatient, ED # In[61]: monitoring_days = 7, 4, 4 # Only 5.5 days of intake on outpatient monitoring_rate = np.array(monitoring_days)/np.array([7, 5.5, 7]) monitoring_rate # In[62]: rotavirus_data[(rotavirus_data.white==1) & (rotavirus_data.setting==2) & (rotavirus_data.year==2) & (rotavirus_data.under_5==1)].shape # In[63]: setting_data = dict(list(merged_data.groupby('setting'))) # # Inpatient AGE Model # In[64]: from pymc3 import Model, Deterministic, sample, NUTS, find_MAP, Potential, Metropolis from pymc3 import Binomial, Beta, Poisson, Normal, DiscreteUniform, Uniform, Flat, Gamma from pymc3 import traceplot, forestplot, summary, df_summary import theano.tensor as tt # In[65]: setting_data[INPATIENT].drop('setting', axis=1).head() # In[66]: inpatient_data = setting_data[INPATIENT].drop('setting', axis=1) inpatient_data[['cases', 'n']] # In[67]: def generate_inpatient_model(inpatient_data, pool_years=False): assert not inpatient_data.isnull().sum().any() with Model() as model: cases, n = inpatient_data[['cases', 'n']].values.T # Cases weighted by monitoring effort and market share weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[inpatient_data.year] # Uniform prior on weighted cases weighted_cases = Uniform('weighted_cases', cases, n, shape=inpatient_data.shape[0]) # testval=(cases/weights).astype(int)) # Likelihood of observed cases weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases) if pool_years: assert not len(cases) % 3 pooled_shape = int(len(cases)/3) pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1)) θ = Normal('θ', 0, sd=1000, shape=pooled_shape) λ = Deterministic('λ', tt.exp(θ)) AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases)) else: θ = Normal('θ', 0, sd=1000, shape=inpatient_data.shape[0]) # Poisson rate parameter λ = Deterministic('λ', tt.exp(θ)) # Poisson likelihood AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases)) return model # ### Full (unpooled) model # In[68]: inpatient_model_full = generate_inpatient_model(setting_data[INPATIENT].drop('setting', axis=1)) # In[69]: iterations = 1000 tune = 1000 # In[70]: with inpatient_model_full: inpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers) # In[71]: forestplot(inpatient_full_trace, varnames=['λ']) # In[72]: label_map = {'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}, 'black':{0:'white', 1:'black'}, 'setting':{0:'inpatient', 1:'ED', 2:'outpatient'}} # In[73]: inpatient_data_labels = (setting_data[INPATIENT][['age_group', 'year', 'black']] .reset_index(drop=True) .replace(label_map) .rename(columns={'black':'race', 'age_group':'age'})) # Rate factor for population # In[74]: rate_factor = 10000 # In[75]: rate_df_index = pd.MultiIndex.from_tuples(inpatient_data_labels.values.tolist(), names=inpatient_data_labels.columns) # In[76]: from pymc3 import hpd def _hpd_df(x, alpha): cnames = ['hpd_{0:g}'.format(100 * alpha/2), 'hpd_{0:g}'.format(100 * (1 - alpha/2))] return pd.DataFrame(hpd(x, alpha), columns=cnames) # In[77]: estimates = (df_summary(inpatient_full_trace, varnames=['λ'], stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'), lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3) estimates.index = inpatient_data_labels.index # In[78]: trace, labels = inpatient_full_trace, inpatient_data_labels # In[79]: def generate_table(trace, labels, index, columns, varnames=['λ']): rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), names=labels.columns) rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']] rate_df.index = rate_df_index pivot_all = pd.pivot_table(rate_df .reset_index(), index=index, columns=columns) value_iterator = zip(pivot_all['mean'].values.flatten(), pivot_all['hpd_2.5'].values.flatten(), pivot_all['hpd_97.5'].values.flatten()) rate_strings = np.reshape(['{0:.2f} ({1:.2f}, {2:.2f})'.format(*vals) for vals in value_iterator], pivot_all['mean'].shape) return pd.DataFrame(rate_strings, index=pivot_all['mean'].index, columns=pivot_all['mean'].columns) # In[80]: inpatient_full_table = generate_table(inpatient_full_trace, inpatient_data_labels, index=['age', 'year'], columns=['race']).sort_index() inpatient_full_table.to_excel('results/inpatient_full.xlsx') inpatient_full_table # In[81]: inpatient_rate_samples = pd.concat( [inpatient_data_labels, pd.DataFrame(inpatient_full_trace['λ']).T], axis=1) # In[82]: inpatient_rate_samples = (pd.melt(inpatient_rate_samples, id_vars=['age', 'year', 'race'])) # Plot of rates broken down by year/age/race/ethnicity. # In[83]: sns.set(style="ticks") rateplot = sns.factorplot(x="year", y="rate", hue="age", col='race', hue_order=['0-1','2-4','5+'], data=inpatient_rate_samples.assign(rate=inpatient_rate_samples.value*rate_factor), palette="gray", size=5, aspect=.8, kind='box', linewidth=0.6, fliersize=0) rateplot.despine(left=True) # ### Pooled across race # In[84]: inpatient_pool_race_data = (setting_data[INPATIENT].drop('setting', axis=1) .groupby(['age_group', 'year'])[['cases','n']] .sum() .reset_index()) inpatient_pool_race_data # In[85]: inpatient_model_pool_race = generate_inpatient_model(inpatient_pool_race_data) # In[86]: with inpatient_model_pool_race: inpatient_pool_race_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers) # In[87]: inpatient_pool_race_data_labels = (inpatient_pool_race_data[['age_group', 'year']] .reset_index(drop=True) .replace(label_map) .rename(columns={'age_group':'age'})) # In[88]: inpatient_pool_race_table = generate_table(inpatient_pool_race_trace, inpatient_pool_race_data_labels, index=['year'], columns=['age']) inpatient_pool_race_table.to_excel('results/inpatient_pool_race.xlsx') inpatient_pool_race_table # In[89]: inpatient_pool_race_samples = pd.concat( [inpatient_pool_race_data[['age_group', 'year']].reset_index(drop=True), pd.DataFrame(inpatient_pool_race_trace['λ']).T], axis=1) # In[90]: inpatient_pool_race_samples = (pd.melt(inpatient_pool_race_samples, id_vars=['age_group', 'year']) .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}}) .rename(columns={'age_group':'age group'})) # Plot of rates pooled by ethnicity # In[91]: sns.set_context(font_scale=1.5) sns.boxplot(x="year", y="rate", hue="age group", data=inpatient_pool_race_samples.assign(rate=inpatient_pool_race_samples.value*rate_factor), palette="gray", linewidth=0.6, fliersize=0).set_xlabel('year') # ### Pooled across years # In[92]: inpatient_pool_year_data = (setting_data[INPATIENT].drop('setting', axis=1) .sort_values(by=['age_group', 'black', 'year'], ascending=False)).reset_index(drop=True) inpatient_pool_year_data # In[93]: inpatient_model_pool_year = generate_inpatient_model(inpatient_pool_year_data, pool_years=True) # In[94]: with inpatient_model_pool_year: inpatient_pool_year_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers) # In[95]: inpatient_pool_year_labels = (inpatient_pool_year_data[['age_group', 'black']] .reset_index(drop=True) .replace(label_map) .rename(columns={'black':'race', 'age_group':'age'}) .drop_duplicates().reset_index(drop=True)) # In[96]: estimates_pooled = (df_summary(inpatient_pool_year_trace, varnames=['λ'], stat_funcs=[lambda x: pd.Series(np.median(x, 0), name='median'), lambda x: _hpd_df(x, 0.5)], extend=True) * rate_factor).round(3) estimates_pooled.index = inpatient_pool_year_labels.index pd.concat([inpatient_pool_year_labels, estimates_pooled], axis=1) # ### Inpatient AGE estimates # In[97]: inpatient_pool_year_table = generate_table(inpatient_pool_year_trace, inpatient_pool_year_labels, index=['race'], columns=['age']) inpatient_pool_year_table.to_excel('results/inpatient_pool_year.xlsx') inpatient_pool_year_table # In[98]: inpatient_pool_year_samples = pd.concat( [inpatient_pool_year_data[inpatient_pool_year_data.year==0].reset_index(drop=True)[['age_group', 'black']], pd.DataFrame(inpatient_pool_year_trace['λ']).T], axis=1) # In[99]: inpatient_pool_year_samples = (pd.melt(inpatient_pool_year_samples, id_vars=['age_group', 'black']) .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'black':{0:'white', 1:'black'}}) .rename(columns={'black':'race', 'age_group':'age group'})) # Plot of rates pooled by year # In[100]: sns.set_context(font_scale=1.5) plt.subplots(figsize=(12,6)) sns.boxplot(x="race", y="rate", hue="age group", data=inpatient_pool_year_samples.assign(rate=inpatient_pool_year_samples.value*rate_factor), palette="gray_r", linewidth=0.6, fliersize=0) # # Outpatient AGE Model # # Import outpatient denominator information from REDCap # In[101]: import redcap import getpass local_auth = True api_url = 'https://redcap.vanderbilt.edu/api/' if local_auth: api_key = open("/Users/fonnescj/Dropbox/Collaborations/Halasa/Tokens/outpatient_token.txt").read() else: api_key = getpass.getpass() # In[102]: outpatient_project = redcap.Project(api_url, api_key) # In[103]: outpatients = outpatient_project.export_records(format='df') # In[104]: outpatients.groupby('year').year.count() # In[105]: outpatients.shape # In[106]: outpatients.head() # In[107]: outpatients[['ageinjan', 'ageinnov']].hist(bins=20) # Going to assume that the outlier ages above were mistakenly entered as months. # In[108]: outliers = outpatients.ageinjan > 20 outpatients.loc[outliers, ['ageinjan', 'ageinnov']] = outpatients.loc[outliers, ['ageinjan', 'ageinnov']] / 12 # In[109]: outpatients[['ageinjan', 'ageinnov']].hist(bins=20) # In[110]: outpatients['age'] = np.maximum(outpatients[['ageinjan', 'ageinnov']].mean(1), 0.1) # In[111]: outpatients_kids = outpatients[outpatients.age<18].copy() # In[112]: outpatients_kids['under_5'] = (outpatients_kids.age<5).astype(int) outpatients_kids['under_2'] = (outpatients_kids.age<2).astype(int) # In[113]: outpatients_kids['age_group'] = 2 outpatients_kids.loc[outpatients_kids.under_5==1, 'age_group'] = 1 outpatients_kids.loc[outpatients_kids.under_2==1, 'age_group'] = 0 # In[114]: outpatients_kids.groupby('year').age.hist(alpha=0.3) # In[115]: outpatient_n = (outpatients_kids.rename(columns={'ptname':'n'}) .groupby(['year', 'age_group']) .n.count() .reset_index()) outpatient_n # Interpolate 2013 # In[116]: interp_2013 = (outpatient_n.groupby(['age_group']) .apply(lambda x: np.interp(2013, x.year, x.n)) .astype(int) .reset_index()).assign(year=2013).rename(columns={0:'n'}) interp_2013 # Drop 2015 and recode year # In[117]: outpatient_n = pd.concat([outpatient_n, interp_2013], axis=0) outpatient_n = outpatient_n[outpatient_n.year<2015] outpatient_n = outpatient_n.assign(year=outpatient_n.year-2012) outpatient_n # In[118]: outpatient_data_full = setting_data[OUTPATIENT].groupby(['age_group','year']).cases.sum().reset_index() # In[119]: outpatient_data_full # In[120]: outpatient_merged = outpatient_data_full.merge(outpatient_n, on=['age_group','year']) # In[121]: outpatient_merged # In[122]: def generate_outpatient_model(outpatient_data, pool_years=False): assert not outpatient_data.isnull().sum().any() with Model() as model: cases,n = outpatient_data[['cases','n']].values.T shape = outpatient_data.shape[0] # Cases weighted by monitoring effort weights = monitoring_rate[OUTPATIENT] # Uniform prior on weighted cases weighted_cases = Uniform('weighted_cases', cases, n, shape=shape) # Likelihood of observed cases weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases) if pool_years: assert not len(cases) % 3 pooled_shape = int(len(cases)/3) pooled_cases = Deterministic('pooled_cases', tt.reshape(weighted_cases, (pooled_shape, 3)).sum(-1)) θ = Normal('θ', 0, sd=1000, shape=pooled_shape) λ = Deterministic('λ', tt.exp(θ)) AGE_obs = Potential('AGE_obs', Poisson.dist(λ*(np.reshape(n, (pooled_shape, 3))[:, 0])).logp(pooled_cases)) else: θ = Normal('θ', 0, sd=1000, shape=outpatient_data.shape[0]) # Poisson rate parameter λ = Deterministic('λ', tt.exp(θ)) # Poisson likelihood AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases)) return model # In[123]: outpatient_model_full = generate_outpatient_model(outpatient_merged) # In[124]: with outpatient_model_full: outpatient_full_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers) # In[125]: outpatient_data_labels = (inpatient_pool_race_data[['age_group', 'year']] .reset_index(drop=True) .replace(label_map) .rename(columns={'age_group':'age'})) outpatient_data_labels = outpatient_data_labels[outpatient_data_labels.year.isin(['2012/13','2013/14'])] outpatient_data_labels.columns = ['age', 'year'] # In[126]: outpatient_data_labels # ### Outpatient AGE estimates # In[127]: outpatient_full_table = generate_table(outpatient_full_trace, outpatient_data_labels, varnames=['λ'], index=['age'], columns=['year']) outpatient_full_table.to_excel('results/outpatient_pool_race.xlsx') outpatient_full_table # In[128]: outpatient_full_samples = pd.concat( [outpatient_merged[['age_group', 'year']].reset_index(drop=True), pd.DataFrame(outpatient_full_trace['λ']).T], axis=1) # In[129]: outpatient_full_samples = (pd.melt(outpatient_full_samples, id_vars=['age_group', 'year']) .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}}) .rename(columns={'age_group':'age group'})) # In[130]: sns.set_context(font_scale=1.5) sns.boxplot(x="year", y="rate", hue="age group", data=outpatient_full_samples.assign(rate=outpatient_full_samples.value*rate_factor), palette="gray", linewidth=0.6, fliersize=0) # ## ED AGE Model # In[131]: ED2 = pd.read_excel('data/FINAL year 2 for CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0) ED3 = pd.read_excel('data/Final year 3 CDC.xlsx', skiprows=3, parse_cols='B:H', index_col=0).fillna(0) ED2.columns = ED3.columns # In[132]: ED4 = pd.read_excel('data/Year 4 denominators.xlsx', index_col=0) ED4 = ED4.fillna(ED4.mean()) ED4.columns = ED3.columns # In[133]: ED2_total = ED2.sum() ED2_eligible = (ED2_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], ED2_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)', 'Total Eligible on Date (Age > 11 Years and < 18 Years)']]) ED2_visits = (ED2_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'], ED2_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)', 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']]) # In[134]: ED3_total = ED3.sum() ED3_eligible = (ED3_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)', 'Total Eligible on Date (Age > 11 Years and < 18 Years)']]) ED3_visits = (ED3_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'], ED3_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)', 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']]) # In[135]: ED4_total = ED4.sum() ED4_eligible = (ED4_total['Total Eligible on Date (Age > 14 Days and < 5 Years)'], ED3_total[['Total Eligible on Date (Age > 5 Years and < 11 Years)', 'Total Eligible on Date (Age > 11 Years and < 18 Years)']]) ED4_visits = (ED4_total['Total Visits to ED on Date (Age > 14 Days and < 5 Years)'], ED4_total[['Total Visits to ED on Date (Age > 5 Years and < 11 Years)', 'Total Visits to ED on Date (Age > 11 Years and < 18 Years)']]) # In[136]: ED_all = pd.concat([ED2, ED3, ED4]) eligible_cols = ED_all.columns[ED_all.columns.str.contains('Eligible')] # In[137]: ED_counts = ED_all[eligible_cols].sum(1).astype(int) # In[138]: rotavirus_ed = rotavirus_data[rotavirus_data.setting==ED] surveillance_counts = rotavirus_data.groupby(['scrdate'])['caseid'].count() # In[139]: ED_counts.name = 'count24' surveillance_counts.name = 'count8' # In[140]: surveillance = pd.concat([ED_counts, surveillance_counts], axis=1).dropna() # Drop days where 8-hour count exceeds 24-hour count surveillance = surveillance[surveillance.count8<=surveillance.count24] # Here is what it looks like if you resample to weekly # In[141]: weekly_surveillance = surveillance.reindex(pd.to_datetime(surveillance.index)).resample('1W') (weekly_surveillance.count8.mean() / weekly_surveillance.count24.mean()).hist() # We will just use the proportion of the pooled suveillance # In[142]: surveillance_total = surveillance.sum() surveillance_total # In[143]: surveillance_proportion = surveillance_total.count8 / surveillance_total.count24 surveillance_proportion # In[144]: def generate_ed_model(ED_data): assert not ED_data.isnull().sum().any() with Model() as model: data_len = ED_data.shape[0] cases, n = ED_data[['cases','n']].values.T # Weights according to ED monitoring intensity, market share and 8-hour surveillance weights = (np.array(monitoring_rate)[ED] * np.array(ed_market_share)[ED_data.year] * surveillance_proportion) # Uniform prior on weighted cases weighted_cases = Uniform('weighted_cases', cases, n, shape=data_len) # Likelihood of observed cases weighting = Binomial('weighting', n=weighted_cases, p=weights, observed=cases) θ = Normal('θ', 0, sd=1000, shape=data_len) # Poisson rate parametera λ = Deterministic('λ', tt.exp(θ)) # Poisson likelihood AGE_obs = Potential('AGE_obs', Poisson.dist(λ*n).logp(weighted_cases)) return model # In[145]: ed_pool_race_data = (setting_data[ED].drop('setting', axis=1) .groupby(['age_group', 'year'])[['cases','n']] .sum() .reset_index()) ed_pool_race_data # Back-of-the-envelope rate calculation # In[146]: ed_pool_race_data.assign(empirical_rate=rate_factor * ed_pool_race_data.cases/(ed_pool_race_data.n * np.array(monitoring_rate)[ED] * surveillance_proportion * np.array(ed_market_share)[ed_pool_race_data.year])) # In[147]: ed_model = generate_ed_model(ed_pool_race_data) # In[148]: with ed_model: ed_trace = sample(1000, tune=tune, njobs=2, random_seed=seed_numbers) # In[149]: forestplot(ed_trace, varnames=['λ']) # In[150]: ed_data_labels = (ed_pool_race_data[['age_group', 'year']] .reset_index(drop=True) .replace(label_map) .rename(columns={'age_group':'age'})) # ### ED AGE estimates # In[151]: ed_table = generate_table(ed_trace, ed_data_labels, index=['year'], columns=['age']) ed_table.to_excel('results/ed_pool_race.xlsx') ed_table # Create data frame of MCMC samples for plotting # In[152]: ed_rate_samples = pd.concat([ed_pool_race_data[['age_group', 'year']], pd.DataFrame(ed_trace['λ']).T], axis=1) # In[153]: ed_rate_samples = (pd.melt(ed_rate_samples, id_vars=['age_group', 'year']) .replace({'under_5':{0:'5+', 1:'<5'}, 'year':{0:'2012/13', 1:'2013/14'}}) .rename(columns={'age_group':'age group'})) # In[154]: sns.set_context(font_scale=1.5) sns.boxplot(x="year", y="rate", hue="age group", data=ed_rate_samples.assign(rate=ed_rate_samples.value*rate_factor), palette="gray", linewidth=0.6, fliersize=0) # # Virus Rate Models # In[155]: collected_stools = (rotavirus_data.groupby(['age_group', 'black', 'year', 'setting']) .agg({'stool_collected':['sum', 'count']})) levels = collected_stools.columns.levels labels = collected_stools.columns.labels collected_stools.columns = ['stool_collected', 'enrolled'] collected_stools = collected_stools.reset_index() collected_stools.head() # In[156]: virus_summary = (rotavirus_data.dropna(subset=virus_cols) .groupby(['age_group', 'black', 'year', 'setting', 'rotavirus', 'sapovirus', 'astrovirus', 'norovirus'])['caseid'].count() .reset_index() .rename(columns={'caseid':'cases'})) virus_summary.head() # In[157]: merged_virus_data = (virus_summary.merge(collected_stools, on=['age_group', 'black', 'year', 'setting']) .merge(census_clean, on=['age_group', 'year', 'black'], how='left')) merged_virus_data.head(10) # In[158]: any_virus = (merged_virus_data.groupby(['black','age_group', 'year', 'setting']) .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max})).sum(level=['age_group', 'year', 'setting']) any_virus # All possible combinations of age, year and setting for index. # In[159]: complete_index = pd.MultiIndex.from_product([range(3), range(3), range(4)], names=['age_group', 'year', 'setting']) # In[160]: sapo_data = (merged_virus_data[merged_virus_data.sapovirus==1] .groupby(['age_group', 'year', 'setting']) .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max}) .reindex(complete_index) .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected) .fillna(0).astype(int)) rota_data = (merged_virus_data[merged_virus_data.rotavirus==1] .groupby(['age_group', 'year', 'setting']) .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max}) .reindex(complete_index) .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected) .fillna(0).astype(int)) astro_data = (merged_virus_data[merged_virus_data.astrovirus==1] .groupby(['age_group', 'year', 'setting']) .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max}) .reindex(complete_index) .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected) .fillna(0).astype(int)) noro_data = (merged_virus_data[merged_virus_data.norovirus==1] .groupby(['age_group', 'year', 'setting']) .agg({'cases':sum, 'stool_collected':max, 'enrolled':max, 'n':max}) .reindex(complete_index) .assign(n=any_virus.n, enrolled=any_virus.enrolled, stool_collected=any_virus.stool_collected) .fillna(0).astype(int)) # In[161]: virus_totals = pd.concat([astro_data.reset_index().assign(virus='astrovirus'), noro_data.reset_index().assign(virus='norovirus'), rota_data.reset_index().assign(virus='rotavirus'), sapo_data.reset_index().assign(virus='sapovirus')]) virus_totals['cases'] = virus_totals.cases.astype(int) virus_totals['n'] = virus_totals.n.astype(int) # ## Inpatient virus rate model # In[205]: virus_lookup = {0:'norovirus', 1:'rotavirus', 2:'sapovirus', 3:'astrovirus'} # In[162]: virus_inpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==INPATIENT].reset_index(drop=True) # In[163]: cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T # In[164]: with Model() as virus_inpatient_model: groups = virus_inpatient_data.shape[0] # Probability of stool collection ψ = Beta('ψ', 10, 1, shape=3) # Age-group specific enrolled inpatients, and inpatients with collected stool inpatient_enrolled = rotavirus_data.groupby('age_group').apply(lambda x: (x.setting==INPATIENT).sum()).values collected = (rotavirus_data.groupby('age_group') .apply(lambda x: x[x.setting==INPATIENT].stool_collected.sum()) .values) stool_likelihood = Binomial('stool_likelihood', n=inpatient_enrolled, p=ψ, observed=collected) # Extract data columns cases, enrolled, year, n, age_group = virus_inpatient_data[['cases', 'enrolled', 'year', 'n', 'age_group']].values.T enrolled_cases = DiscreteUniform('enrolled_cases', lower=cases, upper=enrolled, shape=groups, testval=(cases/ψ[age_group.astype(int)]).astype('int32')) # Estimate total VUMC cases in setting enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', n=enrolled_cases, p=ψ[age_group.astype(int)], observed=cases) # Calculate an "effective N" that corresponds to sampled population weights = monitoring_rate[INPATIENT] * np.array(inpatient_market_share)[year.astype(int)] n_eff = DiscreteUniform('n_eff', lower=cases, upper=n, shape=groups) davidson_cases_likelihood = Potential('davidson_cases_likelihood', Binomial.dist(n=n, p=weights).logp(n_eff.astype('int32')).sum()) # Population rate θ = Normal('θ', 0, sd=10, shape=groups) λ = Deterministic('λ', tt.exp(θ)) # Data likelihood virus_obs = Potential('virus_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases)) # In[165]: with virus_inpatient_model: virus_inpatient_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers) # Posterior distribution of stool sampling rate # In[166]: forestplot(virus_inpatient_trace, varnames=['ψ']) # In[167]: traceplot(virus_inpatient_trace, varnames=['ψ']) # In[168]: forestplot(virus_inpatient_trace, varnames=['λ']) # ### Inpatient virus rates, plotted and tabulated # In[169]: inpatient_sample_melted = pd.melt(pd.DataFrame(virus_inpatient_trace['λ'])) # In[170]: inpatient_virus_samples = (virus_inpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'], axis=1).reset_index(drop=True) .merge(inpatient_sample_melted, left_index=True, right_on='variable') .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}, 'virus':virus_lookup}) .rename(columns={'age_group':'age group'})) # In[171]: sns.set(style="ticks") inpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', data=inpatient_virus_samples.assign(rate=inpatient_virus_samples.value*rate_factor), palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2, fliersize=0) inpatient_rateplot.despine(left=True) # In[172]: virus_inpatient_data.to_excel('inpatient_virus_data.xlsx') # In[173]: inpatient_virus_labels = (virus_inpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'], axis=1).reset_index(drop=True) .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}, 'virus':virus_lookup}) .rename(columns={'age_group':'age group'})) # In[174]: labels = inpatient_virus_labels trace = virus_inpatient_trace varnames=['λ'] # In[175]: labels.head() # In[176]: rate_df_index = pd.MultiIndex.from_tuples(labels.values.tolist(), names=labels.columns) rate_df = (df_summary(trace, varnames=varnames) * rate_factor)[['mean', 'hpd_2.5', 'hpd_97.5']] #rate_df.index = rate_df_index # In[177]: virus_inpatient_table = generate_table(virus_inpatient_trace, inpatient_virus_labels, index=['age group', 'year'], columns=['virus']).sort_index() virus_inpatient_table.to_excel('results/virus_inpatient_table.xlsx') # In[178]: virus_inpatient_table # ## Outpatient virus rate model # In[179]: virus_outpatient_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==OUTPATIENT] # In[180]: with Model() as virus_outpatient_model: groups = virus_outpatient_data.shape[0] # Probability of stool collection ψ = Beta('ψ', 1, 1) outpatient_enrolled = (rotavirus_data.setting==OUTPATIENT).sum() collected = rotavirus_data[rotavirus_data.setting==OUTPATIENT].stool_collected.sum() stool = Binomial('stool', n=outpatient_enrolled, p=ψ, observed=collected) cases, enrolled, n = virus_outpatient_data[['cases', 'enrolled', 'n']].values.T enrolled_cases_out = Uniform('enrolled_cases_out', lower=cases, upper=enrolled, shape=groups) p = monitoring_rate[OUTPATIENT]*ψ # Estimate total VUMC cases in setting enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', n=enrolled_cases_out, p=p, observed=cases) π = Beta('π', 1, 10, shape=groups) # Data likeihood virus_out_obs = Potential('virus_out_obs', Binomial.dist(n=n, p=π).logp(enrolled_cases_out).sum()) # In[181]: with virus_outpatient_model: virus_outpatient_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers) # Posterior distribution of stool sampling rate # In[182]: traceplot(virus_outpatient_trace, varnames=['ψ']) # ### Outpatient virus rates, plotted and tabulated # In[183]: outpatient_sample_melted = pd.melt(pd.DataFrame(virus_outpatient_trace['π'])) # In[184]: outpatient_virus_samples = (virus_outpatient_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'], axis=1).reset_index(drop=True) .merge(outpatient_sample_melted, left_index=True, right_on='variable') .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}, 'virus':virus_lookup}) .rename(columns={'age_group':'age group'})) # In[185]: outpatient_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', data=outpatient_virus_samples.assign(rate=outpatient_virus_samples.value*rate_factor), palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2, fliersize=0) outpatient_rateplot.despine(left=True) # In[186]: outpatient_virus_labels = (virus_outpatient_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'], axis=1).reset_index(drop=True) .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}, 'virus':virus_lookup}) .rename(columns={'age_group':'age group'})) # In[187]: virus_outpatient_table = generate_table(virus_outpatient_trace, outpatient_virus_labels, index=['age group', 'year'], varnames=['π'], columns=['virus']).sort_index() virus_outpatient_table.to_excel('results/virus_outpatient_table.xlsx') # In[188]: virus_outpatient_table # ## ED virus rate model # In[189]: virus_ed_data = (virus_totals.replace({'virus': virus_lookup}))[virus_totals.setting==ED] # In[190]: with Model() as virus_ed_model: # Probability of stool collection ψ = Beta('ψ', 1, 1) ed_enrolled = (rotavirus_data.setting==ED).sum() collected = rotavirus_data[rotavirus_data.setting==ED].stool_collected.sum() stool = Binomial('stool', n=ed_enrolled, p=ψ, observed=collected) data_len = virus_ed_data.shape[0] cases, enrolled, year, n = virus_ed_data[['cases', 'enrolled', 'year', 'n']].values.T enrolled_cases = Uniform('enrolled_cases', lower=cases, upper=enrolled, shape=groups) # Estimate total VUMC cases in setting enrolled_cases_likelihood = Binomial('enrolled_cases_likelihood', n=enrolled_cases, p=ψ, observed=cases) ω = (np.array(monitoring_rate)[ED] * np.array(ed_market_share)[year.astype(int)] * surveillance_proportion) n_eff = DiscreteUniform('n_eff', lower=cases, upper=n, shape=groups) n_eff_likelihood = Potential('davidson_cases_likelihood', Binomial.dist(n=n, p=ω).logp(n_eff).sum()) # Population rate θ = Normal('θ', 0, sd=10, shape=groups) λ = Deterministic('λ', tt.exp(θ)) # Data likelihood virus_ed_obs = Potential('virus_ed_obs', Poisson.dist(λ*n_eff).logp(enrolled_cases)) # In[191]: with virus_ed_model: virus_ed_trace = sample(iterations, tune=tune, njobs=2, random_seed=seed_numbers) # Posterior distribution of stool sampling rate # In[192]: traceplot(virus_ed_trace, varnames=['ψ']) # ### ED virus rate estimates, plotted and tabulated # In[193]: ed_sample_melted = pd.melt(pd.DataFrame(virus_ed_trace['λ'])) # In[194]: ed_virus_samples = (virus_ed_data.drop(['enrolled', 'setting', 'stool_collected', 'cases'], axis=1).reset_index() .merge(ed_sample_melted, left_index=True, right_on='variable') .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}, 'virus':virus_lookup}) .rename(columns={'age_group':'age group'})) # In[195]: ed_rateplot = sns.factorplot(x="year", y="rate", hue="age group", col='virus', data=ed_virus_samples.assign(rate=ed_virus_samples.value*rate_factor), palette="Blues", size=5, width=0.5, kind='box', linewidth=0.6, col_wrap=2, fliersize=0) ed_rateplot.despine(left=True) # In[196]: ed_virus_labels = (virus_ed_data.drop(['enrolled', 'n', 'setting', 'stool_collected', 'cases'], axis=1).reset_index(drop=True) .replace({'age_group':{0:'0-1', 1:'2-4', 2:'5+'}, 'year':{0:'2012/13', 1:'2013/14', 2:'2014/15'}, 'virus':virus_lookup}) .rename(columns={'age_group':'age group'})) # In[197]: virus_ed_table = generate_table(virus_ed_trace, ed_virus_labels, index=['age group', 'year'], columns=['virus']).sort_index() virus_ed_table.to_excel('results/virus_ed_table.xlsx') # In[198]: virus_ed_table