#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pylab as plt
import geopandas as gpd

RANDOM_SEEDS = 20090425, 19700903


# ## Data import and cleaning
#
# Drop all columns that are mostly missing and parse the date columns.

# In[2]:

DATA_DIR = '../data/'


# In[3]:

gbs_data = pd.read_csv(DATA_DIR + 'raw/gbs_neo_27oct2016.csv', encoding='ISO-8859-1',
                       parse_dates=['ADMIT', 'DISCHRG', 'BIRTH']).dropna(axis='columns', thresh=300)


# Number of rows and columns in the resulting import

# In[4]:

gbs_data.shape


# Proportion of each race

# In[5]:

gbs_data[['white', 'black', 'amerind', 'asian', 'pacific', 'unkrace']].mean()


# A large portion of ethnicity is undeclared, so it is probably not useful.

# In[6]:

gbs_data.ETHNIC.value_counts()


# In[7]:

gbs_data['hispanic'] = gbs_data.ETHNIC == 1
gbs_data.loc[gbs_data.ETHNIC == 9, 'hispanic'] = np.nan


# In[8]:

gbs_data.hispanic.value_counts()


# Drop entries with no admit date or admitted before 2004

# In[9]:

gbs_data = gbs_data[gbs_data.ADMIT > '2003-12-31'].dropna(subset=['ADMIT'])


# Dates of the last and first admissions in the dataset

# In[10]:

gbs_data.ADMIT.max()


# In[11]:

gbs_data.ADMIT.min()


# Distribution of age in days:

# In[12]:

ax = gbs_data.AGE_DYS.hist()
ax.set_xlabel('Age (days)')
ax.set_ylabel('Frequency');


# Flag for early onset (age less than 7 days)

# In[13]:

gbs_data['early_onset'] = (gbs_data.AGE_DYS < 7).astype(int)


# Counts of records by county

# In[14]:

gbs_data.COUNTY.value_counts()


# Plots of the time series of annual case counts, by county

# In[15]:

sns.set_style('dark')
sns.set_palette("plasma", n_colors=len(gbs_data.COUNTY.unique()))

fig, axes = plt.subplots(4, 5, figsize=(14, 8), sharey=True, sharex=True)
for ax, (county, data) in zip(axes.ravel(), gbs_data.groupby('COUNTY')):
    (data.assign(case=data.NotACase == 0)
         .set_index('ADMIT')
         .resample('A')
         .case.sum()
         .fillna(0)
         # small vertical jitter so overlapping counts remain visible
         .apply(lambda x: x + np.random.randn() * 0.04)
         .plot(style='o', label=county, alpha=0.6, ax=ax))
    ax.set_title(county)
    ax.set_xlim('2005-1-1', '2016-1-1')
    ax.set_xlabel('')


# In[16]:

sns.set_style('dark')
sns.set_palette("plasma", n_colors=len(gbs_data.COUNTY.unique()))

fig = plt.figure(figsize=(14, 6))
for county, data in gbs_data.groupby('COUNTY'):
    (data.assign(case=data.NotACase == 0)
         .set_index('ADMIT')
         .resample('A')
         .case.sum()
         .fillna(0)
         .apply(lambda x: x + np.random.randn() * 0.04)
         .plot(style='.-', label=county, alpha=0.6))
plt.legend(bbox_to_anchor=(1.01, 1.0))
plt.ylabel('Count')
plt.xlabel('');


# ## Geographic data

# In[17]:

county_map = gpd.read_file(DATA_DIR + 'GIS/TN_counties')
county_map['NAME'] = county_map.NAME.str.upper()


# In[18]:

county_map = county_map.set_index('NAME').join(gbs_data.COUNTY.value_counts())
county_map['gbs_count'] = county_map.COUNTY.fillna(0)


# Assign counties to regions

# In[19]:

study_counties = county_map[county_map.index.isin(gbs_data.COUNTY.unique())].copy()
study_counties['Region'] = 'East'
study_counties.loc[['ROBERTSON', 'SUMNER', 'CHEATHAM', 'DAVIDSON', 'WILSON', 'DICKSON',
                    'RUTHERFORD', 'WILLIAMSON'], 'Region'] = 'Middle'
study_counties.loc[['MADISON', 'SHELBY'], 'Region'] = 'West'


# In[20]:

# keep the county name column and drop the duplicated count column brought in by the merge
gbs_data = (gbs_data.merge(study_counties, left_on='COUNTY', right_index=True)
                    .drop('COUNTY_y', axis=1)
                    .rename(columns={'COUNTY_x': 'COUNTY'}))


# In[21]:

gbs_data['region_ind'] = gbs_data.Region.replace({'West': 0, 'Middle': 1, 'East': 2})
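
# As a quick sanity check (illustrative only), the confirmed cases can be cross-tabulated
# by region and onset type; this assumes the `Region`, `early_onset`, and `NotACase`
# columns created above.

# In[ ]:

# Illustrative check: confirmed cases (NotACase == 0) by region and onset type
(gbs_data[gbs_data.NotACase == 0]
 .groupby(['Region', 'early_onset'])
 .size()
 .unstack('early_onset'))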
# In[22]:

with sns.axes_style("ticks"):
    ax = county_map.plot(column='gbs_count', cmap='Blues', figsize=(14, 8))
    sns.despine(bottom=True, left=True)
    (study_counties.reset_index()
                   .apply(lambda x: ax.annotate(s=x.NAME, xy=x.geometry.centroid.coords[0], ha='center'),
                          axis=1))


# Year of admission

# In[23]:

gbs_data['admit_year'] = gbs_data.ADMIT.dt.year


# In[24]:

pd.set_option('display.max_rows', 20, 'display.max_columns', 35)

gbs_pivot = gbs_data.pivot_table(index=['black', 'COUNTY', 'early_onset'],
                                 columns=['admit_year'],
                                 values='ADMIT', aggfunc='count').fillna(0).astype(int)


# In[25]:

complete_index = pd.MultiIndex.from_product(gbs_pivot.index.levels,
                                            names=['black', 'COUNTY', 'early_onset'])


# In[26]:

annual_cases = pd.melt(gbs_pivot.reindex(complete_index, fill_value=0).reset_index(),
                       id_vars=['black', 'COUNTY', 'early_onset'],
                       value_name='cases')


# In[27]:

annual_cases.head()


# ### Tennessee birth data

# In[28]:

births_sheets = pd.read_excel(DATA_DIR + 'raw/births.xlsx', sheet_name=None)


# In[29]:

births_raw = pd.concat([births_sheets[s].assign(year=int(s)) for s in births_sheets])
births_raw['county'] = births_raw.county.str.upper()
births_raw.head()


# Remove unneeded counties

# In[30]:

county_list = gbs_data.COUNTY.unique()
county_list.sort()

births_subset = births_raw[births_raw.county.isin(county_list)]
births_subset.shape


# In[31]:

births_race = births_subset[births_subset.race != 'all']
births_race.drop(['rate', 'population'], axis=1).pivot_table(index=['race', 'county'], columns='year')


# In[32]:

births_long = (pd.melt(births_race, value_vars=['births'], id_vars=['county', 'race', 'year'])
                 .drop('variable', axis=1))
births_long.head()


# In[33]:

min_year = births_long.year.min()
min_year


# Create numeric fields for analysis; sort by race, county, and year.

# In[34]:

births_numeric = births_long.copy()
births_numeric['county'] = births_numeric.county.apply(county_list.searchsorted)
births_numeric['race'] = (births_numeric.race == 'black').astype(int)
births_numeric['year'] = births_numeric.year - births_numeric.year.min()
births_numeric.columns = ['county', 'black', 'year', 'births']
births_numeric = (births_numeric.sort_values(by=['black', 'county', 'year'])
                                .reset_index(drop=True))
births_numeric


# In[35]:

white_births = births_subset.drop(['rate', 'population', 'race'], axis=1)[births_subset.race == 'white']
white_births.pivot(columns='year', index='county')


# In[36]:

black_births = births_subset.drop(['rate', 'population', 'race'], axis=1)[births_subset.race == 'black']
black_births.pivot(columns='year', index='county')


# ### County infant mortality

# In[37]:

mortality_all = pd.read_excel(DATA_DIR + 'raw/county_infant_mortality.xlsx')
mortality_all.head()


# Remove unneeded counties and convert rates to probabilities

# In[38]:

mortality_subset = (mortality_all[mortality_all.index.str.upper().isin(gbs_data.COUNTY.unique())] / 1000).astype(float)
mortality_subset


# Calculate quarterly survival

# In[39]:

quarterly_survival = (1 - mortality_subset)**(1/4)
quarterly_survival


# Assume the 2013 mortality rate carries forward to 2014 and 2015, and drop 2004.

# In[40]:

quarterly_survival[2014] = quarterly_survival[2015] = quarterly_survival[2013]
quarterly_survival = quarterly_survival.drop(2004, axis=1)


# In[41]:

mean_quarterly_survival = quarterly_survival.mean()
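
# For reference, the quarterly survival used above is just the annual survival probability
# raised to the 1/4 power, $s_q = (1 - m)^{1/4}$, where $m$ is the annual infant mortality
# probability. A minimal numeric check with an illustrative value (not taken from the data):

# In[ ]:

# e.g. an annual mortality of 8 per 1,000 live births
m = 8 / 1000
s_q = (1 - m)**(1/4)
s_q, s_q**4   # quarterly survival of roughly 0.998; its fourth power recovers the annual survival, 0.992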
# ### Merge GBS and birth counts

# In[42]:

annual_cases.head()


# In[43]:

cases_numeric = annual_cases[annual_cases.admit_year >= min_year]
cases_numeric = (cases_numeric.assign(county=cases_numeric.COUNTY.apply(county_list.searchsorted),
                                      year=cases_numeric.admit_year - cases_numeric.admit_year.min())
                              .drop(['COUNTY', 'admit_year'], axis=1))
cases_numeric.head()


# Indices of counties within regions

# In[44]:

middle_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region == 'Middle'].COUNTY.unique()))[0]


# In[45]:

west_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region == 'West'].COUNTY.unique()))[0]


# In[46]:

east_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region == 'East'].COUNTY.unique()))[0]


# ## Model specification

# In[78]:

from pymc3 import Model, sample, traceplot, forestplot, NUTS, Metropolis
from pymc3 import energyplot, fit, summary, df_summary, sample_ppc
from pymc3 import Multinomial, Normal, Poisson, Deterministic, Gamma, MvNormal
from pymc3 import Binomial, HalfCauchy, Uniform, StudentT, Exponential, GaussianRandomWalk
from pymc3.math import invlogit
from pymc3.gp import cov, mean

import theano.tensor as tt
from theano import shared


# In[50]:

merge_cols = ['county', 'black', 'year']

merged_covs = (births_numeric.set_index(merge_cols)
                             .join(cases_numeric.set_index(merge_cols))
                             .fillna(0)
                             .reset_index()
                             .astype(int))
merged_covs = merged_covs.sort_values(by=['early_onset', 'black', 'county', 'year'], ascending=True)


# In[51]:

ncounties = int(merged_covs.county.max() + 1)
nyears = len(merged_covs.year.unique())

data_shape = (2, ncounties, nyears)
data_shape


# In[52]:

merged_covs.groupby('early_onset').count()


# In[53]:

county_ind = np.where(county_list == 'Davidson'.upper())[0][0]
merged_covs[merged_covs.county == county_ind]


# In[54]:

def county_model(county, early_onset):

    county_ind = np.where(county_list == county.upper())[0][0]

    with Model() as model:

        # Make local references to data
        subset_ind = (merged_covs.county == county_ind) & (merged_covs.early_onset == early_onset)
        _, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T
        data_shape = cases.shape

        births_matrix = tt.reshape(births, data_shape)
        # survival probabilities, one block per race
        # (rows are ordered white years 2005-2015, then black years 2005-2015)
        surv_matrix = np.tile(quarterly_survival.loc[county].values, 2)

        # number of births surviving to the at-risk period (simulated thinning of births)
        rng = tt.shared_randomstreams.RandomStreams()
        n_vector = rng.binomial(n=births_matrix, p=surv_matrix, size=data_shape)

        # GP hyperparameters, one per race
        ρ = Gamma('ρ', 1, 0.1, shape=2)
        η = Gamma('η', 1, 0.1, shape=2)

        K_white = η[0] * cov.ExpQuad(1, ρ[0])
        K_black = η[1] * cov.ExpQuad(1, ρ[1])

        # year-specific noise: one scale for the first five years, another thereafter
        σ = HalfCauchy('σ', 3, shape=2)
        s = tt.concatenate([tt.tile(σ[0], 5), tt.tile(σ[1], nyears - 5)])

        X = np.arange(nyears).reshape(-1, 1)

        θ_black = MvNormal('θ_black', mu=tt.zeros(nyears), cov=K_black(X) + tt.eye(nyears)*s, shape=(nyears,))
        θ_white = MvNormal('θ_white', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s, shape=(nyears,))

        # Poisson GBS rate, by time and race
        λ_black = Deterministic('λ_black', tt.exp(θ_black))
        λ_white = Deterministic('λ_white', tt.exp(θ_white))

        # Data likelihood
        obs = Poisson('obs', n_vector * tt.concatenate([λ_white, λ_black]), observed=shared(cases))

    return model
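
# To summarize the model above: for race $r$ and year $t$, the observed case count is
#
# $$y_{r,t} \sim \textrm{Poisson}(n_{r,t}\,\lambda_{r,t}), \qquad \log \lambda_{r,t} = \theta_{r,t},$$
#
# where $n_{r,t}$ is the number of births surviving to the at-risk period (births thinned by the
# quarterly survival probability), and each $\theta_r$ has a Gaussian process prior over years with
# an exponentiated-quadratic covariance (scale $\eta_r$, lengthscale $\rho_r$) plus year-specific
# noise $\sigma$ on the diagonal.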
# ### Davidson County
#
# Early onset model

# In[55]:

nsamples = 5000
ntune = 4000


# In[56]:

davidson_model_early = county_model('Davidson', 1)


# In[58]:

with davidson_model_early:
    approx = fit(n=10000, random_seed=RANDOM_SEEDS[0])
    davidson_trace_early = approx.sample(draws=1000)


# In[59]:

traceplot(davidson_trace_early, varnames=['σ', 'ρ', 'η']);


# Late onset model

# In[60]:

davidson_model = county_model('Davidson', 0)


# In[62]:

with davidson_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    davidson_trace = approx.sample(draws=1000)


# In[63]:

traceplot(davidson_trace, varnames=['σ', 'ρ', 'η']);


# In[64]:

def plot_rates(trace, ax=None, factor=1):

    rate_est = pd.DataFrame(np.concatenate([trace['λ_white'], trace['λ_black']], 1).T) * factor
    rate_est['year'] = np.ravel([np.arange(11) + 2005]*2)
    rate_est['race'] = np.r_[['white']*11, ['black']*11]

    race_df = pd.melt(rate_est, id_vars=['year', 'race'], value_name='rate')

    if ax is None:
        fig, ax = plt.subplots(figsize=(14, 8))
    g = sns.boxplot(x="year", y="rate", hue="race", data=race_df, fliersize=0,
                    palette=sns.diverging_palette(220, 20, n=2), ax=ax)
    return g


# In[65]:

def multi_rate_plot(trace, early_trace, ylim=None, factor=1):
    fig, axes = plt.subplots(2, 1)
    plot_rates(trace, axes[0], factor=factor)
    plot_rates(early_trace, axes[1], factor=factor)
    axes[0].set_title('Late onset')
    axes[1].set_title('Early onset')
    axes[0].set_xticklabels([''])
    axes[0].set_xlabel('')
    axes[1].legend_.remove()
    if ylim is not None:
        axes[0].set_ylim(ylim)
        axes[1].set_ylim(ylim)
    return axes


# In[66]:

multi_rate_plot(davidson_trace, davidson_trace_early, factor=10000)


# ### Shelby County
#
# Early onset model

# In[68]:

shelby_model_early = county_model('Shelby', 1)


# In[69]:

with shelby_model_early:
    approx = fit(n=50000, random_seed=RANDOM_SEEDS[0])
    shelby_early_trace = approx.sample(draws=1000)


# In[70]:

traceplot(shelby_early_trace, varnames=['σ', 'ρ', 'η']);


# Late onset model

# In[71]:

shelby_model = county_model('Shelby', 0)


# In[73]:

with shelby_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    shelby_trace = approx.sample(draws=1000)


# Plots of posterior parameter estimates

# In[74]:

traceplot(shelby_trace, varnames=['σ', 'ρ', 'η']);


# Boxplots of rate estimates

# In[76]:

multi_rate_plot(shelby_trace, shelby_early_trace, factor=10000)


# Model fit check

# In[79]:

ppc = sample_ppc(davidson_trace, samples=500, model=davidson_model)


# In[80]:

county_ind = np.where(county_list == 'DAVIDSON')[0][0]
davidson_subset = merged_covs[(merged_covs.county == county_ind) & (merged_covs.early_onset == 0)]
_, black, year, births, early_onset, cases = davidson_subset.values.T

# empirical quantile of each observed (non-zero) count within its posterior predictive distribution
plt.hist([np.searchsorted(np.sort(sample), case) / 500
          for case, sample in zip(cases, ppc['obs'].T) if case])
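
# All of the fits above use ADVI via `fit`, so it is worth checking that the variational
# objective has stabilized. A minimal sketch, assuming the most recent `approx` object is
# still in scope (in PyMC3 its `hist` attribute records the objective during optimization):

# In[ ]:

plt.figure(figsize=(8, 3))
plt.plot(approx.hist)
plt.xlabel('Iteration')
plt.ylabel('Objective (-ELBO)');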
# ## Regional models

# In[ ]:

def region_model(region, early_onset):

    county_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region == region].COUNTY.unique()))[0]

    with Model() as model:

        # Make local references to data
        subset_ind = (merged_covs.county.isin(county_ind)) & (merged_covs.early_onset == early_onset)
        county, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T
        data_shape = cases.shape

        # Index survival rate by county and year
        qsurv = quarterly_survival.reset_index(drop=True)
        surv = np.array([qsurv.loc[c, y + 2005] for c, y in zip(county, year)])

        # Apply survival to births
        rng = tt.shared_randomstreams.RandomStreams()
        n_vector = rng.binomial(n=births, p=surv, size=data_shape)

        # GP hyperparameters, one per race
        ρ = Gamma('ρ', 1, 0.1, shape=2)
        η = Gamma('η', 1, 0.1, shape=2)

        K_white = η[0] * cov.ExpQuad(1, ρ[0])
        K_black = η[1] * cov.ExpQuad(1, ρ[1])

        σ = HalfCauchy('σ', 3, shape=2)
        s = tt.concatenate([tt.tile(σ[0], 5), tt.tile(σ[1], nyears - 5)])

        X = np.arange(nyears).reshape(-1, 1)

        θ_white = MvNormal('θ_white', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s, shape=(nyears,))
        θ_black = MvNormal('θ_black', mu=tt.zeros(nyears), cov=K_black(X) + tt.eye(nyears)*s, shape=(nyears,))

        # per-observation rate, indexed by each row's year and race
        λ = Deterministic('λ', tt.exp(tt.concatenate([θ_white[year[black == 0]], θ_black[year[black == 1]]])))

        # Poisson GBS rate by time and race, scaled to cases per 10,000 births
        λ_black = Deterministic('λ_black', tt.exp(θ_black)*10000)
        λ_white = Deterministic('λ_white', tt.exp(θ_white)*10000)

        # Data likelihood
        obs = Poisson('obs', n_vector * λ, observed=shared(cases))

    return model


# ### East Tennessee
#
# Late onset model

# In[ ]:

with region_model('East', 0) as east_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    east_trace = approx.sample(draws=1000)


# Plots of posterior parameter estimates

# In[ ]:

traceplot(east_trace, varnames=['σ', 'ρ', 'η']);


# Early onset model

# In[ ]:

with region_model('East', 1) as east_early_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    east_early_trace = approx.sample(draws=1000)


# In[ ]:

traceplot(east_early_trace, varnames=['σ', 'ρ', 'η']);


# Plots and tables of rates

# In[ ]:

multi_rate_plot(east_trace, east_early_trace, ylim=(0, 40))


# In[ ]:

def rate_table(trace, race=''):
    # posterior summary of the rate estimates (per 10,000 births), indexed by year
    varname = 'λ_%s' % race if race else 'λ'
    df = df_summary(trace, varnames=[varname])
    df.index = range(2005, 2016)
    return df


# In[ ]:

rate_table(east_trace, 'black')


# In[ ]:

rate_table(east_trace, 'white')


# ### West Tennessee
#
# Late onset model

# In[ ]:

with region_model('West', 0) as west_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    west_trace = approx.sample(draws=1000)


# In[ ]:

traceplot(west_trace, varnames=['σ', 'ρ', 'η']);


# Early onset model

# In[ ]:

with region_model('West', 1) as west_early_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    west_early_trace = approx.sample(draws=1000)


# In[ ]:

traceplot(west_early_trace, varnames=['σ', 'ρ', 'η']);


# Plots and tables of rates

# In[ ]:

multi_rate_plot(west_trace, west_early_trace)


# In[ ]:

rate_table(west_trace, 'black')


# In[ ]:

rate_table(west_trace, 'white')


# ### Middle Tennessee

# In[ ]:

with region_model('Middle', 0) as middle_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_trace = approx.sample(draws=1000)


# In[ ]:

traceplot(middle_trace, varnames=['σ', 'ρ', 'η']);


# In[ ]:

with region_model('Middle', 1) as middle_early_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_early_trace = approx.sample(draws=1000)


# In[ ]:

traceplot(middle_early_trace, varnames=['σ', 'ρ', 'η']);


# Plot and table of rates

# In[ ]:

multi_rate_plot(middle_trace, middle_early_trace)


# In[ ]:

rate_table(middle_trace, 'black')


# In[ ]:

rate_table(middle_trace, 'white')


# ## Race-pooled model

# In[ ]:

def pooled_model(region, early_onset):

    county_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region == region].COUNTY.unique()))[0]

    with Model() as model:

        # Make local references to data
        subset_ind = (merged_covs.county.isin(county_ind)) & (merged_covs.early_onset == early_onset)
        county, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T
        data_shape = cases.shape

        # Index survival rate by county and year
        qsurv = quarterly_survival.reset_index(drop=True)
        surv = np.array([qsurv.loc[c, y + 2005] for c, y in zip(county, year)])

        # Apply survival to births
        rng = tt.shared_randomstreams.RandomStreams()
        n_vector = rng.binomial(n=births, p=surv, size=data_shape)

        # Shared (race-pooled) GP prior on the log rate
        ρ = Gamma('ρ', 1, 0.1)
        η = Gamma('η', 1, 0.1)

        K = η * cov.ExpQuad(1, ρ)

        σ = HalfCauchy('σ', 3)

        X = np.arange(nyears).reshape(-1, 1)

        θ = MvNormal('θ', mu=tt.zeros(nyears), cov=K(X) + tt.eye(nyears)*σ, shape=(nyears,))

        # Poisson GBS rate by year, scaled to cases per 10,000 births
        λ = Deterministic('λ', tt.exp(θ)*10000)

        # Data likelihood
        obs = Poisson('obs', n_vector * tt.exp(θ[year]), observed=shared(cases))

    return model


# Middle region

# In[ ]:

with pooled_model('Middle', 0) as middle_pooled_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_pooled_trace = approx.sample(draws=1000)


# In[ ]:

with pooled_model('Middle', 1) as middle_early_pooled_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_early_pooled_trace = approx.sample(draws=1000)
sns.set_context("notebook", font_scale=1.5) rate_est = pd.DataFrame(np.concatenate([trace['λ'], early_trace['λ']], 1).T)*factor rate_est['year'] = np.ravel([np.arange(11)+2005]*2) rate_est['onset'] = np.r_[['late']*11, ['early']*11] race_df = pd.melt(rate_est, id_vars=['year', 'onset'], value_name='rate') if ax is None: fig, ax = plt.subplots(figsize=(14,8)) g = sns.boxplot(x="year", y="rate", hue="onset", data=race_df, fliersize=0, palette=sns.color_palette("husl", 2), ax=ax) return g # In[ ]: ax = plot_pooled_rates(middle_pooled_trace, middle_early_pooled_trace) ax.set_title('Middle Tennessee GBS rates'); # In[ ]: rate_table(middle_pooled_trace) # In[ ]: rate_table(middle_early_pooled_trace) # East region # In[ ]: with pooled_model('East', 0) as east_pooled_model: approx = fit(n=20000, random_seed=RANDOM_SEEDS[0]) east_pooled_trace = approx.sample(draws=1000) # In[ ]: with pooled_model('East', 1) as east_early_pooled_model: approx = fit(n=20000, random_seed=RANDOM_SEEDS[0]) east_early_pooled_trace = approx.sample(draws=1000) # In[ ]: ax = plot_pooled_rates(east_pooled_trace, east_early_pooled_trace) ax.set_title('East Tennessee GBS rates'); # In[ ]: rate_table(east_pooled_trace) # In[ ]: rate_table(east_early_pooled_trace) # West region # In[ ]: with pooled_model('West', 0) as west_pooled_model: approx = fit(n=20000, random_seed=RANDOM_SEEDS[0]) west_pooled_trace = approx.sample(draws=1000) # In[ ]: with pooled_model('West', 1) as west_early_pooled_model: approx = fit(n=20000, random_seed=RANDOM_SEEDS[0]) west_early_pooled_trace = approx.sample(draws=1000) # In[ ]: ax = plot_pooled_rates(west_pooled_trace, west_early_pooled_trace) ax.set_title('West Tennessee GBS rates'); # In[ ]: rate_table(west_pooled_trace) # In[ ]: rate_table(west_early_pooled_trace)