In [1]:

%matplotlib inline
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pylab as plt
import geopandas as gpd

RANDOM_SEEDS = 20090425, 19700903

Data import and cleaning¶

Dropping all columns that are mostly missing; converting dates.

In [2]:

DATA_DIR = '../data/'

In [3]:

gbs_data = pd.read_csv(DATA_DIR+'raw/gbs_neo_27oct2016.csv', 
                        encoding='ISO-8859-1',
                        parse_dates=['ADMIT', 'DISCHRG', 'BIRTH']).dropna('columns', thresh=300)

Number of rows and columns in resulting import

In [4]:

gbs_data.shape

Out[4]:

(543, 156)

Proportion of races.

In [5]:

gbs_data[['white','black','amerind','asian','pacific','unkrace']].mean()

Out[5]:

white      0.405157
black      0.526703
amerind    0.000000
asian      0.005525
pacific    0.000000
unkrace    0.047882
dtype: float64

A large portion of ethnicity is undeclared, so probably not useful.

In [6]:

gbs_data.ETHNIC.value_counts()

Out[6]:

2    311
9    195
1     37
Name: ETHNIC, dtype: int64

In [7]:

gbs_data['hispanic'] = gbs_data.ETHNIC==1
gbs_data.loc[gbs_data.ETHNIC==9, 'hispanic'] = np.nan

In [8]:

gbs_data.hispanic.value_counts()

Out[8]:

0.0    311
1.0     37
Name: hispanic, dtype: int64

Drop entries with no admit date or before 2004

In [9]:

gbs_data = gbs_data[gbs_data.ADMIT>'2003-12-31'].dropna(subset=['ADMIT'])

Data of last admission in dataset

In [10]:

gbs_data.ADMIT.max()

Out[10]:

Timestamp('2015-12-29 00:00:00')

In [11]:

gbs_data.ADMIT.min()

Out[11]:

Timestamp('2004-01-03 00:00:00')

Distribution of age in days:

In [12]:

ax = gbs_data.AGE_DYS.hist()
ax.set_xlabel('Age (days)')
ax.set_ylabel('Frequency');

Flag for early onset

In [13]:

gbs_data['early_onset'] = (gbs_data.AGE_DYS<7).astype(int)

Counts of records by county

In [14]:

gbs_data.COUNTY.value_counts()

Out[14]:

SHELBY        222
DAVIDSON       70
HAMILTON       62
KNOX           44
RUTHERFORD     29
SUMNER         11
WILLIAMSON     11
WILSON          8
ROBERTSON       8
MADISON         7
JEFFERSON       3
SEVIER          3
DICKSON         3
BLOUNT          3
CHEATHAM        2
LOUDON          2
ANDERSON        2
ROANE           1
GRAINGER        1
Name: COUNTY, dtype: int64

Plot of time series of counts

In [15]:

sns.set_style('dark')
sns.set_palette("plasma", n_colors=len(gbs_data.COUNTY.unique()))
fig, axes = plt.subplots(4,5, figsize=(14,8), sharey=True, sharex=True)
for ax, (county, data) in zip(axes.ravel(), gbs_data.groupby('COUNTY')): 
    (data.assign(case=gbs_data.NotACase==0)
         .set_index('ADMIT')
         .resample('A')
         .case.sum()
         .fillna(0)
         .apply(lambda x: x + np.random.randn()*0.04)
         .plot(style='o', label=county, alpha=0.6, ax=ax))
    ax.set_title(county)
    ax.set_xlim('2005-1-1', '2016-1-1')
    ax.set_xlabel('')

In [16]:

sns.set_style('dark')
sns.set_palette("plasma", n_colors=len(gbs_data.COUNTY.unique()))
fig = plt.figure(figsize=(14,6))
for county, data in gbs_data.groupby('COUNTY'): 
    (data.assign(case=gbs_data.NotACase==0)
         .set_index('ADMIT')
         .resample('A')
         .case.sum()
         .fillna(0)
         .apply(lambda x: x + np.random.randn()*0.04)
         .plot(style='.-', label=county, alpha=0.6))
plt.legend(bbox_to_anchor=(1.01, 1.0))  
plt.ylabel('Count')
plt.xlabel('');

Geographic data¶

In [17]:

county_map = gpd.read_file(DATA_DIR+'GIS/TN_counties')
county_map['NAME'] = county_map.NAME.str.upper()

In [18]:

county_map = county_map.set_index('NAME').join(gbs_data.COUNTY.value_counts())
county_map['gbs_count'] = county_map.COUNTY.fillna(0)

Assign counties to regions

In [19]:

study_counties = county_map[county_map.index.isin(gbs_data.COUNTY.unique())].copy()
study_counties['Region'] = 'East'
study_counties.loc[['ROBERTSON', 'SUMNER', 'CHEATHAM', 'DAVIDSON', 'WILSON',
       'DICKSON', 'RUTHERFORD', 'WILLIAMSON'], 'Region'] = 'Middle'
study_counties.loc[['MADISON', 'SHELBY'], 'Region'] = 'West'

In [20]:

gbs_data = gbs_data.merge(study_counties, left_on='COUNTY', right_index=True).drop(['COUNTY_x', 'COUNTY_y'], axis=1)

In [21]:

gbs_data['region_ind'] = gbs_data.Region.replace({'West':0, 'Middle':1, 'East':2})

In [22]:

with sns.axes_style("ticks"):
    ax = county_map.plot(column='gbs_count', cmap='Blues', figsize=(14,8))
    sns.despine(bottom=True, left=True)
    
    (study_counties.reset_index()
             .apply(lambda x: ax.annotate(s=x.NAME, xy=x.geometry.centroid.coords[0], ha='center'),axis=1))

Year of admission

In [23]:

gbs_data['admit_year'] = gbs_data.ADMIT.dt.year

In [24]:

pd.set_option('display.max_rows', 20, 'display.max_columns', 35)
gbs_pivot= gbs_data.pivot_table(index=['black', 'COUNTY', 'early_onset'], columns=['admit_year'], 
                     values='ADMIT', aggfunc='count').fillna(0).astype(int)

In [25]:

complete_index = pd.MultiIndex.from_product(gbs_pivot.index.levels, 
                                            names=['black', 'COUNTY', 'early_onset'])

In [26]:

annual_cases = pd.melt(gbs_pivot.reindex(complete_index, fill_value=0).reset_index(), 
                       id_vars=['black', 'COUNTY', 'early_onset'], 
                       value_name='cases')

In [27]:

annual_cases.head()

Out[27]:

	COUNTY	early_onset	admit_year
0	ANDERSON	0	2004
1	ANDERSON	1	2004
2	BLOUNT	0	2004
3	BLOUNT	1	2004
4	CHEATHAM	0	2004

Tennessee birth data¶

In [28]:

births_sheets = pd.read_excel(DATA_DIR+'raw/births.xlsx', sheetname=None)

/home/fonnesbeck/anaconda3/envs/dev/lib/python3.6/site-packages/pandas/io/excel.py:329: FutureWarning: The `sheetname` keyword is deprecated, use `sheet_name` instead
  **kwds)

In [29]:

births_raw = pd.concat([births_sheets[s].assign(year=int(s)) for s in births_sheets])
births_raw['county'] = births_raw.county.str.upper()
births_raw.head()

Out[29]:

	county	rate	births	population	race	year
0	MONTGOMERY	19.7	2890	146853	all	2005
1	BEDFORD	17.1	722	42247	all	2005
2	ROBERTSON	16.5	996	60374	all	2005
3	DAVIDSON	16.4	9409	574419	all	2005
4	RUTHERFORD	16.0	3491	218458	all	2005

Remove unneeded counties

In [30]:

county_list = gbs_data.COUNTY.unique()
county_list.sort()

births_subset = births_raw[births_raw.county.isin(county_list)]
births_subset.shape

Out[30]:

(627, 6)

In [31]:

births_race = births_subset[births_subset.race!='all']
births_race.drop(['rate','population'], axis=1).pivot_table(index=['race', 'county'], 
                                                            columns='year')

Out[31]:

		births
	year	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015
race	county
black	ANDERSON	34	44	43	39	38	32	32	32	34	33	28
	BLOUNT	38	37	45	40	40	34	32	32	45	41	38
	CHEATHAM	10	7	3	3	3	4	5	1	6	4	8
	DAVIDSON	2857	2951	3012	3116	3032	2883	2820	2803	2961	3010	2995
	DICKSON	22	35	13	34	20	26	17	15	23	34	20
	GRAINGER	1	0	1	1	1	0	0	0	1	3	1
	HAMILTON	999	1095	1124	1095	1021	991	910	979	1026	983	1005
	JEFFERSON	7	7	8	7	5	9	12	11	9	9	11
	KNOX	527	564	563	609	570	593	546	533	532	568	547
	LOUDON	9	6	3	4	9	5	5	5	6	8	7
...	...	...	...	...	...	...	...	...	...	...	...	...
white	LOUDON	455	520	537	528	515	459	544	527	531	495	508
	MADISON	796	805	824	742	749	664	654	716	662	684	655
	ROANE	493	516	574	531	467	506	481	464	460	421	438
	ROBERTSON	906	955	999	969	886	757	839	828	799	804	791
	RUTHERFORD	2938	3053	3249	3349	3007	3133	3015	2951	3063	3199	3176
	SEVIER	966	1084	1140	1111	1077	1023	1000	998	973	1032	1015
	SHELBY	5795	5991	5895	5581	5022	5046	5183	4968	5004	4995	4836
	SUMNER	1724	1830	1877	1927	1776	1703	1741	1742	1816	1901	1896
	WILLIAMSON	1858	1999	2060	1851	1878	1791	1837	1748	1889	1963	2045
	WILSON	1247	1282	1282	1277	1180	1222	1213	1241	1260	1323	1324

38 rows × 11 columns

In [32]:

births_long = (pd.melt(births_race, value_vars=['births'], id_vars=['county', 'race', 'year'])
                   .drop('variable', axis=1))
births_long.head()

Out[32]:

	county	race	year	value
0	RUTHERFORD	black	2005	396
1	DAVIDSON	black	2005	2857
2	MADISON	black	2005	565
3	SHELBY	black	2005	8206
4	CHEATHAM	black	2005	10

In [33]:

min_year = births_long.year.min()
min_year

Out[33]:

Create numeric fields for analysis, sort by race, year, county.

In [34]:

births_numeric = births_long.copy()
births_numeric['county'] = births_numeric.county.apply(county_list.searchsorted)
births_numeric['race'] = (births_numeric.race=='black').astype(int)
births_numeric['year'] = births_numeric.year - births_numeric.year.min()
births_numeric.columns = ['county', 'black', 'year', 'births']
births_numeric = (births_numeric.sort_values(by=['black', 'county', 'year'])
                      .reset_index(drop=True))
births_numeric

Out[34]:

	county	black	year	births
0	0	0	0	828
1	0	0	1	742
2	0	0	2	881
3	0	0	3	815
4	0	0	4	733
5	0	0	5	785
6	0	0	6	754
7	0	0	7	762
8	0	0	8	716
9	0	0	9	741
...	...	...	...	...
408	18	1	1	88
409	18	1	2	80
410	18	1	3	86
411	18	1	4	94
412	18	1	5	87
413	18	1	6	90
414	18	1	7	84
415	18	1	8	77
416	18	1	9	92
417	18	1	10	107

418 rows × 4 columns

In [35]:

white_births = births_subset.drop(['rate','population','race'], axis=1)[births_subset.race=='white']
white_births.pivot(columns='year', index='county')

Out[35]:

	births
year	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015
county
ANDERSON	828	742	881	815	733	785	754	762	716	741	721
BLOUNT	1254	1282	1295	1329	1226	1139	1234	1218	1243	1264	1165
CHEATHAM	479	446	511	510	448	401	420	438	419	461	429
DAVIDSON	6175	6594	6565	6540	6056	6247	6310	6411	6441	6674	6678
DICKSON	662	663	648	653	560	627	560	541	592	580	606
GRAINGER	259	268	283	261	254	234	222	218	214	244	206
HAMILTON	2874	3062	3108	3108	3030	2993	3014	3046	3030	3029	3140
JEFFERSON	561	577	592	590	529	512	512	518	489	529	512
KNOX	4360	4668	4617	4705	4524	4320	4407	4568	4421	4482	4593
LOUDON	455	520	537	528	515	459	544	527	531	495	508
MADISON	796	805	824	742	749	664	654	716	662	684	655
ROANE	493	516	574	531	467	506	481	464	460	421	438
ROBERTSON	906	955	999	969	886	757	839	828	799	804	791
RUTHERFORD	2938	3053	3249	3349	3007	3133	3015	2951	3063	3199	3176
SEVIER	966	1084	1140	1111	1077	1023	1000	998	973	1032	1015
SHELBY	5795	5991	5895	5581	5022	5046	5183	4968	5004	4995	4836
SUMNER	1724	1830	1877	1927	1776	1703	1741	1742	1816	1901	1896
WILLIAMSON	1858	1999	2060	1851	1878	1791	1837	1748	1889	1963	2045
WILSON	1247	1282	1282	1277	1180	1222	1213	1241	1260	1323	1324

In [36]:

black_births = births_subset.drop(['rate','population','race'], axis=1)[births_subset.race=='black']
black_births.pivot(columns='year', index='county')

Out[36]:

	births
year	2005	2006	2007	2008	2009	2010	2011	2012	2013	2014	2015
county
ANDERSON	34	44	43	39	38	32	32	32	34	33	28
BLOUNT	38	37	45	40	40	34	32	32	45	41	38
CHEATHAM	10	7	3	3	3	4	5	1	6	4	8
DAVIDSON	2857	2951	3012	3116	3032	2883	2820	2803	2961	3010	2995
DICKSON	22	35	13	34	20	26	17	15	23	34	20
GRAINGER	1	0	1	1	1	0	0	0	1	3	1
HAMILTON	999	1095	1124	1095	1021	991	910	979	1026	983	1005
JEFFERSON	7	7	8	7	5	9	12	11	9	9	11
KNOX	527	564	563	609	570	593	546	533	532	568	547
LOUDON	9	6	3	4	9	5	5	5	6	8	7
MADISON	565	550	617	639	582	550	595	554	582	544	576
ROANE	12	13	12	20	13	12	10	11	12	6	4
ROBERTSON	74	71	72	84	83	77	77	78	64	71	51
RUTHERFORD	396	464	517	545	475	479	478	503	522	620	594
SEVIER	5	8	8	6	5	10	5	6	11	11	10
SHELBY	8206	8640	8763	8971	8554	8204	8244	8350	8313	8375	8030
SUMNER	147	142	151	165	137	127	134	141	141	177	158
WILLIAMSON	85	84	93	104	78	89	72	77	90	78	79
WILSON	79	88	80	86	94	87	90	84	77	92	107

County infant mortality¶

In [37]:

mortality_all = pd.read_excel(DATA_DIR+'raw/county_infant_mortality.xlsx')
mortality_all.head()

Out[37]:

	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
Tennessee	8.62	8.79	8.69	8.29	8.03	7.98	7.89	7.39	7.18	6.78
Anderson	7.1	5.67	12.45	4.2	2.3	5.01	7.21	8.72	7.34	6.54
Bedford	12.21	18.01	2.82	17.71	6.13	7.36	7.79	11.71	4.85	4.98
Benton	6.17	17.44	12.35	0	0	0	19.35	0	0	0
Bledsoe	7.52	0	15.27	6.45	0	26.09	15.15	0	0	0

Remove unneeded counties and convert to probabilities

In [38]:

mortality_subset = (mortality_all[mortality_all.index.str.upper().isin(gbs_data.COUNTY.unique())]/1000).astype(float)
mortality_subset

Out[38]:

	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
Anderson	0.00710	0.00567	0.01245	0.00420	0.00230	0.00501	0.00721	0.00872	0.00734	0.00654
Blount	0.00675	0.00230	0.00601	0.00882	0.00720	0.00230	0.00673	0.00696	0.00314	0.00306
Cheatham	0.01222	0.01008	0.01538	0.00000	0.00193	0.00217	0.01220	0.00466	0.00677	0.00932
Davidson	0.00655	0.00786	0.00933	0.00801	0.00673	0.00767	0.00764	0.00750	0.00710	0.00767
Dickson	0.00622	0.00871	0.00708	0.01046	0.00574	0.01017	0.01212	0.00172	0.00175	0.00323
Grainger	0.01709	0.00382	0.00000	0.00000	0.00379	0.00775	0.00422	0.01794	0.00455	0.00463
Hamilton	0.00729	0.00934	0.01125	0.00970	0.00973	0.00784	0.00974	0.00791	0.00791	0.00671
Jefferson	0.00765	0.00522	0.00852	0.00979	0.01329	0.00548	0.00749	0.00755	0.00739	0.00400
Knox	0.00452	0.00776	0.00650	0.00468	0.00710	0.00507	0.00668	0.00525	0.00549	0.00489
Loudon	0.00588	0.00638	0.00563	0.00181	0.00370	0.00183	0.00423	0.00538	0.00184	0.00921
Madison	0.01567	0.01729	0.00948	0.01160	0.01216	0.00659	0.00563	0.00787	0.00927	0.00637
Roane	0.00000	0.00000	0.00753	0.00845	0.01266	0.01029	0.00192	0.01212	0.01046	0.00840
Robertson	0.00628	0.00602	0.00478	0.00735	0.00658	0.00497	0.00471	0.00537	0.00651	0.00688
Rutherford	0.00510	0.00630	0.00522	0.00377	0.00589	0.00811	0.00664	0.00493	0.00413	0.00454
Sevier	0.00307	0.01010	0.00361	0.00942	0.00703	0.00902	0.00753	0.00491	0.00587	0.00997
Shelby	0.01284	0.01153	0.01378	0.01267	0.01230	0.01298	0.01030	0.00958	0.01065	0.00923
Sumner	0.00434	0.00578	0.00689	0.00386	0.00927	0.00351	0.00907	0.00780	0.00466	0.00600
Williamson	0.00412	0.00298	0.00278	0.00311	0.00195	0.00097	0.00457	0.00398	0.00362	0.00334
Wilson	0.00850	0.00892	0.00577	0.00501	0.00648	0.00456	0.00591	0.00374	0.00221	0.00364

Calculate quarterly survival

In [39]:

quarterly_survival = (1 - mortality_subset)**(1/4)
quarterly_survival

Out[39]:

	2004	2005	2006	2007	2008	2009	2010	2011	2012	2013
Anderson	0.998220	0.998579	0.996873	0.998948	0.999425	0.998745	0.998193	0.997813	0.998160	0.998361
Blount	0.998308	0.999425	0.998494	0.997788	0.998195	0.999425	0.998313	0.998255	0.999214	0.999234
Cheatham	0.996931	0.997470	0.996133	1.000000	0.999517	0.999457	0.996936	0.998833	0.998303	0.997662
Davidson	0.998358	0.998029	0.997659	0.997991	0.998313	0.998077	0.998085	0.998120	0.998220	0.998077
Dickson	0.998441	0.997815	0.998225	0.997375	0.998562	0.997448	0.996956	0.999570	0.999562	0.999192
Grainger	0.995700	0.999044	1.000000	1.000000	0.999051	0.998057	0.998943	0.995485	0.998861	0.998840
Hamilton	0.998172	0.997657	0.997176	0.997566	0.997559	0.998034	0.997556	0.998017	0.998017	0.998318
Jefferson	0.998082	0.998692	0.997863	0.997543	0.996661	0.998627	0.998122	0.998107	0.998147	0.998998
Knox	0.998868	0.998054	0.998371	0.998828	0.998220	0.998730	0.998326	0.998685	0.998625	0.998775
Loudon	0.998527	0.998401	0.998590	0.999547	0.999074	0.999542	0.998941	0.998652	0.999540	0.997690
Madison	0.996059	0.995649	0.997622	0.997087	0.996946	0.998348	0.998590	0.998027	0.997674	0.998404
Roane	1.000000	1.000000	0.998112	0.997881	0.996820	0.997418	0.999520	0.996956	0.997375	0.997893
Robertson	0.998426	0.998492	0.998803	0.998157	0.998351	0.998755	0.998820	0.998655	0.998369	0.998276
Rutherford	0.998723	0.998421	0.998692	0.999056	0.998524	0.997966	0.998336	0.998765	0.998966	0.998863
Sevier	0.999232	0.997465	0.999096	0.997637	0.998238	0.997737	0.998112	0.998770	0.998529	0.997498
Shelby	0.996774	0.997105	0.996537	0.996817	0.996911	0.996739	0.997415	0.997596	0.997327	0.997684
Sumner	0.998913	0.998552	0.998273	0.999034	0.997674	0.999121	0.997725	0.998044	0.998833	0.998497
Williamson	0.998968	0.999254	0.999304	0.999222	0.999512	0.999757	0.998856	0.999004	0.999094	0.999164
Wilson	0.997868	0.997763	0.998554	0.998745	0.998376	0.998858	0.998519	0.999064	0.999447	0.999089

Will simply assume 2013 mortality carries forward to 2015

In [40]:

quarterly_survival[2014] = quarterly_survival[2015] = quarterly_survival[2013]
quarterly_survival = quarterly_survival.drop(2004, axis=1)

In [41]:

mean_quarterly_survival = quarterly_survival.mean()

Merge GBS and birth counts¶

In [42]:

annual_cases.head()

Out[42]:

	COUNTY	early_onset	admit_year
0	ANDERSON	0	2004
1	ANDERSON	1	2004
2	BLOUNT	0	2004
3	BLOUNT	1	2004
4	CHEATHAM	0	2004

In [43]:

cases_numeric = annual_cases[annual_cases.admit_year>=min_year]
cases_numeric = (cases_numeric.assign(county=cases_numeric.COUNTY.apply(county_list.searchsorted),
                     year=cases_numeric.admit_year - cases_numeric.admit_year.min())
            .drop(['COUNTY', 'admit_year'], axis=1))
cases_numeric.head()

Out[43]:

	early_onset	county
76	0	0
77	1	0
78	0	1
79	1	1
80	0	2

Indices of counties within regions

In [44]:

middle_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region=='Middle'].COUNTY.unique()))[0]

In [45]:

west_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region=='West'].COUNTY.unique()))[0]

In [46]:

east_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region=='East'].COUNTY.unique()))[0]

Model specification¶

In [78]:

from pymc3 import Model, sample, traceplot, forestplot, NUTS, Metropolis
from pymc3 import energyplot, fit, summary, sample_ppc
from pymc3 import Multinomial, Normal, Poisson, Deterministic, Gamma, MvNormal
from pymc3 import Binomial, HalfCauchy, Uniform, StudentT, Exponential, GaussianRandomWalk
from pymc3.math import invlogit
from pymc3.gp import cov, mean

import theano.tensor as tt
from theano import shared

In [50]:

merge_cols = ['county', 'black', 'year']
merged_covs = (births_numeric.set_index(merge_cols)
                   .join(cases_numeric.set_index(merge_cols))
                   .fillna(0)
                   .reset_index()
                   .astype(int))

merged_covs = merged_covs.sort_values(by=['early_onset', 'black', 'county', 'year'], ascending=True)

In [51]:

ncounties = int(merged_covs.county.max()+1)
nyears = len(merged_covs.year.unique())
data_shape = (2, ncounties, nyears)
data_shape

Out[51]:

(2, 19, 11)

In [52]:

merged_covs.groupby('early_onset').count()

Out[52]:

	county	black	year	births	cases
early_onset
0	418	418	418	418	418
1	418	418	418	418	418

In [53]:

county_ind = np.where(county_list=='Davidson'.upper())[0][0]
merged_covs[merged_covs.county==county_ind]

Out[53]:

	county	black	year	births	early_onset	cases
132	3	0	0	6175	0	4
134	3	0	1	6594	0	1
136	3	0	2	6565	0	0
138	3	0	3	6540	0	2
140	3	0	4	6056	0	3
142	3	0	5	6247	0	2
144	3	0	6	6310	0	0
146	3	0	7	6411	0	2
148	3	0	8	6441	0	0
150	3	0	9	6674	0	3
...	...	...	...	...	...	...
157	3	1	1	2951	1	0
159	3	1	2	3012	1	3
161	3	1	3	3116	1	3
163	3	1	4	3032	1	1
165	3	1	5	2883	1	1
167	3	1	6	2820	1	1
169	3	1	7	2803	1	1
171	3	1	8	2961	1	0
173	3	1	9	3010	1	0
175	3	1	10	2995	1	0

44 rows × 6 columns

In [54]:

def county_model(county, early_onset):
    
    county_ind = np.where(county_list==county.upper())[0][0]

    with Model() as model:

        # Make local references to data
        subset_ind = (merged_covs.county==county_ind) & (merged_covs.early_onset==early_onset)
        _, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T

        data_shape = cases.shape

        births_matrix = tt.reshape(births, data_shape)
        surv_matrix = np.repeat([quarterly_survival.loc[county].values], 2)
        rng = tt.shared_randomstreams.RandomStreams()
        n_vector = rng.binomial(n=births_matrix, p=surv_matrix, size=data_shape)
        
        ρ = Gamma('ρ', 1, 0.1, shape=2)
        η = Gamma('η', 1, 0.1, shape=2)
        K_white = η[0] * cov.ExpQuad(1, ρ[0])
        K_black = η[1] * cov.ExpQuad(1, ρ[1])
        
        σ = HalfCauchy('σ', 3, shape=2)
        s = tt.concatenate([tt.tile(σ[0], 5), tt.tile(σ[1], nyears-5)])

        X = np.arange(nyears).reshape(-1,1)
        θ_black = MvNormal('θ_black', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s, 
                                        shape=(nyears,))
        θ_white = MvNormal('θ_white', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s, 
                                       shape=(nyears,))

        # Poisson GBS rate, by time race and county
        λ_black = Deterministic('λ_black', tt.exp(θ_black))
        λ_white = Deterministic('λ_white', tt.exp(θ_white))

        # Data likelihood
        obs = Poisson('obs', n_vector * tt.concatenate([λ_white, λ_black]), observed=shared(cases))
        
    return model

Davidson county¶

Early onset

In [55]:

nsamples = 5000
ntune = 4000

In [56]:

davidson_model_early = county_model('Davidson', 1)

In [58]:

with davidson_model_early:
    approx = fit(n=10000, random_seed=RANDOM_SEEDS[0])
    davidson_trace_early = approx.sample(draws=1000)

Average Loss = 7,220.7: 100%|██████████| 10000/10000 [00:52<00:00, 190.48it/s]
Finished [100%]: Average Loss = 7,201.6

In [59]:

traceplot(davidson_trace_early, varnames=['σ', 'ρ', 'η']);

Late onset

In [60]:

davidson_model = county_model('Davidson', 0)

In [62]:

with davidson_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    davidson_trace = approx.sample(draws=1000)

Average Loss = 292.35: 100%|██████████| 20000/20000 [01:56<00:00, 170.99it/s]
Finished [100%]: Average Loss = 291.62

In [63]:

traceplot(davidson_trace, varnames=['σ', 'ρ', 'η']);

In [64]:

def plot_rates(trace, ax=None, factor=1):
    
    #rate_est = pd.DataFrame(trace['λ', -500:].T) * 10000
    rate_est = pd.DataFrame(np.concatenate([trace['λ_white'], trace['λ_black']], 1).T)*factor

    rate_est['year'] = np.ravel([np.arange(11)+2005]*2)
    rate_est['race'] = np.r_[['white']*11, ['black']*11]

    race_df = pd.melt(rate_est, id_vars=['year', 'race'], value_name='rate')
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(14,8))
    g = sns.boxplot(x="year", y="rate", hue="race", data=race_df, fliersize=0,
                   palette=sns.diverging_palette(220, 20, n=2), ax=ax)
    
    return g

In [65]:

def multi_rate_plot(trace, early_trace, ylim=None, factor=1):
    
    fig, axes = plt.subplots(2, 1)
    plot_rates(trace, axes[0], factor=factor)
    plot_rates(early_trace, axes[1], factor=factor)
    axes[0].set_title('Late onset')
    axes[1].set_title('Early onset')
    axes[0].set_xticklabels([''])
    axes[0].set_xlabel('')
    axes[1].legend_.remove()
    if ylim is not None:
        axes[0].set_ylim(ylim)
        axes[1].set_ylim(ylim)
    
    return axes

In [66]:

multi_rate_plot(davidson_trace, davidson_trace_early, factor=10000)

Out[66]:

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7b263eb10b70>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7b263ead6e80>],
      dtype=object)

Shelby county¶

Early onset

In [68]:

shelby_model_early = county_model('Shelby', 1)

In [69]:

with shelby_model_early:
    approx = fit(n=50000, random_seed=RANDOM_SEEDS[0])
    shelby_early_trace = approx.sample(draws=1000)

Average Loss = 66.344: 100%|██████████| 50000/50000 [04:45<00:00, 174.98it/s]
Finished [100%]: Average Loss = 66.342

In [70]:

traceplot(shelby_early_trace, varnames=['σ', 'ρ', 'η']);

Late onset model

In [71]:

shelby_model = county_model('Shelby', 0)

In [73]:

with shelby_model:
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    shelby_trace = approx.sample(draws=1000)

Average Loss = 322.71: 100%|██████████| 20000/20000 [01:52<00:00, 177.45it/s]
Finished [100%]: Average Loss = 321.76

Plots of posterior parameter estimates

In [74]:

traceplot(shelby_trace, varnames=['σ', 'ρ', 'η']);

Boxplots and tables of rate estimates

In [76]:

multi_rate_plot(shelby_trace, shelby_early_trace, factor=10000)

Out[76]:

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7b263f67e4e0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7b26248c4860>],
      dtype=object)

Model fit check

In [79]:

ppc = sample_ppc(davidson_trace, samples=500, model=davidson_model)

100%|██████████| 500/500 [00:12<00:00, 40.10it/s]

In [80]:

county_ind = np.where(county_list=='DAVIDSON')[0][0]
_, black, year, births, early_onset, cases = merged_covs[merged_covs.county==county_ind].values.T
plt.hist([np.searchsorted(sample, case)/500 for case,sample in zip(cases, ppc['obs'].T) if case])

Out[80]:

(array([ 0.,  0.,  0.,  0.,  0., 17.,  0.,  0.,  0.,  0.]),
 array([-0.5, -0.4, -0.3, -0.2, -0.1,  0. ,  0.1,  0.2,  0.3,  0.4,  0.5]),
 <a list of 10 Patch objects>)

Regional models¶

In [ ]:

def region_model(region, early_onset):
        
    county_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region==region].COUNTY.unique()))[0]

    with Model() as model:

        # Make local references to data
        subset_ind = (merged_covs.county.isin(county_ind)) & (merged_covs.early_onset==early_onset)
        county, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T

        data_shape = cases.shape

        # Index survival rate from county
        qsurv = quarterly_survival.reset_index(drop=True)
        surv = np.array([qsurv.loc[c, y+2005] for c,y in zip(county, year)])
        # Apply survival to births
        rng = tt.shared_randomstreams.RandomStreams()
        n_vector = rng.binomial(n=births, p=surv, size=data_shape)

        # African american effect
        ρ = Gamma('ρ', 1, 0.1, shape=2)
        η = Gamma('η', 1, 0.1, shape=2)
        K_white = η[0] * cov.ExpQuad(1, ρ[0])
        K_black = η[1] * cov.ExpQuad(1, ρ[1])

        σ = HalfCauchy('σ', 3, shape=2)
        s = tt.concatenate([tt.tile(σ[0], 5), tt.tile(σ[1], nyears-5)])
        
        X = np.arange(nyears).reshape(-1,1)
        θ_white = MvNormal('θ_white', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s, 
                                       shape=(nyears,))
        θ_black = MvNormal('θ_black', mu=tt.zeros(nyears), cov=K_black(X) + tt.eye(nyears)*s, 
                                        shape=(nyears,))
        
        λ = Deterministic('λ', tt.exp(tt.concatenate([θ_white[year[black==0]], θ_black[year[black==1]]])))

        # Poisson GBS rate, by time race and county
        λ_black = Deterministic('λ_black', tt.exp(θ_black)*10000)
        λ_white = Deterministic('λ_white', tt.exp(θ_white)*10000)

        # Data likelihood
        obs = Poisson('obs', n_vector * λ, observed=shared(cases))
        
    return model

Eastern Tennessee¶

Late onset model

In [ ]:

with region_model('East', 0) as east_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    east_trace = approx.sample(draws=1000)

Plots of posterior parameter estimates

In [ ]:

traceplot(east_trace, varnames=['σ', 'ρ', 'η']);

Early onset model

In [ ]:

with region_model('East', 1) as east_early_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    east_early_trace = approx.sample(draws=1000)

In [ ]:

traceplot(east_early_trace, varnames=['σ', 'ρ', 'η']);

Plots and tables of rates

In [ ]:

multi_rate_plot(east_trace, east_early_trace, ylim=(0,40))

In [ ]:

def rate_table(trace, race=''):
    varname = 'λ_%s' % race if race else 'λ'
    df = df_summary(trace, varnames=[varname])
    df.index = range(2005, 2016)
    return df

In [ ]:

rate_table(east_trace, 'black')

In [ ]:

rate_table(east_trace, 'white')

West Tennessee¶

Late onset

In [ ]:

with region_model('West', 0) as west_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    west_trace = approx.sample(draws=1000)

In [ ]:

traceplot(west_trace, varnames=['σ', 'ρ', 'η']);

Early onset

In [ ]:

with region_model('West', 1) as west_early_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    west_early_trace = approx.sample(draws=1000)

In [ ]:

traceplot(west_early_trace, varnames=['σ', 'ρ', 'η']);

Plots and tables of rates

In [ ]:

multi_rate_plot(west_trace, west_early_trace)

In [ ]:

rate_table(west_trace, 'black')

In [ ]:

rate_table(west_trace, 'white')

Middle Tennessee¶

In [ ]:

with region_model('Middle', 0) as middle_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_trace = approx.sample(draws=1000)

In [ ]:

traceplot(middle_trace, varnames=['σ', 'ρ', 'η']);

In [ ]:

with region_model('Middle', 1) as middle_early_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_early_trace = approx.sample(draws=1000)

In [ ]:

traceplot(middle_early_trace, varnames=['σ', 'ρ', 'η']);

Plot and table of rates

In [ ]:

multi_rate_plot(middle_trace, middle_early_trace)

In [ ]:

rate_table(middle_trace, 'black')

In [ ]:

rate_table(middle_trace, 'white')

Race-pooled model¶

In [ ]:

def pooled_model(region, early_onset):
        
    county_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region==region].COUNTY.unique()))[0]

    with Model() as model:

        # Make local references to data
        subset_ind = (merged_covs.county.isin(county_ind)) & (merged_covs.early_onset==early_onset)
        county, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T

        data_shape = cases.shape

        # Index survival rate from county
        qsurv = quarterly_survival.reset_index(drop=True)
        surv = np.array([qsurv.loc[c, y+2005] for c,y in zip(county, year)])
        # Apply survival to births
        rng = tt.shared_randomstreams.RandomStreams()
        n_vector = rng.binomial(n=births, p=surv, size=data_shape)

        # African american effect
        ρ = Gamma('ρ', 1, 0.1)
        η = Gamma('η', 1, 0.1)
        K = η * cov.ExpQuad(1, ρ)

        σ = HalfCauchy('σ', 3)
        
        X = np.arange(nyears).reshape(-1,1)
        θ = MvNormal('θ', mu=tt.zeros(nyears), cov=K(X) + tt.eye(nyears)*σ, 
                                       shape=(nyears,))
        
        # Poisson GBS rate, by time race and county
        λ = Deterministic('λ', tt.exp(θ)*10000)

        # Data likelihood
        obs = Poisson('obs', n_vector * tt.exp(θ[year]), observed=shared(cases))
        
    return model

Middle region

In [ ]:

with pooled_model('Middle', 0) as middle_pooled_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_pooled_trace = approx.sample(draws=1000)

In [ ]:

with pooled_model('Middle', 1) as middle_early_pooled_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    middle_early_pooled_trace = approx.sample(draws=1000)

In [ ]:

def plot_pooled_rates(trace, early_trace, ax=None, factor=1):
    
    sns.set_context("notebook", font_scale=1.5)
    
    rate_est = pd.DataFrame(np.concatenate([trace['λ'], 
                                            early_trace['λ']], 1).T)*factor

    rate_est['year'] = np.ravel([np.arange(11)+2005]*2)
    rate_est['onset'] = np.r_[['late']*11, ['early']*11]

    race_df = pd.melt(rate_est, id_vars=['year', 'onset'], value_name='rate')
    
    if ax is None:
        fig, ax = plt.subplots(figsize=(14,8))
    g = sns.boxplot(x="year", y="rate", hue="onset", data=race_df, fliersize=0,
                   palette=sns.color_palette("husl", 2), ax=ax)
    
    return g

In [ ]:

ax = plot_pooled_rates(middle_pooled_trace, middle_early_pooled_trace)
ax.set_title('Middle Tennessee GBS rates');

In [ ]:

rate_table(middle_pooled_trace)

In [ ]:

rate_table(middle_early_pooled_trace)

East region

In [ ]:

with pooled_model('East', 0) as east_pooled_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    east_pooled_trace = approx.sample(draws=1000)

In [ ]:

with pooled_model('East', 1) as east_early_pooled_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    east_early_pooled_trace = approx.sample(draws=1000)

In [ ]:

ax = plot_pooled_rates(east_pooled_trace, east_early_pooled_trace)
ax.set_title('East Tennessee GBS rates');

In [ ]:

rate_table(east_pooled_trace)

In [ ]:

rate_table(east_early_pooled_trace)

West region

In [ ]:

with pooled_model('West', 0) as west_pooled_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    west_pooled_trace = approx.sample(draws=1000)

In [ ]:

with pooled_model('West', 1) as west_early_pooled_model:
    
    approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
    west_early_pooled_trace = approx.sample(draws=1000)

In [ ]:

ax = plot_pooled_rates(west_pooled_trace, west_early_pooled_trace)
ax.set_title('West Tennessee GBS rates');

In [ ]:

rate_table(west_pooled_trace)

In [ ]:

rate_table(west_early_pooled_trace)

	county	black	year	births
0	0	0	0	828
1	0	0	1	742
2	0	0	2	881
3	0	0	3	815
4	0	0	4	733
5	0	0	5	785
6	0	0	6	754
7	0	0	7	762
8	0	0	8	716
9	0	0	9	741
...	...	...	...	...
408	18	1	1	88
409	18	1	2	80
410	18	1	3	86
411	18	1	4	94
412	18	1	5	87
413	18	1	6	90
414	18	1	7	84
415	18	1	8	77
416	18	1	9	92
417	18	1	10	107

	county	black	year	births
0	0	0	0	828
1	0	0	1	742
2	0	0	2	881
3	0	0	3	815
4	0	0	4	733
5	0	0	5	785
6	0	0	6	754
7	0	0	7	762
8	0	0	8	716
9	0	0	9	741
...	...	...	...	...
408	18	1	1	88
409	18	1	2	80
410	18	1	3	86
411	18	1	4	94
412	18	1	5	87
413	18	1	6	90
414	18	1	7	84
415	18	1	8	77
416	18	1	9	92
417	18	1	10	107

	county	black	year	births
0	0	0	0	828
1	0	0	1	742
2	0	0	2	881
3	0	0	3	815
4	0	0	4	733
5	0	0	5	785
6	0	0	6	754
7	0	0	7	762
8	0	0	8	716
9	0	0	9	741
...	...	...	...	...
408	18	1	1	88
409	18	1	2	80
410	18	1	3	86
411	18	1	4	94
412	18	1	5	87
413	18	1	6	90
414	18	1	7	84
415	18	1	8	77
416	18	1	9	92
417	18	1	10	107