%matplotlib inline
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pylab as plt
import geopandas as gpd
RANDOM_SEEDS = 20090425, 19700903
Dropping all columns that are mostly missing; converting dates.
DATA_DIR = '../data/'
gbs_data = pd.read_csv(DATA_DIR+'raw/gbs_neo_27oct2016.csv',
encoding='ISO-8859-1',
parse_dates=['ADMIT', 'DISCHRG', 'BIRTH']).dropna('columns', thresh=300)
Number of rows and columns in resulting import
gbs_data.shape
(543, 156)
Proportion of races.
gbs_data[['white','black','amerind','asian','pacific','unkrace']].mean()
white 0.405157 black 0.526703 amerind 0.000000 asian 0.005525 pacific 0.000000 unkrace 0.047882 dtype: float64
A large portion of ethnicity is undeclared, so probably not useful.
gbs_data.ETHNIC.value_counts()
2 311 9 195 1 37 Name: ETHNIC, dtype: int64
gbs_data['hispanic'] = gbs_data.ETHNIC==1
gbs_data.loc[gbs_data.ETHNIC==9, 'hispanic'] = np.nan
gbs_data.hispanic.value_counts()
0.0 311 1.0 37 Name: hispanic, dtype: int64
Drop entries with no admit date or before 2004
gbs_data = gbs_data[gbs_data.ADMIT>'2003-12-31'].dropna(subset=['ADMIT'])
Data of last admission in dataset
gbs_data.ADMIT.max()
Timestamp('2015-12-29 00:00:00')
gbs_data.ADMIT.min()
Timestamp('2004-01-03 00:00:00')
Distribution of age in days:
ax = gbs_data.AGE_DYS.hist()
ax.set_xlabel('Age (days)')
ax.set_ylabel('Frequency');
Flag for early onset
gbs_data['early_onset'] = (gbs_data.AGE_DYS<7).astype(int)
Counts of records by county
gbs_data.COUNTY.value_counts()
SHELBY 222 DAVIDSON 70 HAMILTON 62 KNOX 44 RUTHERFORD 29 SUMNER 11 WILLIAMSON 11 WILSON 8 ROBERTSON 8 MADISON 7 JEFFERSON 3 SEVIER 3 DICKSON 3 BLOUNT 3 CHEATHAM 2 LOUDON 2 ANDERSON 2 ROANE 1 GRAINGER 1 Name: COUNTY, dtype: int64
Plot of time series of counts
sns.set_style('dark')
sns.set_palette("plasma", n_colors=len(gbs_data.COUNTY.unique()))
fig, axes = plt.subplots(4,5, figsize=(14,8), sharey=True, sharex=True)
for ax, (county, data) in zip(axes.ravel(), gbs_data.groupby('COUNTY')):
(data.assign(case=gbs_data.NotACase==0)
.set_index('ADMIT')
.resample('A')
.case.sum()
.fillna(0)
.apply(lambda x: x + np.random.randn()*0.04)
.plot(style='o', label=county, alpha=0.6, ax=ax))
ax.set_title(county)
ax.set_xlim('2005-1-1', '2016-1-1')
ax.set_xlabel('')
sns.set_style('dark')
sns.set_palette("plasma", n_colors=len(gbs_data.COUNTY.unique()))
fig = plt.figure(figsize=(14,6))
for county, data in gbs_data.groupby('COUNTY'):
(data.assign(case=gbs_data.NotACase==0)
.set_index('ADMIT')
.resample('A')
.case.sum()
.fillna(0)
.apply(lambda x: x + np.random.randn()*0.04)
.plot(style='.-', label=county, alpha=0.6))
plt.legend(bbox_to_anchor=(1.01, 1.0))
plt.ylabel('Count')
plt.xlabel('');
county_map = gpd.read_file(DATA_DIR+'GIS/TN_counties')
county_map['NAME'] = county_map.NAME.str.upper()
county_map = county_map.set_index('NAME').join(gbs_data.COUNTY.value_counts())
county_map['gbs_count'] = county_map.COUNTY.fillna(0)
Assign counties to regions
study_counties = county_map[county_map.index.isin(gbs_data.COUNTY.unique())].copy()
study_counties['Region'] = 'East'
study_counties.loc[['ROBERTSON', 'SUMNER', 'CHEATHAM', 'DAVIDSON', 'WILSON',
'DICKSON', 'RUTHERFORD', 'WILLIAMSON'], 'Region'] = 'Middle'
study_counties.loc[['MADISON', 'SHELBY'], 'Region'] = 'West'
gbs_data = gbs_data.merge(study_counties, left_on='COUNTY', right_index=True).drop(['COUNTY_x', 'COUNTY_y'], axis=1)
gbs_data['region_ind'] = gbs_data.Region.replace({'West':0, 'Middle':1, 'East':2})
with sns.axes_style("ticks"):
ax = county_map.plot(column='gbs_count', cmap='Blues', figsize=(14,8))
sns.despine(bottom=True, left=True)
(study_counties.reset_index()
.apply(lambda x: ax.annotate(s=x.NAME, xy=x.geometry.centroid.coords[0], ha='center'),axis=1))
Year of admission
gbs_data['admit_year'] = gbs_data.ADMIT.dt.year
pd.set_option('display.max_rows', 20, 'display.max_columns', 35)
gbs_pivot= gbs_data.pivot_table(index=['black', 'COUNTY', 'early_onset'], columns=['admit_year'],
values='ADMIT', aggfunc='count').fillna(0).astype(int)
complete_index = pd.MultiIndex.from_product(gbs_pivot.index.levels,
names=['black', 'COUNTY', 'early_onset'])
annual_cases = pd.melt(gbs_pivot.reindex(complete_index, fill_value=0).reset_index(),
id_vars=['black', 'COUNTY', 'early_onset'],
value_name='cases')
annual_cases.head()
black | COUNTY | early_onset | admit_year | cases | |
---|---|---|---|---|---|
0 | 0 | ANDERSON | 0 | 2004 | 0 |
1 | 0 | ANDERSON | 1 | 2004 | 0 |
2 | 0 | BLOUNT | 0 | 2004 | 0 |
3 | 0 | BLOUNT | 1 | 2004 | 0 |
4 | 0 | CHEATHAM | 0 | 2004 | 0 |
births_sheets = pd.read_excel(DATA_DIR+'raw/births.xlsx', sheetname=None)
/home/fonnesbeck/anaconda3/envs/dev/lib/python3.6/site-packages/pandas/io/excel.py:329: FutureWarning: The `sheetname` keyword is deprecated, use `sheet_name` instead **kwds)
births_raw = pd.concat([births_sheets[s].assign(year=int(s)) for s in births_sheets])
births_raw['county'] = births_raw.county.str.upper()
births_raw.head()
county | rate | births | population | race | year | |
---|---|---|---|---|---|---|
0 | MONTGOMERY | 19.7 | 2890 | 146853 | all | 2005 |
1 | BEDFORD | 17.1 | 722 | 42247 | all | 2005 |
2 | ROBERTSON | 16.5 | 996 | 60374 | all | 2005 |
3 | DAVIDSON | 16.4 | 9409 | 574419 | all | 2005 |
4 | RUTHERFORD | 16.0 | 3491 | 218458 | all | 2005 |
Remove unneeded counties
county_list = gbs_data.COUNTY.unique()
county_list.sort()
births_subset = births_raw[births_raw.county.isin(county_list)]
births_subset.shape
(627, 6)
births_race = births_subset[births_subset.race!='all']
births_race.drop(['rate','population'], axis=1).pivot_table(index=['race', 'county'],
columns='year')
births | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
year | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | |
race | county | |||||||||||
black | ANDERSON | 34 | 44 | 43 | 39 | 38 | 32 | 32 | 32 | 34 | 33 | 28 |
BLOUNT | 38 | 37 | 45 | 40 | 40 | 34 | 32 | 32 | 45 | 41 | 38 | |
CHEATHAM | 10 | 7 | 3 | 3 | 3 | 4 | 5 | 1 | 6 | 4 | 8 | |
DAVIDSON | 2857 | 2951 | 3012 | 3116 | 3032 | 2883 | 2820 | 2803 | 2961 | 3010 | 2995 | |
DICKSON | 22 | 35 | 13 | 34 | 20 | 26 | 17 | 15 | 23 | 34 | 20 | |
GRAINGER | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 3 | 1 | |
HAMILTON | 999 | 1095 | 1124 | 1095 | 1021 | 991 | 910 | 979 | 1026 | 983 | 1005 | |
JEFFERSON | 7 | 7 | 8 | 7 | 5 | 9 | 12 | 11 | 9 | 9 | 11 | |
KNOX | 527 | 564 | 563 | 609 | 570 | 593 | 546 | 533 | 532 | 568 | 547 | |
LOUDON | 9 | 6 | 3 | 4 | 9 | 5 | 5 | 5 | 6 | 8 | 7 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
white | LOUDON | 455 | 520 | 537 | 528 | 515 | 459 | 544 | 527 | 531 | 495 | 508 |
MADISON | 796 | 805 | 824 | 742 | 749 | 664 | 654 | 716 | 662 | 684 | 655 | |
ROANE | 493 | 516 | 574 | 531 | 467 | 506 | 481 | 464 | 460 | 421 | 438 | |
ROBERTSON | 906 | 955 | 999 | 969 | 886 | 757 | 839 | 828 | 799 | 804 | 791 | |
RUTHERFORD | 2938 | 3053 | 3249 | 3349 | 3007 | 3133 | 3015 | 2951 | 3063 | 3199 | 3176 | |
SEVIER | 966 | 1084 | 1140 | 1111 | 1077 | 1023 | 1000 | 998 | 973 | 1032 | 1015 | |
SHELBY | 5795 | 5991 | 5895 | 5581 | 5022 | 5046 | 5183 | 4968 | 5004 | 4995 | 4836 | |
SUMNER | 1724 | 1830 | 1877 | 1927 | 1776 | 1703 | 1741 | 1742 | 1816 | 1901 | 1896 | |
WILLIAMSON | 1858 | 1999 | 2060 | 1851 | 1878 | 1791 | 1837 | 1748 | 1889 | 1963 | 2045 | |
WILSON | 1247 | 1282 | 1282 | 1277 | 1180 | 1222 | 1213 | 1241 | 1260 | 1323 | 1324 |
38 rows × 11 columns
births_long = (pd.melt(births_race, value_vars=['births'], id_vars=['county', 'race', 'year'])
.drop('variable', axis=1))
births_long.head()
county | race | year | value | |
---|---|---|---|---|
0 | RUTHERFORD | black | 2005 | 396 |
1 | DAVIDSON | black | 2005 | 2857 |
2 | MADISON | black | 2005 | 565 |
3 | SHELBY | black | 2005 | 8206 |
4 | CHEATHAM | black | 2005 | 10 |
min_year = births_long.year.min()
min_year
2005
Create numeric fields for analysis, sort by race, year, county.
births_numeric = births_long.copy()
births_numeric['county'] = births_numeric.county.apply(county_list.searchsorted)
births_numeric['race'] = (births_numeric.race=='black').astype(int)
births_numeric['year'] = births_numeric.year - births_numeric.year.min()
births_numeric.columns = ['county', 'black', 'year', 'births']
births_numeric = (births_numeric.sort_values(by=['black', 'county', 'year'])
.reset_index(drop=True))
births_numeric
county | black | year | births | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 828 |
1 | 0 | 0 | 1 | 742 |
2 | 0 | 0 | 2 | 881 |
3 | 0 | 0 | 3 | 815 |
4 | 0 | 0 | 4 | 733 |
5 | 0 | 0 | 5 | 785 |
6 | 0 | 0 | 6 | 754 |
7 | 0 | 0 | 7 | 762 |
8 | 0 | 0 | 8 | 716 |
9 | 0 | 0 | 9 | 741 |
... | ... | ... | ... | ... |
408 | 18 | 1 | 1 | 88 |
409 | 18 | 1 | 2 | 80 |
410 | 18 | 1 | 3 | 86 |
411 | 18 | 1 | 4 | 94 |
412 | 18 | 1 | 5 | 87 |
413 | 18 | 1 | 6 | 90 |
414 | 18 | 1 | 7 | 84 |
415 | 18 | 1 | 8 | 77 |
416 | 18 | 1 | 9 | 92 |
417 | 18 | 1 | 10 | 107 |
418 rows × 4 columns
white_births = births_subset.drop(['rate','population','race'], axis=1)[births_subset.race=='white']
white_births.pivot(columns='year', index='county')
births | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|
year | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 |
county | |||||||||||
ANDERSON | 828 | 742 | 881 | 815 | 733 | 785 | 754 | 762 | 716 | 741 | 721 |
BLOUNT | 1254 | 1282 | 1295 | 1329 | 1226 | 1139 | 1234 | 1218 | 1243 | 1264 | 1165 |
CHEATHAM | 479 | 446 | 511 | 510 | 448 | 401 | 420 | 438 | 419 | 461 | 429 |
DAVIDSON | 6175 | 6594 | 6565 | 6540 | 6056 | 6247 | 6310 | 6411 | 6441 | 6674 | 6678 |
DICKSON | 662 | 663 | 648 | 653 | 560 | 627 | 560 | 541 | 592 | 580 | 606 |
GRAINGER | 259 | 268 | 283 | 261 | 254 | 234 | 222 | 218 | 214 | 244 | 206 |
HAMILTON | 2874 | 3062 | 3108 | 3108 | 3030 | 2993 | 3014 | 3046 | 3030 | 3029 | 3140 |
JEFFERSON | 561 | 577 | 592 | 590 | 529 | 512 | 512 | 518 | 489 | 529 | 512 |
KNOX | 4360 | 4668 | 4617 | 4705 | 4524 | 4320 | 4407 | 4568 | 4421 | 4482 | 4593 |
LOUDON | 455 | 520 | 537 | 528 | 515 | 459 | 544 | 527 | 531 | 495 | 508 |
MADISON | 796 | 805 | 824 | 742 | 749 | 664 | 654 | 716 | 662 | 684 | 655 |
ROANE | 493 | 516 | 574 | 531 | 467 | 506 | 481 | 464 | 460 | 421 | 438 |
ROBERTSON | 906 | 955 | 999 | 969 | 886 | 757 | 839 | 828 | 799 | 804 | 791 |
RUTHERFORD | 2938 | 3053 | 3249 | 3349 | 3007 | 3133 | 3015 | 2951 | 3063 | 3199 | 3176 |
SEVIER | 966 | 1084 | 1140 | 1111 | 1077 | 1023 | 1000 | 998 | 973 | 1032 | 1015 |
SHELBY | 5795 | 5991 | 5895 | 5581 | 5022 | 5046 | 5183 | 4968 | 5004 | 4995 | 4836 |
SUMNER | 1724 | 1830 | 1877 | 1927 | 1776 | 1703 | 1741 | 1742 | 1816 | 1901 | 1896 |
WILLIAMSON | 1858 | 1999 | 2060 | 1851 | 1878 | 1791 | 1837 | 1748 | 1889 | 1963 | 2045 |
WILSON | 1247 | 1282 | 1282 | 1277 | 1180 | 1222 | 1213 | 1241 | 1260 | 1323 | 1324 |
black_births = births_subset.drop(['rate','population','race'], axis=1)[births_subset.race=='black']
black_births.pivot(columns='year', index='county')
births | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|
year | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 |
county | |||||||||||
ANDERSON | 34 | 44 | 43 | 39 | 38 | 32 | 32 | 32 | 34 | 33 | 28 |
BLOUNT | 38 | 37 | 45 | 40 | 40 | 34 | 32 | 32 | 45 | 41 | 38 |
CHEATHAM | 10 | 7 | 3 | 3 | 3 | 4 | 5 | 1 | 6 | 4 | 8 |
DAVIDSON | 2857 | 2951 | 3012 | 3116 | 3032 | 2883 | 2820 | 2803 | 2961 | 3010 | 2995 |
DICKSON | 22 | 35 | 13 | 34 | 20 | 26 | 17 | 15 | 23 | 34 | 20 |
GRAINGER | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 3 | 1 |
HAMILTON | 999 | 1095 | 1124 | 1095 | 1021 | 991 | 910 | 979 | 1026 | 983 | 1005 |
JEFFERSON | 7 | 7 | 8 | 7 | 5 | 9 | 12 | 11 | 9 | 9 | 11 |
KNOX | 527 | 564 | 563 | 609 | 570 | 593 | 546 | 533 | 532 | 568 | 547 |
LOUDON | 9 | 6 | 3 | 4 | 9 | 5 | 5 | 5 | 6 | 8 | 7 |
MADISON | 565 | 550 | 617 | 639 | 582 | 550 | 595 | 554 | 582 | 544 | 576 |
ROANE | 12 | 13 | 12 | 20 | 13 | 12 | 10 | 11 | 12 | 6 | 4 |
ROBERTSON | 74 | 71 | 72 | 84 | 83 | 77 | 77 | 78 | 64 | 71 | 51 |
RUTHERFORD | 396 | 464 | 517 | 545 | 475 | 479 | 478 | 503 | 522 | 620 | 594 |
SEVIER | 5 | 8 | 8 | 6 | 5 | 10 | 5 | 6 | 11 | 11 | 10 |
SHELBY | 8206 | 8640 | 8763 | 8971 | 8554 | 8204 | 8244 | 8350 | 8313 | 8375 | 8030 |
SUMNER | 147 | 142 | 151 | 165 | 137 | 127 | 134 | 141 | 141 | 177 | 158 |
WILLIAMSON | 85 | 84 | 93 | 104 | 78 | 89 | 72 | 77 | 90 | 78 | 79 |
WILSON | 79 | 88 | 80 | 86 | 94 | 87 | 90 | 84 | 77 | 92 | 107 |
mortality_all = pd.read_excel(DATA_DIR+'raw/county_infant_mortality.xlsx')
mortality_all.head()
2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | |
---|---|---|---|---|---|---|---|---|---|---|
Tennessee | 8.62 | 8.79 | 8.69 | 8.29 | 8.03 | 7.98 | 7.89 | 7.39 | 7.18 | 6.78 |
Anderson | 7.1 | 5.67 | 12.45 | 4.2 | 2.3 | 5.01 | 7.21 | 8.72 | 7.34 | 6.54 |
Bedford | 12.21 | 18.01 | 2.82 | 17.71 | 6.13 | 7.36 | 7.79 | 11.71 | 4.85 | 4.98 |
Benton | 6.17 | 17.44 | 12.35 | 0 | 0 | 0 | 19.35 | 0 | 0 | 0 |
Bledsoe | 7.52 | 0 | 15.27 | 6.45 | 0 | 26.09 | 15.15 | 0 | 0 | 0 |
Remove unneeded counties and convert to probabilities
mortality_subset = (mortality_all[mortality_all.index.str.upper().isin(gbs_data.COUNTY.unique())]/1000).astype(float)
mortality_subset
2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | |
---|---|---|---|---|---|---|---|---|---|---|
Anderson | 0.00710 | 0.00567 | 0.01245 | 0.00420 | 0.00230 | 0.00501 | 0.00721 | 0.00872 | 0.00734 | 0.00654 |
Blount | 0.00675 | 0.00230 | 0.00601 | 0.00882 | 0.00720 | 0.00230 | 0.00673 | 0.00696 | 0.00314 | 0.00306 |
Cheatham | 0.01222 | 0.01008 | 0.01538 | 0.00000 | 0.00193 | 0.00217 | 0.01220 | 0.00466 | 0.00677 | 0.00932 |
Davidson | 0.00655 | 0.00786 | 0.00933 | 0.00801 | 0.00673 | 0.00767 | 0.00764 | 0.00750 | 0.00710 | 0.00767 |
Dickson | 0.00622 | 0.00871 | 0.00708 | 0.01046 | 0.00574 | 0.01017 | 0.01212 | 0.00172 | 0.00175 | 0.00323 |
Grainger | 0.01709 | 0.00382 | 0.00000 | 0.00000 | 0.00379 | 0.00775 | 0.00422 | 0.01794 | 0.00455 | 0.00463 |
Hamilton | 0.00729 | 0.00934 | 0.01125 | 0.00970 | 0.00973 | 0.00784 | 0.00974 | 0.00791 | 0.00791 | 0.00671 |
Jefferson | 0.00765 | 0.00522 | 0.00852 | 0.00979 | 0.01329 | 0.00548 | 0.00749 | 0.00755 | 0.00739 | 0.00400 |
Knox | 0.00452 | 0.00776 | 0.00650 | 0.00468 | 0.00710 | 0.00507 | 0.00668 | 0.00525 | 0.00549 | 0.00489 |
Loudon | 0.00588 | 0.00638 | 0.00563 | 0.00181 | 0.00370 | 0.00183 | 0.00423 | 0.00538 | 0.00184 | 0.00921 |
Madison | 0.01567 | 0.01729 | 0.00948 | 0.01160 | 0.01216 | 0.00659 | 0.00563 | 0.00787 | 0.00927 | 0.00637 |
Roane | 0.00000 | 0.00000 | 0.00753 | 0.00845 | 0.01266 | 0.01029 | 0.00192 | 0.01212 | 0.01046 | 0.00840 |
Robertson | 0.00628 | 0.00602 | 0.00478 | 0.00735 | 0.00658 | 0.00497 | 0.00471 | 0.00537 | 0.00651 | 0.00688 |
Rutherford | 0.00510 | 0.00630 | 0.00522 | 0.00377 | 0.00589 | 0.00811 | 0.00664 | 0.00493 | 0.00413 | 0.00454 |
Sevier | 0.00307 | 0.01010 | 0.00361 | 0.00942 | 0.00703 | 0.00902 | 0.00753 | 0.00491 | 0.00587 | 0.00997 |
Shelby | 0.01284 | 0.01153 | 0.01378 | 0.01267 | 0.01230 | 0.01298 | 0.01030 | 0.00958 | 0.01065 | 0.00923 |
Sumner | 0.00434 | 0.00578 | 0.00689 | 0.00386 | 0.00927 | 0.00351 | 0.00907 | 0.00780 | 0.00466 | 0.00600 |
Williamson | 0.00412 | 0.00298 | 0.00278 | 0.00311 | 0.00195 | 0.00097 | 0.00457 | 0.00398 | 0.00362 | 0.00334 |
Wilson | 0.00850 | 0.00892 | 0.00577 | 0.00501 | 0.00648 | 0.00456 | 0.00591 | 0.00374 | 0.00221 | 0.00364 |
Calculate quarterly survival
quarterly_survival = (1 - mortality_subset)**(1/4)
quarterly_survival
2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | |
---|---|---|---|---|---|---|---|---|---|---|
Anderson | 0.998220 | 0.998579 | 0.996873 | 0.998948 | 0.999425 | 0.998745 | 0.998193 | 0.997813 | 0.998160 | 0.998361 |
Blount | 0.998308 | 0.999425 | 0.998494 | 0.997788 | 0.998195 | 0.999425 | 0.998313 | 0.998255 | 0.999214 | 0.999234 |
Cheatham | 0.996931 | 0.997470 | 0.996133 | 1.000000 | 0.999517 | 0.999457 | 0.996936 | 0.998833 | 0.998303 | 0.997662 |
Davidson | 0.998358 | 0.998029 | 0.997659 | 0.997991 | 0.998313 | 0.998077 | 0.998085 | 0.998120 | 0.998220 | 0.998077 |
Dickson | 0.998441 | 0.997815 | 0.998225 | 0.997375 | 0.998562 | 0.997448 | 0.996956 | 0.999570 | 0.999562 | 0.999192 |
Grainger | 0.995700 | 0.999044 | 1.000000 | 1.000000 | 0.999051 | 0.998057 | 0.998943 | 0.995485 | 0.998861 | 0.998840 |
Hamilton | 0.998172 | 0.997657 | 0.997176 | 0.997566 | 0.997559 | 0.998034 | 0.997556 | 0.998017 | 0.998017 | 0.998318 |
Jefferson | 0.998082 | 0.998692 | 0.997863 | 0.997543 | 0.996661 | 0.998627 | 0.998122 | 0.998107 | 0.998147 | 0.998998 |
Knox | 0.998868 | 0.998054 | 0.998371 | 0.998828 | 0.998220 | 0.998730 | 0.998326 | 0.998685 | 0.998625 | 0.998775 |
Loudon | 0.998527 | 0.998401 | 0.998590 | 0.999547 | 0.999074 | 0.999542 | 0.998941 | 0.998652 | 0.999540 | 0.997690 |
Madison | 0.996059 | 0.995649 | 0.997622 | 0.997087 | 0.996946 | 0.998348 | 0.998590 | 0.998027 | 0.997674 | 0.998404 |
Roane | 1.000000 | 1.000000 | 0.998112 | 0.997881 | 0.996820 | 0.997418 | 0.999520 | 0.996956 | 0.997375 | 0.997893 |
Robertson | 0.998426 | 0.998492 | 0.998803 | 0.998157 | 0.998351 | 0.998755 | 0.998820 | 0.998655 | 0.998369 | 0.998276 |
Rutherford | 0.998723 | 0.998421 | 0.998692 | 0.999056 | 0.998524 | 0.997966 | 0.998336 | 0.998765 | 0.998966 | 0.998863 |
Sevier | 0.999232 | 0.997465 | 0.999096 | 0.997637 | 0.998238 | 0.997737 | 0.998112 | 0.998770 | 0.998529 | 0.997498 |
Shelby | 0.996774 | 0.997105 | 0.996537 | 0.996817 | 0.996911 | 0.996739 | 0.997415 | 0.997596 | 0.997327 | 0.997684 |
Sumner | 0.998913 | 0.998552 | 0.998273 | 0.999034 | 0.997674 | 0.999121 | 0.997725 | 0.998044 | 0.998833 | 0.998497 |
Williamson | 0.998968 | 0.999254 | 0.999304 | 0.999222 | 0.999512 | 0.999757 | 0.998856 | 0.999004 | 0.999094 | 0.999164 |
Wilson | 0.997868 | 0.997763 | 0.998554 | 0.998745 | 0.998376 | 0.998858 | 0.998519 | 0.999064 | 0.999447 | 0.999089 |
Will simply assume 2013 mortality carries forward to 2015
quarterly_survival[2014] = quarterly_survival[2015] = quarterly_survival[2013]
quarterly_survival = quarterly_survival.drop(2004, axis=1)
mean_quarterly_survival = quarterly_survival.mean()
annual_cases.head()
black | COUNTY | early_onset | admit_year | cases | |
---|---|---|---|---|---|
0 | 0 | ANDERSON | 0 | 2004 | 0 |
1 | 0 | ANDERSON | 1 | 2004 | 0 |
2 | 0 | BLOUNT | 0 | 2004 | 0 |
3 | 0 | BLOUNT | 1 | 2004 | 0 |
4 | 0 | CHEATHAM | 0 | 2004 | 0 |
cases_numeric = annual_cases[annual_cases.admit_year>=min_year]
cases_numeric = (cases_numeric.assign(county=cases_numeric.COUNTY.apply(county_list.searchsorted),
year=cases_numeric.admit_year - cases_numeric.admit_year.min())
.drop(['COUNTY', 'admit_year'], axis=1))
cases_numeric.head()
black | early_onset | cases | county | year | |
---|---|---|---|---|---|
76 | 0 | 0 | 0 | 0 | 0 |
77 | 0 | 1 | 0 | 0 | 0 |
78 | 0 | 0 | 0 | 1 | 0 |
79 | 0 | 1 | 0 | 1 | 0 |
80 | 0 | 0 | 0 | 2 | 0 |
Indices of counties within regions
middle_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region=='Middle'].COUNTY.unique()))[0]
west_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region=='West'].COUNTY.unique()))[0]
east_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region=='East'].COUNTY.unique()))[0]
from pymc3 import Model, sample, traceplot, forestplot, NUTS, Metropolis
from pymc3 import energyplot, fit, summary, sample_ppc
from pymc3 import Multinomial, Normal, Poisson, Deterministic, Gamma, MvNormal
from pymc3 import Binomial, HalfCauchy, Uniform, StudentT, Exponential, GaussianRandomWalk
from pymc3.math import invlogit
from pymc3.gp import cov, mean
import theano.tensor as tt
from theano import shared
merge_cols = ['county', 'black', 'year']
merged_covs = (births_numeric.set_index(merge_cols)
.join(cases_numeric.set_index(merge_cols))
.fillna(0)
.reset_index()
.astype(int))
merged_covs = merged_covs.sort_values(by=['early_onset', 'black', 'county', 'year'], ascending=True)
ncounties = int(merged_covs.county.max()+1)
nyears = len(merged_covs.year.unique())
data_shape = (2, ncounties, nyears)
data_shape
(2, 19, 11)
merged_covs.groupby('early_onset').count()
county | black | year | births | cases | |
---|---|---|---|---|---|
early_onset | |||||
0 | 418 | 418 | 418 | 418 | 418 |
1 | 418 | 418 | 418 | 418 | 418 |
county_ind = np.where(county_list=='Davidson'.upper())[0][0]
merged_covs[merged_covs.county==county_ind]
county | black | year | births | early_onset | cases | |
---|---|---|---|---|---|---|
132 | 3 | 0 | 0 | 6175 | 0 | 4 |
134 | 3 | 0 | 1 | 6594 | 0 | 1 |
136 | 3 | 0 | 2 | 6565 | 0 | 0 |
138 | 3 | 0 | 3 | 6540 | 0 | 2 |
140 | 3 | 0 | 4 | 6056 | 0 | 3 |
142 | 3 | 0 | 5 | 6247 | 0 | 2 |
144 | 3 | 0 | 6 | 6310 | 0 | 0 |
146 | 3 | 0 | 7 | 6411 | 0 | 2 |
148 | 3 | 0 | 8 | 6441 | 0 | 0 |
150 | 3 | 0 | 9 | 6674 | 0 | 3 |
... | ... | ... | ... | ... | ... | ... |
157 | 3 | 1 | 1 | 2951 | 1 | 0 |
159 | 3 | 1 | 2 | 3012 | 1 | 3 |
161 | 3 | 1 | 3 | 3116 | 1 | 3 |
163 | 3 | 1 | 4 | 3032 | 1 | 1 |
165 | 3 | 1 | 5 | 2883 | 1 | 1 |
167 | 3 | 1 | 6 | 2820 | 1 | 1 |
169 | 3 | 1 | 7 | 2803 | 1 | 1 |
171 | 3 | 1 | 8 | 2961 | 1 | 0 |
173 | 3 | 1 | 9 | 3010 | 1 | 0 |
175 | 3 | 1 | 10 | 2995 | 1 | 0 |
44 rows × 6 columns
def county_model(county, early_onset):
county_ind = np.where(county_list==county.upper())[0][0]
with Model() as model:
# Make local references to data
subset_ind = (merged_covs.county==county_ind) & (merged_covs.early_onset==early_onset)
_, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T
data_shape = cases.shape
births_matrix = tt.reshape(births, data_shape)
surv_matrix = np.repeat([quarterly_survival.loc[county].values], 2)
rng = tt.shared_randomstreams.RandomStreams()
n_vector = rng.binomial(n=births_matrix, p=surv_matrix, size=data_shape)
ρ = Gamma('ρ', 1, 0.1, shape=2)
η = Gamma('η', 1, 0.1, shape=2)
K_white = η[0] * cov.ExpQuad(1, ρ[0])
K_black = η[1] * cov.ExpQuad(1, ρ[1])
σ = HalfCauchy('σ', 3, shape=2)
s = tt.concatenate([tt.tile(σ[0], 5), tt.tile(σ[1], nyears-5)])
X = np.arange(nyears).reshape(-1,1)
θ_black = MvNormal('θ_black', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s,
shape=(nyears,))
θ_white = MvNormal('θ_white', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s,
shape=(nyears,))
# Poisson GBS rate, by time race and county
λ_black = Deterministic('λ_black', tt.exp(θ_black))
λ_white = Deterministic('λ_white', tt.exp(θ_white))
# Data likelihood
obs = Poisson('obs', n_vector * tt.concatenate([λ_white, λ_black]), observed=shared(cases))
return model
Early onset
nsamples = 5000
ntune = 4000
davidson_model_early = county_model('Davidson', 1)
with davidson_model_early:
approx = fit(n=10000, random_seed=RANDOM_SEEDS[0])
davidson_trace_early = approx.sample(draws=1000)
Average Loss = 7,220.7: 100%|██████████| 10000/10000 [00:52<00:00, 190.48it/s] Finished [100%]: Average Loss = 7,201.6
traceplot(davidson_trace_early, varnames=['σ', 'ρ', 'η']);
Late onset
davidson_model = county_model('Davidson', 0)
with davidson_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
davidson_trace = approx.sample(draws=1000)
Average Loss = 292.35: 100%|██████████| 20000/20000 [01:56<00:00, 170.99it/s] Finished [100%]: Average Loss = 291.62
traceplot(davidson_trace, varnames=['σ', 'ρ', 'η']);
def plot_rates(trace, ax=None, factor=1):
#rate_est = pd.DataFrame(trace['λ', -500:].T) * 10000
rate_est = pd.DataFrame(np.concatenate([trace['λ_white'], trace['λ_black']], 1).T)*factor
rate_est['year'] = np.ravel([np.arange(11)+2005]*2)
rate_est['race'] = np.r_[['white']*11, ['black']*11]
race_df = pd.melt(rate_est, id_vars=['year', 'race'], value_name='rate')
if ax is None:
fig, ax = plt.subplots(figsize=(14,8))
g = sns.boxplot(x="year", y="rate", hue="race", data=race_df, fliersize=0,
palette=sns.diverging_palette(220, 20, n=2), ax=ax)
return g
def multi_rate_plot(trace, early_trace, ylim=None, factor=1):
fig, axes = plt.subplots(2, 1)
plot_rates(trace, axes[0], factor=factor)
plot_rates(early_trace, axes[1], factor=factor)
axes[0].set_title('Late onset')
axes[1].set_title('Early onset')
axes[0].set_xticklabels([''])
axes[0].set_xlabel('')
axes[1].legend_.remove()
if ylim is not None:
axes[0].set_ylim(ylim)
axes[1].set_ylim(ylim)
return axes
multi_rate_plot(davidson_trace, davidson_trace_early, factor=10000)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7b263eb10b70>, <matplotlib.axes._subplots.AxesSubplot object at 0x7b263ead6e80>], dtype=object)
Early onset
shelby_model_early = county_model('Shelby', 1)
with shelby_model_early:
approx = fit(n=50000, random_seed=RANDOM_SEEDS[0])
shelby_early_trace = approx.sample(draws=1000)
Average Loss = 66.344: 100%|██████████| 50000/50000 [04:45<00:00, 174.98it/s] Finished [100%]: Average Loss = 66.342
traceplot(shelby_early_trace, varnames=['σ', 'ρ', 'η']);
Late onset model
shelby_model = county_model('Shelby', 0)
with shelby_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
shelby_trace = approx.sample(draws=1000)
Average Loss = 322.71: 100%|██████████| 20000/20000 [01:52<00:00, 177.45it/s] Finished [100%]: Average Loss = 321.76
Plots of posterior parameter estimates
traceplot(shelby_trace, varnames=['σ', 'ρ', 'η']);
Boxplots and tables of rate estimates
multi_rate_plot(shelby_trace, shelby_early_trace, factor=10000)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7b263f67e4e0>, <matplotlib.axes._subplots.AxesSubplot object at 0x7b26248c4860>], dtype=object)
Model fit check
ppc = sample_ppc(davidson_trace, samples=500, model=davidson_model)
100%|██████████| 500/500 [00:12<00:00, 40.10it/s]
county_ind = np.where(county_list=='DAVIDSON')[0][0]
_, black, year, births, early_onset, cases = merged_covs[merged_covs.county==county_ind].values.T
plt.hist([np.searchsorted(sample, case)/500 for case,sample in zip(cases, ppc['obs'].T) if case])
(array([ 0., 0., 0., 0., 0., 17., 0., 0., 0., 0.]), array([-0.5, -0.4, -0.3, -0.2, -0.1, 0. , 0.1, 0.2, 0.3, 0.4, 0.5]), <a list of 10 Patch objects>)
def region_model(region, early_onset):
county_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region==region].COUNTY.unique()))[0]
with Model() as model:
# Make local references to data
subset_ind = (merged_covs.county.isin(county_ind)) & (merged_covs.early_onset==early_onset)
county, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T
data_shape = cases.shape
# Index survival rate from county
qsurv = quarterly_survival.reset_index(drop=True)
surv = np.array([qsurv.loc[c, y+2005] for c,y in zip(county, year)])
# Apply survival to births
rng = tt.shared_randomstreams.RandomStreams()
n_vector = rng.binomial(n=births, p=surv, size=data_shape)
# African american effect
ρ = Gamma('ρ', 1, 0.1, shape=2)
η = Gamma('η', 1, 0.1, shape=2)
K_white = η[0] * cov.ExpQuad(1, ρ[0])
K_black = η[1] * cov.ExpQuad(1, ρ[1])
σ = HalfCauchy('σ', 3, shape=2)
s = tt.concatenate([tt.tile(σ[0], 5), tt.tile(σ[1], nyears-5)])
X = np.arange(nyears).reshape(-1,1)
θ_white = MvNormal('θ_white', mu=tt.zeros(nyears), cov=K_white(X) + tt.eye(nyears)*s,
shape=(nyears,))
θ_black = MvNormal('θ_black', mu=tt.zeros(nyears), cov=K_black(X) + tt.eye(nyears)*s,
shape=(nyears,))
λ = Deterministic('λ', tt.exp(tt.concatenate([θ_white[year[black==0]], θ_black[year[black==1]]])))
# Poisson GBS rate, by time race and county
λ_black = Deterministic('λ_black', tt.exp(θ_black)*10000)
λ_white = Deterministic('λ_white', tt.exp(θ_white)*10000)
# Data likelihood
obs = Poisson('obs', n_vector * λ, observed=shared(cases))
return model
Late onset model
with region_model('East', 0) as east_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
east_trace = approx.sample(draws=1000)
Plots of posterior parameter estimates
traceplot(east_trace, varnames=['σ', 'ρ', 'η']);
Early onset model
with region_model('East', 1) as east_early_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
east_early_trace = approx.sample(draws=1000)
traceplot(east_early_trace, varnames=['σ', 'ρ', 'η']);
Plots and tables of rates
multi_rate_plot(east_trace, east_early_trace, ylim=(0,40))
def rate_table(trace, race=''):
varname = 'λ_%s' % race if race else 'λ'
df = df_summary(trace, varnames=[varname])
df.index = range(2005, 2016)
return df
rate_table(east_trace, 'black')
rate_table(east_trace, 'white')
Late onset
with region_model('West', 0) as west_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
west_trace = approx.sample(draws=1000)
traceplot(west_trace, varnames=['σ', 'ρ', 'η']);
Early onset
with region_model('West', 1) as west_early_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
west_early_trace = approx.sample(draws=1000)
traceplot(west_early_trace, varnames=['σ', 'ρ', 'η']);
Plots and tables of rates
multi_rate_plot(west_trace, west_early_trace)
rate_table(west_trace, 'black')
rate_table(west_trace, 'white')
with region_model('Middle', 0) as middle_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
middle_trace = approx.sample(draws=1000)
traceplot(middle_trace, varnames=['σ', 'ρ', 'η']);
with region_model('Middle', 1) as middle_early_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
middle_early_trace = approx.sample(draws=1000)
traceplot(middle_early_trace, varnames=['σ', 'ρ', 'η']);
Plot and table of rates
multi_rate_plot(middle_trace, middle_early_trace)
rate_table(middle_trace, 'black')
rate_table(middle_trace, 'white')
def pooled_model(region, early_onset):
county_ind = np.where(np.in1d(county_list, gbs_data[gbs_data.Region==region].COUNTY.unique()))[0]
with Model() as model:
# Make local references to data
subset_ind = (merged_covs.county.isin(county_ind)) & (merged_covs.early_onset==early_onset)
county, black, year, births, early_onset, cases = merged_covs[subset_ind].values.T
data_shape = cases.shape
# Index survival rate from county
qsurv = quarterly_survival.reset_index(drop=True)
surv = np.array([qsurv.loc[c, y+2005] for c,y in zip(county, year)])
# Apply survival to births
rng = tt.shared_randomstreams.RandomStreams()
n_vector = rng.binomial(n=births, p=surv, size=data_shape)
# African american effect
ρ = Gamma('ρ', 1, 0.1)
η = Gamma('η', 1, 0.1)
K = η * cov.ExpQuad(1, ρ)
σ = HalfCauchy('σ', 3)
X = np.arange(nyears).reshape(-1,1)
θ = MvNormal('θ', mu=tt.zeros(nyears), cov=K(X) + tt.eye(nyears)*σ,
shape=(nyears,))
# Poisson GBS rate, by time race and county
λ = Deterministic('λ', tt.exp(θ)*10000)
# Data likelihood
obs = Poisson('obs', n_vector * tt.exp(θ[year]), observed=shared(cases))
return model
Middle region
with pooled_model('Middle', 0) as middle_pooled_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
middle_pooled_trace = approx.sample(draws=1000)
with pooled_model('Middle', 1) as middle_early_pooled_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
middle_early_pooled_trace = approx.sample(draws=1000)
def plot_pooled_rates(trace, early_trace, ax=None, factor=1):
sns.set_context("notebook", font_scale=1.5)
rate_est = pd.DataFrame(np.concatenate([trace['λ'],
early_trace['λ']], 1).T)*factor
rate_est['year'] = np.ravel([np.arange(11)+2005]*2)
rate_est['onset'] = np.r_[['late']*11, ['early']*11]
race_df = pd.melt(rate_est, id_vars=['year', 'onset'], value_name='rate')
if ax is None:
fig, ax = plt.subplots(figsize=(14,8))
g = sns.boxplot(x="year", y="rate", hue="onset", data=race_df, fliersize=0,
palette=sns.color_palette("husl", 2), ax=ax)
return g
ax = plot_pooled_rates(middle_pooled_trace, middle_early_pooled_trace)
ax.set_title('Middle Tennessee GBS rates');
rate_table(middle_pooled_trace)
rate_table(middle_early_pooled_trace)
East region
with pooled_model('East', 0) as east_pooled_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
east_pooled_trace = approx.sample(draws=1000)
with pooled_model('East', 1) as east_early_pooled_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
east_early_pooled_trace = approx.sample(draws=1000)
ax = plot_pooled_rates(east_pooled_trace, east_early_pooled_trace)
ax.set_title('East Tennessee GBS rates');
rate_table(east_pooled_trace)
rate_table(east_early_pooled_trace)
West region
with pooled_model('West', 0) as west_pooled_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
west_pooled_trace = approx.sample(draws=1000)
with pooled_model('West', 1) as west_early_pooled_model:
approx = fit(n=20000, random_seed=RANDOM_SEEDS[0])
west_early_pooled_trace = approx.sample(draws=1000)
ax = plot_pooled_rates(west_pooled_trace, west_early_pooled_trace)
ax.set_title('West Tennessee GBS rates');
rate_table(west_pooled_trace)
rate_table(west_early_pooled_trace)