Import data from REDCap
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
newborns_key = open("/Users/fonnescj/Dropbox/Halasa Biostats/newborns_api_key.txt").read()
newborns_proj = Project(api_url, newborns_key)
newborns = newborns_proj.export_records(format='df', df_kwargs={'index_col': newborns_proj.field_names[0]})
hospitalized_key = open("/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt").read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized = hospitalized_proj.export_records(format='df', df_kwargs={'index_col': hospitalized_proj.field_names[0]})
hospitalized_key = open("/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt").read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized_proj.metadata
newborns.shape
(14093, 125)
hospitalized.shape
(3175, 338)
Hospitalized with missing ID numbers:
hospitalized[(hospitalized.newborn_id==1) & (hospitalized.newborn_id_number.isnull())].index
Index([u'D3543'], dtype=object)
Create season
variable for newborns:
hospitalized.child_birth_date[:3]
case_id A0001 2010-02-02 A0002 2010-01-29 A0003 2009-03-28 Name: child_birth_date, dtype: object
newborns.birth_date_newborn[:3]
case_id_newborn B1098 2010-11-22 B1545 2010-06-27 C2779 2010-12-20 Name: birth_date_newborn, dtype: object
#hospitalized['birth_date'] = pd.to_datetime(hospitalized.child_birth_date, errors='raise', format='%Y-%m-%d')
hospitalized['birth_date'] = [pd.to_datetime(d) for d in hospitalized.child_birth_date]
#newborns['birth_date'] = pd.to_datetime(newborns.birth_date_newborn, errors='raise')
newborns['birth_date'] = [pd.to_datetime(d) for d in newborns.birth_date_newborn]
hospitalized.birth_date.describe()
count 3175 unique 1185 first 2008-05-20 00:00:00 last 2013-12-20 00:00:00 top 2011-12-14 00:00:00 freq 13 dtype: object
newborns.birth_date.describe()
count 14090 unique 967 first 2010-01-29 00:00:00 last 2013-01-31 00:00:00 top 2011-08-24 00:00:00 freq 38 dtype: object
Identify and remove bogus dates:
hospitalized.birth_date[[type(d)!=pd.tslib.Timestamp for d in hospitalized.birth_date]]
Series([], dtype: datetime64[ns])
newborns.birth_date[[type(d)!=datetime.datetime for d in newborns.birth_date]]
case_id_newborn B1098 2010-11-22 00:00:00 B1545 2010-06-27 00:00:00 C2779 2010-12-20 00:00:00 E`10602 2012-06-11 00:00:00 E0001 2010-01-30 00:00:00 E0002 2010-01-29 00:00:00 E0003 2010-01-30 00:00:00 E0004 2010-01-30 00:00:00 E0005 2010-01-31 00:00:00 E0006 2010-01-31 00:00:00 E0007 2010-02-01 00:00:00 E0008 2010-02-01 00:00:00 E0009 2010-02-02 00:00:00 E0010 2010-02-02 00:00:00 E0011 2010-02-06 00:00:00 ... H7985 2011-10-24 00:00:00 H7986 2011-10-24 00:00:00 H7987 2011-10-24 00:00:00 H7988 2011-10-24 00:00:00 H7989 2011-10-25 00:00:00 H7990 2011-10-25 00:00:00 H7991 2011-10-25 00:00:00 H7992 2011-10-25 00:00:00 H7993 2011-10-25 00:00:00 H7994 2011-10-25 00:00:00 H7995 2011-10-25 00:00:00 H7996 2011-10-27 00:00:00 H7997 2011-10-26 00:00:00 H7998 2011-10-26 00:00:00 H7999 2011-10-26 00:00:00 Name: birth_date, Length: 14093, dtype: datetime64[ns]
min(hospitalized.birth_date[hospitalized.birth_date.notnull()]), \
max(hospitalized.birth_date[hospitalized.birth_date.notnull()])
(Timestamp('2008-05-20 00:00:00', tz=None), Timestamp('2013-12-20 00:00:00', tz=None))
newborns.birth_date[newborns.birth_date.notnull()].min(), \
newborns.birth_date[newborns.birth_date.notnull()].max()
(Timestamp('2010-01-29 00:00:00', tz=None), Timestamp('2013-01-31 00:00:00', tz=None))
newborns.birth_date[newborns.birth_date.notnull()].min(), \
newborns.birth_date[newborns.birth_date.notnull()].max()
(Timestamp('2010-01-29 00:00:00', tz=None), Timestamp('2013-01-31 00:00:00', tz=None))
Remove records with null birth dates:
newborns_complete = newborns[newborns.birth_date.notnull()]
hospitalized_complete = hospitalized[hospitalized.birth_date.notnull() & hospitalized.newborn_id_number.notnull()]
Create season-by-year variable
newborns_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \
(b.month in [6,7,8])*3 or 4) for b in newborns_complete.birth_date]
hospitalized_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \
(b.month in [6,7,8])*3 or 4) for b in hospitalized_complete.birth_date]
List of all study seasons:
study_seasons = [(y,s) for y in (2010, 2011, 2012) for s in (1,2,3,4)] + [(2013,1)]
study_seasons
[(2010, 1), (2010, 2), (2010, 3), (2010, 4), (2011, 1), (2011, 2), (2011, 3), (2011, 4), (2012, 1), (2012, 2), (2012, 3), (2012, 4), (2013, 1)]
Counts of season-by-year entries for both tables:
hospitalized_counts = hospitalized_complete.birth_year_season.value_counts()
hospitalized_counts
(2012, 4) 70 (2012, 1) 53 (2011, 4) 53 (2011, 1) 49 (2011, 3) 45 (2012, 2) 38 (2012, 3) 34 (2011, 2) 27 (2010, 2) 18 (2010, 4) 16 (2010, 3) 15 (2010, 1) 14 (2013, 1) 6 dtype: int64
newborn_counts = newborns_complete.birth_year_season.value_counts()
newborn_counts
(2012, 4) 1483 (2011, 3) 1454 (2011, 4) 1451 (2012, 1) 1377 (2012, 2) 1291 (2012, 3) 1126 (2011, 1) 1076 (2011, 2) 1031 (2010, 2) 1009 (2010, 4) 879 (2010, 3) 866 (2010, 1) 691 (2013, 1) 356 dtype: int64
samples = []
k = 10
for ys in study_seasons:
# Get number of cases
n = hospitalized_counts[ys]
# Randomize list of all corresponding newborns
newborns_perm = np.random.permutation(newborns_complete.index[newborns_complete.birth_year_season==ys].tolist())
# Select 3 newborns for each case
sample = newborns_perm[:(n*k)]
# Verify that we got enough samples
if len(sample)!=(n*k):
print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)
samples += np.reshape(sample, (n,k)).tolist()
bogus_samples = pd.DataFrame(samples)
try_again = bogus_samples.ix[[45 , 73 , 119 , 122 , 137 , 158 , 174 , 198],[0,1]]
len(samples)==(len(hospitalized_complete[[ys in study_seasons for ys in hospitalized_complete.birth_year_season]]))
True
Test that all samples are unique (no double sampling):
len(np.unique(np.ravel(samples))) == len(np.ravel(samples))
True
samples_df = pd.DataFrame(samples)
#samples_df.to_csv('newborn_samples.csv')
Resample for cases that could not get a control:
samples = pd.read_csv('newborn_samples.csv', index_col=0)
samples.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | H3818 | E0070 | G2743 | F5003 | F1039 | G2053 | G2779 | G2785 | H3005 | E0065 |
1 | G2044 | H3082 | G2786 | E0629 | G2041 | H3073 | F1038 | H3835 | G2071 | F1050 |
2 | G2046 | G2759 | F 1053 | H3074 | H3811 | E0078 | E0049 | H3805 | G2774 | F5057 |
3 | F5052 | G2063 | F5022 | G2013 | E0636 | E0674 | H3053 | F5000 | E0086 | F1057 |
4 | H3841 | G2765 | E0029 | G2011 | E0056 | E0055 | F5011 | F1076 | G2748 | F5025 |
exclude = np.array(samples).ravel()
exclude
array(['H3818', 'E0070', 'G2743', ..., 'F15078', 'F15126', 'E14412'], dtype=object)
try_again = [tuple(i) for i in np.array(try_again, int)]
exclude = [i for i in exclude if i in newborns_complete.index]
newborns_remainder = newborns_complete.drop(exclude)
new_samples = []
k = 10
for ys in try_again:
# Get number of cases
n = 1
# Randomize list of all corresponding newborns
newborns_perm = np.random.permutation(newborns_remainder.index[newborns_remainder.birth_year_season==ys].tolist())
# Select k newborns for each case
sample = newborns_perm[:(n*k)]
# Verify that we got enough samples
if len(sample)!=(n*k):
print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)
new_samples += np.reshape(sample, (n,k)).tolist()
len(np.unique(np.ravel(new_samples))) == len(np.ravel(new_samples))
True
new_samples_df = pd.DataFrame(new_samples)
#new_samples_df.to_csv('replacement_newborn_samples.csv')