Import data from REDCap

In [ ]:

from redcap import Project

api_url = 'https://redcap.vanderbilt.edu/api/'

newborns_key = open("/Users/fonnescj/Dropbox/Halasa Biostats/newborns_api_key.txt").read()
newborns_proj = Project(api_url, newborns_key)
newborns = newborns_proj.export_records(format='df', df_kwargs={'index_col': newborns_proj.field_names[0]})

hospitalized_key = open("/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt").read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized = hospitalized_proj.export_records(format='df', df_kwargs={'index_col': hospitalized_proj.field_names[0]})

In [ ]:

hospitalized_key = open("/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt").read()
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized_proj.metadata

In [2]:

newborns.shape

Out[2]:

(14093, 125)

In [3]:

hospitalized.shape

Out[3]:

(3175, 338)

Hospitalized with missing ID numbers:

In [4]:

hospitalized[(hospitalized.newborn_id==1) & (hospitalized.newborn_id_number.isnull())].index

Out[4]:

Index([u'D3543'], dtype=object)

Create season variable for newborns:

In [5]:

hospitalized.child_birth_date[:3]

Out[5]:

case_id
A0001      2010-02-02
A0002      2010-01-29
A0003      2009-03-28
Name: child_birth_date, dtype: object

In [6]:

newborns.birth_date_newborn[:3]

Out[6]:

case_id_newborn
B1098              2010-11-22
B1545              2010-06-27
C2779              2010-12-20
Name: birth_date_newborn, dtype: object

In [7]:

#hospitalized['birth_date'] = pd.to_datetime(hospitalized.child_birth_date, errors='raise', format='%Y-%m-%d')
hospitalized['birth_date'] = [pd.to_datetime(d) for d in hospitalized.child_birth_date]
#newborns['birth_date'] = pd.to_datetime(newborns.birth_date_newborn, errors='raise')
newborns['birth_date'] = [pd.to_datetime(d) for d in newborns.birth_date_newborn]

In [8]:

hospitalized.birth_date.describe()

Out[8]:

count                    3175
unique                   1185
first     2008-05-20 00:00:00
last      2013-12-20 00:00:00
top       2011-12-14 00:00:00
freq                       13
dtype: object

In [9]:

newborns.birth_date.describe()

Out[9]:

count                   14090
unique                    967
first     2010-01-29 00:00:00
last      2013-01-31 00:00:00
top       2011-08-24 00:00:00
freq                       38
dtype: object

Identify and remove bogus dates:

In [10]:

hospitalized.birth_date[[type(d)!=pd.tslib.Timestamp for d in hospitalized.birth_date]]

Out[10]:

Series([], dtype: datetime64[ns])

In [11]:

newborns.birth_date[[type(d)!=datetime.datetime for d in newborns.birth_date]]

Out[11]:

case_id_newborn
B1098             2010-11-22 00:00:00
B1545             2010-06-27 00:00:00
C2779             2010-12-20 00:00:00
E`10602           2012-06-11 00:00:00
E0001             2010-01-30 00:00:00
E0002             2010-01-29 00:00:00
E0003             2010-01-30 00:00:00
E0004             2010-01-30 00:00:00
E0005             2010-01-31 00:00:00
E0006             2010-01-31 00:00:00
E0007             2010-02-01 00:00:00
E0008             2010-02-01 00:00:00
E0009             2010-02-02 00:00:00
E0010             2010-02-02 00:00:00
E0011             2010-02-06 00:00:00
...
H7985             2011-10-24 00:00:00
H7986             2011-10-24 00:00:00
H7987             2011-10-24 00:00:00
H7988             2011-10-24 00:00:00
H7989             2011-10-25 00:00:00
H7990             2011-10-25 00:00:00
H7991             2011-10-25 00:00:00
H7992             2011-10-25 00:00:00
H7993             2011-10-25 00:00:00
H7994             2011-10-25 00:00:00
H7995             2011-10-25 00:00:00
H7996             2011-10-27 00:00:00
H7997             2011-10-26 00:00:00
H7998             2011-10-26 00:00:00
H7999             2011-10-26 00:00:00
Name: birth_date, Length: 14093, dtype: datetime64[ns]

In [12]:

min(hospitalized.birth_date[hospitalized.birth_date.notnull()]), \
max(hospitalized.birth_date[hospitalized.birth_date.notnull()])

Out[12]:

(Timestamp('2008-05-20 00:00:00', tz=None),
 Timestamp('2013-12-20 00:00:00', tz=None))

In [13]:

newborns.birth_date[newborns.birth_date.notnull()].min(), \
newborns.birth_date[newborns.birth_date.notnull()].max()

Out[13]:

(Timestamp('2010-01-29 00:00:00', tz=None),
 Timestamp('2013-01-31 00:00:00', tz=None))

In [16]:

newborns.birth_date[newborns.birth_date.notnull()].min(), \
newborns.birth_date[newborns.birth_date.notnull()].max()

Out[16]:

(Timestamp('2010-01-29 00:00:00', tz=None),
 Timestamp('2013-01-31 00:00:00', tz=None))

Remove records with null birth dates:

In [17]:

newborns_complete = newborns[newborns.birth_date.notnull()]
hospitalized_complete = hospitalized[hospitalized.birth_date.notnull() & hospitalized.newborn_id_number.notnull()]

Create season-by-year variable

In [18]:

newborns_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \
    (b.month in [6,7,8])*3 or 4) for b in newborns_complete.birth_date]
hospitalized_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \
    (b.month in [6,7,8])*3 or 4) for b in hospitalized_complete.birth_date]

List of all study seasons:

In [19]:

study_seasons = [(y,s) for y in (2010, 2011, 2012) for s in (1,2,3,4)] + [(2013,1)]
study_seasons

Out[19]:

[(2010, 1),
 (2010, 2),
 (2010, 3),
 (2010, 4),
 (2011, 1),
 (2011, 2),
 (2011, 3),
 (2011, 4),
 (2012, 1),
 (2012, 2),
 (2012, 3),
 (2012, 4),
 (2013, 1)]

Counts of season-by-year entries for both tables:

In [20]:

hospitalized_counts = hospitalized_complete.birth_year_season.value_counts()
hospitalized_counts

Out[20]:

(2012, 4)    70
(2012, 1)    53
(2011, 4)    53
(2011, 1)    49
(2011, 3)    45
(2012, 2)    38
(2012, 3)    34
(2011, 2)    27
(2010, 2)    18
(2010, 4)    16
(2010, 3)    15
(2010, 1)    14
(2013, 1)     6
dtype: int64

In [21]:

newborn_counts = newborns_complete.birth_year_season.value_counts()
newborn_counts

Out[21]:

(2012, 4)    1483
(2011, 3)    1454
(2011, 4)    1451
(2012, 1)    1377
(2012, 2)    1291
(2012, 3)    1126
(2011, 1)    1076
(2011, 2)    1031
(2010, 2)    1009
(2010, 4)     879
(2010, 3)     866
(2010, 1)     691
(2013, 1)     356
dtype: int64

In [114]:

samples = []
k = 10

for ys in study_seasons:
    # Get number of cases
    n = hospitalized_counts[ys]
    # Randomize list of all corresponding newborns
    newborns_perm = np.random.permutation(newborns_complete.index[newborns_complete.birth_year_season==ys].tolist())
    # Select 3 newborns for each case
    sample = newborns_perm[:(n*k)]
    # Verify that we got enough samples
    if len(sample)!=(n*k):
        print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)
    samples += np.reshape(sample, (n,k)).tolist()

In [42]:

bogus_samples = pd.DataFrame(samples)
try_again = bogus_samples.ix[[45 , 73 , 119 , 122 , 137 , 158 , 174 , 198],[0,1]]

In [23]:

len(samples)==(len(hospitalized_complete[[ys in study_seasons for ys in hospitalized_complete.birth_year_season]]))

Out[23]:

True

Test that all samples are unique (no double sampling):

In [24]:

len(np.unique(np.ravel(samples))) == len(np.ravel(samples))

Out[24]:

True

In [27]:

samples_df = pd.DataFrame(samples)
#samples_df.to_csv('newborn_samples.csv')

Resample for cases that could not get a control:

In [69]:

samples = pd.read_csv('newborn_samples.csv', index_col=0)
samples.head()

Out[69]:

	0	1	2	3	4	5	6	7	8	9
0	H3818	E0070	G2743	F5003	F1039	G2053	G2779	G2785	H3005	E0065
1	G2044	H3082	G2786	E0629	G2041	H3073	F1038	H3835	G2071	F1050
2	G2046	G2759	F 1053	H3074	H3811	E0078	E0049	H3805	G2774	F5057
3	F5052	G2063	F5022	G2013	E0636	E0674	H3053	F5000	E0086	F1057
4	H3841	G2765	E0029	G2011	E0056	E0055	F5011	F1076	G2748	F5025

In [77]:

exclude = np.array(samples).ravel()
exclude

Out[77]:

array(['H3818', 'E0070', 'G2743', ..., 'F15078', 'F15126', 'E14412'], dtype=object)

In [62]:

try_again = [tuple(i) for i in np.array(try_again, int)]

In [90]:

exclude = [i for i in exclude if i in newborns_complete.index]
newborns_remainder = newborns_complete.drop(exclude)

In [117]:

new_samples = []
k = 10

for ys in try_again:
    # Get number of cases
    n = 1
    # Randomize list of all corresponding newborns
    newborns_perm = np.random.permutation(newborns_remainder.index[newborns_remainder.birth_year_season==ys].tolist())
    # Select k newborns for each case
    sample = newborns_perm[:(n*k)]
    # Verify that we got enough samples
    if len(sample)!=(n*k):
        print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)
    new_samples += np.reshape(sample, (n,k)).tolist()

In [118]:

len(np.unique(np.ravel(new_samples))) == len(np.ravel(new_samples))

Out[118]:

True

In [119]:

new_samples_df = pd.DataFrame(new_samples)
#new_samples_df.to_csv('replacement_newborn_samples.csv')