#!/usr/bin/env python
# coding: utf-8

# Import data from REDCap
#
# This file is an exported Jupyter notebook: the `# In[n]:` markers are the
# original cell boundaries, and bare expressions such as `newborns.shape` are
# the cells' display outputs. They have no effect when run as a script.

# In[ ]:

import pandas as pd
from redcap import Project

api_url = 'https://redcap.vanderbilt.edu/api/'


def _read_api_key(path):
    """Return the REDCap API token stored in the text file at `path`.

    The file is closed as soon as it has been read, and surrounding
    whitespace (such as a trailing newline) is stripped so that it is not
    passed on as part of the token.
    """
    with open(path) as key_file:
        return key_file.read().strip()


# Newborn project: export every record as a DataFrame indexed by the
# project's first field.
newborns_key = _read_api_key(
    "/Users/fonnescj/Dropbox/Halasa Biostats/newborns_api_key.txt")
newborns_proj = Project(api_url, newborns_key)
newborns = newborns_proj.export_records(
    format='df',
    df_kwargs={'index_col': newborns_proj.field_names[0]})

# Hospitalized project: same export, same indexing rule.
hospitalized_key = _read_api_key(
    "/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt")
hospitalized_proj = Project(api_url, hospitalized_key)
hospitalized = hospitalized_proj.export_records(
    format='df',
    df_kwargs={'index_col': hospitalized_proj.field_names[0]})


# In[ ]:

# The project object created above is reused; there is no need to read the
# key file and connect a second time just to look at the metadata.
hospitalized_proj.metadata


# In[2]:

newborns.shape


# In[3]:

hospitalized.shape


# Hospitalized with missing ID numbers:

# In[4]:

hospitalized[(hospitalized.newborn_id == 1)
             & (hospitalized.newborn_id_number.isnull())].index


# Create `season` variable for newborns:

# In[5]:

hospitalized.child_birth_date[:3]


# In[6]:

newborns.birth_date_newborn[:3]


# In[7]:

# Parse each raw birth-date column in a single vectorised call. Values that
# cannot be parsed become NaT instead of raising, which lets the cells below
# list them.
hospitalized['birth_date'] = pd.to_datetime(hospitalized.child_birth_date,
                                            errors='coerce')
newborns['birth_date'] = pd.to_datetime(newborns.birth_date_newborn,
                                        errors='coerce')


# In[8]:

hospitalized.birth_date.describe()


# In[9]:

newborns.birth_date.describe()


# Identify and remove bogus dates:

# In[10]:

# Raw values that were present but could not be parsed into a date.
hospitalized.child_birth_date[hospitalized.birth_date.isnull()
                              & hospitalized.child_birth_date.notnull()]


# In[11]:

newborns.birth_date_newborn[newborns.birth_date.isnull()
                            & newborns.birth_date_newborn.notnull()]


# In[12]:

# Earliest and latest parsed birth dates; Series.min()/max() skip missing
# values, so no explicit not-null filter is needed.
hospitalized.birth_date.min(), hospitalized.birth_date.max()
# In[13]:

# NumPy and pandas are used by the cells below.
import numpy as np
import pandas as pd

# Earliest and latest newborn birth dates; Series.min()/max() skip missing
# values.
newborns.birth_date.min(), newborns.birth_date.max()


# In[16]:

# (Same display as In[13]; kept from the original notebook.)
newborns.birth_date.min(), newborns.birth_date.max()


# Remove records with null birth dates:

# In[17]:

# .copy() makes these independent frames, so the column added in the next
# cell is not written onto a slice of `newborns` / `hospitalized`.
newborns_complete = newborns[newborns.birth_date.notnull()].copy()
hospitalized_complete = hospitalized[
    hospitalized.birth_date.notnull()
    & hospitalized.newborn_id_number.notnull()
].copy()


# Create season-by-year variable

# In[18]:

def _birth_year_season(birth_date):
    """Return ``(year, season)`` for `birth_date`.

    Seasons are numbered 1 (December-February), 2 (March-May),
    3 (June-August) and 4 (September-November). The year is the calendar
    year of the date itself, so a December birth is grouped with January and
    February of the same calendar year.
    """
    # month % 12 maps December to 0, so each run of three months shares one
    # integer quotient: 0 -> season 1, ..., 3 -> season 4.
    season = (birth_date.month % 12) // 3 + 1
    return (birth_date.year, season)


newborns_complete['birth_year_season'] = [
    _birth_year_season(b) for b in newborns_complete.birth_date]
hospitalized_complete['birth_year_season'] = [
    _birth_year_season(b) for b in hospitalized_complete.birth_date]


# List of all study seasons:

# In[19]:

study_seasons = ([(y, s) for y in (2010, 2011, 2012) for s in (1, 2, 3, 4)]
                 + [(2013, 1)])
study_seasons


# Counts of season-by-year entries for both tables:

# In[20]:

hospitalized_counts = hospitalized_complete.birth_year_season.value_counts()
hospitalized_counts


# In[21]:

newborn_counts = newborns_complete.birth_year_season.value_counts()
newborn_counts


# In[114]:

def _ids_in_season(frame, year_season):
    """Return the index labels of `frame` rows born in `year_season`.

    The comparison is done element by element in Python, so plain tuple
    equality is used and nothing depends on how pandas interprets a tuple
    operand in `Series == tuple`.
    """
    return [label
            for label, ys in zip(frame.index, frame.birth_year_season)
            if ys == year_season]


def _draw_controls(candidate_ids, n_cases, k, year_season):
    """Return `n_cases` rows of `k` ids drawn at random without replacement.

    Raises ValueError when `candidate_ids` holds fewer than `n_cases * k`
    ids, because the draw cannot then be shaped into complete rows.
    """
    required = n_cases * k
    sample = np.random.permutation(candidate_ids)[:required]
    if len(sample) != required:
        raise ValueError(
            'Only {0} newborns in {1} when {2} were required'.format(
                len(sample), year_season, required))
    return np.reshape(sample, (n_cases, k)).tolist()


samples = []
# (year, season) of the case that each row of `samples` was drawn for.
sample_seasons = []
k = 10
for ys in study_seasons:
    # Get number of cases (0 when the season has no hospitalized records)
    n = len(_ids_in_season(hospitalized_complete, ys))
    # Select k newborns for each case from a random ordering of all
    # newborns born in the same season
    samples += _draw_controls(_ids_in_season(newborns_complete, ys), n, k, ys)
    sample_seasons += [ys] * n


# In[42]:

bogus_samples = pd.DataFrame(samples)
# Rows of the sample table whose case needs a fresh set of controls.
retry_rows = [45, 73, 119, 122, 137, 158, 174, 198]
# NOTE(review): the notebook took columns 0 and 1 of these rows from
# `bogus_samples`. In this script those columns hold newborn ids, while
# `try_again` is matched against `birth_year_season` further down, so the
# season recorded for each row is used instead. Confirm that the row numbers
# refer to rows of newborn_samples.csv.
try_again = [sample_seasons[row] for row in retry_rows]


# In[23]:

# One row of controls per hospitalized case born in a study season?
len(samples) == sum(ys in study_seasons
                    for ys in hospitalized_complete.birth_year_season)


# Test that all samples are unique (no double sampling):

# In[24]:

len(np.unique(np.ravel(samples))) == len(np.ravel(samples))


# In[27]:

samples_df = pd.DataFrame(samples)
# Writing is left disabled, as in the notebook; the next cell reads this file.
#samples_df.to_csv('newborn_samples.csv')


# Resample for cases that could not get a control:

# In[69]:

# From here on `samples` is the table stored on disk, not the draw above.
samples = pd.read_csv('newborn_samples.csv', index_col=0)
samples.head()


# In[77]:

exclude = np.array(samples).ravel()
exclude


# In[62]:

# Normalise to tuples of plain ints so they compare equal to the
# (year, season) tuples stored in `birth_year_season`.
try_again = [tuple(int(v) for v in ys) for ys in try_again]


# In[90]:

# Drop every newborn that was already handed out; ids that are no longer in
# the table are skipped so that drop() does not fail on them.
exclude = [i for i in exclude if i in newborns_complete.index]
newborns_remainder = newborns_complete.drop(exclude)


# In[117]:

new_samples = []
k = 10
# Ids handed out earlier in this loop, so that two retries in the same
# season cannot receive the same newborn.
already_drawn = set()
for ys in try_again:
    # Get number of cases
    n = 1
    # Select k newborns for each case from the newborns not used so far
    candidates = [i for i in _ids_in_season(newborns_remainder, ys)
                  if i not in already_drawn]
    drawn_rows = _draw_controls(candidates, n, k, ys)
    for row in drawn_rows:
        already_drawn.update(row)
    new_samples += drawn_rows


# In[118]:

len(np.unique(np.ravel(new_samples))) == len(np.ravel(new_samples))


# In[119]:

new_samples_df = pd.DataFrame(new_samples)
# Writing is left disabled, as in the notebook.
#new_samples_df.to_csv('replacement_newborn_samples.csv')