#!/usr/bin/env python
# coding: utf-8
"""Draw a stratified random sample of hospitalized cases.

Pipeline (originally exported from a Jupyter notebook):

1. Load ``data/hospitalized.csv``.
2. Assign every admission to a study year (``virus_year``).
3. Keep rows with ``rsv_count`` below 30 that have a blood culture or a
   saliva swab.
4. For each study year draw a fixed number of rows, split between
   patients on oxygen and patients not on oxygen.
5. Check the result and write it to ``rsv_random_sample.csv``.
"""

import pandas as pd
import numpy as np
from datetime import datetime

INPUT_PATH = 'data/hospitalized.csv'
OUTPUT_PATH = "rsv_random_sample.csv"

VIRUS_YEARS = (2011, 2012, 2013)
# Rows are eligible only when rsv_count is strictly below this value
# (the original notebook comment refers to this as "CT<30").
RSV_COUNT_CUTOFF = 30
# Number of rows drawn per study year and the share of them on oxygen.
SAMPLES_PER_YEAR = 92
OXYGEN_FRACTION = .4
# Set to an int to make the draw reproducible; None keeps it unseeded.
RANDOM_SEED = None


def assign_virus_year(df):
    """Add a ``virus_year`` column to *df* in place and return *df*.

    Rows default to 2011. Admissions from 2011-03-31 through 2012-03-31
    (both ends inclusive) become 2012; admissions after 2012-03-31
    become 2013.
    """
    # NOTE(review): the cutoffs are compared against the column as read
    # from the CSV; this presumably relies on ISO ``YYYY-MM-DD`` strings
    # so that lexicographic order matches date order -- confirm.
    dates = df.admission_date
    df['virus_year'] = 2011
    in_2012 = (dates >= '2011-03-31') & (dates <= '2012-03-31')
    df.loc[in_2012, 'virus_year'] = 2012
    df.loc[dates > '2012-03-31', 'virus_year'] = 2013
    return df


def select_eligible(df, cutoff=RSV_COUNT_CUTOFF):
    """Return the rows of *df* that qualify for sampling.

    A row qualifies when ``rsv_count`` is below *cutoff* and either
    ``blood_culture`` or ``saliva_swab`` equals 1.
    """
    low_count = df.rsv_count < cutoff
    has_specimen = (df.blood_culture == 1) | (df.saliva_swab == 1)
    return df[low_count & has_specimen]


def draw_stratified_sample(df, n=SAMPLES_PER_YEAR,
                           oxygen_fraction=OXYGEN_FRACTION,
                           years=VIRUS_YEARS, seed=RANDOM_SEED):
    """Draw *n* rows per study year, stratified by oxygen status.

    For every year in *years*, ``int(n * oxygen_fraction)`` rows are
    drawn from those with ``oxygen == 1`` and the remainder from all
    other rows (including rows whose ``oxygen`` value is missing).

    Raises ``ValueError`` when a stratum holds fewer rows than requested.
    """
    n_ox = int(n * oxygen_fraction)
    n_noox = n - n_ox
    # One shared generator so that a fixed seed reproduces the whole draw
    # without every stratum restarting from the same state.
    rng = None if seed is None else np.random.RandomState(seed)

    pieces = []
    for year in years:
        year_subset = df[df.virus_year == year]
        on_oxygen = year_subset.oxygen == 1
        strata = (
            ("oxygen", year_subset[on_oxygen], n_ox),
            ("no oxygen", year_subset[~on_oxygen], n_noox),
        )
        for label, stratum, size in strata:
            if len(stratum) < size:
                raise ValueError(
                    "virus_year {0}, {1}: need {2} rows but only {3} "
                    "are eligible".format(year, label, size, len(stratum))
                )
            pieces.append(stratum.sample(n=size, random_state=rng))
    return pd.concat(pieces)


def check_sample(sample, expected_size, cutoff=RSV_COUNT_CUTOFF):
    """Raise ``AssertionError`` if *sample* violates the design.

    The sample must contain exactly *expected_size* rows and no row with
    ``rsv_count`` at or above *cutoff* (mirroring the eligibility filter).
    """
    # Explicit raises rather than ``assert`` so the checks survive ``-O``.
    if len(sample) != expected_size:
        raise AssertionError(
            "expected {0} sampled rows, got {1}".format(
                expected_size, len(sample))
        )
    if (sample.rsv_count >= cutoff).sum():
        raise AssertionError(
            "sample contains rows with rsv_count >= {0}".format(cutoff)
        )


def main():
    """Run the full pipeline and export the sample as CSV."""
    hospitalized = pd.read_csv(INPUT_PATH, index_col=0)

    # Assign study year
    assign_virus_year(hospitalized)

    # Keep only rsv_count below the cutoff with a blood culture or swab
    data_subset = select_eligible(hospitalized)

    # Draw random samples
    rsv_random_sample = draw_stratified_sample(data_subset)

    # Ensure sample size and eligibility
    check_sample(rsv_random_sample, SAMPLES_PER_YEAR * len(VIRUS_YEARS))

    # Export the samples
    rsv_random_sample.to_csv(OUTPUT_PATH)


if __name__ == "__main__":
    main()