In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
In [33]:
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
#hospitalized.head()
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,262,281,282,283,297,298) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [10]:
hospitalized.shape
Out[10]:
(3168, 407)

Assing study year

In [14]:
hospitalized['virus_year'] = 2011
hospitalized.loc[(hospitalized.admission_date >= '2011-03-31') 
                 & (hospitalized.admission_date <= '2012-03-31'), 'virus_year'] = 2012
hospitalized.loc[hospitalized.admission_date > '2012-03-31', 'virus_year'] = 2013

hospitalized.virus_year.value_counts()
Out[14]:
2012    1191
2013    1179
2011     798
Name: virus_year, dtype: int64

Filter out CT<30 and blood/saliva

In [15]:
conditions = (hospitalized.rsv_count<30) & ((hospitalized.blood_culture==1) | (hospitalized.saliva_swab==1))
data_subset = hospitalized[conditions]
In [16]:
data_subset.shape
Out[16]:
(636, 408)

Draw random samples:

In [23]:
n = 92
n_ox = int(n*.4)
n_noox = n - n_ox
In [25]:
rsv_random_sample = []

for virus_year in (2011, 2012, 2013):
    
    year_subset = data_subset[data_subset.virus_year==virus_year]
    
    # Oxygen flag
    on_oxygen = year_subset.oxygen==1

    oxygen_subset = year_subset[on_oxygen]
    no_oxygen_subset = year_subset[~on_oxygen]
    
    rsv_random_sample.append(oxygen_subset.sample(n=n_ox))
    rsv_random_sample.append(no_oxygen_subset.sample(n=n_noox))
    
In [26]:
rsv_random_sample = pd.concat(rsv_random_sample)

Ensure sample size:

In [28]:
assert len(rsv_random_sample) == n*3
In [29]:
pd.crosstab(rsv_random_sample.virus_year, rsv_random_sample.oxygen)
Out[29]:
oxygen 0.0 1.0
virus_year
2011 56 36
2012 56 36
2013 56 36
In [31]:
assert not (rsv_random_sample.rsv_count>30).sum()

Export the samples:

In [32]:
rsv_random_sample.to_csv("rsv_random_sample.csv")