import pandas as pd
import numpy as np
from datetime import datetime
# Load the hospitalization table, using the first CSV column as the index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what triggers the mixed-type
# DtypeWarning on import.
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0, low_memory=False)
#hospitalized.head()
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,262,281,282,283,297,298) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
# Display the (rows, columns) dimensions of the loaded table.
hospitalized.shape
(3168, 407)
Assign study year
# Label every admission with a study year. Rows matched by neither date mask
# below keep the initial value of 2011.
admission = hospitalized.admission_date
in_2012 = admission.between('2011-03-31', '2012-03-31')  # both bounds inclusive
after_2012 = admission > '2012-03-31'

hospitalized['virus_year'] = 2011
hospitalized.loc[in_2012, 'virus_year'] = 2012
hospitalized.loc[after_2012, 'virus_year'] = 2013

# Tally admissions per study year.
hospitalized.virus_year.value_counts()
2012 1191 2013 1179 2011 798 Name: virus_year, dtype: int64
Keep only cases with CT<30 and a blood culture or saliva swab
# Build the selection mask piece by piece: rsv_count below 30, combined with
# at least one of the blood-culture / saliva-swab flags being set to 1.
low_count = hospitalized['rsv_count'] < 30
has_blood_culture = hospitalized['blood_culture'] == 1
has_saliva_swab = hospitalized['saliva_swab'] == 1
conditions = low_count & (has_blood_culture | has_saliva_swab)

# Keep only the rows that satisfy the mask and show the resulting dimensions.
data_subset = hospitalized.loc[conditions]
data_subset.shape
(636, 408)
Draw random samples:
# Per-year sample size, split 40% on oxygen / 60% not on oxygen.
n = 92
n_ox = int(n*.4)
n_noox = n - n_ox

# One shared, seeded generator so the exported sample can be regenerated
# exactly on a re-run instead of changing every time the cell executes.
sample_rng = np.random.RandomState(42)

sampled_strata = []
for virus_year in (2011, 2012, 2013):
    year_subset = data_subset[data_subset.virus_year == virus_year]

    # Oxygen flag. Rows with a missing oxygen value belong to neither
    # stratum; a bare ~on_oxygen would silently count them as "no oxygen".
    on_oxygen = year_subset.oxygen == 1
    oxygen_known = year_subset.oxygen.notnull()
    oxygen_subset = year_subset[on_oxygen]
    no_oxygen_subset = year_subset[oxygen_known & ~on_oxygen]

    # Sampling is without replacement (pandas default), so pandas raises a
    # ValueError if a stratum holds fewer rows than requested.
    sampled_strata.append(oxygen_subset.sample(n=n_ox, random_state=sample_rng))
    sampled_strata.append(no_oxygen_subset.sample(n=n_noox, random_state=sample_rng))

# Stack the six per-year, per-stratum draws into a single table.
rsv_random_sample = pd.concat(sampled_strata)
Ensure sample size:
# Three study years were sampled, n rows each.
expected_total = 3 * n
assert rsv_random_sample.shape[0] == expected_total

# Cross-tabulate study year against oxygen status within the sample.
pd.crosstab(index=rsv_random_sample.virus_year, columns=rsv_random_sample.oxygen)
oxygen | 0.0 | 1.0 |
---|---|---|
virus_year | ||
2011 | 56 | 36 |
2012 | 56 | 36 |
2013 | 56 | 36 |
# Every sampled row must satisfy the selection criterion rsv_count < 30.
# Testing for `> 30` instead would let a value of exactly 30, or a missing
# value, slip through unnoticed.
assert (rsv_random_sample.rsv_count < 30).all()
Export the samples:
# Write the sampled rows to a CSV file.
output_path = "rsv_random_sample.csv"
rsv_random_sample.to_csv(output_path)