In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb
import pymc as pm
sb.set_style("white")
In [2]:
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0)
hospitalized.head()
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
Out[2]:
greater_48hrs fever_neutropenia never_left written_consent child_name mother_name mother_birth_date mother_record mother_nationality other_mother_nationality ... was_whole_blood_obtained_f date whole_blood_complete age_months length_of_stay gest_age death hospitalized_vitamin_d wheezing_ind sex
case_id
A0001 0 0 0 1 Remas Mahmoud Jbarah Huda Katalo 1976-01-21 NaN 3 NaN ... NaN NaN 0 1 6 40 False 3 0 F
A0002 0 0 0 1 Majed Abdel Kareem Majed Noor SHa'aban Mahmood 1989-09-09 NaN 3 NaN ... NaN NaN 0 1 5 40 False 4 1 M
A0003 0 0 0 1 Rayyan Jamal Muhyi Al.Deen SAra Hussein Muhyi Al.Deen 1965-01-01 NaN 1 NaN ... NaN NaN 0 11 10 40 False 35 0 F
A0004 0 0 0 1 Hanan Mohd Mustapha Abu Othman Kawla Abu Shanab 1983-10-31 NaN 1 NaN ... NaN NaN 0 7 3 38 False 2 1 F
A0005 0 0 0 1 Yara Mahmoud Azmi Ismael Suha Abdel Aziz 1986-02-28 NaN 1 NaN ... NaN NaN 0 2 1 39 False 6 0 F

5 rows × 414 columns

Convert dates

In [3]:
hospitalized.child_birth_date = pd.to_datetime(hospitalized.child_birth_date)
hospitalized.enrollment_date = pd.to_datetime(hospitalized.enrollment_date)
hospitalized.admission_date = pd.to_datetime(hospitalized.admission_date)
hospitalized.discharge_date = pd.to_datetime(hospitalized.discharge_date)

Assign virus year

In [10]:
hospitalized['virus_year'] = 2011
hospitalized.loc[(hospitalized.admission_date >= '2011-03-31') 
                 & (hospitalized.admission_date <= '2012-03-31'), 'virus_year'] = 2012
hospitalized.loc[hospitalized.admission_date > '2012-03-31', 'virus_year'] = 2013

hospitalized.virus_year.value_counts()
Out[10]:
2012    1191
2013    1179
2011     798
dtype: int64

Extract RSV subset

In [72]:
hospitalized['RSV'] = hospitalized['pcr_result___1']
RSV_subset = hospitalized[hospitalized.RSV==1]
RSV_subset.death.sum()
Out[72]:
7
In [78]:
RSV_subset.icu.mean()
Out[78]:
0.08110065170166546
In [73]:
deaths = RSV_subset[RSV_subset.death==1]

Dictionary to hold samples

In [79]:
random_sample = {2011: [], 2012: [], 2013: []}
In [80]:
for i,d in deaths.iterrows():
    random_sample[d.virus_year].append(i)
In [81]:
for year in random_sample:
    
    year_subset = RSV_subset[RSV_subset.virus_year==year]
    
    n_required = 31 - len(random_sample[year])
    
    # Determine number in each group
    n_oxygen = int(n_required * 0.4)
    n_vent_icu = int(n_required * 0.1)
    n_none = n_required - n_oxygen - n_vent

    # Mechanical vent or ICU patients
    random_sample[year] += year_subset[(year_subset.vent==1) | (year_subset.icu==1)].sample(n=n_vent).index.values.tolist()
    # Oxygen patients
    random_sample[year] += year_subset[(year_subset.oxygen==1) & (year_subset.vent==0)].sample(n=n_oxygen).index.values.tolist()    
    # No oxygen or ventilator
    random_sample[year] += year_subset[(year_subset.oxygen==0) & (year_subset.vent==0)].sample(n=n_none).index.values.tolist()

Here is the sample from each study year

In [82]:
random_sample
Out[82]:
{2011: ['B1191',
  'D3040',
  'C2104',
  'B1146',
  'B1105',
  'A0077',
  'A0137',
  'D3097',
  'D3080',
  'A0024',
  'C2015',
  'A0155',
  'A0109',
  'C2001',
  'C2138',
  'A0104',
  'C2086',
  'D3128',
  'A0182',
  'B1160',
  'D3109',
  'A0121',
  'D3142',
  'C2022',
  'D3089',
  'B1134',
  'B1030',
  'B1172',
  'B1165',
  'A0125',
  'B1193'],
 2012: ['A0368',
  'B1515',
  'D3416',
  'A0366',
  'D3374',
  'A0456',
  'C2382',
  'A0391',
  'C2383',
  'C2363',
  'D3292',
  'C2448',
  'D3436',
  'C2357',
  'D3430',
  'D3354',
  'C2353',
  'B1396',
  'D3346',
  'B1375',
  'C2402',
  'A0371',
  'A0430',
  'B1522',
  'C2464',
  'B1451',
  'D3345',
  'B1380',
  'C2459',
  'D3449',
  'C2462'],
 2013: ['A0718',
  'B1594',
  'D3731',
  'D3539',
  'A0614',
  'B1831',
  'C2637',
  'B1559',
  'D3633',
  'D3762',
  'C2673',
  'D3677',
  'B1724',
  'A0753',
  'B1752',
  'A0675',
  'A0720',
  'B1554',
  'B1568',
  'A0515',
  'A0683',
  'B1767',
  'A0748',
  'D3714',
  'D3679',
  'D3629',
  'C2779',
  'D3667',
  'B1772',
  'C2708',
  'A0530']}