%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb
import pymc as pm
sb.set_style("white")

# Load the hospitalized-patient table, using the first CSV column as the
# row index. low_memory=False makes pandas infer each column's dtype from
# the whole file in one pass; with the default chunked parsing this file
# triggers a DtypeWarning about mixed types in several columns.
hospitalized = pd.read_csv('data/hospitalized.csv', index_col=0,
                           low_memory=False)
hospitalized.head()
/usr/local/lib/python3.4/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (140,142,144,146,148,181,206,212,213,263,282,283,284,298,299) have mixed types. Specify dtype option on import or set low_memory=False. data = self._reader.read(nrows)
greater_48hrs | fever_neutropenia | never_left | written_consent | child_name | mother_name | mother_birth_date | mother_record | mother_nationality | other_mother_nationality | ... | was_whole_blood_obtained_f | date | whole_blood_complete | age_months | length_of_stay | gest_age | death | hospitalized_vitamin_d | wheezing_ind | sex | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
case_id | |||||||||||||||||||||
A0001 | 0 | 0 | 0 | 1 | Remas Mahmoud Jbarah | Huda Katalo | 1976-01-21 | NaN | 3 | NaN | ... | NaN | NaN | 0 | 1 | 6 | 40 | False | 3 | 0 | F |
A0002 | 0 | 0 | 0 | 1 | Majed Abdel Kareem Majed | Noor SHa'aban Mahmood | 1989-09-09 | NaN | 3 | NaN | ... | NaN | NaN | 0 | 1 | 5 | 40 | False | 4 | 1 | M |
A0003 | 0 | 0 | 0 | 1 | Rayyan Jamal Muhyi Al.Deen | SAra Hussein Muhyi Al.Deen | 1965-01-01 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 11 | 10 | 40 | False | 35 | 0 | F |
A0004 | 0 | 0 | 0 | 1 | Hanan Mohd Mustapha Abu Othman | Kawla Abu Shanab | 1983-10-31 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 7 | 3 | 38 | False | 2 | 1 | F |
A0005 | 0 | 0 | 0 | 1 | Yara Mahmoud Azmi Ismael | Suha Abdel Aziz | 1986-02-28 | NaN | 1 | NaN | ... | NaN | NaN | 0 | 2 | 1 | 39 | False | 6 | 0 | F |
5 rows × 414 columns
Convert dates
# Parse the four date columns into pandas datetimes. Bracket indexing is
# used for the assignment because attribute-style assignment only updates
# a column when one of that name already exists.
for date_column in ('child_birth_date', 'enrollment_date',
                    'admission_date', 'discharge_date'):
    hospitalized[date_column] = pd.to_datetime(hospitalized[date_column])
Assign virus year
# Label every admission with a virus year: 2011 by default, 2012 for
# admissions from 2011-03-31 through 2012-03-31 (both ends inclusive), and
# 2013 for admissions after 2012-03-31.
admitted = hospitalized['admission_date']
in_2012_window = admitted.between('2011-03-31', '2012-03-31')
after_2012_window = admitted > '2012-03-31'

hospitalized['virus_year'] = 2011
hospitalized.loc[in_2012_window, 'virus_year'] = 2012
hospitalized.loc[after_2012_window, 'virus_year'] = 2013

# Number of admissions assigned to each virus year.
hospitalized['virus_year'].value_counts()
2012 1191 2013 1179 2011 798 dtype: int64
Extract RSV subset
# Copy the pcr_result___1 column into a column named RSV
# (presumably the RSV PCR indicator -- confirm against the data dictionary).
hospitalized['RSV'] = hospitalized['pcr_result___1']

# Keep only the rows whose RSV value equals 1.
is_rsv = hospitalized['RSV'] == 1
RSV_subset = hospitalized.loc[is_rsv]

# Sum of the death column over the RSV rows.
RSV_subset['death'].sum()
7
# Mean of the icu column over the RSV rows (a proportion if icu is a 0/1
# flag -- confirm the coding).
RSV_subset['icu'].mean()
0.08110065170166546
# RSV rows whose death value equals 1.
died = RSV_subset['death'] == 1
deaths = RSV_subset.loc[died]
Dictionary to hold samples
# Build a sample of 31 RSV case IDs for each virus year. Every RSV death
# is included first; the remaining slots are filled by random draws from
# three groups: ventilator/ICU (10%), oxygen without ventilator (40%), and
# neither oxygen nor ventilator (the rest).
# NOTE: DataFrame.sample is called without random_state, so the IDs drawn
# differ from run to run.
random_sample = {2011: [], 2012: [], 2013: []}

# Seed each year's list with the deaths from that virus year.
for case_id, death_row in deaths.iterrows():
    random_sample[death_row.virus_year].append(case_id)

for year in random_sample:
    year_subset = RSV_subset[RSV_subset.virus_year == year]
    n_required = 31 - len(random_sample[year])

    # Determine number in each group. int() truncates, so the "none" group
    # absorbs the remainder and the three counts always sum to n_required.
    n_oxygen = int(n_required * 0.4)
    n_vent_icu = int(n_required * 0.1)
    n_none = n_required - n_oxygen - n_vent_icu

    strata = (
        # Mechanical vent or ICU patients
        ((year_subset.vent == 1) | (year_subset.icu == 1), n_vent_icu),
        # Oxygen patients
        ((year_subset.oxygen == 1) & (year_subset.vent == 0), n_oxygen),
        # No oxygen or ventilator
        ((year_subset.oxygen == 0) & (year_subset.vent == 0), n_none),
    )

    for in_stratum, n_draw in strata:
        candidates = year_subset[in_stratum]
        # The deaths, and ICU patients who also match a later filter, would
        # otherwise be eligible twice; remove anyone already selected so a
        # case ID never appears more than once in a year's sample.
        already_selected = candidates.index.isin(random_sample[year])
        candidates = candidates[~already_selected]
        random_sample[year] += candidates.sample(n=n_draw).index.tolist()
Here is the sample from each study year
# Display the dict mapping each virus year to its list of sampled case IDs.
random_sample
{2011: ['B1191', 'D3040', 'C2104', 'B1146', 'B1105', 'A0077', 'A0137', 'D3097', 'D3080', 'A0024', 'C2015', 'A0155', 'A0109', 'C2001', 'C2138', 'A0104', 'C2086', 'D3128', 'A0182', 'B1160', 'D3109', 'A0121', 'D3142', 'C2022', 'D3089', 'B1134', 'B1030', 'B1172', 'B1165', 'A0125', 'B1193'], 2012: ['A0368', 'B1515', 'D3416', 'A0366', 'D3374', 'A0456', 'C2382', 'A0391', 'C2383', 'C2363', 'D3292', 'C2448', 'D3436', 'C2357', 'D3430', 'D3354', 'C2353', 'B1396', 'D3346', 'B1375', 'C2402', 'A0371', 'A0430', 'B1522', 'C2464', 'B1451', 'D3345', 'B1380', 'C2459', 'D3449', 'C2462'], 2013: ['A0718', 'B1594', 'D3731', 'D3539', 'A0614', 'B1831', 'C2637', 'B1559', 'D3633', 'D3762', 'C2673', 'D3677', 'B1724', 'A0753', 'B1752', 'A0675', 'A0720', 'B1554', 'B1568', 'A0515', 'A0683', 'B1767', 'A0748', 'D3714', 'D3679', 'D3629', 'C2779', 'D3667', 'B1772', 'C2708', 'A0530']}