%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='ticks')
DATA_PATH = '../data/NCS/'
teaching_child = pd.read_csv(DATA_PATH + 'ncs_teaching_child_v1_1.csv',
index_col=0, na_values=['M'])
teaching_childhealth = pd.read_csv(DATA_PATH + 'ncs_teaching_childhealth_v1.csv',
na_values=['M'])
teaching_mompreghealth = pd.read_csv(DATA_PATH + 'ncs_teaching_mompreghealth_v1.csv',
index_col=0, na_values=['M'])
teaching_childhealth.CHILD_AGE.hist(bins=30);
wt_under_10m = teaching_childhealth.loc[teaching_childhealth.CHILD_AGE<10, ['CHILD_PIDX', 'VISIT_WT']]
child_data = teaching_child[['MOM_PIDX', 'CHILD_SEX', 'GESTATIONAL_AGE']].merge(wt_under_10m, left_index=True, right_on='CHILD_PIDX')
data_merged = teaching_mompreghealth.merge(child_data, left_index=True, right_on='MOM_PIDX')
data_merged.head()
HEALTH | BMI | BMI_CAT | THYROID | HIGHBP_NOTPREG | ASTHMA | DIABETES | HIGHBP_PREG | PREECLAMPSIA | EARLY_LABOR | ... | RH_DISEASE | URINE | VAGINOSIS | GROUP_B | CIG_NOW | MOM_PIDX | CHILD_SEX | GESTATIONAL_AGE | CHILD_PIDX | VISIT_WT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9589 | 1.0 | 22.0 | 2.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | b00014490 | 2.0 | 4.0 | a65385688 | 16.0 |
10207 | 1.0 | 22.9 | 2.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | b00028364 | 1.0 | 4.0 | a69997363 | NaN |
4943 | 2.0 | 24.8 | 2.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | b00048093 | 2.0 | 4.0 | a33612802 | 17.0 |
4604 | 2.0 | 37.8 | 5.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | b00060642 | 1.0 | 4.0 | a31316804 | 16.0 |
13800 | 1.0 | 28.0 | 3.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | b00096696 | 2.0 | 4.0 | a95202738 | 14.0 |
5 rows × 23 columns
data_merged.isnull().mean()
HEALTH 0.040450 BMI 0.185016 BMI_CAT 0.185016 THYROID 0.000000 HIGHBP_NOTPREG 0.000000 ASTHMA 0.000000 DIABETES 0.000000 HIGHBP_PREG 0.000000 PREECLAMPSIA 0.000000 EARLY_LABOR 0.000000 ANEMIA 0.000000 KIDNEY 0.000000 NAUSEA 0.000000 RH_DISEASE 0.000000 URINE 0.000000 VAGINOSIS 0.000000 GROUP_B 0.000000 CIG_NOW 0.000000 MOM_PIDX 0.000000 CHILD_SEX 0.000000 GESTATIONAL_AGE 0.008442 CHILD_PIDX 0.000000 VISIT_WT 0.220190 dtype: float64
I will try to fit the following model, which contains both child and mother attributes:
visit_weight ~ child_sex + gest_age + mom_bmi + mom_health
analysis_subset = data_merged[['VISIT_WT', 'CHILD_SEX', 'GESTATIONAL_AGE', 'BMI', 'HEALTH']].dropna()
analysis_subset['MALE'] = (analysis_subset.CHILD_SEX==1).astype(int)
analysis_subset['dBMI'] = analysis_subset.BMI - analysis_subset.BMI.mean()
analysis_subset['PRETERM'] = (analysis_subset.GESTATIONAL_AGE<4).astype(int)
analysis_subset.head()
VISIT_WT | CHILD_SEX | GESTATIONAL_AGE | BMI | HEALTH | MALE | dBMI | PRETERM | |
---|---|---|---|---|---|---|---|---|
9589 | 16.0 | 2.0 | 4.0 | 22.0 | 1.0 | 0 | -4.247242 | 0 |
4943 | 17.0 | 2.0 | 4.0 | 24.8 | 2.0 | 0 | -1.447242 | 0 |
4604 | 16.0 | 1.0 | 4.0 | 37.8 | 2.0 | 1 | 11.552758 | 0 |
13800 | 14.0 | 2.0 | 4.0 | 28.0 | 1.0 | 0 | 1.752758 | 0 |
11099 | 16.0 | 2.0 | 4.0 | 24.2 | 1.0 | 0 | -2.047242 | 0 |
analysis_subset.VISIT_WT.hist();
import pymc3 as pm
GLM = pm.glm.GLM
model_formula = 'VISIT_WT ~ MALE + GESTATIONAL_AGE + dBMI + HEALTH'
with pm.Model() as weight_model:
lm = GLM.from_formula(model_formula, data=analysis_subset)
samples = pm.sample(1000, tune=2000, njobs=2)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (2 chains in 2 jobs) NUTS: [sd, HEALTH, dBMI, GESTATIONAL_AGE, MALE, Intercept] Sampling 2 chains: 100%|██████████| 6000/6000 [00:40<00:00, 149.17draws/s]
pm.forestplot(samples, varnames=['MALE', 'GESTATIONAL_AGE', 'dBMI', 'HEALTH']);
pm.summary(samples).round(2)
mean | sd | mc_error | hpd_2.5 | hpd_97.5 | n_eff | Rhat | |
---|---|---|---|---|---|---|---|
Intercept | 11.84 | 0.54 | 0.02 | 10.78 | 12.88 | 1015.20 | 1.0 |
MALE | 1.51 | 0.11 | 0.00 | 1.31 | 1.75 | 1732.75 | 1.0 |
GESTATIONAL_AGE | 0.78 | 0.13 | 0.00 | 0.53 | 1.02 | 1049.85 | 1.0 |
dBMI | 0.01 | 0.01 | 0.00 | -0.01 | 0.03 | 1545.33 | 1.0 |
HEALTH | 0.13 | 0.11 | 0.00 | -0.08 | 0.34 | 1574.31 | 1.0 |
sd | 2.34 | 0.04 | 0.00 | 2.27 | 2.42 | 2055.47 | 1.0 |