#!/usr/bin/env python
# coding: utf-8

# Notebook export: loads three NCS teaching CSV files, joins child weight
# measurements to child and mother records, and fits a linear model of visit
# weight with PyMC3. The `# In[n]:` markers are the original notebook cells.
# NOTE(review): `get_ipython()` only exists when this runs under IPython /
# Jupyter; as a plain `python` script the first statement raises NameError.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='ticks')


# In[2]:

DATA_PATH = '../data/NCS/'

# The literal string 'M' is parsed as a missing value in every file.
# The child and mother tables use their first column as the index; the
# child-health table keeps the default integer index.
teaching_child = pd.read_csv(DATA_PATH + 'ncs_teaching_child_v1_1.csv', index_col=0, na_values=['M'])
teaching_childhealth = pd.read_csv(DATA_PATH + 'ncs_teaching_childhealth_v1.csv', na_values=['M'])
teaching_mompreghealth = pd.read_csv(DATA_PATH + 'ncs_teaching_mompreghealth_v1.csv', index_col=0, na_values=['M'])


# In[3]:

# Trailing semicolon suppresses the notebook's echo of the returned Axes.
teaching_childhealth.CHILD_AGE.hist(bins=30);


# In[4]:

# Keep only the visits with CHILD_AGE < 10
# (presumably months, going by the variable name -- TODO confirm units).
wt_under_10m = teaching_childhealth.loc[teaching_childhealth.CHILD_AGE<10, ['CHILD_PIDX', 'VISIT_WT']]

# Join child attributes (matched on the child table's index) to the weight
# rows via CHILD_PIDX, then join mother attributes (matched on the mother
# table's index) via MOM_PIDX. Both merges use pandas' default inner join,
# so rows without a match on either side are dropped.
child_data = teaching_child[['MOM_PIDX', 'CHILD_SEX', 'GESTATIONAL_AGE']].merge(wt_under_10m, left_index=True, right_on='CHILD_PIDX')
data_merged = teaching_mompreghealth.merge(child_data, left_index=True, right_on='MOM_PIDX')
data_merged.head()


# In[5]:

# Fraction of missing values per column.
data_merged.isnull().mean()


# I will try to fit the following model, which contains both child and mother attributes:
#
#     visit_weight ~ child_sex + gest_age + mom_bmi + mom_health

# In[6]:

# Complete cases only: any row missing one of the model variables is dropped.
analysis_subset = data_merged[['VISIT_WT', 'CHILD_SEX', 'GESTATIONAL_AGE', 'BMI', 'HEALTH']].dropna()

# 0/1 indicator that is 1 where CHILD_SEX == 1.
analysis_subset['MALE'] = (analysis_subset.CHILD_SEX==1).astype(int)
# BMI centred on the mean of the complete-case subset.
analysis_subset['dBMI'] = analysis_subset.BMI - analysis_subset.BMI.mean()
# 0/1 indicator that is 1 where GESTATIONAL_AGE < 4.
# NOTE(review): presumably GESTATIONAL_AGE is a coded category -- confirm the
# coding. PRETERM is not referenced by the model formula below.
analysis_subset['PRETERM'] = (analysis_subset.GESTATIONAL_AGE<4).astype(int)
analysis_subset.head()


# In[7]:

analysis_subset.VISIT_WT.hist();


# In[8]:

import pymc3 as pm

GLM = pm.glm.GLM

model_formula = 'VISIT_WT ~ MALE + GESTATIONAL_AGE + dBMI + HEALTH'

with pm.Model() as weight_model:

    # Build the linear model from the formula inside the model context.
    lm = GLM.from_formula(model_formula, data=analysis_subset)

    # 1000 draws after 2000 tuning iterations.
    # NOTE(review): `njobs` was renamed `cores` in later PyMC3 releases --
    # confirm against the installed version before changing it.
    samples = pm.sample(1000, tune=2000, njobs=2)


# In[10]:

pm.forestplot(samples, varnames=['MALE', 'GESTATIONAL_AGE', 'dBMI', 'HEALTH']);


# In[12]:

pm.summary(samples).round(2)