#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Notebook setup: enable inline matplotlib rendering, import the analysis
# libraries, and choose the seaborn plotting style.
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's 'ticks' style for the figures drawn after this point.
sns.set(style='ticks')


# In[2]:


# Relative directory that holds the NCS teaching CSV files.
DATA_PATH = '../data/NCS/'

# Full paths of the three tables used in this notebook.
child_csv = DATA_PATH + 'ncs_teaching_child_v1_1.csv'
childhealth_csv = DATA_PATH + 'ncs_teaching_childhealth_v1.csv'
mompreghealth_csv = DATA_PATH + 'ncs_teaching_mompreghealth_v1.csv'

# In every file the string 'M' is read as a missing value (NaN).
# The child and mother tables take their first column as the index; the
# child-health table keeps pandas' default integer index.
teaching_child = pd.read_csv(child_csv, na_values=['M'], index_col=0)
teaching_childhealth = pd.read_csv(childhealth_csv, na_values=['M'])
teaching_mompreghealth = pd.read_csv(mompreghealth_csv, na_values=['M'], index_col=0)


# In[3]:


# Histogram of the CHILD_AGE column of the child-health table, using 30 bins.
# The trailing semicolon suppresses the Axes repr in the cell output.
teaching_childhealth['CHILD_AGE'].hist(bins=30);


# In[4]:


# Keep only child-health rows with CHILD_AGE below 10 (presumably months, going
# by the variable name -- confirm against the codebook), retaining the child
# identifier and the visit weight.
is_under_10 = teaching_childhealth['CHILD_AGE'] < 10
wt_under_10m = teaching_childhealth.loc[is_under_10, ['CHILD_PIDX', 'VISIT_WT']]

# Attach child attributes: the index of teaching_child is matched against the
# CHILD_PIDX column of the weight rows.
child_attrs = teaching_child[['MOM_PIDX', 'CHILD_SEX', 'GESTATIONAL_AGE']]
child_data = pd.merge(child_attrs, wt_under_10m,
                      left_index=True, right_on='CHILD_PIDX')

# Attach the mother's pregnancy-health columns: the index of
# teaching_mompreghealth is matched against the MOM_PIDX column.
data_merged = pd.merge(teaching_mompreghealth, child_data,
                       left_index=True, right_on='MOM_PIDX')
data_merged.head()


# In[5]:


# Fraction of missing values in each column of the merged table.
pd.isnull(data_merged).mean()


# I will try to fit the following model, which contains both child and mother attributes:
# 
#     visit_weight ~ child_sex + gest_age + mom_bmi + mom_health

# In[6]:


# Columns required by the model. Rows missing any of them are dropped, so the
# model is fit on complete cases only.
MODEL_COLUMNS = ['VISIT_WT', 'CHILD_SEX', 'GESTATIONAL_AGE', 'BMI', 'HEALTH']

# The explicit .copy() makes analysis_subset an independent frame. Without it,
# the column assignments below can raise SettingWithCopyWarning on pandas
# versions that flag the result of dropna() as a possible copy.
analysis_subset = data_merged[MODEL_COLUMNS].dropna().copy()

# 1 where CHILD_SEX == 1, otherwise 0.
# NOTE(review): assumes code 1 means male -- confirm against the codebook.
analysis_subset['MALE'] = (analysis_subset['CHILD_SEX'] == 1).astype(int)

# Maternal BMI centred on the mean of the rows kept for analysis.
analysis_subset['dBMI'] = analysis_subset['BMI'] - analysis_subset['BMI'].mean()

# 1 where GESTATIONAL_AGE is below the cutoff, otherwise 0.
# NOTE(review): GESTATIONAL_AGE looks like a coded category and 4 is presumed
# to be the first non-preterm code -- confirm against the codebook. PRETERM is
# not referenced by the model formula visible in this file.
PRETERM_CODE_CUTOFF = 4
analysis_subset['PRETERM'] = (
    analysis_subset['GESTATIONAL_AGE'] < PRETERM_CODE_CUTOFF
).astype(int)

analysis_subset.head()


# In[7]:


# Histogram of the response variable (VISIT_WT) over the rows kept for analysis.
# The trailing semicolon suppresses the Axes repr in the cell output.
analysis_subset['VISIT_WT'].hist();


# In[8]:


import pymc3 as pm
GLM = pm.glm.GLM

# Fixed seed so that a fresh "Restart & Run All" reproduces the same draws.
RANDOM_SEED = 42

# Patsy-style formula: visit weight regressed on the child's sex indicator and
# gestational age, and the mother's centred BMI and HEALTH value.
model_formula = 'VISIT_WT ~ MALE + GESTATIONAL_AGE + dBMI + HEALTH'

with pm.Model() as weight_model:

    # Adds the regression terms described by the formula to weight_model,
    # using GLM's default settings (no priors or family are passed here).
    lm = GLM.from_formula(model_formula, data=analysis_subset)

    # 1000 retained draws after 2000 tuning steps, sampled on 2 cores.
    # `cores` replaces the deprecated `njobs` keyword of pm.sample.
    samples = pm.sample(1000, tune=2000, cores=2, random_seed=RANDOM_SEED)


# In[10]:


# Forest plot of the sampled regression coefficients (the intercept is not
# included in the list).
coefficient_names = ['MALE', 'GESTATIONAL_AGE', 'dBMI', 'HEALTH']
pm.forestplot(samples, varnames=coefficient_names);


# In[12]:


# Summary table of the sampled trace, rounded to two decimals for display.
posterior_summary = pm.summary(samples)
posterior_summary.round(2)