# expect non-root permission error when installing R package in "default location" 

install.packages("mice",repos="http://cran.r-project.org")

# non-root permission workaround to install R package in "custom location"

system('cd ~/work/')
system('mkdir myr')

install.packages("mice",lib="~/work/myr/",repos="http://cran.r-project.org")
install.packages("mitools",lib="~/work/myr/",repos="http://cran.r-project.org")
install.packages("survey",lib="~/work/myr/",repos="http://cran.r-project.org")


# survey packages
library(mice, lib.loc="~/work/myr/")
library(mitools, lib.loc="~/work/myr/")
library(survey, lib.loc="~/work/myr/")

# data wrangling packages (already installed)
library(RCurl)
library(car)
library(dplyr)

sessionInfo()

########################################
# get brfss data example adapted from corey_sparks
########################################

# library(RCurl)
# library(dplyr)
# library(car)

url = "https://raw.githubusercontent.com/coreysparks/data/master/brfss_11.csv"
brfss_11 = getURL(url)
brfss_11 = read.csv(textConnection(brfss_11), header=T, dec=",")
nams = names(brfss_11)
newnames = gsub("_", "", nams)
names(brfss_11) = tolower(newnames)

brfss_11$statefip = sprintf("%02d", brfss_11$state )
brfss_11$cofip = sprintf("%03d", brfss_11$cnty )
brfss_11$cofips = paste(brfss_11$statefip, brfss_11$cofip, sep="")


brfss_11$weight = as.numeric(as.character(brfss_11$cntywt))
#just for brevity, I just select TX respondents with non missing weights
brfss_11 = brfss_11[is.na(brfss_11$cntywt)==F,]

dim(brfss_11)


################################################
# wrangle 'measurement' variables
################################################

# ?car::recode

# smoking currently
brfss_11$smoke = car::recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")

# low physical activity
brfss_11$lowact = car::recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")

# high cholesterol
brfss_11$hc = car::recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")

# marital status
brfss_11$marst = car::recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst = relevel(brfss_11$marst, ref='married')

# income
brfss_11$inc = as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg))

# age z-standardized
brfss_11$agez = scale(brfss_11$age, center=T, scale=T)

brfss_11 = brfss_11[complete.cases(brfss_11[, c("agez")]),]
# names(brfss_11)

# parec
# what is cryptically named "parec" ? 
# exercise: look up in codebook
summary(brfss_11$parec)


# the 'measurement' variables to use
names_measure_var = c("agez",'hc','marst','lowact','smoke','inc')

brfss_11[,names_measure_var] %>% summary
brfss_11[,names_measure_var] %>% head


################################################
# wrangle 'design' variables
################################################

brfss_11$cntywt = as.numeric(as.character(brfss_11$cntywt))
brfss_11$sampleid = paste(brfss_11$state, brfss_11$ststr)

# within each sampling unit, sum the weights
wts = tapply(brfss_11$cntywt,brfss_11$sampleid,sum)

# make a data frame from this
wts = data.frame(id=names(unlist(wts)), wt=unlist(wts))

# get the unique sampling location ids'
t1 = as.data.frame(table(brfss_11$sampleid))

# put all of this into a data set
wts2 = data.frame(ids=wts$id, sumwt=wts$wt, jn=t1$Freq)

# merge all of this back to the original data file
brfss_11 = merge(brfss_11, wts2, by.x="sampleid", by.y="ids", all.x=T)
rm(wts2)

# In the new data set, multiply the original weight by the fraction of the
# sampling unit total population each person represents
brfss_11$swts = brfss_11$cntywt*(brfss_11$jn/brfss_11$sumwt)

hist(brfss_11$swts)
# head(brfss_11)

# names(brfss_11)

# the 'design' variables to use
names_design_var = c("ststr","swts")
brfss_11[,names_design_var] %>% head
brfss_11[,names_design_var] %>% summary


#################################
# gather 'measurement' and 'design' variables
#################################
dat_imp_ex = brfss_11[,c(names_design_var,names_measure_var)]

dat_imp_ex %>% head
dat_imp_ex %>% summary


## optional: export original BRFSS data from R to stata

# library(haven)
# names(brfss_11) %>% gsub(pattern='\\.',replacement='_') %>% sort
# 
# names(brfss_11) = gsub(x=names(brfss_11),pattern='\\.',replacement='_')
# write_dta(brfss_11,path='~/projects/survey_complex/data/proc/brfss_11.DTA')
# # read_dta('~/projects/survey_complex/data/proc/brfss_11.DTA')
# 
# write_dta(dat_imp_ex,path='~/projects/survey_complex/data/proc/dat_imp_ex.DTA')
# # read_dta('~/projects/survey_complex/data/proc/dat_imp_ex.DTA')

# library(survey)

########################################
# set survey design
# non-imputation example
########################################

dim(brfss_11)
dim(dat_imp_ex)

options(survey.lonely.psu = "adjust")

# ?svydesign

des_non_impute = svydesign(ids=~1, 
                           strata=~ststr, 
                           weights=~swts, 
                           data=brfss_11)

attributes(des_non_impute)

########################
# descriptives
########################

# head(des_non_impute)
# str(des_non_impute)

# summary(des_non_impute$variables)

# ?svymean
message('Note: age does not have missing cases')
svymean(~age, design = des_non_impute)

message('Note: smoke has missing cases, hence NA svymean() result')
svymean(~as.factor(smoke), design = des_non_impute)

message('Note: supply svymean(...,na.rm=TRUE) option to svymean() for case deletion')
svymean(~as.factor(smoke), design = des_non_impute,na.rm=TRUE)


message('Note: supply svytotal(...,na.rm=TRUE) option to svytotal() for case deletion')


########################
# regression
########################

# codebook says the cryptically named 'parec' variable is
# parec: Aerobic and Strengthening Guideline

fit_non_impute_age = svyglm(design=des_non_impute,
                        formula=agez ~ as.factor(parec),
                        family='gaussian')
summary(fit_non_impute_age)

# note: default omitted NAs
fit_non_impute_smoke = svyglm(design=des_non_impute,
                        formula=smoke ~ agez + inc + lowact,
                        family=quasibinomial())

summary(fit_non_impute_smoke)


########################
# subpopulation (subdomain) analysis
########################

# library(survey)
methods(subset)
# ?subset

# ?subset.survey.design
# names(des_non_impute$variables) %>% sort

message('mean of age on whole sample')
svymean(~age, design = des_non_impute)

message('mean of age on Age 50 and older sample')
des_sub1 = survey:::subset.survey.design(age>=50,x=des_non_impute)
svymean(~age,design=des_sub1)

message('mean of age on Age under 50 sample')
des_sub2 = survey:::subset.survey.design(age<50,x=des_non_impute)
svymean(~age,design=des_sub2)

########################################
# impute
########################################
# library(dplyr)

# gather 'survey design variables' and 'measure variables'


# names(brfss_11)
dim(brfss_11)
# summary(brfss_11)


dim(dat_imp_ex)
summary(dat_imp_ex)


# note: later, we will even use vars in 'names_design_var' to impute
message('design variables')
names_design_var

message('measurement variables')
names_measure_var

message('dataset for imputation')
names(dat_imp_ex)


# use mice to impute and pool
# library(mice)
?mice


########################################
# use 'full' PredictorMatrix (default)
# eg X_j|X_{-j} for all j
# if 'design' variables part of j, then they are used to impute
########################################

imp_pred_mat_all <- mice(dat_imp_ex,n.imp=5,seed=1234)
summary(imp_pred_mat_all)

# 'imp' on lhs will be what's used in design+analysis going forward
imp = imp_pred_mat_all

# note: interpret 1s/0s of 'PredictorMatrix'

########################################
# exclude variables in column 1 and 2
########################################

names(dat_imp_ex)
# columns 1 and 2 were 'design' variables
predictorMatrix_cust = (1 - diag(1, ncol(dat_imp_ex)))
predictorMatrix_cust[,1] = 0
predictorMatrix_cust[,2] = 0

predictorMatrix_cust

imp_pred_mat_custom <- mice(dat_imp_ex,n.imp=5,seed=1234,
                            predictorMatrix=predictorMatrix_cust)
summary(imp_pred_mat_custom)

# this is just a demo on how to toggle custom predictorMatrix
# note: 'imp_pred_mat_custom' is not used going forward


########################################
# middle man processeing step
# mice::complete + imputationList()
# 'survey' needs list structure
########################################
# ?mice::complete  # to concatenate 'observed' and 'imputed' items
# ?mitools::imputationList  # to structure as list, where list item is a 'completed' dataset

# library(mitools)

data_imputed <- mitools::imputationList(lapply(1:5, FUN=mice::complete, x=imp))
str(data_imputed)

message('the list of imputations')
summary(data_imputed)

########################################
# use library(survey)
# 1) respect the survey design
# 2) later: conduct survey analysis
########################################
# library(survey)

# respect survey design
# ouptut of imputationList() 'data_imputed' as data argument
# svydesign(data=data_imputed,.)  
# previously 'data_no_imputation' was data argument when no imputation used
# svydesign(data=data_no_imputation,.)

# ?survey::svydesign

options(survey.lonely.psu = "adjust")

# force completed data thru asserted design
des_imputed <- svydesign(data=data_imputed,
                         ids=~1, strata=~ststr, weights=~swts, 
                         nest=TRUE)

message('the list of imputations propogated into the survey design')

class(des_imputed)
summary(des_imputed)


########################
# descriptives
########################
summary(des_imputed)

message('proportion of smoke after imputation')
MIcombine(with(des_imputed, svymean(~as.factor(smoke))))

message('total counts of smoke after imputation')
MIcombine(with(des_imputed, svytotal(~as.factor(smoke))))

message('compare with non-imputed design ')

message('proportion of smoke on non-imputed')
svymean(~as.factor(smoke),design=des_non_impute)

message('proportion of smoke on case-deleted')
svymean(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)

message('total counts of smoke on case-deleted')
svytotal(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)


########################################
# smoke as outcome in logistic regression
# run model with svyglm using 5 imputed data sets contained in design
# NOTE: regression estimates / inference are 'design-based' 
########################################

fit_imputed_smoke <- with(des_imputed, svyglm(smoke ~ agez + inc + lowact,
                                        family=quasibinomial))

message('glm results from 1st imputed dataset')
fit_imputed_smoke[[1]]

message('glm results from 5th imputed dataset')
fit_imputed_smoke[[5]]

message('glm results from combining all imputed datasets')
summary(MIcombine(fit_imputed_smoke))


message('compare glm results with non-imputed design')
message('note: default glm() behavior omits rows with NAs (case-deletion)')

fit_non_impute_smoke = svyglm(design=des_non_impute,
                              formula=smoke ~ agez + inc + lowact,
                              family=quasibinomial())

summary(fit_non_impute_smoke)