# expect non-root permission error when installing R package in "default location" install.packages("mice",repos="http://cran.r-project.org") # non-root permission workaround to install R package in "custom location" system('cd ~/work/') system('mkdir myr') install.packages("mice",lib="~/work/myr/",repos="http://cran.r-project.org") install.packages("mitools",lib="~/work/myr/",repos="http://cran.r-project.org") install.packages("survey",lib="~/work/myr/",repos="http://cran.r-project.org") # survey packages library(mice, lib.loc="~/work/myr/") library(mitools, lib.loc="~/work/myr/") library(survey, lib.loc="~/work/myr/") # data wrangling packages (already installed) library(RCurl) library(car) library(dplyr) sessionInfo() ######################################## # get brfss data example adapted from corey_sparks ######################################## # library(RCurl) # library(dplyr) # library(car) url = "https://raw.githubusercontent.com/coreysparks/data/master/brfss_11.csv" brfss_11 = getURL(url) brfss_11 = read.csv(textConnection(brfss_11), header=T, dec=",") nams = names(brfss_11) newnames = gsub("_", "", nams) names(brfss_11) = tolower(newnames) brfss_11$statefip = sprintf("%02d", brfss_11$state ) brfss_11$cofip = sprintf("%03d", brfss_11$cnty ) brfss_11$cofips = paste(brfss_11$statefip, brfss_11$cofip, sep="") brfss_11$weight = as.numeric(as.character(brfss_11$cntywt)) #just for brevity, I just select TX respondents with non missing weights brfss_11 = brfss_11[is.na(brfss_11$cntywt)==F,] dim(brfss_11) ################################################ # wrangle 'measurement' variables ################################################ # ?car::recode # smoking currently brfss_11$smoke = car::recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA") # low physical activity brfss_11$lowact = car::recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA") # high cholesterol brfss_11$hc = car::recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA") # marital status brfss_11$marst = car::recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T) brfss_11$marst = relevel(brfss_11$marst, ref='married') # income brfss_11$inc = as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg)) # age z-standardized brfss_11$agez = scale(brfss_11$age, center=T, scale=T) brfss_11 = brfss_11[complete.cases(brfss_11[, c("agez")]),] # names(brfss_11) # parec # what is cryptically named "parec" ? # exercise: look up in codebook summary(brfss_11$parec) # the 'measurement' variables to use names_measure_var = c("agez",'hc','marst','lowact','smoke','inc') brfss_11[,names_measure_var] %>% summary brfss_11[,names_measure_var] %>% head ################################################ # wrangle 'design' variables ################################################ brfss_11$cntywt = as.numeric(as.character(brfss_11$cntywt)) brfss_11$sampleid = paste(brfss_11$state, brfss_11$ststr) # within each sampling unit, sum the weights wts = tapply(brfss_11$cntywt,brfss_11$sampleid,sum) # make a data frame from this wts = data.frame(id=names(unlist(wts)), wt=unlist(wts)) # get the unique sampling location ids' t1 = as.data.frame(table(brfss_11$sampleid)) # put all of this into a data set wts2 = data.frame(ids=wts$id, sumwt=wts$wt, jn=t1$Freq) # merge all of this back to the original data file brfss_11 = merge(brfss_11, wts2, by.x="sampleid", by.y="ids", all.x=T) rm(wts2) # In the new data set, multiply the original weight by the fraction of the # sampling unit total population each person represents brfss_11$swts = brfss_11$cntywt*(brfss_11$jn/brfss_11$sumwt) hist(brfss_11$swts) # head(brfss_11) # names(brfss_11) # the 'design' variables to use names_design_var = c("ststr","swts") brfss_11[,names_design_var] %>% head brfss_11[,names_design_var] %>% summary ################################# # gather 'measurement' and 'design' variables ################################# dat_imp_ex = brfss_11[,c(names_design_var,names_measure_var)] dat_imp_ex %>% head dat_imp_ex %>% summary ## optional: export original BRFSS data from R to stata # library(haven) # names(brfss_11) %>% gsub(pattern='\\.',replacement='_') %>% sort # # names(brfss_11) = gsub(x=names(brfss_11),pattern='\\.',replacement='_') # write_dta(brfss_11,path='~/projects/survey_complex/data/proc/brfss_11.DTA') # # read_dta('~/projects/survey_complex/data/proc/brfss_11.DTA') # # write_dta(dat_imp_ex,path='~/projects/survey_complex/data/proc/dat_imp_ex.DTA') # # read_dta('~/projects/survey_complex/data/proc/dat_imp_ex.DTA') # library(survey) ######################################## # set survey design # non-imputation example ######################################## dim(brfss_11) dim(dat_imp_ex) options(survey.lonely.psu = "adjust") # ?svydesign des_non_impute = svydesign(ids=~1, strata=~ststr, weights=~swts, data=brfss_11) attributes(des_non_impute) ######################## # descriptives ######################## # head(des_non_impute) # str(des_non_impute) # summary(des_non_impute$variables) # ?svymean message('Note: age does not have missing cases') svymean(~age, design = des_non_impute) message('Note: smoke has missing cases, hence NA svymean() result') svymean(~as.factor(smoke), design = des_non_impute) message('Note: supply svymean(...,na.rm=TRUE) option to svymean() for case deletion') svymean(~as.factor(smoke), design = des_non_impute,na.rm=TRUE) message('Note: supply svytotal(...,na.rm=TRUE) option to svytotal() for case deletion') ######################## # regression ######################## # codebook says the cryptically named 'parec' variable is # parec: Aerobic and Strengthening Guideline fit_non_impute_age = svyglm(design=des_non_impute, formula=agez ~ as.factor(parec), family='gaussian') summary(fit_non_impute_age) # note: default omitted NAs fit_non_impute_smoke = svyglm(design=des_non_impute, formula=smoke ~ agez + inc + lowact, family=quasibinomial()) summary(fit_non_impute_smoke) ######################## # subpopulation (subdomain) analysis ######################## # library(survey) methods(subset) # ?subset # ?subset.survey.design # names(des_non_impute$variables) %>% sort message('mean of age on whole sample') svymean(~age, design = des_non_impute) message('mean of age on Age 50 and older sample') des_sub1 = survey:::subset.survey.design(age>=50,x=des_non_impute) svymean(~age,design=des_sub1) message('mean of age on Age under 50 sample') des_sub2 = survey:::subset.survey.design(age<50,x=des_non_impute) svymean(~age,design=des_sub2) ######################################## # impute ######################################## # library(dplyr) # gather 'survey design variables' and 'measure variables' # names(brfss_11) dim(brfss_11) # summary(brfss_11) dim(dat_imp_ex) summary(dat_imp_ex) # note: later, we will even use vars in 'names_design_var' to impute message('design variables') names_design_var message('measurement variables') names_measure_var message('dataset for imputation') names(dat_imp_ex) # use mice to impute and pool # library(mice) ?mice ######################################## # use 'full' PredictorMatrix (default) # eg X_j|X_{-j} for all j # if 'design' variables part of j, then they are used to impute ######################################## imp_pred_mat_all <- mice(dat_imp_ex,n.imp=5,seed=1234) summary(imp_pred_mat_all) # 'imp' on lhs will be what's used in design+analysis going forward imp = imp_pred_mat_all # note: interpret 1s/0s of 'PredictorMatrix' ######################################## # exclude variables in column 1 and 2 ######################################## names(dat_imp_ex) # columns 1 and 2 were 'design' variables predictorMatrix_cust = (1 - diag(1, ncol(dat_imp_ex))) predictorMatrix_cust[,1] = 0 predictorMatrix_cust[,2] = 0 predictorMatrix_cust imp_pred_mat_custom <- mice(dat_imp_ex,n.imp=5,seed=1234, predictorMatrix=predictorMatrix_cust) summary(imp_pred_mat_custom) # this is just a demo on how to toggle custom predictorMatrix # note: 'imp_pred_mat_custom' is not used going forward ######################################## # middle man processeing step # mice::complete + imputationList() # 'survey' needs list structure ######################################## # ?mice::complete # to concatenate 'observed' and 'imputed' items # ?mitools::imputationList # to structure as list, where list item is a 'completed' dataset # library(mitools) data_imputed <- mitools::imputationList(lapply(1:5, FUN=mice::complete, x=imp)) str(data_imputed) message('the list of imputations') summary(data_imputed) ######################################## # use library(survey) # 1) respect the survey design # 2) later: conduct survey analysis ######################################## # library(survey) # respect survey design # ouptut of imputationList() 'data_imputed' as data argument # svydesign(data=data_imputed,.) # previously 'data_no_imputation' was data argument when no imputation used # svydesign(data=data_no_imputation,.) # ?survey::svydesign options(survey.lonely.psu = "adjust") # force completed data thru asserted design des_imputed <- svydesign(data=data_imputed, ids=~1, strata=~ststr, weights=~swts, nest=TRUE) message('the list of imputations propogated into the survey design') class(des_imputed) summary(des_imputed) ######################## # descriptives ######################## summary(des_imputed) message('proportion of smoke after imputation') MIcombine(with(des_imputed, svymean(~as.factor(smoke)))) message('total counts of smoke after imputation') MIcombine(with(des_imputed, svytotal(~as.factor(smoke)))) message('compare with non-imputed design ') message('proportion of smoke on non-imputed') svymean(~as.factor(smoke),design=des_non_impute) message('proportion of smoke on case-deleted') svymean(~as.factor(smoke),design=des_non_impute,na.rm=TRUE) message('total counts of smoke on case-deleted') svytotal(~as.factor(smoke),design=des_non_impute,na.rm=TRUE) ######################################## # smoke as outcome in logistic regression # run model with svyglm using 5 imputed data sets contained in design # NOTE: regression estimates / inference are 'design-based' ######################################## fit_imputed_smoke <- with(des_imputed, svyglm(smoke ~ agez + inc + lowact, family=quasibinomial)) message('glm results from 1st imputed dataset') fit_imputed_smoke[[1]] message('glm results from 5th imputed dataset') fit_imputed_smoke[[5]] message('glm results from combining all imputed datasets') summary(MIcombine(fit_imputed_smoke)) message('compare glm results with non-imputed design') message('note: default glm() behavior omits rows with NAs (case-deletion)') fit_non_impute_smoke = svyglm(design=des_non_impute, formula=smoke ~ agez + inc + lowact, family=quasibinomial()) summary(fit_non_impute_smoke)