library(tidyverse)
library(magrittr)
library(betareg)
library(broom)
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ── ✔ ggplot2 3.1.0 ✔ purrr 0.2.5 ✔ tibble 2.0.1 ✔ dplyr 0.7.8 ✔ tidyr 0.8.2 ✔ stringr 1.3.1 ✔ readr 1.3.1 ✔ forcats 0.3.0 Warning message: “package ‘tibble’ was built under R version 3.5.2”── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── ✖ dplyr::filter() masks stats::filter() ✖ dplyr::lag() masks stats::lag() Attaching package: ‘magrittr’ The following object is masked from ‘package:purrr’: set_names The following object is masked from ‘package:tidyr’: extract
load('../data/artsengagement.Rda')
load('../data/open_response_data/labeled_topic_prevs.Rda')
source('../scripts/select_helpers.R')
ls()
df %>% select(contains('sr_identity')) %>% names
identity <- df %>% select('sr_identity_musician','sr_identity_visart','sr_identity_dancer','sr_identity_cwriter',
'sr_identity_film','sr_identity_gdesigner','sr_identity_architect','sr_identity_actor',
'sr_identity_designer','sr_identity_studyarts', 'sr_identity_makemusic','sr_identity_makevisart',
'sr_identity_dance','sr_identity_cwrite','sr_identity_makefilms','sr_identity_perform')
identity_agg <- c()
for(i in seq(nrow(identity))) {
artist <- NA
for(j in seq(identity)) {
if (!is.na(identity[i,j]) & identity[i,j] == "Yes") {
artist <- 1
break
}
if (!is.na(identity[i,j]) & identity[i,j] == "No") {
artist <- 2
}
}
identity_agg %<>% c(artist)
}
df$sr_identity_artist <- factor(identity_agg, labels=c("Yes","No"))
motiv_demos <- df %>% select(contains('sr_motivation')) %>% names
identity_demos <- df %>% select(contains('sr_identity')) %>% names
participation_demos <- df %>% select(all_arts('participation')) %>% select(starts_with('sr')) %>% names
real_demos <- c('ethnic_group', 'sex', 'school', 'parented', 'income', 'hstype', 'hssize',
'hslocation', 'hs_arts_freq', 'hs_encouragement', 'hs_required', 'hs_fees',
'so_childhood1', 'so_childhood3', 'artsincollege', 'so_childhood5', 'sr_participated',
'sr_highestdegreeplanned')
demo_groups <- c('real_demos','motiv_demos','identity_demos','participation_demos')
demo_groups <- list(real_demos, motiv_demos, identity_demos, participation_demos)
thetas[[9]] %>% select(contains('Role')) %>% head
Played a Role | Didn't Play a Role |
---|---|
0.07166238 | 0.00000000 |
0.07834615 | 0.00000000 |
0.04303464 | 0.00000000 |
0.00000000 | 0.07846073 |
0.12374583 | 0.00000000 |
0.07635667 | 0.00000000 |
zeroval <- 0.0001
for(i in seq(nrow(thetas[[9]]))) {
if (thetas[[9]][i,2] == 0) {
thetas[[9]][i,2] = zeroval
thetas[[9]][i,11] = thetas[[9]][i,11] - zeroval
}
if (thetas[[9]][i,11] == 0) {
thetas[[9]][i,11] = zeroval
thetas[[9]][i,2] = thetas[[9]][i,2] - zeroval
}
}
thetas[[9]] %>% select(contains('Role')) %>% head
Played a Role | Didn't Play a Role |
---|---|
0.07156238 | 0.00010000 |
0.07824615 | 0.00010000 |
0.04293464 | 0.00010000 |
0.00010000 | 0.07836073 |
0.12364583 | 0.00010000 |
0.07625667 | 0.00010000 |
thetas %>% names
sig_contrasts.demos <- c()
for(k in c(4,8)) {
for(d in seq(demo_groups)) {
question_name <- names(thetas)[k]
merged <- merge(thetas[[question_name]], select(df, key, demo_groups[[d]]), by = 'key')
merged <- merged[-1]
for(l in seq(demo_groups[[d]])) {
## NA the lonely eggs
merged[[demo_groups[[d]][l]]][merged[[demo_groups[[d]][l]]] %in%
(merged[[demo_groups[[d]][l]]] %>% table %>% tidy %>% filter(n < 25) %>% pull(1))] <- NA
merged[[demo_groups[[d]][l]]] %<>% droplevels
if (nlevels(merged[[demo_groups[[d]][l]]]) < 2)
merged %<>% select(-matches(demo_groups[[d]][l]))
}
# now we only have good demos, and all topics
topics <- merged[1:(ncol(thetas[[question_name]])-1)]
demo <- merged[ncol(thetas[[question_name]]):ncol(merged)]
for(i in seq(topics)) {
temp <- bind_cols(topics[i], demo)
names(temp) <- c('topic', names(demo))
beta.out <- betareg(topic ~ ., temp)
df.residual <- beta.out$df.residual
df.null <- beta.out$df.null
r.squared <- beta.out$pseudo.r.squared
options(warn=-1)
beta.out %<>% tidy %>% filter(component == 'mean') %>%
select(-component) %>% filter(term != '(Intercept)')
options(warn=0)
beta.out$p.value %<>% p.adjust(method='holm')
beta.out %<>% filter(p.value < 0.05) %>% mutate(topic = names(topics[i]),
question = question_name,
residual_df = df.residual,
null_df = df.null,
r_sq = r.squared)
if(nrow(beta.out) > 0) {
options(warn=-1)
sig_contrasts.demos %<>% bind_rows(beta.out)
options(warn=0)
}
}
}
}
split_list <- gregexpr("[A-Z]|[0-9]", sig_contrasts.demos$term)
demo_list <- c()
level_list <- c()
for(i in seq(sig_contrasts.demos$term)) {
demo_list %<>% c(substr(sig_contrasts.demos$term[i], 1, split_list[[i]] - 1))
level_list %<>% c(substr(sig_contrasts.demos$term[i], split_list[[i]],
nchar(sig_contrasts.demos$term[i])))
}
sig_contrasts.demos$demo <- demo_list
sig_contrasts.demos$level <- level_list
sig_contrasts.demos$term <- NULL
empty_list <- sig_contrasts.demos %>% filter(demo == '')
split_list <- gregexpr("[0-9]", empty_list$level)
demo_list <- c()
level_list <- c()
for(i in seq(empty_list$level)) {
demo_list %<>% c(substr(empty_list$level[i], 1, split_list[[i]] - 1))
level_list %<>% c(substr(empty_list$level[i], split_list[[i]],
nchar(empty_list$level[i])))
}
empty_list$demo <- demo_list
empty_list$level <- level_list
sig_contrasts.demos %<>% bind_rows(empty_list)
sig_contrasts.demos %<>% filter(demo != '')
sig_contrasts.demos %>% nrow
sig_contrasts.demos
estimate | std.error | statistic | p.value | topic | question | residual_df | null_df | r_sq | demo | level |
---|---|---|---|---|---|---|---|---|---|---|
0.9933077 | 0.1917906 | 5.179128 | 6.687779e-06 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | ethnic_group | Asian |
-0.8221978 | 0.1416449 | -5.804642 | 2.064112e-07 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | sex | Female |
-0.9405522 | 0.2549704 | -3.688869 | 6.081847e-03 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | school | Ross School of Business |
-1.7024398 | 0.4044585 | -4.209183 | 7.432575e-04 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | hssize | More than 3001 |
1.3042886 | 0.3311390 | 3.938795 | 2.292976e-03 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | hslocation | Rural |
-0.7694124 | 0.1454879 | -5.288497 | 3.823086e-06 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | hs_arts_freq | Frequently |
0.5763670 | 0.1837866 | 3.136067 | 4.451987e-02 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | hs_encouragement | Yes |
0.4439039 | 0.1422351 | 3.120915 | 4.507247e-02 | More Confident & Social | behavior2 | 30 | 62 | 0.4515716 | hs_fees | Yes |
-1.0383347 | 0.2781618 | -3.732845 | 5.869201e-03 | Deeper Understanding of Art & The World | behavior2 | 30 | 62 | 0.4455261 | school | Ross School of Business |
0.9149535 | 0.1912546 | 4.783955 | 5.500143e-05 | Deeper Understanding of Art & The World | behavior2 | 30 | 62 | 0.4455261 | sr_highestdegreeplanned | Master's degree |
1.0567834 | 0.3313881 | 3.188960 | 3.712429e-02 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | ethnic_group | Hispanic |
0.5073364 | 0.1539249 | 3.295999 | 2.647950e-02 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | school | Engineering |
0.7546784 | 0.2089958 | 3.610975 | 8.846397e-03 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | parented | Master's |
-0.7369082 | 0.1770650 | -4.161795 | 9.472684e-04 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | income | More than $150,000 |
0.8007149 | 0.2399230 | 3.337383 | 2.367999e-02 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | hslocation | Rural |
-0.4789299 | 0.1084851 | -4.414705 | 3.135581e-04 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | hs_arts_freq | Frequently |
0.5912219 | 0.1022762 | 5.780640 | 2.381338e-07 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | hs_fees | Yes |
0.3782069 | 0.1189064 | 3.180710 | 3.712429e-02 | More Critical Thinking | behavior2 | 30 | 62 | 0.7115845 | sr_participated | Yes |
0.8000761 | 0.1800304 | 4.444117 | 2.735863e-04 | Attending More Events | behavior2 | 30 | 62 | 0.5050804 | ethnic_group | Asian |
-0.8180881 | 0.2122767 | -3.853877 | 3.487865e-03 | Attending More Events | behavior2 | 30 | 62 | 0.5050804 | school | Engineering |
0.7678724 | 0.2258567 | 3.399820 | 1.955474e-02 | Attending More Events | behavior2 | 30 | 62 | 0.5050804 | hstype | Private religious |
1.0608601 | 0.3134339 | 3.384638 | 1.995623e-02 | Attending More Events | behavior2 | 30 | 62 | 0.5050804 | hslocation | Suburban |
1.6145819 | 0.3151517 | 5.123189 | 9.613154e-06 | Attending More Events | behavior2 | 30 | 62 | 0.5050804 | hslocation | Rural |
0.4632224 | 0.1431172 | 3.236665 | 3.265247e-02 | Attending More Events | behavior2 | 30 | 62 | 0.5050804 | sr_highestdegreeplanned | Doctoral or professional degree |
1.7307381 | 0.3704740 | 4.671686 | 9.559614e-05 | More Aware | behavior2 | 30 | 62 | 0.6253664 | school | Stamps School of Art & Design |
0.4328560 | 0.1228578 | 3.523227 | 1.236346e-02 | More Appreciative of Arts & Others | behavior2 | 30 | 62 | 0.4716291 | hs_required | Yes |
-0.5402647 | 0.1194513 | -4.522888 | 1.952046e-04 | More Appreciative of Arts & Others | behavior2 | 30 | 62 | 0.4716291 | hs_fees | Yes |
0.4366903 | 0.1168151 | 3.738305 | 5.557954e-03 | More Appreciative of Arts & Others | behavior2 | 30 | 62 | 0.4716291 | so_childhood | 5No |
0.5604396 | 0.1429007 | 3.921881 | 2.723667e-03 | More Appreciative of Arts & Others | behavior2 | 30 | 62 | 0.4716291 | sr_highestdegreeplanned | Master's degree |
0.5379562 | 0.1595643 | 3.371406 | 2.393134e-02 | New Outlooks | behavior2 | 30 | 62 | 0.4571121 | sex | Female |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
1.3705931 | 0.22732657 | 6.029181 | 5.273350e-08 | Misc. Music | behavior2 | 30 | 62 | 0.48368240 | hs_arts_freq | Frequently |
-0.8999722 | 0.27161150 | -3.313454 | 2.764538e-02 | Misc. Music | behavior2 | 30 | 62 | 0.48368240 | hs_encouragement | Yes |
-0.9287445 | 0.26310237 | -3.529974 | 1.288362e-02 | Misc. Music | behavior2 | 30 | 62 | 0.48368240 | sr_highestdegreeplanned | Master's degree |
-1.3221420 | 0.29429467 | -4.492579 | 1.759146e-04 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | parented | Associate's |
-1.3102288 | 0.21656955 | -6.049922 | 4.347490e-08 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | parented | Bachelor's |
-1.4650365 | 0.22374275 | -6.547861 | 1.809374e-09 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | parented | Master's |
-1.9673279 | 0.29688241 | -6.626623 | 1.099049e-09 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | parented | Ph.D or professional degree |
0.7907064 | 0.23937033 | 3.303277 | 2.006803e-02 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | income$ | 100,001-$150,000 |
-1.0172833 | 0.20464279 | -4.971019 | 1.864851e-05 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | hstype | Private religious |
-0.9580613 | 0.20460643 | -4.682459 | 7.369798e-05 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | hssize | 1001-2000 |
0.4528630 | 0.14280622 | 3.171171 | 3.036512e-02 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | hs_encouragement | Yes |
-0.3739433 | 0.11043067 | -3.386227 | 1.558938e-02 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | hs_fees | Yes |
-1.1691721 | 0.24299331 | -4.811540 | 4.043836e-05 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | so_childhood | 1No |
0.9124845 | 0.15481523 | 5.894023 | 1.093025e-07 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | so_childhood | 3No |
0.5186899 | 0.14394134 | 3.603481 | 7.221616e-03 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | sr_participated | Yes |
-0.4386914 | 0.11843215 | -3.704158 | 5.090266e-03 | Seeing From More Perspectives | behavior2 | 30 | 62 | 0.63488174 | sr_highestdegreeplanned | Doctoral or professional degree |
-0.2666217 | 0.07734493 | -3.447177 | 1.359545e-02 | Seeing From More Perspectives | behavior2 | 393 | 417 | 0.09086713 | sr_motivation_problemsolving | Yes |
-0.3099045 | 0.09722342 | -3.187549 | 3.730584e-02 | More Aware | behavior2 | 363 | 389 | 0.10735024 | sr_identity_dancer | No |
1.0828003 | 0.34424499 | 3.145435 | 4.809359e-02 | Career Tradeoff Concerns | sr_career | 42 | 71 | 0.35591151 | parented | Associate's |
0.5198800 | 0.13823582 | 3.760820 | 4.911361e-03 | Art as a Career | sr_career | 42 | 71 | 0.37205498 | hs_arts_freq | Frequently |
0.4578252 | 0.12406376 | 3.690241 | 6.273164e-03 | Art as a Career | sr_career | 42 | 71 | 0.37205498 | hs_required | Yes |
1.7238058 | 0.33406398 | 5.160107 | 7.157458e-06 | Want Art to be Part of Future | sr_career | 42 | 71 | 0.34033663 | parented | Associate's |
-0.4778890 | 0.14091776 | -3.391262 | 1.948007e-02 | Want Art to be Part of Future | sr_career | 42 | 71 | 0.34033663 | hs_encouragement | Yes |
0.6918821 | 0.19375677 | 3.570880 | 1.031774e-02 | Some Impact | sr_career | 42 | 71 | 0.38985069 | hssize | 500-1000 |
0.9332654 | 0.26505612 | 3.521011 | 1.246726e-02 | Looking for Creative Expression in Non-Artistic Career | sr_career | 42 | 71 | 0.37730674 | ethnic_group | Hispanic |
-0.7635134 | 0.23356800 | -3.268913 | 3.022927e-02 | Looking for Creative Expression in Non-Artistic Career | sr_career | 42 | 71 | 0.37730674 | hslocation | Rural |
-0.3671697 | 0.11887656 | -3.088663 | 4.825417e-02 | No Impact 1 | sr_career | 481 | 505 | 0.25552354 | sr_motivation_creativeabilities | Yes |
-0.5530415 | 0.16434058 | -3.365216 | 1.988580e-02 | Art as a Career | sr_career | 466 | 492 | 0.13824483 | sr_identity_film | No |
-0.2704459 | 0.07336906 | -3.686104 | 5.920557e-03 | Want Art to be Part of Future | sr_career | 466 | 492 | 0.16213946 | sr_identity_studyarts | No |
0.2144737 | 0.07453378 | 2.877537 | 4.408728e-02 | Art as a Career | sr_career | 485 | 496 | 0.06310533 | sr_film_participation | Yes |
ungrouped_demos <- c(demo_groups[[1]], demo_groups[[2]], 'sr_identity_artist', demo_groups[[4]])
na_medians <- c()
na_means <- c()
for(k in seq(thetas)) {
question_name <- names(thetas)[k]
merged <- merge(thetas[[question_name]], select(df, key, ungrouped_demos), by = 'key')
merged <- merged[-1] #remove key
for(l in seq(ungrouped_demos)) {
## NA the lonely eggs
merged[[ungrouped_demos[l]]][merged[[ungrouped_demos[l]]] %in%
(merged[[ungrouped_demos[l]]] %>% table %>% tidy %>% filter(n < 25) %>% pull(1))] <- NA
merged[[ungrouped_demos[l]]] %<>% droplevels
if (nlevels(merged[[ungrouped_demos[l]]]) < 2)
merged %<>% select(-matches(ungrouped_demos[l]))
}
# now we only have good demos, and all topics
topics <- merged[1:(ncol(thetas[[question_name]])-1)]
demo <- merged[ncol(thetas[[question_name]]):ncol(merged)]
na_medians %<>% c(demo %>% sapply(function(x)(sum(is.na(x)))/nrow(demo)) %>% as.vector %>% median)
na_means %<>% c(demo %>% sapply(function(x)(sum(is.na(x)))/nrow(demo)) %>% as.vector %>% mean)
}
data.frame(na_means, na_medians, seq(12))
na_means | na_medians | seq.12. |
---|---|---|
0.46329111 | 0.57259528 | 1 |
0.56369913 | 0.72809335 | 2 |
0.57852594 | 0.73376335 | 3 |
0.28515532 | 0.32419355 | 4 |
0.41644596 | 0.52490315 | 5 |
0.27844881 | 0.31980116 | 6 |
0.03875380 | 0.04395604 | 7 |
0.06760413 | 0.04339623 | 8 |
0.06907355 | 0.03201970 | 9 |
0.07678826 | 0.04016478 | 10 |
0.06147959 | 0.01428571 | 11 |
0.08089980 | 0.04153355 | 12 |
sig_contrasts.demos_no_grp <- c()
betaregs <- c()
# bad_input <- c(4,8,Inf)
idx = 1
for(k in seq(thetas)) {
# if (k == bad_input[idx]) {
# idx = idx + 1
# next
# }
question_name <- names(thetas)[k]
merged <- merge(thetas[[question_name]], select(df, key, ungrouped_demos), by = 'key')
merged <- merged[-1] #remove key
for(l in seq(ungrouped_demos)) {
## NA the lonely eggs
merged[[ungrouped_demos[l]]][merged[[ungrouped_demos[l]]] %in%
(merged[[ungrouped_demos[l]]] %>% table %>% tidy %>% filter(n < 25) %>% pull(1))] <- NA
merged[[ungrouped_demos[l]]] %<>% droplevels
if (nlevels(merged[[ungrouped_demos[l]]]) < 2)
merged %<>% select(-matches(ungrouped_demos[l]))
}
# now we only have good demos, and all topics
topics <- merged[1:(ncol(thetas[[question_name]])-1)]
demo <- merged[ncol(thetas[[question_name]]):ncol(merged)]
for(i in seq(topics)) {
temp <- bind_cols(topics[i], demo)
names(temp) <- c('topic', names(demo))
beta.out <- betareg(topic ~ ., temp)#, na.action = na.exclude)
df.residual <- beta.out$df.residual
df.null <- beta.out$df.null
r.squared <- beta.out$pseudo.r.squared
options(warn=-1)
beta.out %<>% tidy %>% filter(component == 'mean') %>%
select(-component) %>% filter(term != '(Intercept)')
options(warn=0)
beta.out$p.value %<>% p.adjust(method='holm')
beta.out %<>% filter(p.value < 0.05) %>% mutate(topic = names(topics[i]),
question = question_name,
residual_df = df.residual,
null_df = df.null,
r_sq = r.squared)
if(nrow(beta.out) > 0) {
options(warn=-1)
sig_contrasts.demos_no_grp %<>% bind_rows(beta.out)
options(warn=0)
}
}
}
Warning message in betareg.fit(X, Y, Z, weights, offset, link, link.phi, type, control): “no valid starting value for precision parameter found, using 1 instead”
Error in optim(par = start, fn = loglikfun, gr = gradfun, method = method, : non-finite value supplied by optim Traceback: 1. betareg(topic ~ ., temp) 2. betareg.fit(X, Y, Z, weights, offset, link, link.phi, type, control) 3. optim(par = start, fn = loglikfun, gr = gradfun, method = method, . hessian = hessian, control = control)
split_list <- gregexpr("[A-Z]|[0-9]", sig_contrasts.demos_no_grp$term)
demo_list <- c()
level_list <- c()
for(i in seq(sig_contrasts.demos_no_grp$term)) {
demo_list %<>% c(substr(sig_contrasts.demos_no_grp$term[i], 1, split_list[[i]] - 1))
level_list %<>% c(substr(sig_contrasts.demos_no_grp$term[i], split_list[[i]],
nchar(sig_contrasts.demos_no_grp$term[i])))
}
sig_contrasts.demos_no_grp$demo <- demo_list
sig_contrasts.demos_no_grp$level <- level_list
sig_contrasts.demos_no_grp$term <- NULL
empty_list <- sig_contrasts.demos_no_grp %>% filter(demo == '')
split_list <- gregexpr("[0-9]", empty_list$level)
demo_list <- c()
level_list <- c()
for(i in seq(empty_list$level)) {
demo_list %<>% c(substr(empty_list$level[i], 1, split_list[[i]] - 1))
level_list %<>% c(substr(empty_list$level[i], split_list[[i]],
nchar(empty_list$level[i])))
}
empty_list$demo <- demo_list
empty_list$level <- level_list
sig_contrasts.demos_no_grp %<>% bind_rows(empty_list)
sig_contrasts.demos_no_grp %<>% filter(demo != '')
sig_contrasts.demos_no_grp %>% nrow
sig_contrasts.demos_no_grp$question %>% table
. barriers change childhood2 definition 11 9 30 6 feel sr_aftercollege4 sr_development sr_othergrowth 30 24 31 19 sr_role sr_society 69 65
qns <- sig_contrasts.demos_no_grp %>% pull(question) %>% unique
qns
dir <- 'topic_demo_csvs/'
dir.create(file.path(dir))
for(i in seq(qns)) {
write.csv(x = sig_contrasts.demos_no_grp %>% filter(question == qns[i]) %>% arrange(demo),
file = paste0(dir, qns[i], '_demographics.csv'))
}