suppressWarnings(library(stm))
suppressWarnings(library(stringr))

DF <- read.table("totaljobsCoordRates.txt")
DF <- DF[!is.na(DF$Description), ] # stm doesn't work with missing data
DF <- DF[!is.na(DF$rateby), ]

DF$Description <- str_replace_all(DF$Description, "/", " ") # substitute "/" with a space
# clean up links and residual html encoding (adapted from https://juliasilge.com/blog/evaluating-stm/)
DF$Description <- str_replace_all(DF$Description, "&#39;|&quot;|&#x2F;", "'")
DF$Description <- str_replace_all(DF$Description, "&nbsp;", " ")
DF$Description <- str_replace_all(DF$Description, "&gt;|&lt;|&amp;", " ")
DF$Description <- str_replace_all(DF$Description, "&#[:digit:]+;", " ")
DF$Description <- str_remove_all(DF$Description, "<[^>]*>") # strip any remaining html tags

# textProcessor prepares the corpus, representing the documents as lists containing
# word indices and associated word counts, the vocab character vector associated with
# the word indices, and a metadata matrix containing the covariates. Custom stopwords
# can be removed at this stage; here we opt to remove "work" and "will" from the corpus
processed <- textProcessor(DF$Description, metadata = DF,
                           customstopwords = c("work", "will"), verbose = FALSE)

# prepDocuments is a helpful function to perform manipulations like removing words
# based on frequency thresholds without compromising the indexes
# (plotRemoved() can be used to see how many words and documents a given lower.thresh would drop)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, verbose = FALSE)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

# identify documents where the word "heartland" has been used: each document is a
# two-row matrix whose first row holds vocab indices, so we match on that row
wntword <- which(processed$vocab == "heartland")
filterdocs <- lapply(processed$documents, function(ch) which(ch[1, ] == wntword))
indexList <- filterdocs[sapply(filterdocs, function(x) length(x) > 0)]
DF[as.numeric(names(indexList)), 2:3]

# searchK fits models at several values of K and reports diagnostics for each;
# set the seed in a separate call rather than passing set.seed() as an argument
set.seed(9999)
storage1 <- searchK(docs, vocab, K = c(5, 10, 15, 20, 50),
                    prevalence = ~rateby, data = meta, verbose = FALSE)
print(storage1$results)
options(repr.plot.width=6, repr.plot.height=6)
plot(storage1)

model10Prrateby <- stm(documents = out$documents, vocab = out$vocab,
                       prevalence = ~rateby, K = 10, data = out$meta,
                       init.type = "Spectral", verbose = FALSE)
model15Prrateby <- stm(documents = out$documents, vocab = out$vocab,
                       prevalence = ~rateby, K = 15, data = out$meta,
                       init.type = "Spectral", verbose = FALSE)
model20Prrateby <- stm(documents = out$documents, vocab = out$vocab,
                       prevalence = ~rateby, K = 20, data = out$meta,
                       init.type = "Spectral", verbose = FALSE)

suppressWarnings(library(ggplot2))
suppressWarnings(library(htmlwidgets))

# exclusivity and semantic coherence per topic for each candidate model
M10ExSem <- as.data.frame(cbind(c(1:10), exclusivity(model10Prrateby),
                                semanticCoherence(model = model10Prrateby, docs), "Mod10"))
M15ExSem <- as.data.frame(cbind(c(1:15), exclusivity(model15Prrateby),
                                semanticCoherence(model = model15Prrateby, docs), "Mod15"))
M20ExSem <- as.data.frame(cbind(c(1:20), exclusivity(model20Prrateby),
                                semanticCoherence(model = model20Prrateby, docs), "Mod20"))
ModsExSem <- rbind(M10ExSem, M15ExSem, M20ExSem)
colnames(ModsExSem) <- c("K", "Exclusivity", "SemanticCoherence", "Model")
# cbind() coerced everything to character, so convert the numeric columns back
ModsExSem$Exclusivity <- as.numeric(as.character(ModsExSem$Exclusivity))
ModsExSem$SemanticCoherence <- as.numeric(as.character(ModsExSem$SemanticCoherence))

options(repr.plot.width=7, repr.plot.height=6, repr.plot.res=100)
plotexcoer <- ggplot(ModsExSem, aes(SemanticCoherence, Exclusivity, color = Model)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_text(aes(label = K), nudge_y = .04) +
  labs(x = "Semantic coherence", y = "Exclusivity",
       title = "Comparing exclusivity and semantic coherence")
plotexcoer

# per-document topic proportions joined with the metadata
topicprop15 <- make.dt(model15Prrateby, meta)
topicprop15[1:3, c(1:18, 27)] # visualize proportions, job title and rateby
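# Since make.dt() returns one row per document with Topic1..Topic15 proportion columns
# followed by the metadata, each advert can also be tagged with its dominant topic.
# A minimal sketch: TopTopic is a column name introduced here, not part of the
# original analysis, and we assume the topic columns are named Topic1..Topic15
# as make.dt() produces them
tp <- as.data.frame(topicprop15)
topiccols <- grep("^Topic", names(tp), value = TRUE)
tp$TopTopic <- topiccols[apply(tp[topiccols], 1, which.max)] # highest-proportion topic per row
head(tp[, c("TopTopic", "rateby")]) # dominant topic next to the rateby covariate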
options(repr.plot.width=7, repr.plot.height=7)
plot.STM(model15Prrateby, "hist")

suppressWarnings(library(tidytext)) # the package sometimes does not load correctly; if this happens, you might have to restart the kernel
td_theta <- tidytext::tidy(model15Prrateby, matrix = "theta")
# select the first 15 documents; be careful to choose a sensible interval,
# as attempting to plot a very large number of documents might crash the kernel
selectiontdthteta <- td_theta[td_theta$document %in% c(1:15), ]
thetaplot1 <- ggplot(selectiontdthteta, aes(y = gamma, x = as.factor(topic), fill = as.factor(topic))) +
  geom_bar(stat = "identity", alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ document, ncol = 3) +
  labs(title = "Theta values per document", y = expression(theta), x = "Topic")
options(repr.plot.width=8, repr.plot.height=7, repr.plot.res=100)
thetaplot1

suppressWarnings(library(dplyr))
suppressWarnings(library(drlib)) # drlib is available on github and needs devtools to be installed

td_beta <- tidytext::tidy(model15Prrateby)
options(repr.plot.width=7, repr.plot.height=8, repr.plot.res=100)
td_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(x = NULL, y = expression(beta),
       title = "Highest word probabilities for each topic",
       subtitle = "Different words are associated with different topics")

# beta values for topic 1
betaT1 <- td_beta %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  filter(topic == "Topic 1")
# plot word probabilities higher than 0.003 for topic 1
betaplotT1 <- ggplot(betaT1[betaT1$beta > 0.003, ], aes(term, beta, fill = as.factor(topic))) +
  geom_bar(alpha = 0.8, show.legend = FALSE, stat = "identity") +
  coord_flip() +
  labs(x = "Terms", y = expression(beta), title = "Word probabilities for Topic 1")
options(repr.plot.width=9, repr.plot.height=10, repr.plot.res=100)
betaplotT1

options(repr.plot.width=7, repr.plot.height=7, repr.plot.res=100)
plot.STM(model15Prrateby, "summary", n = 5) # distribution and top 5 words per topic
labelTopics(model15Prrateby, topics = c(5, 6, 9), n = 10) # complete list of top 10 words for topics 5, 6 and 9
par(mfrow=c(1,1), mar=c(0,2,0,0))
options(repr.plot.width=5, repr.plot.height=3, repr.plot.res=100)
plot.STM(model15Prrateby, "labels", topics = c(5, 6, 9), label = "frex", n = 10, width = 55) # top 10 FREX words for topics 5, 6 and 9

# take the 3 most representative documents for topics 5, 6 and 9
thoughts5 <- findThoughts(model15Prrateby, texts = DF$Description, topics = 5, n = 3)$docs[[1]]
thoughts6 <- findThoughts(model15Prrateby, texts = DF$Description, topics = 6, n = 3)$docs[[1]]
thoughts9 <- findThoughts(model15Prrateby, texts = DF$Description, topics = 9, n = 3)$docs[[1]]
options(repr.plot.width=10, repr.plot.height=12, repr.plot.res=100)
par(mfrow=c(1,4), mar=c(0,0,2,2))
plotQuote(thoughts5, width = 30, maxwidth = 500, text.cex = 1.25, main = "Topic 5")
plotQuote(thoughts6, width = 30, maxwidth = 500, text.cex = 1.25, main = "Topic 6")
plotQuote(thoughts9, width = 30, maxwidth = 500, text.cex = 1.25, main = "Topic 9")

options(repr.plot.width=7, repr.plot.height=6, repr.plot.res=100)
ggplot(ModsExSem[ModsExSem$Model == "Mod15", ], aes(SemanticCoherence, Exclusivity, color = Model)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_text(aes(label = K), nudge_x = .05, nudge_y = .05) +
  labs(x = "Semantic coherence", y = "Exclusivity",
       title = "Comparing exclusivity and semantic coherence for the model with 15 topics")
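# The same comparison can be read off numerically: rank the 15 topics from least
# to most coherent using the ModsExSem frame built above (a quick companion to the plot)
m15 <- ModsExSem[ModsExSem$Model == "Mod15", ]
m15[order(m15$SemanticCoherence), c("K", "SemanticCoherence", "Exclusivity")]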
coherence", y = "Exclusivity", title = "Comparing exclusivity and semantic coherence for model with 15 topics") options(repr.plot.width=7, repr.plot.height=7, repr.plot.res=100) par(mfrow=c(2,2), mar=c(0,0,2,2)) plot.STM(model15Prrateby, "perspectives", topics=c(5,9)) plot.STM(model15Prrateby, "perspectives", topics=c(5,6)) plot.STM(model15Prrateby, "perspectives", topics=c(6,9)) out$meta$rateby<-as.factor(out$meta$rateby) prep<-estimateEffect(1:15~ rateby, model15Prrateby, metadata=out$meta, uncertainty="Global")#nsim is defaulted to 25, but on a small model a higher number lead to more consistent results summary(prep, topics=c(1:3), nsim=1000)# summary of regression on topic 1-3 options(repr.plot.width=7, repr.plot.height=7, repr.plot.res=100) plot.estimateEffect(prep, model=model15Prrateby, covariate="rateby", topics=3, method="pointestimate", xlim=c(-.5,1)) ggplot(td_theta, aes(gamma, fill = as.factor(topic))) + geom_histogram(alpha = 0.8, show.legend = FALSE) + facet_wrap(~ topic, ncol = 3) + labs(title = "Distribution of document probabilities for each topic", y = "Number of documents", x = expression(theta))