require(quanteda) load("data/DTM.2.RData") set.seed(1) id_train <- sample(1:ndoc(DTM.2), ndoc(DTM.2) %*% .8, replace = FALSE) training_dfm <- DTM.2[id_train] # training set test_dfm <- DTM.2[!docvars(DTM.2)$X %in% id_train] # test set table(docvars(training_dfm, "rating")) nb <- textmodel_nb(training_dfm, docvars(training_dfm, "rating")) summary(nb) test_dfm <- dfm_select(test_dfm, training_dfm) table(docvars(test_dfm, "rating")) actual_class <- docvars(test_dfm, "rating") predicted_class <- predict(nb, test_dfm) table(actual_class, predicted_class) require(topicmodels) dtm <- convert(DTM.2, to = "topicmodels") lda <- LDA(dtm, k = 10, method="Gibbs", control=list(iter = 100, verbose = 20, alpha = 0.2, estimate.beta = TRUE)) terms(lda, 10) example_ids <- c(1, 2, 3) require("reshape2") require("ggplot2") lda_posterior <- posterior(lda) top5termsPerTopicProb <- lda::top.topic.words(lda_posterior$terms, 5, by.score = T) topicProportionExamples <- lda_posterior$topics[example_ids, ] colnames(topicProportionExamples) <- apply(top5termsPerTopicProb, 2, paste, collapse = " ") vizDataFrame <- melt(data = cbind(data.frame(topicProportionExamples), document = docvars(DTM.2[example_ids])$docname), variable.name = "topic", id.vars = "document") ggplot(data = vizDataFrame, aes(x = topic, y = value, fill = document), ylab = "proportion") + geom_bar(stat = "identity", position = "stack") + theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position="none") + coord_flip() + facet_wrap(~document, ncol = length(example_ids)) library(stm) DTM.stm <- convert(DTM.2, to = "stm") head(DTM.stm$meta) poliblogPrevFit <- stm(documents = DTM.stm$documents, vocab = DTM.stm$vocab, K = 10, prevalence =~ rating + s(day), max.em.its = 75, data = DTM.stm$meta) labelTopics(poliblogPrevFit) plot(poliblogPrevFit, type = "summary", xlim = c(0, .3)) prep <- estimateEffect(1:10 ~ rating+s(day), poliblogPrevFit, meta=DTM.stm$meta, uncertainty="Global") plot(prep, "day", method = "continuous", topics = 7, model = z, printlegend = FALSE, xaxt = "n", xlab = "Time (2008)") monthseq <- seq(from = as.Date("2008-01-01"), to = as.Date("2008-12-01"), by = "month") monthnames <- months(monthseq) axis(1, at=as.numeric(monthseq)-min(as.numeric(monthseq)), labels=monthnames)