options(stringsAsFactors = FALSE)
require(topicmodels)
require(lda)
require(data.table)
require(quanteda)
require(ggplot2)
require(magrittr)
require(dplyr)
# textdata <- read.csv("data/data job posts.csv", header = TRUE, sep = ",", encoding = "UTF-8",quote = "\"")
# textdata <- as.data.table(textdata)
# textdata %<>% mutate(d_id = 1:nrow(textdata))
# Load the posterior and data from tutorial I
load("DTM.RData")
load("tmResult.RData")
load("topicModel.RData")
load("corpus.RData")
load("textdata.RData")
K <- topicModel@k # store the model's number of topics K as a global parameter
# From the posterior get theta (p(z|d)) and phi (p(w|z))
theta <- tmResult$topics
phi <- tmResult$terms
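# Quick sanity check (a minimal sketch, assuming tmResult was produced by
# topicmodels::posterior() as in tutorial I): both theta and phi are
# probability distributions, so their rows should each sum to 1.
stopifnot(all(abs(rowSums(theta) - 1) < 1e-8))
stopifnot(all(abs(rowSums(phi) - 1) < 1e-8))
dim(theta) # number of documents x K
dim(phi)   # K x vocabulary size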
# visualize topics as word cloud
library(wordcloud)
# generate the topic names again so we can filter for a certain topic
# lda::top.topic.words extracts the most likely (highest-scoring) terms for each topic
top5termsPerTopicProb <- lda::top.topic.words(phi, 5, by.score = T)
topicNames <- apply(top5termsPerTopicProb, 2, paste, collapse = " ")
topicToViz <- 11 # change for your own topic of interest
topicToViz <- grep('engineer', topicNames)[1] # Or select a topic by a term contained in its name
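# Note: if no topic label contains the search term, grep() returns integer(0)
# and topicToViz becomes NA. A small guard (sketch) to fall back to the manual choice:
if (is.na(topicToViz)) topicToViz <- 11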
# select the 40 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top40terms <- sort(phi[topicToViz, ], decreasing = TRUE)[1:40]
words <- names(top40terms)
# the corresponding probabilities are the sorted values themselves (no need to sort twice)
probabilities <- top40terms
# visualize the terms as word-cloud
wordcloud(names(probabilities), probabilities, scale = c(3, .9), colors = brewer.pal(8, "Dark2"), random.order = F)
library(IRdisplay)
exampleIds <- c(150, 10000, 15000)
display_html(paste0("",gsub("\n", "
", textdata$jobpost[exampleIds[1]])))
display_html(paste0("
",gsub("\n", "
", textdata$jobpost[exampleIds[2]])))
display_html(paste0("
",gsub("\n", "
", textdata$jobpost[exampleIds[3]])))
# load libraries for visualization
require("reshape2")
require("ggplot2")
N <- length(exampleIds)
# get topic proportions from example documents
topicProportionExamples <- theta[as.character(textdata$d_id[exampleIds]), ]
colnames(topicProportionExamples) <- topicNames
vizDataFrame <- melt(data = cbind(data.frame(topicProportionExamples),
                                  document = stringr::str_sub(textdata$Company[exampleIds], 1, 10)),
                     variable.name = "topic",
                     id.vars = "document")
ggplot(data = vizDataFrame, aes(x = topic, y = value, fill = document)) +
  ylab("proportion") + # ylab must be added as a layer; as an argument to ggplot() it would be silently ignored
  geom_bar(stat = "identity", position = "stack") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() + facet_wrap(~ document, ncol = N)
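# A small companion check to the plot (a sketch using the objects defined
# above): report each example document's single most probable topic.
for (i in exampleIds) {
  docTheta <- theta[as.character(textdata$d_id[i]), ]
  cat(sprintf("Document %d: '%s' (%.1f %%)\n",
              i, topicNames[which.max(docTheta)], 100 * max(docTheta)))
}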
library(RColorBrewer)
getPalette <- colorRampPalette(brewer.pal(9, "Set1"))
# Again we use the topic distribution over documents to determine the share of each topic
# What are the most probable topics in the entire collection?
topicProportions <- colSums(theta) / sum(theta)
names(topicProportions) <- topicNames
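# Quick inspection: the five most prominent topics in the entire collection
sort(topicProportions, decreasing = TRUE)[1:5]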
# For the next examples we create a global sorting and coloring of the topics, which we can apply to all ggplot visualizations
# We start with sorting the topics by their probability
topicProportions <- topicProportions[order(topicProportions)]
# ordering in the ggplot library can be done using a factor for the topic labels
topicsOrd <- factor(
  names(topicProportions),          # take the topic names as values
  levels = names(topicProportions), # set them also as the levels of the nominal factor
  ordered = T)                      # the given order of the topic names is also the order of the factor
# next we randomly create some colors from the "rainbow"-palette of R
# colorScale <- sample(rainbow(length(topicsOrd)))
# Alternative: use a precompiled color palette from the RColorBrewer package
colorScale <- paste0(getPalette(length(topicsOrd)), "FF")
# Finally, a data frame is created associating the colors with the topic names
refOrdColors <- data.frame(topicsOrd, colorScale)
# ggplot2 only understands data.frame objects
# melt creates a data.frame from our matrix representing each cell as a row
topicProportions_df <- melt(topicProportions)
# add the factor we just created as a topic-name column to the data.frame
topicProportions_df$topicNamesFactor <- refOrdColors$topicsOrd
# Create a bar plot:
# Initialize the plot by mapping the values to the y-axis. The running order of the topics is given by the ordered topic factor used as fill; scale_fill_manual assigns our reference colors in the same order.
bp <- ggplot(topicProportions_df, aes(x = "", y = value, fill = topicNamesFactor)) +
  geom_bar(width = 1, stat = "identity") +
  scale_fill_manual(values = refOrdColors$colorScale)
require(scales)
# from the bar plot create a polar coordinate view, choose a minimal theme
pie <- bp + coord_polar("y", start = 0) + theme_minimal() +
theme(
axis.text.x = element_blank(),
axis.title.x = element_blank(), # make every graphical element blank except the pie
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_blank(),
legend.position = "left",
legend.key.width = unit(3, "mm"),
legend.key.height = unit(3, "mm"),
plot.title = element_text(size = 14, face = "bold")
) +
ggtitle("Topic distribution in corpus") +
geom_text(size = 3, aes(x = 1.7,
label = percent(value)),
position = position_stack(vjust = 0.5)) +
guides(fill = guide_legend(title = "Topic names", reverse = T))
print(pie)
ggplot(data = topicProportions_df,
       aes(x = topicNamesFactor, y = value, fill = topicNamesFactor)) + # map topics to the x-axis and to the fill color
geom_bar(stat = "identity", width = .5) + # define attributes for bars
scale_fill_manual(values = refOrdColors$colorScale) +
coord_flip() + # flip the plot to horizontal bars
guides(fill = "none") + # hide the legend (fill = FALSE is deprecated in recent ggplot2)
ggtitle("Topic proportions") + # set the title
xlab("Topic name") + # set the x axis label
ylab("Proportion") # set the y axis label
topicToFilter <- 20 # you can set this manually ...
# ... or have it selected by a term contained in its name (e.g. 'engineer')
topicToFilter <- grep('engineer', topicNames)[1]
topicThreshold <- 0.15
selectedDocumentIndexes <- which(theta[, topicToFilter] >= topicThreshold)
# The document IDs from the corpus are contained in the rownames of the posterior variable theta
selectedDocumentIndexes <- as.integer(rownames(theta)[selectedDocumentIndexes])
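# How selective is the chosen threshold? A quick check (sketch) of the number
# and share of documents that pass it:
cat(sprintf("%d of %d documents (%.1f %%) exceed the threshold of %.2f\n",
            length(selectedDocumentIndexes), nrow(theta),
            100 * length(selectedDocumentIndexes) / nrow(theta), topicThreshold))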
filteredCorpus <- corpus_subset(data_corpus, textdata$d_id %in% selectedDocumentIndexes)
# show length of filtered corpus
length(filteredCorpus)
# look into the first post of the filtered corpus
display_html(gsub("\n", "<br>", filteredCorpus[1]))
docYear <- textdata$Year[textdata$d_id %in% selectedDocumentIndexes]
# Second, count how many postings occur in each year
docsPerYear <- table(docYear)
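# Peek at the yearly counts before plotting
head(docsPerYear)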
# classic way in R
# plot(docsPerYear, type = "o", xlab = "Year", ylab = "Absolute frequency", main = paste0("Topic ", topicNames[topicToFilter]))
# Determine the topic color from our ordered reference factor
topicColor <- refOrdColors[refOrdColors$topicsOrd == topicNames[topicToFilter], "colorScale"]
ggplot(data = melt(docsPerYear), aes(x = docYear, y = value)) +
  geom_line(linetype = "solid", size = 1, colour = topicColor) + # define a line plot
  xlab("Year") + ylab("Count") +
  ggtitle("Topic count per year") +
  theme(legend.position = "none")
# We create a matrix by repeating the time series construction for every topic
topicThreshold <- 0.1
all_years <- unique(textdata$Year)
hm_matrix <- matrix(0, nrow = K, ncol = length(all_years), dimnames = list(topicNames, all_years))
for (k in 1:K) {
  selectedDocumentIndexes <- theta[, k] >= topicThreshold
  docYears <- textdata$Year[selectedDocumentIndexes]
  docsPerYear <- table(docYears)
  hm_matrix[k, names(docsPerYear)] <- docsPerYear
}
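# Quick inspection (a sketch): which topics collect the most threshold hits
# across all years?
head(sort(rowSums(hm_matrix), decreasing = TRUE), 3)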
# The basic ?heatmap command in R can cluster similar time series and place them next to each other in the plot. In ggplot we have to do this ourselves, but the plot looks prettier.
# We cluster the time series with the Manhattan distance and use the resulting dendrogram to determine a sensible row ordering for the heat map.
ord <- hclust(dist(hm_matrix, method = "manhattan"), method = "average" )$order
# Using this ordering, we melt the reordered matrix into a data.frame
hm_matrix_ord <- melt(hm_matrix[ord, ])
colnames(hm_matrix_ord) <- c("topicNames", "Year","value")
ggplot(hm_matrix_ord, aes(Year, topicNames)) + geom_tile(aes(fill = value), colour = "grey") +
scale_fill_gradient(low = "white",high = "green") +
theme_bw() +
xlab("Year") + ylab("Topic name") +
theme(legend.position="none")
# We order the yearly counts by topic, and then normalize each year's counts to sum to 1
hm_matrix_norm <- t(hm_matrix[as.character(refOrdColors$topicsOrd), ])
hm_matrix_norm <- hm_matrix_norm / rowSums(hm_matrix_norm)
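# Sanity check: every year (row) should now sum to 1. Years without any
# threshold hit would yield NaN here (0/0), which summary() would reveal.
summary(rowSums(hm_matrix_norm))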
matrix.m <- melt(t(hm_matrix_norm))
colnames(matrix.m) <- c("topicNames", "Year", "value")
head(matrix.m)
# Establish a sorting of topics by transforming the topicNames column into a factor
# with a specific ordering of levels
matrix.m$topicNames <- factor(matrix.m$topicNames, levels(refOrdColors$topicsOrd), ordered = T)
ggplot(matrix.m, aes(x = Year, y = value, fill = topicNames)) +
geom_area() + scale_fill_manual(values = refOrdColors$colorScale)
require(igraph)
# Load tm result on paragraphs from Tutorial I
load("tmResult_documents.RData")
# Create a 0-matrix with the same dimensions as the document-topic matrix new_data$topics
doc_topic_matrix <- matrix(0, nrow = nrow(new_data$topics), ncol = ncol(new_data$topics))
topicNames <- apply(lda::top.topic.words(new_data$terms, 5, by.score = T), 2, paste, collapse = " ")
colnames(doc_topic_matrix) <- topicNames
# Mark the 2 most probable topics for each paragraph with a 1
# (invisible() suppresses printing of sapply's return value; the assignment happens via <<-)
invisible(sapply(1:nrow(new_data$topics), function(x){
  doc_topic_matrix[x, order(new_data$topics[x, ], decreasing = T)[1:2]] <<- 1
}))
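# Sanity check: every paragraph should now have exactly two topics marked
stopifnot(all(rowSums(doc_topic_matrix) == 2))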
# count co-occurrence of topics
topic_topic_matrix <- t(doc_topic_matrix) %*% doc_topic_matrix
diag(topic_topic_matrix) <- 0
# now we calculate the Dice coefficient, dice(i, j) = 2 * k_ij / (k_i + k_j), to determine significant topic combinations
final_topic_coocs <- matrix(0, nrow = nrow(topic_topic_matrix), ncol = ncol(topic_topic_matrix), dimnames = dimnames(topic_topic_matrix))
k <- nrow(doc_topic_matrix) # number of all documents
kj <- colSums(doc_topic_matrix) # number of docs containing topic j as primary/secondary
names(kj) <- colnames(doc_topic_matrix)
for (topicName in colnames(doc_topic_matrix)) {
  # retrieve the counts needed for the statistic
  ki <- kj[topicName]
  kij <- topic_topic_matrix[topicName, ]
  dicesig <- 2 * kij / (ki + kj)
  dicesig[is.na(dicesig)] <- 0
  final_topic_coocs[topicName, ] <- dicesig
}
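# Quick inspection of the strongest topic pairs by Dice significance
# (the matrix is symmetric, so every pair appears twice in this listing)
topCoocs <- melt(final_topic_coocs)
head(topCoocs[order(topCoocs$value, decreasing = TRUE), ], 6)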
# Create a data.frame to produce the graph object
topicGraph <- melt(final_topic_coocs)
# The table must have the form: from | to | sig -- it describes all edges in the graph
colnames(topicGraph) <- c("from","to","sig")
# We only use the edges with a significance of more than a defined threshold
topicGraph <- topicGraph[topicGraph[, 3] > 0.07, ]
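# How many edges survive the significance threshold?
nrow(topicGraph)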
# The visualization is done with the igraph package
# We initialize the graph with our edge list and define the type of the graph as undirected
graphNetwork <- graph_from_data_frame(topicGraph, directed = F)
# The vertices get a size based on the proportion of the topic in the overall collection.
# We match by vertex name, because the vertex order in the graph follows the edge list, not the topic order.
topicShares <- colSums(new_data$topics) / sum(new_data$topics)
names(topicShares) <- topicNames
V(graphNetwork)$size <- topicShares[V(graphNetwork)$name] * 200
# but enforce a minimum vertex size
V(graphNetwork)$size[V(graphNetwork)$size < 3] <- 3
# We deactivate the standard curved form of the edges in the graph and force straight lines.
E(graphNetwork)$curved <- 0
# Define the thickness of the edges
E(graphNetwork)$width <- 2
# Definition of some margins for the plot
par(mai=c(0,0,0,0))
plot(graphNetwork,
layout = layout_with_fr, # force-directed layout (Fruchterman-Reingold)
main = "Topic co-occurrence", # Title
vertex.label.family = "sans",
vertex.shape = "circle",
vertex.label.dist = 0.5, # Slightly push the labels away from the vertices
vertex.frame.color = 'darkolivegreen',
vertex.label.color = 'black', # Vertex label color
vertex.label.font = 2, # font for the vertex label
vertex.label = V(graphNetwork)$name, # content of the vertex label
vertex.label.cex = 0.7 # size of the vertex label
)