options(stringsAsFactors = FALSE)
require(topicmodels)
require(lda)
require(data.table)
require(quanteda)
require(ggplot2)
require(magrittr)
require(dplyr)
# textdata <- read.csv("data/data job posts.csv", header = TRUE, sep = ",", encoding = "UTF-8",quote = "\"")
# textdata <- as.data.table(textdata)
# textdata %<>% mutate(d_id = 1:nrow(textdata))
# Load the posterior and data from tutorial I
load("DTM.RData")
load("tmResult.RData")
load("topicModel.RData")
load("corpus.RData")
load("textdata.RData")
K <- topicModel@k # store the model's number of topics K as a global parameter
# From the posterior get theta (p(z|d)) and phi (p(w|z))
theta <- tmResult$topics
phi <- tmResult$terms
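# Quick sanity check (a minimal sketch, assuming tmResult was produced by
# topicmodels::posterior() as in tutorial I): both theta and phi are
# probability distributions, so their rows should each sum to 1.
stopifnot(all(abs(rowSums(theta) - 1) < 1e-8))
stopifnot(all(abs(rowSums(phi) - 1) < 1e-8))
dim(theta) # number of documents x K
dim(phi)   # K x vocabulary size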
# visualize topics as word cloud
library(wordcloud)
# generate the topic names again so we can filter for a certain topic
# lda::top.topic.words extracts the most likely (highest-scoring) terms for each topic
top5termsPerTopicProb <- lda::top.topic.words(phi, 5, by.score = T)
topicNames <- apply(top5termsPerTopicProb, 2, paste, collapse = " ")
topicToViz <- 11 # change for your own topic of interest
topicToViz <- grep('engineer', topicNames)[1] # Or select a topic by a term contained in its name
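# Note: if no topic label contains the search term, grep() returns integer(0)
# and topicToViz becomes NA. A small guard (sketch) to fall back to the manual choice:
if (is.na(topicToViz)) topicToViz <- 11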
# select the 40 most probable terms for the topic by sorting the term-topic probability vector in decreasing order
top40terms <- sort(phi[topicToViz, ], decreasing = TRUE)[1:40]
words <- names(top40terms)
# the corresponding probabilities are the sorted values themselves (no need to sort twice)
probabilities <- top40terms
# visualize the terms as word-cloud
wordcloud(names(probabilities), probabilities, scale = c(3, .9), colors = brewer.pal(8, "Dark2"), random.order = F)
library(IRdisplay)
exampleIds <- c(150, 10000, 15000)
display_html(paste0("",gsub("\n", "
", textdata$jobpost[exampleIds[1]])))
display_html(paste0("
",gsub("\n", "
", textdata$jobpost[exampleIds[2]])))
display_html(paste0("
",gsub("\n", "
", textdata$jobpost[exampleIds[3]])))
# load libraries for visualization
require("reshape2")
require("ggplot2")
N <- length(exampleIds)
# get topic proportions from example documents
topicProportionExamples <- theta[as.character(textdata$d_id[exampleIds]), ]
colnames(topicProportionExamples) <- topicNames
vizDataFrame <- melt(data = cbind(data.frame(topicProportionExamples),
                                  document = stringr::str_sub(textdata$Company[exampleIds], 1, 10)),
                     variable.name = "topic",
                     id.vars = "document")
ggplot(data = vizDataFrame, aes(x = topic, y = value, fill = document)) +
  ylab("proportion") + # ylab must be added as a layer; as an argument to ggplot() it would be silently ignored
  geom_bar(stat = "identity", position = "stack") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() + facet_wrap(~ document, ncol = N)
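# A small companion check to the plot (a sketch using the objects defined
# above): report each example document's single most probable topic.
for (i in exampleIds) {
  docTheta <- theta[as.character(textdata$d_id[i]), ]
  cat(sprintf("Document %d: '%s' (%.1f %%)\n",
              i, topicNames[which.max(docTheta)], 100 * max(docTheta)))
}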
library(RColorBrewer)
getPalette <- colorRampPalette(brewer.pal(9, "Set1"))
# Again we use the topic distribution over documents to determine the share of each topic
# What are the most probable topics in the entire collection?
topicProportions <- colSums(theta) / sum(theta)
names(topicProportions) <- topicNames
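# Quick inspection: the five most prominent topics in the entire collection
sort(topicProportions, decreasing = TRUE)[1:5]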
# For the next examples we create a global sorting and coloring of the topics, which we can apply to all ggplot visualizations
# We start with sorting the topics by their probability
topicProportions <- topicProportions[order(topicProportions)]
# ordering in the ggplot library can be done using a factor for the topic labels
topicsOrd <- factor(
  names(topicProportions),          # take the topic names as values
  levels = names(topicProportions), # set them also as the levels of the nominal factor
  ordered = T)                      # the given order of the topic names is also the order of the factor
# next we randomly create some colors from the "rainbow"-palette of R
# colorScale <- sample(rainbow(length(topicsOrd)))
# Alternative: use a precompiled color palette from the RColorBrewer package
colorScale <- paste0(getPalette(length(topicsOrd)), "FF")
# Finally, a data frame is created associating the colors with the topic names
refOrdColors <- data.frame(topicsOrd, colorScale)
# ggplot2 only understands data.frame objects
# melt creates a data.frame from our matrix representing each cell as a row
topicProportions_df <- melt(topicProportions)
# add the factor we just created as a topic-name column to the data.frame
topicProportions_df$topicNamesFactor <- refOrdColors$topicsOrd
# Create a bar plot:
# Initialize the plot by mapping the values to the y-axis. The running order of the topics is given by the ordered topic factor used as fill; scale_fill_manual assigns our reference colors in the same order.
bp <- ggplot(topicProportions_df, aes(x = "", y = value, fill = topicNamesFactor)) +
  geom_bar(width = 1, stat = "identity") +
  scale_fill_manual(values = refOrdColors$colorScale)
require(scales)
# from the bar plot create a polar coordinate view, choose a minimal theme
pie <- bp + coord_polar("y", start = 0) + theme_minimal() +
theme(
axis.text.x = element_blank(),
axis.title.x = element_blank(), # make every graphical element blank except the pie
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.ticks = element_blank(),
legend.position = "left",
legend.key.width = unit(3, "mm"),
legend.key.height = unit(3, "mm"),
plot.title = element_text(size = 14, face = "bold")
) +
ggtitle("Topic distribution in corpus") +
geom_text(size = 3, aes(x = 1.7,
label = percent(value)),
position = position_stack(vjust = 0.5)) +
guides(fill = guide_legend(title = "Topic names", reverse = T))
print(pie)
ggplot(data = topicProportions_df,
       aes(x = topicNamesFactor, y = value, fill = topicNamesFactor)) + # map topics to the x-axis and to the fill color
geom_bar(stat = "identity", width = .5) + # define attributes for bars
scale_fill_manual(values = refOrdColors$colorScale) +
coord_flip() + # flip the plot to horizontal bars
guides(fill = "none") + # hide the legend (fill = FALSE is deprecated in recent ggplot2)
ggtitle("Topic proportions") + # set the title
xlab("Topic name") + # set the x axis label
ylab("Proportion") # set the y axis label
topicToFilter <- 20 # you can set this manually ...
# ... or have it selected by a term contained in its name (e.g. 'engineer')
topicToFilter <- grep('engineer', topicNames)[1]
topicThreshold <- 0.15
selectedDocumentIndexes <- which(theta[, topicToFilter] >= topicThreshold)
# The document IDs from the corpus are contained in the rownames of the posterior variable theta
selectedDocumentIndexes <- as.integer(rownames(theta)[selectedDocumentIndexes])
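# How selective is the chosen threshold? A quick check (sketch) of the number
# and share of documents that pass it:
cat(sprintf("%d of %d documents (%.1f %%) exceed the threshold of %.2f\n",
            length(selectedDocumentIndexes), nrow(theta),
            100 * length(selectedDocumentIndexes) / nrow(theta), topicThreshold))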
filteredCorpus <- corpus_subset(data_corpus, textdata$d_id %in% selectedDocumentIndexes)
# show length of filtered corpus
length(filteredCorpus)
# look into the first post of the filtered corpus
display_html(gsub("\n", "<br>", filteredCorpus[1]))
docYear <- textdata$Year[textdata$d_id %in% selectedDocumentIndexes]
# Second, count how many postings occur in each year
docsPerYear <- table(docYear)
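# Peek at the yearly counts before plotting
head(docsPerYear)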
# classic way in R
# plot(docsPerYear, type = "o", xlab = "Year", ylab = "Absolute frequency", main = paste0("Topic ", topicNames[topicToFilter]))
# Determine the topic color from our ordered reference factor
topicColor <- refOrdColors[refOrdColors$topicsOrd == topicNames[topicToFilter], "colorScale"]
ggplot(data = melt(docsPerYear), aes(x = docYear, y = value)) +
  geom_line(linetype = "solid", size = 1, colour = topicColor) + # define a line plot
  xlab("Year") + ylab("Count") +
  ggtitle("Topic count per year") +
  theme(legend.position = "none")
# We create a matrix by repeating the time series construction for every topic
topicThreshold <- 0.1
all_years <- unique(textdata$Year)
hm_matrix <- matrix(0, nrow = K, ncol = length(all_years), dimnames = list(topicNames, all_years))
for (k in 1:K) {
  selectedDocumentIndexes <- theta[, k] >= topicThreshold
  docYears <- textdata$Year[selectedDocumentIndexes]
  docsPerYear <- table(docYears)
  hm_matrix[k, names(docsPerYear)] <- docsPerYear
}
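# Quick inspection (a sketch): which topics collect the most threshold hits
# across all years?
head(sort(rowSums(hm_matrix), decreasing = TRUE), 3)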
# The basic ?heatmap command in R can cluster similar time series and place them next to each other in the plot. In ggplot we have to do this ourselves, but the plot looks prettier.
# We cluster the time series with the Manhattan distance and use the resulting dendrogram to determine a sensible row ordering for the heat map.
ord <- hclust(dist(hm_matrix, method = "manhattan"), method = "average" )$order
# Using this ordering, we melt the reordered matrix into a data.frame
hm_matrix_ord <- melt(hm_matrix[ord, ])
colnames(hm_matrix_ord) <- c("topicNames", "Year","value")
ggplot(hm_matrix_ord, aes(Year, topicNames)) + geom_tile(aes(fill = value), colour = "grey") +
scale_fill_gradient(low = "white",high = "green") +
theme_bw() +
xlab("Year") + ylab("Topic name") +
theme(legend.position="none")
# We order the yearly counts by topic, and then normalize each year's counts to sum to 1
hm_matrix_norm <- t(hm_matrix[as.character(refOrdColors$topicsOrd), ])
hm_matrix_norm <- hm_matrix_norm / rowSums(hm_matrix_norm)
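# Sanity check: every year (row) should now sum to 1. Years without any
# threshold hit would yield NaN here (0/0), which summary() would reveal.
summary(rowSums(hm_matrix_norm))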
matrix.m <- melt(t(hm_matrix_norm))
colnames(matrix.m) <- c("topicNames", "Year", "value")
head(matrix.m)
# Establish a sorting of topics by transforming the topicNames column into a factor
# with a specific ordering of levels
matrix.m$topicNames <- factor(matrix.m$topicNames, levels(refOrdColors$topicsOrd), ordered = T)
ggplot(matrix.m, aes(x = Year, y = value, fill = topicNames)) +
geom_area() + scale_fill_manual(values = refOrdColors$colorScale)
require(igraph)
# Load tm result on paragraphs from Tutorial I
load("tmResult_documents.RData")
# Create a 0-matrix with the same dimensions as the document-topic matrix new_data$topics
doc_topic_matrix <- matrix(0, nrow = nrow(new_data$topics), ncol = ncol(new_data$topics))
topicNames <- apply(lda::top.topic.words(new_data$terms, 5, by.score = T), 2, paste, collapse = " ")
colnames(doc_topic_matrix) <- topicNames
# Mark the 2 most probable topics for each paragraph with a 1
# (invisible() suppresses printing of sapply's return value; the assignment happens via <<-)
invisible(sapply(1:nrow(new_data$topics), function(x){
  doc_topic_matrix[x, order(new_data$topics[x, ], decreasing = T)[1:2]] <<- 1
}))
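# Sanity check: every paragraph should now have exactly two topics marked
stopifnot(all(rowSums(doc_topic_matrix) == 2))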
# count co-occurrence of topics
topic_topic_matrix <- t(doc_topic_matrix) %*% doc_topic_matrix
diag(topic_topic_matrix) <- 0
# now we calculate the Dice coefficient, dice(i, j) = 2 * k_ij / (k_i + k_j), to determine significant topic combinations
final_topic_coocs <- matrix(0, nrow = nrow(topic_topic_matrix), ncol = ncol(topic_topic_matrix), dimnames = dimnames(topic_topic_matrix))
k <- nrow(doc_topic_matrix) # number of all documents
kj <- colSums(doc_topic_matrix) # number of docs containing topic j as primary/secondary
names(kj) <- colnames(doc_topic_matrix)
for (topicName in colnames(doc_topic_matrix)) {
  # retrieve the counts needed for the statistic
  ki <- kj[topicName]
  kij <- topic_topic_matrix[topicName, ]
  dicesig <- 2 * kij / (ki + kj)
  dicesig[is.na(dicesig)] <- 0
  final_topic_coocs[topicName, ] <- dicesig
}
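# Quick inspection of the strongest topic pairs by Dice significance
# (the matrix is symmetric, so every pair appears twice in this listing)
topCoocs <- melt(final_topic_coocs)
head(topCoocs[order(topCoocs$value, decreasing = TRUE), ], 6)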
# Create a data.frame to produce the graph object
topicGraph <- melt(final_topic_coocs)
# The table must have the form: from | to | sig -- it describes all edges in the graph
colnames(topicGraph) <- c("from","to","sig")
# We only use the edges with a significance of more than a defined threshold
topicGraph <- topicGraph[topicGraph[, 3] > 0.07, ]
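# How many edges survive the significance threshold?
nrow(topicGraph)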
# The visualization is done with the igraph package
# We initialize the graph with our edge list and define the type of the graph as undirected
graphNetwork <- graph_from_data_frame(topicGraph, directed = F)
# The vertices get a size based on the proportion of the topic in the overall collection.
# We match by vertex name, because the vertex order in the graph follows the edge list, not the topic order.
topicShares <- colSums(new_data$topics) / sum(new_data$topics)
names(topicShares) <- topicNames
V(graphNetwork)$size <- topicShares[V(graphNetwork)$name] * 200
# but enforce a minimum vertex size
V(graphNetwork)$size[V(graphNetwork)$size < 3] <- 3
# We deactivate the standard curved form of the edges in the graph and force straight lines.
E(graphNetwork)$curved <- 0
# Define the thickness of the edges
E(graphNetwork)$width <- 2
# Definition of some margins for the plot
par(mai=c(0,0,0,0))
plot(graphNetwork,
layout = layout_with_fr, # force-directed layout (Fruchterman-Reingold)
main = "Topic co-occurrence", # Title
vertex.label.family = "sans",
vertex.shape = "circle",
vertex.label.dist = 0.5, # Slightly push the labels away from the vertices
vertex.frame.color = 'darkolivegreen',
vertex.label.color = 'black', # Vertex label color
vertex.label.font = 2, # font for the vertex label
vertex.label = V(graphNetwork)$name, # content of the vertex label
vertex.label.cex = 0.7 # size of the vertex label
)