# Pre-processing, rank-frequency inspection, tf-idf weighting and
# dictionary-based sentiment scoring for the poliblogs2008 blog corpus.
#
# NOTE(review): in recent quanteda releases textstat_frequency() and
# textplot_wordcloud() are shipped in the companion packages
# quanteda.textstats / quanteda.textplots; load those as well if the calls
# below are not found -- TODO confirm against the installed version.

# library() fails loudly when a package is missing; require() would only
# return FALSE and let the script break later.
library(quanteda)
library(magrittr)

# Load data ----
poliblogs2008 <- read.csv(
  "data/poliblogs2008.csv",
  header = TRUE,
  sep = ",",
  encoding = "UTF-8",
  quote = "\"",
  stringsAsFactors = FALSE
)
head(poliblogs2008, 2) # inspect the first 2 documents
table(poliblogs2008$rating)
table(poliblogs2008$blog)

# Build corpus and document-term matrix ----
# The text lives in the "documents" column; remaining columns become docvars.
data_corpus <- corpus(poliblogs2008, text_field = "documents")

# Tokenize once, dropping punctuation, numbers and symbols, then lowercase.
DTM.1 <- data_corpus %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  dfm()
DTM.1

# Rank-frequency inspection ----
wordlist <- textstat_frequency(DTM.1)
head(wordlist, 20)

plot(
  wordlist$frequency,
  type = "l", lwd = 2,
  main = "Rank frequency Plot", xlab = "Rank", ylab = "Frequency"
)
plot(
  wordlist$frequency,
  type = "l", log = "xy", lwd = 2,
  main = "Rank-Frequency Plot", xlab = "log-Rank", ylab = "log-Frequency"
)

# Vocabulary pruning ----
# "Trash" = English stopwords plus every feature occurring fewer than 10 times.
stopw_idx <- which(wordlist$feature %in% stopwords("en"))
low_frequent_idx <- which(wordlist$frequency < 10)
trash_idx <- union(stopw_idx, low_frequent_idx)
vocab_idx <- setdiff(seq_len(nrow(wordlist)), trash_idx)

# Overlay the retained vocabulary (green points) on the log-log curve.
plot(
  wordlist$frequency,
  type = "l", log = "xy", lwd = 2,
  main = "Rank-Frequency plot", xlab = "Rank", ylab = "Frequency"
)
lines(
  vocab_idx, wordlist$frequency[vocab_idx],
  col = "green", lwd = 2, type = "p", pch = 20
)

# Row-subset with an explicit comma: a single subscript on a data.frame
# selects columns, not rows.
head(wordlist[vocab_idx, ], 20)
head(wordlist[trash_idx, ], 20)

# The features to drop are literal strings, so match them as fixed patterns
# rather than as globs.
DTM.2 <- dfm_remove(DTM.1, wordlist$feature[trash_idx], valuetype = "fixed")
DTM.2

textplot_wordcloud(DTM.2, max_words = 100)

# tf-idf for a single document ----
some_docname <- "at0800300_2.text"
print(poliblogs2008$documents[which(poliblogs2008$docname == some_docname)])

number_of_docs <- nrow(DTM.2)
term_in_docs <- colSums(DTM.2 > 0) # number of documents containing each term
idf <- log2(number_of_docs / term_in_docs)

# Locate the document by name in the DTM itself instead of relying on the
# CSV's X column happening to equal the row position.
doc_idx <- which(docvars(DTM.2)[["docname"]] == some_docname)
# Exactly one row is required; otherwise tf * idf would recycle or be empty.
stopifnot(length(doc_idx) == 1L)

tf <- as.vector(DTM.2[doc_idx, ])
tf_idf <- tf * idf
names(tf_idf) <- colnames(DTM.2)
head(sort(tf_idf, decreasing = TRUE), 10)

# Dictionary-based sentiment ----
# NOTE(review): LSD2015 entries appear to include glob patterns (e.g. a
# trailing "*"); intersect() only matches literal feature names, so such
# entries are ignored here. dfm_lookup() would honour them -- TODO confirm.
positive_terms <- data_dictionary_LSD2015$positive
negative_terms <- data_dictionary_LSD2015$negative

positive_terms_in_suto <- intersect(colnames(DTM.2), positive_terms)
counts_positive <- rowSums(DTM.2[, positive_terms_in_suto])

negative_terms_in_suto <- intersect(colnames(DTM.2), negative_terms)
counts_negative <- rowSums(DTM.2[, negative_terms_in_suto])

# Share of each document's (pruned) tokens that are positive / negative.
# Documents left with zero tokens after pruning yield NaN here.
counts_all_terms <- rowSums(DTM.2)
relative_sentiment_frequencies <- data.frame(
  docname = docvars(DTM.2)[["docname"]],
  positive = counts_positive / counts_all_terms,
  negative = counts_negative / counts_all_terms
)
head(relative_sentiment_frequencies, 5)

# save(DTM.2, file = "data/DTM.2.RData")