# Rank-frequency analysis of job-post texts ----
#
# Reads the job-post CSV, builds a quanteda corpus and a lemmatised
# document-term matrix (DTM), then inspects the word-frequency
# distribution with rank-frequency plots.

options(stringsAsFactors = FALSE)

# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later at the first missing call.
library(quanteda)
library(magrittr)
library(dplyr)
library(data.table)

# Load data ----

textdata <- read.csv(
  "data/data job posts.csv",
  header = TRUE,
  sep = ",",
  encoding = "UTF-8",
  quote = "\""
)

# dimensions of the data frame
dim(textdata)

# column names of text and metadata
colnames(textdata)

# number of rows per value of the "Year" column
table(textdata[, "Year"])

textdata <- as.data.table(textdata)

english_stopwords <- readLines("data/stopwords_en.txt", encoding = "UTF-8")

# Keep only the first occurrence of each job-post text, then assign a
# running document id. row_number() is computed on the filtered rows and is
# safe for zero-row input (unlike 1:nrow(x), which yields c(1, 0)).
textdata <- textdata %>%
  filter(!duplicated(jobpost)) %>%
  mutate(d_id = row_number())

# Build a dictionary of lemmas: column V1 is used as the lookup pattern and
# column V2 as its replacement in the token pipeline below.
lemmaData <- read.csv2(
  "data/baseform_en.tsv",
  sep = "\t",
  header = FALSE,
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# Build corpus ----

data_corpus <- corpus(textdata$jobpost, docnames = textdata$d_id)

# accessing a single document object
data_corpus[1]

# first 120 characters of the first document
paste0(substring(as.character(data_corpus[1]), 1, 120), "...")

# number of documents; ndoc() is the public accessor and does not depend on
# the internal structure of the corpus object
ndoc(data_corpus)

# Tokenise and build the document-term matrix ----

data_dfm_entries <- data_corpus %>%
  tokens(
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE
  ) %>%
  tokens_tolower() %>%
  # valuetype = "fixed": lemma entries are literal strings, so "*" and "?"
  # must not be interpreted as glob wildcards
  tokens_replace(lemmaData$V1, lemmaData$V2, valuetype = "fixed") %>%
  tokens_ngrams(1) %>%
  dfm()

# keep only features that contain at least one lower-case latin letter
data_dfm_entries_sub <- data_dfm_entries %>%
  dfm_select(pattern = "[a-z]", valuetype = "regex", selection = "keep")

# strip every character except a-z and "_" from the feature names ...
colnames(data_dfm_entries_sub) <- colnames(data_dfm_entries_sub) %>%
  stringi::stri_replace_all_regex("[^_a-z]", "")

# ... and merge the features that became identical after the clean-up
DTM <- dfm_compress(data_dfm_entries_sub, "features")
DTM

# Word frequencies ----

# sum columns for word counts
freqs <- colSums(DTM)

# get vocabulary vector
words <- colnames(DTM)

# combine words and their frequencies in a data frame
wordlist <- data.frame(words, freqs)

# re-order the wordlist by decreasing frequency
wordIndexes <- order(wordlist[, "freqs"], decreasing = TRUE)
wordlist <- wordlist[wordIndexes, ]

# show the most frequent words
head(wordlist, 25)

# Rank-frequency plots ----

# linear axes
plot(
  wordlist$freqs,
  type = "l", lwd = 2,
  main = "Rank frequency Plot",
  xlab = "Rank", ylab = "Frequency"
)

# log-log axes
plot(
  wordlist$freqs,
  type = "l", log = "xy", lwd = 2,
  main = "Rank-Frequency Plot",
  xlab = "log-Rank", ylab = "log-Frequency"
)

# log-log axes again, used as the base for the overlay below
plot(
  wordlist$freqs,
  type = "l", log = "xy", lwd = 2,
  main = "Rank-Frequency plot",
  xlab = "Rank", ylab = "Frequency"
)

# Ranks that are either stop words or occur fewer than 10 times are treated
# as insignificant; everything else is the "meaningful" range.
stopwords_idx <- which(wordlist$words %in% english_stopwords)
low_frequent_idx <- which(wordlist$freqs < 10)
insignificant_idx <- union(stopwords_idx, low_frequent_idx)
meaningful_range_idx <- setdiff(seq_len(nrow(wordlist)), insignificant_idx)

# overlay the meaningful ranks as green points on the last plot
points(
  meaningful_range_idx,
  wordlist$freqs[meaningful_range_idx],
  col = "green", lwd = 2, pch = 20
)

head(wordlist[meaningful_range_idx, ], 25)

# share of vocabulary entries that occur exactly once
sum(wordlist$freqs == 1) / nrow(wordlist)

# number of distinct features divided by the total number of counted tokens
ncol(DTM) / sum(DTM)