options(stringsAsFactors = FALSE)

require(quanteda)
require(magrittr)
require(dplyr)
require(data.table)

textdata <- read.csv("data/data job posts.csv", header = TRUE, sep = ",", encoding = "UTF-8",quote = "\"")

# dimensions of the data frame
dim(textdata)

# column names of text and metadata
colnames(textdata)

table(textdata[, "Year"])

textdata <- as.data.table(textdata)

english_stopwords <- readLines("data/stopwords_en.txt", encoding = "UTF-8")

textdata %<>% filter(!duplicated(jobpost))
textdata %<>% mutate(d_id = 1:nrow(textdata))

#Build a dictionary of lemmas
lemmaData <- read.csv2("data/baseform_en.tsv", sep="\t", header=FALSE, encoding = "UTF-8", stringsAsFactors = F)

data_corpus <- corpus(textdata$jobpost, docnames = textdata$d_id)

# accessing a single document object
data_corpus[1]

paste0(substring(as.character(data_corpus[1]), 0, 120), "...")

length(data_corpus$documents$texts)

data_dfm_entries <- data_corpus %>% tokens() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>% tokens_tolower() %>% 
  tokens_replace(., lemmaData$V1, lemmaData$V2) %>%
  tokens_ngrams(1) %>% dfm() 

data_dfm_entries_sub <- data_dfm_entries %>%
  dfm_select(pattern = "[a-z]", valuetype = "regex", selection = 'keep')

colnames(data_dfm_entries_sub) <- colnames(data_dfm_entries_sub) %>% stringi::stri_replace_all_regex("[^_a-z]", "") 

DTM <- dfm_compress(data_dfm_entries_sub, "features")

DTM

# sum columns for word counts
freqs <- colSums(DTM)
# get vocabulary vector
words <- colnames(DTM)
# combine words and their frequencies in a data frame
wordlist <- data.frame(words, freqs)
# re-order the wordlist by decreasing frequency
wordIndexes <- order(wordlist[, "freqs"], decreasing = TRUE)
wordlist <- wordlist[wordIndexes, ]
# show the most frequent words
head(wordlist, 25)

plot(wordlist$freqs , type = "l", lwd=2, main = "Rank frequency Plot", xlab="Rank", ylab ="Frequency")

plot(wordlist$freqs , type = "l", log="xy", lwd=2, main = "Rank-Frequency Plot", xlab="log-Rank", ylab ="log-Frequency")

plot(wordlist$freqs, type = "l", log="xy",lwd=2, main = "Rank-Frequency plot", xlab="Rank", ylab = "Frequency")

stopwords_idx <- which(wordlist$words %in% english_stopwords)
low_frequent_idx <- which(wordlist$freqs < 10)
insignificant_idx <- union(stopwords_idx, low_frequent_idx)
meaningful_range_idx <- setdiff(1:nrow(wordlist), insignificant_idx)
lines(meaningful_range_idx, wordlist$freqs[meaningful_range_idx], col = "green", lwd=2, type="p", pch=20)

head(wordlist[meaningful_range_idx, ], 25)

sum(wordlist$freqs == 1) / nrow(wordlist)

ncol(DTM) / sum(DTM)