options(stringsAsFactors = FALSE)
require(quanteda)
require(magrittr)
require(dplyr)
require(data.table)

# Load the job posts. The steps below rely on the `date` and `jobpost`
# columns, so fail early with a clear message if either one is missing.
textdata <- read.csv(
  "data/data job posts.csv",
  header = TRUE, sep = ",", encoding = "UTF-8", quote = "\""
)
stopifnot(all(c("date", "jobpost") %in% names(textdata)))

# Parse the date column. The "%b" month abbreviation is matched against the
# current LC_TIME locale, so switch to the "C" locale for parsing and restore
# the previous locale afterwards, even when parsing raises an error.
OL <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "C")
textdata$date <- tryCatch(
  as.Date(textdata$date, format = "%b %d, %Y"),
  finally = Sys.setlocale("LC_TIME", OL)
)

# Delete rows whose date could not be identified
textdata <- textdata[!is.na(textdata$date), ]

# Add some more metadata columns: the four-digit year and the decade
# (first three digits of the year followed by "0"), both as character.
textdata$year <- format(textdata$date, "%Y")
textdata$decade <- paste0(substr(textdata$year, 1, 3), "0")

textdata <- as.data.table(textdata)

# NOTE(review): `english_stopwords` is not referenced in the visible part of
# this script (the DTM step calls `stopwords()` instead) - confirm whether
# this list was meant to be used there.
english_stopwords <- readLines("data/stopwords_en.txt", encoding = "UTF-8")

# Keep only the first occurrence of each job post text, then assign a running
# document id. `seq_len(n())` also works when no rows are left, whereas
# `1:nrow(x)` would yield c(1, 0).
textdata %<>% filter(!duplicated(jobpost))
textdata %<>% mutate(d_id = seq_len(n()))

# Lemma lookup table: tab-separated file without a header row, so the two
# columns are available as V1 and V2.
lemmaData <- read.csv2(
  "data/baseform_en.tsv",
  sep = "\t",
  header = FALSE,
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# One document per job post, named by the running document id
data_corpus <- corpus(textdata$jobpost, docnames = textdata$d_id)

# Create a DTM (may take a while):
# tokenize, strip punctuation/numbers/symbols, lowercase, replace tokens
# matching V1 by the corresponding V2 entry, drop stopwords, count features.
data_dfm_entries <- data_corpus %>%
  tokens() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_replace(pattern = lemmaData$V1, replacement = lemmaData$V2) %>%
  tokens_ngrams(1) %>%
  tokens_remove(pattern = stopwords()) %>%
  dfm()

# Keep only features that contain at least one lowercase ASCII letter
data_dfm_entries_sub <- dfm_select(
  data_dfm_entries,
  pattern = "[a-z]",
  valuetype = "regex",
  selection = "keep"
)

# Strip every character except a-z and "_" from the feature names; features
# that end up with the same name are merged by dfm_compress() below.
colnames(data_dfm_entries_sub) <- stringi::stri_replace_all_regex(
  colnames(data_dfm_entries_sub),
  "[^_a-z]",
  ""
)

DTM <- dfm_compress(data_dfm_entries_sub, "features")

# Show some information
DTM


# Plot the yearly frequency of a handful of terms as a line chart.
terms_to_observe <- c("experience", "manual", "creative", "hard", "team")

# Indexing the DTM with a feature that does not exist raises a subscript
# error, so skip absent terms with a warning instead of aborting the script.
terms_in_dtm <- terms_to_observe %in% colnames(DTM)
if (!all(terms_in_dtm)) {
  warning(
    "Terms not found in DTM and skipped: ",
    paste(terms_to_observe[!terms_in_dtm], collapse = ", "),
    call. = FALSE
  )
  terms_to_observe <- terms_to_observe[terms_in_dtm]
}
stopifnot(length(terms_to_observe) > 0)

DTM_reduced <- as.matrix(DTM[, terms_to_observe])

length(terms_to_observe)

nrow(DTM_reduced)

# The aggregation pairs DTM rows with rows of `textdata`, so both must have
# the same number of documents.
stopifnot(nrow(DTM_reduced) == nrow(textdata))

counts_per_year <- aggregate(DTM_reduced, by = list(year = textdata$year), sum)

# give x and y values beautiful names
years <- counts_per_year$year
# drop = FALSE keeps a data frame even when only one term is left
frequencies <- counts_per_year[, terms_to_observe, drop = FALSE]

# plot multiple frequencies; colours and line types are passed explicitly so
# that the legend matches the lines for any number of terms (matplot's own
# defaults recycle after 6 colours / 5 line types)
l <- length(terms_to_observe)
matplot(years, frequencies, type = "l", col = seq_len(l), lty = seq_len(l))

# add legend to the plot
legend(
  "topleft",
  legend = terms_to_observe,
  col = seq_len(l), text.col = seq_len(l), lty = seq_len(l)
)

# Sentiment word lists; each line of the files is treated as one term
positive_terms_all <- readLines("data/senti_words_positive.txt")
negative_terms_all <- readLines("data/senti_words_negative.txt")

# Total number of counted tokens per document
counts_all_terms <- rowSums(DTM)

# Restrict both lists to terms that actually occur as DTM features, then
# count their occurrences per document
positive_terms_in_suto <- intersect(colnames(DTM), positive_terms_all)
negative_terms_in_suto <- intersect(colnames(DTM), negative_terms_all)
counts_positive <- rowSums(DTM[, positive_terms_in_suto])
counts_negative <- rowSums(DTM[, negative_terms_in_suto])

# Share of positive / negative terms among all terms of each document
relative_sentiment_frequencies <- data.frame(
  positive = counts_positive / counts_all_terms,
  negative = counts_negative / counts_all_terms
)

# Average the per-document shares within each company
sentiments_per_Company <- aggregate(
  relative_sentiment_frequencies,
  by = list(Company = textdata$Company),
  FUN = mean
)

# Companies with more than 50 job posts
company_count <- table(textdata$Company)
company_count <- company_count[company_count > 50]

# Preview the sentiment scores of those companies
head(subset(sentiments_per_Company, Company %in% names(company_count)))

# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(reshape2)
library(ggplot2)

# Long format for plotting: one row per (Company, variable) pair, where
# `variable` is "positive" or "negative". Only the first 20 companies with
# more than 50 job posts are kept.
df <- melt(
  head(
    subset(sentiments_per_Company, Company %in% names(company_count)),
    n = 20
  ),
  id.vars = "Company"
)

# companies in default order
ggplot(data = df, aes(x = Company, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  coord_flip()

# order by positive sentiments: melt() stacks the "positive" rows before the
# "negative" ones, so the first value per company is its positive score.
# Columns are referenced by bare name inside aes(); `df$value` would bypass
# the data handed to ggplot().
ggplot(
  data = df,
  aes(x = reorder(Company, value, head, 1), y = value, fill = variable)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  coord_flip() +
  labs(x = "Company")

# order by negative sentiments: the last value per company is its negative
# score
ggplot(
  data = df,
  aes(x = reorder(Company, value, tail, 1), y = value, fill = variable)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  coord_flip() +
  labs(x = "Company")

# Heatmap of yearly term counts (terms as rows, years as columns).
# "competitive" was previously misspelled as "competetive", which either is
# absent from the DTM or only matches misspelled tokens.
terms_to_observe <- c(
  "work", "responsibility", "health", "hard", "creative",
  "competitive", "friendly", "reliable", "technology",
  "manual", "skill", "payment"
)

# Indexing the DTM with a feature that does not exist raises a subscript
# error, so skip absent terms with a warning instead of aborting the script.
terms_in_dtm <- terms_to_observe %in% colnames(DTM)
if (!all(terms_in_dtm)) {
  warning(
    "Terms not found in DTM and skipped: ",
    paste(terms_to_observe[!terms_in_dtm], collapse = ", "),
    call. = FALSE
  )
  terms_to_observe <- terms_to_observe[terms_in_dtm]
}
# heatmap() needs at least two rows, i.e. at least two terms
stopifnot(length(terms_to_observe) >= 2)

DTM_reduced <- as.matrix(DTM[, terms_to_observe])
counts_per_year <- aggregate(DTM_reduced, by = list(year = textdata$year), sum)

# Move the year from a column into the row names so that only the numeric
# term counts remain
rownames(counts_per_year) <- counts_per_year$year
counts_per_year <- counts_per_year[setdiff(colnames(counts_per_year), "year")]

# Colv = NA keeps the years in their given order (no column clustering)
heatmap(
  t(as.matrix(counts_per_year)),
  Colv = NA,
  col = rev(heat.colors(256)),
  keep.dendro = FALSE,
  margins = c(5, 10)
)