# Term-frequency analysis of job postings, part 1: load the postings, build a
# lemmatized document-term matrix (DTM) and aggregate the counts of a few
# selected terms per year.

options(stringsAsFactors = FALSE)

# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(quanteda)
library(magrittr)
library(dplyr)
library(data.table)

# Load the postings ----

textdata <- read.csv(
  "data/data job posts.csv",
  header = TRUE,
  sep = ",",
  encoding = "UTF-8",
  quote = "\""
)

# Month abbreviations ("%b") are parsed according to LC_TIME, so parse the
# dates under the "C" locale. `finally` guarantees that the previous locale is
# restored even if the parsing step fails.
OL <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "C")
tryCatch(
  textdata$date <- as.Date(textdata$date, format = "%b %d, %Y"),
  finally = Sys.setlocale("LC_TIME", OL)
)

# Drop postings whose date could not be parsed, then add metadata columns.
textdata <- textdata[!is.na(textdata$date), ]
textdata$year <- format(textdata$date, "%Y")
textdata$decade <- paste0(substr(textdata$year, 1, 3), "0")

textdata <- as.data.table(textdata)

# NOTE(review): this stop word list is loaded but not used anywhere in the
# visible part of the script; the token pipeline below removes quanteda's
# built-in stopwords() instead. Confirm which list is intended.
english_stopwords <- readLines("data/stopwords_en.txt", encoding = "UTF-8")

# Keep the first occurrence of every posting text and number the documents.
# row_number() is also correct for an empty table, where 1:nrow() would
# produce c(1, 0).
textdata %<>% filter(!duplicated(jobpost))
textdata %<>% mutate(d_id = row_number())

# Build the DTM ----

# Lemma dictionary: column V1 is used as the pattern, V2 as its replacement.
lemmaData <- read.csv2(
  "data/baseform_en.tsv",
  sep = "\t",
  header = FALSE,
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

data_corpus <- corpus(textdata$jobpost, docnames = textdata$d_id)

# Create a DTM (may take a while).
# valuetype = "fixed" matches the lemma table entries literally instead of
# interpreting them as glob patterns (the default).
data_dfm_entries <- data_corpus %>%
  tokens() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_tolower() %>%
  tokens_replace(lemmaData$V1, lemmaData$V2, valuetype = "fixed") %>%
  tokens_ngrams(1) %>%
  tokens_remove(pattern = stopwords()) %>%
  dfm()

# Keep only features containing at least one lowercase ASCII letter ...
data_dfm_entries_sub <- data_dfm_entries %>%
  dfm_select(pattern = "[a-z]", valuetype = "regex", selection = "keep")

# ... strip every character other than a-z and "_" from the feature names ...
colnames(data_dfm_entries_sub) <- colnames(data_dfm_entries_sub) %>%
  stringi::stri_replace_all_regex("[^_a-z]", "")

# ... and merge the features that have become identical by the renaming.
DTM <- dfm_compress(data_dfm_entries_sub, margin = "features")

# Show some information
DTM

# Term counts per year ----

terms_to_observe <- c("experience", "manual", "creative", "hard", "team")

# Selecting a feature that is absent from the DTM aborts with "subscript out
# of bounds"; skip such terms with a warning instead.
missing_terms <- setdiff(terms_to_observe, colnames(DTM))
if (length(missing_terms) > 0) {
  warning(
    "Terms not found in the DTM and skipped: ",
    paste(missing_terms, collapse = ", "),
    call. = FALSE
  )
  terms_to_observe <- setdiff(terms_to_observe, missing_terms)
}
stopifnot(length(terms_to_observe) > 0)

DTM_reduced <- as.matrix(DTM[, terms_to_observe])
length(terms_to_observe)
nrow(DTM_reduced)

# Rows of the DTM are in the same order as the rows of textdata, so the year
# column can be used directly as the grouping variable.
counts_per_year <- aggregate(
  DTM_reduced,
  by = list(year = textdata$year),
  sum
)

# give x and y values beautiful names
years <- counts_per_year$year
frequencies <- counts_per_year[, terms_to_observe, drop = FALSE]

# Term frequencies over time ----

# plot multiple frequencies
matplot(years, frequencies, type = "l")

# add legend to the plot; colours and line types follow the column order
l <- length(terms_to_observe)
legend(
  "topleft",
  legend = terms_to_observe,
  col = seq_len(l),
  text.col = seq_len(l),
  lty = seq_len(l)
)

# Dictionary-based sentiment per company ----

positive_terms_all <- readLines("data/senti_words_positive.txt")
negative_terms_all <- readLines("data/senti_words_negative.txt")

# Only dictionary terms that actually occur as DTM features can be counted.
positive_terms_in_suto <- intersect(colnames(DTM), positive_terms_all)
counts_positive <- rowSums(DTM[, positive_terms_in_suto])

negative_terms_in_suto <- intersect(colnames(DTM), negative_terms_all)
counts_negative <- rowSums(DTM[, negative_terms_in_suto])

# Share of positive / negative terms among all terms of each document.
counts_all_terms <- rowSums(DTM)
relative_sentiment_frequencies <- data.frame(
  positive = counts_positive / counts_all_terms,
  negative = counts_negative / counts_all_terms
)

# A document without any remaining term yields 0 / 0 = NaN; na.rm = TRUE keeps
# such a document from turning the mean of its whole company into NaN.
sentiments_per_Company <- aggregate(
  relative_sentiment_frequencies,
  by = list(Company = textdata$Company),
  mean,
  na.rm = TRUE
)

# Restrict the output to companies with more than 50 postings.
company_count <- table(textdata$Company)
company_count <- company_count[company_count > 50]
frequent_companies <- sentiments_per_Company[
  sentiments_per_Company$Company %in% names(company_count),
]
head(frequent_companies)

# Long format for plotting: one row per (Company, sentiment variable). The
# measure columns are stacked in column order, so for every company the
# "positive" value comes before the "negative" one.
library(reshape2)
df <- melt(head(frequent_companies, n = 20), id.vars = "Company")

library(ggplot2)
ggplot(data = df, aes(x = Company, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  coord_flip()

# order by positive sentiments (first value per company).
# Columns are referenced by bare name inside aes(); `df$value` bypasses
# ggplot2's data masking and triggers a warning.
ggplot(
  data = df,
  aes(x = reorder(Company, value, head, 1), y = value, fill = variable)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  coord_flip()

# order by negative sentiments (last value per company)
ggplot(
  data = df,
  aes(x = reorder(Company, value, tail, 1), y = value, fill = variable)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  coord_flip()

# Heatmap of term counts per year ----

# NOTE(review): "competetive" looks like a misspelling of "competitive";
# confirm which spelling occurs in the DTM.
terms_to_observe <- c(
  "work", "responsibility", "health", "hard", "creative", "competetive",
  "friendly", "reliable", "technology", "manual", "skill", "payment"
)

# Selecting a feature that is absent from the DTM aborts with "subscript out
# of bounds"; skip such terms with a warning instead.
missing_terms <- setdiff(terms_to_observe, colnames(DTM))
if (length(missing_terms) > 0) {
  warning(
    "Terms not found in the DTM and skipped: ",
    paste(missing_terms, collapse = ", "),
    call. = FALSE
  )
  terms_to_observe <- setdiff(terms_to_observe, missing_terms)
}
stopifnot(length(terms_to_observe) > 0)

DTM_reduced <- as.matrix(DTM[, terms_to_observe])
counts_per_year <- aggregate(
  DTM_reduced,
  by = list(year = textdata$year),
  sum
)

# Move the year into the row names so that only term columns remain.
rownames(counts_per_year) <- counts_per_year$year
counts_per_year <- counts_per_year[!(colnames(counts_per_year) %in% "year")]

# Terms on the rows, years on the columns; Colv = NA keeps the years in
# their original order instead of clustering them.
heatmap(
  t(as.matrix(counts_per_year)),
  Colv = NA,
  col = rev(heat.colors(256)),
  keep.dendro = FALSE,
  margins = c(5, 10)
)