#!/usr/bin/env python
# coding: utf-8

# Author: **Mattias Östmar**
#
# Date: **2019-03-14**
#
# Contact: **mattiasostmar at gmail dot com**
#
# Thanks to Mikael Huss for being a good speaking partner.
#
# In this notebook we're going to use the [python version of fasttext](https://pypi.org/project/fasttext/), based on [Facebook's fastText](https://github.com/facebookresearch/fastText) tool, to try to predict the [Jungian cognitive function](https://en.wikipedia.org/wiki/Jungian_cognitive_functions) of blog authors from their writing style as it appears in blog posts.

# In[1]:

import csv
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
import fasttext

# Download the annotated dataset as a semicolon-separated CSV from [https://osf.io/zvw5g/download](https://osf.io/zvw5g/download) (66.1 MB file size).

# In[2]:

df = pd.read_csv("blog_texts_and_cognitive_function.csv", sep=";", index_col=0)
df.head(3)

# In[5]:

df.info()

# In[3]:

df.base_function.value_counts()

# Let's see, crudely, whether blog writers of a certain class write longer or shorter texts on average.

# In[11]:

df["text_len"] = df.text.apply(lambda x: len(x.split()))  # number of whitespace-separated tokens per post
df.groupby("base_function").mean()

# Let's try to predict the four base cognitive functions. We need to prepare the labels to suit fastText's formatting, where each label is prefixed with "__label__".

# In[76]:

dataset = df[["base_function", "text"]].copy()
dataset["label"] = df.base_function.apply(lambda x: "__label__" + x)
dataset.drop("base_function", axis=1, inplace=True)
dataset = dataset[["label", "text"]]
dataset.head(3)

# In[37]:

dataset.tail(3)

# Now let's split the dataset into two parts: 80 per cent for training and 20 per cent for evaluation.

# In[7]:

train, test = train_test_split(dataset, test_size=0.2)
print("Rows in training data: {}".format(len(train)))
print("Rows in test data: {}".format(len(test)))

# Now we write training and evaluation data to two separate text files, with each row containing the label and the text according to fastText's formatting standards.

# In[18]:

train.to_csv(r'jung_training.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test.to_csv(r'jung_evaluation.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

# Now we can train our model with the default settings and no text preprocessing to get an initial baseline.

# In[81]:

classifier1 = fasttext.supervised("jung_training.txt", "model_jung_default")

# Then we can evaluate the model using our test data.

# In[82]:

result = classifier1.test("jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# The results are only slightly better than pure chance (0.415).
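# For reference, that 0.415 chance level is simply the relative frequency of the largest
# class. A minimal sketch to verify it from the dataframe loaded above; this is just a
# sanity check on the baseline, not part of the original analysis.

class_counts = df.base_function.value_counts()
majority_baseline = class_counts.max() / class_counts.sum()  # precision of always guessing the biggest class
print("Largest class: {} ({} of {} rows)".format(class_counts.idxmax(), class_counts.max(), class_counts.sum()))
print("Majority-class baseline: {:.3f}".format(majority_baseline))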
# Let's see if we can improve the model with some crude preprocessing of the texts: removing non-alphanumeric characters and lowercasing all words.

# In[75]:

processed = dataset.copy()
processed["text"] = processed.text.str.replace(r"[\W ]", " ")  # replace every character that is not a word character (a-z, A-Z, 0-9 or _) with a space
processed["text"] = processed.text.str.lower()                 # make all characters lower case
processed["text"] = processed.text.str.replace(r' +', ' ')     # collapse multiple spaces into one
processed["text"] = processed.text.str.replace(r'^ +', '')     # remove resulting leading spaces
processed.head(3)

# Then we create training and evaluation data from the processed dataframe and store them in two new files with the prefix "processed_".

# In[77]:

train, test = train_test_split(processed, test_size=0.2)
print("Rows in training data: {}".format(len(train)))
print("Rows in test data: {}".format(len(test)))

train.to_csv(r'processed_jung_training.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test.to_csv(r'processed_jung_evaluation.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

# And re-run the training and evaluation.

# In[84]:

classifier2 = fasttext.supervised("processed_jung_training.txt", "model_jung_preprocessed")
result = classifier2.test("processed_jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# The results are even worse now. Apparently capital letters and special characters are features that help distinguish between the labels, so let's keep the original training data for further training and tuning.
#
# What happens if we increase the number of epochs from the default 5 to 25?

# In[90]:

classifier3 = fasttext.supervised("jung_training.txt", "model_jung_default_25epochs", epoch=25)
result = classifier3.test("jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# The results actually deteriorate from 0.422 to 0.355.

# What happens if we increase the learning rate from the default 0.05 to 1?

# In[89]:

classifier4 = fasttext.supervised("jung_training.txt", "model_jung_default_lr0.5", lr=1)
result = classifier4.test("jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# A minuscule improvement from 0.422 to 0.423.

# What happens if we use word n-grams of 2?
#
# This makes my kernel crash in Jupyter Notebook, so the cell is kept commented out here:
#
#     classifier5 = fasttext.supervised("jung_training.txt", "model_jung_default_ngrams2", word_ngrams=2)
#     result = classifier5.test("jung_evaluation.txt")
#     print('P@1:', result.precision)
#     print('R@1:', result.recall)
#     print('Number of examples:', result.nexamples)
#
# Instead I download the compiled fastText from https://github.com/facebookresearch/fastText and run it in the terminal:
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../jung_training.txt -output ../model_jung_default_ngrams2 -wordNgrams 2
#     Read 9M words
#     Number of words:  610075
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 1017869 lr: 0.000000 loss: 1.301996 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_default_ngrams2.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.423
#     R@1  0.423

# With word n-grams set to 2 we get 0.423, the same result as when we increased the learning rate to 1.
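# If we want to keep these command-line runs reproducible from Python, here is a minimal
# sketch of a helper that shells out to the compiled binary via subprocess. The binary
# location and the helper name are assumptions (adjust FASTTEXT_BIN to wherever
# fastText-0.2.0 was unpacked); only CLI flags already used above are passed through.

import subprocess

FASTTEXT_BIN = "./fastText-0.2.0/fasttext"  # assumed path to the compiled binary

def train_and_test_cli(train_file, model_prefix, test_file, **options):
    """Train a supervised fastText model via the CLI and print its P@1/R@1 on test_file."""
    cmd = [FASTTEXT_BIN, "supervised", "-input", train_file, "-output", model_prefix]
    for name, value in options.items():  # e.g. wordNgrams=2, dim=300
        cmd += ["-{}".format(name), str(value)]
    subprocess.run(cmd, check=True)
    subprocess.run([FASTTEXT_BIN, "test", model_prefix + ".bin", test_file], check=True)

# Example: the word n-gram experiment above.
# train_and_test_cli("jung_training.txt", "model_jung_default_ngrams2",
#                    "jung_evaluation.txt", wordNgrams=2)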
# What if we use pre-trained vectors when building the classifier? They can be downloaded from [fasttext.cc](https://fasttext.cc/docs/en/english-vectors.html).
#
# First we use the smallest vector file. Note that we have to increase the number of dimensions used when training from the default 100 to 300 to match the vector file.
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../jung_training.txt -output ../model_jung_default_wiki-news-300d-1M -dim 300 -pretrainedVectors wiki-news-300d-1M.vec
#     Read 9M words
#     Number of words:  610075
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 648220 lr: 0.000000 loss: 1.290941 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_default_wiki-news-300d-1M.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.417
#     R@1  0.417

# Then we try it with the largest vector file, which also includes subword information.
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../jung_training.txt -output ../model_jung_default_crawl-300d-2M-subword -dim 300 -pretrainedVectors ./crawl-300d-2M-subword/crawl-300d-2M-subword.vec
#     Read 9M words
#     Number of words:  610075
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 947173 lr: 0.000000 loss: 1.288895 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_default_crawl-300d-2M-subword.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.419
#     R@1  0.419

# The results improve by a mere 0.002.
#
# Just in case, we also train on the preprocessed texts again, using the largest pre-trained vectors.
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../processed_jung_training.txt -output ../model_jung_processed_crawl-300d-2M-subword -dim 300 -verbose 1 -pretrainedVectors ./crawl-300d-2M-subword/crawl-300d-2M-subword.vec
#     Read 8M words
#     Number of words:  261935
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 1049889 lr: 0.000000 loss: 1.286178 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_processed_crawl-300d-2M-subword.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.425
#     R@1  0.425

# Now we get the best result so far: a precision of 0.425, against a majority-class baseline of 0.415 (the largest class accounts for n = 9380 of the 22588 rows in the original dataset). That is only about 2.4 per cent better than chance, so it says very little about how predictable a blog author's Jungian cognitive function is from their writing style.
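# Finally, a minimal sketch of how one of the trained classifiers could be used to label a
# new, unseen blog text. It relies on the predict() method of the old fasttext package used
# above, which is assumed to take a list of strings and return one list of top-k labels per
# text; the sample sentence is made up purely for illustration.

sample_text = "I spent the whole weekend planning next year's garden down to the last detail."
predicted_labels = classifier1.predict([sample_text], k=1)  # top-1 predicted base function
print(predicted_labels)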