#!/usr/bin/env python
# coding: utf-8

# Author: **Mattias Östmar**
#
# Date: **2019-03-14**
#
# Contact: **mattiasostmar at gmail dot com**
#
# Thanks to Mikael Huss for being a good speaking partner.
#
# In this notebook we're going to use the [python version of fasttext](https://pypi.org/project/fasttext/), based on [Facebook's fastText](https://github.com/facebookresearch/fastText) tool, to try to predict the [Jungian cognitive function](https://en.wikipedia.org/wiki/Jungian_cognitive_functions) of blog authors from their writing style as it appears in blog posts.

# In[1]:

import csv
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
import fasttext

# Download the annotated dataset as a semicolon-separated CSV from [https://osf.io/zvw5g/download](https://osf.io/zvw5g/download) (66.1 MB file size).

# In[2]:

df = pd.read_csv("blog_texts_and_cognitive_function.csv", sep=";", index_col=0)
df.head(3)

# In[5]:

df.info()

# In[3]:

df.base_function.value_counts()

# Let's see, crudely, whether blog writers of a certain class write longer or shorter texts on average.

# In[11]:

df["text_len"] = df.text.apply(lambda x: len(x.split()))  # number of whitespace-separated tokens per post
df.groupby("base_function").mean()

# Let's try to predict the four base cognitive functions. We need to prepare the labels to suit fastText's formatting, where each label is prefixed with "__label__".

# In[76]:

dataset = df[["base_function", "text"]].copy()
dataset["label"] = df.base_function.apply(lambda x: "__label__" + x)
dataset.drop("base_function", axis=1, inplace=True)
dataset = dataset[["label", "text"]]
dataset.head(3)

# In[37]:

dataset.tail(3)

# Now let's split the dataset into two parts: 80 per cent for training and 20 per cent for evaluation.

# In[7]:

train, test = train_test_split(dataset, test_size=0.2)
print("Rows in training data: {}".format(len(train)))
print("Rows in test data: {}".format(len(test)))

# Now we write training and evaluation data to two separate text files, with each row containing the label and the text according to fastText's formatting standards.

# In[18]:

train.to_csv(r'jung_training.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test.to_csv(r'jung_evaluation.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

# Now we can train our model with the default settings and no text preprocessing to get an initial baseline.

# In[81]:

classifier1 = fasttext.supervised("jung_training.txt", "model_jung_default")

# Then we can evaluate the model using our test data.

# In[82]:

result = classifier1.test("jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# The results are only slightly better than pure chance (0.415).
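# For reference, that 0.415 chance level is simply the relative frequency of the largest
# class. A minimal sketch to verify it from the dataframe loaded above; this is just a
# sanity check on the baseline, not part of the original analysis.

class_counts = df.base_function.value_counts()
majority_baseline = class_counts.max() / class_counts.sum()  # precision of always guessing the biggest class
print("Largest class: {} ({} of {} rows)".format(class_counts.idxmax(), class_counts.max(), class_counts.sum()))
print("Majority-class baseline: {:.3f}".format(majority_baseline))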
# Let's see if we can improve the model with some crude preprocessing of the texts: removing non-alphanumeric characters and lowercasing all words.

# In[75]:

processed = dataset.copy()
processed["text"] = processed.text.str.replace(r"[\W ]", " ")  # replace every character that is not a word character (a-z, A-Z, 0-9 or _) with a space
processed["text"] = processed.text.str.lower()                 # make all characters lower case
processed["text"] = processed.text.str.replace(r' +', ' ')     # collapse multiple spaces into one
processed["text"] = processed.text.str.replace(r'^ +', '')     # remove resulting leading spaces
processed.head(3)

# Then we create training and evaluation data from the processed dataframe and store them in two new files with the prefix "processed_".

# In[77]:

train, test = train_test_split(processed, test_size=0.2)
print("Rows in training data: {}".format(len(train)))
print("Rows in test data: {}".format(len(test)))

train.to_csv(r'processed_jung_training.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test.to_csv(r'processed_jung_evaluation.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

# And re-run the training and evaluation.

# In[84]:

classifier2 = fasttext.supervised("processed_jung_training.txt", "model_jung_preprocessed")
result = classifier2.test("processed_jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# The results are even worse now. Apparently capital letters and special characters are features that help distinguish between the labels, so let's keep the original training data for further training and tuning.
#
# What happens if we increase the number of epochs from the default 5 to 25?

# In[90]:

classifier3 = fasttext.supervised("jung_training.txt", "model_jung_default_25epochs", epoch=25)
result = classifier3.test("jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# The results actually deteriorate from 0.422 to 0.355.

# What happens if we increase the learning rate from the default 0.05 to 1?

# In[89]:

classifier4 = fasttext.supervised("jung_training.txt", "model_jung_default_lr0.5", lr=1)
result = classifier4.test("jung_evaluation.txt")
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# A minuscule improvement from 0.422 to 0.423.

# What happens if we use word n-grams of 2?
#
# This makes my kernel crash in Jupyter Notebook, so the cell is kept commented out here:
#
#     classifier5 = fasttext.supervised("jung_training.txt", "model_jung_default_ngrams2", word_ngrams=2)
#     result = classifier5.test("jung_evaluation.txt")
#     print('P@1:', result.precision)
#     print('R@1:', result.recall)
#     print('Number of examples:', result.nexamples)
#
# Instead I download the compiled fastText from https://github.com/facebookresearch/fastText and run it in the terminal:
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../jung_training.txt -output ../model_jung_default_ngrams2 -wordNgrams 2
#     Read 9M words
#     Number of words:  610075
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 1017869 lr: 0.000000 loss: 1.301996 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_default_ngrams2.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.423
#     R@1  0.423

# With word n-grams set to 2 we get 0.423, the same result as when we increased the learning rate to 1.
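# If we want to keep these command-line runs reproducible from Python, here is a minimal
# sketch of a helper that shells out to the compiled binary via subprocess. The binary
# location and the helper name are assumptions (adjust FASTTEXT_BIN to wherever
# fastText-0.2.0 was unpacked); only CLI flags already used above are passed through.

import subprocess

FASTTEXT_BIN = "./fastText-0.2.0/fasttext"  # assumed path to the compiled binary

def train_and_test_cli(train_file, model_prefix, test_file, **options):
    """Train a supervised fastText model via the CLI and print its P@1/R@1 on test_file."""
    cmd = [FASTTEXT_BIN, "supervised", "-input", train_file, "-output", model_prefix]
    for name, value in options.items():  # e.g. wordNgrams=2, dim=300
        cmd += ["-{}".format(name), str(value)]
    subprocess.run(cmd, check=True)
    subprocess.run([FASTTEXT_BIN, "test", model_prefix + ".bin", test_file], check=True)

# Example: the word n-gram experiment above.
# train_and_test_cli("jung_training.txt", "model_jung_default_ngrams2",
#                    "jung_evaluation.txt", wordNgrams=2)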
# What if we use pre-trained vectors when building the classifier? They can be downloaded from [fasttext.cc](https://fasttext.cc/docs/en/english-vectors.html).
#
# First we use the smallest vector file. Note that we have to increase the number of dimensions used when training from the default 100 to 300 to match the vector file.
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../jung_training.txt -output ../model_jung_default_wiki-news-300d-1M -dim 300 -pretrainedVectors wiki-news-300d-1M.vec
#     Read 9M words
#     Number of words:  610075
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 648220 lr: 0.000000 loss: 1.290941 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_default_wiki-news-300d-1M.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.417
#     R@1  0.417

# Then we try it with the largest vector file, which also includes subword information.
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../jung_training.txt -output ../model_jung_default_crawl-300d-2M-subword -dim 300 -pretrainedVectors ./crawl-300d-2M-subword/crawl-300d-2M-subword.vec
#     Read 9M words
#     Number of words:  610075
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 947173 lr: 0.000000 loss: 1.288895 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_default_crawl-300d-2M-subword.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.419
#     R@1  0.419

# The results improve by a mere 0.002.
#
# Just in case, we also train on the preprocessed texts again, using the largest pre-trained vectors.
#
#     fastText-0.2.0 $ ./fasttext supervised -input ../processed_jung_training.txt -output ../model_jung_processed_crawl-300d-2M-subword -dim 300 -verbose 1 -pretrainedVectors ./crawl-300d-2M-subword/crawl-300d-2M-subword.vec
#     Read 8M words
#     Number of words:  261935
#     Number of labels: 4
#     Progress: 100.0% words/sec/thread: 1049889 lr: 0.000000 loss: 1.286178 ETA: 0h 0m
#
#     fastText-0.2.0 $ ./fasttext test ../model_jung_processed_crawl-300d-2M-subword.bin ../jung_evaluation.txt
#     N    4518
#     P@1  0.425
#     R@1  0.425

# Now we get the best result so far: a precision of 0.425, against a majority-class baseline of 0.415 (the largest class accounts for n = 9380 of the 22588 rows in the original dataset). That is only about 2.4 per cent better than chance, so it says very little about how predictable a blog author's Jungian cognitive function is from their writing style.
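# Finally, a minimal sketch of how one of the trained classifiers could be used to label a
# new, unseen blog text. It relies on the predict() method of the old fasttext package used
# above, which is assumed to take a list of strings and return one list of top-k labels per
# text; the sample sentence is made up purely for illustration.

sample_text = "I spent the whole weekend planning next year's garden down to the last detail."
predicted_labels = classifier1.predict([sample_text], k=1)  # top-1 predicted base function
print(predicted_labels)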