from os.path import join
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import numpy as np
import scipy as sp
from sklearn.datasets import fetch_20newsgroups
%run ../src/load_data_utils.py
%run ../src/glove_2_word2vec.py
To set up word vectors, call
make prepare_embeddings in_file_name=glove.6B.100d.txt out_file_name=glove.6B.100d.w2v
(you can also change 100 to 50, 200 or 300).
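The conversion script (glove_2_word2vec.py / the prepare_embeddings target) isn't shown here; as a rough sketch of what it presumably does (an assumption, not the repo's actual code), gensim's bundled glove2word2vec helper can produce a file that KeyedVectors.load() can read later:
# Hypothetical sketch of the GloVe -> gensim conversion (assumption):
# prepend the "<vocab_size> <dim>" header that word2vec text format expects,
# then save a native gensim KeyedVectors object for fast loading.
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt')
KeyedVectors.load_word2vec_format('glove.6B.100d.w2v.txt').save('glove.6B.100d.w2v')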
__, DATA_DIR = get_env_vars(True)
VECTORS_DIR = join(DATA_DIR, 'glove.6B')
FILENAME_PREFIX = 'glove.6B.100d'
VECTORS_FILENAME_DIR = join(VECTORS_DIR, FILENAME_PREFIX + '.txt')
newsgroups_train = fetch_20newsgroups()
newsgroups_test = fetch_20newsgroups(subset='test')
model = gensim.models.KeyedVectors.load(str(join(VECTORS_DIR, FILENAME_PREFIX + '.w2v')))
Contents:
- [LSA of tf-idf vectors plot](#tsvd plot)
- [Aggregated embeddings plots](#aggregated plots)
- [Why averaging word embeddings is not a great idea](#document embeddings)
$king + woman - man \approx queen$ example
model.most_similar(positive=['king', 'woman'], negative=['man'])
[('queen', 0.7698541283607483), ('monarch', 0.6843380928039551), ('throne', 0.6755735874176025), ('daughter', 0.6594556570053101), ('princess', 0.6520534753799438), ('prince', 0.6517034769058228), ('elizabeth', 0.6464517712593079), ('mother', 0.6311717629432678), ('emperor', 0.6106470823287964), ('wife', 0.6098655462265015)]
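For reference, most_similar works roughly like the classic analogy objective: it unit-normalizes the input vectors, adds the positive ones, subtracts the negative ones, and ranks the rest of the vocabulary by cosine similarity to the result, i.e. approximately $\arg\max_{w} \cos(v_w,\ v_{king} + v_{woman} - v_{man})$ with the query words excluded.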
Each document is preprocessed by keeping only the first occurrence of every token and lowercasing the result. This will make retrieving weights for incorporating tf-idf into aggregated word embeddings easier.
from collections import OrderedDict
import re
TOKENIZING_PATTERN = r'(?u)\b\w\w+\b'
def preprocess_texts(sentences_list):
    # Tokenize, drop repeated tokens (keeping the first occurrence), then lowercase.
    return [
        ' '.join(OrderedDict.fromkeys(re.findall(TOKENIZING_PATTERN, sentence))).lower()
        for sentence in sentences_list
    ]
X_train_text = preprocess_texts(newsgroups_train['data'])
X_test_text = preprocess_texts(newsgroups_test['data'])
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern=TOKENIZING_PATTERN, min_df=2, max_df=0.025)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)
X_train.shape
(11314, 55564)
y_train = newsgroups_train['target']
y_test = newsgroups_test['target']
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_iter=50)
%time X_reduced = tsvd.fit_transform(X_train)
CPU times: user 7.88 s, sys: 13.5 s, total: 21.4 s Wall time: 6.93 s
Truncated SVD of rank $k$ is an algorithm that computes the top $k$ singular components of a matrix; it plays the same role as PCA, but works directly on large sparse matrices (without centering them) using iterative solvers.
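Concretely, for a rank-$k$ truncated SVD $X \approx U_k \Sigma_k V_k^{\top}$, the reduced representation of the documents is $X V_k = U_k \Sigma_k$ (here $k = 2$, TruncatedSVD's default n_components); applied to a tf-idf matrix this is latent semantic analysis (LSA).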
plt.figure(figsize=(16, 12))
plt.scatter(*X_reduced.T, c=y_train)
plt.show()
We'll try two aggregation strategies: the first simply averages the word vectors over the words in a document that happen to be in the model's vocabulary; the second also incorporates the weights from the tf-idf model.
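In symbols, for a document $s$ with tf-idf weights $t_w$, GloVe vectors $v_w$ and GloVe vocabulary $V$, the two encodings computed below are $\frac{1}{|s|} \sum_{w \in s \cap V} v_w$ and $\frac{1}{|s \cap V|} \sum_{w \in s \cap V} t_w v_w$; note that the plain mean divides by the full document length, while the weighted mean divides by the number of in-vocabulary words, mirroring the code below.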
def encode_sentence(glove_model, sent, weights=None):
    if weights is None:
        # Plain mean: divide by the full document length, even if some words are OOV.
        normalizing_factor = len(sent)
        word_vectors = (glove_model[w] for w in sent if glove_model.vocab.get(w))
    else:
        # Weighted mean: divide by the number of in-vocabulary words only.
        normalizing_factor = len([w for w in sent if glove_model.vocab.get(w)])
        word_vectors = (glove_model[w] * weights[i] for (i, w) in enumerate(sent) if glove_model.vocab.get(w))
    return sum(word_vectors) / normalizing_factor
%time X_glove_train = np.array([encode_sentence(model, s.split()) for s in X_train_text])
%time X_glove_test = np.array([encode_sentence(model, s.split()) for s in X_test_text])
CPU times: user 6.75 s, sys: 0 ns, total: 6.75 s Wall time: 6.75 s
CPU times: user 4.27 s, sys: 0 ns, total: 4.27 s Wall time: 4.27 s
from operator import itemgetter
from itertools import groupby
def encode_sentences(X_tfidf, tfidf):
    def get_nonzeros(v):
        # Nonzero tf-idf values of a sparse row, as a flat list.
        return v[v.nonzero()].tolist()[0]
    # inverse_transform returns, for each row, the terms with nonzero tf-idf,
    # in the same (feature-index) order as the nonzero values above.
    retrieved_sentences = tfidf.inverse_transform(X_tfidf)
    return np.array([
        encode_sentence(
            model,
            retrieved_sentences[i],
            weights=get_nonzeros(X_tfidf[i, :]))
        for i in range(X_tfidf.shape[0])])
%time X_glove_weighted_train = encode_sentences(X_train, tfidf)
%time X_glove_weighted_test = encode_sentences(X_test, tfidf)
CPU times: user 16.8 s, sys: 88 ms, total: 16.9 s Wall time: 16.9 s
CPU times: user 10.7 s, sys: 4 ms, total: 10.7 s Wall time: 10.7 s
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
%time X_glove_weighted_train_pca = pca.fit_transform(X_glove_weighted_train)
%time X_glove_train_pca = pca.fit_transform(X_glove_train)
CPU times: user 156 ms, sys: 192 ms, total: 348 ms Wall time: 137 ms
CPU times: user 140 ms, sys: 264 ms, total: 404 ms Wall time: 141 ms
plt.figure(figsize=(16, 12))
plt.title('Mean averaged word embeddings')
plt.scatter(*X_glove_train_pca.T, c=y_train)
plt.show()
plt.figure(figsize=(16, 12))
plt.title('Tfidf weight-averaged word embeddings')
plt.scatter(*X_glove_weighted_train_pca.T, c=y_train)
plt.show()
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
sparse_lreg = SGDClassifier(n_iter=25, alpha=0.0001)
%time sparse_lreg.fit(X_train, y_train)
CPU times: user 2.3 s, sys: 0 ns, total: 2.3 s Wall time: 2.29 s
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=25, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)
Accuracy on tf-idf features
print('accuracy:', round(sparse_lreg.score(X_test, y_test), 3))
print('f1:', round(f1_score(y_test, sparse_lreg.predict(X_test), average='weighted'), 3))
accuracy: 0.846
f1: 0.844
from sklearn.preprocessing import StandardScaler
dense_lreg = SGDClassifier(n_iter=50, alpha=0.00005)
sscaler = StandardScaler()
X_glove_normalized_train = sscaler.fit_transform(X_glove_train)
%time dense_lreg.fit(X_glove_train, y_train)
CPU times: user 3.78 s, sys: 4 ms, total: 3.79 s Wall time: 3.79 s
SGDClassifier(alpha=5e-05, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)
Accuracy on mean-averaged word embeddings
print('accuracy:', round(dense_lreg.score(X_glove_test, y_test), 3))
print('f1:', round(f1_score(y_test, dense_lreg.predict(X_glove_test), average='weighted'), 3))
accuracy: 0.604
f1: 0.583
dense_lreg = SGDClassifier(n_iter=50, alpha=0.00001)
%time dense_lreg.fit(X_glove_weighted_train, y_train)
CPU times: user 3.93 s, sys: 8 ms, total: 3.94 s Wall time: 3.94 s
SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)
Accuracy on tf-idf weight-averaged word embeddings
print('accuracy:', round(dense_lreg.score(X_glove_weighted_test, y_test), 3))
print('f1:', round(f1_score(y_test, dense_lreg.predict(X_glove_weighted_test), average='weighted'), 3))
accuracy: 0.585
f1: 0.568
As we can see, there is a radical drop in performance. The next couple of examples illustrate why: averaging tends to wash out the less common, and therefore more informative, words.
n = 5
no_closest = 5
text_encodings = [X_glove_train[i, :] for i in range(n)]
text_encodings_weighted = [X_glove_weighted_train[i, :] for i in range(n)]
model.most_similar([text_encodings[0]], topn=5)
[('so', 0.9109116792678833), ('this', 0.9107068777084351), ('one', 0.9097325801849365), ('well', 0.9066551327705383), ('even', 0.9066197872161865)]
model.most_similar([text_encodings[1]], topn=5)
[('this', 0.8972651958465576), ('time', 0.8940219879150391), ('only', 0.8890316486358643), ('same', 0.8874689340591431), ('well', 0.8864277601242065)]
model.most_similar([text_encodings[2]], topn=5)
[('so', 0.9269064664840698), ('even', 0.9240497350692749), ('well', 0.9103280901908875), ('it', 0.9094198346138), ('this', 0.9068231582641602)]
Closest words to the mean-aggregated texts (above) and to the tf-idf weight-averaged texts (below)
model.most_similar([text_encodings_weighted[0]], topn=5)
[('today', 0.7099319696426392), ('well', 0.7080519199371338), ('.', 0.701697587966919), ('new', 0.7008072733879089), ('few', 0.6982711553573608)]
model.most_similar([text_encodings_weighted[1]], topn=5)
[('time', 0.80085289478302), ('this', 0.7895544767379761), ('same', 0.7890008091926575), ('only', 0.7835026979446411), ('one', 0.7550361156463623)]
model.most_similar([text_encodings_weighted[2]], topn=5)
[('just', 0.8935133218765259), ('so', 0.8860389590263367), ('going', 0.8701095581054688), ('even', 0.8676593899726868), ('only', 0.8574004173278809)]
First text
X_train_text[0]
'from lerxst wam umd edu where my thing subject what car is this nntp posting host rac3 organization university of maryland college park lines 15 was wondering if anyone out there could enlighten me on saw the other day it door sports looked to be from late 60s early 70s called bricklin the doors were really small in addition front bumper separate rest body this all know if can tellme model name engine specs years production made history or whatever info you have funky looking please mail thanks il brought by your neighborhood lerxst'
Words that get encoded by the tf-idf model, and whether they are in the GloVe model's vocabulary
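The code for this check isn't shown above; a minimal sketch of one way to do it for the first training document:
# Sketch: list the terms the tf-idf model kept for the first document and
# mark which of them are also in the GloVe vocabulary.
first_doc_terms = tfidf.inverse_transform(X_train[0, :])[0]
print([(w, w in model.vocab) for w in first_doc_terms])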
from gensim.summarization import summarize, keywords
txt = newsgroups_train['data'][0].replace('\n', '. ')
summarize(txt)
"From: lerxst@wam.umd.edu (where's my thing).\nIt was a 2-door sports car, looked to be from the late 60s/.\nhave on this funky looking car, please e-mail.."
print(keywords(txt))
specs park college looked looking
txt
"From: lerxst@wam.umd.edu (where's my thing). Subject: WHAT car is this!?. Nntp-Posting-Host: rac3.wam.umd.edu. Organization: University of Maryland, College Park. Lines: 15. . I was wondering if anyone out there could enlighten me on this car I saw. the other day. It was a 2-door sports car, looked to be from the late 60s/. early 70s. It was called a Bricklin. The doors were really small. In addition,. the front bumper was separate from the rest of the body. This is . all I know. If anyone can tellme a model name, engine specs, years. of production, where this car is made, history, or whatever info you. have on this funky looking car, please e-mail.. . Thanks,. - IL. ---- brought to you by your neighborhood Lerxst ----. . . . . "