from os.path import join
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import numpy as np
import scipy as sp
from sklearn.datasets import fetch_20newsgroups
%run ../src/load_data_utils.py
%run ../src/glove_2_word2vec.py
To set up word vectors, call
make prepare_embeddings in_file_name=glove.6B.100d.txt out_file_name=glove.6B.100d.w2v
(you can also change 100 to 50, 200 or 300).
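The conversion script (glove_2_word2vec.py / the prepare_embeddings target) isn't shown here; as a rough sketch of what it presumably does (an assumption, not the repo's actual code), gensim's bundled glove2word2vec helper can produce a file that KeyedVectors.load() can read later:
# Hypothetical sketch of the GloVe -> gensim conversion (assumption):
# prepend the "<vocab_size> <dim>" header that word2vec text format expects,
# then save a native gensim KeyedVectors object for fast loading.
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt')
KeyedVectors.load_word2vec_format('glove.6B.100d.w2v.txt').save('glove.6B.100d.w2v')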
__, DATA_DIR = get_env_vars(True)
VECTORS_DIR = join(DATA_DIR, 'glove.6B')
FILENAME_PREFIX = 'glove.6B.100d'
VECTORS_FILENAME_DIR = join(VECTORS_DIR, FILENAME_PREFIX + '.txt')
newsgroups_train = fetch_20newsgroups()
newsgroups_test = fetch_20newsgroups(subset='test')
model = gensim.models.KeyedVectors.load(str(join(VECTORS_DIR, FILENAME_PREFIX + '.w2v')))
Contents:
- [LSA of tf-idf vectors plot](#tsvd plot)
- [Aggregated embeddings plots](#aggregated plots)
- [Why averaging word embeddings is not a great idea](#document embeddings)
$king + woman - man \approx queen$ example
model.most_similar(positive=['king', 'woman'], negative=['man'])
[('queen', 0.7698541283607483), ('monarch', 0.6843380928039551), ('throne', 0.6755735874176025), ('daughter', 0.6594556570053101), ('princess', 0.6520534753799438), ('prince', 0.6517034769058228), ('elizabeth', 0.6464517712593079), ('mother', 0.6311717629432678), ('emperor', 0.6106470823287964), ('wife', 0.6098655462265015)]
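For reference, most_similar works roughly like the classic analogy objective: it unit-normalizes the input vectors, adds the positive ones, subtracts the negative ones, and ranks the rest of the vocabulary by cosine similarity to the result, i.e. approximately $\arg\max_{w} \cos(v_w,\ v_{king} + v_{woman} - v_{man})$ with the query words excluded.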
Each document is preprocessed by keeping only the first occurrence of every token and lowercasing the result. This will make retrieving weights for incorporating tf-idf into aggregated word embeddings easier.
from collections import OrderedDict
import re
TOKENIZING_PATTERN = r'(?u)\b\w\w+\b'
def preprocess_texts(sentences_list):
    # Tokenize, drop repeated tokens (keeping the first occurrence), then lowercase.
    return [
        ' '.join(OrderedDict.fromkeys(re.findall(TOKENIZING_PATTERN, sentence))).lower()
        for sentence in sentences_list
    ]
X_train_text = preprocess_texts(newsgroups_train['data'])
X_test_text = preprocess_texts(newsgroups_test['data'])
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern=TOKENIZING_PATTERN, min_df=2, max_df=0.025)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)
X_train.shape
(11314, 55564)
y_train = newsgroups_train['target']
y_test = newsgroups_test['target']
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_iter=50)
%time X_reduced = tsvd.fit_transform(X_train)
CPU times: user 7.88 s, sys: 13.5 s, total: 21.4 s Wall time: 6.93 s
Truncated SVD of rank $k$ is an algorithm that computes the top $k$ singular components of a matrix; it plays the same role as PCA, but works directly on large sparse matrices (without centering them) using iterative solvers.
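Concretely, for a rank-$k$ truncated SVD $X \approx U_k \Sigma_k V_k^{\top}$, the reduced representation of the documents is $X V_k = U_k \Sigma_k$ (here $k = 2$, TruncatedSVD's default n_components); applied to a tf-idf matrix this is latent semantic analysis (LSA).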
plt.figure(figsize=(16, 12))
plt.scatter(*X_reduced.T, c=y_train)
plt.show()
We'll try two aggregation strategies: the first simply averages the word vectors over the words in a document that happen to be in the model's vocabulary; the second also incorporates the weights from the tf-idf model.
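In symbols, for a document $s$ with tf-idf weights $t_w$, GloVe vectors $v_w$ and GloVe vocabulary $V$, the two encodings computed below are $\frac{1}{|s|} \sum_{w \in s \cap V} v_w$ and $\frac{1}{|s \cap V|} \sum_{w \in s \cap V} t_w v_w$; note that the plain mean divides by the full document length, while the weighted mean divides by the number of in-vocabulary words, mirroring the code below.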
def encode_sentence(glove_model, sent, weights=None):
    if weights is None:
        # Plain mean: divide by the full document length, even if some words are OOV.
        normalizing_factor = len(sent)
        word_vectors = (glove_model[w] for w in sent if glove_model.vocab.get(w))
    else:
        # Weighted mean: divide by the number of in-vocabulary words only.
        normalizing_factor = len([w for w in sent if glove_model.vocab.get(w)])
        word_vectors = (glove_model[w] * weights[i] for (i, w) in enumerate(sent) if glove_model.vocab.get(w))
    return sum(word_vectors) / normalizing_factor
%time X_glove_train = np.array([encode_sentence(model, s.split()) for s in X_train_text])
%time X_glove_test = np.array([encode_sentence(model, s.split()) for s in X_test_text])
CPU times: user 6.75 s, sys: 0 ns, total: 6.75 s Wall time: 6.75 s
CPU times: user 4.27 s, sys: 0 ns, total: 4.27 s Wall time: 4.27 s
from operator import itemgetter
from itertools import groupby
def encode_sentences(X_tfidf, tfidf):
    def get_nonzeros(v):
        # Nonzero tf-idf values of a sparse row, as a flat list.
        return v[v.nonzero()].tolist()[0]
    # inverse_transform returns, for each row, the terms with nonzero tf-idf,
    # in the same (feature-index) order as the nonzero values above.
    retrieved_sentences = tfidf.inverse_transform(X_tfidf)
    return np.array([
        encode_sentence(
            model,
            retrieved_sentences[i],
            weights=get_nonzeros(X_tfidf[i, :]))
        for i in range(X_tfidf.shape[0])])
%time X_glove_weighted_train = encode_sentences(X_train, tfidf)
%time X_glove_weighted_test = encode_sentences(X_test, tfidf)
CPU times: user 16.8 s, sys: 88 ms, total: 16.9 s Wall time: 16.9 s
CPU times: user 10.7 s, sys: 4 ms, total: 10.7 s Wall time: 10.7 s
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
%time X_glove_weighted_train_pca = pca.fit_transform(X_glove_weighted_train)
%time X_glove_train_pca = pca.fit_transform(X_glove_train)
CPU times: user 156 ms, sys: 192 ms, total: 348 ms Wall time: 137 ms
CPU times: user 140 ms, sys: 264 ms, total: 404 ms Wall time: 141 ms
plt.figure(figsize=(16, 12))
plt.title('Mean averaged word embeddings')
plt.scatter(*X_glove_train_pca.T, c=y_train)
plt.show()
plt.figure(figsize=(16, 12))
plt.title('Tfidf weight-averaged word embeddings')
plt.scatter(*X_glove_weighted_train_pca.T, c=y_train)
plt.show()
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
sparse_lreg = SGDClassifier(n_iter=25, alpha=0.0001)
%time sparse_lreg.fit(X_train, y_train)
CPU times: user 2.3 s, sys: 0 ns, total: 2.3 s Wall time: 2.29 s
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=25, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)
Accuracy on tf-idf features
print('accuracy:', round(sparse_lreg.score(X_test, y_test), 3))
print('f1:', round(f1_score(y_test, sparse_lreg.predict(X_test), average='weighted'), 3))
accuracy: 0.846
f1: 0.844
from sklearn.preprocessing import StandardScaler
dense_lreg = SGDClassifier(n_iter=50, alpha=0.00005)
sscaler = StandardScaler()
X_glove_normalized_train = sscaler.fit_transform(X_glove_train)
%time dense_lreg.fit(X_glove_train, y_train)
CPU times: user 3.78 s, sys: 4 ms, total: 3.79 s Wall time: 3.79 s
SGDClassifier(alpha=5e-05, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)
Accuracy on mean-averaged word embeddings
print('accuracy:', round(dense_lreg.score(X_glove_test, y_test), 3))
print('f1:', round(f1_score(y_test, dense_lreg.predict(X_glove_test), average='weighted'), 3))
accuracy: 0.604
f1: 0.583
dense_lreg = SGDClassifier(n_iter=50, alpha=0.00001)
%time dense_lreg.fit(X_glove_weighted_train, y_train)
CPU times: user 3.93 s, sys: 8 ms, total: 3.94 s Wall time: 3.94 s
SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)
Accuracy on tf-idf weight-averaged word embeddings
print('accuracy:', round(dense_lreg.score(X_glove_weighted_test, y_test), 3))
print('f1:', round(f1_score(y_test, dense_lreg.predict(X_glove_weighted_test), average='weighted'), 3))
accuracy: 0.585
f1: 0.568
As we can see, there is a radical drop in performance. The next couple of examples illustrate why: averaging tends to wash out the less common, and therefore more informative, words.
n = 5
no_closest = 5
text_encodings = [X_glove_train[i, :] for i in range(n)]
text_encodings_weighted = [X_glove_weighted_train[i, :] for i in range(n)]
model.most_similar([text_encodings[0]], topn=5)
[('so', 0.9109116792678833), ('this', 0.9107068777084351), ('one', 0.9097325801849365), ('well', 0.9066551327705383), ('even', 0.9066197872161865)]
model.most_similar([text_encodings[1]], topn=5)
[('this', 0.8972651958465576), ('time', 0.8940219879150391), ('only', 0.8890316486358643), ('same', 0.8874689340591431), ('well', 0.8864277601242065)]
model.most_similar([text_encodings[2]], topn=5)
[('so', 0.9269064664840698), ('even', 0.9240497350692749), ('well', 0.9103280901908875), ('it', 0.9094198346138), ('this', 0.9068231582641602)]
Closest words to the mean-aggregated texts (above) and to the tf-idf weight-averaged texts (below)
model.most_similar([text_encodings_weighted[0]], topn=5)
[('today', 0.7099319696426392), ('well', 0.7080519199371338), ('.', 0.701697587966919), ('new', 0.7008072733879089), ('few', 0.6982711553573608)]
model.most_similar([text_encodings_weighted[1]], topn=5)
[('time', 0.80085289478302), ('this', 0.7895544767379761), ('same', 0.7890008091926575), ('only', 0.7835026979446411), ('one', 0.7550361156463623)]
model.most_similar([text_encodings_weighted[2]], topn=5)
[('just', 0.8935133218765259), ('so', 0.8860389590263367), ('going', 0.8701095581054688), ('even', 0.8676593899726868), ('only', 0.8574004173278809)]
First text
X_train_text[0]
'from lerxst wam umd edu where my thing subject what car is this nntp posting host rac3 organization university of maryland college park lines 15 was wondering if anyone out there could enlighten me on saw the other day it door sports looked to be from late 60s early 70s called bricklin the doors were really small in addition front bumper separate rest body this all know if can tellme model name engine specs years production made history or whatever info you have funky looking please mail thanks il brought by your neighborhood lerxst'
Words that get encoded by the tf-idf model, and whether they are in the GloVe model's vocabulary
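The code for this check isn't shown above; a minimal sketch of one way to do it for the first training document:
# Sketch: list the terms the tf-idf model kept for the first document and
# mark which of them are also in the GloVe vocabulary.
first_doc_terms = tfidf.inverse_transform(X_train[0, :])[0]
print([(w, w in model.vocab) for w in first_doc_terms])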
from gensim.summarization import summarize, keywords
txt = newsgroups_train['data'][0].replace('\n', '. ')
summarize(txt)
"From: lerxst@wam.umd.edu (where's my thing).\nIt was a 2-door sports car, looked to be from the late 60s/.\nhave on this funky looking car, please e-mail.."
print(keywords(txt))
specs park college looked looking
txt
"From: lerxst@wam.umd.edu (where's my thing). Subject: WHAT car is this!?. Nntp-Posting-Host: rac3.wam.umd.edu. Organization: University of Maryland, College Park. Lines: 15. . I was wondering if anyone out there could enlighten me on this car I saw. the other day. It was a 2-door sports car, looked to be from the late 60s/. early 70s. It was called a Bricklin. The doors were really small. In addition,. the front bumper was separate from the rest of the body. This is . all I know. If anyone can tellme a model name, engine specs, years. of production, where this car is made, history, or whatever info you. have on this funky looking car, please e-mail.. . Thanks,. - IL. ---- brought to you by your neighborhood Lerxst ----. . . . . "