# Finding Synonyms and Analogies

## Using Pre-trained Word Vectors

In [1]:
from mxnet import nd
from mxnet.contrib import text

# List the families of pre-trained embeddings bundled with mxnet.contrib.text;
# the output below shows 'glove' and 'fasttext' are available.
text.embedding.get_pretrained_file_names().keys()

Out[1]:
dict_keys(['glove', 'fasttext'])

List all of the pre-trained files available for the GloVe model.

In [2]:
# Show every pre-trained GloVe file that can be downloaded.
print(text.embedding.get_pretrained_file_names('glove'))

['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.840B.300d.txt', 'glove.twitter.27B.25d.txt', 'glove.twitter.27B.50d.txt', 'glove.twitter.27B.100d.txt', 'glove.twitter.27B.200d.txt']


Grab a particular model.

In [3]:
# Download (on first use) and load the 50-dimensional GloVe vectors
# trained on a 6-billion-token corpus.
glove_6b50d = text.embedding.create(
'glove', pretrained_file_name='glove.6B.50d.txt')


Print the dictionary size.

In [4]:
# Dictionary size — 400,001 per the output below (presumably 400,000 words
# plus one special/unknown token; confirm against the embedding's docs).
len(glove_6b50d)

Out[4]:
400001

We can use a word to get its index in the dictionary, or we can get the word from its index.

In [5]:
# Map a token to its dictionary index, and an index back to its token.
glove_6b50d.token_to_idx['beautiful'], glove_6b50d.idx_to_token[3367]

Out[5]:
(3367, 'beautiful')

## Applying Pre-trained Word Vectors

### Finding Synonyms

In [6]:
def knn(W, x, k):
    """Return the indices and cosine similarities of the k rows of W
    closest to the query vector x.

    W: embedding matrix, one vector per row.
    x: query vector (any shape that flattens to W's row width).
    k: number of nearest neighbours to return.

    Returns (topk, sims): an int32 numpy array of row indices and a list
    of the corresponding cosine similarities as Python floats.
    """
    # The added 1e-9 is for numerical stability: it guards against division
    # by zero both for all-zero rows of W and for an all-zero query x
    # (e.g. an out-of-vocabulary token, which get_vecs_by_tokens maps to
    # the zero vector) — the original only protected the W norms.
    cos = nd.dot(W, x.reshape((-1,))) / (
        (nd.sum(W * W, axis=1) + 1e-9).sqrt() *
        (nd.sum(x * x) + 1e-9).sqrt())
    topk = nd.topk(cos, k=k, ret_typ='indices').asnumpy().astype('int32')
    return topk, [cos[i].asscalar() for i in topk]


Then, we search for synonyms using the pre-trained word vector instance `embed`.

In [7]:
def get_similar_tokens(query_token, k, embed):
    """Print the k tokens in `embed` most similar to `query_token`,
    with their cosine similarities."""
    query_vec = embed.get_vecs_by_tokens([query_token])
    # Request k+1 neighbours: the nearest one is the query token itself,
    # which we then drop before printing.
    indices, sims = knn(embed.idx_to_vec, query_vec, k + 1)
    for idx, sim in list(zip(indices, sims))[1:]:
        print('cosine sim=%.3f: %s' % (sim, (embed.idx_to_token[idx])))


Search in 400,000 words.

In [18]:
# Nearest neighbours of 'amazon' among the 400,000-word vocabulary.
get_similar_tokens('amazon', 3, glove_6b50d)

cosine sim=0.663: unbox
cosine sim=0.653: amazon.com
cosine sim=0.647: palm

In [9]:
# Nearest neighbours of 'baby'.
get_similar_tokens('baby', 3, glove_6b50d)

cosine sim=0.839: babies
cosine sim=0.800: boy
cosine sim=0.792: girl

In [10]:
# Nearest neighbours of 'beautiful'.
get_similar_tokens('beautiful', 3, glove_6b50d)

cosine sim=0.921: lovely
cosine sim=0.893: gorgeous
cosine sim=0.830: wonderful


### Finding Analogies

In [11]:
def get_analogy(token_a, token_b, token_c, embed):
    """Complete the analogy a : b :: c : ? by vector arithmetic and
    return the answer token."""
    abc_vecs = embed.get_vecs_by_tokens([token_a, token_b, token_c])
    # vec(b) - vec(a) + vec(c) should lie near the answer's vector.
    query = abc_vecs[1] - abc_vecs[0] + abc_vecs[2]
    best, _ = knn(embed.idx_to_vec, query, 1)
    return embed.idx_to_token[best[0]]

In [19]:
# man : woman :: boy : ?  (expected output below: 'girl')
get_analogy('man', 'woman', 'boy', glove_6b50d)

Out[19]:
'girl'
In [21]:
# capital-of analogy: china : beijing :: japan : ?
get_analogy('china', 'beijing', 'japan', glove_6b50d)

Out[21]:
'tokyo'
In [23]:
# superlative analogy: bad : worst :: nice : ?
# (output below is 'place', not the expected 'nicest' — the 50-d vectors
# apparently miss this one)
get_analogy('bad', 'worst', 'nice', glove_6b50d)

Out[23]:
'place'
In [15]:
# past-tense analogy: do : did :: go : ?
get_analogy('do', 'did', 'go', glove_6b50d)

Out[15]:
'went'