from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow.python.client import device_lib
%matplotlib inline
local_device_protos = device_lib.list_local_devices()
[x.name for x in local_device_protos if x.device_type == 'GPU']
['/gpu:0']
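Since the graph below pins ops to `/gpu:0`, it can help to pick the device string dynamically from the list above. A minimal sketch of our own (the `device_name` variable is not part of the original notebook, which hard-codes `'/gpu:0'`):

# Choose a device string for tf.device() below: the first visible GPU, or the
# CPU when none is available. (Helper of our own, not in the original notebook.)
gpu_names = [x.name for x in local_device_protos if x.device_type == 'GPU']
device_name = gpu_names[0] if gpu_names else '/cpu:0'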
# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urllib.request.urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename
filename = maybe_download('text8.zip', 31344016)
Found and verified text8.zip
# Read the data into a list of strings.
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data
words = read_data(filename)
print('Data size', len(words))
Data size 17005207
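As a quick sanity check (our own addition, not in the original notebook), peek at the first few tokens; text8 is a single stream of lowercase words:

print(words[:8])
# -> ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']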
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5239, 3082, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
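To make the two mappings concrete, a small lookup sketch of our own, using only values visible in the output above (id 0 is reserved for UNK, so frequent words get the smallest real ids):

print(dictionary['the'])         # -> 1, the most frequent real word
print(reverse_dictionary[5239])  # -> 'anarchism'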
data_index = 0
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
3082 originated -> 12 as
3082 originated -> 5239 anarchism
12 as -> 6 a
12 as -> 3082 originated
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of
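The same generator supports wider contexts: with skip_window=2 each center word has four context positions, so num_skips can be as large as 4. A quick sketch of our own (it mutates the module-level data_index, so rewind afterwards):

data_index = 0  # rewind so the demo batch starts at the beginning of the corpus
batch, labels = generate_batch(batch_size=8, num_skips=4, skip_window=2)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
data_index = 0  # rewind again before training below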
# Step 4: Build and train a skip-gram model.
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64 # Number of negative examples to sample.
graph = tf.Graph()
with graph.as_default():
  # Input data.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  # Pin the embedding variables to the GPU. (The canonical tutorial pins these
  # to the CPU because some of the ops involved lacked GPU kernels;
  # allow_soft_placement in the session config below lets any such op fall
  # back to the CPU.)
  with tf.device('/gpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    # Construct the variables for the NCE loss.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels
  # each time we evaluate the loss.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))
  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
  # Compute the cosine similarity between minibatch examples and all embeddings.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)
  # Add variable initializer.
  init = tf.global_variables_initializer()
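NCE is not the only sampled objective: tf.nn.sampled_softmax_loss takes the same arguments and often yields similar embeddings. A hedged sketch of our own (not part of the original tutorial; `loss_ss` would replace the `loss = ...` line above):

with graph.as_default():
  # Alternative objective: sampled softmax over the same weight/bias variables.
  loss_ss = tf.reduce_mean(
      tf.nn.sampled_softmax_loss(weights=nce_weights,
                                 biases=nce_biases,
                                 labels=train_labels,
                                 inputs=embed,
                                 num_sampled=num_sampled,
                                 num_classes=vocabulary_size))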
num_steps = 100001
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
                graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print("Initialized")
  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
    # We perform one update step by evaluating the optimizer op (including
    # it in the list of returned values of session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val
    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0
    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = "Nearest to %s:" % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)
  final_embeddings = normalized_embeddings.eval()
Initialized
Average loss at step 0 : 259.940246582
Nearest to that: tremblay, utilizing, enchanted, dismantle, dinosaur, exception, priors, myoglobin,
Nearest to may: orchard, meyers, leno, druidry, peritoneum, usable, pauses, roleplaying,
Nearest to its: results, scalia, vigour, adonijah, berman, foibles, humber, approximation,
Nearest to more: buckyballs, botanical, mitosis, hendrik, linemen, win, primitives, superpowers,
Nearest to UNK: ooze, aafc, ntfs, gretzky, highschool, partitioning, donelson, schoolmaster,
Nearest to they: steering, congreso, darker, provisions, yerushalayim, fisc, realm, resonance,
Nearest to d: wainwright, toonopedia, tinbergen, shamans, imitate, dagestan, cabinda, adversus,
Nearest to during: romagna, motions, zigzag, swanson, tsim, kaplan, widest, rupp,
Nearest to often: offshoot, snacks, corvinus, elected, overdose, fremen, ducktales, mohenjo,
Nearest to between: wladislaus, propto, quest, nikaya, unconnected, branching, ailey, bats,
Nearest to state: lazarus, uncountably, coupon, ect, circe, gabonese, louisville, practise,
Nearest to for: ikea, domes, fpss, amazonas, atoll, expresses, microscopic, buddhist,
Nearest to see: thera, gallardo, periodic, pyrenees, burner, crtc, midrashic, intrigued,
Nearest to no: matured, suprarenal, lazarus, line, bronx, eyepiece, beggar, forgive,
Nearest to such: homily, seventies, whore, candu, sla, aleinu, mov, wednesday,
Nearest to after: recognizing, concurrency, kenya, nebulous, parish, gresham, doe, wiccans,
Average loss at step 2000 : 113.381620845
Average loss at step 4000 : 52.7048133955
Average loss at step 6000 : 33.4920249097
Average loss at step 8000 : 23.6978426812
Average loss at step 10000 : 17.7604874439
Nearest to that: and, archie, didn, ry, enchanted, released, trouble, fins,
Nearest to may: pharaohs, usually, victoria, zero, probably, nine, malignancies, vs,
Nearest to its: the, surrendered, approximation, berman, zermelo, his, results, address,
Nearest to more: appointed, linemen, win, hotel, put, zone, mathbf, botanical,
Nearest to UNK: one, and, mathbf, linebackers, archie, the, a, agave,
Nearest to they: he, realm, strains, aeneas, zero, it, clan, settle,
Nearest to d: toonopedia, reported, cabinda, ads, clarke, propaganda, bills, civilization,
Nearest to during: motions, reginae, newly, all, sheridan, bn, parks, festivals,
Nearest to often: inspiration, languages, elected, casualties, ve, graph, mystery, cyanide,
Nearest to between: quest, subject, effect, ufo, supplies, quantum, proving, onto,
Nearest to state: practise, introduced, objections, aires, gabonese, agave, wire, sherman,
Nearest to for: in, of, with, and, to, antimatter, structural, from,
Nearest to see: six, periodic, outsider, well, finalist, pyrenees, syphilis, garlic,
Nearest to no: line, cardinality, class, dim, reason, beliefs, massachusetts, again,
Nearest to such: contribution, jung, archie, homily, religion, word, senses, seventies,
Nearest to after: from, sets, kenya, hoare, emergency, constantinople, in, bilingual,
Average loss at step 12000 : 14.0305621388
Average loss at step 14000 : 11.6425184925
Average loss at step 16000 : 10.0025892993
Average loss at step 18000 : 8.43416487277
Average loss at step 20000 : 8.01691550207
Nearest to that: and, which, operatorname, anchoring, not, agouti, aoc, impressions,
Nearest to may: victoria, usually, scriptures, peritoneum, circ, pharaohs, to, would,
Nearest to its: the, his, their, origen, ignatius, agouti, marry, tyrant,
Nearest to more: appointed, primitives, agouti, frying, win, linemen, zone, explodes,
Nearest to UNK: agouti, dasyprocta, and, one, operatorname, linebackers, archie, three,
Nearest to they: he, it, realm, dasyprocta, there, who, strains, settle,
Nearest to d: and, toonopedia, sargon, one, clarke, propaganda, ads, m,
Nearest to during: motions, and, in, dasyprocta, reginae, newly, sheridan, all,
Nearest to often: antoninus, operatorname, inspiration, and, languages, hello, graph, ve,
Nearest to between: of, for, agouti, quest, ufo, quantum, proving, subject,
Nearest to state: apatosaurus, practise, agouti, dasyprocta, funnel, aires, operatorname, hijacking,
Nearest to for: in, of, with, and, from, agouti, as, to,
Nearest to see: six, periodic, pyrenees, waite, outsider, operatorname, burner, is,
Nearest to no: libby, not, anchoring, line, it, agouti, this, shirkuh,
Nearest to such: homily, contribution, senses, farmed, jung, seventies, tempe, religion,
Nearest to after: from, sets, constantinople, hoare, kenya, in, for, nicobar,
Average loss at step 22000 : 6.99141346753
Average loss at step 24000 : 6.90114425349
Average loss at step 26000 : 6.79805140388
Average loss at step 28000 : 6.31811893368
Average loss at step 30000 : 5.93085589266
Nearest to that: which, tonnage, operatorname, agouti, anchoring, trapezohedron, aoc, also,
Nearest to may: can, would, could, usually, victoria, scriptures, circ, to,
Nearest to its: the, their, his, arin, agouti, marry, ignatius, origen,
Nearest to more: appointed, primitives, agouti, win, considered, linemen, frying, explodes,
Nearest to UNK: agouti, dasyprocta, operatorname, bcl, four, three, archie, two,
Nearest to they: he, it, there, who, realm, settle, pear, not,
Nearest to d: b, toonopedia, sargon, and, american, clarke, m, r,
Nearest to during: in, motions, and, dasyprocta, sheridan, reginae, newly, all,
Nearest to often: antoninus, operatorname, inspiration, now, zero, languages, graph, it,
Nearest to between: for, with, agouti, ufo, cordoba, over, seven, quest,
Nearest to state: apatosaurus, practise, abet, agouti, hijacking, funnel, gabonese, af,
Nearest to for: with, in, and, of, from, agouti, to, heinz,
Nearest to see: burner, six, periodic, hodge, pyrenees, waite, is, adapa,
Nearest to no: it, suprarenal, not, anchoring, libby, agouti, this, line,
Nearest to such: homily, tempe, farmed, jung, well, senses, contribution, textures,
Nearest to after: from, with, in, for, hoare, kenya, agouti, sets,
Average loss at step 32000 : 5.9782226367
Average loss at step 34000 : 5.69418454897
Average loss at step 36000 : 5.76266445196
Average loss at step 38000 : 5.48095271897
Average loss at step 40000 : 5.27526280099
Nearest to that: which, this, operatorname, but, agouti, tonnage, however, it,
Nearest to may: can, would, could, will, scriptures, zero, victoria, usually,
Nearest to its: their, his, the, agouti, arin, a, reconstruction, replicating,
Nearest to more: appointed, primitives, contended, considered, agouti, most, recycling, win,
Nearest to UNK: agouti, dasyprocta, four, operatorname, three, vma, abandonware, seven,
Nearest to they: he, it, there, who, not, you, settle, realm,
Nearest to d: b, toonopedia, m, sargon, r, prequel, clarke, eight,
Nearest to during: in, motions, romagna, dasyprocta, and, newly, sheridan, reginae,
Nearest to often: antoninus, now, also, operatorname, usually, inspiration, zero, barney,
Nearest to between: in, agouti, with, over, cordoba, ufo, about, for,
Nearest to state: apatosaurus, abet, agouti, practise, recitative, dasyprocta, hijacking, gabonese,
Nearest to for: of, with, heinz, in, or, from, and, operatorname,
Nearest to see: burner, periodic, hodge, waite, pyrenees, outsider, merrill, adapa,
Nearest to no: it, libby, matured, anchoring, suprarenal, a, not, agouti,
Nearest to such: well, homily, tempe, textures, known, senses, jung, farmed,
Nearest to after: from, in, before, with, three, agouti, hoare, five,
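With final_embeddings in hand, you can query neighbors for any word without the graph. Because the rows are L2-normalized, a dot product is exactly the cosine similarity; a small NumPy sketch of our own:

def nearest_words(word, k=8):
  """Return the k nearest words to `word` by cosine similarity.
  (Helper of our own, not part of the original notebook.)"""
  vec = final_embeddings[dictionary[word]]
  sims = final_embeddings.dot(vec)       # cosine similarity, rows are unit-norm
  order = (-sims).argsort()[1:k + 1]     # skip the word itself
  return [reverse_dictionary[i] for i in order]

print(nearest_words('three'))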
import matplotlib.pyplot as plt

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
  assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
  plt.figure(figsize=(18, 18))  # in inches
  for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
  plt.savefig(filename)
from sklearn.manifold import TSNE
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
plot_with_labels(low_dim_embs, labels)
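If you want to reuse the vectors outside this notebook, saving the matrix plus the id-to-word mapping is enough to reload them later. A minimal sketch (the filenames are our own, not from the tutorial):

# Persist the embedding matrix and the vocabulary, one word per line in id order.
np.save('embeddings.npy', final_embeddings)
with open('vocab.txt', 'w') as f:
  for i in xrange(vocabulary_size):
    f.write(reverse_dictionary[i] + '\n')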