# Importing Packages
import codecs
import os
import re
import time
import gensim
import pandas as pd
import glob
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%pylab inline
Populating the interactive namespace from numpy and matplotlib
/usr/local/lib/python2.7/dist-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['seed'] `%matplotlib` prevents importing * from pylab and numpy "\n`%matplotlib` prevents importing * from pylab and numpy"
# Books present
books = sorted(glob.glob("data/harrypotter/*.txt"))
print "Available Books: \n"
for i in books:
print i.split("/")[2].split("_")[0]
Available Books: Book 1 - The Philosopher's Stone.txt Book 2 - The Chamber of Secrets.txt Book 3 - The Prisoner of Azkaban.txt Book 4 - The Goblet of Fire.txt Book 5 - The Order of the Phoenix.txt Book 6 - The Half Blood Prince.txt Book 7 - The Deathly Hallows.txt
# Read data from all books to single corpus variable
temp = ""
t = ""
chars = []
for book in books:
print
print "Reading " + str(book).split("/")[2].split("_")[0]
with codecs.open(book, "rb", "utf-8") as infile:
temp += infile.read()
chars.append(len(temp))
print "Characters read so far " + str(len(temp))
Reading Book 1 - The Philosopher's Stone.txt Characters read so far 474429 Reading Book 2 - The Chamber of Secrets.txt Characters read so far 1006137 Reading Book 3 - The Prisoner of Azkaban.txt Characters read so far 1683115 Reading Book 4 - The Goblet of Fire.txt Characters read so far 2870365 Reading Book 5 - The Order of the Phoenix.txt Characters read so far 4479128 Reading Book 6 - The Half Blood Prince.txt Characters read so far 5538150 Reading Book 7 - The Deathly Hallows.txt Characters read so far 6765174
lens = []
lens.append(chars[0])
for i in xrange(1, len(chars)):
lens.append(chars[i] - chars[i-1])
lens
[474429, 531708, 676978, 1187250, 1608763, 1059022, 1227024]
y = lens
N = len(y)
x = [i+1 for i in range(N)]
width = 1/1.5
pylab.xlabel("Book")
pylab.ylabel("Length")
plt.bar(x, y, width, color="red", align='center')
<Container object of 7 artists>
# Split into sentences
sentences = nltk.tokenize.sent_tokenize(temp)
print "Total Sentences are " + str(len(sentences))
Total Sentences are 63914
# sentences to list of words
sent_words = []
total_tokens = 0
for raw_sent in sentences:
clean = nltk.word_tokenize(re.sub("[^a-zA-Z]"," ", raw_sent.strip().lower()))
tokens = [i for i in clean if len(i) > 1]
total_tokens += len(tokens)
sent_words.append(tokens)
print "Total tokens are " + str(total_tokens)
Total tokens are 1103615
# capture collocations
bigram = gensim.models.Phrases(sent_words)
final = []
for s in sent_words:
processed_sent = bigram[s]
final.append(processed_sent)
# Sample first two sentences
final[:2]
[[u'the', u'boy', u'who_lived', u'mr', u'and', u'mrs_dursley', u'of', u'number_four', u'privet_drive', u'were', u'proud', u'to', u'say', u'that', u'they', u'were', u'perfectly', u'normal', u'thank_you', u'very_much'], [u'they', u'were', u'the', u'last', u'people', u'you', u'expect', u'to', u'be', u'involved', u'in', u'anything', u'strange', u'or', u'mysterious', u'because', u'they', u'just', u'didn', u'hold', u'with', u'such', u'nonsense']]
We are now ready to train our word embeddings over all Harry Potter books. One thing that we need to decide upon is the model parameters. We will be using gensim's word2vec model implementation for training our model.
To know more about the parameters Click Here
num_features = 300
min_word_count = 3
num_workers = 3
context_size = 7
seed = 1
model = gensim.models.Word2Vec(sent_words, window=context_size, \
min_count=min_word_count, workers=num_workers, \
seed=seed, size=num_features
)
model.train(sent_words)
4119276
print 'Vocabulary ' + str(len(model.wv.vocab))
Vocabulary 11735
if not os.path.exists("model"):
os.makedirs("model")
model.save(os.path.join("model", "harry2vec.w2v"))
# words
print 'Similar kind of words for AZKABAN: '
print [i[0] for i in model.wv.most_similar('azkaban')]
print '\n'
print 'Similar kind of words for SNAPE: '
print [i[0] for i in model.wv.most_similar('snape')]
Similar kind of words for AZKABAN: [u'chamber', u'goblet', u'prisoner', u'hallows', u'secrets', u'deathly', u'order', u'philosophers', u'phoenix', u'prince'] Similar kind of words for SNAPE: [u'dumbledore', u'slughorn', u'quirrell', u'moody', u'lupin', u'karkaroff', u'voldemort', u'sirius', u'umbridge', u'flitwick']
start = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
all_word_vectors_matrix = model.wv.syn0
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)
stop = time.time() - start
print 'Time taken is ' + str(stop)
Time taken is 372.693608046
points = pd.DataFrame(
[
(word, coords[0], coords[1])
for word, coords in [
(word, all_word_vectors_matrix_2d[model.vocab[word].index])
for word in model.vocab
]
],
columns=["word", "x", "y"]
)
points.head(20)
word | x | y | |
---|---|---|---|
0 | raining | -0.815268 | -2.405247 |
1 | yellow | 3.388919 | 2.934252 |
2 | four | 2.455602 | 3.883873 |
3 | rocketing | -0.084980 | -5.673094 |
4 | woods | -3.480195 | -2.075241 |
5 | spiders | 4.297810 | 0.316103 |
6 | ornate | -1.020054 | 1.384538 |
7 | conjuring | -0.355317 | -3.486272 |
8 | aggression | 0.735922 | -4.838119 |
9 | marching | -3.540225 | -2.435823 |
10 | distractedly | -0.530218 | -0.837392 |
11 | crooned | 0.098604 | -7.877066 |
12 | unblocked | -0.499605 | -6.515115 |
13 | attracted | 1.028695 | -3.972730 |
14 | electricity | -0.581077 | -5.762300 |
15 | wizardry | 3.994892 | -6.683068 |
16 | sunlit | -2.941011 | -2.336507 |
17 | fingernails | -1.801686 | -3.226996 |
18 | tingle | 1.581573 | -6.702398 |
19 | sputter | -0.494680 | -6.612611 |
sns.set_context("poster")
points.plot.scatter("x", "y", s=10, figsize=(20, 12))
<matplotlib.axes._subplots.AxesSubplot at 0x7f0208229590>