In [32]:
# Importing Packages
import codecs
import os
import re
import time
import gensim
import pandas as pd
import glob
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%pylab inline
Populating the interactive namespace from numpy and matplotlib
/usr/local/lib/python2.7/dist-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['seed']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
In [10]:
# Books present
books = sorted(glob.glob("data/harrypotter/*.txt"))

print("Available Books: \n")
for book_path in books:
    # os.path.basename is robust to the directory depth, unlike the
    # original split("/")[2]; split("_")[0] is kept to strip any
    # underscore-suffixed part of the filename, as before.
    print(os.path.basename(book_path).split("_")[0])
Available Books: 

Book 1 - The Philosopher's Stone.txt
Book 2 - The Chamber of Secrets.txt
Book 3 - The Prisoner of Azkaban.txt
Book 4 - The Goblet of Fire.txt
Book 5 - The Order of the Phoenix.txt
Book 6 - The Half Blood Prince.txt
Book 7 - The Deathly Hallows.txt
In [11]:
# Read data from all books into a single corpus variable.
# Collect per-book text in a list and join once at the end instead of
# repeated `temp += ...` concatenation (O(n^2) in total corpus size);
# the unused `t` variable from the original cell is dropped.
book_texts = []
chars = []        # cumulative character count after each book
total_chars = 0
for book in books:
    print("")
    print("Reading " + os.path.basename(book).split("_")[0])
    with codecs.open(book, "rb", "utf-8") as infile:
        text = infile.read()
    book_texts.append(text)
    total_chars += len(text)
    chars.append(total_chars)
    print("Characters read so far " + str(total_chars))
temp = "".join(book_texts)
Reading Book 1 - The Philosopher's Stone.txt
Characters read so far 474429

Reading Book 2 - The Chamber of Secrets.txt
Characters read so far 1006137

Reading Book 3 - The Prisoner of Azkaban.txt
Characters read so far 1683115

Reading Book 4 - The Goblet of Fire.txt
Characters read so far 2870365

Reading Book 5 - The Order of the Phoenix.txt
Characters read so far 4479128

Reading Book 6 - The Half Blood Prince.txt
Characters read so far 5538150

Reading Book 7 - The Deathly Hallows.txt
Characters read so far 6765174
In [12]:
# Recover per-book lengths from the cumulative character counts:
# first book keeps its count, each later book is the difference
# between consecutive cumulative totals.
lens = [chars[0]]
for previous, current in zip(chars, chars[1:]):
    lens.append(current - previous)
lens
Out[12]:
[474429, 531708, 676978, 1187250, 1608763, 1059022, 1227024]
In [13]:
# Bar chart of per-book lengths. Uses plt consistently instead of the
# pylab namespace (which only exists because of the %pylab magic),
# so the cell no longer depends on that wildcard import.
y = lens
N = len(y)
x = [i + 1 for i in range(N)]   # book numbers 1..N
width = 1 / 1.5                 # bar width

plt.xlabel("Book")
plt.ylabel("Length")
plt.bar(x, y, width, color="red", align='center')
Out[13]:
<Container object of 7 artists>
In [14]:
# Segment the whole corpus into sentences with NLTK's sentence tokenizer.
sentences = nltk.tokenize.sent_tokenize(temp)
sentence_count = len(sentences)
print("Total Sentences are " + str(sentence_count))
Total Sentences are 63914
In [23]:
# Turn each sentence into a list of lowercase alphabetic tokens,
# discarding anything of length one (stray letters left by the regex).
sent_words = []
total_tokens = 0
for raw_sent in sentences:
    letters_only = re.sub("[^a-zA-Z]", " ", raw_sent.strip().lower())
    tokens = [tok for tok in nltk.word_tokenize(letters_only) if len(tok) > 1]
    total_tokens += len(tokens)
    sent_words.append(tokens)

print("Total tokens are " + str(total_tokens))
Total tokens are 1103615
In [24]:
# Detect frequent word pairs (collocations) and merge them into single
# tokens (e.g. privet_drive in the sample output below).
bigram = gensim.models.Phrases(sent_words)
final = [bigram[sentence] for sentence in sent_words]
In [25]:
# Sample first two sentences — note the merged collocation tokens
# (who_lived, mrs_dursley, privet_drive) produced by the Phrases step.
final[:2]
Out[25]:
[[u'the',
  u'boy',
  u'who_lived',
  u'mr',
  u'and',
  u'mrs_dursley',
  u'of',
  u'number_four',
  u'privet_drive',
  u'were',
  u'proud',
  u'to',
  u'say',
  u'that',
  u'they',
  u'were',
  u'perfectly',
  u'normal',
  u'thank_you',
  u'very_much'],
 [u'they',
  u'were',
  u'the',
  u'last',
  u'people',
  u'you',
  u'expect',
  u'to',
  u'be',
  u'involved',
  u'in',
  u'anything',
  u'strange',
  u'or',
  u'mysterious',
  u'because',
  u'they',
  u'just',
  u'didn',
  u'hold',
  u'with',
  u'such',
  u'nonsense']]

Model Parameters

We are now ready to train our word embeddings over all Harry Potter books. One thing that we need to decide upon is the model parameters. We will be using gensim's word2vec model implementation for training our model.

Parameters
  1. Window Size: 7
  2. Min Count: 3
  3. Dimension: 300
  4. Workers: 3

To learn more about these parameters, see the gensim Word2Vec documentation.

In [26]:
# Word2Vec hyper-parameters
num_features = 300     # embedding dimensionality
min_word_count = 3     # discard words occurring fewer times than this
num_workers = 3        # parallel worker threads for training
context_size = 7       # context window size
seed = 1               # RNG seed; NOTE(review): %pylab's wildcard import clobbers `seed` on re-run (see warning after the imports cell)
In [27]:
# Build (and train) the word2vec model over the tokenized sentences.
# NOTE(review): this trains on `sent_words`, not the bigram-merged `final`
# computed above — confirm whether the collocation output was meant to
# feed the model here; as written, `final` is never used for training.
model = gensim.models.Word2Vec(sent_words, window=context_size, \
                               min_count=min_word_count, workers=num_workers, \
                               seed=seed, size=num_features
                              )
In [28]:
# NOTE(review): passing sentences to the Word2Vec constructor above already
# trains the model in this gensim version, so this extra train() call
# presumably runs additional passes over the same corpus — confirm intended.
model.train(sent_words)
Out[28]:
4119276
In [29]:
# Size of the learned vocabulary.
vocab_size = len(model.wv.vocab)
print('Vocabulary ' + str(vocab_size))
Vocabulary 11735
In [30]:
# Persist the trained model so it can be reloaded without retraining.
model_dir = "model"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
model.save(os.path.join(model_dir, "harry2vec.w2v"))
In [50]:
# Nearest neighbours of a couple of query words in the embedding space.
azkaban_neighbours = [pair[0] for pair in model.wv.most_similar('azkaban')]
print('Similar kind of words for AZKABAN: ')
print(azkaban_neighbours)
print('\n')
print('Similar kind of words for SNAPE: ')
snape_neighbours = [pair[0] for pair in model.wv.most_similar('snape')]
print(snape_neighbours)
Similar kind of words for AZKABAN: 
[u'chamber', u'goblet', u'prisoner', u'hallows', u'secrets', u'deathly', u'order', u'philosophers', u'phoenix', u'prince']


Similar kind of words for SNAPE: 
[u'dumbledore', u'slughorn', u'quirrell', u'moody', u'lupin', u'karkaroff', u'voldemort', u'sirius', u'umbridge', u'flitwick']
In [34]:
# Project every word vector down to two dimensions with t-SNE, timing the fit.
start = time.time()
all_word_vectors_matrix = model.wv.syn0
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)
stop = time.time() - start
print('Time taken is ' + str(stop))
Time taken is 372.693608046
In [51]:
# Build a (word, x, y) table from the 2-D t-SNE projection.
# Uses model.wv.vocab for consistency with the rest of the notebook
# (the bare model.vocab used originally is the older, deprecated alias),
# and a single flat loop instead of nested comprehensions.
rows = []
for word, vocab_item in model.wv.vocab.items():
    coords = all_word_vectors_matrix_2d[vocab_item.index]
    rows.append((word, coords[0], coords[1]))
points = pd.DataFrame(rows, columns=["word", "x", "y"])
points.head(20)
Out[51]:
word x y
0 raining -0.815268 -2.405247
1 yellow 3.388919 2.934252
2 four 2.455602 3.883873
3 rocketing -0.084980 -5.673094
4 woods -3.480195 -2.075241
5 spiders 4.297810 0.316103
6 ornate -1.020054 1.384538
7 conjuring -0.355317 -3.486272
8 aggression 0.735922 -4.838119
9 marching -3.540225 -2.435823
10 distractedly -0.530218 -0.837392
11 crooned 0.098604 -7.877066
12 unblocked -0.499605 -6.515115
13 attracted 1.028695 -3.972730
14 electricity -0.581077 -5.762300
15 wizardry 3.994892 -6.683068
16 sunlit -2.941011 -2.336507
17 fingernails -1.801686 -3.226996
18 tingle 1.581573 -6.702398
19 sputter -0.494680 -6.612611
In [52]:
# Scatter plot of the full 2-D t-SNE word map (one point per vocabulary word).
sns.set_context("poster")
points.plot.scatter("x", "y", s=10, figsize=(20, 12))
Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0208229590>
In [ ]: