import numpy as np
from collections import Counter
import mmap
import re
import networkx as nx
import matplotlib.pyplot as plt
import math
## Exponential (sigmoid) memoization table
## Precompute sigmoid(x) = exp(x) / (exp(x) + 1) at 1000 evenly spaced
## points x in [-MAX_EXP, MAX_EXP]; the training code looks values up
## here instead of calling exp() for every dot product.
MAX_EXP = 6
EXP_TABLE_SIZE = 1000
EXP_TABLE = np.arange(EXP_TABLE_SIZE, dtype = np.float64)
EXP_TABLE = np.exp((EXP_TABLE / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)
EXP_TABLE = EXP_TABLE / (EXP_TABLE + 1.)
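## A quick sanity check (illustrative, not part of the original pipeline):
## the table lookup should closely match the exact sigmoid 1/(1+exp(-f))
## for f inside (-MAX_EXP, MAX_EXP). table_sigmoid is a hypothetical
## helper using the same index arithmetic as the training code below.
def table_sigmoid(f):
    return EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
for f in (-5.5, -1., 0., 2.3):
    print f, table_sigmoid(f), 1. / (1. + math.exp(-f))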
## HELPER FUNCTIONS
def file_to_wordstream(file_path):
    """Lazily yield whitespace-delimited words from a file, using mmap
    so that large corpora are never read into memory at once."""
    with open(file_path) as fin:
        mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
        for match in re.finditer(r'(.*?)\s', mf):
            word = match.group(1)
            if word:
                yield word
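## A tiny self-contained check of the streaming helper (illustrative).
## Note the regex only emits whitespace-terminated words, so the last
## word needs trailing whitespace to be captured.
import tempfile, os
_tmp = tempfile.NamedTemporaryFile(suffix = '.txt', delete = False)
_tmp.write('the quick brown fox ')
_tmp.close()
print list(file_to_wordstream(_tmp.name))  ## ['the', 'quick', 'brown', 'fox']
os.unlink(_tmp.name)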
def inspect_vocab_tree(vocab):
g = nx.DiGraph()
vocab_size = len(vocab)
edges = set()
for vw in vocab:
tree_path = [i + vocab_size for i in vw['path']]
tree_path = [str(i) if i >= vocab_size
else "%d_%s(%d)" % (i, vocab[i]['word'], vocab[i]['count'])
for i in tree_path]
edges.update(zip(tree_path[:-1], tree_path[1:]))
g.add_edges_from(edges)
    plt.figure(figsize=(16, 16))
pos = nx.graphviz_layout(g, prog='dot')
nx.draw(g, pos, with_labels=True, arrows = True, node_size=3000, font_size = 30)
return g
## HashedVocab
class HashedVocab(object):
def __init__(self):
self.HASH_SIZE = 30000000
self.MIN_COUNT = 5
self.vocab = []
        self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int64)
        self.vocab_hash.fill(-1)
def fit(self, word_stream):
        counter = Counter(word_stream)
        ## keep words seen at least MIN_COUNT times; most_common() also
        ## sorts the vocab by descending count, which the Huffman-tree
        ## construction below relies on
        self.vocab = map(lambda x: dict(zip(['word', 'count'], x)),
                         filter(lambda x: x[1] >= self.MIN_COUNT,
                                counter.most_common(len(counter))))
if len(self.vocab) > self.HASH_SIZE * 0.7:
raise RuntimeError('Vocab size too large, increase MIN_COUNT or increase HASH_SIZE')
self.build_hash()
self.build_huffman_tree()
def search_for(self, word):
word_hash = self.get_word_hash(word)
while True:
word_index = self.vocab_hash[word_hash]
if word_index == -1:
                return -1
elif word == self.vocab[word_index]['word']:
return word_index
else:
word_hash = (word_hash + 1) % self.HASH_SIZE
def __getitem__(self, word_index):
return self.vocab[word_index]
def __len__(self):
return len(self.vocab)
def build_hash(self):
        self.vocab_hash = np.empty(self.HASH_SIZE, dtype = np.int64)
self.vocab_hash.fill(-1)
for word_index, vocab_word in enumerate(self.vocab):
word = vocab_word['word']
word_hash = self.get_word_hash(word)
self.add_to_hash(word_hash, word_index)
    def get_word_hash(self, word):
        ## Horner's rule with base 257; reducing mod HASH_SIZE at every
        ## step keeps the intermediate integers small (building the full
        ## power series ord(c) * 257**i per character was too slow)
        word_hash = 0
        for c in word:
            word_hash = (word_hash * 257 + ord(c)) % self.HASH_SIZE
        return word_hash
    def add_to_hash(self, word_hash, word_index):
        ## open addressing with linear probing: walk forward to the
        ## first empty (-1) slot
        while self.vocab_hash[word_hash] != -1:
word_hash = (word_hash + 1) % self.HASH_SIZE
self.vocab_hash[word_hash] = word_index
    def build_huffman_tree(self):
        """
        Build a binary Huffman tree from the word counts and embed the
        structure in each vocab_word dict(word, count, path, code):
        vocab_word['code'] is the word's binary Huffman code, so more
        frequent words get shorter codes;
        vocab_word['path'] is the list of internal-node indices from the
        root down to the word's leaf.
        """
        ## An arbitrary full binary tree with n leaves has at most n-1
        ## internal nodes, so arrays of size 2n-1 suffice (the original
        ## C code allocates 2n+1 instead).
vocab_size = len(self.vocab)
print "DEBUG:", vocab_size
## workhorse data structure for tree construction
count = np.empty(vocab_size*2-1, dtype = np.int64)
count.fill(1e15)
count[:vocab_size] = [vw['count'] for vw in self.vocab]
## boolean values of each node
binary = np.zeros(vocab_size*2-1, dtype = np.int8)
## parent node to store the path
parent_node = np.empty(vocab_size*2-1, dtype = np.int64)
        ## construct the tree bottom-up: pos1 scans the (count-sorted)
        ## leaves from the rarest word leftward, pos2 scans the internal
        ## nodes created so far; each step merges the two smallest counts
        pos1, pos2 = vocab_size-1, vocab_size
for a in xrange(vocab_size-1):
## min1i
if pos1 >= 0:
if count[pos1] < count[pos2]:
min1i, pos1 = pos1, pos1-1
else:
min1i, pos2 = pos2, pos2+1
else:
min1i, pos2 = pos2, pos2+1
## min2i
if pos1 >= 0:
if count[pos1] < count[pos2]:
min2i, pos1 = pos1, pos1-1
else:
min2i, pos2 = pos2, pos2+1
else:
min2i, pos2 = pos2, pos2+1
count[vocab_size + a] = count[min1i] + count[min2i]
parent_node[min1i] = vocab_size + a
parent_node[min2i] = vocab_size + a
binary[min2i] = 1
for a in xrange(vocab_size):
b, i = a, 0
code, path = [], []
while True:
code.append(binary[b])
path.append(b)
i += 1
b = parent_node[b]
if b == vocab_size * 2 - 2: break
            ## path runs from the root (index vocab_size-2) down to the
            ## leaf; the final entry is the leaf itself and is skipped
            ## during training
            self.vocab[a]['path'] = [vocab_size - 2] + [p - vocab_size
                                                        for p in path[::-1]]
            self.vocab[a]['code'] = code[::-1]
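## A small worked example (illustrative, with a hypothetical 4-word
## vocab): the two rarest words merge first, so the most frequent word
## gets the shortest Huffman code; the hash lookup resolves collisions
## by linear probing. (build_huffman_tree will print "DEBUG: 4" here.)
hv_demo = HashedVocab()
hv_demo.vocab = [dict(word = w, count = c)
                 for w, c in [('a', 4), ('b', 2), ('c', 1), ('d', 1)]]
hv_demo.build_hash()
print hv_demo.search_for('c')    ## 2
print hv_demo.search_for('zzz')  ## -1: probe chain ends at an empty slot
hv_demo.build_huffman_tree()
for vw in hv_demo.vocab:
    print vw['word'], vw['count'], vw['code']
## expect: a -> [1], b -> [0, 1], and 3-bit codes for c and d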
## Unigram Table
class UnigramTable(object):
    """
    Lookup table for negative sampling: each word occupies a share of
    the table proportional to count ** 0.75, so indexing a uniformly
    random slot draws words from the flattened unigram distribution.
    """
    def __init__(self, hashed_vocab):
        self.TABLE_SIZE = int(1e8)
        self.table = np.empty(self.TABLE_SIZE, np.int64)
        self.power = 0.75
vocab = hashed_vocab.vocab
vocab_size = len(vocab)
train_words_pow = sum(math.pow(w['count'], self.power)
for w in vocab)
        i = 0
        d1 = math.pow(vocab[i]['count'], self.power) / train_words_pow
        for a in xrange(self.TABLE_SIZE):
            self.table[a] = i
            if a * 1. / self.TABLE_SIZE > d1:
                i += 1
                ## clamp before indexing: rounding can leave the
                ## cumulative share d1 just short of 1.0, which would
                ## otherwise push i past the last vocab entry
                if i >= vocab_size:
                    i = vocab_size - 1
                d1 += math.pow(vocab[i]['count'], self.power) / train_words_pow
def __getitem__(self, index):
return self.table[index]
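## Why the 0.75 exponent (illustrative): it flattens the unigram
## distribution, so frequent words are drawn as negative samples less
## overwhelmingly often than their raw frequency would dictate.
counts = [100, 10, 1]
z = sum(math.pow(c, 0.75) for c in counts)
print [round(math.pow(c, 0.75) / z, 3) for c in counts]
## ~[0.827, 0.147, 0.026] versus raw frequencies [0.901, 0.090, 0.009]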
## Learning Models
class Word2Vec(object):
"""
Net Architecture: cbow / skip_gram
Learning: hs / negative_sampling
"""
def __init__(self, hashed_vocab, unigram_table, layer1_size,
net_type, learn_hs, learn_negative):
"""
hashed_vocab: vocab to build on
layer1_size: the dimensionality of feature space
net_type: {'cbow', 'skip_gram'}
learn_hs : hierarchical softmax
learn_negative: negative sampling
"""
self.vocab = hashed_vocab
self.table = unigram_table
self.layer1_size = layer1_size
self.net_type = net_type
self.learn_hs = learn_hs
self.learn_negative = learn_negative
self.starting_alpha = 0.025
self.sentence_len = 1000
self.sampling = 1e-4
self.window = 5 # max skip length between words
self.negative = 5 # 5 - 10
self.REAL_TYPE = np.float64
self.syn0 = np.array([], dtype = self.REAL_TYPE)
self.syn1 = np.array([], dtype = self.REAL_TYPE)
self.syn1neg = np.array([], dtype = self.REAL_TYPE)
def init_net(self):
"""
syn0 - len(vocab) * layer1_size
syn1 - len(vocab) * layer1_size
syn1neg - len(vocab) * layer1_size
"""
vocab_size = len(self.vocab)
self.syn0 = np.random.uniform(low = -.5 / self.layer1_size,
high = .5 / self.layer1_size,
size = (vocab_size, self.layer1_size)).astype(self.REAL_TYPE)
if self.learn_hs:
self.syn1 = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)
if self.learn_negative:
self.syn1neg = np.zeros((vocab_size, self.layer1_size), dtype = self.REAL_TYPE)
def fit(self, words):
"""
word_stream: stream of words
ntotal: total number of words in word_stream
"""
self.init_net()
ntotal = len(words)
alpha = self.starting_alpha
next_random = 0
nprocessed = 0
while nprocessed < ntotal:
## adjust learning rate alpha
alpha = max(self.starting_alpha * (1 - nprocessed / (ntotal + 1.)),
self.starting_alpha * 0.0001)
## refill the sentence
sentence_index = []
while nprocessed < ntotal and len(sentence_index) < self.sentence_len:
## sampling down the infrequent words
if nprocessed % 10000 == 0:
print 'progress:', nprocessed * 100. / ntotal, '%'
                word = words[nprocessed]
                word_index = self.vocab.search_for(word)
                nprocessed += 1
                if word_index == -1: continue
                word_count = self.vocab[word_index]['count']
                if self.sampling > 0:
                    ## subsample frequent words (same formula as the C code)
                    ran = ( (math.sqrt(word_count / (self.sampling * ntotal)) + 1)
                            * (self.sampling * ntotal) / word_count)
                    ## emulate the C code's unsigned 64-bit LCG by masking
                    next_random = (next_random * 25214903917 + 11) & 0xFFFFFFFFFFFFFFFF
                    if ran < (next_random & 0xFFFF) / 65536.: continue
                sentence_index.append(word_index)
## for each word in sentence
## pivot is the current word index in sentence_index
for ipivot, pivot in enumerate(sentence_index):
                next_random = (next_random * 25214903917 + 11) & 0xFFFFFFFFFFFFFFFF
                b = next_random % self.window
neu1 = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)
neu1e = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)
if self.net_type == 'cbow':
                    ## accumulate neighbors into neu1
                    ## neighbors sit at sentence positions [ipivot-(window-b), ipivot+window-b]
                    left = max(0, ipivot - (self.window - b))
                    right = min(len(sentence_index) - 1, ipivot + self.window - b)
                    ## map sentence positions to vocab indices, excluding the pivot itself
                    neighbors = [sentence_index[c]
                                 for c in xrange(left, right + 1) if c != ipivot]
                    ## NEU1 -- ACCUMULATION IN SENTENCE
                    ## IN -> HIDDEN
                    neu1 = np.sum(self.syn0[neighbors, :], axis = 0)
                    if self.learn_hs:
                        ## walk the pivot's Huffman path: each internal node
                        ## acts as a binary classifier (the last path entry is
                        ## the leaf itself, so it is skipped)
                        for parent_index, parent_code in zip(self.vocab[pivot]['path'][:-1],
                                                             self.vocab[pivot]['code']):
                            ## f: sigmoid of <context vector neu1, node vector syn1>
                            f = np.dot(neu1, self.syn1[parent_index, :])
                            if f <= -MAX_EXP or f >= MAX_EXP:
                                continue
                            f = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
                            ## gradient scale: the target label is (1 - parent_code)
                            g = (1 - parent_code - f) * alpha
                            neu1e += g * self.syn1[parent_index]
                            self.syn1[parent_index] += g * neu1
if self.learn_negative:
for d in xrange(self.negative + 1):
if d == 0:
target = pivot
label = 1
else:
                                next_random = (next_random * 25214903917 + 11) & 0xFFFFFFFFFFFFFFFF
                                target = self.table[(next_random >> 16) % self.table.TABLE_SIZE]
if (target == pivot): continue
label = 0
f = np.dot(neu1, self.syn1neg[target, :])
if f > MAX_EXP:
g = (label - 1) * alpha
elif f < -MAX_EXP:
g = (label - 0) * alpha
else:
g = alpha * (label - EXP_TABLE[int((f + MAX_EXP)
* (EXP_TABLE_SIZE / MAX_EXP / 2))])
neu1e += g * self.syn1neg[target]
self.syn1neg[target] += g * neu1
                    ## HIDDEN -> IN
                    ## np.add.at accumulates correctly even when the same
                    ## word appears more than once among the neighbors
                    np.add.at(self.syn0, neighbors, neu1e)
                elif self.net_type == 'skip_gram':
                    ## skip-gram training is not implemented in this
                    ## notebook; see the sketch after the class
                    pass
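## The skip_gram branch above is a stub. Below is a rough, hypothetical
## sketch (not the author's implementation) of the hierarchical-softmax
## update skip-gram would perform: each neighbor's own syn0 row plays
## the role neu1 plays in cbow, trained against the pivot's Huffman path.
def _skip_gram_hs_sketch(model, pivot, neighbor, alpha):
    neu1e = np.zeros(model.layer1_size, dtype = model.REAL_TYPE)
    for parent_index, parent_code in zip(model.vocab[pivot]['path'][:-1],
                                         model.vocab[pivot]['code']):
        f = np.dot(model.syn0[neighbor], model.syn1[parent_index])
        if f <= -MAX_EXP or f >= MAX_EXP:
            continue
        f = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
        g = (1 - parent_code - f) * alpha
        neu1e += g * model.syn1[parent_index]
        model.syn1[parent_index] += g * model.syn0[neighbor]
    model.syn0[neighbor] += neu1e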
## TEST HashedVocab
hashed_vocab = HashedVocab()
#hashed_vocab.fit(file_to_wordstream('data/text_simple'))
hashed_vocab.fit(file_to_wordstream('data/simple.txt'))
unigram_table = UnigramTable(hashed_vocab)
## output: DEBUG: 1179
for i, vw in enumerate(hashed_vocab.vocab):
word = vw['word']
assert i == hashed_vocab.search_for(word)
print hashed_vocab.search_for('alien')
print len(hashed_vocab.vocab)
#inspect_vocab_tree(hashed_vocab.vocab)
print hashed_vocab[10]['word']
for n in hashed_vocab[10]['path'][:-1]:
print hashed_vocab[n]['word']
## output: -1
## output: 1179
## output: as
## output: respect practice issued texts outside private
train_words = list(file_to_wordstream('data/simple.txt'))
word2vec = Word2Vec(hashed_vocab, unigram_table, 100, 'cbow', True, True)
word2vec.fit(train_words)
## output: progress: 0.0 %
## output: progress: 20.000400008 %
## output: progress: 40.000800016 %
## output: progress: 60.001200024 %
## output: progress: 80.001600032 %
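## A quick, illustrative way to eyeball the trained vectors; the
## most_similar helper is hypothetical, not part of the model API.
def most_similar(model, word, topn = 5):
    idx = model.vocab.search_for(word)
    vecs = model.syn0 / np.linalg.norm(model.syn0, axis = 1)[:, np.newaxis]
    sims = np.dot(vecs, vecs[idx])
    best = np.argsort(-sims)[1:topn + 1]  ## drop the query word itself
    return [(model.vocab[i]['word'], sims[i]) for i in best]
print most_similar(word2vec, hashed_vocab[10]['word'])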