Main Change from v1:
from collections import namedtuple
import numpy as np
import mmap
import re
from IPython.display import display
import networkx as nx
import multiprocessing
import ctypes
from os import path
## control flags
flags = {
'debug_mode': 2
, 'hs': True
, 'negative': True
, 'train_file': 'data/text_simple'
, 'num_threads': 10
}
## types
#VocabWord = {'word':??,'count':??, 'path':??, 'code':??}
REAL = np.float64
HASH_TYPE = np.int64
SHORT_INT = np.int8
## CONSTANTS
MAX_STRING = 100
EXP_TABLE_SIZE = 1000
MAX_EXP = 6
##
#VOCAB_MAX_SIZE = 1000
VOCAB_HASH_SIZE = 30000000
n_train_words = 0
##
MIN_COUNT = 5
MIN_REDUCE = 1
MAX_SENTENCE_LENGTH = 1000
MAX_CODE_LENGTH = 40
##
WINDOW = 5
LAYER1_SIZE = 100
ALPHA = 0.025
##
TABLE_SIZE = 1e8
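TABLE_SIZE sizes the unigram table that negative sampling draws from; the table itself is never built in this section, so below is a minimal sketch of a hypothetical init_unigram_table following the original C code (counts raised to the 0.75 power), assuming vocab is already populated.
## hypothetical helper, not defined elsewhere in this notebook: build the
## unigram table for negative sampling, each word getting a share of slots
## proportional to count**0.75, as in the original C word2vec
## (pure-Python loop over 1e8 slots -- slow, shown for clarity only)
def init_unigram_table():
    table = np.empty(int(TABLE_SIZE), dtype = np.int32)
    pow_counts = np.array([vw['count'] for vw in vocab], dtype = REAL) ** 0.75
    cum = np.cumsum(pow_counts) / pow_counts.sum()
    i = 0
    for a in xrange(int(TABLE_SIZE)):
        table[a] = i
        ## advance to the next word once its share of the table is used up
        if a / TABLE_SIZE > cum[i] and i < len(vocab) - 1:
            i += 1
    return table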
## DATA STRUCTURE
vocab = []
vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = HASH_TYPE)
vocab_hash.fill(-1)
exp_table = np.arange(start = 0, stop = EXP_TABLE_SIZE,
step = 1, dtype = REAL)
exp_table = np.exp((exp_table / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)
exp_table = exp_table / (exp_table + 1.)
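exp_table caches sigmoid values for inputs in [-MAX_EXP, MAX_EXP] so training never calls exp() directly; a minimal sketch of the lookup (hypothetical helper name, mirroring the C version's integer index arithmetic):
## hypothetical helper showing how exp_table is meant to be consulted:
## index i holds sigmoid((i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP)
def fast_sigmoid(x):
    if x >= MAX_EXP: return 1.
    if x <= -MAX_EXP: return 0.
    return exp_table[int((x + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]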
syn0 = np.array([], dtype = REAL)
syn1 = np.array([], dtype = REAL)
syn1neg = np.array([], dtype = REAL)
def get_word_hash(word):
word_hash = sum([ord(c)*(257**i)
for i, c in zip(range(len(word))[::-1], word)])
word_hash %= VOCAB_HASH_SIZE
return word_hash
def add_vocab_hash(word_hash, word_index):
global vocab_hash
while vocab_hash[word_hash] != -1:
word_hash = (word_hash + 1) % VOCAB_HASH_SIZE
vocab_hash[word_hash] = word_index
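get_word_hash is a polynomial rolling hash (base 257) reduced mod VOCAB_HASH_SIZE, and add_vocab_hash resolves collisions by open addressing with linear probing. A small worked illustration (comments only, so the real table is left untouched):
## e.g. for the word 'ab': hash = ord('a')*257**1 + ord('b')*257**0
##                              = 97*257 + 98 = 25027  (mod VOCAB_HASH_SIZE)
## if slot 25027 is already taken, add_vocab_hash walks to 25028, 25029, ...
## until a free (-1) slot turns up; search_vocab below repeats the same
## walk, comparing the stored words to resolve the collision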
def search_vocab(word):
"""
Search for word's vocab_index by using the vocab_hash
"""
word_hash = get_word_hash(word)
while True:
word_index = vocab_hash[word_hash]
        ## not found
if word_index == -1:
return -1
elif word == vocab[word_index]['word']:
return word_index
else:
word_hash = (word_hash + 1) % VOCAB_HASH_SIZE
return -1 # should never reach here
def reduce_vocab():
"""
Reduce the vocabulary size by removing infrequent tokens
"""
    global vocab, vocab_hash, MIN_REDUCE
## in-place remove infrequent words
a, b = 0, 0
for a in xrange(len(vocab)):
if vocab[a]['count'] > MIN_REDUCE:
vocab[b]['count'] = vocab[a]['count']
vocab[b]['word'] = vocab[a]['word']
b += 1
vocab = vocab[:b]
## reset the hash table
vocab_hash.fill(-1)
for word_index, vocab_word in enumerate(vocab):
word_hash = get_word_hash(vocab_word['word'])
add_vocab_hash(word_hash, word_index)
MIN_REDUCE += 1
def add_word_to_vocab(word):
"""
construct a VocabWord {'count', 'path', 'word', 'code'}
from word
add vocab_word to vocab
put its index to vocab_hash
word_index: the index of vocab_word in vocab
word_hash: the index of word_index in vocab_hash
"""
global vocab, vocab_hash
vocab_word = dict(count = 0, path = None, word = word, code = None)
vocab.append(vocab_word)
word_hash = get_word_hash(word)
word_index = len(vocab)-1
add_vocab_hash(word_hash, word_index)
return word_index
def sort_vocab():
"""
sort the vocabulary by frequency using word counts
</s> will be kept in vocab at the first place, with count = 0,
but it is NOT hashed, which means its hash value in vocab_hash
will be -1
"""
global vocab, vocab_hash, n_train_words
## sort the vocab
    ## based on word frequency in DESCENDING order
vocab = sorted(vocab, key = lambda v: v['count'],
reverse = True)
## re-initialize vocab_hash, reduce vocab
vocab_hash.fill(-1)
vocab_sz = len(vocab)
n_train_words = 0
for iword, vword in enumerate(vocab):
## discarding words less than MIN_COUNT
if vword['count'] < MIN_COUNT:
vocab_sz -= 1
else:
word_hash = get_word_hash(vword['word'])
add_vocab_hash(word_hash, iword)
n_train_words += vword['count']
    ## truncate vocab
vocab = vocab[:vocab_sz]
def read_word(fpath):
"""
Lazy read word from a file (words separated by whitespace)
using mmap (OS virtual memory system) to read the file,
ONLY TESTED on POSIX
WE DONT insert </s> explicitly for every \n here, like
in the original C version, though there is no \n in the
training file
"""
with open(fpath) as fin:
mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
for word in re.finditer(r'(.*?)\s', mf):
w = word.group(1)
if w:
yield w
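Since read_word is a generator, only one token is materialized at a time; a quick sanity check (hypothetical, on the notebook's own train file):
## quick check of the lazy reader -- does not load the whole file
words = read_word(flags['train_file'])
print next(words)   ## first whitespace-delimited token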
def learn_vocab_from_file(fpath):
"""
fpath: path of train file,
train file is a collection of words delimited by whitespace
modify: vocab - np.array of dtype=vocab_word
vocab_hash - hashed index of words in vocab
"""
global vocab, vocab_hash
    ## preallocate data structures
    vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = HASH_TYPE)
    vocab_hash.fill(-1)
for i, word in enumerate(read_word(fpath)):
if flags['debug_mode'] > 1 and i % 1000000 == 0:
print "%iM" % (i/1000000)
## find the word's vocab_index by using vocab_hash
word_index = search_vocab(word)
## add new word
if word_index == -1:
word_index = add_word_to_vocab(word)
vocab[word_index]['count'] = 1
else:
vocab[word_index]['count'] += 1
        ## prune rare words to keep the hash table under 70% load
if len(vocab) > VOCAB_HASH_SIZE * 0.7:
reduce_vocab()
sort_vocab()
if flags['debug_mode'] > 0:
print 'Vocab Size: %i\nWords in train file: %d' % (len(vocab), i)
def init_net():
"""
syn0 - len(vocab) * layer1_size
syn1 - len(vocab) * layer1_size
Initialize with certain weight values
Make them shared variable
"""
global syn0, syn1, syn1neg
vocab_size = len(vocab)
shared_syn0_base = multiprocessing.Array(ctypes.c_float,
vocab_size * LAYER1_SIZE)
    syn0 = np.ctypeslib.as_array(shared_syn0_base.get_obj())
    syn0[:] = np.random.uniform(low = -.5 / LAYER1_SIZE,
                                high = .5 / LAYER1_SIZE,
                                size = vocab_size * LAYER1_SIZE)
    ## reshape returns a view on the same shared buffer; it must be
    ## assigned back (the original discarded the reshaped result)
    syn0 = syn0.reshape((vocab_size, LAYER1_SIZE))
if flags['hs']:
shared_syn1_base = multiprocessing.Array(ctypes.c_float,
vocab_size * LAYER1_SIZE)
        syn1 = np.ctypeslib.as_array(shared_syn1_base.get_obj())
        ## multiprocessing.Array zero-fills the buffer, which is the
        ## initialization hierarchical softmax weights need
        syn1 = syn1.reshape((vocab_size, LAYER1_SIZE))
if flags['negative']:
shared_syn1neg_base = multiprocessing.Array(ctypes.c_float,
vocab_size * LAYER1_SIZE)
        syn1neg = np.ctypeslib.as_array(shared_syn1neg_base.get_obj())
        syn1neg = syn1neg.reshape((vocab_size, LAYER1_SIZE))
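The weights live in multiprocessing.Array buffers so that every worker process updates the same memory, Hogwild-style. A minimal sketch of why this matters (hypothetical demo, not part of the trainer): on POSIX, fork'd pool workers inherit the shared ctypes buffer rather than a copy, so a write made in a worker is visible to the parent.
def _poke(pid):
    ## hypothetical worker: write into the shared weights
    syn0[pid, 0] = -1.
## after init_net() has run, the following prints [-1. -1.] on POSIX,
## because both workers mutated the same shared buffer:
## pool = multiprocessing.Pool(2)
## pool.map(_poke, [0, 1])
## print syn0[:2, 0]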
def create_binary_tree():
"""Huffman tree by word counts
word['code'] will be the binary representation of word based on frequency
word['path'] will be the path from root to leaf in the tree
"""
global vocab
    ## For an arbitrary full binary tree there are at most n-1 internal
    ## nodes given n leaves, i.e. 2n-1 nodes in total. The original C code
    ## sizes the count, binary and parent_node arrays as n*2+1 instead of
    ## n*2-1; also, since our vocab has no </s>, our vocab_size corresponds
    ## to the original's vocab_size - 1
vocab_size = len(vocab)
## count - tree construction based on count
count = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
count.fill(1e15)
count[:vocab_size] = [vw['count'] for vw in vocab]
## binary - boolean value of each node
binary = np.zeros(vocab_size*2-1, dtype = SHORT_INT)
## parent_node
parent_node = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
## construct the tree
pos1, pos2 = vocab_size-1, vocab_size
for a in xrange(vocab_size-1):
## min1i
if pos1 >= 0:
if count[pos1] < count[pos2]:
min1i, pos1 = pos1, pos1-1
else:
min1i, pos2 = pos2, pos2+1
else:
min1i, pos2 = pos2, pos2+1
## min2i
if pos1 >= 0:
if count[pos1] < count[pos2]:
min2i, pos1 = pos1, pos1-1
else:
min2i, pos2 = pos2, pos2+1
else:
min2i, pos2 = pos2, pos2+1
count[vocab_size + a] = count[min1i] + count[min2i]
        parent_node[min1i] = vocab_size + a
        parent_node[min2i] = vocab_size + a
        binary[min2i] = 1
for a in xrange(vocab_size):
b, i = a, 0
code, path = [], []
while True:
code.append(binary[b])
path.append(b)
i += 1
b = parent_node[b]
if b == vocab_size * 2 - 2: break
        vocab[a]['path'] = [vocab_size - 2] + [p - vocab_size for p in path[::-1]]
vocab[a]['code'] = code[::-1]
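A tiny worked example of the layout this produces, hand-checked for a 3-word vocab with counts [4, 2, 1]: the most frequent word gets the shortest code, path entries are internal-node offsets into syn1, and the final negative entry marks the leaf itself.
## worked example, counts [4, 2, 1] (vocab_size = 3, root offset = 1):
##   word 0 (count 4): code [1]       path [1, -3]
##   word 1 (count 2): code [0, 1]    path [1, 0, -2]
##   word 2 (count 1): code [0, 0]    path [1, 0, -1]
## codes are prefix-free, and zip(path, code) pairs each internal node
## with the branch bit taken from it (the trailing leaf entry is unpaired)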
def inspect_vocab_tree(vocab):
g = nx.DiGraph()
vocab_size = len(vocab)
edges = set()
for vw in vocab:
tree_path = [i + vocab_size for i in vw['path']]
tree_path = [str(i) if i >= vocab_size
else "%d_%s(%d)" % (i, vocab[i]['word'], vocab[i]['count'])
for i in tree_path]
edges.update(zip(tree_path[:-1], tree_path[1:]))
g.add_edges_from(edges)
figure(figsize=(16, 16))
pos = nx.graphviz_layout(g, prog='dot')
nx.draw(g, pos, with_labels=True, arrows = True, node_size=3000, font_size = 30)
return g
def refill(mf, sent_len, end):
    """
    Read up to sent_len whitespace-delimited words from the mmap'ed file,
    stopping at end of file or at offset `end`; return them as a list
    """
    nwords = 0
    sentence = []
    while True:
        c = mf.read(1)
        if not c: break ## end of file
        if mf.tell() > end: break
        sentence.append(c)
        if c == ' ': nwords += 1
        if nwords == sent_len: break
    return ''.join(sentence).split()
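A hypothetical use of refill, pulling one sentence's worth of words from the start of the training file:
## hypothetical usage (assumes the train file exists):
## with open(flags['train_file']) as fin:
##     mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
##     sent = refill(mf, MAX_SENTENCE_LENGTH, path.getsize(flags['train_file']))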
def train_model_thread(pid):
"""architecture: cbow / skip_gram
learning: hs / negative_sampling
running in multiprocessing pool
"""
global syn0, syn1
neu1 = np.empty(LAYER1_SIZE, dtype = REAL)
neu1e = np.empty(LAYER1_SIZE, dtype = REAL)
num_threads = flags['num_threads']
fsize = path.getsize(flags['train_file'])
with open(flags['train_file'], 'r') as fin:
mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
fstart = fsize / num_threads * pid
fend = fsize / num_threads * (pid + 1)
ftell = fstart
for word in re.finditer(r'(.*?)\s', mf[fstart:]):
if ftell > fend: break
w = word.group(1)
ftell += len(word.group(0))
            ## ?? -- the per-word CBOW / skip-gram update is not implemented yet
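The per-word update above is the missing piece; for orientation, here is a minimal sketch (NOT the finished trainer) of one hierarchical-softmax step, following the path/code layout produced by create_binary_tree and using the hypothetical fast_sigmoid lookup sketched earlier.
def hs_step_sketch(input_word, output_word):
    ## minimal sketch of one hierarchical-softmax update, assuming
    ## init_net() and create_binary_tree() have already run;
    ## input_word / output_word are vocab indices
    neu1e = np.zeros(LAYER1_SIZE, dtype = REAL)
    l1 = syn0[input_word]
    ## zip pairs each internal-node offset with the branch bit taken from
    ## it (the trailing negative leaf entry of 'path' is left unpaired)
    for node, bit in zip(vocab[output_word]['path'], vocab[output_word]['code']):
        f = fast_sigmoid(np.dot(l1, syn1[node]))
        g = (1. - bit - f) * ALPHA   ## gradient scaled by learning rate
        neu1e += g * syn1[node]
        syn1[node] += g * l1
    syn0[input_word] += neu1e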
learn_vocab_from_file('data/text_simple')
0M Vocab Size: 92 Words in train file: 2899
display(vocab)
display([vocab_hash[h] for h in [get_word_hash(w['word']) for w in vocab]])
[{'code': None, 'count': 141, 'path': None, 'word': 'the'}, {'code': None, 'count': 112, 'path': None, 'word': 'acid'}, {'code': None, 'count': 84, 'path': None, 'word': 'a'}, {'code': None, 'count': 71, 'path': None, 'word': 'and'}, {'code': None, 'count': 68, 'path': None, 'word': 'of'}, {'code': None, 'count': 57, 'path': None, 'word': 'in'}, {'code': None, 'count': 53, 'path': None, 'word': 'is'}, {'code': None, 'count': 49, 'path': None, 'word': 'three'}, {'code': None, 'count': 48, 'path': None, 'word': 'to'}, {'code': None, 'count': 47, 'path': None, 'word': 'two'}, {'code': None, 'count': 35, 'path': None, 'word': 'acids'}, {'code': None, 'count': 34, 'path': None, 'word': 'are'}, {'code': None, 'count': 33, 'path': None, 'word': 'one'}, {'code': None, 'count': 33, 'path': None, 'word': 'e'}, {'code': None, 'count': 30, 'path': None, 'word': 'zero'}, {'code': None, 'count': 30, 'path': None, 'word': 'h'}, {'code': None, 'count': 28, 'path': None, 'word': 'for'}, {'code': None, 'count': 26, 'path': None, 'word': 'aq'}, {'code': None, 'count': 26, 'path': None, 'word': 'k'}, {'code': None, 'count': 23, 'path': None, 'word': 'an'}, {'code': None, 'count': 21, 'path': None, 'word': 'that'}, {'code': None, 'count': 19, 'path': None, 'word': 'asphalt'}, {'code': None, 'count': 19, 'path': None, 'word': 'standards'}, {'code': None, 'count': 18, 'path': None, 'word': 'o'}, {'code': None, 'count': 18, 'path': None, 'word': 'as'}, {'code': None, 'count': 17, 'path': None, 'word': 'abacus'}, {'code': None, 'count': 17, 'path': None, 'word': 'this'}, {'code': None, 'count': 16, 'path': None, 'word': 'by'}, {'code': None, 'count': 16, 'path': None, 'word': 'with'}, {'code': None, 'count': 16, 'path': None, 'word': 'can'}, {'code': None, 'count': 15, 'path': None, 'word': 'or'}, {'code': None, 'count': 15, 'path': None, 'word': 'five'}, {'code': None, 'count': 15, 'path': None, 'word': 'four'}, {'code': None, 'count': 15, 'path': None, 'word': 'six'}, {'code': None, 'count': 14, 'path': None, 'word': 'base'}, {'code': None, 'count': 14, 'path': None, 'word': 'nine'}, {'code': None, 'count': 14, 'path': None, 'word': 'ansi'}, {'code': None, 'count': 13, 'path': None, 'word': 'water'}, {'code': None, 'count': 12, 'path': None, 'word': 'definition'}, {'code': None, 'count': 12, 'path': None, 'word': 'found'}, {'code': None, 'count': 11, 'path': None, 'word': 'be'}, {'code': None, 'count': 11, 'path': None, 'word': 'ha'}, {'code': None, 'count': 11, 'path': None, 'word': 'seven'}, {'code': None, 'count': 11, 'path': None, 'word': 'eight'}, {'code': None, 'count': 11, 'path': None, 'word': 'proton'}, {'code': None, 'count': 11, 'path': None, 'word': 'which'}, {'code': None, 'count': 10, 'path': None, 'word': 'used'}, {'code': None, 'count': 10, 'path': None, 'word': 'dissociation'}, {'code': None, 'count': 9, 'path': None, 'word': 'american'}, {'code': None, 'count': 9, 'path': None, 'word': 'weak'}, {'code': None, 'count': 9, 'path': None, 'word': 'form'}, {'code': None, 'count': 8, 'path': None, 'word': 'they'}, {'code': None, 'count': 8, 'path': None, 'word': 'use'}, {'code': None, 'count': 8, 'path': None, 'word': 'some'}, {'code': None, 'count': 8, 'path': None, 'word': 'from'}, {'code': None, 'count': 8, 'path': None, 'word': 'lewis'}, {'code': None, 'count': 8, 'path': None, 'word': 'most'}, {'code': None, 'count': 8, 'path': None, 'word': 'strong'}, {'code': None, 'count': 8, 'path': None, 'word': 'example'}, {'code': None, 'count': 7, 'path': None, 'word': 'have'}, {'code': None, 'count': 7, 
'path': None, 'word': 'also'}, {'code': None, 'count': 7, 'path': None, 'word': 'bases'}, {'code': None, 'count': 7, 'path': None, 'word': 'it'}, {'code': None, 'count': 7, 'path': None, 'word': 'constant'}, {'code': None, 'count': 7, 'path': None, 'word': 'l'}, {'code': None, 'count': 6, 'path': None, 'word': 'so'}, {'code': None, 'count': 6, 'path': None, 'word': 's'}, {'code': None, 'count': 6, 'path': None, 'word': 'more'}, {'code': None, 'count': 6, 'path': None, 'word': 'organic'}, {'code': None, 'count': 6, 'path': None, 'word': 'protons'}, {'code': None, 'count': 6, 'path': None, 'word': 'orbital'}, {'code': None, 'count': 6, 'path': None, 'word': 'its'}, {'code': None, 'count': 6, 'path': None, 'word': 'anion'}, {'code': None, 'count': 6, 'path': None, 'word': 'hydrochloric'}, {'code': None, 'count': 6, 'path': None, 'word': 'donate'}, {'code': None, 'count': 6, 'path': None, 'word': 'products'}, {'code': None, 'count': 6, 'path': None, 'word': 'institute'}, {'code': None, 'count': 5, 'path': None, 'word': 'when'}, {'code': None, 'count': 5, 'path': None, 'word': 'called'}, {'code': None, 'count': 5, 'path': None, 'word': 'was'}, {'code': None, 'count': 5, 'path': None, 'word': 'solution'}, {'code': None, 'count': 5, 'path': None, 'word': 'has'}, {'code': None, 'count': 5, 'path': None, 'word': 'according'}, {'code': None, 'count': 5, 'path': None, 'word': 'ion'}, {'code': None, 'count': 5, 'path': None, 'word': 'substances'}, {'code': None, 'count': 5, 'path': None, 'word': 'all'}, {'code': None, 'count': 5, 'path': None, 'word': 'conjugate'}, {'code': None, 'count': 5, 'path': None, 'word': 'these'}, {'code': None, 'count': 5, 'path': None, 'word': 'such'}, {'code': None, 'count': 5, 'path': None, 'word': 'sulfuric'}, {'code': None, 'count': 5, 'path': None, 'word': 'tar'}, {'code': None, 'count': 5, 'path': None, 'word': 'national'}]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91]
create_binary_tree()
display(vocab)
[{'code': [1, 1, 1, 1], 'count': 141, 'path': [90, 89, 87, 83, -92], 'word': 'the'}, {'code': [1, 0, 0, 1], 'count': 112, 'path': [90, 89, 86, 80, -91], 'word': 'acid'}, {'code': [0, 0, 1, 1], 'count': 84, 'path': [90, 88, 84, 77, -90], 'word': 'a'}, {'code': [1, 1, 1, 0, 1], 'count': 71, 'path': [90, 89, 87, 83, 75, -89], 'word': 'and'}, {'code': [1, 1, 0, 1, 1], 'count': 68, 'path': [90, 89, 87, 82, 74, -88], 'word': 'of'}, {'code': [1, 0, 1, 0, 0], 'count': 57, 'path': [90, 89, 86, 81, 71, -87], 'word': 'in'}, {'code': [1, 0, 0, 0, 0], 'count': 53, 'path': [90, 89, 86, 80, 70, -86], 'word': 'is'}, {'code': [0, 1, 1, 0, 1], 'count': 49, 'path': [90, 88, 85, 79, 68, -85], 'word': 'three'}, {'code': [0, 1, 1, 0, 0], 'count': 48, 'path': [90, 88, 85, 79, 68, -84], 'word': 'to'}, {'code': [0, 1, 0, 1, 0], 'count': 47, 'path': [90, 88, 85, 78, 67, -83], 'word': 'two'}, {'code': [1, 1, 1, 0, 0, 1], 'count': 35, 'path': [90, 89, 87, 83, 75, 62, -82], 'word': 'acids'}, {'code': [1, 1, 1, 0, 0, 0], 'count': 34, 'path': [90, 89, 87, 83, 75, 62, -81], 'word': 'are'}, {'code': [1, 1, 0, 1, 0, 0], 'count': 33, 'path': [90, 89, 87, 82, 74, 61, -80], 'word': 'one'}, {'code': [1, 1, 0, 0, 1, 1], 'count': 33, 'path': [90, 89, 87, 82, 73, 60, -79], 'word': 'e'}, {'code': [1, 0, 1, 1, 1, 0], 'count': 30, 'path': [90, 89, 86, 81, 72, 58, -78], 'word': 'zero'}, {'code': [1, 0, 1, 1, 0, 1], 'count': 30, 'path': [90, 89, 86, 81, 72, 57, -77], 'word': 'h'}, {'code': [1, 0, 1, 0, 1, 0], 'count': 28, 'path': [90, 89, 86, 81, 71, 56, -76], 'word': 'for'}, {'code': [0, 1, 1, 1, 1, 0], 'count': 26, 'path': [90, 88, 85, 79, 69, 54, -75], 'word': 'aq'}, {'code': [0, 1, 1, 1, 0, 1], 'count': 26, 'path': [90, 88, 85, 79, 69, 53, -74], 'word': 'k'}, {'code': [0, 1, 0, 0, 1, 0], 'count': 23, 'path': [90, 88, 85, 78, 66, 51, -73], 'word': 'an'}, {'code': [0, 0, 1, 0, 1, 0], 'count': 21, 'path': [90, 88, 84, 77, 65, 49, -72], 'word': 'that'}, {'code': [0, 0, 0, 1, 0, 0], 'count': 19, 'path': [90, 88, 84, 76, 64, 46, -71], 'word': 'asphalt'}, {'code': [0, 0, 0, 0, 1, 1], 'count': 19, 'path': [90, 88, 84, 76, 63, 45, -70], 'word': 'standards'}, {'code': [0, 0, 0, 0, 1, 0], 'count': 18, 'path': [90, 88, 84, 76, 63, 45, -69], 'word': 'o'}, {'code': [0, 0, 0, 0, 0, 1], 'count': 18, 'path': [90, 88, 84, 76, 63, 44, -68], 'word': 'as'}, {'code': [1, 1, 0, 1, 0, 1, 1], 'count': 17, 'path': [90, 89, 87, 82, 74, 61, 43, -67], 'word': 'abacus'}, {'code': [1, 1, 0, 1, 0, 1, 0], 'count': 17, 'path': [90, 89, 87, 82, 74, 61, 43, -66], 'word': 'this'}, {'code': [1, 1, 0, 0, 1, 0, 0], 'count': 16, 'path': [90, 89, 87, 82, 73, 60, 42, -65], 'word': 'by'}, {'code': [1, 1, 0, 0, 0, 1, 1], 'count': 16, 'path': [90, 89, 87, 82, 73, 59, 41, -64], 'word': 'with'}, {'code': [1, 1, 0, 0, 0, 1, 0], 'count': 16, 'path': [90, 89, 87, 82, 73, 59, 41, -63], 'word': 'can'}, {'code': [1, 0, 1, 1, 1, 1, 0], 'count': 15, 'path': [90, 89, 86, 81, 72, 58, 39, -62], 'word': 'or'}, {'code': [1, 0, 1, 1, 0, 0, 1], 'count': 15, 'path': [90, 89, 86, 81, 72, 57, 38, -61], 'word': 'five'}, {'code': [1, 0, 1, 1, 0, 0, 0], 'count': 15, 'path': [90, 89, 86, 81, 72, 57, 38, -60], 'word': 'four'}, {'code': [1, 0, 1, 0, 1, 1, 1], 'count': 15, 'path': [90, 89, 86, 81, 71, 56, 37, -59], 'word': 'six'}, {'code': [1, 0, 0, 0, 1, 1, 1], 'count': 14, 'path': [90, 89, 86, 80, 70, 55, 36, -58], 'word': 'base'}, {'code': [1, 0, 0, 0, 1, 1, 0], 'count': 14, 'path': [90, 89, 86, 80, 70, 55, 36, -57], 'word': 'nine'}, {'code': [1, 0, 0, 0, 1, 0, 1], 'count': 14, 'path': [90, 89, 86, 
80, 70, 55, 35, -56], 'word': 'ansi'}, {'code': [0, 1, 1, 1, 1, 1, 0], 'count': 13, 'path': [90, 88, 85, 79, 69, 54, 34, -55], 'word': 'water'}, {'code': [0, 1, 1, 1, 0, 0, 0], 'count': 12, 'path': [90, 88, 85, 79, 69, 53, 33, -54], 'word': 'definition'}, {'code': [0, 1, 0, 1, 1, 1, 1], 'count': 12, 'path': [90, 88, 85, 78, 67, 52, 32, -53], 'word': 'found'}, {'code': [0, 1, 0, 0, 0, 1, 1], 'count': 11, 'path': [90, 88, 85, 78, 66, 50, 29, -52], 'word': 'be'}, {'code': [0, 1, 0, 0, 0, 1, 0], 'count': 11, 'path': [90, 88, 85, 78, 66, 50, 29, -51], 'word': 'ha'}, {'code': [0, 1, 0, 0, 0, 0, 1], 'count': 11, 'path': [90, 88, 85, 78, 66, 50, 28, -50], 'word': 'seven'}, {'code': [0, 1, 0, 0, 0, 0, 0], 'count': 11, 'path': [90, 88, 85, 78, 66, 50, 28, -49], 'word': 'eight'}, {'code': [0, 0, 1, 0, 1, 1, 1], 'count': 11, 'path': [90, 88, 84, 77, 65, 49, 27, -48], 'word': 'proton'}, {'code': [0, 0, 1, 0, 1, 1, 0], 'count': 11, 'path': [90, 88, 84, 77, 65, 49, 27, -47], 'word': 'which'}, {'code': [0, 0, 1, 0, 0, 1, 0], 'count': 10, 'path': [90, 88, 84, 77, 65, 48, 26, -46], 'word': 'used'}, {'code': [0, 0, 1, 0, 0, 0, 1], 'count': 10, 'path': [90, 88, 84, 77, 65, 48, 25, -45], 'word': 'dissociation'}, {'code': [0, 0, 0, 0, 0, 0, 1], 'count': 9, 'path': [90, 88, 84, 76, 63, 44, 21, -44], 'word': 'american'}, {'code': [0, 0, 0, 0, 0, 0, 0], 'count': 9, 'path': [90, 88, 84, 76, 63, 44, 21, -43], 'word': 'weak'}, {'code': [1, 1, 0, 0, 1, 0, 1, 1], 'count': 9, 'path': [90, 89, 87, 82, 73, 60, 42, 20, -42], 'word': 'form'}, {'code': [1, 1, 0, 0, 1, 0, 1, 0], 'count': 8, 'path': [90, 89, 87, 82, 73, 60, 42, 20, -41], 'word': 'they'}, {'code': [1, 1, 0, 0, 0, 0, 1, 1], 'count': 8, 'path': [90, 89, 87, 82, 73, 59, 40, 19, -40], 'word': 'use'}, {'code': [1, 1, 0, 0, 0, 0, 1, 0], 'count': 8, 'path': [90, 89, 87, 82, 73, 59, 40, 19, -39], 'word': 'some'}, {'code': [1, 1, 0, 0, 0, 0, 0, 1], 'count': 8, 'path': [90, 89, 87, 82, 73, 59, 40, 18, -38], 'word': 'from'}, {'code': [1, 1, 0, 0, 0, 0, 0, 0], 'count': 8, 'path': [90, 89, 87, 82, 73, 59, 40, 18, -37], 'word': 'lewis'}, {'code': [1, 0, 1, 1, 1, 1, 1, 1], 'count': 8, 'path': [90, 89, 86, 81, 72, 58, 39, 17, -36], 'word': 'most'}, {'code': [1, 0, 1, 1, 1, 1, 1, 0], 'count': 8, 'path': [90, 89, 86, 81, 72, 58, 39, 17, -35], 'word': 'strong'}, {'code': [1, 0, 1, 0, 1, 1, 0, 1], 'count': 8, 'path': [90, 89, 86, 81, 71, 56, 37, 16, -34], 'word': 'example'}, {'code': [1, 0, 1, 0, 1, 1, 0, 0], 'count': 7, 'path': [90, 89, 86, 81, 71, 56, 37, 16, -33], 'word': 'have'}, {'code': [1, 0, 0, 0, 1, 0, 0, 1], 'count': 7, 'path': [90, 89, 86, 80, 70, 55, 35, 15, -32], 'word': 'also'}, {'code': [1, 0, 0, 0, 1, 0, 0, 0], 'count': 7, 'path': [90, 89, 86, 80, 70, 55, 35, 15, -31], 'word': 'bases'}, {'code': [0, 1, 1, 1, 1, 1, 1, 1], 'count': 7, 'path': [90, 88, 85, 79, 69, 54, 34, 14, -30], 'word': 'it'}, {'code': [0, 1, 1, 1, 1, 1, 1, 0], 'count': 7, 'path': [90, 88, 85, 79, 69, 54, 34, 14, -29], 'word': 'constant'}, {'code': [0, 1, 1, 1, 0, 0, 1, 1], 'count': 7, 'path': [90, 88, 85, 79, 69, 53, 33, 13, -28], 'word': 'l'}, {'code': [0, 1, 1, 1, 0, 0, 1, 0], 'count': 6, 'path': [90, 88, 85, 79, 69, 53, 33, 13, -27], 'word': 'so'}, {'code': [0, 1, 0, 1, 1, 1, 0, 1], 'count': 6, 'path': [90, 88, 85, 78, 67, 52, 32, 12, -26], 'word': 's'}, {'code': [0, 1, 0, 1, 1, 1, 0, 0], 'count': 6, 'path': [90, 88, 85, 78, 67, 52, 32, 12, -25], 'word': 'more'}, {'code': [0, 1, 0, 1, 1, 0, 1, 1], 'count': 6, 'path': [90, 88, 85, 78, 67, 52, 31, 11, -24], 'word': 'organic'}, {'code': [0, 1, 0, 
1, 1, 0, 1, 0], 'count': 6, 'path': [90, 88, 85, 78, 67, 52, 31, 11, -23], 'word': 'protons'}, {'code': [0, 1, 0, 1, 1, 0, 0, 1], 'count': 6, 'path': [90, 88, 85, 78, 67, 52, 31, 10, -22], 'word': 'orbital'}, {'code': [0, 1, 0, 1, 1, 0, 0, 0], 'count': 6, 'path': [90, 88, 85, 78, 67, 52, 31, 10, -21], 'word': 'its'}, {'code': [0, 1, 0, 0, 1, 1, 1, 1], 'count': 6, 'path': [90, 88, 85, 78, 66, 51, 30, 9, -20], 'word': 'anion'}, {'code': [0, 1, 0, 0, 1, 1, 1, 0], 'count': 6, 'path': [90, 88, 85, 78, 66, 51, 30, 9, -19], 'word': 'hydrochloric'}, {'code': [0, 1, 0, 0, 1, 1, 0, 1], 'count': 6, 'path': [90, 88, 85, 78, 66, 51, 30, 8, -18], 'word': 'donate'}, {'code': [0, 1, 0, 0, 1, 1, 0, 0], 'count': 6, 'path': [90, 88, 85, 78, 66, 51, 30, 8, -17], 'word': 'products'}, {'code': [0, 0, 1, 0, 0, 1, 1, 1], 'count': 6, 'path': [90, 88, 84, 77, 65, 48, 26, 7, -16], 'word': 'institute'}, {'code': [0, 0, 1, 0, 0, 1, 1, 0], 'count': 5, 'path': [90, 88, 84, 77, 65, 48, 26, 7, -15], 'word': 'when'}, {'code': [0, 0, 1, 0, 0, 0, 0, 1], 'count': 5, 'path': [90, 88, 84, 77, 65, 48, 25, 6, -14], 'word': 'called'}, {'code': [0, 0, 1, 0, 0, 0, 0, 0], 'count': 5, 'path': [90, 88, 84, 77, 65, 48, 25, 6, -13], 'word': 'was'}, {'code': [0, 0, 0, 1, 1, 1, 1, 1], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 24, 5, -12], 'word': 'solution'}, {'code': [0, 0, 0, 1, 1, 1, 1, 0], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 24, 5, -11], 'word': 'has'}, {'code': [0, 0, 0, 1, 1, 1, 0, 1], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 24, 4, -10], 'word': 'according'}, {'code': [0, 0, 0, 1, 1, 1, 0, 0], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 24, 4, -9], 'word': 'ion'}, {'code': [0, 0, 0, 1, 1, 0, 1, 1], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 23, 3, -8], 'word': 'substances'}, {'code': [0, 0, 0, 1, 1, 0, 1, 0], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 23, 3, -7], 'word': 'all'}, {'code': [0, 0, 0, 1, 1, 0, 0, 1], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 23, 2, -6], 'word': 'conjugate'}, {'code': [0, 0, 0, 1, 1, 0, 0, 0], 'count': 5, 'path': [90, 88, 84, 76, 64, 47, 23, 2, -5], 'word': 'these'}, {'code': [0, 0, 0, 1, 0, 1, 1, 1], 'count': 5, 'path': [90, 88, 84, 76, 64, 46, 22, 1, -4], 'word': 'such'}, {'code': [0, 0, 0, 1, 0, 1, 1, 0], 'count': 5, 'path': [90, 88, 84, 76, 64, 46, 22, 1, -3], 'word': 'sulfuric'}, {'code': [0, 0, 0, 1, 0, 1, 0, 1], 'count': 5, 'path': [90, 88, 84, 76, 64, 46, 22, 0, -2], 'word': 'tar'}, {'code': [0, 0, 0, 1, 0, 1, 0, 0], 'count': 5, 'path': [90, 88, 84, 76, 64, 46, 22, 0, -1], 'word': 'national'}]
%pylab inline
g = inspect_vocab_tree(vocab)
Populating the interactive namespace from numpy and matplotlib
init_net()
/usr/lib/python2.7/dist-packages/numpy/ctypeslib.py:411: RuntimeWarning: Item size computed from the PEP 3118 buffer format string does not match the actual item size. return array(obj, copy=False)