from collections import namedtuple
import numpy as np
import mmap
import re
from IPython.display import display
## data structure
## each vocab entry is a plain dict (mutable) with keys
## 'count', 'path', 'word', 'code' -- replaces the namedtuple below
#VocabWord = namedtuple('VocabWord', ['count', 'path', 'word', 'code'])
## control flags
flags = {
'debug_mode': 2      # 0 = quiet, 1 = final summary, 2 = progress every 1M words
, 'hs': True         # allocate hierarchical-softmax weights (syn1) in init_net
, 'negative': True   # allocate negative-sampling weights (syn1neg) in init_net
}
## constants
MAX_STRING = 100              # max word length (carried over from the C original)
EXP_TABLE_SIZE = 1000         # resolution of the precomputed sigmoid table
MAX_EXP = 6                   # sigmoid precomputed over [-MAX_EXP, MAX_EXP]
##
#VOCAB_MAX_SIZE = 1000
VOCAB_HASH_SIZE = 30000000    # number of slots in the open-addressed vocab_hash
n_train_words = 0             # total training-word count, set by sort_vocab()
##
MIN_COUNT = 5                 # sort_vocab() discards words rarer than this
MIN_REDUCE = 1                # reduce_vocab() pruning threshold (grows per call)
MAX_SENTENCE_LENGTH = 1000    # not used in this chunk
MAX_CODE_LENGTH = 40          # upper bound on a Huffman code length
##
WINDOW = 5                    # context window size (not used in this chunk)
LAYER1_SIZE = 100             # embedding dimensionality (rows of syn0/syn1)
ALPHA = 0.025                 # initial learning rate (not used in this chunk)
##
TABLE_SIZE = 1e8              # unigram-table size; NOTE: a float literal -- cast before use
REAL = "float"                # numpy dtype string for the weight matrices
HASH_TYPE = np.int64          # numpy dtype for index/count arrays
## prelocated data storage
#vocab = np.empty(VOCAB_MAX_SIZE, dtype = vocab_word)
#vocab = np.array([], dtype = np.object)
## the first element in vocab is a faked element </s>
vocab = []
## open-addressed hash table: vocab_hash[h] = index into vocab, -1 = empty slot
vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = HASH_TYPE)
vocab_hash.fill(-1)
## precompute the sigmoid: exp_table[i] = sigmoid((i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP)
exp_table = np.arange(start = 0, stop = EXP_TABLE_SIZE,
step = 1, dtype = REAL)
exp_table = np.exp((exp_table / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)
exp_table = exp_table / (exp_table + 1.)    # e^x / (e^x + 1) == 1 / (1 + e^-x)
## network weights; allocated for real in init_net()
syn0 = np.array([], dtype = REAL)
syn1 = np.array([], dtype = REAL)
syn1neg = np.array([], dtype = REAL)
def get_word_hash(word):
    """
    Polynomial hash of `word` (base 257, first character most
    significant), reduced modulo VOCAB_HASH_SIZE.
    """
    accum = 0
    for ch in word:
        # Horner's method: same value as sum(ord(c) * 257**power)
        accum = accum * 257 + ord(ch)
    return accum % VOCAB_HASH_SIZE
def add_vocab_hash(word_hash, word_index):
    """
    Store `word_index` in the open-addressed vocab_hash, linearly
    probing forward from slot `word_hash` until an empty (-1) slot
    is found.
    """
    global vocab_hash
    slot = word_hash
    # linear probing with wrap-around
    while vocab_hash[slot] != -1:
        slot = (slot + 1) % VOCAB_HASH_SIZE
    vocab_hash[slot] = word_index
def search_vocab(word):
    """
    Look up `word` in vocab via the open-addressed vocab_hash.
    Returns the word's index in vocab, or -1 when absent.
    """
    slot = get_word_hash(word)
    # probe until we hit an empty slot (word absent) or a match
    while vocab_hash[slot] != -1:
        candidate = vocab_hash[slot]
        if vocab[candidate]['word'] == word:
            return candidate
        slot = (slot + 1) % VOCAB_HASH_SIZE
    return -1
def reduce_vocab():
    """
    Reduce the vocabulary size by removing infrequent tokens.

    Drops every word whose count is <= MIN_REDUCE, compacts vocab in
    place, rebuilds vocab_hash, then raises MIN_REDUCE so repeated
    calls prune progressively more aggressively (as in the C original).
    """
    ## BUGFIX: MIN_REDUCE must be declared global -- it is read in the
    ## loop and reassigned at the end; without the declaration both the
    ## comparison and the += raise UnboundLocalError.
    global vocab, vocab_hash, MIN_REDUCE
    ## in-place remove infrequent words (only count/word are copied;
    ## path/code are still None at this stage)
    kept = 0
    for a in range(len(vocab)):
        if vocab[a]['count'] > MIN_REDUCE:
            vocab[kept]['count'] = vocab[a]['count']
            vocab[kept]['word'] = vocab[a]['word']
            kept += 1
    vocab = vocab[:kept]
    ## reset and rebuild the hash table over the survivors
    vocab_hash.fill(-1)
    for word_index, vocab_word in enumerate(vocab):
        word_hash = get_word_hash(vocab_word['word'])
        add_vocab_hash(word_hash, word_index)
    MIN_REDUCE += 1
def add_word_to_vocab(word):
    """
    Append a fresh entry {'count', 'path', 'word', 'code'} for `word`
    to vocab, register its index in vocab_hash, and return that index.
    """
    global vocab, vocab_hash
    entry = {'count': 0, 'path': None, 'word': word, 'code': None}
    vocab.append(entry)
    new_index = len(vocab) - 1
    add_vocab_hash(get_word_hash(word), new_index)
    return new_index
def sort_vocab():
    """
    Sort the vocabulary by descending frequency and drop rare words.

    '</s>' stays at index 0 with count 0; it is NOT hashed, so its
    slots in vocab_hash remain -1. Words with count < MIN_COUNT are
    discarded. Rebuilds vocab_hash and recomputes n_train_words.
    """
    global vocab, vocab_hash, n_train_words
    ## sort everything after the '</s>' sentinel, most frequent first
    head, rest = vocab[:1], vocab[1:]
    rest.sort(key = lambda entry: entry['count'], reverse = True)
    vocab = head + rest
    ## rebuild the hash over the surviving words only
    vocab_hash.fill(-1)
    n_train_words = 0
    survivors = len(vocab)
    for idx, entry in enumerate(vocab):
        if entry['count'] < MIN_COUNT:
            ## rare word: excluded from the hash and from the final list
            survivors -= 1
        else:
            add_vocab_hash(get_word_hash(entry['word']), idx)
            n_train_words += entry['count']
    ## +1 keeps '</s>' (count 0) even though it was "discarded" above,
    ## mirroring the quirky accounting of the C original
    vocab = vocab[:(survivors + 1)]
def read_word(fpath):
    """
    Lazily yield words from a whitespace-delimited file.

    Uses mmap so the OS virtual-memory system pages the file in on
    demand; ONLY TESTED on POSIX. Unlike the original C version, we do
    NOT insert '</s>' for every newline (the training file contains no
    embedded newlines).

    fpath: path of the training file.
    yields: each word as text (utf-8 decoded).
    """
    with open(fpath, 'rb') as fin:
        mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
        try:
            ## BUGFIX: mmap exposes a bytes buffer, so the pattern must
            ## be a bytes regex (a str pattern raises TypeError on
            ## Python 3; the bytes literal is identical under Python 2).
            for match in re.finditer(br'(.*?)\s', mf):
                token = match.group(1)
                if token:
                    ## decode so callers compare/hash plain text words
                    yield token.decode('utf-8')
        finally:
            ## BUGFIX: close the map even if the generator is abandoned
            mf.close()
def learn_vocab_from_file(fpath):
"""
fpath: path of train file,
train file is a collection of words delimited by whitespace
modify: vocab - np.array of dtype=vocab_word
vocab_hash - hashed index of words in vocab
"""
global vocab, vocab_hash
## prelocate data structure
vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = np.int32)
vocab_hash.fill(-1)
## pivot word
add_word_to_vocab("</s>")
for i, word in enumerate(read_word(fpath)):
if flags['debug_mode'] > 1 and i % 1000000 == 0:
print "%iM" % (i/1000000)
## find the word's vocab_index by using vocab_hash
word_index = search_vocab(word)
## add new word
if word_index == -1:
word_index = add_word_to_vocab(word)
vocab[word_index]['count'] = 1
else:
vocab[word_index]['count'] += 1
## increase vocab_hash size
if len(vocab) > VOCAB_HASH_SIZE * 0.7:
reduce_vocab()
sort_vocab()
if flags['debug_mode'] > 0:
print 'Vocab Size: %i\nWords in train file: %d' % (len(vocab), i)
def init_net():
    """
    Allocate the network weight matrices, each len(vocab) x LAYER1_SIZE.

    syn0    - input embeddings, uniform in [-0.5, 0.5] / LAYER1_SIZE
    syn1    - hierarchical-softmax weights (zeros), only if flags['hs']
    syn1neg - negative-sampling weights (zeros), only if flags['negative']
    """
    global syn0, syn1, syn1neg
    shape = (len(vocab), LAYER1_SIZE)
    bound = .5 / LAYER1_SIZE
    syn0 = np.random.uniform(low = -bound, high = bound,
                             size = shape).astype(REAL)
    if flags['hs']:
        syn1 = np.zeros(shape, dtype = REAL)
    if flags['negative']:
        syn1neg = np.zeros(shape, dtype = REAL)
def create_binary_tree():
    """
    Build the Huffman tree over the vocabulary by word counts.

    Intended end result (mirroring C word2vec's CreateBinaryTree):
      word['code'] - the word's binary Huffman code (frequent = shorter)
      word['path'] - inner-node indices from the root to the word's leaf
    NOTE(review): this port stops after constructing the tree topology;
    the final pass that walks parent_node to fill each word's 'code'
    and 'path' has not been written yet, so the function currently has
    no observable effect.
    """
    global vocab
    ## A full binary tree with n leaves has at most n-1 internal nodes,
    ## so node arrays are sized 2n-1 (the C original over-allocates 2n+1;
    ## this port also offsets internal-node indices by -1 -- see below).
    vocab_size = len(vocab)
    ## node counts; internal slots start at "infinity" so they cannot be
    ## selected before they are produced
    count = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
    count.fill(1e15)
    count[:vocab_size] = [vw['count'] for vw in vocab]
    ## binary[i] - 0/1 label of node i relative to its parent
    binary = np.zeros(vocab_size*2-1, dtype = np.int8)
    ## parent_node[i] - index of node i's parent
    parent_node = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
    ## Huffman construction: vocab is sorted by descending count, so the
    ## two smallest remaining nodes are found by walking pos1 left over
    ## the leaves and pos2 right over the newly created internal nodes.
    pos1, pos2 = vocab_size-2, vocab_size-1
    for a in range(vocab_size-1):
        ## min1i: smallest remaining node
        if pos1 >= 0:
            if count[pos1] < count[pos2]:
                min1i, pos1 = pos1, pos1-1
            else:
                min1i, pos2 = pos2, pos2+1
        else:
            min1i, pos2 = pos2, pos2+1
        ## min2i: second-smallest remaining node
        if pos1 >= 0:
            if count[pos1] < count[pos2]:
                min2i, pos1 = pos1, pos1-1
            else:
                min2i, pos2 = pos2, pos2+1
        else:
            min2i, pos2 = pos2, pos2+1
        ## merge the two minima into internal node vocab_size + a - 1
        ## BUGFIX: was count[min1li] -- a NameError typo for min1i
        count[vocab_size + a - 1] = count[min1i] + count[min2i]
        ## record the tree structure (these arrays were allocated but
        ## never written); offset matches the count[] write above --
        ## NOTE(review): confirm against the C original's indexing
        parent_node[min1i] = vocab_size + a - 1
        parent_node[min2i] = vocab_size + a - 1
        binary[min2i] = 1
## demo driver: build the vocabulary from a sample corpus.
## Guarded so importing this module does not trigger file I/O.
if __name__ == "__main__":
    learn_vocab_from_file('data/text_simple')
    ## the bare `vocab` expression only echoed in a notebook; use the
    ## imported IPython display helper to reproduce that behaviour
    display(vocab)
## BUGFIX: the raw notebook output that had been pasted below was not
## valid Python and broke the file; it is preserved here as comments.
# 0M Vocab Size: 9 Words in train file: 178
# [{'code': None, 'count': 0, 'path': None, 'word': '</s>'}, {'code': None, 'count': 12, 'path': None, 'word': 'and'}, {'code': None, 'count': 11, 'path': None, 'word': 'the'}, {'code': None, 'count': 10, 'path': None, 'word': 'four'}, {'code': None, 'count': 8, 'path': None, 'word': 'in'}, {'code': None, 'count': 5, 'path': None, 'word': 'used'}, {'code': None, 'count': 5, 'path': None, 'word': 'war'}, {'code': None, 'count': 5, 'path': None, 'word': 'one'}, {'code': None, 'count': 5, 'path': None, 'word': 'nine'}]