from collections import namedtuple
import numpy as np
import mmap
import re
from IPython.display import display
## data structure
## each vocab entry is a plain dict (mutable) with keys
## 'count', 'path', 'word', 'code' -- replaces the namedtuple below
#VocabWord = namedtuple('VocabWord', ['count', 'path', 'word', 'code'])
## control flags
flags = {
'debug_mode': 2      # 0 = quiet, 1 = final summary, 2 = progress every 1M words
, 'hs': True         # allocate hierarchical-softmax weights (syn1) in init_net
, 'negative': True   # allocate negative-sampling weights (syn1neg) in init_net
}
## constants
MAX_STRING = 100              # max word length (carried over from the C original)
EXP_TABLE_SIZE = 1000         # resolution of the precomputed sigmoid table
MAX_EXP = 6                   # sigmoid precomputed over [-MAX_EXP, MAX_EXP]
##
#VOCAB_MAX_SIZE = 1000
VOCAB_HASH_SIZE = 30000000    # number of slots in the open-addressed vocab_hash
n_train_words = 0             # total training-word count, set by sort_vocab()
##
MIN_COUNT = 5                 # sort_vocab() discards words rarer than this
MIN_REDUCE = 1                # reduce_vocab() pruning threshold (grows per call)
MAX_SENTENCE_LENGTH = 1000    # not used in this chunk
MAX_CODE_LENGTH = 40          # upper bound on a Huffman code length
##
WINDOW = 5                    # context window size (not used in this chunk)
LAYER1_SIZE = 100             # embedding dimensionality (rows of syn0/syn1)
ALPHA = 0.025                 # initial learning rate (not used in this chunk)
##
TABLE_SIZE = 1e8              # unigram-table size; NOTE: a float literal -- cast before use
REAL = "float"                # numpy dtype string for the weight matrices
HASH_TYPE = np.int64          # numpy dtype for index/count arrays
## prelocated data storage
#vocab = np.empty(VOCAB_MAX_SIZE, dtype = vocab_word)
#vocab = np.array([], dtype = np.object)
## the first element in vocab is a faked element </s>
vocab = []
## open-addressed hash table: vocab_hash[h] = index into vocab, -1 = empty slot
vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = HASH_TYPE)
vocab_hash.fill(-1)
## precompute the sigmoid: exp_table[i] = sigmoid((i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP)
exp_table = np.arange(start = 0, stop = EXP_TABLE_SIZE,
step = 1, dtype = REAL)
exp_table = np.exp((exp_table / EXP_TABLE_SIZE * 2. - 1.) * MAX_EXP)
exp_table = exp_table / (exp_table + 1.)    # e^x / (e^x + 1) == 1 / (1 + e^-x)
## network weights; allocated for real in init_net()
syn0 = np.array([], dtype = REAL)
syn1 = np.array([], dtype = REAL)
syn1neg = np.array([], dtype = REAL)
def get_word_hash(word):
    """
    Polynomial hash of `word` (base 257, first character most
    significant), reduced modulo VOCAB_HASH_SIZE.
    """
    accum = 0
    for ch in word:
        # Horner's method: same value as sum(ord(c) * 257**power)
        accum = accum * 257 + ord(ch)
    return accum % VOCAB_HASH_SIZE
def add_vocab_hash(word_hash, word_index):
    """
    Store `word_index` in the open-addressed vocab_hash, linearly
    probing forward from slot `word_hash` until an empty (-1) slot
    is found.
    """
    global vocab_hash
    slot = word_hash
    # linear probing with wrap-around
    while vocab_hash[slot] != -1:
        slot = (slot + 1) % VOCAB_HASH_SIZE
    vocab_hash[slot] = word_index
def search_vocab(word):
    """
    Look up `word` in vocab via the open-addressed vocab_hash.
    Returns the word's index in vocab, or -1 when absent.
    """
    slot = get_word_hash(word)
    # probe until we hit an empty slot (word absent) or a match
    while vocab_hash[slot] != -1:
        candidate = vocab_hash[slot]
        if vocab[candidate]['word'] == word:
            return candidate
        slot = (slot + 1) % VOCAB_HASH_SIZE
    return -1
def reduce_vocab():
    """
    Reduce the vocabulary size by removing infrequent tokens.

    Drops every word whose count is <= MIN_REDUCE, compacts vocab in
    place, rebuilds vocab_hash, then raises MIN_REDUCE so repeated
    calls prune progressively more aggressively (as in the C original).
    """
    ## BUGFIX: MIN_REDUCE must be declared global -- it is read in the
    ## loop and reassigned at the end; without the declaration both the
    ## comparison and the += raise UnboundLocalError.
    global vocab, vocab_hash, MIN_REDUCE
    ## in-place remove infrequent words (only count/word are copied;
    ## path/code are still None at this stage)
    kept = 0
    for a in range(len(vocab)):
        if vocab[a]['count'] > MIN_REDUCE:
            vocab[kept]['count'] = vocab[a]['count']
            vocab[kept]['word'] = vocab[a]['word']
            kept += 1
    vocab = vocab[:kept]
    ## reset and rebuild the hash table over the survivors
    vocab_hash.fill(-1)
    for word_index, vocab_word in enumerate(vocab):
        word_hash = get_word_hash(vocab_word['word'])
        add_vocab_hash(word_hash, word_index)
    MIN_REDUCE += 1
def add_word_to_vocab(word):
    """
    Append a fresh entry {'count', 'path', 'word', 'code'} for `word`
    to vocab, register its index in vocab_hash, and return that index.
    """
    global vocab, vocab_hash
    entry = {'count': 0, 'path': None, 'word': word, 'code': None}
    vocab.append(entry)
    new_index = len(vocab) - 1
    add_vocab_hash(get_word_hash(word), new_index)
    return new_index
def sort_vocab():
    """
    Sort the vocabulary by descending frequency and drop rare words.

    '</s>' stays at index 0 with count 0; it is NOT hashed, so its
    slots in vocab_hash remain -1. Words with count < MIN_COUNT are
    discarded. Rebuilds vocab_hash and recomputes n_train_words.
    """
    global vocab, vocab_hash, n_train_words
    ## sort everything after the '</s>' sentinel, most frequent first
    head, rest = vocab[:1], vocab[1:]
    rest.sort(key = lambda entry: entry['count'], reverse = True)
    vocab = head + rest
    ## rebuild the hash over the surviving words only
    vocab_hash.fill(-1)
    n_train_words = 0
    survivors = len(vocab)
    for idx, entry in enumerate(vocab):
        if entry['count'] < MIN_COUNT:
            ## rare word: excluded from the hash and from the final list
            survivors -= 1
        else:
            add_vocab_hash(get_word_hash(entry['word']), idx)
            n_train_words += entry['count']
    ## +1 keeps '</s>' (count 0) even though it was "discarded" above,
    ## mirroring the quirky accounting of the C original
    vocab = vocab[:(survivors + 1)]
def read_word(fpath):
    """
    Lazily yield words from a whitespace-delimited file.

    Uses mmap so the OS virtual-memory system pages the file in on
    demand; ONLY TESTED on POSIX. Unlike the original C version, we do
    NOT insert '</s>' for every newline (the training file contains no
    embedded newlines).

    fpath: path of the training file.
    yields: each word as text (utf-8 decoded).
    """
    with open(fpath, 'rb') as fin:
        mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
        try:
            ## BUGFIX: mmap exposes a bytes buffer, so the pattern must
            ## be a bytes regex (a str pattern raises TypeError on
            ## Python 3; the bytes literal is identical under Python 2).
            for match in re.finditer(br'(.*?)\s', mf):
                token = match.group(1)
                if token:
                    ## decode so callers compare/hash plain text words
                    yield token.decode('utf-8')
        finally:
            ## BUGFIX: close the map even if the generator is abandoned
            mf.close()
def learn_vocab_from_file(fpath):
"""
fpath: path of train file,
train file is a collection of words delimited by whitespace
modify: vocab - np.array of dtype=vocab_word
vocab_hash - hashed index of words in vocab
"""
global vocab, vocab_hash
## prelocate data structure
vocab_hash = np.empty(VOCAB_HASH_SIZE, dtype = np.int32)
vocab_hash.fill(-1)
## pivot word
add_word_to_vocab("</s>")
for i, word in enumerate(read_word(fpath)):
if flags['debug_mode'] > 1 and i % 1000000 == 0:
print "%iM" % (i/1000000)
## find the word's vocab_index by using vocab_hash
word_index = search_vocab(word)
## add new word
if word_index == -1:
word_index = add_word_to_vocab(word)
vocab[word_index]['count'] = 1
else:
vocab[word_index]['count'] += 1
## increase vocab_hash size
if len(vocab) > VOCAB_HASH_SIZE * 0.7:
reduce_vocab()
sort_vocab()
if flags['debug_mode'] > 0:
print 'Vocab Size: %i\nWords in train file: %d' % (len(vocab), i)
def init_net():
    """
    Allocate the network weight matrices, each len(vocab) x LAYER1_SIZE.

    syn0    - input embeddings, uniform in [-0.5, 0.5] / LAYER1_SIZE
    syn1    - hierarchical-softmax weights (zeros), only if flags['hs']
    syn1neg - negative-sampling weights (zeros), only if flags['negative']
    """
    global syn0, syn1, syn1neg
    shape = (len(vocab), LAYER1_SIZE)
    bound = .5 / LAYER1_SIZE
    syn0 = np.random.uniform(low = -bound, high = bound,
                             size = shape).astype(REAL)
    if flags['hs']:
        syn1 = np.zeros(shape, dtype = REAL)
    if flags['negative']:
        syn1neg = np.zeros(shape, dtype = REAL)
def create_binary_tree():
    """
    Build the Huffman tree over the vocabulary by word counts.

    Intended end result (mirroring C word2vec's CreateBinaryTree):
      word['code'] - the word's binary Huffman code (frequent = shorter)
      word['path'] - inner-node indices from the root to the word's leaf
    NOTE(review): this port stops after constructing the tree topology;
    the final pass that walks parent_node to fill each word's 'code'
    and 'path' has not been written yet, so the function currently has
    no observable effect.
    """
    global vocab
    ## A full binary tree with n leaves has at most n-1 internal nodes,
    ## so node arrays are sized 2n-1 (the C original over-allocates 2n+1;
    ## this port also offsets internal-node indices by -1 -- see below).
    vocab_size = len(vocab)
    ## node counts; internal slots start at "infinity" so they cannot be
    ## selected before they are produced
    count = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
    count.fill(1e15)
    count[:vocab_size] = [vw['count'] for vw in vocab]
    ## binary[i] - 0/1 label of node i relative to its parent
    binary = np.zeros(vocab_size*2-1, dtype = np.int8)
    ## parent_node[i] - index of node i's parent
    parent_node = np.empty(vocab_size*2-1, dtype=HASH_TYPE)
    ## Huffman construction: vocab is sorted by descending count, so the
    ## two smallest remaining nodes are found by walking pos1 left over
    ## the leaves and pos2 right over the newly created internal nodes.
    pos1, pos2 = vocab_size-2, vocab_size-1
    for a in range(vocab_size-1):
        ## min1i: smallest remaining node
        if pos1 >= 0:
            if count[pos1] < count[pos2]:
                min1i, pos1 = pos1, pos1-1
            else:
                min1i, pos2 = pos2, pos2+1
        else:
            min1i, pos2 = pos2, pos2+1
        ## min2i: second-smallest remaining node
        if pos1 >= 0:
            if count[pos1] < count[pos2]:
                min2i, pos1 = pos1, pos1-1
            else:
                min2i, pos2 = pos2, pos2+1
        else:
            min2i, pos2 = pos2, pos2+1
        ## merge the two minima into internal node vocab_size + a - 1
        ## BUGFIX: was count[min1li] -- a NameError typo for min1i
        count[vocab_size + a - 1] = count[min1i] + count[min2i]
        ## record the tree structure (these arrays were allocated but
        ## never written); offset matches the count[] write above --
        ## NOTE(review): confirm against the C original's indexing
        parent_node[min1i] = vocab_size + a - 1
        parent_node[min2i] = vocab_size + a - 1
        binary[min2i] = 1
## demo driver: build the vocabulary from a sample corpus.
## Guarded so importing this module does not trigger file I/O.
if __name__ == "__main__":
    learn_vocab_from_file('data/text_simple')
    ## the bare `vocab` expression only echoed in a notebook; use the
    ## imported IPython display helper to reproduce that behaviour
    display(vocab)
## BUGFIX: the raw notebook output that had been pasted below was not
## valid Python and broke the file; it is preserved here as comments.
# 0M Vocab Size: 9 Words in train file: 178
# [{'code': None, 'count': 0, 'path': None, 'word': '</s>'}, {'code': None, 'count': 12, 'path': None, 'word': 'and'}, {'code': None, 'count': 11, 'path': None, 'word': 'the'}, {'code': None, 'count': 10, 'path': None, 'word': 'four'}, {'code': None, 'count': 8, 'path': None, 'word': 'in'}, {'code': None, 'count': 5, 'path': None, 'word': 'used'}, {'code': None, 'count': 5, 'path': None, 'word': 'war'}, {'code': None, 'count': 5, 'path': None, 'word': 'one'}, {'code': None, 'count': 5, 'path': None, 'word': 'nine'}]