import numpy as np
from collections import Counter
import mmap
import re
import networkx as nx
import math

MAX_EXP = 6
EXP_TABLE_SIZE = 1000
EXP_TABLE = np.arange(start = 0, stop = EXP_TABLE_SIZE, step = 1, dtype = np.float64)
## Exponentials of [-MAX_EXP, MAX_EXP]
EXP_TABLE = np.exp((EXP_TABLE / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP)
## Logistic (sigmoid) values of [-MAX_EXP, MAX_EXP]
EXP_TABLE = EXP_TABLE / (EXP_TABLE + 1.)

## To use the EXP_TABLE, the value f must be in the range [-MAX_EXP, MAX_EXP];
## first transform f to an index in [0, EXP_TABLE_SIZE-1],
## then read the value from the table, e.g., to get the logistic function value of f
f = [-6., -3., 0, 3., 6.]
## (f+MAX_EXP) * (EXP_TABLE_SIZE/MAX_EXP/2) maps f from [-MAX_EXP, MAX_EXP]
## to an index in [0, EXP_TABLE_SIZE-1]
index4exptable = lambda f: int((f+MAX_EXP) * (EXP_TABLE_SIZE/MAX_EXP/2))
f_index = map(index4exptable, f)
logistic_f = EXP_TABLE[f_index]
print f
print f_index
print logistic_f ## values in [0, 1] from the logistic function
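## Quick sanity check (a minimal sketch): the table lookup should stay close to
## the exact sigmoid 1/(1 + exp(-f)); the tolerance below is an arbitrary choice.
exact_logistic = 1. / (1. + np.exp(-np.array(f)))
print np.abs(logistic_f - exact_logistic).max() ## small discretization error
assert np.allclose(logistic_f, exact_logistic, atol = 1e-2)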
class HashedVocab(object):
    HASH_SIZE = 30000000 ## max hash size
    ## count threshold for a word to be in the vocab - ignore infrequent words
    MIN_COUNT = 5

    @staticmethod
    def file2ws(fpath):
        """
        file to wordstream: lazily read words from a file as an iterator
        """
        with open(fpath) as fin:
            word_pattern = re.compile(r'(.*?)\s')
            mf = mmap.mmap(fin.fileno(), 0, access = mmap.ACCESS_READ)
            for match in word_pattern.finditer(mf):
                word = match.group(1)
                if word: yield word

    def __init__(self):
        ## vocab stores the vocab_word dicts
        self.vocab = []
        ## vocab_hash stores the index of each vocab_word in vocab
        self.vocab_hash = np.empty(HashedVocab.HASH_SIZE, dtype = np.int64)
        self.vocab_hash.fill(-1)
        ## house-keeping - total number of word occurrences in the training set;
        ## it will be used as an estimate of the training size later, which
        ## affects the adjustment of the learning rate of the deep structure
        self.n_train_words = 0

    def fit(self, word_stream):
        """
        build the hashed_vocab and Huffman tree from a word stream;
        the word_stream is usually read from a word file, e.g., using file2ws
        """
        ## word counter
        wc = Counter(word_stream)
        ## total occurrences of training words
        self.n_train_words = sum(wc.values())
        ## Sort the words by their counts, filter out infrequent words,
        ## construct vocab_word (a dict) and put them in self.vocab
        self.vocab = map(lambda x: dict(zip(['word', 'count'], x)),
                         filter(lambda x: x[1] > HashedVocab.MIN_COUNT,
                                wc.most_common(len(wc))))
        ## if the vocab is already too big for the hash, either (1) ignore more infrequent
        ## words (as implemented in C by ReduceVocab), or (2) make the hash size bigger;
        ## here we simply raise an exception for simplicity
        if len(self.vocab) > HashedVocab.HASH_SIZE * 0.7:
            raise RuntimeError('vocab size too large for hash, increase MIN_COUNT or HASH_SIZE')
        self.build_hash()
        self.build_huffman_tree()
        return self

    def index_of(self, word):
        """
        Get the index of word in vocab by using the hash; return -1 if it is NOT there
        """
        word_hash = self.get_word_hash(word)
        while True:
            word_index = self.vocab_hash[word_hash]
            if word_index == -1:
                return -1
            elif word == self.vocab[word_index]['word']:
                return word_index
            else:
                word_hash = (word_hash + 1) % HashedVocab.HASH_SIZE

    def __getitem__(self, word_index):
        """
        get the vocab_word in vocab by its index
        """
        return self.vocab[word_index]

    def __len__(self):
        return len(self.vocab)

    def __iter__(self):
        return iter(self.vocab)

    def build_hash(self):
        self.vocab_hash = np.empty(HashedVocab.HASH_SIZE, dtype = np.int64)
        self.vocab_hash.fill(-1)
        for word_index, vocab_word in enumerate(self.vocab):
            word = vocab_word['word']
            word_hash = self.get_word_hash(word)
            self.add_to_hash(word_hash, word_index)
        return self

    def get_word_hash(self, word):
        word_hash = sum([ord(c)*(257**i)
                         for i, c in zip(range(len(word))[::-1], word)])
        word_hash %= HashedVocab.HASH_SIZE
        return word_hash

    def add_to_hash(self, word_hash, word_index):
        ## open addressing (linear probing): move to the next slot on collision
        while self.vocab_hash[word_hash] != -1:
            word_hash = (word_hash + 1) % HashedVocab.HASH_SIZE
        self.vocab_hash[word_hash] = word_index
        return self

    def build_huffman_tree(self):
        """Build the Huffman tree representation of the words based on their frequencies.
        The vocab_word structure in self.vocab is a dict {word, count, path, code},
        where vocab_word['code'] is the Huffman coding of the word, and
        vocab_word['path'] is the path from the root to the leaf
        """
        ## for a full binary tree with n leaves, n-1 internal nodes are needed;
        ## in the 2*n-1 long data arrays (e.g. count and binary), the first n entries
        ## are for the leaf nodes, and the last n-1 are for the internal nodes
        vocab_size = len(self)
        ## workhorse structures for tree construction
        ## count - the counts of words (leaves) and internal nodes (sums of leaf counts)
        count = np.empty(vocab_size * 2 - 1, dtype = np.int64)
        count.fill(1e15)
        count[:vocab_size] = [vw['count'] for vw in self.vocab]
        ## binary - 0/1 (left/right) branch code for leaves and internal nodes
        binary = np.zeros(vocab_size*2-1, dtype=np.int64)
        ## parent_node - storing the parent of each node, i.e., the path upwards
        parent_node = np.empty(vocab_size*2-1, dtype = np.int64)
        ## construct the tree
        ## DESCRIPTION: iteratively group the two ungrouped nodes (leaf or internal) that
        ## have the smallest counts.
        ## Since the vocab is sorted in decreasing count order (first half) and
        ## the newly created internal nodes (second half) appear in order of
        ## increasing counts (the algorithm invariant), we only need to check
        ## the two nodes around the middle of the array to look for candidates;
        ## that is the role of min1i and min2i
        ## start searching for min1i and min2i from the middle of the array
        pos1, pos2 = vocab_size - 1, vocab_size
        ## construct the vocab_size - 1 internal nodes
        for a in xrange(vocab_size-1):
            ## min1i
            if pos1 >= 0:
                # min1i = pos1
                if count[pos1] < count[pos2]:
                    min1i, pos1 = pos1, pos1-1
                # min1i = pos2
                else:
                    min1i, pos2 = pos2, pos2+1
            else:
                ## running out of leaf nodes
                min1i, pos2 = pos2, pos2+1
            ## min2i
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min2i, pos1 = pos1, pos1-1
                else:
                    min2i, pos2 = pos2, pos2+1
            else:
                min2i, pos2 = pos2, pos2+1
            ## count(parent_node) = count(child1) + count(child2)
            count[vocab_size + a] = count[min1i] + count[min2i]
            ## link the parent node index
            parent_node[min1i] = vocab_size + a
            parent_node[min2i] = vocab_size + a
            ## binary encoding for min1i is 0 (left), for min2i is 1 (right)
            binary[min2i] = 1
        ## put the built Huffman tree structure into the vocab_words in vocab,
        ## one leaf node at a time
        for a in xrange(vocab_size):
            ## b goes from the leaf, along parent_nodes, up to the root
            b = a
            code, path = [], []
            ## traverse along the path to the root
            while True:
                code.append(binary[b])
                path.append(b)
                b = parent_node[b]
                ## stop when reaching the root
                if b == vocab_size * 2 - 2: break
            ## path (or point, in the C code) goes from root to leaf; internal nodes
            ## are offset by -vocab_size so that they index the hidden layer directly
            self.vocab[a]['path'] = [vocab_size - 2] + [p - vocab_size
                                                        for p in path[::-1]]
            self.vocab[a]['code'] = code[::-1]

    def inspect_vocab_tree(self):
        """Draw the built Huffman binary tree for the vocab
        """
        g = nx.DiGraph()
        vocab_size = len(self)
        edges = set()
        for vw in self.vocab:
            tree_path = [i + vocab_size for i in vw['path']]
            tree_path = [i if i >= vocab_size
                         else "%s(%d)" % (self.vocab[i]['word'], self.vocab[i]['count'])
                         for i in tree_path]
            edges.update(zip(tree_path[:-1], tree_path[1:]))
        g.add_edges_from(edges)
        pos = nx.graphviz_layout(g, prog = 'dot')
        nx.draw(g, pos, with_labels = True, arrows = True)
        return g
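## Sanity checks on the built Huffman code (a small sketch, assuming
## data/first300.txt is available, as in the examples below): all codes are
## distinct, and a strictly more frequent word never gets a longer code.
check_vocab = HashedVocab().fit(HashedVocab.file2ws('data/first300.txt'))
vws = list(check_vocab)
codes = [''.join(map(str, vw['code'])) for vw in vws]
assert len(set(codes)) == len(codes) ## all Huffman codes are distinct
## vocab is sorted by decreasing count, so compare neighbors with strictly different counts
for vw1, vw2 in zip(vws, vws[1:]):
    if vw1['count'] > vw2['count']:
        assert len(vw1['code']) <= len(vw2['code'])
print 'most frequent word %r gets the code %s' % (vws[0]['word'], codes[0])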
## examples of building the hashed_vocab and its associated Huffman tree
## data/first50k.txt contains the first 50K words of the original text8
hashed_vocab = HashedVocab()
hashed_vocab.fit(HashedVocab.file2ws('data/first50k.txt'))
print 'in total %i words were used to build the vocab' % hashed_vocab.n_train_words
print '%i of them go into the vocab' % len(hashed_vocab)

## simple example to illustrate the built Huffman tree
%pylab inline
hashed_vocab = HashedVocab().fit(HashedVocab.file2ws('data/first300.txt'))
hashed_vocab.inspect_vocab_tree()

class UnigramTable(object):
    TABLE_SIZE = int(1e8)
    POWER = 0.75
    def __init__(self, hashed_vocab):
        self.table = np.empty(UnigramTable.TABLE_SIZE, np.int64)
        vocab = hashed_vocab
        vocab_size = len(hashed_vocab)
        ## normalization factor: sum of all words' frequencies raised to POWER
        train_words_pow = sum(math.pow(vw['count'], UnigramTable.POWER)
                              for vw in vocab)
        ## fill the table:
        ## the sampling probability of a unigram is proportional to its
        ## frequency raised to the power POWER (=0.75)
        ## i marks the index of the current word in vocab
        ## d1 marks the accumulative power-law probability up to the current word
        ## a / TABLE_SIZE marks the accumulative sampling probability up to the current word
        i = 0
        d1 = math.pow(vocab[i]['count'], UnigramTable.POWER) / train_words_pow
        for a in xrange(self.TABLE_SIZE):
            self.table[a] = i
            ## compare the accumulative sampling prob with the power-law accumulative prob;
            ## move on to sampling the next word once they stop matching
            if a * 1. / UnigramTable.TABLE_SIZE > d1:
                i += 1
                d1 += math.pow(vocab[i]['count'], UnigramTable.POWER) / train_words_pow
            ## put the rest as samples of the last word (round-off)
            if i >= vocab_size:
                i = vocab_size - 1
    def __getitem__(self, index):
        return self.table[index]

## check how well the sampling strategy follows Zipf's law
## http://mathworld.wolfram.com/ZipfsLaw.html
hashed_vocab = HashedVocab().fit(HashedVocab.file2ws('data/first1k.txt'))
unigram_table = UnigramTable(hashed_vocab)
## fraction of the table occupied by each word = its sampling probability
sampling_probs = [(unigram_table.table == i).sum() / 1e8
                  for i in xrange(len(hashed_vocab))]
plot(1./np.arange(1, len(hashed_vocab)+1), sampling_probs)
xlabel('$1/rank$')
ylabel('$probability$')
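## A minimal illustration: drawing negative samples is just uniform indexing into
## the filled table, so frequent words come out more often. np.random.randint is
## used only for this demo - the CBOW code below uses its own linear congruential
## generator, following the C implementation.
demo_draws = unigram_table.table[np.random.randint(0, UnigramTable.TABLE_SIZE, size = 10)]
print [hashed_vocab[int(i)]['word'] for i in demo_draws]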
class CBOW(object):
    REAL_TYPE = np.float64
    def __init__(self, hashed_vocab, unigram_table, layer1_size,
                 learn_hs, learn_negative):
        """
        hashed_vocab: the vocab to build on
        unigram_table: the unigram table used for negative sampling
        layer1_size: the size of layer1 of the net, effectively the dim of the feature space
        learn_hs: use hierarchical softmax learning
        learn_negative: use negative sampling learning
        """
        self.vocab = hashed_vocab
        self.table = unigram_table
        self.layer1_size = layer1_size
        self.learn_hs = learn_hs
        self.learn_negative = learn_negative
        ## initial value for the learning rate, decreases during learning
        self.starting_alpha = 0.025
        ## a sentence is a chunk of words from the train file
        self.sentence_len = 1000
        ## down-sampling rate used to select words into a sentence
        self.sampling = 1e-4
        ## window defines the neighborhood (context) of a word in the sentence
        self.window = 5
        ## negative defines the number of negative samples drawn per word
        self.negative = 5
        ## network weights
        ## syn0 - feature representations of words (leaf nodes) in the Huffman tree
        ## shape = vocab_size x layer1_size
        self.syn0 = None
        ## syn1 - feature representations of internal (non-leaf) nodes in the Huffman tree
        ## shape = vocab_size x layer1_size
        self.syn1 = None ## hidden layer for hs learning
        ## syn1neg - feature representations of negative-sampled words
        ## shape = vocab_size x layer1_size
        self.syn1neg = None ## hidden layer for negative-sampling learning
    def init_net(self):
        """
        initialize the network weights: syn0, and syn1 / syn1neg if needed
        """
        vocab_size = len(self.vocab)
        self.syn0 = np.random.uniform(low = -.5 / self.layer1_size,
                                      high = .5 / self.layer1_size,
                                      size = (vocab_size, self.layer1_size)).astype(CBOW.REAL_TYPE)
        if self.learn_hs:
            self.syn1 = np.zeros((vocab_size, self.layer1_size),
                                 dtype = CBOW.REAL_TYPE)
        if self.learn_negative:
            self.syn1neg = np.zeros((vocab_size, self.layer1_size),
                                    dtype = CBOW.REAL_TYPE)
    def fit(self, train_words):
        """
        train_words: list of training words
        """
        ## initialize the net structure
        self.init_net()
        ntotal = len(train_words)
        ## initialize the learning parameters
        alpha = self.starting_alpha
        next_random = 0
        ## read and process words sentence by sentence
        ## from the train_words
        nprocessed = 0
        while nprocessed < ntotal:
            ## adjust the learning rate based on how many words have
            ## been trained on
            alpha = max(self.starting_alpha * (1-nprocessed/(ntotal+1.)),
                        self.starting_alpha * 0.0001)
            ## refill the sentence
            sentence = []
            while nprocessed < ntotal and len(sentence) < self.sentence_len:
                ## sample down the frequent words
                word = train_words[nprocessed]
                word_index = self.vocab.index_of(word)
                nprocessed += 1
                if word_index == -1: continue
                word_count = self.vocab[word_index]['count']
                if self.sampling > 0:
                    ran = ((math.sqrt(word_count / (self.sampling * ntotal)) + 1)
                           * (self.sampling * ntotal) / word_count)
                    ## keep 64 bits, emulating the C implementation's unsigned overflow
                    next_random = (next_random * 25214903917 + 11) & 0xFFFFFFFFFFFFFFFF
                    ## down-sample based on word frequency
                    ## NOTE: 65536. must be a float - integer division here would
                    ## disable the down-sampling completely
                    if ran < (next_random & 0xFFFF) / 65536.: continue
                sentence.append(word_index)
            ## for each word in the preloaded sentence
            ## pivot is the vocab index of the current word to be trained on
            ## ipivot is its index in the current sentence
            for ipivot, pivot in enumerate(sentence):
                next_random = (next_random * 25214903917 + 11) & 0xFFFFFFFFFFFFFFFF
                ## window-b defines the actual neighborhood size
                ## for the current word in the sentence
                b = next_random % self.window
                ## initialize temp variables
                ## neu1: the sum of neighbor vectors in the sentence
                ## neu1e: the accumulated gradient vector wrt syn0
                ## see the explanation above the code
                neu1 = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)
                neu1e = np.zeros(self.layer1_size, dtype = self.REAL_TYPE)
                ## accumulate the sum of neighbor words into neu1
                ## neighbors are defined as [ipivot-(window-b), ipivot+(window-b)],
                ## excluding the pivot word itself (as in the C implementation)
                left = max(0, ipivot - (self.window-b))
                right = min(len(sentence)-1, ipivot+self.window-b)
                ## all neighborhood indices should be >= 0 (in vocab), as otherwise
                ## they would not have gone into the sentence in the first place
                neighborhood = [sentence[n] for n in range(left, right+1)
                                if n != ipivot and sentence[n] >= 0]
                neu1 = np.sum(self.syn0[neighborhood,:], axis = 0)
                ## hierarchical softmax learning
                if self.learn_hs:
                    ## for each output node in layer1, i.e., the parent nodes
                    ## of pivot in the Huffman tree;
                    ## notice the last element of 'path' is the word itself,
                    ## so it is excluded here
                    for parent_index, parent_code in zip(self.vocab[pivot]['path'][:-1],
                                                         self.vocab[pivot]['code']):
                        ## f is the dot product between neu1 and the parent repr
                        ## in syn1, then passed through the logistic function
                        f = np.dot(neu1, self.syn1[parent_index, :])
                        ## skip if the output is out of range
                        if f <= - MAX_EXP or f >= MAX_EXP:
                            continue
                        ## logistic function transformation
                        else:
                            f = EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
                        ## the pseudo target is 1 - parent_code
                        g = (1 - parent_code - f) * alpha
                        ## accumulate the error into neu1e to update syn0 later
                        neu1e += g * self.syn1[parent_index]
                        ## update syn1 of the current parent
                        self.syn1[parent_index] += g * neu1
                ## negative-sampling learning
                ## select one positive and several negative samples
                if self.learn_negative:
                    for d in range(self.negative + 1):
                        ## make sure to select the current word
                        ## as the positive sample
                        if d == 0:
                            target = pivot
                            label = 1
                        ## select some 'negative' samples randomly
                        else:
                            next_random = (next_random * 25214903917 + 11) & 0xFFFFFFFFFFFFFFFF
                            target = self.table[(next_random >> 16) % self.table.TABLE_SIZE]
                            if target == pivot: continue ## ignore if it is actually positive
                            label = 0
                        ## this time f is the dot product of neu1 with syn1neg
                        f = np.dot(neu1, self.syn1neg[target, :])
                        ## the pseudo target is label
                        ## NOTE the difference between hs and negative sampling
                        ## when dealing with f out of [-MAX_EXP, MAX_EXP]
                        if f > MAX_EXP:
                            g = (label - 1) * alpha
                        elif f < - MAX_EXP:
                            g = (label - 0) * alpha
                        else:
                            g = alpha * (label - EXP_TABLE[int((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))])
                        ## accumulate the changes to syn0 into neu1e again
                        neu1e += g * self.syn1neg[target]
                        ## update syn1neg of the sampled target
                        ## (mirrors the syn1 update in the hs branch)
                        self.syn1neg[target] += g * neu1
                ## update syn0 after hs and/or negative-sampling learning
                self.syn0[neighborhood, :] += neu1e
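## A small illustration of the down-sampling formula used in CBOW.fit (a sketch
## reusing the vocab fitted above for the Zipf check; hashed_vocab.n_train_words
## stands in for the training size ntotal): the keep ratio
## (sqrt(count/(sample*ntotal)) + 1) * (sample*ntotal)/count is much smaller for
## frequent words than for rare ones, so frequent words get dropped more often.
sample = 1e-4
ntrain = hashed_vocab.n_train_words
for vw in [hashed_vocab[0], hashed_vocab[len(hashed_vocab)-1]]:
    keep = (math.sqrt(vw['count'] / (sample * ntrain)) + 1) * (sample * ntrain) / vw['count']
    print '%r (count=%d): keep ratio %.3f' % (vw['word'], vw['count'], min(keep, 1.))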
## test cbow
hashed_vocab = HashedVocab().fit(HashedVocab.file2ws('data/first50k.txt'))
unigram_table = UnigramTable(hashed_vocab)
train_words = list(HashedVocab.file2ws('data/first50k.txt'))

cbow_model = CBOW(hashed_vocab, unigram_table, layer1_size = 50,
                  learn_hs = True, learn_negative = True)
cbow_model.fit(train_words)

from numpy import linalg
Xnorm = np.apply_along_axis(linalg.norm, 1, cbow_model.syn0)
#X = cbow_model.syn0 / Xnorm[:, np.newaxis]
X = cbow_model.syn0
word = cbow_model.vocab.index_of('one')
word_vec = X[word]
#closest_words = np.dot(X, guess).argsort()[::-1]
closest_words = np.sum((X - word_vec)**2, axis = 1).argsort()
for i in closest_words[:5]:
    print cbow_model.vocab[i]['word']
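## Alternative ranking (a small sketch): reuse the row norms Xnorm computed above
## to rank the neighbors of 'one' by cosine similarity instead of Euclidean distance
## (assumes no zero rows in syn0, which holds after the random initialization).
Xunit = cbow_model.syn0 / Xnorm[:, np.newaxis]
cosine_sims = np.dot(Xunit, Xunit[word])
for i in cosine_sims.argsort()[::-1][:5]:
    print cbow_model.vocab[i]['word'], cosine_sims[i]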