import itertools
import numpy as np
import collections


class FixedLengthGMHMC:
    """Fixed-length string Gibbs-sampling Metropolis-Hastings Markov chain.

    This class samples from a distribution over string variables of length
    str_len. It only works on ASCII. It does that by Gibbs sampling, one
    character at a time: each step resamples a single character with a
    Metropolis-Hastings move from its conditional distribution given the rest
    of the string, using the per-character proposal distribution proposal_dist.
    The unnormalized log density of a full string is given by log_density_fun.
    """
    def __init__(self, str_len,
                 proposal_dist,  # For an individual letter
                 log_density_fun):
        self.proposal_dist = proposal_dist
        self.log_density_fun = log_density_fun
        self.str_len = str_len
        self.num_accepted = 0
        self.num_tried = 0

    def take_first_sample(self):
        # Take a sample from the proposal distribution to start with
        chars = []
        for i in range(self.str_len):
            chars.append(chr(
                np.where(np.random.multinomial(1, self.proposal_dist))[0]))
        chars = ''.join(chars)
        self.last_sample = chars
        self.last_sample_log_density = self.log_density_fun(chars)

    def sample(self):
        # Take a sample from the conditional distribution of the
        # str_pos character via MH
        # Pick which character to resample at random
        str_pos = np.int(np.random.rand()*self.str_len)
        # Sample a char from the proposal distribution
        new_char = chr(np.where(np.random.multinomial(1, self.proposal_dist))[0])
        # Splice it in
        new_sample = (self.last_sample[:str_pos] + new_char +
                      self.last_sample[str_pos+1:])
        new_sample_log_density = self.log_density_fun(new_sample)
        # Density ratio p(x') / p(x), computed on the log scale for
        # numerical stability
        alpha1 = np.exp(new_sample_log_density - self.last_sample_log_density)
        # Proposal ratio Q(x | x') / Q(x' | x)
        old_char = self.last_sample[str_pos]
        alpha2 = (self.proposal_dist[ord(old_char)] /
                  self.proposal_dist[ord(new_char)])
        # Acceptance probability
        alpha = min(alpha1*alpha2, 1.0)
        # Accept with probability alpha
        accept = np.random.rand() < alpha
        if accept:
            self.last_sample = new_sample
            self.last_sample_log_density = new_sample_log_density
            self.num_accepted += 1
        self.num_tried += 1
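# The acceptance probability implemented in FixedLengthGMHMC.sample(), written
# out as a standalone sketch: alpha = min(1, p(x') Q(x | x') / (p(x) Q(x' | x))).
# Only one character differs between x and x', so the proposal ratio reduces to
# proposal_dist[old_char] / proposal_dist[new_char]. The numbers below are made
# up, purely to illustrate the arithmetic (here alpha comes out as 1.0).
log_p_old, log_p_new = -2.0, -1.2   # hypothetical unnormalized log densities
q_old, q_new = 0.20, 0.05           # hypothetical proposal probabilities
alpha_demo = min(np.exp(log_p_new - log_p_old) * (q_old / q_new), 1.0)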
# Test out the sampler on an example density on the letters ABab
def sample_log_density(arr):
    # Is the initial letter an uppercase letter?
    phi0 = (arr[0] == 'A' or arr[0] == 'B')
    # Does the second letter follow the first letter, e.g. 'ab'?
    phi1 = ord(arr[1]) - ord(arr[0]) == 1
    return phi0 * 1.0 + phi1 * 1.5

# Compute the unnormalized probability of every length-2 string over the
# characters abAB
charset = 'abAB'
str_len = 2
ps = {}
for the_str in itertools.product(charset, repeat=str_len):
    the_str = the_str[0] + the_str[1]
    ps[the_str] = np.exp(sample_log_density(the_str))

# Create a non-uniform proposal distribution, because subtle bugs can lurk
# in the way we've coded up Q(x|x') / Q(x'|x)
proposal_distro = [(chr(x) in charset) / (1.0 + float(x)) for x in range(128)]
proposal_distro = [x / sum(proposal_distro) for x in proposal_distro]

sampler = FixedLengthGMHMC(str_len, proposal_distro, sample_log_density)
sampler.take_first_sample()
samples = []

# Take some samples
for i in range(10000):
    sampler.sample()
    samples.append(sampler.last_sample)

# Look at empirical distribution of samples
cnt = collections.Counter(samples)

# Concatenate the results
distros = np.array([[y, cnt[x]] for x, y in ps.iteritems()])

# Normalize
distros = distros / distros.sum(0).reshape((1, -1))

print 'String | Exact p(x)*1000 | Sample p(x)*1000'
print '\n'.join(['%s % 8d % 8d' % (x, y[0], y[1])
                 for x, y in zip(ps.keys(), np.round(distros*1000))])

max_rel_disc = max(abs(.5 * (distros[:,0] - distros[:,1]) /
                       (distros[:,0] + distros[:,1])))
print 'Max relative discrepancy: %.3f' % max_rel_disc

# Relative discrepancy should not be much higher than 6%
assert max_rel_disc < .06

print ('Acceptance probability: %.3f' %
       (sampler.num_accepted / float(sampler.num_tried)))

# That looks good.

%matplotlib inline
import matplotlib.pyplot as plt
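# Persistent contrastive divergence in a nutshell (a sketch, separate from the
# class below): for an exponential-family model p(x) proportional to
# exp(theta . phi(x)), the log-likelihood gradient is
# E_data[phi(x)] - E_model[phi(x)]. PCD estimates the model expectation with
# MCMC chains that persist across parameter updates instead of being restarted.
# Toy numpy version with made-up feature matrices:
X_data_demo = np.array([[1., 0.], [1., 1.]])    # hypothetical data features
X_chain_demo = np.array([[0., 0.], [1., 0.]])   # hypothetical chain features
grad_demo = X_data_demo.mean(0) - X_chain_demo.mean(0)  # ascent direction on theta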
class FixedLengthPCD:
    """Fits a fixed-length string model by persistent contrastive divergence.

    Some features of this version of PCD:
    - Uses a schedule of learning rates (etas) during gradient ascent for
      better convergence.
    - Uses an RMSProp-style update rule.
    """
    def __init__(self, words, feature_fun, S_chains = 10):
        """Initializes PCD on a given vocabulary.

        Args:
            words: a list of strings, all of the same length
            feature_fun: a function which takes a string and returns an array
                of features
            S_chains: the number of parallel chains to use for PCD
        """
        self.words = words
        self.feature_fun = feature_fun
        self.S_chains = S_chains
        self.init_measurements()
        self.init_params()
        self.init_chains()

    def init_measurements(self):
        """Make some measurements on the vocabulary"""
        max_str_len = max(len(w) for w in self.words)
        min_str_len = min(len(w) for w in self.words)
        if min_str_len != max_str_len:
            raise ValueError('This class only works for fixed-length strings')
        # Compute the empirical distribution of characters
        all_chars = ''.join(self.words)
        proposal_dist, _ = np.histogram(np.array([ord(x) for x in all_chars]),
                                        bins=range(129))  # one bin per ASCII code
        proposal_dist = proposal_dist / np.float(proposal_dist.sum())
        # Empirical feature means (the data moments to match)
        X = np.array([self.feature_fun(w) for w in self.words])
        phi_c = X.mean(0)

        self.proposal_dist = proposal_dist
        self.str_len = min_str_len
        self.phi_c = phi_c
        self.n_features = phi_c.size

    def log_density_fun(self, word):
        """Returns the unnormalized log density of a word, i.e. the inner
        product of its features and the parameters"""
        phi = self.feature_fun(word)
        return phi.dot(self.theta)

    def get_new_sampler(self):
        """Create a sampler from the current set of parameters"""
        s = FixedLengthGMHMC(self.str_len, self.proposal_dist,
                             self.log_density_fun)
        s.take_first_sample()
        return s

    def init_chains(self):
        # Initialize MCMC chains
        chains = []
        for i in range(self.S_chains):
            s = self.get_new_sampler()
            chains.append(s)
        self.chains = chains

    def init_params(self):
        # Initialize parameters
        sd = .1
        # Alternatively, we could use a crude estimate as an initializer
        self.theta = sd * (np.random.randn(self.n_features) /
                           np.sqrt(self.n_features))

    def run(self, etas = [0.1], k_steps_per = 5, T = 100,
            rms_prop_alpha = .9, display_every = 1):
        """Run a number of persistent contrastive divergence iterations.

        Args:
            etas: a list of learning rates, spread evenly over the T iterations
            k_steps_per: the number of MCMC steps to take per gradient ascent
                iteration
            T: the number of gradient ascent iterations
            rms_prop_alpha: the decay rate of the RMSProp-style accumulator
            display_every: print a progress line every this many iterations

        Returns:
            (thetas, gnorms, gnorms_adjusted, likes): per-iteration parameters,
            gradient norms, RMSProp-adjusted step norms, and mean unnormalized
            likelihoods
        """
        # Fudge factor for RMSProp
        ff = 0.001
        thetas = np.zeros((T, self.n_features))
        gnorms = np.zeros((T))
        gnorms_adjusted = np.zeros((T))
        likes = np.zeros((T))
        running_g = np.ones((self.n_features))
        for t in range(T):
            which_eta = (t * len(etas)) // T
            eta = etas[which_eta]
            # Sample for a few steps
            for s in self.chains:
                for m in range(k_steps_per):
                    s.sample()
            X_ = np.array([self.feature_fun(s.last_sample)
                           for s in self.chains])
            phi_t = X_.mean(0)
            # Gradient: data moments minus model moments
            g = self.phi_c - phi_t
            # Gradient ascent
            # Use an RMSProp-style update for more consistent convergence
            gg = eta * (g / (ff + np.sqrt(running_g)))
            self.theta = self.theta + gg
            thetas[t, :] = self.theta
            running_g = running_g*rms_prop_alpha + g**2
            # Monitor convergence
            gnorms[t] = np.sqrt((g**2).sum())
            gnorms_adjusted[t] = np.sqrt((gg**2).sum())
            # Evaluate the mean unnormalized likelihood over the vocabulary
            # (the partition function is not computed)
            like = sum(np.exp(np.array(
                [self.log_density_fun(w) for w in self.words]))) / float(len(self.words))
            likes[t] = like
            if t % display_every == 0:
                print "Iteration %04d, mean unnormalized likelihood = %.8f" % (t, like)
        return thetas, gnorms, gnorms_adjusted, likes
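# The RMSProp-style step used in FixedLengthPCD.run(), pulled out as a tiny
# sketch with made-up numbers: keep a decayed sum of squared gradients and
# divide each coordinate's step by its square root, so coordinates with
# persistently large gradients take smaller effective steps. Note this mirrors
# the rule coded above (running_g*alpha + g**2); standard RMSProp would scale
# the g**2 term by (1 - alpha).
running_demo = np.ones(2)                                    # accumulator starts at ones
g_demo = np.array([0.5, -0.1])                               # a hypothetical gradient
step_demo = 0.1 * g_demo / (0.001 + np.sqrt(running_demo))   # eta * g / (ff + rms)
running_demo = 0.9 * running_demo + g_demo**2                # decayed sum of squared gradients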
def sample_feature_fun(arr):
    # Is the initial letter an uppercase letter?
    phi0 = (arr[0] == 'A' or arr[0] == 'B')
    # Does the second letter follow the first letter, e.g. 'ab'?
    phi1 = ord(arr[1]) - ord(arr[0]) == 1
    return np.array([phi0*1.0, phi1*1.0])

# Draw some words
nsamples = 1000
keys = np.random.multinomial(nsamples, distros[:,0])
words = ps.keys()
vocab = list(itertools.chain(*[[words[i]]*k for i, k in enumerate(keys)]))

pcd = FixedLengthPCD(vocab, sample_feature_fun, S_chains = 100)
convergence = pcd.run(T = 600, etas = [.1, 0.03, .01], k_steps_per = 1,
                      rms_prop_alpha = 0.9, display_every = 100)

# Plot some indices of convergence
plt.subplot(411)
plt.plot(convergence[0])
plt.title('theta')
plt.subplot(412)
plt.plot(convergence[1])
plt.xlabel('Iteration')
plt.title('||g||')
plt.subplot(413)
plt.plot(convergence[2])
plt.xlabel('Iteration')
plt.title('||g_a||')
plt.subplot(414)
plt.plot(convergence[3])
plt.title('likelihood')

print "Actual parameters"
print np.array([1.0, 1.5])
print "Estimated parameters"
print pcd.theta

samples = []
sampler = pcd.get_new_sampler()
sampler.proposal_dist = proposal_distro
for i in range(10000):
    sampler.sample()
    samples.append(sampler.last_sample)

# Look at empirical distribution of samples
cnt = collections.Counter(samples)

# Concatenate the results
distros_ = np.array([[y, cnt[x]] for x, y in ps.iteritems()])

# Normalize
distros_ = distros_ / distros_.sum(0).reshape((1, -1))

print 'String | Exact p(x)*1000 | Sample p(x)*1000'
print '\n'.join(['%s % 8d % 8d' % (x, y[0], y[1])
                 for x, y in zip(ps.keys(), np.round(distros_*1000))])

max_rel_disc = max(abs(.5 * (distros_[:,0] - distros_[:,1]) /
                       (distros_[:,0] + distros_[:,1])))
print 'Max relative discrepancy: %.3f' % max_rel_disc
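# A larger experiment: fit the same model class to a vocabulary of 7-character
# tokens (they appear to come from a text about machine learning), using the
# richer feature set defined in compute_features below.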
words = ['Machine', 'evolved', 'pattern', 'defined', 'machine', 'ability', 'without', 'Machine',
         'operate', 'example', 'program', 'Machine', 'closely', 'related', 'focuses', 'through',
         'domains', 'Machine', 'Example', 'include', 'optical', 'engines', 'vision.', 'Machine',
         'focuses', 'quoted,', 'program', 'respect', 'measure', 'notable', 'machine', 'think?"',
         'do?"[9]', 'Machine', 'system.', 'are[10]', 'example', 'desired', 'general', 'leaving',
         'towards', 'program', 'dynamic', 'perform', 'certain', 'driving', 'without', 'teacher',
         'telling', 'whether', 'Another', 'example', 'playing', 'against', 'Between', 'teacher',
         'signal:', 'outputs', 'special', 'problem', 'targets', 'support', 'machine', 'divides',
         'learned', 'machine', 'acquire', 'through', 'Another', 'machine', 'desired', 'divided',
         'learner', 'produce', 'assigns', 'tackled', 'example', 'classes', 'outputs', 'divided',
         'groups.', 'Density', 'mapping', 'related', 'program', 'similar', 'topics.', 'History',
         'machine', 'Already', 'problem', 'various', '"neural', 'medical', 'between', 'machine',
         'systems', 'plagued', 'systems', 'leading', 'outside', 'proper,', 'pattern', 'science',
         'outside', 'Hinton.', 'success', 'Machine', 'started', 'changed', 'nature.', 'shifted',
         'methods', 'Machine', 'methods', 'overlap', 'roughly', 'Machine', 'focuses', 'learned',
         'focuses', 'unknown', 'overlap', 'machine', 'machine', 'employs', 'methods', 'improve',
         'learner', 'between', 'machine', 'usually', 'respect', 'ability', 'unknown', 'respect',
         'typical', 'methods', 'Machine', 'express', 'between', 'trained', 'problem', 'trained',
         'predict', 'between', 'machine', 'Machine', 'closely', 'related', 'fields.', 'Michael',
         'Jordan,', 'machine', 'science', 'overall', 'Breiman', 'wherein', 'machine', 'forest.',
         'adopted', 'methods', 'machine', 'leading', 'learner', 'context', 'ability', 'machine',
         'perform', 'unknown', 'learner', 'general', 'enables', 'produce', 'machine', 'science',
         'theory.', 'Because', 'usually', 'common.', 'trained', 'unknown', 'complex', 'complex',
         'minimum', 'bounds,', 'theory,', 'results', 'certain', 'learned', 'results', 'certain',
         'classes', 'learned', 'between', 'machine', 'machine', 'between', 'network', 'network',
         'usually', '"neural', 'aspects', 'usually', 'complex', 'between', 'capture', 'unknown',
         'between', 'uniform', 'logical', 'program', 'entails', 'related', 'Support', 'Support',
         'Support', 'related', 'methods', 'whether', 'example', 'Cluster', 'Cluster', 'subsets',
         '(called', 'cluster', 'similar', 'defined', 'example', 'between', 'members', 'between',
         'methods', 'density', 'network', 'network', 'acyclic', 'acyclic', 'network', 'between',
         'network', 'compute', 'various', 'perform', 'actions', 'reward.', 'attempt', 'actions',
         'states.', 'differs', 'problem', 'correct', 'actions', 'Several', 'include', 'cluster',
         'attempt', 'useful,', 'unknown', 'attempt', 'learned', 'attempt', 'learned', 'zeros).',
         'without', 'defined', 'machine', 'factors', 'explain', 'machine', 'similar', 'similar',
         'predict', 'objects', 'method,', 'assumed', 'sparse.', 'matrix,', 'solving', 'sparse.',
         'assumed', 'freedom', 'NP-hard', 'popular', 'applied', 'several', 'problem', 'classes',
         'belongs', 'Suppose', 'already', 'applied', 'Genetic', 'Genetic', 'genetic', 'process',
         'natural', 'methods', 'finding', 'machine', 'genetic', 'machine', 'improve', 'genetic',
         'machine', 'finance', 'vision,', 'Machine', 'Medical', 'Natural', 'systems', 'engines',
         'opinion', 'mining)', 'pattern', 'Finance', 'company', 'Netflix', 'program', 'predict',
         'improve', 'Shortly', 'Netflix', 'ratings', 'viewing', 'changed', 'Journal', 'machine',
         'predict', 'article', 'machine', 'applied', 'History', 'between']


def compute_features(word):
    phis = []
    # Check whether the first letter is uppercase
    phis.append(1*(word[0] >= 'A' and word[0] <= 'Z'))
    # Count how many of the other letters are uppercase
    phis.append(sum('A' <= l <= 'Z' for l in word[1:]))
    # Count consonants that are followed by a vowel
    vowels = 'aeiou'
    letters = ('abcdefghijklmnopqrstuvwxyz'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    phis.append(sum(x in letters and x not in vowels and y in vowels
                    for x, y in zip(word[:-1], word[1:])))
    # Count the number of symbols (non-letter characters)
    phis.append(sum(x not in letters for x in word))
    # The number of each letter
    phis += [sum(x == y for x in word) for y in letters]
    # And whether the word is machine (to add a global feature)
    phis.append(word == 'machine' or word == 'Machine')
    return np.array(phis)

# Fit the model to the word vocabulary
pcd = FixedLengthPCD(words, compute_features, S_chains = 100)
convergence = pcd.run(T = 1000, etas = [.1, 0.03, .01], k_steps_per = 1,
                      rms_prop_alpha = 0.9, display_every = 100)

# Plot some indices of convergence
plt.subplot(411)
plt.plot(convergence[0])
plt.title('theta')
plt.subplot(412)
plt.plot(convergence[1])
plt.xlabel('Iteration')
plt.title('||g||')
plt.subplot(413)
plt.plot(convergence[2])
plt.xlabel('Iteration')
plt.title('||g_a||')
plt.subplot(414)
plt.plot(convergence[3])
plt.title('likelihood')

print "Estimated parameters"
print pcd.theta
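# A small sketch for reading the fitted weights: the names below are labels
# introduced here, following the order in which compute_features appends
# features (first-letter uppercase, other uppercase count, consonant-vowel
# bigram count, symbol count, one count per letter, 'machine' indicator).
feature_names = (['first_upper', 'other_upper', 'consonant_vowel', 'symbols']
                 + ['count_' + c for c in 'abcdefghijklmnopqrstuvwxyz'
                                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
                 + ['is_machine'])
# for name, w in zip(feature_names, pcd.theta): print '%16s % .3f' % (name, w)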
samples = []
sampler = pcd.get_new_sampler()
for i in range(10000):
    sampler.sample()
    samples.append(sampler.last_sample)

print "Some samples:"
print samples[::100]

print 'Most likely word'
print max([(sampler.log_density_fun(x), x) for x in words])

print 'Log density of machine versus machina:'
print (sampler.log_density_fun('machine'), sampler.log_density_fun('machina'))

print 'Number of machine samples out of 10000:'
print(sum(x == 'machine' for x in samples))

print "Weight for word 'machine'"
print pcd.theta[-1]
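# A final sketch: rank the whole vocabulary by fitted score, assuming pcd.theta
# has converged. log_density_fun is just the linear score theta . phi(w), so
# this orders words by how well they match the learned features.
ranked = sorted(((pcd.log_density_fun(w), w) for w in set(words)), reverse=True)
# print '\n'.join('%.3f %s' % (s, w) for s, w in ranked[:5])  # five highest-scoring words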