#!/usr/bin/env python
# coding: utf-8

# # EM for IBM Model 2
#
# Following the pseudocode proposed by Mike Collins. Use at your own risk, and please report bugs.
#
# Author: Yoav Artzi (http://yoavartzi.com).

# In[6]:

from collections import defaultdict
import random

# Our (big) data: parallel (foreign, English) sentence pairs.
data = [(['le', 'chien'], ['the', 'dog']), (['le', 'chat'], ['the', 'cat'])]
f_len = 2
e_len = 2
e_vocab = ['the', 'dog', 'cat']
f_vocab = ['le', 'chien', 'chat']

# Dictionaries to maintain the probabilities t(f | e) and q(j | i, l, m).
q = defaultdict(float)
t = defaultdict(float)

# Init randomly (cutting corners: init is not normalized! EM still works here
# because the E-step and M-step renormalize).
for e_word in e_vocab:
    for f_word in f_vocab:
        t[(f_word, e_word)] = random.random()
for j in range(e_len):
    for i in range(f_len):
        q[(j, i, e_len, f_len)] = random.random()

# EM loop.
for s in range(1, 100):
    print('#### Epoch %d' % s)
    # E-step: collect expected (soft) counts.
    counts = defaultdict(float)
    for f, e in data:
        m = len(f)
        l = len(e)
        for i in range(m):      # i indexes foreign positions.
            for j in range(l):  # j indexes English positions.
                # delta is the posterior probability that f[i] aligns to e[j],
                # normalized over all English positions.
                delta_sum = sum(q[(delta_sum_j, i, l, m)] * t[(f[i], e[delta_sum_j])]
                                for delta_sum_j in range(l))
                delta = (q[(j, i, l, m)] * t[(f[i], e[j])]) / delta_sum
                counts[(e[j], f[i])] += delta
                counts[e[j]] += delta
                counts[(j, i, l, m)] += delta
                counts[(i, l, m)] += delta
    # M-step: recompute probabilities from the counts, and print.
    changed = False
    for e_word in e_vocab:
        for f_word in f_vocab:
            old_value = t[(f_word, e_word)]
            t[(f_word, e_word)] = counts[(e_word, f_word)] / counts[e_word]
            changed = changed or old_value != t[(f_word, e_word)]
            print('t(%s | %s) = %f' % (f_word, e_word, t[(f_word, e_word)]))
    for j in range(e_len):
        for i in range(f_len):
            old_value = q[(j, i, e_len, f_len)]
            q[(j, i, e_len, f_len)] = counts[(j, i, e_len, f_len)] / counts[(i, e_len, f_len)]
            changed = changed or old_value != q[(j, i, e_len, f_len)]
            print('q(%d | %d, %d, %d) = %f' % (j, i, e_len, f_len, q[(j, i, e_len, f_len)]))
    if not changed:
        print('Converged 🐶😻🐩🎉')
        break
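# In[ ]:

# Not part of the original notebook: a minimal sketch of using the learned
# tables, reusing q, t, and data from above. Under Model 2 the alignment
# factorizes over positions, so the most likely English position for each
# foreign word f[i] is simply argmax_j q(j | i, l, m) * t(f[i] | e[j]).
for f, e in data:
    m, l = len(f), len(e)
    for i in range(m):
        # Score every English position for f[i] and keep the best one.
        best_j = max(range(l), key=lambda j: q[(j, i, l, m)] * t[(f[i], e[j])])
        print('%s -> %s' % (f[i], e[best_j]))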
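# In[ ]:

# Also not part of the original notebook: the init above deliberately skips
# normalization. For reference, a hypothetical helper sketching a properly
# normalized random init, so that sum_f t(f | e) = 1 for each e and
# sum_j q(j | i, l, m) = 1 for each i.
def normalized_init(f_vocab, e_vocab, e_len, f_len):
    t = defaultdict(float)
    q = defaultdict(float)
    for e_word in e_vocab:
        raw = {f_word: random.random() for f_word in f_vocab}
        z = sum(raw.values())
        for f_word in f_vocab:
            t[(f_word, e_word)] = raw[f_word] / z
    for i in range(f_len):
        raw = [random.random() for _ in range(e_len)]
        z = sum(raw)
        for j in range(e_len):
            q[(j, i, e_len, f_len)] = raw[j] / z
    return t, q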