#!/usr/bin/env python
# coding: utf-8

# # EM for IBM Model 2
#
# Following the pseudocode proposed by Mike Collins. Use at your own risk, and please report bugs.
#
# Author: Yoav Artzi (http://yoavartzi.com).

# In[6]:

from collections import defaultdict
import random

# Our (big) data: parallel (foreign, English) sentence pairs.
data = [(['le', 'chien'], ['the', 'dog']), (['le', 'chat'], ['the', 'cat'])]
f_len = 2
e_len = 2
e_vocab = ['the', 'dog', 'cat']
f_vocab = ['le', 'chien', 'chat']

# Dictionaries to maintain the probabilities t(f | e) and q(j | i, l, m).
q = defaultdict(float)
t = defaultdict(float)

# Init randomly (cutting corners: init is not normalized! EM still works here
# because the E-step and M-step renormalize).
for e_word in e_vocab:
    for f_word in f_vocab:
        t[(f_word, e_word)] = random.random()
for j in range(e_len):
    for i in range(f_len):
        q[(j, i, e_len, f_len)] = random.random()

# EM loop.
for s in range(1, 100):
    print('#### Epoch %d' % s)
    # E-step: collect expected (soft) counts.
    counts = defaultdict(float)
    for f, e in data:
        m = len(f)
        l = len(e)
        for i in range(m):      # i indexes foreign positions.
            for j in range(l):  # j indexes English positions.
                # delta is the posterior probability that f[i] aligns to e[j],
                # normalized over all English positions.
                delta_sum = sum(q[(delta_sum_j, i, l, m)] * t[(f[i], e[delta_sum_j])]
                                for delta_sum_j in range(l))
                delta = (q[(j, i, l, m)] * t[(f[i], e[j])]) / delta_sum
                counts[(e[j], f[i])] += delta
                counts[e[j]] += delta
                counts[(j, i, l, m)] += delta
                counts[(i, l, m)] += delta
    # M-step: recompute probabilities from the counts, and print.
    changed = False
    for e_word in e_vocab:
        for f_word in f_vocab:
            old_value = t[(f_word, e_word)]
            t[(f_word, e_word)] = counts[(e_word, f_word)] / counts[e_word]
            changed = changed or old_value != t[(f_word, e_word)]
            print('t(%s | %s) = %f' % (f_word, e_word, t[(f_word, e_word)]))
    for j in range(e_len):
        for i in range(f_len):
            old_value = q[(j, i, e_len, f_len)]
            q[(j, i, e_len, f_len)] = counts[(j, i, e_len, f_len)] / counts[(i, e_len, f_len)]
            changed = changed or old_value != q[(j, i, e_len, f_len)]
            print('q(%d | %d, %d, %d) = %f' % (j, i, e_len, f_len, q[(j, i, e_len, f_len)]))
    if not changed:
        print('Converged 🐶😻🐩🎉')
        break
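# In[ ]:

# Not part of the original notebook: a minimal sketch of using the learned
# tables, reusing q, t, and data from above. Under Model 2 the alignment
# factorizes over positions, so the most likely English position for each
# foreign word f[i] is simply argmax_j q(j | i, l, m) * t(f[i] | e[j]).
for f, e in data:
    m, l = len(f), len(e)
    for i in range(m):
        # Score every English position for f[i] and keep the best one.
        best_j = max(range(l), key=lambda j: q[(j, i, l, m)] * t[(f[i], e[j])])
        print('%s -> %s' % (f[i], e[best_j]))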
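# In[ ]:

# Also not part of the original notebook: the init above deliberately skips
# normalization. For reference, a hypothetical helper sketching a properly
# normalized random init, so that sum_f t(f | e) = 1 for each e and
# sum_j q(j | i, l, m) = 1 for each i.
def normalized_init(f_vocab, e_vocab, e_len, f_len):
    t = defaultdict(float)
    q = defaultdict(float)
    for e_word in e_vocab:
        raw = {f_word: random.random() for f_word in f_vocab}
        z = sum(raw.values())
        for f_word in f_vocab:
            t[(f_word, e_word)] = raw[f_word] / z
    for i in range(f_len):
        raw = [random.random() for _ in range(e_len)]
        z = sum(raw)
        for j in range(e_len):
            q[(j, i, e_len, f_len)] = raw[j] / z
    return t, q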