%%capture
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
import statnlpbook.util as util
import statnlpbook.mt as mt
util.execute_notebook('word_mt.ipynb')
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
Translate word-by-word
For most of this lecture:
learn the parameters $\params$ from data
predict highest-scoring translation given source $\source$:
\begin{equation} \argmax_\target s_\params(\target,\source) \end{equation}
MT approaches differ primarily in:
How to define $s_\params(\target,\source)$?
How is the $(\target,\source)$ data generated?
Develop a distribution $s_\params(\target,\source)=\prob_\params(\target,\source)$ that generates faithful samples
Good generative model: samples look like real translation pairs
Bad generative model: samples look unlike real translation pairs
Generative models need to model both input and output! Contrast with discriminative models...
How can we decompose the problem?
in the same way we decomposed language models $\prob(w_1,\ldots,w_n)=\prob(w_1)\prob(w_2|w_1)\ldots$
Think about the problem backwards
This defines a joint distribution $\prob_\params(\target,\source) = \prob_{\params_t}(\target) \prob_{\params_s}(\source|\target)$
Find $\params$ using MLE (or smoothed variants)
$$ \argmax_\params \sum_{(\target,\source) \in \train} \log \prob_\params(\target, \source) $$
Operate forwards:
$$ \argmax_\target \prob_\params(\target|\source) = \argmax_\target \prob_\params(\target,\source) $$
Why this equality? $\prob(\target|\source)=\frac{\prob(\target,\source)}{\prob(\source)}$, and $\prob(\source)$ is constant with respect to $\target$
How important would $\prob(\source)$ be if we had chosen $\prob(\target,\source) = \prob(\source) \prob(\target|\source)$?
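A quick numeric sanity check (with made-up probabilities, not from any model): dividing by $\prob(\source)$ rescales every candidate target by the same constant, so the ranking, and hence the $\argmax$, is unchanged.
# Hypothetical joint probabilities p(t, s) for one fixed source sentence s
joint = {"I like music": 0.03, "I like the music": 0.01, "me music like": 0.001}
p_s = sum(joint.values())  # p(s) = sum_t p(t, s)

best_joint = max(joint, key=lambda t: joint[t])       # argmax_t p(t, s)
best_cond = max(joint, key=lambda t: joint[t] / p_s)  # argmax_t p(t | s)
best_joint, best_cond  # identical: dividing by the constant p(s) cannot change the argmax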
$\prob(\target,\source) = \prob(\target) \prob(\source|\target)$ is often called a noisy channel model
%%tikz
\tikzset{every node/.style={font=\sffamily,white}}
\node[fill=blue] at (0,0) (a) {Sender};
\node[fill=blue] at (3,0) (b) {Channel};
\node[fill=blue] at (6,0) (c) {Receiver};
\draw[->] (a) -- (b) node [midway,above,font=\scriptsize,black]{$p(\mathbf{t})$};
\draw[->] (b) -- (c) node [midway,above,font=\scriptsize,black]{$p(\mathbf{s}|\mathbf{t})$};
Channel could be, for example, a noisy acoustic signal (speech recognition) or a foreign-language rendering of the target sentence (translation)
MLE for $\prob_\params(\target,\source) = \prob_{\params_t}(\target) \prob_{\params_s}(\source|\target)$ can be calculated in two independent steps:
What training data do you need for each?
For example: $$ \prob^{\text{Impossible}}_\params(\text{Ich mag Musik }|\text{ I like music})=\params_{\text{Ich mag Musik},\text{I like music}} $$
Why impossible?
Generally we want models that factorize (break up into smaller parts) for dealing with unseen sentences and data sparsity
How did language models do this?
$$ \prob(\text{I like Music})=\prob(\text{I}) \prob(\text{like | I}) \prob(\text{Music | like}) $$
Look at example for inspiration:
Token | 1 | 2 | 3 | 4 |
---|---|---|---|---|
Target | the | house | is | small |
Source | das | Haus | ist | klein |
Why naive?
Use the Maximum Likelihood Estimate:
$$ \params^* = \argmax_\params \sum_{(\target,\source) \in \train} \log \prob_\params(\source|\target) $$
Amounts to counting:
$$ \param^*_{\ssource,\starget} = \frac{\counts{\train}{\ssource,\starget}}{\counts{\train}{\starget}} $$
In Python:
from collections import defaultdict

def learn_naive_model(data):
    # MLE for the naive word-by-word model: relative frequencies of aligned token pairs
    norm = defaultdict(float)    # counts of target tokens
    counts = defaultdict(float)  # counts of (source, target) token pairs
    for target, source in data:
        for i in range(0, len(target)):
            norm[target[i]] += 1.0
            counts[(source[i], target[i])] += 1.0
    result = {}
    for (source, target), score in counts.items():
        result[(source, target)] = score / norm[target]
    return result
# show defaultdict behaviour
naive_model = learn_naive_model([[('the','house'),('das','Haus')],
[('the','house'),('das','Gebauede')]])
# try other genders or numbers
util.Table(naive_model.items(), "2em", "20px")
('das', 'the') | 1.00 |
('Haus', 'house') | 0.50 |
('Gebauede', 'house') | 0.50 |
Training data:
util.Table(train, "1.5em", "20px")
['the', 'house', 'is', 'small'] | ['das', 'Haus', 'ist', 'klein'] |
['the', 'house', 'is', 'small'] | ['klein', 'ist', 'das', 'Haus'] |
['a', 'man', 'is', 'tall'] | ['ein', 'Mann', 'ist', 'groß'] |
['my', 'house', 'is', 'small'] | ['klein', 'ist', 'mein', 'Haus'] |
table = learn_naive_model(train)
plot_table_for_target(table, "is") # try house
Find the highest scoring target $\target$
$$ \target^*(\source) = \argmax_\target \prod_i^{\length{\source}} \prob_\params(\ssource_i|\starget_i) $$
How about brute-force (like in our Structured Prediction example)?
How to find this $\argmax$?
$$ \argmax_{y_1,y_2} f_1(y_1) f_2(y_2) = \ldots $$
In Python:
def decode(source_sent, model, lm):
    # invert the table: map each source token to its candidate target tokens
    source_to_targets = defaultdict(list)
    for (source, target), prob in model.items():
        source_to_targets[source] += [(target, prob)]
    result = []
    for tok in source_sent:
        candidates = source_to_targets[tok]
        # rescore each candidate with the language model
        multiplied_with_lm = [(target, prob * lm.probability(target))
                              for target, prob in candidates]
        target = max(multiplied_with_lm, key=lambda t: t[1])
        result.append(target[0])
    return result
lm = UniformLM(set([target for _, target in table.keys()]))
decode(["ein", "Mann"], table, lm) # try other inputs with given vocab
['a', 'man']
Naive Model assumes a sequential alignment
word_mt.Alignment("the house is small".split(" "),
"das Haus ist klein".split(" "),
[(0,0),(1,1),(2,2),(3,3)])
But word order can differ!
word_mt.Alignment("the house is small".split(" "),
"klein ist das Haus".split(" "),
[(0,2),(1,3),(2,1),(3,0)])
Formalise alignments:
word_mt.Alignment("NULL the house is small".split(" "),
"klein ist das Haus".split(" "),
[(1,2),(2,3),(3,1),(4,0)])
$a_1 = 4$ ???
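One way to store such an alignment in code is as a map from source positions to target positions; a minimal sketch using the example above (variable names are illustrative):
# Alignment from the example above: each source position points to one target position
tgt_toks = "NULL the house is small".split(" ")
src_toks = "klein ist das Haus".split(" ")
a = {1: 4, 2: 3, 3: 1, 4: 2}  # a_i = index of the target word generating source word i

[(src_toks[i - 1], tgt_toks[a[i]]) for i in a]  # e.g. a_1 = 4: "klein" is generated by "small"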
Why the NULL Token?
word_mt.Alignment("NULL I like music".split(" "),
"音楽 が 好き".split(" "),
[(0,1),(2,2),(3,0)])
One-to-many translations
word_mt.Alignment("NULL the house is tiny".split(" "),
"das Haus ist sehr klein".split(" "),
[(1,0),(2,1),(3,2),(4,3),(4,4)])
Many-to-one translations?
Alignments are hidden when given a source!
Need to be inferred from the data (treated as latent variables)
In the late 80s and early 90s, researchers at IBM pioneered statistical, word-based MT
A famous IBM quote from that time
Every time I fire a linguist, the performance of the speech recognizer goes up
-- Fred Jelinek
The group around Robert Mercer was extremely successful
Translation and alignment model:
$$ p_\params^\text{IBM2}(\source,\aligns|\target) $$
Generative story with parameters $\params = (\alpha, \beta)$
Given target sentence $\target$ (e.g. NULL I like music) with length $l_\starget$ (e.g. 4):
Choose a source sentence length $l_\ssource$ with uniform probability $\epsilon$, e.g. $l_\ssource=3$
For each source position $i$, choose an aligned target position $a_i$ with distortion probability $\beta(a_i|i,l_\starget,l_\ssource)$
Generate the source word $\ssource_i$ with translation probability $\alpha(\ssource_i|\starget_{a_i})$
Example: $$ p_\params^\text{IBM2}(\text{das, Haus},1, 2 | \text{NULL, the, house}) = \ldots $$
Answer: $$ \a(das|the) \b(1|1,3,2) \ldots $$
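A minimal sketch of this computation in Python, with made-up values for the $\alpha$ and $\beta$ tables (the numbers below are illustrative, not learned parameters):
# Illustrative (not learned) parameter values for the example above
alpha_ex = {("das", "the"): 0.7, ("Haus", "house"): 0.8}  # alpha(s | t)
beta_ex = {(1, 1, 3, 2): 0.4, (2, 2, 3, 2): 0.5}          # beta(a_i | i, l_t, l_s)
epsilon = 0.1                                             # uniform source length probability

tgt = ["NULL", "the", "house"]
src = ["das", "Haus"]
alignment = [1, 2]  # a_1 = 1 ("das" from "the"), a_2 = 2 ("Haus" from "house")

prob = epsilon
for i, (s_word, a_i) in enumerate(zip(src, alignment), start=1):
    prob *= alpha_ex[(s_word, tgt[a_i])] * beta_ex[(a_i, i, len(tgt), len(src))]
prob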
If we had $\train = ((\source_i, \aligns_i, \target_i))_i$ we could optimise
$$ \sum_{(\source, \aligns, \target) \in \train} \log p_\params^\text{IBM2}(\source,\aligns|\target) $$
But we have no training alignments
Instead use sentence-aligned data $\train = ((\source_i, \target_i))_i$
and maximise the marginal log-likelihood
$$ \sum_{(\source, \target) \in \train} \log p_\params^\text{IBM2}(\source|\target) $$
where
$$ p_\params^\text{IBM2}(\source|\target) = \sum_{\aligns}p_\params^\text{IBM2}(\source,\aligns|\target) $$
No closed-form solution
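To see what this marginalisation involves, here is a brute-force sketch that enumerates every alignment vector for a tiny sentence pair, reusing the toy tables from the sketch above. This is exponential in the source length, so it is for illustration only:
from itertools import product

def model2_joint(src, alignment, tgt, alpha, beta, epsilon=0.1):
    # p(source, alignment | target) under the generative story above
    prob = epsilon
    for i, (s_word, a_i) in enumerate(zip(src, alignment), start=1):
        prob *= alpha.get((s_word, tgt[a_i]), 0.0) * beta.get((a_i, i, len(tgt), len(src)), 0.0)
    return prob

def model2_marginal(src, tgt, alpha, beta):
    # p(source | target): sum over every possible alignment vector
    return sum(model2_joint(src, list(a), tgt, alpha, beta)
               for a in product(range(len(tgt)), repeat=len(src)))

model2_marginal(src, tgt, alpha_ex, beta_ex)  # only one alignment has non-zero probability here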
The Expectation Maximisation (EM) algorithm maximises a lower bound of the marginal log-likelihood
Figures out (soft) alignments by iterating:
How is this possible without good initial parameters?
Can you find out which target words belong to which source words?
Assume static word order (but possibly different in each language)
word_mt.Alignment("Bb Aa | Aa Cc | Qq Pp".split(" "),
"Xx Yy | Zz Xx | Ss Tt".split(" "),[])
$\a(X|A)= 1/2 \qquad \a(Y|A)= 1/4 \qquad \a(X|B)= \qquad \a(Y|B)= \qquad \a(T|P)=$
$\b(1|1)= 1/2 \qquad \b(1|2)= 1/2$
Formalise and implement, but for a dataset with NULL tokens and varying word order, such that EM will have to infer non-trivial alignments
train_model_2_raw = [
("NULL the house is small" , "klein ist das Haus"),
("NULL a man is tall" , "groß ist ein Mann"),
("NULL my house is small" , "klein ist mein Haus"),
("NULL the building is big" , "groß ist das Gebäude"),
("NULL the building is long" , "lang ist das Gebäude")
]
train_model_2 = [(t.split(" "), s.split(" ")) for t,s in train_model_2_raw]
# What could "house" translate to?
util.Table(train_model_2_raw)
NULL the house is small | klein ist das Haus |
NULL a man is tall | groß ist ein Mann |
NULL my house is small | klein ist mein Haus |
NULL the building is big | groß ist das Gebäude |
NULL the building is long | lang ist das Gebäude |
Calculate posterior distribution over alignments given source and target
$$ \pi(\aligns|\source,\target) = p_\params^\text{IBM2}(\aligns|\source,\target) $$
Distribution factorizes for Model 2:
$$ \pi(\aligns|\source,\target) = \prod_i^{l_{\ssource}} \pi(a_i|\source,\target,i) $$
with
$$ \pi(a_i|\source,\target,i) = \frac {\alpha(\ssource_i|\starget_{a_i}) \beta(a_i|i,l_\starget,l_\ssource)} {\sum_j^{l_{\starget}} \alpha(\ssource_i|\starget_j) \beta(j|i,l_\starget,l_\ssource) } $$
word_mt.Alignment("1:NULL 2:the 3:house".split(" "),
                  "1:das 2:Haus".split(" "), [(1,0)])
For a uniform initial model:
source_vocab = set([tok for _,s in train_model_2 for tok in s])
target_vocab = set([tok for t,_ in train_model_2 for tok in t])
alpha = mt.create_translation_table(source_vocab, target_vocab)
# {'is':{'ist':0.5, 'das':0.5}}
beta = mt.create_distortion_table(5) #{1:1}
init_model = IBMModel2(alpha,beta)
align_matrices = e_step(init_model, train_model_2)
def show_alignment_for_model(sent, fixed_alpha = {}, fixed_beta = {}):
    alpha_1 = mt.create_translation_table(source_vocab, target_vocab, fixed=fixed_alpha)
    # {'is':{'ist':0.5, 'das':0.5}}
    beta_1 = mt.create_distortion_table(5) #{1:1}
    init_model_1 = IBMModel2(alpha_1, beta_1)
    align_matrices = e_step(init_model_1, train_model_2)
    return word_mt.Alignment.from_matrix(align_matrices[sent],
                                         train_model_2[sent][1],
                                         train_model_2[sent][0])
# try {'is':{'ist':0.5, 'Haus':0.5}}
show_alignment_for_model(3,{'is':{'ist':0.5, 'Haus':0.5}})
The M-Step optimizes a weighted or expected version of log-likelihood using distribution $\pi$ from last E-Step:
$$ \params^* = \argmax_\params \sum_{(\target,\source) \in \train} \sum_\aligns \pi(\aligns|\target,\source) \log \prob _\params^\text{IBM2}(\source,\aligns|\target) $$
Because $\pi$ factorizes we have a closed-form solution:
$$ \alpha(\ssource|\starget) = \frac {\sum_{(\target,\source)}\sum_i^{l_\ssource} \sum_j^{l_\starget} \pi(j|i) \delta(\ssource,\ssource_i) \delta(\starget,\starget_j) } {\sum_{(\target,\source)} \sum_j^{l_\starget} \delta(\starget,\starget_j) } $$
$\delta(x,y)$ is 1 if $x=y$ and 0 otherwise
$\beta$ parameters estimated similarly ...
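A minimal sketch of this update for the translation table, assuming (hypothetically) that the soft alignments are given as per-sentence matrices `pi[i][j]` over source positions `i` and target positions `j`, parallel to the training data; the notebook's own `m_step` may use a different layout:
from collections import defaultdict

def estimate_alpha(align_matrices, data):
    # alpha(s | t): expected (s, t) counts from the soft alignments, normalised by counts of t
    pair_counts = defaultdict(float)
    target_counts = defaultdict(float)
    for (tgt, src), pi in zip(data, align_matrices):
        for j, t_word in enumerate(tgt):
            target_counts[t_word] += 1.0
            for i, s_word in enumerate(src):
                pair_counts[(s_word, t_word)] += pi[i][j]
    return {(s, t): c / target_counts[t] for (s, t), c in pair_counts.items()}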
show_alignment_for_model(0)
Implement M-Step, estimating parameters $\params$ from (soft) alignments $\aligns$
theta1 = m_step(align_matrices, train_model_2)
def show_initial_alpha(target):
    plot_table_for_target(theta1.alpha, target)
show_initial_alpha("is")
align_matrices = e_step(theta1, train_model_2)
def show_initial_alignment(sent):
    return word_mt.Alignment.from_matrix(align_matrices[sent],
                                         train_model_2[sent][1],
                                         train_model_2[sent][0])
What do the alignments look like with these parameters?
show_initial_alignment(0)
Good initialisation is crucial for EM
Initialise with the parameters of a simpler model with fixed distortion table:
$$ \beta(a_i|i,l_\starget,l_\ssource) = \frac{1}{l_\starget + 1} $$
This is IBM Model 1!
Train IBM Model 1 and see if it converges
ibm1_iterations = em_model1(init_model, train_model_2, 100)
def plot_ibm1_change(end):
    plt.plot(range(0, end), [change for _, _, change in ibm1_iterations[:end]])
plot_ibm1_change(100)
Translation Table?
def show_ibm1_alpha(target):
    plot_table_for_target(ibm1_iterations[-1][1].alpha, target)
show_ibm1_alpha("house")
Alignments?
def show_ibm1_alignments(sent):
    return word_mt.Alignment.from_matrix(ibm1_iterations[-1][0][sent],
                                         train_model_2[sent][1],
                                         train_model_2[sent][0])
show_ibm1_alignments(0)
Distortions?
def show_ibm1_distortions(si):
    ibm1 = ibm1_iterations[-1][1]  # model 1 of last iteration
    util.plot_bar_graph([ibm1.beta[ti, si, 5, 4] for ti in range(0, 5)],
                        range(0, 5))  # change source index != 0
show_ibm1_distortions(0)
Now learn distortion table in Model 2
ibm1 = ibm1_iterations[-1][1] # model 1 of last iteration
def show_ibm2_alpha(num_iterations, target):
    ibm2_iterations = em_model2(ibm1, train_model_2, num_iterations)
    ibm2 = ibm2_iterations[-1][1]  # model 2 of last iteration
    plot_table_for_target(ibm2.alpha, target)

def show_ibm2_alignments(num_iterations, sent):
    ibm2_iterations = em_model2(ibm1, train_model_2, num_iterations)
    return word_mt.Alignment.from_matrix(ibm2_iterations[-1][0][sent],
                                         train_model_2[sent][1],
                                         train_model_2[sent][0])

def show_ibm2_distortions(num_iterations, si):
    ibm2_iterations = em_model2(ibm1, train_model_2, num_iterations)
    ibm2 = ibm2_iterations[-1][1]  # model 2 of last iteration
    util.plot_bar_graph([ibm2.beta[ti, si, 5, 4] for ti in range(0, 5)],
                        range(0, 5))  # change source index != 0
show_ibm2_distortions(10,0)
Alignments?
show_ibm2_alignments(10, 0)
Translation Table?
show_ibm2_alpha(4, "house")
Ideally marginalise out alignments
$$ \argmax_{\target} p_\params^\text{IBM2}(\source|\target) \prob^\text{LM}(\target) $$
with
$$ p_\params^\text{IBM2}(\source|\target) = \sum_{\aligns} p_\params^\text{IBM2}(\source,\aligns|\target) $$
Computationally very expensive, instead find both optimal alignment and translation:
$$ \argmax_{\target,\aligns} p_\params^\text{IBM2}(\source,\aligns|\target) \prob^\text{LM}(\target) $$
uni_lm = UniformLM({w for w in target_vocab if w != 'NULL'})
def show_decode_history(begin, end, beam=4, lm=uni_lm, num_iterations=3):
    ibm2_iterations = em_model2(ibm1, train_model_2, num_iterations)
    ibm2 = ibm2_iterations[-1][1]  # model 2 of last iteration
    hist2 = decode_model_2(ibm2, lm, source, beam)
    return mt.render_history(hist2[begin:end])
ngram_lm = LaplaceLM(NGramLM(lm_train, 3),0.1)
show_decode_history(begin=0,end=5,beam=2, lm=ngram_lm)
Target | Remaining | Len | Score |
---|---|---|---|
NULL | groß ist ein Mann | 4 | 0.00 |
NULL man | groß ist _ Mann | 3 | -9.20 |
NULL a | groß ist _ Mann | 3 | -2.49 |
NULL a tall | groß ist _ _ | 2 | -8.19 |
NULL a man | groß ist _ _ | 2 | -3.31 |
NULL a man NULL | groß _ _ _ | 1 | -7.09 |
NULL a man is | groß _ _ _ | 1 | -4.70 |
NULL a man is big | _ _ _ _ | 0 | -7.83 |
NULL a man is tall | _ _ _ _ | 0 | -5.52 |