#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_cell_magic('capture', '', '%load_ext autoreload\n%autoreload 2\n%matplotlib inline\nimport sys\nsys.path.append("..")\nimport statnlpbook.util as util\nimport statnlpbook.mle as smle\nfrom statnlpbook.util import safe_log as log\nutil.execute_notebook(\'mle.ipynb\')\n')


# $$
# \newcommand{\Xs}{\mathcal{X}}
# \newcommand{\Ys}{\mathcal{Y}}
# \newcommand{\y}{\mathbf{y}}
# \newcommand{\balpha}{\boldsymbol{\alpha}}
# \newcommand{\bbeta}{\boldsymbol{\beta}}
# \newcommand{\aligns}{\mathbf{a}}
# \newcommand{\align}{a}
# \newcommand{\source}{\mathbf{s}}
# \newcommand{\target}{\mathbf{t}}
# \newcommand{\ssource}{s}
# \newcommand{\starget}{t}
# \newcommand{\repr}{\mathbf{f}}
# \newcommand{\repry}{\mathbf{g}}
# \newcommand{\x}{\mathbf{x}}
# \newcommand{\prob}{p}
# \newcommand{\a}{\alpha}
# \newcommand{\b}{\beta}
# \newcommand{\vocab}{V}
# \newcommand{\params}{\boldsymbol{\theta}}
# \newcommand{\param}{\theta}
# \DeclareMathOperator{\perplexity}{PP}
# \DeclareMathOperator{\argmax}{argmax}
# \DeclareMathOperator{\argmin}{argmin}
# \newcommand{\train}{\mathcal{D}}
# \newcommand{\counts}[2]{\#_{#1}(#2) }
# \newcommand{\length}[1]{\text{length}(#1) }
# \newcommand{\indi}{\mathbb{I}}
# \newcommand{\china}{\text{China}}
# \newcommand{\mexico}{\text{Mexico}}
# \newcommand{\paramc}{\param_\china}
# \newcommand{\paramm}{\param_\mexico}
# \newcommand{\countc}{\counts{\train}{\china}}
# \newcommand{\countm}{\counts{\train}{\mexico}}
# $$

# # Maximum Likelihood Estimation
# for **ShallowDrumpf**!

# What does
# $$
# \argmax_\params \sum_{(\x,\y) \in \train} \log \prob_\params(\x,\y)
# $$
# have to do with counting?

# ## Application: ShallowDrumpf
#
# Develop a **unigram language model** for generating simplified Trump speeches:
#
# > China, China, China, Mexico, China, Mexico ...

# ## Model
#
# $$
# \prob_\params(w) = \params_w
# $$
#
# $$
# \prob_\params(\text{China}) = \params_\text{China} \qquad \prob_\params(\text{Mexico}) = \params_\text{Mexico}
# $$

# In[9]:

m = "Mexico"
c = "China"

def prob(th_china, th_mexico, word):
    # probability of a single word under the two-parameter unigram model
    return th_china if word == 'China' else th_mexico

prob(0.7, 0.3, 'China')


# ## Maximum Likelihood Objective
#
# $$
# l(\params) = \sum_{w \in \train} \log \prob_\params(w)
# $$
# $$
# l(\params) = \countc \log \paramc + \countm \log \paramm
# $$

# The solution is **counting**:
#
# $$
# \paramc = \frac{\countc}{\countc + \countm}
# $$

# In[3]:

def mle(data):
    # fraction of tokens that are 'China'; the remainder must be 'Mexico'
    theta_china = len([w for w in data if w == 'China']) / len(data)
    return theta_china, 1.0 - theta_china

mle([c, c, m, c])


# ### Loss Surface

# In[11]:

def ll(th_china, th_mexico, data):
    # log-likelihood of the data under the given parameters (safe_log avoids log(0))
    return sum([log(prob(th_china, th_mexico, w)) for w in data])

data = [c, c, m, c]
# How does this graph look with all Cs?
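# Not in the original slides: a quick numeric sanity check, using the `ll`, `mle`
# and `data` defined above, that the counting solution really attains a higher
# log-likelihood than other parameter settings on the simplex. The printed values
# assume `safe_log` behaves like the natural log for positive inputs.
theta_c, theta_m = mle(data)        # (0.75, 0.25) for [China, China, Mexico, China]
print(ll(theta_c, theta_m, data))   # log-likelihood at the MLE, about -2.25
print(ll(0.5, 0.5, data))           # uniform parameters: about -2.77, lower
print(ll(0.25, 0.75, data))         # swapped parameters: about -4.45, lower still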
smle.plot_mle_graph(lambda x, y: ll(x, y, data), mle(data), x_label='China', y_label='Mexico')


# The solution is trivial (and useless) without **constraints**.

# Constraints:
#
# * $0 \leq \paramc \leq 1 $
# * $0 \leq \paramm \leq 1 $
# * $\paramc + \paramm = 1$

# * The constraint $\paramc + \paramm = 1$ is an isoline of $g(\paramc,\paramm)=\paramc + \paramm$

# In[5]:

smle.plot_mle_graph(lambda x, y: ll(x, y, data), mle(data), show_constraint=True)


# ## Gradients at Optimum

# In[6]:

smle.plot_mle_graph(lambda x, y: ll(x, y, data), mle(data), show_constraint=True, show_optimum=True)


# At the constrained optimum, the gradient of the loss is parallel to the gradient of the constraint:
#
# $$
# \nabla_\params l(\params) = \alpha \nabla_\params g(\params)
# $$
# $$
# l(\params) = \countc \log \paramc + \countm \log \paramm
# $$
# $$
# \frac{\partial l(\params)}{\partial \paramc} = \frac{\countc}{\paramc}
# $$
# $$
# g(\params) = \paramc + \paramm
# $$
# $$
# \frac{\partial g(\params)}{\partial \paramc} = 1
# $$
# $$
# \frac{\partial l(\params)}{\partial \paramc} = \alpha \frac{\partial g(\params)}{\partial \paramc}
# $$
# $$
# \frac{\countc}{\paramc} = \alpha
# $$
# $$
# \paramc = \frac{\countc}{\alpha} = \ldots
# $$
# $$
# \paramm = \frac{\countm}{\alpha} = \ldots
# $$
#
# Substituting both into the constraint $\paramc + \paramm = 1$ gives $\alpha = \countc + \countm$, and hence
#
# $$
# \paramc = \frac{\countc}{\countc + \countm}
# $$

# ## Summary
#
# * Derive the MLE by
#     * equating the loss gradient with the (scaled) constraint gradient
#     * using the constraint equation to solve for the scale $\alpha$
# * Easy to extend to any discrete generative model with conditional probability tables (a sketch for a general vocabulary follows after the background material)
# * Learning goal: be able to derive these equations for new models

# ## Background Material
# * Introduction to MLE in [Mike Collins' notes](http://www.cs.columbia.edu/~mcollins/em.pdf)
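# ### Sketch: counting MLE for a general vocabulary
#
# Not part of the original slides (referenced from the summary above): a minimal
# sketch of how the same counting solution extends beyond the two-word
# China/Mexico model. It only uses the Python standard library; `unigram_mle` is
# a hypothetical helper name, and the derivation is the same per word type
# (loss gradient parallel to the gradient of the sum-to-one constraint).

# In[ ]:

from collections import Counter

def unigram_mle(tokens):
    """MLE unigram distribution: theta_w = count(w) / total number of tokens."""
    counts = Counter(tokens)
    total = sum(counts.values())
    return {word: count / total for word, count in counts.items()}

# Reproduces the two-parameter result from above: {'China': 0.75, 'Mexico': 0.25}
print(unigram_mle(["China", "China", "Mexico", "China"]))

# Generalises to any number of word types, e.g. with one extra (made-up) token:
print(unigram_mle(["China", "China", "Mexico", "China", "wall"]))
# -> {'China': 0.6, 'Mexico': 0.2, 'wall': 0.2}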