#!/usr/bin/env python
# coding: utf-8

# Companion code for the blog post (notes on Ch. 13, N-grams and smoothing):
# [自然语言计算机形式分析的理论与方法笔记(Ch13) | Yam](https://yam.gift/2019/03/15/NLPFA/2019-03-15-Ch13-Ngram-and-Smoothing/)

# In[4]:


import numpy as np


# In[1]:


# Count of each of the seven example words, keyed by the word itself.
# Both smoothing computations below use wc[w] as the base count of the row
# word w (presumably its corpus unigram count — TODO confirm the source).
# NOTE: the printing loops walk wc.keys() by position and pair the i-th key
# with row i of the probability table, so the key order here must match the
# row order of rawbigramc.
wc = {
    "I": 3437,
    "want": 1215,
    "to": 3256,
    "eat": 938,
    "Chinese": 213,
    "food": 1506,
    "lunch": 459,
}


# In[111]:


rawbigramc = [
    [8    , 1087 , 0    , 12   , 0       , 0    , 0]     ,
    [3    , 0    , 786  , 0    , 6       , 8    , 6]     ,
    [3    , 0    , 10   , 860  , 3       , 0    , 12]    ,
    [0    , 0    , 2    , 0    , 19      , 2    , 52]    ,
    [2    , 0    , 0    , 0    , 0       , 120  , 1]     ,
    [19   , 0    , 17   , 0    , 0       , 0    , 0]     ,
    [4    , 0    , 0    , 0    , 0       , 1    , 0]     ,
]


# In[117]:


# Constant shared by both smoothing schemes: it is added to the word count in
# the add-one denominator, and the Witten-Bell code subtracts a word's type
# count from it to obtain z.
# NOTE(review): despite the "token count" name, those formulas conventionally
# take the vocabulary size (number of word types) here — confirm the intent.
tokenc = 1616


# In[92]:


# Per-word value t used by the Witten-Bell computation, stored under the key
# "<word> x" (the lookup there is wtype_bic[w + " x"]).
# Presumably the number of distinct word types seen following <word>, i.e.
# the number of distinct bigram types "<word> x" — TODO confirm.
wtype_bic = {
    "I x": 95,
    "want x": 76,
    "to x": 130,
    "eat x": 124,
    "Chinese x": 20,
    "food x": 82,
    "lunch x": 45
}


# ## Add-One

# In[114]:


# Add-one step: bump every raw bigram count by one, keeping the row layout
# of rawbigramc (a new list of lists; rawbigramc itself is left untouched).
bigramc = [[count + 1 for count in row] for row in rawbigramc]


# In[120]:


# Add-one probability table: every (already incremented) bigram count in a
# row is divided by that row word's count plus tokenc.
res = []
for idx, word in enumerate("I want to eat Chinese food lunch".split()):
    denominator = wc[word] + tokenc
    res.append([count / denominator for count in bigramc[idx]])
# res is already 7 rows of 7 values; the reshape just pins that shape.
prob = np.array(res).reshape(7, 7)


# In[121]:


# Print one row per word: the add-one probabilities in row i scaled back by
# the i-th word's count from wc, rounded to three decimals.
# Rounding with np.round and converting via .tolist() yields built-in floats,
# so the row prints as [0.68, ...] instead of [np.float64(0.68), ...], which
# is how a list of NumPy scalars is shown under NumPy >= 2.0.
for i, w in enumerate(wc):
    print(np.round(prob[i] * wc[w], 3).tolist())


# ## Witten-Bell

# In[118]:


# Witten-Bell probability table, built row by row. For a row word with
#   n = wc[word], t = wtype_bic["<word> x"], z = tokenc - t
# each raw bigram count becomes
#   count / (n + t)        when count > 0
#   t / ((n + t) * z)      otherwise
res = []
for idx, word in enumerate("I want to eat Chinese food lunch".split()):
    n = wc[word]
    t = wtype_bic[word + " x"]
    z = tokenc - t
    row = []
    for count in rawbigramc[idx]:
        if count > 0:
            row.append(count / (n + t))
        else:
            # Evaluated only for non-positive counts, as in the one-line form.
            row.append(t / ((n + t) * z))
    res.append(row)
# res is already 7 rows of 7 values; the reshape just pins that shape.
prob = np.array(res).reshape(7, 7)


# In[119]:


# Print one row per word: the Witten-Bell probabilities in row i scaled back
# by the i-th word's count from wc, rounded to three decimals.
# Rounding with np.round and converting via .tolist() yields built-in floats,
# so the row prints as [0.061, ...] instead of [np.float64(0.061), ...], which
# is how a list of NumPy scalars is shown under NumPy >= 2.0.
for i, w in enumerate(wc):
    print(np.round(prob[i] * wc[w], 3).tolist())


# In[ ]:


# In[ ]: