#!/usr/bin/env python
# coding: utf-8

# Companion code for the blog post "Notes on 'Theory and Methods of Formal
# Analysis of Natural Language by Computer' (Ch13)" by Yam:
# https://yam.gift/2019/03/15/NLPFA/2019-03-15-Ch13-Ngram-and-Smoothing/
#
# Applies Add-One and Witten-Bell smoothing to a small bigram count table and
# prints, for each method, the smoothed probabilities scaled back by each
# history word's unigram count.

import numpy as np


# Unigram count of each word; the insertion order fixes the row order of the
# bigram table below.
wc = {
    "I": 3437,
    "want": 1215,
    "to": 3256,
    "eat": 938,
    "Chinese": 213,
    "food": 1506,
    "lunch": 459,
}

# Raw bigram counts: row i belongs to the i-th word of `wc` (the history word).
# Presumably column j is the j-th word of `wc` as the following word -- the
# code below never indexes columns by word, so verify against the data source.
rawbigramc = [
    [8, 1087, 0, 12, 0, 0, 0],
    [3, 0, 786, 0, 6, 8, 6],
    [3, 0, 10, 860, 3, 0, 12],
    [0, 0, 2, 0, 19, 2, 52],
    [2, 0, 0, 0, 0, 120, 1],
    [19, 0, 17, 0, 0, 0, 0],
    [4, 0, 0, 0, 0, 1, 0],
]

# NOTE(review): despite the name, both formulas below use this value in the
# position of the vocabulary size V (a type count) -- confirm with the source.
tokenc = 1616

# Per-history-word value used as T in Witten-Bell, keyed as "<word> x".
# Presumably the number of distinct bigram types that start with the word.
wtype_bic = {
    "I x": 95,
    "want x": 76,
    "to x": 130,
    "eat x": 124,
    "Chinese x": 20,
    "food x": 82,
    "lunch x": 45,
}

# Single source of truth for row order, shared by every loop below.
words = list(wc)
history_counts = [wc[w] for w in words]
seen_types = [wtype_bic[w + " x"] for w in words]


def add_one_counts(raw_counts):
    """Return a copy of the count table with every cell incremented by one."""
    return [[count + 1 for count in row] for row in raw_counts]


def add_one_probs(raw_counts, history_counts, vocab_size):
    """Return Add-One smoothed bigram probabilities as a list of rows.

    Each cell is ``(c + 1) / (N + V)`` where ``c`` is the raw bigram count,
    ``N`` the entry of ``history_counts`` for that row and ``V`` is
    ``vocab_size``.

    Args:
        raw_counts: table of raw bigram counts, one row per history word.
        history_counts: unigram count of each row's history word, in row order.
        vocab_size: value added to every row's denominator.
    """
    return [
        [count / (n + vocab_size) for count in row]
        for row, n in zip(add_one_counts(raw_counts), history_counts)
    ]


def witten_bell_probs(raw_counts, history_counts, seen_types, vocab_size):
    """Return Witten-Bell smoothed bigram probabilities as a list of rows.

    For a row with history count ``N`` and seen-type count ``T``:

    * a cell with count ``c > 0`` gets ``c / (N + T)``;
    * a cell with count 0 gets ``T / ((N + T) * Z)`` with ``Z = V - T``.

    Args:
        raw_counts: table of raw bigram counts, one row per history word.
        history_counts: unigram count of each row's history word, in row order.
        seen_types: the ``T`` value of each row, in row order.
        vocab_size: the ``V`` used to derive ``Z``.
    """
    table = []
    for row, n, t in zip(raw_counts, history_counts, seen_types):
        z = vocab_size - t
        table.append(
            [count / (n + t) if count > 0 else t / ((n + t) * z) for count in row]
        )
    return table


def print_scaled_rows(prob, history_counts):
    """Print each probability row multiplied by its history count, rounded to 3 places."""
    for row, n in zip(prob, history_counts):
        print([round(value, 3) for value in row * n])


# ## Add-One

bigramc = add_one_counts(rawbigramc)

res = add_one_probs(rawbigramc, history_counts, tokenc)
prob = np.array(res)

print_scaled_rows(prob, history_counts)


# ## Witten-Bell

res = witten_bell_probs(rawbigramc, history_counts, seen_types, tokenc)
prob = np.array(res)

print_scaled_rows(prob, history_counts)