In [1]:
import numpy as np
import itertools
from collections import Counter

In [2]:
# Toy training corpus: each document is a list of whitespace-separated tokens.
traind = [
    sentence.split()
    for sentence in (
        "just plain boring",
        "entirely predictable and lacks energy",
        "no surprises and very few laughs",
        "very powerful",
        "the most fun film of the summer",
    )
]

In [3]:
# One class label per training document, aligned by position with `traind`
# (presumably 0 = negative and 1 = positive sentiment -- confirm).
yd = [0] * 3 + [1] * 2

In [4]:
# Single tokenised document to classify; "with" does not occur in the
# training corpus, so it exercises the out-of-vocabulary path.
testd = "predictable with no fun".split()

In [9]:
def train_naive_bayes(X, y):
    """Train a multinomial Naive Bayes classifier with add-one smoothing.

    Parameters
    ----------
    X : sequence of sequences of hashable tokens
        Tokenised training documents.
    y : sequence
        Class label of each document, aligned by position with ``X``.

    Returns
    -------
    logpc : dict
        ``logpc[c]`` is the log prior, log(#documents in c / #documents).
    logpwc : dict
        ``logpwc[w, c]`` is the smoothed log likelihood,
        log((count(w, c) + 1) / (#tokens in c + |V|)), for every word ``w``
        in the vocabulary and every class ``c`` in ``y``.
    V : set
        Vocabulary: every distinct token occurring in ``X``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same length.
    """
    Ndoc = len(X)
    if len(y) != Ndoc:
        raise ValueError(
            "X and y must have the same length (got %d and %d)" % (Ndoc, len(y))
        )

    V = set(itertools.chain.from_iterable(X))
    vocab_size = len(V)
    logpc = {}
    logpwc = {}

    for c in set(y):
        class_docs = [doc for doc, label in zip(X, y) if label == c]
        logpc[c] = np.log(len(class_docs) / Ndoc)

        # Token counts and the smoothing denominator depend only on the class,
        # so compute them once here rather than once per vocabulary word.
        countc = Counter(itertools.chain.from_iterable(class_docs))
        denom = sum(countc.values()) + vocab_size

        for w in V:
            # Counter returns 0 for unseen words; +1 is the add-one smoothing.
            logpwc[w, c] = np.log((countc[w] + 1) / denom)

    return logpc, logpwc, V

In [6]:
# Fit the classifier on the toy corpus: log priors, log likelihoods, vocabulary.
logpc, logpwc, V = train_naive_bayes(X=traind, y=yd)

In [7]:
def test_naive_bayes(testd, logpc, logpwc, y ,V):
sum_c = {}
for c in set(y):
sum_c[c] = logpc[c]
for i in range(len(testd)):
w = testd[i]
if w in V:
sum_c[c] = sum_c[c] + logpwc[w,c]
sortres = sorted(sum_c.items(), key=lambda x: x[1])
return sortres[0][0]

In [8]:
# Classify the held-out document; the bare call is the cell's displayed value.
test_naive_bayes(
    testd=testd,
    logpc=logpc,
    logpwc=logpwc,
    y=yd,
    V=V,
)

Out[8]:
1
In [ ]: