$$ P(y \mid x) \propto \underbrace{P(y)}_{\textit{prior}} \prod_i P( f_i \mid y) $$
import pprint
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.util import names_demo, names_demo_features
# Show the feature dictionary that NLTK's demo extractor builds for one name.
test_features = names_demo_features("anoop")
print("Features used to classify a name as male or female:")
# `pp` is reused by later cells, so keep it as a module-level name.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(test_features)
Features used to classify a name as male or female: { 'alwayson': True, 'count(a)': 1, 'count(b)': 0, 'count(c)': 0, 'count(d)': 0, 'count(e)': 0, 'count(f)': 0, 'count(g)': 0, 'count(h)': 0, 'count(i)': 0, 'count(j)': 0, 'count(k)': 0, 'count(l)': 0, 'count(m)': 0, 'count(n)': 1, 'count(o)': 2, 'count(p)': 1, 'count(q)': 0, 'count(r)': 0, 'count(s)': 0, 'count(t)': 0, 'count(u)': 0, 'count(v)': 0, 'count(w)': 0, 'count(x)': 0, 'count(y)': 0, 'count(z)': 0, 'endswith': 'p', 'has(a)': True, 'has(b)': False, 'has(c)': False, 'has(d)': False, 'has(e)': False, 'has(f)': False, 'has(g)': False, 'has(h)': False, 'has(i)': False, 'has(j)': False, 'has(k)': False, 'has(l)': False, 'has(m)': False, 'has(n)': True, 'has(o)': True, 'has(p)': True, 'has(q)': False, 'has(r)': False, 'has(s)': False, 'has(t)': False, 'has(u)': False, 'has(v)': False, 'has(w)': False, 'has(x)': False, 'has(y)': False, 'has(z)': False, 'startswith': 'a'}
# Train a Naive Bayes classifier through NLTK's names demo. The demo prints
# its own training/testing report and a few example predictions (see the
# output below); the classifier it returns is kept for the following cells.
print("Train NaiveBayes classifier and run on some example input names:")
classifier = names_demo(NaiveBayesClassifier.train)
Train NaiveBayes classifier and run on some example input names: Training classifier... Testing classifier... Accuracy: 0.7820 Avg. log likelihood: -0.7476 Unseen Names P(Male) P(Female) ---------------------------------------- Kelli 0.0132 *0.9868 Er *0.8826 0.1174 Ally 0.0903 *0.9097 Stephan *0.8361 0.1639 Chriss 0.6864 *0.3136
# Score a single name with the classifier trained on the demo features.
name = 'nate'
print("Run trained classifier on input name:", name)
test_features = names_demo_features(name)
output = classifier.prob_classify(test_features)
# Pull both label probabilities out of the returned distribution first,
# then report them.
male_prob = output.prob('male')
female_prob = output.prob('female')
print("P(male|{0})={1}".format(name, male_prob))
print("P(female|{0})={1}".format(name, female_prob))
Run trained classifier on input name: nate P(male|nate)=0.08246413295145613 P(female|nate)=0.9175358670485438
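The informativeness of a feature `feature_type = feature_value` (written $f=v$) is the ratio between the probability of that feature under one label and its probability under the other. So if there are two labels, $\ell_1$ and $\ell_2$:
$$ score(f=v) = \frac{ P( f=v \mid \ell_1 ) }{ P( f=v \mid \ell_2 ) } $$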
If there are more than 2 labels, say $\ell_1, \ldots, \ell_n$, then just compare one label versus all others:
$$ score(f=v) = \frac{ P( f=v \mid \ell_i ) }{ \sum_{k \neq i} P( f=v \mid \ell_k ) } $$

We sort all the features by this score and report the top 10 below.
# Print the classifier's most informative features as label-vs-label ratios
# (see the table in the output below).
classifier.show_most_informative_features()
Most Informative Features endswith = 'a' female : male = 31.5 : 1.0 endswith = 'p' male : female = 14.2 : 1.0 endswith = 'v' male : female = 13.0 : 1.0 endswith = 'f' male : female = 10.5 : 1.0 endswith = 'm' male : female = 10.3 : 1.0 endswith = 'd' male : female = 10.2 : 1.0 endswith = 'o' male : female = 7.7 : 1.0 count(v) = 2 female : male = 6.5 : 1.0 endswith = 'r' male : female = 6.4 : 1.0 endswith = 'w' male : female = 6.1 : 1.0
def bigram_features(name):
    """Build a feature dictionary for a name from its letters and bigrams.

    All features are computed on the lowercased name. The dictionary holds:

    * ``'alwayson'``: always ``True``.
    * ``'startswith'`` / ``'endswith'``: the lowercased first / last
      character of ``name``.
    * ``'count(x)'`` / ``'has(x)'``: occurrence count and presence flag for
      each letter ``a``-``z``.
    * ``'count2(xy)'`` / ``'has2(xy)'``: occurrence count and presence flag
      for each of the 26 * 26 letter pairs. Counts come from ``str.count``,
      so overlapping occurrences are not counted twice ("aaa" has one "aa").

    Args:
        name: The name to extract features from; must be a non-empty string.

    Returns:
        A dict mapping feature names to their values.

    Raises:
        IndexError: If ``name`` is empty (there is no first/last character).
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Lowercase once up front instead of once per feature lookup.
    lowered = name.lower()

    features = {}
    features['alwayson'] = True
    features['startswith'] = name[0].lower()
    features['endswith'] = name[-1].lower()

    # Single-letter features.
    for letter in alphabet:
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered

    # Bigram features over every ordered pair of letters.
    for letter1 in alphabet:
        for letter2 in alphabet:
            bigram = "%s%s" % (letter1, letter2)
            features['count2(%s)' % bigram] = lowered.count(bigram)
            features['has2(%s)' % bigram] = bigram in lowered
    return features
# Pretty-print the full feature dictionary for one example name, using the
# PrettyPrinter `pp` created in an earlier cell.
pp.pprint(bigram_features("Dementor"))
{ 'alwayson': True, 'count(a)': 0, 'count(b)': 0, 'count(c)': 0, 'count(d)': 1, 'count(e)': 2, 'count(f)': 0, 'count(g)': 0, 'count(h)': 0, 'count(i)': 0, 'count(j)': 0, 'count(k)': 0, 'count(l)': 0, 'count(m)': 1, 'count(n)': 1, 'count(o)': 1, 'count(p)': 0, 'count(q)': 0, 'count(r)': 1, 'count(s)': 0, 'count(t)': 1, 'count(u)': 0, 'count(v)': 0, 'count(w)': 0, 'count(x)': 0, 'count(y)': 0, 'count(z)': 0, 'count2(aa)': 0, 'count2(ab)': 0, 'count2(ac)': 0, 'count2(ad)': 0, 'count2(ae)': 0, 'count2(af)': 0, 'count2(ag)': 0, 'count2(ah)': 0, 'count2(ai)': 0, 'count2(aj)': 0, 'count2(ak)': 0, 'count2(al)': 0, 'count2(am)': 0, 'count2(an)': 0, 'count2(ao)': 0, 'count2(ap)': 0, 'count2(aq)': 0, 'count2(ar)': 0, 'count2(as)': 0, 'count2(at)': 0, 'count2(au)': 0, 'count2(av)': 0, 'count2(aw)': 0, 'count2(ax)': 0, 'count2(ay)': 0, 'count2(az)': 0, 'count2(ba)': 0, 'count2(bb)': 0, 'count2(bc)': 0, 'count2(bd)': 0, 'count2(be)': 0, 'count2(bf)': 0, 'count2(bg)': 0, 'count2(bh)': 0, 'count2(bi)': 0, 'count2(bj)': 0, 'count2(bk)': 0, 'count2(bl)': 0, 'count2(bm)': 0, 'count2(bn)': 0, 'count2(bo)': 0, 'count2(bp)': 0, 'count2(bq)': 0, 'count2(br)': 0, 'count2(bs)': 0, 'count2(bt)': 0, 'count2(bu)': 0, 'count2(bv)': 0, 'count2(bw)': 0, 'count2(bx)': 0, 'count2(by)': 0, 'count2(bz)': 0, 'count2(ca)': 0, 'count2(cb)': 0, 'count2(cc)': 0, 'count2(cd)': 0, 'count2(ce)': 0, 'count2(cf)': 0, 'count2(cg)': 0, 'count2(ch)': 0, 'count2(ci)': 0, 'count2(cj)': 0, 'count2(ck)': 0, 'count2(cl)': 0, 'count2(cm)': 0, 'count2(cn)': 0, 'count2(co)': 0, 'count2(cp)': 0, 'count2(cq)': 0, 'count2(cr)': 0, 'count2(cs)': 0, 'count2(ct)': 0, 'count2(cu)': 0, 'count2(cv)': 0, 'count2(cw)': 0, 'count2(cx)': 0, 'count2(cy)': 0, 'count2(cz)': 0, 'count2(da)': 0, 'count2(db)': 0, 'count2(dc)': 0, 'count2(dd)': 0, 'count2(de)': 1, 'count2(df)': 0, 'count2(dg)': 0, 'count2(dh)': 0, 'count2(di)': 0, 'count2(dj)': 0, 'count2(dk)': 0, 'count2(dl)': 0, 'count2(dm)': 0, 'count2(dn)': 0, 'count2(do)': 0, 
'count2(dp)': 0, 'count2(dq)': 0, 'count2(dr)': 0, 'count2(ds)': 0, 'count2(dt)': 0, 'count2(du)': 0, 'count2(dv)': 0, 'count2(dw)': 0, 'count2(dx)': 0, 'count2(dy)': 0, 'count2(dz)': 0, 'count2(ea)': 0, 'count2(eb)': 0, 'count2(ec)': 0, 'count2(ed)': 0, 'count2(ee)': 0, 'count2(ef)': 0, 'count2(eg)': 0, 'count2(eh)': 0, 'count2(ei)': 0, 'count2(ej)': 0, 'count2(ek)': 0, 'count2(el)': 0, 'count2(em)': 1, 'count2(en)': 1, 'count2(eo)': 0, 'count2(ep)': 0, 'count2(eq)': 0, 'count2(er)': 0, 'count2(es)': 0, 'count2(et)': 0, 'count2(eu)': 0, 'count2(ev)': 0, 'count2(ew)': 0, 'count2(ex)': 0, 'count2(ey)': 0, 'count2(ez)': 0, 'count2(fa)': 0, 'count2(fb)': 0, 'count2(fc)': 0, 'count2(fd)': 0, 'count2(fe)': 0, 'count2(ff)': 0, 'count2(fg)': 0, 'count2(fh)': 0, 'count2(fi)': 0, 'count2(fj)': 0, 'count2(fk)': 0, 'count2(fl)': 0, 'count2(fm)': 0, 'count2(fn)': 0, 'count2(fo)': 0, 'count2(fp)': 0, 'count2(fq)': 0, 'count2(fr)': 0, 'count2(fs)': 0, 'count2(ft)': 0, 'count2(fu)': 0, 'count2(fv)': 0, 'count2(fw)': 0, 'count2(fx)': 0, 'count2(fy)': 0, 'count2(fz)': 0, 'count2(ga)': 0, 'count2(gb)': 0, 'count2(gc)': 0, 'count2(gd)': 0, 'count2(ge)': 0, 'count2(gf)': 0, 'count2(gg)': 0, 'count2(gh)': 0, 'count2(gi)': 0, 'count2(gj)': 0, 'count2(gk)': 0, 'count2(gl)': 0, 'count2(gm)': 0, 'count2(gn)': 0, 'count2(go)': 0, 'count2(gp)': 0, 'count2(gq)': 0, 'count2(gr)': 0, 'count2(gs)': 0, 'count2(gt)': 0, 'count2(gu)': 0, 'count2(gv)': 0, 'count2(gw)': 0, 'count2(gx)': 0, 'count2(gy)': 0, 'count2(gz)': 0, 'count2(ha)': 0, 'count2(hb)': 0, 'count2(hc)': 0, 'count2(hd)': 0, 'count2(he)': 0, 'count2(hf)': 0, 'count2(hg)': 0, 'count2(hh)': 0, 'count2(hi)': 0, 'count2(hj)': 0, 'count2(hk)': 0, 'count2(hl)': 0, 'count2(hm)': 0, 'count2(hn)': 0, 'count2(ho)': 0, 'count2(hp)': 0, 'count2(hq)': 0, 'count2(hr)': 0, 'count2(hs)': 0, 'count2(ht)': 0, 'count2(hu)': 0, 'count2(hv)': 0, 'count2(hw)': 0, 'count2(hx)': 0, 'count2(hy)': 0, 'count2(hz)': 0, 'count2(ia)': 0, 'count2(ib)': 0, 
'count2(ic)': 0, 'count2(id)': 0, 'count2(ie)': 0, 'count2(if)': 0, 'count2(ig)': 0, 'count2(ih)': 0, 'count2(ii)': 0, 'count2(ij)': 0, 'count2(ik)': 0, 'count2(il)': 0, 'count2(im)': 0, 'count2(in)': 0, 'count2(io)': 0, 'count2(ip)': 0, 'count2(iq)': 0, 'count2(ir)': 0, 'count2(is)': 0, 'count2(it)': 0, 'count2(iu)': 0, 'count2(iv)': 0, 'count2(iw)': 0, 'count2(ix)': 0, 'count2(iy)': 0, 'count2(iz)': 0, 'count2(ja)': 0, 'count2(jb)': 0, 'count2(jc)': 0, 'count2(jd)': 0, 'count2(je)': 0, 'count2(jf)': 0, 'count2(jg)': 0, 'count2(jh)': 0, 'count2(ji)': 0, 'count2(jj)': 0, 'count2(jk)': 0, 'count2(jl)': 0, 'count2(jm)': 0, 'count2(jn)': 0, 'count2(jo)': 0, 'count2(jp)': 0, 'count2(jq)': 0, 'count2(jr)': 0, 'count2(js)': 0, 'count2(jt)': 0, 'count2(ju)': 0, 'count2(jv)': 0, 'count2(jw)': 0, 'count2(jx)': 0, 'count2(jy)': 0, 'count2(jz)': 0, 'count2(ka)': 0, 'count2(kb)': 0, 'count2(kc)': 0, 'count2(kd)': 0, 'count2(ke)': 0, 'count2(kf)': 0, 'count2(kg)': 0, 'count2(kh)': 0, 'count2(ki)': 0, 'count2(kj)': 0, 'count2(kk)': 0, 'count2(kl)': 0, 'count2(km)': 0, 'count2(kn)': 0, 'count2(ko)': 0, 'count2(kp)': 0, 'count2(kq)': 0, 'count2(kr)': 0, 'count2(ks)': 0, 'count2(kt)': 0, 'count2(ku)': 0, 'count2(kv)': 0, 'count2(kw)': 0, 'count2(kx)': 0, 'count2(ky)': 0, 'count2(kz)': 0, 'count2(la)': 0, 'count2(lb)': 0, 'count2(lc)': 0, 'count2(ld)': 0, 'count2(le)': 0, 'count2(lf)': 0, 'count2(lg)': 0, 'count2(lh)': 0, 'count2(li)': 0, 'count2(lj)': 0, 'count2(lk)': 0, 'count2(ll)': 0, 'count2(lm)': 0, 'count2(ln)': 0, 'count2(lo)': 0, 'count2(lp)': 0, 'count2(lq)': 0, 'count2(lr)': 0, 'count2(ls)': 0, 'count2(lt)': 0, 'count2(lu)': 0, 'count2(lv)': 0, 'count2(lw)': 0, 'count2(lx)': 0, 'count2(ly)': 0, 'count2(lz)': 0, 'count2(ma)': 0, 'count2(mb)': 0, 'count2(mc)': 0, 'count2(md)': 0, 'count2(me)': 1, 'count2(mf)': 0, 'count2(mg)': 0, 'count2(mh)': 0, 'count2(mi)': 0, 'count2(mj)': 0, 'count2(mk)': 0, 'count2(ml)': 0, 'count2(mm)': 0, 'count2(mn)': 0, 'count2(mo)': 0, 
'count2(mp)': 0, 'count2(mq)': 0, 'count2(mr)': 0, 'count2(ms)': 0, 'count2(mt)': 0, 'count2(mu)': 0, 'count2(mv)': 0, 'count2(mw)': 0, 'count2(mx)': 0, 'count2(my)': 0, 'count2(mz)': 0, 'count2(na)': 0, 'count2(nb)': 0, 'count2(nc)': 0, 'count2(nd)': 0, 'count2(ne)': 0, 'count2(nf)': 0, 'count2(ng)': 0, 'count2(nh)': 0, 'count2(ni)': 0, 'count2(nj)': 0, 'count2(nk)': 0, 'count2(nl)': 0, 'count2(nm)': 0, 'count2(nn)': 0, 'count2(no)': 0, 'count2(np)': 0, 'count2(nq)': 0, 'count2(nr)': 0, 'count2(ns)': 0, 'count2(nt)': 1, 'count2(nu)': 0, 'count2(nv)': 0, 'count2(nw)': 0, 'count2(nx)': 0, 'count2(ny)': 0, 'count2(nz)': 0, 'count2(oa)': 0, 'count2(ob)': 0, 'count2(oc)': 0, 'count2(od)': 0, 'count2(oe)': 0, 'count2(of)': 0, 'count2(og)': 0, 'count2(oh)': 0, 'count2(oi)': 0, 'count2(oj)': 0, 'count2(ok)': 0, 'count2(ol)': 0, 'count2(om)': 0, 'count2(on)': 0, 'count2(oo)': 0, 'count2(op)': 0, 'count2(oq)': 0, 'count2(or)': 1, 'count2(os)': 0, 'count2(ot)': 0, 'count2(ou)': 0, 'count2(ov)': 0, 'count2(ow)': 0, 'count2(ox)': 0, 'count2(oy)': 0, 'count2(oz)': 0, 'count2(pa)': 0, 'count2(pb)': 0, 'count2(pc)': 0, 'count2(pd)': 0, 'count2(pe)': 0, 'count2(pf)': 0, 'count2(pg)': 0, 'count2(ph)': 0, 'count2(pi)': 0, 'count2(pj)': 0, 'count2(pk)': 0, 'count2(pl)': 0, 'count2(pm)': 0, 'count2(pn)': 0, 'count2(po)': 0, 'count2(pp)': 0, 'count2(pq)': 0, 'count2(pr)': 0, 'count2(ps)': 0, 'count2(pt)': 0, 'count2(pu)': 0, 'count2(pv)': 0, 'count2(pw)': 0, 'count2(px)': 0, 'count2(py)': 0, 'count2(pz)': 0, 'count2(qa)': 0, 'count2(qb)': 0, 'count2(qc)': 0, 'count2(qd)': 0, 'count2(qe)': 0, 'count2(qf)': 0, 'count2(qg)': 0, 'count2(qh)': 0, 'count2(qi)': 0, 'count2(qj)': 0, 'count2(qk)': 0, 'count2(ql)': 0, 'count2(qm)': 0, 'count2(qn)': 0, 'count2(qo)': 0, 'count2(qp)': 0, 'count2(qq)': 0, 'count2(qr)': 0, 'count2(qs)': 0, 'count2(qt)': 0, 'count2(qu)': 0, 'count2(qv)': 0, 'count2(qw)': 0, 'count2(qx)': 0, 'count2(qy)': 0, 'count2(qz)': 0, 'count2(ra)': 0, 'count2(rb)': 0, 
'count2(rc)': 0, 'count2(rd)': 0, 'count2(re)': 0, 'count2(rf)': 0, 'count2(rg)': 0, 'count2(rh)': 0, 'count2(ri)': 0, 'count2(rj)': 0, 'count2(rk)': 0, 'count2(rl)': 0, 'count2(rm)': 0, 'count2(rn)': 0, 'count2(ro)': 0, 'count2(rp)': 0, 'count2(rq)': 0, 'count2(rr)': 0, 'count2(rs)': 0, 'count2(rt)': 0, 'count2(ru)': 0, 'count2(rv)': 0, 'count2(rw)': 0, 'count2(rx)': 0, 'count2(ry)': 0, 'count2(rz)': 0, 'count2(sa)': 0, 'count2(sb)': 0, 'count2(sc)': 0, 'count2(sd)': 0, 'count2(se)': 0, 'count2(sf)': 0, 'count2(sg)': 0, 'count2(sh)': 0, 'count2(si)': 0, 'count2(sj)': 0, 'count2(sk)': 0, 'count2(sl)': 0, 'count2(sm)': 0, 'count2(sn)': 0, 'count2(so)': 0, 'count2(sp)': 0, 'count2(sq)': 0, 'count2(sr)': 0, 'count2(ss)': 0, 'count2(st)': 0, 'count2(su)': 0, 'count2(sv)': 0, 'count2(sw)': 0, 'count2(sx)': 0, 'count2(sy)': 0, 'count2(sz)': 0, 'count2(ta)': 0, 'count2(tb)': 0, 'count2(tc)': 0, 'count2(td)': 0, 'count2(te)': 0, 'count2(tf)': 0, 'count2(tg)': 0, 'count2(th)': 0, 'count2(ti)': 0, 'count2(tj)': 0, 'count2(tk)': 0, 'count2(tl)': 0, 'count2(tm)': 0, 'count2(tn)': 0, 'count2(to)': 1, 'count2(tp)': 0, 'count2(tq)': 0, 'count2(tr)': 0, 'count2(ts)': 0, 'count2(tt)': 0, 'count2(tu)': 0, 'count2(tv)': 0, 'count2(tw)': 0, 'count2(tx)': 0, 'count2(ty)': 0, 'count2(tz)': 0, 'count2(ua)': 0, 'count2(ub)': 0, 'count2(uc)': 0, 'count2(ud)': 0, 'count2(ue)': 0, 'count2(uf)': 0, 'count2(ug)': 0, 'count2(uh)': 0, 'count2(ui)': 0, 'count2(uj)': 0, 'count2(uk)': 0, 'count2(ul)': 0, 'count2(um)': 0, 'count2(un)': 0, 'count2(uo)': 0, 'count2(up)': 0, 'count2(uq)': 0, 'count2(ur)': 0, 'count2(us)': 0, 'count2(ut)': 0, 'count2(uu)': 0, 'count2(uv)': 0, 'count2(uw)': 0, 'count2(ux)': 0, 'count2(uy)': 0, 'count2(uz)': 0, 'count2(va)': 0, 'count2(vb)': 0, 'count2(vc)': 0, 'count2(vd)': 0, 'count2(ve)': 0, 'count2(vf)': 0, 'count2(vg)': 0, 'count2(vh)': 0, 'count2(vi)': 0, 'count2(vj)': 0, 'count2(vk)': 0, 'count2(vl)': 0, 'count2(vm)': 0, 'count2(vn)': 0, 'count2(vo)': 0, 
'count2(vp)': 0, 'count2(vq)': 0, 'count2(vr)': 0, 'count2(vs)': 0, 'count2(vt)': 0, 'count2(vu)': 0, 'count2(vv)': 0, 'count2(vw)': 0, 'count2(vx)': 0, 'count2(vy)': 0, 'count2(vz)': 0, 'count2(wa)': 0, 'count2(wb)': 0, 'count2(wc)': 0, 'count2(wd)': 0, 'count2(we)': 0, 'count2(wf)': 0, 'count2(wg)': 0, 'count2(wh)': 0, 'count2(wi)': 0, 'count2(wj)': 0, 'count2(wk)': 0, 'count2(wl)': 0, 'count2(wm)': 0, 'count2(wn)': 0, 'count2(wo)': 0, 'count2(wp)': 0, 'count2(wq)': 0, 'count2(wr)': 0, 'count2(ws)': 0, 'count2(wt)': 0, 'count2(wu)': 0, 'count2(wv)': 0, 'count2(ww)': 0, 'count2(wx)': 0, 'count2(wy)': 0, 'count2(wz)': 0, 'count2(xa)': 0, 'count2(xb)': 0, 'count2(xc)': 0, 'count2(xd)': 0, 'count2(xe)': 0, 'count2(xf)': 0, 'count2(xg)': 0, 'count2(xh)': 0, 'count2(xi)': 0, 'count2(xj)': 0, 'count2(xk)': 0, 'count2(xl)': 0, 'count2(xm)': 0, 'count2(xn)': 0, 'count2(xo)': 0, 'count2(xp)': 0, 'count2(xq)': 0, 'count2(xr)': 0, 'count2(xs)': 0, 'count2(xt)': 0, 'count2(xu)': 0, 'count2(xv)': 0, 'count2(xw)': 0, 'count2(xx)': 0, 'count2(xy)': 0, 'count2(xz)': 0, 'count2(ya)': 0, 'count2(yb)': 0, 'count2(yc)': 0, 'count2(yd)': 0, 'count2(ye)': 0, 'count2(yf)': 0, 'count2(yg)': 0, 'count2(yh)': 0, 'count2(yi)': 0, 'count2(yj)': 0, 'count2(yk)': 0, 'count2(yl)': 0, 'count2(ym)': 0, 'count2(yn)': 0, 'count2(yo)': 0, 'count2(yp)': 0, 'count2(yq)': 0, 'count2(yr)': 0, 'count2(ys)': 0, 'count2(yt)': 0, 'count2(yu)': 0, 'count2(yv)': 0, 'count2(yw)': 0, 'count2(yx)': 0, 'count2(yy)': 0, 'count2(yz)': 0, 'count2(za)': 0, 'count2(zb)': 0, 'count2(zc)': 0, 'count2(zd)': 0, 'count2(ze)': 0, 'count2(zf)': 0, 'count2(zg)': 0, 'count2(zh)': 0, 'count2(zi)': 0, 'count2(zj)': 0, 'count2(zk)': 0, 'count2(zl)': 0, 'count2(zm)': 0, 'count2(zn)': 0, 'count2(zo)': 0, 'count2(zp)': 0, 'count2(zq)': 0, 'count2(zr)': 0, 'count2(zs)': 0, 'count2(zt)': 0, 'count2(zu)': 0, 'count2(zv)': 0, 'count2(zw)': 0, 'count2(zx)': 0, 'count2(zy)': 0, 'count2(zz)': 0, 'endswith': 'r', 'has(a)': False, 'has(b)': 
False, 'has(c)': False, 'has(d)': True, 'has(e)': True, 'has(f)': False, 'has(g)': False, 'has(h)': False, 'has(i)': False, 'has(j)': False, 'has(k)': False, 'has(l)': False, 'has(m)': True, 'has(n)': True, 'has(o)': True, 'has(p)': False, 'has(q)': False, 'has(r)': True, 'has(s)': False, 'has(t)': True, 'has(u)': False, 'has(v)': False, 'has(w)': False, 'has(x)': False, 'has(y)': False, 'has(z)': False, 'has2(aa)': False, 'has2(ab)': False, 'has2(ac)': False, 'has2(ad)': False, 'has2(ae)': False, 'has2(af)': False, 'has2(ag)': False, 'has2(ah)': False, 'has2(ai)': False, 'has2(aj)': False, 'has2(ak)': False, 'has2(al)': False, 'has2(am)': False, 'has2(an)': False, 'has2(ao)': False, 'has2(ap)': False, 'has2(aq)': False, 'has2(ar)': False, 'has2(as)': False, 'has2(at)': False, 'has2(au)': False, 'has2(av)': False, 'has2(aw)': False, 'has2(ax)': False, 'has2(ay)': False, 'has2(az)': False, 'has2(ba)': False, 'has2(bb)': False, 'has2(bc)': False, 'has2(bd)': False, 'has2(be)': False, 'has2(bf)': False, 'has2(bg)': False, 'has2(bh)': False, 'has2(bi)': False, 'has2(bj)': False, 'has2(bk)': False, 'has2(bl)': False, 'has2(bm)': False, 'has2(bn)': False, 'has2(bo)': False, 'has2(bp)': False, 'has2(bq)': False, 'has2(br)': False, 'has2(bs)': False, 'has2(bt)': False, 'has2(bu)': False, 'has2(bv)': False, 'has2(bw)': False, 'has2(bx)': False, 'has2(by)': False, 'has2(bz)': False, 'has2(ca)': False, 'has2(cb)': False, 'has2(cc)': False, 'has2(cd)': False, 'has2(ce)': False, 'has2(cf)': False, 'has2(cg)': False, 'has2(ch)': False, 'has2(ci)': False, 'has2(cj)': False, 'has2(ck)': False, 'has2(cl)': False, 'has2(cm)': False, 'has2(cn)': False, 'has2(co)': False, 'has2(cp)': False, 'has2(cq)': False, 'has2(cr)': False, 'has2(cs)': False, 'has2(ct)': False, 'has2(cu)': False, 'has2(cv)': False, 'has2(cw)': False, 'has2(cx)': False, 'has2(cy)': False, 'has2(cz)': False, 'has2(da)': False, 'has2(db)': False, 'has2(dc)': False, 'has2(dd)': False, 'has2(de)': True, 'has2(df)': 
False, 'has2(dg)': False, 'has2(dh)': False, 'has2(di)': False, 'has2(dj)': False, 'has2(dk)': False, 'has2(dl)': False, 'has2(dm)': False, 'has2(dn)': False, 'has2(do)': False, 'has2(dp)': False, 'has2(dq)': False, 'has2(dr)': False, 'has2(ds)': False, 'has2(dt)': False, 'has2(du)': False, 'has2(dv)': False, 'has2(dw)': False, 'has2(dx)': False, 'has2(dy)': False, 'has2(dz)': False, 'has2(ea)': False, 'has2(eb)': False, 'has2(ec)': False, 'has2(ed)': False, 'has2(ee)': False, 'has2(ef)': False, 'has2(eg)': False, 'has2(eh)': False, 'has2(ei)': False, 'has2(ej)': False, 'has2(ek)': False, 'has2(el)': False, 'has2(em)': True, 'has2(en)': True, 'has2(eo)': False, 'has2(ep)': False, 'has2(eq)': False, 'has2(er)': False, 'has2(es)': False, 'has2(et)': False, 'has2(eu)': False, 'has2(ev)': False, 'has2(ew)': False, 'has2(ex)': False, 'has2(ey)': False, 'has2(ez)': False, 'has2(fa)': False, 'has2(fb)': False, 'has2(fc)': False, 'has2(fd)': False, 'has2(fe)': False, 'has2(ff)': False, 'has2(fg)': False, 'has2(fh)': False, 'has2(fi)': False, 'has2(fj)': False, 'has2(fk)': False, 'has2(fl)': False, 'has2(fm)': False, 'has2(fn)': False, 'has2(fo)': False, 'has2(fp)': False, 'has2(fq)': False, 'has2(fr)': False, 'has2(fs)': False, 'has2(ft)': False, 'has2(fu)': False, 'has2(fv)': False, 'has2(fw)': False, 'has2(fx)': False, 'has2(fy)': False, 'has2(fz)': False, 'has2(ga)': False, 'has2(gb)': False, 'has2(gc)': False, 'has2(gd)': False, 'has2(ge)': False, 'has2(gf)': False, 'has2(gg)': False, 'has2(gh)': False, 'has2(gi)': False, 'has2(gj)': False, 'has2(gk)': False, 'has2(gl)': False, 'has2(gm)': False, 'has2(gn)': False, 'has2(go)': False, 'has2(gp)': False, 'has2(gq)': False, 'has2(gr)': False, 'has2(gs)': False, 'has2(gt)': False, 'has2(gu)': False, 'has2(gv)': False, 'has2(gw)': False, 'has2(gx)': False, 'has2(gy)': False, 'has2(gz)': False, 'has2(ha)': False, 'has2(hb)': False, 'has2(hc)': False, 'has2(hd)': False, 'has2(he)': False, 'has2(hf)': False, 'has2(hg)': False, 
'has2(hh)': False, 'has2(hi)': False, 'has2(hj)': False, 'has2(hk)': False, 'has2(hl)': False, 'has2(hm)': False, 'has2(hn)': False, 'has2(ho)': False, 'has2(hp)': False, 'has2(hq)': False, 'has2(hr)': False, 'has2(hs)': False, 'has2(ht)': False, 'has2(hu)': False, 'has2(hv)': False, 'has2(hw)': False, 'has2(hx)': False, 'has2(hy)': False, 'has2(hz)': False, 'has2(ia)': False, 'has2(ib)': False, 'has2(ic)': False, 'has2(id)': False, 'has2(ie)': False, 'has2(if)': False, 'has2(ig)': False, 'has2(ih)': False, 'has2(ii)': False, 'has2(ij)': False, 'has2(ik)': False, 'has2(il)': False, 'has2(im)': False, 'has2(in)': False, 'has2(io)': False, 'has2(ip)': False, 'has2(iq)': False, 'has2(ir)': False, 'has2(is)': False, 'has2(it)': False, 'has2(iu)': False, 'has2(iv)': False, 'has2(iw)': False, 'has2(ix)': False, 'has2(iy)': False, 'has2(iz)': False, 'has2(ja)': False, 'has2(jb)': False, 'has2(jc)': False, 'has2(jd)': False, 'has2(je)': False, 'has2(jf)': False, 'has2(jg)': False, 'has2(jh)': False, 'has2(ji)': False, 'has2(jj)': False, 'has2(jk)': False, 'has2(jl)': False, 'has2(jm)': False, 'has2(jn)': False, 'has2(jo)': False, 'has2(jp)': False, 'has2(jq)': False, 'has2(jr)': False, 'has2(js)': False, 'has2(jt)': False, 'has2(ju)': False, 'has2(jv)': False, 'has2(jw)': False, 'has2(jx)': False, 'has2(jy)': False, 'has2(jz)': False, 'has2(ka)': False, 'has2(kb)': False, 'has2(kc)': False, 'has2(kd)': False, 'has2(ke)': False, 'has2(kf)': False, 'has2(kg)': False, 'has2(kh)': False, 'has2(ki)': False, 'has2(kj)': False, 'has2(kk)': False, 'has2(kl)': False, 'has2(km)': False, 'has2(kn)': False, 'has2(ko)': False, 'has2(kp)': False, 'has2(kq)': False, 'has2(kr)': False, 'has2(ks)': False, 'has2(kt)': False, 'has2(ku)': False, 'has2(kv)': False, 'has2(kw)': False, 'has2(kx)': False, 'has2(ky)': False, 'has2(kz)': False, 'has2(la)': False, 'has2(lb)': False, 'has2(lc)': False, 'has2(ld)': False, 'has2(le)': False, 'has2(lf)': False, 'has2(lg)': False, 'has2(lh)': False, 
'has2(li)': False, 'has2(lj)': False, 'has2(lk)': False, 'has2(ll)': False, 'has2(lm)': False, 'has2(ln)': False, 'has2(lo)': False, 'has2(lp)': False, 'has2(lq)': False, 'has2(lr)': False, 'has2(ls)': False, 'has2(lt)': False, 'has2(lu)': False, 'has2(lv)': False, 'has2(lw)': False, 'has2(lx)': False, 'has2(ly)': False, 'has2(lz)': False, 'has2(ma)': False, 'has2(mb)': False, 'has2(mc)': False, 'has2(md)': False, 'has2(me)': True, 'has2(mf)': False, 'has2(mg)': False, 'has2(mh)': False, 'has2(mi)': False, 'has2(mj)': False, 'has2(mk)': False, 'has2(ml)': False, 'has2(mm)': False, 'has2(mn)': False, 'has2(mo)': False, 'has2(mp)': False, 'has2(mq)': False, 'has2(mr)': False, 'has2(ms)': False, 'has2(mt)': False, 'has2(mu)': False, 'has2(mv)': False, 'has2(mw)': False, 'has2(mx)': False, 'has2(my)': False, 'has2(mz)': False, 'has2(na)': False, 'has2(nb)': False, 'has2(nc)': False, 'has2(nd)': False, 'has2(ne)': False, 'has2(nf)': False, 'has2(ng)': False, 'has2(nh)': False, 'has2(ni)': False, 'has2(nj)': False, 'has2(nk)': False, 'has2(nl)': False, 'has2(nm)': False, 'has2(nn)': False, 'has2(no)': False, 'has2(np)': False, 'has2(nq)': False, 'has2(nr)': False, 'has2(ns)': False, 'has2(nt)': True, 'has2(nu)': False, 'has2(nv)': False, 'has2(nw)': False, 'has2(nx)': False, 'has2(ny)': False, 'has2(nz)': False, 'has2(oa)': False, 'has2(ob)': False, 'has2(oc)': False, 'has2(od)': False, 'has2(oe)': False, 'has2(of)': False, 'has2(og)': False, 'has2(oh)': False, 'has2(oi)': False, 'has2(oj)': False, 'has2(ok)': False, 'has2(ol)': False, 'has2(om)': False, 'has2(on)': False, 'has2(oo)': False, 'has2(op)': False, 'has2(oq)': False, 'has2(or)': True, 'has2(os)': False, 'has2(ot)': False, 'has2(ou)': False, 'has2(ov)': False, 'has2(ow)': False, 'has2(ox)': False, 'has2(oy)': False, 'has2(oz)': False, 'has2(pa)': False, 'has2(pb)': False, 'has2(pc)': False, 'has2(pd)': False, 'has2(pe)': False, 'has2(pf)': False, 'has2(pg)': False, 'has2(ph)': False, 'has2(pi)': False, 
'has2(pj)': False, 'has2(pk)': False, 'has2(pl)': False, 'has2(pm)': False, 'has2(pn)': False, 'has2(po)': False, 'has2(pp)': False, 'has2(pq)': False, 'has2(pr)': False, 'has2(ps)': False, 'has2(pt)': False, 'has2(pu)': False, 'has2(pv)': False, 'has2(pw)': False, 'has2(px)': False, 'has2(py)': False, 'has2(pz)': False, 'has2(qa)': False, 'has2(qb)': False, 'has2(qc)': False, 'has2(qd)': False, 'has2(qe)': False, 'has2(qf)': False, 'has2(qg)': False, 'has2(qh)': False, 'has2(qi)': False, 'has2(qj)': False, 'has2(qk)': False, 'has2(ql)': False, 'has2(qm)': False, 'has2(qn)': False, 'has2(qo)': False, 'has2(qp)': False, 'has2(qq)': False, 'has2(qr)': False, 'has2(qs)': False, 'has2(qt)': False, 'has2(qu)': False, 'has2(qv)': False, 'has2(qw)': False, 'has2(qx)': False, 'has2(qy)': False, 'has2(qz)': False, 'has2(ra)': False, 'has2(rb)': False, 'has2(rc)': False, 'has2(rd)': False, 'has2(re)': False, 'has2(rf)': False, 'has2(rg)': False, 'has2(rh)': False, 'has2(ri)': False, 'has2(rj)': False, 'has2(rk)': False, 'has2(rl)': False, 'has2(rm)': False, 'has2(rn)': False, 'has2(ro)': False, 'has2(rp)': False, 'has2(rq)': False, 'has2(rr)': False, 'has2(rs)': False, 'has2(rt)': False, 'has2(ru)': False, 'has2(rv)': False, 'has2(rw)': False, 'has2(rx)': False, 'has2(ry)': False, 'has2(rz)': False, 'has2(sa)': False, 'has2(sb)': False, 'has2(sc)': False, 'has2(sd)': False, 'has2(se)': False, 'has2(sf)': False, 'has2(sg)': False, 'has2(sh)': False, 'has2(si)': False, 'has2(sj)': False, 'has2(sk)': False, 'has2(sl)': False, 'has2(sm)': False, 'has2(sn)': False, 'has2(so)': False, 'has2(sp)': False, 'has2(sq)': False, 'has2(sr)': False, 'has2(ss)': False, 'has2(st)': False, 'has2(su)': False, 'has2(sv)': False, 'has2(sw)': False, 'has2(sx)': False, 'has2(sy)': False, 'has2(sz)': False, 'has2(ta)': False, 'has2(tb)': False, 'has2(tc)': False, 'has2(td)': False, 'has2(te)': False, 'has2(tf)': False, 'has2(tg)': False, 'has2(th)': False, 'has2(ti)': False, 'has2(tj)': False, 
'has2(tk)': False, 'has2(tl)': False, 'has2(tm)': False, 'has2(tn)': False, 'has2(to)': True, 'has2(tp)': False, 'has2(tq)': False, 'has2(tr)': False, 'has2(ts)': False, 'has2(tt)': False, 'has2(tu)': False, 'has2(tv)': False, 'has2(tw)': False, 'has2(tx)': False, 'has2(ty)': False, 'has2(tz)': False, 'has2(ua)': False, 'has2(ub)': False, 'has2(uc)': False, 'has2(ud)': False, 'has2(ue)': False, 'has2(uf)': False, 'has2(ug)': False, 'has2(uh)': False, 'has2(ui)': False, 'has2(uj)': False, 'has2(uk)': False, 'has2(ul)': False, 'has2(um)': False, 'has2(un)': False, 'has2(uo)': False, 'has2(up)': False, 'has2(uq)': False, 'has2(ur)': False, 'has2(us)': False, 'has2(ut)': False, 'has2(uu)': False, 'has2(uv)': False, 'has2(uw)': False, 'has2(ux)': False, 'has2(uy)': False, 'has2(uz)': False, 'has2(va)': False, 'has2(vb)': False, 'has2(vc)': False, 'has2(vd)': False, 'has2(ve)': False, 'has2(vf)': False, 'has2(vg)': False, 'has2(vh)': False, 'has2(vi)': False, 'has2(vj)': False, 'has2(vk)': False, 'has2(vl)': False, 'has2(vm)': False, 'has2(vn)': False, 'has2(vo)': False, 'has2(vp)': False, 'has2(vq)': False, 'has2(vr)': False, 'has2(vs)': False, 'has2(vt)': False, 'has2(vu)': False, 'has2(vv)': False, 'has2(vw)': False, 'has2(vx)': False, 'has2(vy)': False, 'has2(vz)': False, 'has2(wa)': False, 'has2(wb)': False, 'has2(wc)': False, 'has2(wd)': False, 'has2(we)': False, 'has2(wf)': False, 'has2(wg)': False, 'has2(wh)': False, 'has2(wi)': False, 'has2(wj)': False, 'has2(wk)': False, 'has2(wl)': False, 'has2(wm)': False, 'has2(wn)': False, 'has2(wo)': False, 'has2(wp)': False, 'has2(wq)': False, 'has2(wr)': False, 'has2(ws)': False, 'has2(wt)': False, 'has2(wu)': False, 'has2(wv)': False, 'has2(ww)': False, 'has2(wx)': False, 'has2(wy)': False, 'has2(wz)': False, 'has2(xa)': False, 'has2(xb)': False, 'has2(xc)': False, 'has2(xd)': False, 'has2(xe)': False, 'has2(xf)': False, 'has2(xg)': False, 'has2(xh)': False, 'has2(xi)': False, 'has2(xj)': False, 'has2(xk)': False, 
'has2(xl)': False, 'has2(xm)': False, 'has2(xn)': False, 'has2(xo)': False, 'has2(xp)': False, 'has2(xq)': False, 'has2(xr)': False, 'has2(xs)': False, 'has2(xt)': False, 'has2(xu)': False, 'has2(xv)': False, 'has2(xw)': False, 'has2(xx)': False, 'has2(xy)': False, 'has2(xz)': False, 'has2(ya)': False, 'has2(yb)': False, 'has2(yc)': False, 'has2(yd)': False, 'has2(ye)': False, 'has2(yf)': False, 'has2(yg)': False, 'has2(yh)': False, 'has2(yi)': False, 'has2(yj)': False, 'has2(yk)': False, 'has2(yl)': False, 'has2(ym)': False, 'has2(yn)': False, 'has2(yo)': False, 'has2(yp)': False, 'has2(yq)': False, 'has2(yr)': False, 'has2(ys)': False, 'has2(yt)': False, 'has2(yu)': False, 'has2(yv)': False, 'has2(yw)': False, 'has2(yx)': False, 'has2(yy)': False, 'has2(yz)': False, 'has2(za)': False, 'has2(zb)': False, 'has2(zc)': False, 'has2(zd)': False, 'has2(ze)': False, 'has2(zf)': False, 'has2(zg)': False, 'has2(zh)': False, 'has2(zi)': False, 'has2(zj)': False, 'has2(zk)': False, 'has2(zl)': False, 'has2(zm)': False, 'has2(zn)': False, 'has2(zo)': False, 'has2(zp)': False, 'has2(zq)': False, 'has2(zr)': False, 'has2(zs)': False, 'has2(zt)': False, 'has2(zu)': False, 'has2(zv)': False, 'has2(zw)': False, 'has2(zx)': False, 'has2(zy)': False, 'has2(zz)': False, 'startswith': 'd'}
# Re-run the names demo, this time passing bigram_features as the feature
# extractor; the returned classifier is kept separately as `classifier2`.
print("Train new classifier using bigram features")
classifier2 = names_demo(NaiveBayesClassifier.train, bigram_features)
Train new classifier using bigram features Training classifier... Testing classifier... Accuracy: 0.8020 Avg. log likelihood: -1.0164 Unseen Names P(Male) P(Female) ---------------------------------------- Kelli 0.0013 *0.9987 Er *0.9782 0.0218 Ally 0.0076 *0.9924 Stephan *0.9741 0.0259 Chriss 0.1445 *0.8555
# Score a single name with the classifier trained on bigram features.
# try the following:
# luke, lee, leigh, karol, chris, kris, pat
name = 'nate'
print("Run trained classifier on input name:", name)
test_features = bigram_features(name)
output = classifier2.prob_classify(test_features)
# Pull both label probabilities out of the returned distribution first,
# then report them.
male_prob = output.prob('male')
female_prob = output.prob('female')
print("P(male|{0})={1}".format(name, male_prob))
print("P(female|{0})={1}".format(name, female_prob))
Run trained classifier on input name: nate P(male|nate)=0.0012201724411289498 P(female|nate)=0.9987798275588747
# Print the most informative features of the bigram-feature classifier
# (see the table in the output below).
classifier2.show_most_informative_features()
Most Informative Features endswith = 'a' female : male = 31.5 : 1.0 count2(hu) = 1 male : female = 26.7 : 1.0 has2(hu) = True male : female = 26.7 : 1.0 has2(rv) = True male : female = 23.3 : 1.0 count2(rv) = 1 male : female = 23.3 : 1.0 count2(lt) = 1 male : female = 19.9 : 1.0 has2(lt) = True male : female = 19.9 : 1.0 has2(rk) = True male : female = 15.3 : 1.0 has2(fo) = True male : female = 15.3 : 1.0 count2(rk) = 1 male : female = 15.3 : 1.0