Notebook

Naïve Bayes Classification¶

Word vectors from text¶

Transformamos cada oración en un vector de logitud fija. Esta longitud corresponderá al número de palabras de nuestro vocabulario. En nuestro ejemplo clasificaremos frases como ofensivas o no ofensivas.

In [1]:

def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', \
        'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', \
        'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', \
        'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how',\
        'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

    classVec = [0,1,0,1,0,1] #1 is abusive, 0 not
    return postingList,classVec


def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)  # We create a list of 0's of lenght 'vocabList'
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print "the word: %s is not in my Vocabulary!" % word
    return returnVec

  File "<ipython-input-1-274944a65804>", line 29
    else: print "the word: %s is not in my Vocabulary!" % word
                                                      ^
SyntaxError: Missing parentheses in call to 'print'. Did you mean print("the word: %s is not in my Vocabulary!" % word)?

Imprimimos un ejemplo de frase.

In [7]:

p, c = loadDataSet()
for i, e in zip(p, c):
    print i, e

listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)

print "\nSentence codification:"
print listOPosts[0]
print setOfWords2Vec(myVocabList, listOPosts[0])

['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'] 0
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'] 1
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'] 0
['stop', 'posting', 'stupid', 'worthless', 'garbage'] 1
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'] 0
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'] 1

Sentence codification:
['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']
[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]

In [8]:

import numpy as np

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    
    p0Num = p0Num = np.ones(numWords) # p0Num = 0
    p1Num = p1Num = np.ones(numWords)  # p1Num = 0
    p0Denom = len(p0Num) # p0Denom = 0
    p1Denom = len(p0Num) # p1Denom = 0

    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i] # Number of instances of each word
            p1Denom += sum(trainMatrix[i]) # Number of words in class 1
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i]) # Number of words in class 0

    p1Vect = np.log(p1Num/p1Denom)  # p1Vect = p1Num/p1Denom
    p0Vect = np.log(p0Num/p0Denom)  # p0Vect = p0Num/p0Denom
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    
    #p1 = np.prod(vec2Classify * p1Vec) * pClass1
    #p0 = np.prod(vec2Classify * p0Vec) * (1.0 - pClass1)
    
    print p1, p0
    
    if p1 > p0:
        return 1
    else:
        return 0

In [18]:

trainMat=[]
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    
p0V, p1V, pAb = trainNB0(trainMat,listClasses)

s = ['my','stupid', 'dog', 'has',  'help', 'stupid']
sw = setOfWords2Vec(myVocabList, s)
print classifyNB(sw,p0V,p1V,pAb)

s = ['love', 'my', 'dalmation']
sw = setOfWords2Vec(myVocabList, s)
print classifyNB(sw,p0V,p1V,pAb)

-13.4308527194 -13.5178939679
1
-9.82671449373 -7.69484807238
0

In [ ]: