import random
import nltk
from nltk.corpus import movie_reviews
from numpy import log, mean            # the session relies on the numpy/pylab stack
from matplotlib.pyplot import hist

negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# before creating test and training sets, first survey the full corpus
negwords = []
for file in negids:
    negwords += [w.lower() for w in movie_reviews.words(file) if w[0].isalpha()]
poswords = []
for file in posids:
    poswords += [w.lower() for w in movie_reviews.words(file) if w[0].isalpha()]
len(negwords), len(poswords)

fdistneg = nltk.FreqDist(negwords)
fdistpos = nltk.FreqDist(poswords)
fdistneg.items()[:10], '...', fdistneg.items()[-10:]
fdistpos.items()[:10], '...', fdistpos.items()[-10:]

n, bins, patches = hist([v for v in fdistpos.values() if v < 40], 40)
# over 11,000 words appear only once in the positive corpus
n, bins, patches = hist([v for v in fdistneg.values() if v < 40], 40)
# over 10,000 words appear only once in the negative corpus
n, bins, patches = hist([v for v in fdistpos.values() if v >= 40 and v < 1500], 100)
# mainly stopwords about 1500

# this is the approach from http://nltk.org/book/ch06.html (examples 6.4, 6.5)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]   # 2000 most frequent words

def document_features(document):
    document_words = set(document)   # doesn't use number of occurrences
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# have a look at the features for just one document
x = document_features(movie_reviews.words('pos/cv957_8737.txt'))
[(k, v) for k, v in x.items()][:10]

# this pairs the words in each doc with the pos/neg category
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# this pairs the above features for each doc with the pos/neg category
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]   # test on the first 100
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(5)

# Now try it using numbers of appearances, not just binary true/false
random.shuffle(negids)   # shuffle both,
random.shuffle(posids)   # to pull at random from the two sets

negwords = []   # use the last 950 as the training set
for filename in negids[50:]:
    negwords += [w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()]
poswords = []   # use the last 950 as the training set
for filename in posids[50:]:
    poswords += [w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()]

colfreqneg = nltk.FreqDist(negwords)   # training sets
colfreqpos = nltk.FreqDist(poswords)
colfreq = nltk.FreqDist(negwords + poswords)   # full distribution
len(colfreq)   # size of the full vocab
colfreq.items()[:10], '...', colfreq.items()[40:50], '...', colfreq.items()[-10:]   # survey what's there

vocab = [w for w, v in colfreq.items() if v >= 10 and v < 2950]   # use frequency cutoffs
len(vocab)   # reduces the vocabulary
len([w for w, v in colfreq.items() if v == 10])   # items that occur ten times

Nneg = sum([v for w, v in colfreqneg.items() if w in vocab])
Npos = sum([v for w, v in colfreqpos.items() if w in vocab])
Nneg, Npos   # total numbers of words in the neg and pos training sets

# and now train the weights
pweight = {}   # log( p(w|P)/p(w|N) )
lc = log(float(Nneg)/Npos)
for w in vocab:
    # need some "smoothing" to avoid any zeroes
    if colfreqpos[w] == 0:
        r = 1./colfreqneg[w]
    elif colfreqneg[w] == 0:
        r = float(colfreqpos[w])
    else:
        r = float(colfreqpos[w])/colfreqneg[w]
    pweight[w] = log(r) + lc
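# A quick sanity check on the weight formula (a minimal sketch): for a word with
# nonzero counts in both classes,
#   pweight[w] = log(cpos/cneg) + log(Nneg/Npos) = log( (cpos/Npos) / (cneg/Nneg) ),
# i.e. the log-ratio of the per-class relative frequencies p(w|P)/p(w|N).
for w0 in vocab:
    if colfreqpos[w0] > 0 and colfreqneg[w0] > 0:
        direct = log((float(colfreqpos[w0])/Npos) / (float(colfreqneg[w0])/Nneg))
        print w0, pweight[w0], direct   # the two numbers should agree
        break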
sw = sorted(pweight.keys(), key=pweight.get)   # sort to have a look
[(w, pweight[w]) for w in sw[:10]]
[(w, pweight[w]) for w in sw[-10:]]

wrong = []
for filename in negids[:50]:
    score = 0   # calculate the score by summing the weights
    for w in movie_reviews.words(filename):
        if w.lower() in pweight:
            score += pweight[w.lower()]
    if score > 0:
        wrong.append((filename, score))
for filename in posids[:50]:
    score = 0   # calculate the score by summing the weights
    for w in movie_reviews.words(filename):
        if w.lower() in pweight:
            score += pweight[w.lower()]
    if score < 0:
        wrong.append((filename, score))
len(wrong)   # 23/100 wrong, so, strangely enough, the same 77% as before; which ones?
wrong

# Now try it again, but instead of a collection-frequency cutoff,
# use document frequency, i.e., the word must appear in a minimum number of
# documents rather than a minimum number of times overall.
# using set() will count each word only once per document:
negdocwords = []   # use the last 950 as the training set
for filename in negids[50:]:
    negdocwords += set([w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()])
posdocwords = []   # use the last 950 as the training set
for filename in posids[50:]:
    posdocwords += set([w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()])

len(negwords), len(poswords), len(negdocwords), len(posdocwords)
# removing multiplicity reduces the overall numbers

docfreq = nltk.FreqDist(negdocwords + posdocwords)   # full distribution
len(docfreq)   # size of the full vocab
docfreq.items()[:10]

# use document-frequency cutoffs: must appear in at least 10 docs,
# but not in more than 3/4 of them
vocab = [w for w, v in docfreq.items() if v >= 10 and v < 1900*.75]
len(vocab)

# with the vocab now chosen, determine the overall number of terms in neg and pos
Nneg = sum([v for w, v in colfreqneg.items() if w in vocab])
Npos = sum([v for w, v in colfreqpos.items() if w in vocab])
print Nneg, Npos   # total numbers of words in the neg and pos training sets

# and now train the weights
# (same as above: collection frequencies are used for the weights,
#  document frequency was only used to select the vocab)
pweight = {}   # log( p(w|P)/p(w|N) )
lc = log(float(Nneg)/Npos)
for w in vocab:
    # need some "smoothing" to avoid zeros
    if colfreqpos[w] == 0:
        r = 1./colfreqneg[w]
    elif colfreqneg[w] == 0:
        r = float(colfreqpos[w])
    else:
        r = float(colfreqpos[w])/colfreqneg[w]
    pweight[w] = log(r) + lc

sw = sorted(pweight.keys(), key=pweight.get)   # sort to have a look
[(w, pweight[w]) for w in sw[:10]]   # note that nbsp is gone
[(w, pweight[w]) for w in sw[-10:]]

# look at some document frequencies
docfreq['mulan'], docfreq['winslet'], docfreq['damon'], docfreq['seagal']

wrong = []
for filename in negids[:50]:
    score = 0   # calculate the score by summing the weights
    for w in movie_reviews.words(filename):
        if w.lower() in pweight:
            score += pweight[w.lower()]
    if score > 0:
        wrong.append((filename, score))
for filename in posids[:50]:
    score = 0   # calculate the score by summing the weights
    for w in movie_reviews.words(filename):
        if w.lower() in pweight:
            score += pweight[w.lower()]
    if score < 0:
        wrong.append((filename, score))
len(wrong)   # now up to 86% correct
# the improved accuracy shows the importance of feature selection,
# though this should really be checked on various randomly selected test sets
wrong
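# The "sum the weights" scoring above is repeated verbatim for each evaluation,
# and again inside the 20-fold loop below. As a minimal sketch, the same
# computation can be wrapped in a helper (score_review is an illustrative name):
def score_review(filename, weights):
    # sum the log-ratio weights of the review's in-vocabulary words;
    # a positive total predicts pos, a negative total predicts neg
    score = 0.
    for w in movie_reviews.words(filename):
        if w.lower() in weights:
            score += weights[w.lower()]
    return score

score_review(posids[0], pweight)   # a held-out positive review, so usually > 0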
# so try 20-fold cross-validation:
# break the data into 20 blocks, omit the i'th block and use it as the test set
negwords = [[] for i in range(20)]
poswords = [[] for i in range(20)]
negdocwords = [[] for i in range(20)]
posdocwords = [[] for i in range(20)]
for i in range(20):
    for k in range(20):
        if k == i:
            continue   # skip the i'th block of 50 files
        for filename in negids[50*k:50*(k+1)]:
            negwords[i] += [w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()]
            negdocwords[i] += set([w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()])
        for filename in posids[50*k:50*(k+1)]:
            poswords[i] += [w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()]
            posdocwords[i] += set([w.lower() for w in movie_reviews.words(filename) if w[0].isalpha()])

# now the same as before, just make lists so it can be done 20 times
docfreq = []
colfreqneg = []
colfreqpos = []
vocab = []
for i in range(20):
    docfreq.append(nltk.FreqDist(negdocwords[i] + posdocwords[i]))   # full distributions
    colfreqneg.append(nltk.FreqDist(negwords[i]))   # training sets
    colfreqpos.append(nltk.FreqDist(poswords[i]))
    # use frequency cutoffs: must appear in at least 10 docs, but not in more than 3/4 of them
    vocab.append([w for w, v in docfreq[i].items() if v >= 10 and v < 1900*.75])

# calculate 20 sets of weights
pweight = [{} for i in range(20)]   # log( p(w|P)/p(w|N) )
lc = log(float(Nneg)/Npos)   # note: reuses the Nneg, Npos totals computed above, not per-fold totals
for i in range(20):
    for w in vocab[i]:
        # need some "smoothing" to avoid zeros
        if colfreqpos[i][w] == 0:
            r = 1./colfreqneg[i][w]
        elif colfreqneg[i][w] == 0:
            r = float(colfreqpos[i][w])
        else:
            r = float(colfreqpos[i][w])/colfreqneg[i][w]
        pweight[i][w] = log(r) + lc

# collect the number of wrongs for each of the 20 cross-validations
wrong = [[] for i in range(20)]
for i in range(20):
    for filename in negids[50*i:50*(i+1)]:
        score = 0   # calculate the score by summing the weights
        for w in movie_reviews.words(filename):
            if w.lower() in pweight[i]:
                score += pweight[i][w.lower()]
        if score > 0:
            wrong[i].append((filename, score))
    for filename in posids[50*i:50*(i+1)]:
        score = 0   # calculate the score by summing the weights
        for w in movie_reviews.words(filename):
            if w.lower() in pweight[i]:
                score += pweight[i][w.lower()]
        if score < 0:
            wrong[i].append((filename, score))

print map(len, wrong)
print mean(map(len, wrong))
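# Each fold's held-out block is 100 reviews (50 neg + 50 pos), so the per-fold
# accuracy in percent is simply 100 minus the number wrong; a minimal summary sketch:
accuracies = [100 - len(w) for w in wrong]
print accuracies
print mean(accuracies), min(accuracies), max(accuracies)   # mean accuracy and its range over the 20 folds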