Getting text (harry potter series 1-3)
# define a function to get .txt files in a folder
from os import listdir
def list_textfiles(directory):
"Return a list of filenames ending in '.txt' in DIRECTORY."
textfiles = []
for filename in listdir(directory):
if filename.endswith(".txt"):
textfiles.append(directory + "/" + filename)
return textfiles
# define a function to read the text in a .txt file
def read_txt(filename):
try:
f = open(filename,'r')
text = f.read()
finally:
if f:
f.close()
return text
#import harry potter textfiles
filenames = list_textfiles('HP')
raw_texts = []
for f in filenames:
raw_texts.append(read_txt(f))
print len(raw_texts)
7
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(input='content')
# generate document term matrix
dtm = vectorizer.fit_transform(raw_texts)
# get the words as feature names
vocab = np.array(vectorizer.get_feature_names())
# fit_transform returns a sparse matrix (which uses less memory)
# but we want to work with a normal numpy array.
dtm = dtm.toarray()
# normalize counts to rates per 1000 words
rates = 1000 * dtm / np.sum(dtm, axis=1, keepdims=True)
print rates[:, 1000:1010]
print vocab[1000:1010]
[[0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0]] [u'anthem' u'anthems' u'anthill' u'anthology' u'anthony' u'anti' u'anticheating' u'anticipated' u'anticipating' u'anticipation']
Here, in my texts, we will identify the unique words used in each harry potter book.
sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(input='content', min_df = 1, stop_words = 'english')
# Let’s execute the model against our corpus:
tfidf_matrix = vectorizer.fit_transform(raw_texts)
feature_names = vectorizer.get_feature_names()
tfidfArray = tfidf_matrix.toarray()
print len(feature_names)
#print tfidfArray
22987
So we are got more than 10 thousands words. View the top 5 terms of each news article based on the tfidf score
feature_names = vectorizer.get_feature_names()
li = [] #li is a list of tfidf (top 10 words) in the book.
tfidf = []
for l in tfidfArray:
# sort tf-idf /How to explain this : ???
#print [[feature_names[x],l[x]] for x in (l*-1).argsort()][:5]
tfidf.append([(feature_names[x],l[x]) for x in (l*-1).argsort()][:10])
# plot the result
import numpy as np
from matplotlib import pyplot as plt
def plotTfidf (li): # parameter li is a list of tfidf (top 5 words) in the book.
#e.g. [(u'pens', 0.43280168447929529), (u'the', 0.29211105475772814), (u'bit', 0.21640084223964764), (u'of', 0.2133540325973512), (u'am', 0.18396048627999845)]
# set figure size
fig = plt.figure(figsize=[10, 5])
# plot tfidf of a given doc
labels,Y = zip(*li) # See notes below
X = np.arange(len(labels))
plt.bar(X,Y,width = 0.4,facecolor = 'lightskyblue',edgecolor = 'white')
plt.xticks(X, labels)# stick labels on x axis
for x,y in zip(X,Y):
plt.text(x,y+0.005,round(y,2), ha='center', va= 'bottom',fontsize=7) #round(y,2): two decimals
plt.show()
i = 0
for doc in tfidf:
print "tfidf for article " + str(i+1) +" is:"
plotTfidf(doc)
i +=1
tfidf for article 1 is:
tfidf for article 2 is:
tfidf for article 3 is:
tfidf for article 4 is:
tfidf for article 5 is:
tfidf for article 6 is:
tfidf for article 7 is:
From the result, we can see that there are common words like 'harry' 'said' 'ron'... This is because that the documents number is too small. So we need to set the max-df parameter, and filter out those words which have a really high df. Please see example here: http://localhost:8888/notebooks/Desktop/temp/UofA%20/Learn%20python/machine%20learning/Feature%20selection_finding%20distinctive%20words(sklearn-CountVectorizer%2B%20TfidfTransformer)%20.ipynb