In this sample, the texts are three ebooks on different topics from Project Gutenberg:
# define a function to list the .txt files in a folder
from os import listdir

def list_textfiles(directory):
    """Return a list of filenames ending in '.txt' in DIRECTORY."""
    textfiles = []
    for filename in listdir(directory):
        if filename.endswith(".txt"):
            textfiles.append(directory + "/" + filename)
    return textfiles
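As an aside, the same listing can be written with the standard library's glob module; the sketch below (with a hypothetical name, list_textfiles_glob) is an equivalent alternative, not used in the rest of this post:
from glob import glob

def list_textfiles_glob(directory):
    # equivalent alternative using a wildcard pattern; sorted for a stable order
    return sorted(glob(directory + "/*.txt"))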
# define a function to read the text in a .txt file
def read_txt(filename):
    # a with-block guarantees the file is closed, even if reading fails
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()
# read the three Gutenberg text files
filenames = list_textfiles('Plaintexts')
raw_texts = []
for n in filenames:
    raw_texts.append(read_txt(n))
print(len(raw_texts))
3
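As a quick sanity check (the exact numbers depend on your copies of the files), we can print each filename together with the length of its text:
for name, text in zip(filenames, raw_texts):
    print(name, len(text))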
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# build these once, outside the loop, rather than once per text
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

clean_texts = []
for text in raw_texts:
    # tokenize
    tok = " ".join(word_tokenize(text))
    # remove punctuation characters
    re_punc = "".join(i for i in tok if i not in punctuation)
    # lowercase and remove stopwords
    re_sw = " ".join(i for i in re_punc.lower().split() if i not in stop_words)
    # lemmatize each remaining token
    le = " ".join(lemmatizer.lemmatize(i) for i in re_sw.split())
    clean_texts.append(le)
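To see what the cleaning steps did, compare a short slice of the first text before and after (the output will vary with your copies of the books):
print(raw_texts[0][:200])
print(clean_texts[0][:200])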
# vectorize the cleaned texts into a term-frequency matrix
from sklearn.feature_extraction.text import CountVectorizer

n_features = 1000
tf_vectorizer = CountVectorizer(min_df=2,
                                strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(clean_texts)
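It can help to inspect what the vectorizer produced: tf is a documents-by-terms sparse matrix, and the learned vocabulary is available from the vectorizer (get_feature_names_out requires scikit-learn >= 1.0; older versions use get_feature_names):
print(tf.shape)  # (n_documents, n_features)
print(tf_vectorizer.get_feature_names_out()[:10])  # first 10 vocabulary terms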
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 3
lda = LatentDirichletAllocation(n_components=n_topics,
                                learning_method='online',
                                max_iter=50,
                                random_state=0)
lda.fit(tf)
# topic_distribution holds the distribution of topics in each text
topic_distribution = lda.transform(tf)
print(topic_distribution)  # each row is a text; each row sums to 1
[[2.78220006e-04 9.99694288e-01 2.74919906e-05]
 [9.99641689e-01 3.31086127e-04 2.72251925e-05]
 [7.62868577e-05 7.44416846e-05 9.99849271e-01]]
Each row of the matrix above gives the topic distribution of one text: every text is assigned almost entirely to a single topic.
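The topics themselves can be characterized by their highest-weight words. lda.components_ holds the (unnormalized) topic-word weights, so a sketch like the following prints the top ten words per topic:
import numpy as np
feature_names = tf_vectorizer.get_feature_names_out()
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    # indices of the n_top_words largest weights for this topic
    top = np.argsort(topic)[::-1][:n_top_words]
    print("topic%d: %s" % (topic_idx, " ".join(feature_names[i] for i in top)))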
Next we will try to visualize the topic distribution in a heatmap.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

topic_matrix = topic_distribution
yLabel = ['Adrift in New York', 'Beethoven', 'Sandwiches']
xLabel = ['topic0', 'topic1', 'topic2']

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.set_xticks(np.arange(len(xLabel)))
ax.set_yticks(np.arange(len(yLabel)))
ax.set_xticklabels(xLabel)
ax.set_yticklabels(yLabel)
heatplot = ax.imshow(topic_matrix, cmap='Purples')
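A colorbar makes the probability scale explicit; fig.colorbar accepts the image object returned by imshow:
fig.colorbar(heatplot, ax=ax)
plt.show()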
[Heatmap: each text's row is dominated by a single topic.]