In this sample, the texts are three ebooks on different topics from Project Gutenberg:
# define a function to list the .txt files in a folder
from os import listdir

def list_textfiles(directory):
    """Return a list of filenames ending in '.txt' in DIRECTORY."""
    textfiles = []
    for filename in listdir(directory):
        if filename.endswith(".txt"):
            textfiles.append(directory + "/" + filename)
    return textfiles
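As an aside, the same listing can be written with the standard library's glob module; the sketch below (with a hypothetical name, list_textfiles_glob) is an equivalent alternative, not used in the rest of this post:
from glob import glob

def list_textfiles_glob(directory):
    # equivalent alternative using a wildcard pattern; sorted for a stable order
    return sorted(glob(directory + "/*.txt"))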
# define a function to read the text in a .txt file
def read_txt(filename):
    # a with-block guarantees the file is closed, even if reading fails
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()
# read the three Gutenberg text files
filenames = list_textfiles('Plaintexts')
raw_texts = []
for n in filenames:
    raw_texts.append(read_txt(n))
print(len(raw_texts))
3
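As a quick sanity check (the exact numbers depend on your copies of the files), we can print each filename together with the length of its text:
for name, text in zip(filenames, raw_texts):
    print(name, len(text))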
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# build these once, outside the loop, rather than once per text
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

clean_texts = []
for text in raw_texts:
    # tokenize
    tok = " ".join(word_tokenize(text))
    # remove punctuation characters
    re_punc = "".join(i for i in tok if i not in punctuation)
    # lowercase and remove stopwords
    re_sw = " ".join(i for i in re_punc.lower().split() if i not in stop_words)
    # lemmatize each remaining token
    le = " ".join(lemmatizer.lemmatize(i) for i in re_sw.split())
    clean_texts.append(le)
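To see what the cleaning steps did, compare a short slice of the first text before and after (the output will vary with your copies of the books):
print(raw_texts[0][:200])
print(clean_texts[0][:200])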
# vectorize the cleaned texts into a term-frequency matrix
from sklearn.feature_extraction.text import CountVectorizer

n_features = 1000
tf_vectorizer = CountVectorizer(min_df=2,
                                strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(clean_texts)
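It can help to inspect what the vectorizer produced: tf is a documents-by-terms sparse matrix, and the learned vocabulary is available from the vectorizer (get_feature_names_out requires scikit-learn >= 1.0; older versions use get_feature_names):
print(tf.shape)  # (n_documents, n_features)
print(tf_vectorizer.get_feature_names_out()[:10])  # first 10 vocabulary terms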
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 3
lda = LatentDirichletAllocation(n_components=n_topics,
                                learning_method='online',
                                max_iter=50,
                                random_state=0)
lda.fit(tf)
# topic_distribution holds the distribution of topics in each text
topic_distribution = lda.transform(tf)
print(topic_distribution)  # each row is a text; each row sums to 1
[[2.78220006e-04 9.99694288e-01 2.74919906e-05]
 [9.99641689e-01 3.31086127e-04 2.72251925e-05]
 [7.62868577e-05 7.44416846e-05 9.99849271e-01]]
Each row of the matrix above gives the topic distribution of one text: every text is assigned almost entirely to a single topic.
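The topics themselves can be characterized by their highest-weight words. lda.components_ holds the (unnormalized) topic-word weights, so a sketch like the following prints the top ten words per topic:
import numpy as np
feature_names = tf_vectorizer.get_feature_names_out()
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    # indices of the n_top_words largest weights for this topic
    top = np.argsort(topic)[::-1][:n_top_words]
    print("topic%d: %s" % (topic_idx, " ".join(feature_names[i] for i in top)))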
Next we will try to visualize the topic distribution in a heatmap.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

topic_matrix = topic_distribution
yLabel = ['Adrift in New York', 'Beethoven', 'Sandwiches']
xLabel = ['topic0', 'topic1', 'topic2']

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.set_xticks(np.arange(len(xLabel)))
ax.set_yticks(np.arange(len(yLabel)))
ax.set_xticklabels(xLabel)
ax.set_yticklabels(yLabel)
heatplot = ax.imshow(topic_matrix, cmap='Purples')
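A colorbar makes the probability scale explicit; fig.colorbar accepts the image object returned by imshow:
fig.colorbar(heatplot, ax=ax)
plt.show()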
[Heatmap: each text's row is dominated by a single topic.]