# This workflow uses gensim to calculate coherence measures for the topics already identified by an LDA model.
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
import re
# Load the tidied topic-term counts exported from MALLET.
#TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv')
TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv')

num_topics = 50
num_top_terms = 10

# topic_term_list[k] holds topic k's highest-count terms with all
# non-word characters stripped (to match the cleaned document tokens).
topic_term_list = []
for topic_id in range(num_topics):
    ranked_terms = (TopicTermFreq[TopicTermFreq['topic'] == topic_id]
                    .sort_values('count', ascending=False)['term']
                    .tolist()[:num_top_terms])
    topic_term_list.append([re.sub(r'\W+', '', term) for term in ranked_terms])

# Vocabulary of all top terms across topics. Stored as a set because the
# tokenizing loops below run a membership test per word over ~10M lines;
# with a list each test would be O(len(top_terms)) instead of O(1).
top_terms = set(term for terms in topic_term_list for term in terms)
# Reduce each line of the MALLET input file to the unique lower-cased
# tokens that appear in the top-term vocabulary.
# NOTE: `texts` is rebuilt from the hashtag-model shards further below.
texts = []
counter = 0
with open('/Users/dankoban/Documents/CT_LDA/CT_data/mallet_input_data_crowdtangle.txt','r') as infile:
    for raw_line in infile:
        cleaned = [re.sub(r'\W+', '', tok) for tok in raw_line.split(' ')]
        matched = {tok.lower() for tok in cleaned if tok.lower() in top_terms}
        counter += 1
        if counter % 500000 == 0:
            print(counter)  # progress indicator for the long read
        texts.append(list(matched))
import os

# Collect the hashtag-model input shards (one .txt file per shard).
input_dir = '/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags'
# Extract file names from input directory
files = [file for file in os.listdir(input_dir) if file.endswith(".txt")]
file_paths = [input_dir + "/" + file for file in files]
# (removed the redundant no-op self-assignment `file_paths = file_paths`)
# Rebuild `texts` from every hashtag shard: one document per line,
# reduced to its unique lower-cased tokens found in the top-term vocabulary.
texts = []
counter = 0
for path in file_paths:
    with open(path, 'r') as infile:
        for raw_line in infile:
            cleaned = [re.sub(r'\W+', '', tok) for tok in raw_line.split(' ')]
            matched = {tok.lower() for tok in cleaned if tok.lower() in top_terms}
            counter += 1
            if counter % 500000 == 0:
                print(counter)  # progress indicator for the long read
            texts.append(list(matched))
# output (progress prints): 500000 1000000 1500000 2000000 2500000 3000000 3500000 4000000 4500000 5000000 5500000 6000000 6500000 7000000 7500000 8000000 8500000 9000000 9500000 10000000 10500000
len(texts)  # inspect the corpus size (recorded output below: 10,951,065 documents)
# output: 10951065
# Build the gensim dictionary and bag-of-words corpus, then score each
# topic's top terms with the UMass coherence measure (corpus-based, so no
# sliding-window pass over raw text is needed).
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(doc) for doc in texts]
# (removed duplicate `from gensim.models import CoherenceModel` — already imported at the top of the file)
cm = CoherenceModel(topics=topic_term_list, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence_scores = cm.get_coherence_per_topic()
coherence_scores  # display per-topic UMass scores (recorded output below; values are negative log-probability ratios)
# output: [-4.313408244829818, -6.3413129834533075, -3.905509802967587, -3.2721747805157393, -7.625408311947956, -1.7196023225550179, -4.203992873774845, -4.125434740376362, -3.3245267356686656, -4.243934168684841, -2.885251247261333, -3.274337705415125, -4.09288216257212, -4.924394662078107, -4.149309356654412, -4.484200299110569, -4.2947085374780825, -3.9431711613137117, -3.8064866510481634, -3.3631922605649422, -4.021276276302869, -4.8187575864249625, -3.840477926991953, -4.195053778502089, -3.2573543587006757, -2.948624393117324, -4.613766805713099, -4.522909421247455, -3.3593983229564035, -2.7540591437209723, -3.998741573269264, -3.5019332822601377, -2.8636397550329984, -3.209242248943138, -3.4505860256680485, -2.9636872767448135, -4.671770031055652, -8.15728667880787, -5.170307773349419, -3.7014139374486565, -3.6600284579976434, -3.531379750780082, -6.942124449900354, -3.3191671147482635, -3.926852284714465, -3.881024734399981, -2.9943846741583147, -3.8610574376382782, -4.207393297610942, -4.376751165760487]