# This workflow uses gensim to calculate coherence measures for the topics already identified by an LDA model.
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
import re
# Load the tidied topic-term counts exported from MALLET.
#TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/ct_tidy_topics.csv')
TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/twitter/twitter_tidy_topics.csv')

num_topics = 50
num_top_terms = 10

# topic_term_list[k] holds topic k's highest-count terms with all
# non-word characters stripped (to match the cleaned document tokens).
topic_term_list = []
for topic_id in range(num_topics):
    ranked_terms = (TopicTermFreq[TopicTermFreq['topic'] == topic_id]
                    .sort_values('count', ascending=False)['term']
                    .tolist()[:num_top_terms])
    topic_term_list.append([re.sub(r'\W+', '', term) for term in ranked_terms])

# Vocabulary of all top terms across topics. Stored as a set because the
# tokenizing loops below run a membership test per word over ~10M lines;
# with a list each test would be O(len(top_terms)) instead of O(1).
top_terms = set(term for terms in topic_term_list for term in terms)
# Reduce each line of the MALLET input file to the unique lower-cased
# tokens that appear in the top-term vocabulary.
# NOTE: `texts` is rebuilt from the hashtag-model shards further below.
texts = []
counter = 0
with open('/Users/dankoban/Documents/CT_LDA/CT_data/mallet_input_data_crowdtangle.txt','r') as infile:
    for raw_line in infile:
        cleaned = [re.sub(r'\W+', '', tok) for tok in raw_line.split(' ')]
        matched = {tok.lower() for tok in cleaned if tok.lower() in top_terms}
        counter += 1
        if counter % 500000 == 0:
            print(counter)  # progress indicator for the long read
        texts.append(list(matched))
import os

# Collect the hashtag-model input shards (one .txt file per shard).
input_dir = '/Users/dankoban/Documents/EM6575/twitter/hashtag model/hashtags'
# Extract file names from input directory
files = [file for file in os.listdir(input_dir) if file.endswith(".txt")]
file_paths = [input_dir + "/" + file for file in files]
# (removed the redundant no-op self-assignment `file_paths = file_paths`)
# Rebuild `texts` from every hashtag shard: one document per line,
# reduced to its unique lower-cased tokens found in the top-term vocabulary.
texts = []
counter = 0
for path in file_paths:
    with open(path, 'r') as infile:
        for raw_line in infile:
            cleaned = [re.sub(r'\W+', '', tok) for tok in raw_line.split(' ')]
            matched = {tok.lower() for tok in cleaned if tok.lower() in top_terms}
            counter += 1
            if counter % 500000 == 0:
                print(counter)  # progress indicator for the long read
            texts.append(list(matched))
# output (progress prints): 500000 1000000 1500000 2000000 2500000 3000000 3500000 4000000 4500000 5000000 5500000 6000000 6500000 7000000 7500000 8000000 8500000 9000000 9500000 10000000 10500000
len(texts)  # inspect the corpus size (recorded output below: 10,951,065 documents)
# output: 10951065
# Build the gensim dictionary and bag-of-words corpus, then score each
# topic's top terms with the UMass coherence measure (corpus-based, so no
# sliding-window pass over raw text is needed).
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(doc) for doc in texts]
# (removed duplicate `from gensim.models import CoherenceModel` — already imported at the top of the file)
cm = CoherenceModel(topics=topic_term_list, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence_scores = cm.get_coherence_per_topic()
coherence_scores  # display per-topic UMass scores (recorded output below; values are negative log-probability ratios)
# output: [-4.313408244829818, -6.3413129834533075, -3.905509802967587, -3.2721747805157393, -7.625408311947956, -1.7196023225550179, -4.203992873774845, -4.125434740376362, -3.3245267356686656, -4.243934168684841, -2.885251247261333, -3.274337705415125, -4.09288216257212, -4.924394662078107, -4.149309356654412, -4.484200299110569, -4.2947085374780825, -3.9431711613137117, -3.8064866510481634, -3.3631922605649422, -4.021276276302869, -4.8187575864249625, -3.840477926991953, -4.195053778502089, -3.2573543587006757, -2.948624393117324, -4.613766805713099, -4.522909421247455, -3.3593983229564035, -2.7540591437209723, -3.998741573269264, -3.5019332822601377, -2.8636397550329984, -3.209242248943138, -3.4505860256680485, -2.9636872767448135, -4.671770031055652, -8.15728667880787, -5.170307773349419, -3.7014139374486565, -3.6600284579976434, -3.531379750780082, -6.942124449900354, -3.3191671147482635, -3.926852284714465, -3.881024734399981, -2.9943846741583147, -3.8610574376382782, -4.207393297610942, -4.376751165760487]