#!/usr/bin/env python # coding: utf-8 # # Performing Model Selection Using Topic Coherence # # This notebook will perform topic modeling on the 20 Newsgroups corpus using LDA. We will perform model selection (over the number of topics) using topic coherence as our evaluation metric. This will showcase some of the features of the topic coherence pipeline implemented in `gensim`. In particular, we will see several features of the `CoherenceModel`. # In[39]: from __future__ import print_function import os import re from gensim.corpora import TextCorpus, MmCorpus from gensim import utils, models from gensim.parsing.preprocessing import STOPWORDS from gensim.utils import deaccent # ## Parsing the Dataset # # The 20 Newsgroups dataset uses a hierarchical directory structure to store the articles. The structure looks something like this: # ``` # 20news-18828/ # |-- alt.atheism # | |-- 49960 # | |-- 51060 # | |-- 51119 # |-- comp.graphics # | |-- 37261 # | |-- 37913 # | |-- 37914 # |-- comp.os.ms-windows.misc # | |-- 10000 # | |-- 10001 # | |-- 10002 # ``` # # The files are in the newsgroup markup format, which includes some headers, quoting of previous messages in the thread, and possibly PGP signature blocks. The message body itself is raw text, which requires preprocessing. The code immediately below is an adaptation of [an active PR](https://github.com/RaRe-Technologies/gensim/pull/1388) for parsing hierarchical directory structures into corpora. The code just below that builds on this basic corpus parser to handle the newsgroup-specific text parsing. # In[34]: class TextDirectoryCorpus(TextCorpus): """Read documents recursively from a directory, where each file is interpreted as a plain text document. """ def iter_filepaths(self): """Lazily yield paths to each file in the directory structure within the specified range of depths. If a filename pattern to match was given, further filter to only those filenames that match. 
""" for dirpath, dirnames, filenames in os.walk(self.input): for name in filenames: yield os.path.join(dirpath, name) def getstream(self): for path in self.iter_filepaths(): with utils.smart_open(path) as f: doc_content = f.read() yield doc_content def preprocess_text(self, text): text = deaccent( lower_to_unicode( strip_multiple_whitespaces(text))) tokens = simple_tokenize(text) return remove_short( remove_stopwords(tokens)) def get_texts(self): """Iterate over the collection, yielding one document at a time. A document is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`. Override this function to match your input (parse input files, do any text preprocessing, lowercasing, tokenizing etc.). There will be no further preprocessing of the words coming out of this function. """ lines = self.getstream() if self.metadata: for lineno, line in enumerate(lines): yield self.preprocess_text(line), (lineno,) else: for line in lines: yield self.preprocess_text(line) def remove_stopwords(tokens, stopwords=STOPWORDS): return [token for token in tokens if token not in stopwords] def remove_short(tokens, minsize=3): return [token for token in tokens if len(token) >= minsize] def lower_to_unicode(text): return utils.to_unicode(text.lower(), 'ascii', 'ignore') RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) def strip_multiple_whitespaces(text): return RE_WHITESPACE.sub(" ", text) PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) def simple_tokenize(text): for match in PAT_ALPHABETIC.finditer(text): yield match.group() # In[35]: class NewsgroupCorpus(TextDirectoryCorpus): """Parse 20 Newsgroups dataset.""" def extract_body(self, text): return strip_newsgroup_header( strip_newsgroup_footer( strip_newsgroup_quoting(text))) def preprocess_text(self, text): body = self.extract_body(text) return super(NewsgroupCorpus, self).preprocess_text(body) def strip_newsgroup_header(text): """Given text in "news" format, strip the headers, by removing everything 
before the first blank line. """ _before, _blankline, after = text.partition('\n\n') return after _QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:' r'|^In article|^Quoted from|^\||^>)') def strip_newsgroup_quoting(text): """Given text in "news" format, strip lines beginning with the quote characters > or |, plus lines that often introduce a quoted section (for example, because they contain the string 'writes:'.) """ good_lines = [line for line in text.split('\n') if not _QUOTE_RE.search(line)] return '\n'.join(good_lines) _PGP_SIG_BEGIN = "-----BEGIN PGP SIGNATURE-----" def strip_newsgroup_footer(text): """Given text in "news" format, attempt to remove a signature block.""" try: return text[:text.index(_PGP_SIG_BEGIN)] except ValueError: return text # ### Loading the Dataset # # Now that we have defined the necessary code for parsing the dataset, let's load it up and serialize it into Matrix Market format. We'll do this because we want to train LDA on it with several different parameter settings, and this will allow us to avoid repeating the preprocessing. # In[36]: # Replace data_path with path to your own copy of the corpus. # You can download it from here: http://qwone.com/~jason/20Newsgroups/ # I'm using the original, called: 20news-19997.tar.gz home = os.path.expanduser('~') data_dir = os.path.join(home, 'workshop', 'nlp', 'data') data_path = os.path.join(data_dir, '20_newsgroups') # In[49]: get_ipython().run_cell_magic('time', '', '\ncorpus = NewsgroupCorpus(data_path)\ndictionary = corpus.dictionary\nprint(len(corpus))\nprint(dictionary)\n') # In[38]: get_ipython().run_cell_magic('time', '', "\nmm_path = os.path.join(data_dir, '20_newsgroups.mm')\nMmCorpus.serialize(mm_path, corpus, id2word=dictionary)\nmm_corpus = MmCorpus(mm_path) # load back in to use for LDA training\n") # ## Training the Models # # Our goal is to determine which number of topics produces the most coherent topics for the 20 Newsgroups corpus. 
# The corpus is roughly 20,000 documents. If we used 100 topics and the
# documents were evenly distributed among topics, we'd have clusters of 200
# documents. This seems like a reasonable upper bound. In this case, the corpus
# actually has categories, defined by the first-level directory structure. This
# can be seen in the directory structure shown above, and three examples are:
# `alt.atheism`, `comp.graphics`, and `comp.os.ms-windows.misc`. There are 20
# of these (hence the name of the dataset), so we'll use 20 as our lower bound
# for the number of topics.
#
# One could argue that we already know the model should have 20 topics. I'll
# argue there may be additional categorizations within each newsgroup and we
# might hope to capture those by using more topics. We'll step by increments of
# 10 from 20 to 100.

# In[40]:

# Train one LDA model per topic count and keep them in `trained_models`,
# keyed by the number of topics. The cell source is given as adjacent string
# literals, one per source line; its content is reproduced verbatim.
get_ipython().run_cell_magic(
    'time', '',
    '\n'
    'trained_models = {}\n'
    'for num_topics in range(20, 101, 10):\n'
    ' print("Training LDA(k=%d)" % num_topics)\n'
    ' lda = models.LdaMulticore(\n'
    ' mm_corpus, id2word=dictionary, num_topics=num_topics, workers=4,\n'
    ' passes=10, iterations=200, random_state=42,\n'
    ' alpha=\'asymmetric\', # shown to be better than symmetric in most cases\n'
    ' decay=0.5, offset=64 # best params from Hoffman paper\n'
    ' )\n'
    ' trained_models[num_topics] = lda\n'
)


# ## Evaluation Using Coherence
#
# Now we get to the heart of this notebook. In this section, we'll evaluate
# each of our LDA models using topic coherence. Coherence is a measure of how
# interpretable the topics are to humans. It is based on the representation of
# topics as the top-N most probable words for a particular topic. More
# specifically, given the topic-term matrix for LDA, we sort each topic from
# highest to lowest term weights and then select the first N terms.
#
# Coherence essentially measures how similar these words are to each other.
# There are various methods for doing this, most of which have been explored in
# the paper ["Exploring the Space of Topic Coherence Measures"](https://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf).
# The authors performed a comparative analysis of various methods, correlating
# them to human judgements. The method named "c_v" coherence was found to be
# the most highly correlated with those judgements. This and several of the
# other methods have been implemented in `gensim.models.CoherenceModel`. We
# will use this to perform our evaluations.
#
# The "c_v" coherence method makes an expensive pass over the corpus,
# accumulating term occurrence and co-occurrence counts. It only accumulates
# counts for the terms in the lists of top-N terms for each topic. In order to
# ensure we only need to make one pass, we'll construct a "super topic" from
# the top-N lists of each of the models. This will consist of a single topic
# with all the relevant terms from all the models. We choose 20 as N.

# In[53]:

# Build topic listings from each model.
import itertools

from gensim import matutils


def top_topics(lda, num_words=20):
    """Return the top terms of every topic in `lda` as lists of strings.

    For each row of `lda.state.get_lambda()`, the `num_words` highest-weighted
    term ids are selected and mapped to terms through `lda.id2word`.

    Returns a list with one list of terms per topic, highest weight first.
    """
    str_topics = []
    for topic in lda.state.get_lambda():
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn=num_words, reverse=True)
        beststr = [lda.id2word[_id] for _id in bestn]
        str_topics.append(beststr)
    return str_topics


# Collect the top-N term lists of every model, and the union of all their
# terms (the "super topic") so the corpus only has to be scanned once.
model_topics = {}
super_topic = set()
for num_topics, model in trained_models.items():
    topics_as_topn_terms = top_topics(model)
    model_topics[num_topics] = topics_as_topn_terms
    super_topic.update(itertools.chain.from_iterable(topics_as_topn_terms))

print("Number of relevant terms: %d" % len(super_topic))


# In[54]:

# The cell bodies below are passed as source text because `%%time` is an
# IPython cell magic; `get_ipython()` is only available under IPython.
get_ipython().run_cell_magic(
    'time', '',
    "# Now estimate the probabilities for the CoherenceModel\n"
    "\n"
    "cm = models.CoherenceModel(\n"
    " topics=[super_topic], texts=corpus.get_texts(),\n"
    " dictionary=dictionary, coherence='c_v')\n"
    "cm.estimate_probabilities()\n"
)


# In[64]:

# The nested-loop indentation inside this cell source is written out
# explicitly (4 spaces per level): the inner `for n` body must be indented
# deeper than the loop header for the cell to parse.
get_ipython().run_cell_magic(
    'time', '',
    'import numpy as np\n'
    '# Next we perform the coherence evaluation for each of the models.\n'
    '# Since we have already precomputed the probabilities, this simply\n'
    '# involves using the accumulated stats in the `CoherenceModel` to\n'
    '# perform the evaluations, which should be pretty quick.\n'
    '\n'
    'coherences = {}\n'
    'for num_topics, topics in model_topics.items():\n'
    '    cm.topics = topics\n'
    '\n'
    '    # We evaluate at various values of N and average them. This is a more robust,\n'
    '    # according to: http://people.eng.unimelb.edu.au/tbaldwin/pubs/naacl2016.pdf\n'
    '    coherence_at_n = {}\n'
    '    for n in (20, 15, 10, 5):\n'
    '        cm.topn = n\n'
    '        topic_coherences = cm.get_coherence_per_topic()\n'
    '        \n'
    '        # Let\'s record the coherences for each topic, as well as the aggregated\n'
    '        # coherence across all of the topics.\n'
    '        coherence_at_n[n] = (topic_coherences, cm.aggregate_measures(topic_coherences))\n'
    '        \n'
    '    topic_coherences, avg_coherences = zip(*coherence_at_n.values())\n'
    '    avg_topic_coherences = np.vstack(topic_coherences).mean(0)\n'
    '    avg_coherence = np.mean(avg_coherences)\n'
    '    print("Avg coherence for num_topics=%d: %.5f" % (num_topics, avg_coherence))\n'
    '    coherences[num_topics] = (avg_topic_coherences, avg_coherence)\n'
)


# In[68]:

# Print the coherence rankings

# `coherences` maps num_topics -> (per-topic coherences, average coherence);
# only the average is needed for ranking. The loop targets use their own names
# so they do not shadow the `avg_coherence` list being built.
avg_coherence = [(k, avg) for k, (_, avg) in coherences.items()]
ranked = sorted(avg_coherence, key=lambda tup: tup[1], reverse=True)
print("Ranked by average '%s' coherence:\n" % cm.coherence)
for item in ranked:
    print("num_topics=%d:\t%.4f" % item)
print("\nBest: %d" % ranked[0][0])


# ## Conclusion
#
# In this notebook, we used `gensim`'s `CoherenceModel` to perform model
# selection over the number of topics for LDA. We found that for the 20
# Newsgroups corpus, 50 topics is best. We showcased the ability of the
# coherence pipeline to evaluate individual topic coherence as well as
# aggregated model coherence. We also demonstrated how to avoid repeated passes
# over the corpus, estimating the term similarity probabilities for all
# relevant terms just once. Topic coherence is a powerful alternative to
# evaluation using perplexity on a held-out document set. It is appropriate to
# use whenever the objective of the topic modeling is to present the topics as
# top-N lists for human consumption.
#
# Note that coherence calculations are generally much more accurate when a
# larger reference corpus is used to estimate the probabilities.
# In this case, we used the same corpus as for our modeling, which is
# relatively small at only about 20,000 documents. A better reference corpus is
# the full Wikipedia corpus. The motivated explorer of this notebook is
# encouraged to download that corpus (see
# [Experiments on the English Wikipedia](https://radimrehurek.com/gensim/wiki.html))
# and use it for probability estimation.

# In[ ]: