#!/usr/bin/env python
# coding: utf-8

# # Topic modelling with NMF

# In[1]:


get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')

import pandas as pd
import numpy as np
import sklearn
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle
from timeit import default_timer as timer
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('ggplot')
from itertools import combinations
import re
import spacy
import gensim
import seaborn as sns

nlp = spacy.load('en')


# This notebook shows how to tune the number-of-topics parameter for NMF using Topic Coherence and trains an NMF model on Trifork's blog posts. It also shows how to interpret the resulting topics. Let's start by loading the data:

# In[2]:


data_path = '../data/cleaned_blogs.csv'
df = pd.read_csv(data_path)
col_name = 'clean_content'
data = pd.DataFrame(df[[col_name]].to_numpy(), columns=['clean_content'])
data.drop([28, 38], axis=0, inplace=True)


# ### Preparing the data

# Scikit-learn has a simple and fast implementation of NMF that we can use. The input to the model is the matrix of tf-idf vectors of the corpus. Using tf-idf vectors allows us to penalise common words that appear in every document and favour words that are less common and more descriptive. For more on tf-idf see http://www.tfidf.com/
#
# In this step we also remove stopwords: common words such as "a", "the" or "but" that carry no meaning on their own.

# In[3]:


# Scikit-learn vectorizers require a sequence of strings, i.e. a list of the documents
X_train_scikit = data.iloc[0:][col_name].values


# In[4]:


# Normalize the tf-idf vectors to unit length per row to mask the effect of longer documents
# having more (and thus higher) tf values
stop_words = set(stopwords.words('english'))
tfidf_vectorizer = TfidfVectorizer(analyzer='word', max_df=0.95, min_df=2,
                                   stop_words=stop_words, norm='l2')
x_tfidf = tfidf_vectorizer.fit_transform(X_train_scikit)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()


# ## Selecting the number of topics with Topic Coherence

# Topic modelling algorithms require the number of topics as a parameter. However, since this is an unsupervised technique and we have no labels, we do not know the correct value a priori. To pick the number of topics we can use several heuristics, such as Topic Coherence, used in https://www.insight-centre.org/sites/default/files/publications/15.010_eswa2014_final_submit.pdf and https://github.com/derekgreene/topic-model-tutorial
#
# The Topic Coherence-Word2Vec (TC-W2V) metric measures the coherence between the words assigned to a topic, i.e. how semantically close the words that describe a topic are. We can train a Word2Vec model on our collection of documents that organises the words in an n-dimensional space where semantically similar words are close to each other. The TC-W2V of a topic is then the average similarity between all pairs of the top-n words describing the topic (we define similarity to be 1 when the distance between two words in that space is 0). We then train an NMF model for different values of the number of topics (k) and for each one we calculate the average TC-W2V across all topics. The k with the highest average TC-W2V is used to train the final NMF model.
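# As a formula: for a topic $t$ described by its top-$n$ words $w_1, \dots, w_n$, with $\mathrm{sim}$ the Word2Vec similarity,
#
# $$\mathrm{TC\text{-}W2V}(t) = \frac{1}{\binom{n}{2}} \sum_{i < j} \mathrm{sim}(w_i, w_j),$$
#
# and the coherence score of a model with k topics is the mean of TC-W2V over its k topics. This is essentially what the compute_TC_W2V function below computes (with a fallback score of 0.5 for out-of-vocabulary words).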
# In[5]:


# Create this generator to feed words into the Word2Vec model
class WordGenerator:
    '''
    Given a collection of documents, tokenises each one (splits it into words) and yields one document's tokens at a time.
    '''
    def __init__(self, blogs):
        self.blogs = blogs

    def __iter__(self):
        for blog in self.blogs:
            sentence_tokens = nlp(str(blog))
            tokens = []
            for tok in sentence_tokens:
                if len(tok) >= 2:
                    tokens.append(tok.text)
            yield tokens


# In[6]:


# Train our own word2vec model on the blog posts. The size is the number of dimensions of the
# embedding space and min_count is the number of times a word needs to appear in the corpus
# to be considered
word_gen = WordGenerator(data[col_name].tolist())
w2v_model = gensim.models.Word2Vec(word_gen, size=500, min_count=5, sg=1)
print("The w2v model has been trained on %d terms" % len(w2v_model.wv.vocab))
w2v_model.save("w2v_model.bin")
w2v_lookup = w2v_model.wv
del w2v_model


# In[7]:


def compute_TC_W2V(w2v_lookup, topics_words):
    '''
    Compute TC_W2V for the topics of a model using the w2v_lookup. TC_W2V is calculated for
    every possible pair of words in a topic and averaged to get the score for that topic.
    The total TC_W2V for the model is the mean over all topics.
    '''
    total_coherence = 0.0
    for topic_index in range(len(topics_words)):
        # Compute coherence per pair of words
        pair_scores = []
        for pair in combinations(topics_words[topic_index], 2):
            try:
                pair_scores.append(w2v_lookup.similarity(pair[0], pair[1]))
            except KeyError as e:
                # If a word is not in the word2vec model then assign a score of 0.5
                print(e)
                pair_scores.append(0.5)
        # get the mean over all pairs in this topic
        topic_score = sum(pair_scores) / len(pair_scores)
        total_coherence += topic_score
    # get the mean score across all topics
    return total_coherence / len(topics_words)


# In[8]:


def get_words_per_topic(topic_vec, feature_names, num_top_words):
    '''
    Returns a list with the num_top_words words with the highest score for the given topic
    '''
    return [feature_names[i] for i in topic_vec.argsort()[:-num_top_words - 1:-1]]


# In[9]:


kmin, kmax = 2, 16
k_values = []  # used later to plot
tc_w2v = []
num_top_words = 10

for k in range(kmin, kmax + 1):
    # Train an NMF model with the current k
    nmf_model = NMF(n_components=k, random_state=42, init='nndsvd', alpha=0.1, l1_ratio=0.5)
    nmf_W = nmf_model.fit_transform(x_tfidf)
    nmf_H = nmf_model.components_

    # Create the list of topics; each topic is described by the num_top_words words with the highest score
    topics_words = []
    for topic_index in range(k):
        topics_words.append(get_words_per_topic(nmf_H[topic_index], tfidf_feature_names, num_top_words))

    # Compute the coherence of the topics for the model with k topics
    k_values.append(k)
    tc_w2v.append(compute_TC_W2V(w2v_lookup, topics_words))
    print("TC_W2V(k=%d) = %.4f" % (k, tc_w2v[-1]))


# In this case, k=15 yields the highest average value.

# In[10]:


# Plot the coherence for each k and find the highest
sns.set()
fig = plt.figure(figsize=(10, 4))
# create the line plot
ax = plt.plot(k_values, tc_w2v)
plt.xticks(k_values)
plt.xlabel("Number of Topics")
plt.ylabel("Mean Coherence")
# add the points
plt.scatter(k_values, tc_w2v, s=100)
# find and annotate the maximum point on the plot
ymax = max(tc_w2v)
xpos = tc_w2v.index(ymax)
best_k = k_values[xpos]
plt.annotate("k=%d" % best_k, xy=(best_k, ymax), xytext=(best_k, ymax),
             textcoords="offset points", fontsize=16)
plt.show()


# Once we have an informed decision on the number of topics we can train the final NMF model. alpha multiplies the regularization term (alpha = 0 means no regularization) and l1_ratio controls the type of penalty: l1_ratio = 0 means the penalty is an elementwise L2 penalty.
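# For reference, the objective minimised by this version of scikit-learn's NMF (which takes a single alpha parameter) is, per its documentation, approximately
#
# $$\tfrac{1}{2}\|X - WH\|_{Fro}^2 + \alpha\,\rho\,\big(\|\mathrm{vec}(W)\|_1 + \|\mathrm{vec}(H)\|_1\big) + \tfrac{1}{2}\,\alpha\,(1 - \rho)\,\big(\|W\|_{Fro}^2 + \|H\|_{Fro}^2\big),$$
#
# where $\rho$ is l1_ratio. With alpha=0.1 and l1_ratio=0.5, as used here, we apply a mild, even mix of L1 and L2 penalties to both factor matrices.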
# In[11]:


num_topics = k_values[tc_w2v.index(max(tc_w2v))]
nmf_model = NMF(n_components=num_topics, random_state=42, init='nndsvd', alpha=0.1, l1_ratio=0.5)
start = timer()
nmf_model.fit(x_tfidf)
end = timer()
print(f'NMF model trained in {end-start} seconds for {num_topics} topics.')
nmf_W = nmf_model.transform(x_tfidf)
nmf_H = nmf_model.components_


# In[12]:


# Topic weights for one document (a row of W)
nmf_W[1]


# ## Interpreting the topics with visualisations

# The resulting topics are represented by a set of words, not by a name or anything else that gives them meaning. Therefore, it is up to us to interpret each set of words and identify the meaning behind the topic, if any.
#
# Initially we can use the two output matrices H and W to get the word scores per topic and the topic scores per document, and use dataframes to inspect them easily.

# In[13]:


def get_words_per_topic_df(H, feature_names, num_top_words):
    '''Returns the num_top_words words per topic in a dataframe

    Args:
        H: matrix returned by the NMF model, KxV (K = #topics, V = vocabulary size, i.e. #words in the corpus)
        feature_names: the word names, to map them from the indices
        num_top_words: number of words per topic
    Returns:
        a dataframe
    '''
    word_dict = {}
    for topic_idx, topic_vec in enumerate(H):
        words = get_words_per_topic(topic_vec, feature_names, num_top_words)
        word_dict["Topic %d:" % (topic_idx + 1)] = words
        # word_dict[(topic_idx + 1)] = words
    return pd.DataFrame(word_dict)


def get_docs_per_topic(W, documents, num_top_docs):
    '''Returns the num_top_docs documents with the highest score per topic

    Args:
        W: matrix returned by the NMF model, DxK (D = #docs in the corpus, K = #topics)
        documents: list of documents, to map them from the indices
        num_top_docs: number of documents to show per topic
    Returns:
        a dataframe
    '''
    doc_dict = {}
    for topic_idx in range(np.shape(W)[1]):
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:num_top_docs]
        docs = [documents[doc_index] for doc_index in top_doc_indices]
        doc_dict["Topic %d:" % (topic_idx + 1)] = docs
    return pd.DataFrame(doc_dict)


def get_docs_per_topic_with_info(W, documents, num_top_docs):
    '''Returns the num_top_docs documents per topic with all the info from the original dataframe.

    Args:
        W: matrix returned by the NMF model, DxK (D = #docs in the corpus, K = #topics)
        documents: original dataframe with all the data
        num_top_docs: number of documents to show per topic
    Returns:
        a dictionary of dataframes
    '''
    doc_dict = {}
    for topic_idx in range(np.shape(W)[1]):
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:num_top_docs]
        docs = pd.DataFrame(documents.iloc[top_doc_indices])
        doc_dict[topic_idx] = docs
    return doc_dict


# In[14]:


# NMF top words per topic
num_top_words = 10
result = get_words_per_topic_df(nmf_H, tfidf_feature_names, num_top_words)
result.index = range(1, 11)
result


# In[15]:


# NMF top documents per topic
num_top_docs = 20
documents_per_topic = get_docs_per_topic(nmf_W, data.values, num_top_docs)
documents_per_topic = get_docs_per_topic_with_info(nmf_W, df, num_top_docs)
# documents_per_topic[0]  # uncomment this to see the documents assigned to the first topic
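# As a quick additional check on W we can normalise each row to get per-document topic proportions and count how many posts have each topic as their dominant one. This is a minimal illustrative sketch (doc_topics and dominant_topic are names introduced here, not used elsewhere in the notebook):

# In[ ]:


row_sums = nmf_W.sum(axis=1, keepdims=True)
doc_topics = nmf_W / np.where(row_sums == 0, 1, row_sums)  # avoid division by zero for all-zero rows
dominant_topic = doc_topics.argmax(axis=1) + 1             # 1-based, to match the topic tables above
print(pd.Series(dominant_topic).value_counts().sort_index())  # number of documents per dominant topic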
# However, together with the words it is also relevant to check the scores and how the words are distributed over the different topics. To facilitate the task of interpreting the topics we can use the pyLDAvis tool, which provides an overview of the words assigned to each topic together with their proportion in the topic and in the corpus.
#
# We can see how topic 1 encompasses topics 3, 4, 5 and 6: topic 1 is about Spring and other frameworks (Axon, Elasticsearch) that are usually used together with Spring. It also intersects with topics 2 and 7, which are business-related topics, probably about conferences and similar events where these technologies are discussed. There are also smaller topics farther away, whose documents are likely less related to the other topics. As an example, if I were interested in reading about containers I would start with the documents assigned to topic 9 in the visualisation (topic 8 in the tables above, since they are numbered differently).
#
# Note: the topic numbering in this visualisation does not correspond to the model's numbering of topics.

# In[16]:


import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(nmf_model, x_tfidf, tfidf_vectorizer)
vis


# In[17]:


pyLDAvis.save_html(vis, 'vis.html')
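# The fitted pipeline can also score unseen text: vectorise it with the already fitted tfidf_vectorizer and project it onto the learned topics with nmf_model.transform. A minimal sketch (new_post is an illustrative placeholder, not a real blog post):

# In[ ]:


new_post = ["spring boot microservices with axon and elasticsearch"]
new_tfidf = tfidf_vectorizer.transform(new_post)
new_topic_weights = nmf_model.transform(new_tfidf)
print(new_topic_weights.round(3))  # one row of (unnormalised) topic weights for the new text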