#!/usr/bin/env python
# coding: utf-8
# # Topic modelling with Spacy, Gensim and Textacy
# By Max Munnecke, @maxmunnecke
#
# This notebook consists of following sections:
# - Initialize: Setting up environment and loading data.
# - Text extraction. Phrase and token extraction with Gensim and Spacy.
# - Topic modelling. Using Textacy's LDA model.
# - Data processing. Calculating data for visualization and export.
# - Model evaluation. A collection of visualizations of the resulting topics.
# - Export data. The data can be used for creating more visualizations or for import into a graph.
#
# General concept:
# The emphasis in this notebook is on facilitating an iterative process where you can easily adjust stopwords and the number of topics. Furthermore, it contains features to re-focus on sub-topics and thereby create a hierarchy of topics.
# ## INITIALIZE
# ### Load environment
# In[1]:
# Load packages
import spacy
nlp = spacy.load("en") # Download the spaCy English vocabulary first: `python -m spacy download en`
import textacy # 0.5.0, does not work with 0.6.0.
import textacy.datasets
import textacy.fileio
import matplotlib.pyplot as plt
import json # write to disk
import pandas as pd
get_ipython().run_line_magic('matplotlib', 'inline')
import os, re, sys
import warnings
warnings.filterwarnings('ignore') # Let's not pay heed to them right now
# In[2]:
# Log environment
print("cwd : " + os.getcwd())
print("sys : " + str(sys.version_info))
print("spacy : "+ spacy.__version__)
print("textacy : "+ textacy.__version__)
# ### Set global variables and load data
#
# SET Change 'outroot' to reflect the current investigation
# In[3]:
# infolder ='' # win64 py36
infolder = 'data-in/' # docker py35mini
infile = 'tb_data.tsv'
outfolder = 'data-out/'
outroot = 'tb_main_'
#
# START if 'MAIN TOPIC' investigation
# In[4]:
data_org = pd.read_csv(infolder + infile, index_col=0, sep='\t')
print('Length : ' + str(len(data_org)))
data_org.describe()
# In[5]:
# Transforming the incoming dataframe to the standard template.
columns_extract = {'pub-full':'title', 'abstract':'abstract', 'key-au':'keywords'} # {'old':'new'}
data_org = data_org[list(columns_extract.keys())]
data_org.rename(columns=columns_extract, inplace=True)
data = data_org # Keeping a copy of the original data set for when sub-slices are being explored
#
# END if 'MAIN TOPIC' investigation
#
# START if 'SUB TOPIC' investigation
# In[ ]:
# START HERE if 'data' has been manipulated elsewhere
# Load external data frame
data_topic = pd.read_csv(outfolder +'tb_mdrtb_data-topic-df.tsv', index_col=0, sep='\t')
# In[ ]:
data_topic.describe()
#
# SET `sub_topic` and adjust `cutoff`
# In[ ]:
sub_topic = '3'
cutoff = 0.7
data_tmp = data_topic[data_topic[sub_topic]>cutoff]
# In[ ]:
print('Number of articles: %s' % len(data_tmp))
# In[ ]:
data = data_tmp
#
# END if 'SUB TOPIC' investigation
# In[6]:
docs = [ str(a_) + ". " + str(b_) for a_,b_ in zip(data['title'], data['abstract'])]
# Converting '-' to '_' to make sure that terms are not split up during subsequent Gensim and Textacy manipulation.
docs = [re.sub(r'\b-\b', '_', text) for text in docs] # NB: do not modify `docs` after this point; it is referenced later.
# ## TEXT EXTRACTION
# ### Find phrases
# Concept: Identify frequent phrases and glue them together with an underscore "_".
#
# Inspiration: phrase model for bi- and tri-grams with Gensim: https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb
# Other source: https://github.com/bhargavvader/personal/blob/master/notebooks/text_analysis_tutorial/topic_modelling.ipynb
# #### Train phrase model
# In[7]:
import re
import gensim
# Split paragraphs into sentences. NB: the original regex was truncated in this
# export; the pattern below (split after sentence-ending punctuation) is a reconstruction.
grap_sentence = re.compile(r'(?<=[.!?])\s+')
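#
# NB: The cell that trained and applied the phrase model was lost in this
# export. Below is a minimal reconstruction, assuming Gensim's `Phrases` with
# default settings (the approach in the notebooks linked above): train bi- and
# tri-gram models on sentence token lists, then rewrite each document so that
# frequent phrases are glued together with "_". It must produce `docs_phrased`,
# which the filtering cell below consumes.
# In[ ]:
# Train the phrase models on sentences drawn from all documents.
unigram_sentences = [sent.split() for doc in docs for sent in grap_sentence.split(doc)]
bigram_model = gensim.models.Phrases(unigram_sentences)
trigram_model = gensim.models.Phrases(bigram_model[unigram_sentences])
# Apply both models to each whole document and join the tokens back into text.
docs_phrased = [" ".join(trigram_model[bigram_model[doc.split()]]) for doc in docs]
# check
print(docs_phrased[1][:200])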
# SET Stopwords: topic- and sub-topic-specific words, as well as artifacts not captured by the standard `is_stop` property.
# In[72]:
# EXACT match in the nlp vocab. Names, organisations etc.
exact_stop = 'someword'.split() # placeholder: replace with actual words
for w in exact_stop:
    lexeme = nlp.vocab[w]
    lexeme.is_stop = True
# LOWERCASE in list. Safe choice.
lower_stop = ('the to a an background objective').lower().split()
# LEMMA in list. Powerful.
topic_stop = 'tb mdr mdr_tb tuberculosis'.split()
subtopic_stop = ''.split('|') # placeholder: fill in '|'-separated sub-topic words
artifact = ['-PRON-', '=', '+', 'in']
lemma_stop = [item.strip() for item in (topic_stop + subtopic_stop + artifact)]
# #### Filter text
# Overview of all token attributes: https://github.com/explosion/spaCy/blob/master/spacy/attrs.pyx
# In[73]:
docs_tokens, tokens_tmp = [], []
for item in docs_phrased:
    doc = nlp(item)
    for w in doc:
        # Filter away line endings, spaCy stopwords, punctuation, numbers and words in the stop lists
        if not (w.text == '\n' or w.is_stop or w.is_punct or w.like_num or w.lemma_ in lemma_stop or w.text.lower() in lower_stop):
            tokens_tmp.append(w.lemma_)
    docs_tokens.append(tokens_tmp)
    tokens_tmp = []
# check
print(docs_tokens[1])
# ## TOPIC MODELLING
# We use Textacy because it has a nice suite of functions for topic modelling and connects with both termite plots and pyLDAvis.
# In[74]:
vectorizer = textacy.Vectorizer(
    weighting='tf', normalize=False, smooth_idf=True,
    min_df=3, max_df=0.95, max_n_terms=10000)
# In[75]:
# Document-Term Matrix
doc_term_matrix = vectorizer.fit_transform(docs_tokens)
#
# SET Number of topics to model. Start with a larger number (10-20) and narrow down in subsequent iterations.
# In[89]:
# SET
no_topics=6
model = textacy.tm.TopicModel('lda', n_topics=no_topics) # NB: `n_components=x` is not registered in this version (scikit-learn 0.19.1); use `n_topics`
model.fit(doc_term_matrix)
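#
# Optional: to support the narrowing-down advice above, a sketch (commented
# out) that refits the model for a few candidate topic counts. Textacy's 'lda'
# model wraps scikit-learn's LatentDirichletAllocation, whose `score` method
# returns an approximate log-likelihood (higher is better).
# In[ ]:
# for k in (6, 10, 15, 20):
#     m = textacy.tm.TopicModel('lda', n_topics=k)
#     m.fit(doc_term_matrix)
#     print(k, m.model.score(doc_term_matrix))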
# In[90]:
# Document-Topic Matrix
doc_topic_matrix = model.transform(doc_term_matrix)
# check
doc_topic_matrix.shape
# ## DATA PROCESSING
# #### Topic/Term distribution
# In[91]:
topic_weight_serie = pd.Series(model.topic_weights(doc_topic_matrix))
# In[92]:
# convert list of terms to text string
topic_term_list = ["| "+" | ".join(x[1])+" |" for x in model.top_topic_terms(vectorizer.id_to_term, topics=-1)]
# In[93]:
# Creating a dataframe from a list of series so the order is preserved. Dictionaries (including OrderedDicts) move columns around.
series_list_tmp = [ pd.Series(range(len(topic_weight_serie)),name='topic_id'), pd.Series(topic_term_list, name='terms'), topic_weight_serie.rename('weight')]
topic_term_df = pd.concat(series_list_tmp, axis=1)
# In[94]:
# Insert a column at a particular position. NB: `rank` returns floats; here the values are converted to integers.
topic_term_df.insert(0, 'rank', topic_term_df['weight'].rank(ascending=False).astype(int))
# #### Topic Weight Chart
# In[95]:
fig = plt.figure()
ax1 = fig.add_subplot(111)
bars = ax1.bar(range(len(topic_weight_serie)),topic_weight_serie, color='c', edgecolor='black')
plt.savefig(outfolder + outroot +"topic-weight.png")
# #### Find top topics (= top_topic_list)
#
# SET Number of top topics to focus on. The 'Topic Weight Chart' above may help in deciding how many to include.
# In[96]:
number_top_topics = 4 # max 6
top_list_tmp = topic_weight_serie.nlargest(n=number_top_topics)
top_topic_list = list(top_list_tmp.index)
print(top_topic_list)
# #### Prepare Topic-Term visualization with Termite plot & pyLDAvis
# - See all display options under 'viz.termite': https://www.pydoc.io/pypi/textacy-0.5.0/autoapi/tm/topic_model/index.html
# - More options available with `draw_termite_plot` but not needed so far: https://www.pydoc.io/pypi/textacy-0.5.0/autoapi/viz/termite/index.html#module-viz.termite
# - pyLDAvis based on calculated 'model' and 'doc_topic_matrix' https://github.com/chartbeat-labs/textacy/issues/28
# In[97]:
# NB: `termite_plot` saves an image with the option `save='filename.png'`
termite_file = outfolder + outroot +"termite.png"
# In[98]:
# Prepare pyLDAvis
import pyLDAvis
pyLDAvis.enable_notebook()
top_term_matrix = model.model.components_
doc_lengths = [len(d) for d in docs_tokens]
vocab = list(vectorizer.id_to_term.values())
term_frequency = textacy.vsm.get_term_freqs(doc_term_matrix)
vis_data = pyLDAvis.prepare(top_term_matrix,doc_topic_matrix,doc_lengths,vocab,term_frequency)
# ## MODEL EVALUATION
# A number of visualizations are gathered in the following to give a comprehensive overview of the resulting topic model.
# In[99]:
for index, row in topic_term_df.iterrows():
    print('%02d' % (row['weight']*100) + '%', '#' + '%02d' % row['rank'], "@" + ('%02d' % index), row['terms'])
# In[100]:
grid = model.termite_plot(doc_term_matrix, vectorizer.id_to_term, highlight_topics=top_topic_list,
                          topics=-1, n_terms=30, sort_terms_by='seriation', save=termite_file)
#
# NOTE "Termite Plot" above assign random number to topic starting with (@00). "pyLDAvis" below ranks topic based on their weight in corpus starting with (#01)
# In[101]:
pyLDAvis.display(vis_data)
# In[102]:
for index, row in topic_term_df.sort_values(by='rank').iterrows():
    print('%02d' % (row['weight']*100) + '%', '@' + ('%02d' % index), '#' + '%02d' % row['rank'], row['terms'])
#
# NOTE: The above visualizations may inspire you to iteratively adjust the stopwords, the number of topics, or the number of topics in focus. Once you have a satisfying result, you can proceed with the following export of the data.
# ## EXPORT DATA
#
# NB: Termite plot is saved every time it is generated.
# NB: Textacy has functions to save the trained model (`model.save` and `textacy.tm.TopicModel.load`).
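#
# A minimal sketch (commented out) of persisting and reloading the trained
# model with the functions named above; the filename is illustrative.
# In[ ]:
# model_file = outfolder + outroot + 'lda-model.pkl'
# model.save(model_file)
# model = textacy.tm.TopicModel.load(model_file)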
# In[103]:
# Export topic_term_df
topic_term_df.to_csv(outfolder + outroot +'topic-term-df.tsv', encoding='UTF-8', header=True, index=False, sep='\t')
# In[104]:
# pyLDAvis visualization
pyldavis_file = outfolder + outroot +"pyldavis.html"
pyLDAvis.save_html(vis_data,pyldavis_file)
# #### Document/Topic all-weights (..doc-topic-weight.tsv)
# Merging the doc ids with the generated doc_topic_matrix makes it possible to search the content. These are standard functions in Textacy, but we might as well practice our skills in manipulating pandas.
# Get topics for uid with `doc_topic_df.loc['10.1038_emi.2017.83'].nlargest(n=3)`
# Load again with `loaded_df = pd.read_csv(open(r'...doc-topic-weight.tsv',encoding='UTF-8'),sep='\t', index_col=0)`
# In[105]:
# Write Document-Topic Matrix
doc_topic_df = pd.DataFrame(data=doc_topic_matrix,  # weights as values
                            index=data.index,  # document uids as index
                            columns=list(range(doc_topic_matrix.shape[1])))  # topic ids as column names
# In[106]:
doc_topic_df.to_csv(outfolder + outroot +'doc-topic-weight.tsv', encoding='UTF-8', header=True, index=True, sep='\t')
# Check df
doc_topic_df.iloc[:2,:5]
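#
# The two lookups described above, spelled out (commented out; the uid is the
# example from the comment and must exist in your data):
# In[ ]:
# doc_topic_df.loc['10.1038_emi.2017.83'].nlargest(n=3)
# loaded_df = pd.read_csv(open(outfolder + outroot + 'doc-topic-weight.tsv', encoding='UTF-8'), sep='\t', index_col=0)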
# #### Document/Topic top-weights (..doc-topic-top.json)
# Comments: `model.top_doc_topics` generates a row number and a tuple containing topic:weight pairs. The row number is used to get the 'uid'. The tuple is made into a list of lists, and the values are converted from NumPy to Python objects.
# In[107]:
generator = model.top_doc_topics(doc_topic_matrix, docs=-1, top_n=3, weights=True)
doc_topic_top3 = [[data.index[doc_idx],[[x.item(),round(y.item(),2)] for x,y in topics]] for doc_idx, topics in generator]
with open(outfolder + outroot + 'doc-topic-top3.json', 'w') as outfile:
    json.dump(doc_topic_top3, outfile)
doc_topic_top3[:2]
# #### Topic-Term Matrix with weights
#
# In[108]:
topic_term_list = list(model.top_topic_terms(vectorizer.id_to_term, topics=-1, top_n=10, weights=True))
with open(outfolder + outroot + 'topic-term-weight.json', 'w') as outfile:
    json.dump(topic_term_list, outfile)
# In[109]:
# Topic Aggregated Weight (list with one number per topic)
with open(outfolder + outroot + 'topic-aggregated-weight.tsv', 'w') as f:
    f.write("\n".join([str(x) for x in topic_weight_serie]))
# ##### Word cloud (..wordcloud.txt)
# No satisfactory packages exist for making word clouds in Python from a topic-term frequency list. Below, a text file is generated with the data needed to create word clouds at https://worditout.com/word-cloud/create. I recommend the following settings: font: sans-serif, colours: #ff8000 - #40bfbf, background: #242424, colour blending: rainbow, vary-word-colour: frequency, aspect ratio: 16/9, differences: big, vary-word-size: frequency
# In[110]:
with open(str(outfolder + outroot + 'wordcloud.txt'), 'w') as f:
    for topic_tmp in topic_term_list:
        f.write("==== #" + str(topic_tmp[0]) + "\n")
        for x in topic_tmp[1]:
            (a, b) = x
            f.write(str(a) + ":" + str(int(b)) + "\n")
# #### Top articles for each topic (..doc-topic-abstract.html)
# In[111]:
cutoff = 15 # How many top articles should be displayed for each topic
top_overview = [] # list of topics containing each a list of top articles.
for topic in range(no_topics):
    top_overview.append([topic, [[x, '%.2f' % y] for (x, y) in doc_topic_df[topic].nlargest(n=cutoff).iteritems()]])
# In[112]:
with open(str(outfolder + outroot + 'doc-top.html'), 'w') as f:
    # NB: the HTML tags in this cell were stripped in the export; the markup
    # below is a minimal reconstruction.
    f.write('<html><body>\n<h1>Top Documents</h1>\n')
    for topic in top_overview:
        # print top topic titles
        f.write('<h2>Topic number %s</h2>\n' % (topic[0]))
        for item in topic[1]:
            # look up title
            uid = item[0]
            url = uid.replace('_', '/')
            weight = item[1]
            # Find the row; `data.loc[uid]` is a Series with column names as index.
            row = data.loc[uid]
            # Output title, doi link, uid and weight.
            f.write('%s : <a href="https://doi.org/%s">%s</a> | %s<br>\n' % (row['title'], url, uid, weight))
    # print top topic abstracts
    f.write('</body></html>\n')
#
# START if 'MAIN TOPIC' investigation
# ### Save dataframe for exploring sub-themes
# When a main theme has been explored, a dataframe may be exported that combines the data and doc_topic_df, making it possible to drill into the individual topics and explore sub-topics.
# In[113]:
# Execute only for main
sub_topics = top_topic_list
data_topic = pd.concat([data,doc_topic_df], axis=1)
# In[114]:
data_topic.head()
# In[115]:
data_topic.to_csv(outfolder + outroot +'data-topic-df.tsv', encoding='UTF-8', header=True, index=True, sep='\t')
#
# END if 'MAIN TOPIC' investigation
# In[ ]: