from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import *
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn import metrics
We're going to use a bunch of Arxiv physics papers for this task.
# We're already in the directory with the papers, so we can use os.listdir() to get the file names
filename_list = os.listdir()
# There are a few extraneous files we don't want to get caught in the
# filenamelist, so we add some logic to exclude those
filename_list = [filename for filename in filename_list if len(filename) < 8]
# Check that these file names are correct:
filename_list[0:5]
['0301116', '0304232', '0303017', '0303225', '0302131']
Now we can read in all the files from the filename_list
.
corpus = []
for i in range(len(filename_list)):
filename = filename_list[i]
# errors='ignore' is added to deal with UnicodeDecodeErrors
with open(filename, 'r', errors='ignore') as file:
file_contents = file.read()
# Add document to corpus
corpus.append(file_contents)
# Removing LaTeX and other formatting artifacts that will cause issues with NMF and LDA
from nltk.stem.wordnet import WordNetLemmatizer
import re
import gensim.parsing.preprocessing as genpre
lmtzr = WordNetLemmatizer()
def prep_text(text):
# this removes LaTeX formatting, citations, splits hyphens
myreg = r'\\[\w]+[\{| ]|\$[^\$]+\$|\(.+\, *\d{2,4}\w*\)|\S*\/\/\S*|[\\.,\/#!$%\^&\*;:{}=_`\'\"~()><\|]|\[.+\]|\d+|\b\w{1,2}\b'
parsed_data = text.replace('-', ' ')
parsed_data = re.sub(myreg, '', parsed_data)
parsed_data = [lmtzr.lemmatize(w) for w in parsed_data.lower().split() if w not in genpre.STOPWORDS]
return parsed_data
corpus = [prep_text(document) for document in corpus]
prep_text
didn't remove -everything-, but we will have many fewer artifacts than if we didn't run it at all. We can also scrape off some very common LaTeX phrases by passing them as stopwords when retraining the TfIdfVectorizer
, and also by setting max_df
to exclude words that occur in more than 90% of documents.
See this excellent blog post on why prep_text
works to remove LaTeX artifacts. All credit goes to author Omar Abdelbadie for this method.
Note that by using prep_text
we've caused every entry in corpus
to become a list containing a number of strings, rather than one big string for each entry. This is a problem for when we want to create our feature matrix, as TfIdfVectorizer
is not compatible with a list of lists. We'll need to use join
(a string method) to change each entry back to a string instead of a list.
for i in range(len(corpus)):
corpus[i] = ' '.join(corpus[i])
len(corpus)
1019
Now we have 1019 documents to work with. We can turn our corpus into a matrix of Term Frequency Inverse Document Frequency (TF-IDF) features using sklearn
's TfidfVectorizer()
.
# Again ignoring any UnicodeDecodeErrors
vectorizer = TfidfVectorizer(decode_error = 'ignore', min_df = 50, max_df = 0.9,
stop_words = 'english', max_features = 20000)
X = vectorizer.fit_transform(corpus)
X
<1019x2642 sparse matrix of type '<class 'numpy.float64'>' with 492178 stored elements in Compressed Sparse Row format>
Let's take a look at the vocabulary that was learned by the vectorizer.
# Cast the vocab dict to a list so we can print just a subset of the dict
first20_vocab = {k: vectorizer.vocabulary_[k] for k in list(vectorizer.vocabulary_)[:20]}
first20_vocab
{'latex': 1359, 'file': 943, 'paper': 1694, 'documentstylerevtex': 729, 'documentstylearticle': 728, 'beqequation': 206, 'eeqequation': 764, 'partial': 1706, 'footnote': 975, 'thefootnotefootnote': 2416, 'defnonumber': 605, 'def': 586, 'original': 1677, 'draft': 738, 'flushright': 965, 'gravity': 1077, 'induced': 1214, 'smooth': 2210, 'soliton': 2216, 'mail': 1441}
Now we'll do some topic modeling.
# Initialize NMF
nmf_model = NMF(n_components = 10, solver = 'mu')
# Create variable to make it easy to retrieve topics
idx_to_word = np.array(vectorizer.get_feature_names())
nmf_model.fit(X)
NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=10, random_state=None, shuffle=False, solver='mu', tol=0.0001, verbose=0)
nmf_components = nmf_model.components_
for i, topic in enumerate(nmf_components):
print("Topic {}: {}".format(i + 1, ", ".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))
Topic 1: phi, potential, action, symmetry, function, case, term, phys, model, gauge Topic 2: action, cosmological, inflation, hep, bulk, solution, string, tachyon, branes, brane Topic 3: model, mm, defequation, em, math, defem, phys, hep, pt, def Topic 4: space, schwarzschild, metric, cosmological, solution, sitter, entropy, horizon, hole, black Topic 5: wave, alpha, solution, string, function, eq, array, eqnarray, right, left Topic 6: nucl, background, eqn, jhep, wave, arxivhep, phys, string, hep, citation Topic 7: product, lie, quantum, group, representation, state, space, string, operator, algebra Topic 8: brane, action, phi, solution, delta, theta, nonumber, big, gamma, eqnarray Topic 9: bundle, hep, left, matrix, brane, lambda, eea, bea, beq, eeq Topic 10: hep, product, phys, space, theta, star, quantum, noncommutativity, commutative, noncommutative
Exciting! We have a few topics here that are composed of LaTeX specifications, but others are clearly relevant to particular areas of physics.
Let's try out LDA as well.
lda_model = LatentDirichletAllocation(max_iter=5,
learning_method='online',
learning_offset=50.,
random_state=0)
lda_model.fit(X)
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method='online', learning_offset=50.0, max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001, n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1, random_state=0, topic_word_prior=None, total_samples=1000000.0, verbose=0)
for i, topic in enumerate(lda_model.components_):
print("Topic {}: {}".format(i + 1, ", ".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))
Topic 1: lie, solution, wave, space, branes, point, limit, string, brane, def Topic 2: left, big, lambda, def, path, brane, beta, string, solution, hep Topic 3: brane, vortex, gauge, right, hep, string, wave, left, citation, def Topic 4: citation, brane, gauge, right, hep, def, left, eqnarray, string, phys Topic 5: vortex, bit, suggested, phiphi, model, root, fermionic, generator, eqn, wall Topic 6: vafa, function, wave, gauge, pt, sec, eqnarray, hep, string, citation Topic 7: right, big, gauge, model, citation, branes, hep, string, brane, phys Topic 8: like, broken, motivation, calculated, orbit, insert, max, phys, def, brane Topic 9: monopole, superfield, citation, radiation, spinor, description, ground, set, gauge, align Topic 10: look, inclusion, left, oscillation, shape, bea, phltab, wigner, chi, def
Interestingly, the LDA model seems to prioritize the LaTeX terms less than the NMF model does. That said, they are similar enough that we'll do our clustering with NMF for ease-of-understanding.
Now we're going to do some clustering. In order to get an appropriate matrix, we'll use NMF's fit_transform
method.
H = nmf_model.fit_transform(X)
# Our corpus is pretty large, so we'll do 15 clusters
kmeans15 = KMeans(n_clusters=15, random_state=6).fit(H)
kmeans15.labels_[0:20]
array([ 8, 12, 11, 12, 12, 2, 2, 7, 11, 14, 11, 7, 11, 12, 8, 3, 2, 1, 14, 13], dtype=int32)
doc_cords = kmeans15.fit_transform(H)
doc_cords[0]
array([0.25134642, 0.2054491 , 0.11403623, 0.16735021, 0.25931608, 0.22499003, 0.29496372, 0.16920125, 0.05657176, 0.32005591, 0.24236943, 0.14514269, 0.20884579, 0.1209675 , 0.12043548])
This is a document represented as its distance from each cluster centroid. The lowest value in the array, therefore, represents the cluster the document belongs to. (Try verifying this yourself by comparing what you see here with the cluster labels we printed out in the previous cell!)
Let's see how many documents we have in each cluster:
Counter(kmeans15.labels_)
Counter({8: 106, 12: 58, 11: 111, 2: 119, 7: 49, 14: 41, 3: 60, 1: 70, 13: 171, 0: 35, 5: 54, 4: 44, 9: 25, 10: 55, 6: 21})
cluster_docs = defaultdict(list)
tmp_op = [cluster_docs[topic].append(idx) for idx, topic in enumerate(kmeans15.labels_)]
As a disclaimer, I had significant help from a professional data scientist in writing this method. It's important to ask for help when you really need it!
def print_top_keywords_for_cluster(doc_list, H, nmf_h, idx_to_word):
"""
doc_list : list of row ids of documents in a cluster
H : NMF transformation of document X term features
nmf_h : nmf object
idx_to_word: vectorizer features by index
"""
# top terms
n = 7
top_words = []
for doc in doc_list:
# Get the top topic for this document
top_topic = H[doc].argsort()[-1:][0]
# Top terms for this document
top_words.append([str(idx_to_word[x])for x in nmf_h.components_[top_topic].argsort()[-n:]])
# flatten the list of lists
top_terms = [kw for doc_list in top_words for kw in doc_list]
# print the top keywords
print("Top keywords in cluster: ", Counter(top_terms))
return 1
# Top keywords for clusters 0-6
for i in range(7):
print_top_keywords_for_cluster(cluster_docs[i], H, nmf_model, idx_to_word)
print()
Top keywords in cluster: Counter({'left': 35, 'lambda': 35, 'brane': 35, 'eea': 35, 'bea': 35, 'beq': 35, 'eeq': 35}) Top keywords in cluster: Counter({'group': 68, 'representation': 68, 'state': 68, 'space': 68, 'string': 68, 'operator': 68, 'algebra': 68, 'cosmological': 2, 'solution': 2, 'sitter': 2, 'entropy': 2, 'horizon': 2, 'hole': 2, 'black': 2}) Top keywords in cluster: Counter({'phys': 81, 'function': 71, 'symmetry': 68, 'case': 68, 'term': 68, 'model': 68, 'gauge': 68, 'solution': 31, 'cosmological': 21, 'sitter': 21, 'entropy': 21, 'horizon': 21, 'hole': 21, 'black': 21, 'hep': 20, 'string': 17, 'brane': 8, 'bulk': 7, 'tachyon': 7, 'branes': 7, 'em': 7, 'math': 7, 'defem': 7, 'pt': 7, 'def': 7, 'eqnarray': 6, 'jhep': 6, 'wave': 6, 'arxivhep': 6, 'citation': 6, 'theta': 5, 'left': 4, 'eq': 3, 'array': 3, 'right': 3, 'space': 3, 'delta': 3, 'nonumber': 3, 'big': 3, 'gamma': 3, 'star': 2, 'quantum': 2, 'noncommutativity': 2, 'commutative': 2, 'noncommutative': 2, 'group': 1, 'representation': 1, 'state': 1, 'operator': 1, 'algebra': 1, 'lambda': 1, 'eea': 1, 'bea': 1, 'beq': 1, 'eeq': 1}) Top keywords in cluster: Counter({'phys': 57, 'hep': 54, 'em': 51, 'math': 51, 'defem': 51, 'pt': 51, 'def': 51, 'symmetry': 5, 'function': 5, 'case': 5, 'term': 5, 'model': 5, 'gauge': 5, 'solution': 3, 'string': 3, 'bulk': 2, 'tachyon': 2, 'branes': 2, 'brane': 2, 'jhep': 1, 'wave': 1, 'arxivhep': 1, 'citation': 1, 'cosmological': 1, 'sitter': 1, 'entropy': 1, 'horizon': 1, 'hole': 1, 'black': 1}) Top keywords in cluster: Counter({'space': 44, 'theta': 44, 'star': 44, 'quantum': 44, 'noncommutativity': 44, 'commutative': 44, 'noncommutative': 44}) Top keywords in cluster: Counter({'eqnarray': 53, 'solution': 51, 'delta': 51, 'theta': 51, 'nonumber': 51, 'big': 51, 'gamma': 51, 'function': 3, 'string': 2, 'eq': 2, 'array': 2, 'right': 2, 'left': 2, 'symmetry': 1, 'case': 1, 'term': 1, 'phys': 1, 'model': 1, 'gauge': 1}) Top keywords in cluster: Counter({'string': 21, 'function': 21, 'eq': 21, 'array': 21, 'eqnarray': 21, 'right': 21, 'left': 21})
We can see from the above that there are distinct clusters for certain groups of papers. For example, cluster 1 contains terms related to particle physics (brane, tachyon), and cluster 2 contains terms related to research on black holes.
However, we also see that several of these clusters are somewhat uninformative, due to the LaTeX content. If we were dealing with only the text of these papers, rather than the formatting, this would not be an issue. If anyone wanted to do a similar exercise, I would recommend that they find a different dataset, or create a dataset of their own by extracting the text directly from a set of PDFs.
We'll try again with a smaller number of clusters to see if we can increase the signal-to-noise ratio here.
# Try again with 10 clusters
kmeans10 = KMeans(n_clusters=10, random_state=6).fit(H)
kmeans10.labels_[0:20]
array([6, 3, 3, 3, 3, 0, 0, 6, 3, 6, 1, 0, 0, 3, 6, 8, 0, 1, 3, 0], dtype=int32)
Counter(kmeans10.labels_)
Counter({6: 131, 3: 96, 0: 317, 1: 159, 8: 59, 7: 35, 5: 55, 9: 44, 4: 57, 2: 66})
tmp_op = [cluster_docs[topic].append(idx) for idx, topic in enumerate(kmeans10.labels_)]
# Top keywords for all clusters
for i in range(10):
print_top_keywords_for_cluster(cluster_docs[i], H, nmf_model, idx_to_word)
print()
Top keywords in cluster: Counter({'phys': 280, 'function': 266, 'symmetry': 261, 'case': 261, 'term': 261, 'model': 261, 'gauge': 261, 'brane': 43, 'left': 42, 'lambda': 37, 'eea': 37, 'bea': 37, 'beq': 37, 'eeq': 37, 'hep': 25, 'solution': 23, 'string': 19, 'em': 16, 'math': 16, 'defem': 16, 'pt': 16, 'def': 16, 'cosmological': 13, 'sitter': 13, 'entropy': 13, 'horizon': 13, 'hole': 13, 'black': 13, 'eqnarray': 9, 'space': 7, 'theta': 6, 'bulk': 6, 'tachyon': 6, 'branes': 6, 'eq': 5, 'array': 5, 'right': 5, 'group': 5, 'representation': 5, 'state': 5, 'operator': 5, 'algebra': 5, 'delta': 4, 'nonumber': 4, 'big': 4, 'gamma': 4, 'jhep': 3, 'wave': 3, 'arxivhep': 3, 'citation': 3, 'star': 2, 'quantum': 2, 'noncommutativity': 2, 'commutative': 2, 'noncommutative': 2}) Top keywords in cluster: Counter({'string': 209, 'space': 208, 'group': 206, 'representation': 206, 'state': 206, 'operator': 206, 'algebra': 206, 'phys': 12, 'function': 9, 'symmetry': 6, 'case': 6, 'term': 6, 'model': 6, 'gauge': 6, 'em': 6, 'math': 6, 'defem': 6, 'hep': 6, 'pt': 6, 'def': 6, 'cosmological': 5, 'solution': 5, 'sitter': 5, 'entropy': 5, 'horizon': 5, 'hole': 5, 'black': 5, 'left': 4, 'eq': 3, 'array': 3, 'eqnarray': 3, 'right': 3, 'theta': 2, 'star': 2, 'quantum': 2, 'noncommutativity': 2, 'commutative': 2, 'noncommutative': 2, 'lambda': 1, 'brane': 1, 'eea': 1, 'bea': 1, 'beq': 1, 'eeq': 1}) Top keywords in cluster: Counter({'solution': 97, 'cosmological': 87, 'sitter': 87, 'entropy': 87, 'horizon': 87, 'hole': 87, 'black': 87, 'phys': 81, 'function': 71, 'symmetry': 68, 'case': 68, 'term': 68, 'model': 68, 'gauge': 68, 'hep': 20, 'string': 17, 'brane': 8, 'bulk': 7, 'tachyon': 7, 'branes': 7, 'em': 7, 'math': 7, 'defem': 7, 'pt': 7, 'def': 7, 'eqnarray': 6, 'jhep': 6, 'wave': 6, 'arxivhep': 6, 'citation': 6, 'theta': 5, 'left': 4, 'eq': 3, 'array': 3, 'right': 3, 'space': 3, 'delta': 3, 'nonumber': 3, 'big': 3, 'gamma': 3, 'star': 2, 'quantum': 2, 'noncommutativity': 2, 'commutative': 2, 'noncommutative': 2, 'group': 1, 'representation': 1, 'state': 1, 'operator': 1, 'algebra': 1, 'lambda': 1, 'eea': 1, 'bea': 1, 'beq': 1, 'eeq': 1}) Top keywords in cluster: Counter({'hep': 143, 'phys': 141, 'string': 91, 'jhep': 80, 'wave': 80, 'arxivhep': 80, 'citation': 80, 'em': 54, 'math': 54, 'defem': 54, 'pt': 54, 'def': 54, 'solution': 11, 'bulk': 9, 'tachyon': 9, 'branes': 9, 'brane': 9, 'symmetry': 7, 'function': 7, 'case': 7, 'term': 7, 'model': 7, 'gauge': 7, 'space': 4, 'cosmological': 2, 'sitter': 2, 'entropy': 2, 'horizon': 2, 'hole': 2, 'black': 2, 'group': 2, 'representation': 2, 'state': 2, 'operator': 2, 'algebra': 2, 'theta': 2, 'star': 2, 'quantum': 2, 'noncommutativity': 2, 'commutative': 2, 'noncommutative': 2}) Top keywords in cluster: Counter({'string': 55, 'function': 55, 'eq': 55, 'array': 55, 'eqnarray': 55, 'right': 55, 'left': 55, 'space': 45, 'theta': 45, 'star': 45, 'quantum': 45, 'noncommutativity': 45, 'commutative': 45, 'noncommutative': 45, 'cosmological': 1, 'solution': 1, 'sitter': 1, 'entropy': 1, 'horizon': 1, 'hole': 1, 'black': 1}) Top keywords in cluster: Counter({'eqnarray': 106, 'solution': 102, 'delta': 102, 'theta': 102, 'nonumber': 102, 'big': 102, 'gamma': 102, 'function': 6, 'string': 5, 'eq': 4, 'array': 4, 'right': 4, 'left': 4, 'phys': 3, 'symmetry': 2, 'case': 2, 'term': 2, 'model': 2, 'gauge': 2, 'jhep': 1, 'wave': 1, 'arxivhep': 1, 'hep': 1, 'citation': 1}) Top keywords in cluster: Counter({'string': 146, 'solution': 124, 'hep': 123, 'bulk': 123, 'tachyon': 123, 'branes': 123, 'brane': 123, 'function': 28, 'eq': 23, 'array': 23, 'eqnarray': 23, 'right': 23, 'left': 23, 'symmetry': 5, 'case': 5, 'term': 5, 'phys': 5, 'model': 5, 'gauge': 5, 'cosmological': 1, 'sitter': 1, 'entropy': 1, 'horizon': 1, 'hole': 1, 'black': 1}) Top keywords in cluster: Counter({'left': 73, 'function': 45, 'string': 39, 'eq': 38, 'array': 38, 'eqnarray': 38, 'right': 38, 'brane': 36, 'lambda': 35, 'eea': 35, 'bea': 35, 'beq': 35, 'eeq': 35, 'symmetry': 7, 'case': 7, 'term': 7, 'phys': 7, 'model': 7, 'gauge': 7, 'solution': 3, 'cosmological': 2, 'sitter': 2, 'entropy': 2, 'horizon': 2, 'hole': 2, 'black': 2, 'hep': 1, 'bulk': 1, 'tachyon': 1, 'branes': 1, 'space': 1, 'theta': 1, 'star': 1, 'quantum': 1, 'noncommutativity': 1, 'commutative': 1, 'noncommutative': 1}) Top keywords in cluster: Counter({'hep': 158, 'solution': 101, 'bulk': 99, 'string': 99, 'tachyon': 99, 'branes': 99, 'brane': 99, 'phys': 64, 'em': 59, 'math': 59, 'defem': 59, 'pt': 59, 'def': 59, 'symmetry': 5, 'function': 5, 'case': 5, 'term': 5, 'model': 5, 'gauge': 5, 'cosmological': 2, 'sitter': 2, 'entropy': 2, 'horizon': 2, 'hole': 2, 'black': 2}) Top keywords in cluster: Counter({'space': 44, 'theta': 44, 'star': 44, 'quantum': 44, 'noncommutativity': 44, 'commutative': 44, 'noncommutative': 44, 'em': 25, 'math': 25, 'defem': 25, 'phys': 25, 'hep': 25, 'pt': 25, 'def': 25})
Ah, this looks much better. Each cluster is now clearly defined by its most common terms. It looks like 15 clusters was causing very similar clusters to be separated arbitrarily, and 10 is more appropriate.
print(kmeans15.inertia_, kmeans10.inertia_)
6.786605666351621 8.120669184630783
Though the 10-cluster model produces a slightly higher inertia, the difference in inertia between the two models is quite small, and the actual results of the 10-cluster model are more compelling.