## package install
!pip install arxiv nltk gensim textgenrnn pyLDAvis
## import the necessary packages on the fly instead of all at once,
## to make each step clearer
from arxiv import query
## get some metadata of recent papers from arxiv
## papers from strongly correlated electrons
lst_str_el = query(search_query='cat:cond-mat.str-el',start=0, max_results=2000)
## papers from statistical mechanics
lst_stat_mech = query(search_query='cat:cond-mat.stat-mech',start=0, max_results=2000)
## papers from quantum mechanics
lst_quant_ph = query(search_query='cat:quant-ph',start=0, max_results=2000)
## papers from artificial intelligence
lst_cs_ai = query(search_query='cat:cs.AI',start=0, max_results=1000)
## papers from astrophysics
lst_astro_ph = query(search_query='cat:astro-ph',start=0, max_results=1000)
For details on fetching metadata from arXiv, refer to the arxiv package repository and the user manual of the arXiv API. For a wrapper with more functionality, please check my project arxiv-analysis.
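Each entry returned by query is a plain dict of metadata. A quick look at the fields used below (a sketch; the exact keys depend on the version of the arxiv package):
sample = lst_str_el[0]
for key in ('title', 'summary', 'published'):
    print(key, '->', str(sample[key])[:60])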
import requests
## download a stop list better suited to research papers (the SMART stop list)
stoplist = requests.get("https://raw.githubusercontent.com/refraction-ray/arxiv-analysis/master/arxivanalysis/SmartStopList.txt")
stoplist = stoplist.text.split("\n")
len(stoplist)
768
## some utility functions
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
stop = set(stoplist)
exclude = set(string.punctuation)
lemma = WordNetLemmatizer() ## used to get the original form of a word
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    exclude.add(" ")
    for i in range(10):
        exclude.add(str(i))
    stop_free = re.subn("|".join([re.escape(e) for e in exclude]), "|", stop_free)[0]
    stop_free = stop_free.split("|")
    punc_free = [ch for ch in stop_free if len(ch) > 2]  ## omit all tokens of length no more than 2
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free)
    return normalized
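A quick sanity check of clean on a made-up sentence: the text is lowercased, stop words, digits and punctuation are stripped, short tokens are dropped, and the rest is lemmatized.
clean("We study the 2D quantum phase transitions of strongly correlated electrons.")
## roughly: 'study quantum phase transition strongly correlated electron'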
## download the data for lemmatizer
import nltk
nltk.download('wordnet')
## prepare the clean docs for cond-mat.str-el (the title is counted twice to give it more weight)
doc_clean = [clean(doc['title']+'. '+doc['title']+'. '+doc['summary']).split() for doc in lst_str_el]
## one can also add papers from other categories to the document set
doc_clean += [clean(doc['title']+'. '+doc['title']+'. '+doc['summary']).split() for doc in lst_quant_ph]
import gensim
from gensim import corpora
## make the corpus
## first merge phrases, since jargon matters a lot in research papers
bigram = gensim.models.Phrases(doc_clean, min_count=3, threshold=5)
## merge again to form a_b_c-type tokens
trigram = gensim.models.Phrases(bigram[doc_clean], threshold=2)
## create the dictionary for word id mapping
dictionary = corpora.Dictionary(trigram[doc_clean])
## filter extreme values: keep tokens appearing in at least 10 documents but in no more than 10% of them
dictionary.filter_extremes(no_below=10,no_above=0.1)
corpus = [dictionary.doc2bow(doc) for doc in trigram[doc_clean]]
## use tfidf to adjust the weight of words
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
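To see what the reweighting does, one can peek at the heaviest tf-idf terms of a single document (a sketch for inspection only):
for word_id, weight in sorted(tfidf[corpus[0]], key=lambda x: -x[1])[:5]:
    print(dictionary[word_id], round(weight, 3))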
## train the word2vec model
w2vmodel = gensim.models.Word2Vec(trigram[doc_clean], size=50, window=6, min_count=10)
/usr/local/lib/python3.6/dist-packages/gensim/models/phrases.py:598: UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class
  warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
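The warning is worth heeding on larger corpora: freezing a trained Phrases model into a Phraser gives the same token merging with less memory and faster lookup. A minimal sketch mirroring the pipeline above:
from gensim.models.phrases import Phraser
trigram_fast = Phraser(trigram)  ## frozen, read-only phrase detector
doc_phrased = [trigram_fast[doc] for doc in doc_clean]  ## same tokens as trigram[doc_clean]
## (applying Phraser(bigram) first would be needed to actually produce a_b_c tokens)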
## evaluate the model
w2vmodel.wv.similar_by_word("algorithm"), w2vmodel.wv.similar_by_word("superconductor")
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):
([('efficient', 0.9989340305328369), ('universal', 0.9984945058822632), ('computation', 0.9984763860702515), ('quantum_algorithm', 0.9984354972839355), ('machine', 0.9982243776321411), ('simple', 0.9982030987739563), ('quantum_code', 0.9981838464736938), ('computer', 0.9981072545051575), ('polynomial_time', 0.9980635046958923), ('fourier_transform', 0.9980482459068298)],
 [('first', 0.9994072914123535), ('oxide', 0.9993759393692017), ('spin_polarization', 0.9993758201599121), ('ferromagnetic_transition', 0.9993693232536316), ('stable', 0.9993599057197571), ('charge_order', 0.9993472695350647), ('quantitatively', 0.9993464350700378), ('ferromagnetic_phase', 0.9993222951889038), ('nature', 0.9992976188659668), ('investigation', 0.9992893934249878)])
## pick out the odd one out
w2vmodel.wv.doesnt_match("syk holography otoc scrambling weyl".split())
'weyl'
w2vmodel.wv.similarity("superconductor", "high_temperature"), w2vmodel.wv.similarity("quantum", "high_temperature")
(0.9968866, 0.7526422)
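These similarity scores are just the cosine of the two word vectors, which a couple of numpy lines can verify (a sketch):
import numpy as np
v1, v2 = w2vmodel.wv["superconductor"], w2vmodel.wv["high_temperature"]
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))  ## ~0.997, matching similarity() above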
## predict the word based on the context
w2vmodel.predict_output_word("localization entanglement_entropy quench dynamics otoc".split(),topn=30)
[('classical', 0.00086197956), ('quantum_mechanic', 0.0008123985), ('time', 0.0007668873), ('problem', 0.0006764776), ('measurement', 0.0006719102), ('particle', 0.0006466909), ('equation', 0.00064092746), ('function', 0.0006380455), ('operator', 0.00063431147), ('number', 0.0006099663), ('potential', 0.000589651), ('discussed', 0.0005892991), ('shown', 0.0005805112), ('general', 0.00057841814), ('term', 0.00057534914), ('structure', 0.00057245494), ('experiment', 0.00056691194), ('dynamic', 0.00056628825), ('energy', 0.00055901625), ('simple', 0.0005562697), ('one', 0.00055271725), ('paper', 0.000549292), ('error', 0.000547884), ('scheme', 0.0005394369), ('process', 0.0005315003), ('density_matrix', 0.0005153318), ('decoherence', 0.0005099839), ('quantum_state', 0.000509113), ('quantum_computer', 0.0005075995), ('type', 0.0005058143)]
## a simple task: automatically separate CS papers from physics papers
doc_clean = [clean(doc['title']+'. '+doc['title']+'. '+doc['summary']).split() for doc in lst_str_el]
doc_clean += [clean(doc['title']+'. '+doc['title']+'. '+doc['summary']).split() for doc in lst_cs_ai]
bigram = gensim.models.Phrases(doc_clean, min_count=3, threshold=5)
trigram = gensim.models.Phrases(bigram[doc_clean], threshold=2)
dictionary = corpora.Dictionary(trigram[doc_clean])
dictionary.filter_extremes(no_below=10,no_above=0.1)
corpus = [dictionary.doc2bow(doc) for doc in trigram[doc_clean]]
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=2, id2word = dictionary, passes=5, iterations=100)
ldamodel.log_perplexity(corpus_tfidf)
-7.977682440999339
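Before visualizing, one can print the top words of the two learned topics directly; in a sketch like this (output omitted), one topic should collect condensed-matter jargon and the other AI jargon:
for topic_id, words in ldamodel.print_topics(num_topics=2, num_words=8):
    print(topic_id, words)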
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus_tfidf, dictionary)
vis
bin1 = []
bin0 = []
for i, c in enumerate(doc_clean):
    if max(ldamodel[corpus_tfidf[i]], key=lambda x: x[1])[0] == 0:
        bin0.append(i)
    elif max(ldamodel[corpus_tfidf[i]], key=lambda x: x[1])[0] == 1:
        bin1.append(i)
len([1 for i in bin0 if i>=2000]),len([1 for i in bin1 if i<2000]) ## the unsupervised topic classification is nearly perfect
(2, 6)
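The counts can be turned into an explicit accuracy. Documents 0-1999 come from cond-mat.str-el and the rest from cs.AI, so (assuming topic 0 caught the physics papers, as the small counts above suggest):
mislabeled = len([1 for i in bin0 if i >= 2000]) + len([1 for i in bin1 if i < 2000])
print(1 - mislabeled/len(doc_clean))  ## ~0.997 with the counts above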
## what about LDA within one subject?
doc_clean = doc_clean[:2000]
bigram = gensim.models.Phrases(doc_clean, min_count=3, threshold=5)
trigram = gensim.models.Phrases(bigram[doc_clean], threshold=2)
dictionary = corpora.Dictionary(trigram[doc_clean])
dictionary.filter_extremes(no_below=5,no_above=0.1)
corpus = [dictionary.doc2bow(doc) for doc in trigram[doc_clean]]
tfidf = gensim.models.tfidfmodel.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=200, id2word = dictionary, passes=30, iterations=500)
vis = pyLDAvis.gensim.prepare(ldamodel, corpus_tfidf, dictionary)
vis
from gensim import similarities
index = similarities.MatrixSimilarity(ldamodel[corpus_tfidf])
j = 1629
c = lst_str_el[j]
query_text = clean(c['title']+". "+c['title']+". "+c["summary"])
vec_bow = dictionary.doc2bow(trigram[query_text.split()])
vec_lda = ldamodel[tfidf[vec_bow]] ## convert the query to the LDA topic space
sims = index[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(list(enumerate(sims))[:10])
[(0, (1629, 0.9999994)), (1, (1862, 0.84906775)), (2, (220, 0.7793414)), (3, (266, 0.7751794)), (4, (112, 0.7729919)), (5, (626, 0.7641656)), (6, (144, 0.7625283)), (7, (1428, 0.7625283)), (8, (1677, 0.7493693)), (9, (1095, 0.71349245))]
lst_str_el[1629]['title'],lst_str_el[1862]['title']
('Stabilization of A-type layered antiferromagnetic phase in LaMnO_3 by\n cooperative Jahn-Teller deformations', 'Orbitally Degenerate Spin-1 Model for Insulating V2O3')
from textgenrnn import textgenrnn
long_text = " ".join([doc['title']+'|' for doc in lst_astro_ph])
long_text = re.subn("\n", "",long_text)[0]
long_text = re.subn("  ", " ", long_text)[0] ## collapse the double spaces left over from wrapped titles
long_text = [line.strip(" ") for line in long_text.split("|")]
textgen = textgenrnn()
textgen.train_on_texts(long_text,num_epochs=20)
textgen.generate(5,temperature=0.3)
A Spectroscopic Method for Expanding the Formation of Shapes of thher Axis
The Pressure of Models of Galaxy Clusters
The Three--Point Correlation Function of Galaxies and $$-Ray Bursts
The Angular Distribution of Spiral Galaxies
The Cosmic Background Radiation from Cosmological Perturbations in the Cosmic Microwave Background
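The temperature argument trades faithfulness for novelty; sampling again with a higher value (a sketch, output omitted) gives wilder and usually less grammatical titles:
textgen.generate(5, temperature=1.0)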