Notebook

In [1]:

from operator import itemgetter
from concurrent.futures import ProcessPoolExecutor 

import os 
import gensim
import arxiv
import pandas as pd

import itertools

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import scikitplot

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

from fastFM import sgd

Data preprocessing¶

In [2]:

# Single WordNet lemmatizer instance, created once and reused by `stem`.
lemmatizer = WordNetLemmatizer()


def stem(text):
    """
    Return `text` as processed by the shared lemmatizer's `lemmatize` method.
    """
    lemma = lemmatizer.lemmatize(text)
    return lemma


def map_parallel(f, iterable, **kwargs):
    """
    Apply `f` to every item of `iterable` using a pool of worker processes.

    Extra keyword arguments are forwarded to `ProcessPoolExecutor.map`.
    The pool is shut down (waiting for pending work) before returning, and
    the iterator produced by `map` is handed back to the caller.
    `f` and the items must be picklable to cross the process boundary.
    """
    executor = ProcessPoolExecutor()
    try:
        mapped = executor.map(f, iterable, **kwargs)
    finally:
        # Same effect as leaving a `with` block: wait for workers, then release them.
        executor.shutdown(wait=True)
    return mapped


def retrieve_articles(start, chunksize=1000):
    """
    Fetch one page of arXiv query results.

    `start` is passed as the query's `start` offset and `chunksize` as its
    `max_results`. The query string is read from the module-level
    `search_query`, which must be defined before this function is called.
    """
    page_request = {
        'search_query': search_query,
        'start': start,
        'max_results': chunksize,
    }
    return arxiv.query(**page_request)

Actual text mining functions¶

In [3]:

def vectorize_text(examples_df, categories=None):
    """
    Build bag-of-words features from article summaries and encode their labels.

    Parameters
    ----------
    examples_df : DataFrame
        Must contain a 'summary' column with the text to vectorize.
    categories : sequence of labels, optional
        One label per row of `examples_df`, in the same order. When omitted,
        the module-level `valid_example_categories` is used, which keeps the
        original call `vectorize_text(valid_examples)` working.

    Returns
    -------
    vectorized_data : dict
        'features'      - sparse count matrix from `CountVectorizer(min_df=2)`
        'labels'        - integer-encoded labels reshaped to a column (n, 1)
        'labels_onehot' - dense one-hot encoding of 'labels'
    (ohe, le) : tuple
        The fitted OneHotEncoder and LabelEncoder.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows.
    """
    if categories is None:
        # Backward-compatible fallback to the notebook-level variable.
        categories = valid_example_categories

    # Features and labels are paired purely by position, so a length mismatch
    # would otherwise only surface later (or silently misalign the data).
    if len(categories) != len(examples_df):
        raise ValueError(
            'Got {} labels for {} examples'.format(len(categories), len(examples_df))
        )

    # NOTE(review): the fitted vectorizer is not returned, so new text cannot
    # be transformed with the same vocabulary later.
    vectorizer = CountVectorizer(min_df=2)
    features = vectorizer.fit_transform(examples_df['summary'])

    le = LabelEncoder()
    ohe = OneHotEncoder()
    labels = le.fit_transform(categories).reshape(-1, 1)
    labels_ohe = ohe.fit_transform(labels).todense()
    vectorized_data = {
        'features': features,
        'labels': labels,
        'labels_onehot': labels_ohe
    }
    return vectorized_data, (ohe, le)


def extract_keywords(text):
    """
    Extract keywords from `text` with gensim's TextRank-based `keywords`.

    Lemmatization is enabled and the result is requested in split form
    (`split=True`).
    """
    keyword_options = {'lemmatize': True, 'split': True}
    return gensim.summarization.keywords(text=text, **keyword_options)


def extract_mz_keywords(text):
    """
    Extract keywords with gensim's Montemurro-Zanette implementation.

    The text is first passed through `stem`, then `mz_keywords` is called
    with a block size of 32 and `split=True`.
    """
    # NOTE(review): `stem` is applied to the whole text at once, not per
    # token; a word-level lemmatizer presumably returns multi-word input
    # unchanged - confirm whether per-token lemmatization was intended.
    normalized_text = stem(text)
    return gensim.summarization.mz_keywords(
        text=normalized_text, blocksize=32, split=True
    )

Factorization machine utils¶

In [4]:

class FMClassifier(sgd.FMClassification):
    """
    Wrapper for fastFM estimator that makes it behave like sklearn ones:
    accepts 0/1 targets in `fit` and returns a two-column `predict_proba`.
    """

    def fit(self, X, y, *args):
        """
        Fit on binary targets given as 0/1.

        A copy of `y` is made and its zeros are replaced with -1 before
        delegating to the parent `fit`, so the caller's array is untouched.
        """
        y = y.copy()
        y[y == 0] = -1
        return super(FMClassifier, self).fit(X, y, *args)

    def predict_proba(self, X):
        """
        Return an (n_samples, 2) array of class probabilities.

        Column 1 holds the value produced by the parent `predict_proba`
        (presumably the positive-class probability - confirm against fastFM);
        column 0 holds its complement, following the sklearn convention that
        rows sum to 1. Previously both columns carried the same value.
        """
        positive = np.asarray(super(FMClassifier, self).predict_proba(X)).reshape(-1)
        return np.column_stack((1.0 - positive, positive))
    

def predict_ovr(model, X):
    """
    Predict a single class index per sample from a one-vs-rest model.

    Takes the column with the highest value in `model.predict_proba(X)`
    (the standard OVR `predict` behaves as multilabel prediction).
    """
    class_scores = model.predict_proba(X)
    return np.argmax(class_scores, axis=1)

In [5]:

def filter_out_small_categories(df, categories, threshold=200):
    """
    Drop examples whose category has fewer than `threshold` members.

    Parameters
    ----------
    df : DataFrame
        Examples to filter.
    categories : Series
        Category of each example; used to build a boolean mask applied to
        both `df` and `categories`.
    threshold : int
        Minimum number of examples a category needs in order to be kept.

    Returns
    -------
    (valid_examples, valid_example_categories)
        The rows of `df` and the entries of `categories` that belong to
        sufficiently large categories.
    """
    class_counts = categories.value_counts()
    too_small_classes = class_counts[class_counts < threshold].index

    valid_example_indices = ~categories.isin(too_small_classes)
    valid_examples = df[valid_example_indices]
    valid_example_categories = categories[valid_example_indices]

    return valid_examples, valid_example_categories

Plotting utils¶

In [6]:

def report_classification_confusion_matrix(y, y_pred, label_encoder):
    """
    Print a classification report and plot the confusion matrix.

    Both the predictions and the true labels are mapped back to their
    original names with `label_encoder.inverse_transform`; `y` is flattened
    with `reshape(-1)` first.
    """
    predicted_names = label_encoder.inverse_transform(y_pred)
    true_names = label_encoder.inverse_transform(y.reshape(-1))

    report = classification_report(true_names, predicted_names)
    print(report)

    plot_options = {'hide_zeros': True, 'x_tick_rotation': 90}
    scikitplot.metrics.plot_confusion_matrix(
        true_names, predicted_names, **plot_options
    )
    plt.show()

Load ML articles from arxiv¶

%%time

search_query = 'matrix factorization'

max_n_articles = 10000 chunksize = 1000

we need to use def since lambdas can't be pickled¶

def retrieve_chunk(chunk_start): return arxiv.query( search_query=search_query, start=chunk_start, max_results=chunksize )

result_chunks = list( map_parallel( retrieve_chunk, range(0, max_n_articles, chunksize) ) )

results = list(itertools.chain(*result_chunks))

print('Retrieved {} articles'.format(len(results)))

In [7]:

# Reload `results` from IPython's %store storage; this cell fails if it was
# never stored on this machine.
%store -r results
print('Retrieved {} articles'.format(len(results)))

Retrieved 10000 articles

Display some basic information¶

In [8]:

# Number of articles to preview; reused by the keyword preview cell further down.
n_examples = 20

# Print title, authors, date and abstract of the first `n_examples` articles,
# with the title framed by two rows of asterisks.
for entry in results[:n_examples]:
    print(20 * '*')
    print(entry['title'])
    print(20 * '*')
    print(', '.join(entry['authors']))
    print(entry['date'])
    print(entry['summary'])
    print()

********************
Approximate Method of Variational Bayesian Matrix
  Factorization/Completion with Sparse Prior
********************
Ryota Kawasumi, Koujin Takeda
2018-03-14T13:54:23Z
We derive analytical expression of matrix factorization/completion solution
by variational Bayes method, under the assumption that observed matrix is
originally the product of low-rank dense and sparse matrices with additive
noise. We assume the prior of sparse matrix is Laplace distribution by taking
matrix sparsity into consideration. Then we use several approximations for
derivation of matrix factorization/completion solution. By our solution, we
also numerically evaluate the performance of sparse matrix reconstruction in
matrix factorization, and completion of missing matrix element in matrix
completion.

********************
A New Method of Matrix Spectral Factorization
********************
Gigla Janashia, Edem Lagvilava, Lasha Ephremidze
2009-09-29T15:08:13Z
A new method of matrix spectral factorization is proposed which reliably
computes an approximate spectral factor of any matrix spectral density that
admits spectral factorization

********************
Matrix Factorizations via the Inverse Function Theorem
********************
Paul W. Y. Lee
2014-08-12T03:29:00Z
We give proofs of QR factorization, Cholesky's factorization, and LDU
factorization using the inverse function theorem. As a consequence, we obtain
analytic dependence of these matrix factorizations which does not follow
immediately using Gaussian elimination.

********************
The Reciprocal Pascal Matrix
********************
Thomas M. Richardson
2014-05-24T16:16:58Z
The reciprocal Pascal matrix is the Hadamard inverse of the symmetric Pascal
matrix. We show that the ordinary matrix inverse of the reciprocal Pascal
matrix has integer elements. The proof uses two factorizations of the matrix of
super Catalan numbers.

********************
Invariance properties of thematic factorizations of matrix functions
********************
R. B. Alexeev, V. V. Peller
2001-01-26T21:54:16Z
We study the problem of invariance of indices of thematic factorizations.
Such factorizations were introduced in [PY1] for studying superoptimal
approximation by bounded analytic matrix functions. As shown in [PY1], the
indices may depend on the choice of a thematic factorization. We introduce the
notion of a monotone thematic factorization. The main result shows that under
natural assumptions a matrix function that admits a thematic factorization also
admits a monotone thematic factorization and the indices of a monotone thematic
factorization are uniquely determined by the matrix function itself. We obtain
similar results for so-called partial thematic factorizations.

********************
Online Matrix Factorization via Broyden Updates
********************
Ömer Deniz Akyıldız
2015-06-26T07:11:17Z
In this paper, we propose an online algorithm to compute matrix
factorizations. Proposed algorithm updates the dictionary matrix and associated
coefficients using a single observation at each time. The algorithm performs
low-rank updates to dictionary matrix. We derive the algorithm by defining a
simple objective function to minimize whenever an observation is arrived. We
extend the algorithm further for handling missing data. We also provide a
mini-batch extension which enables to compute the matrix factorization on big
datasets. We demonstrate the efficiency of our algorithm on a real dataset and
give comparisons with well-known algorithms such as stochastic gradient matrix
factorization and nonnegative matrix factorization (NMF).

********************
Matrix factorizations and intertwiners of the fundamental
  representations of quantum group U_q (sl_n)
********************
Yasuyoshi Yonezawa
2008-06-30T17:13:00Z
We want to construct a homological link invariant whose Euler characteristic
is MOY polynomial as Khovanov and Rozansky constructed a categorification of
HOMFLY polynomial. The present paper gives the first step to construct a
categorification of MOY polynomial. For the essential colored planar diagrams
with additional data which is a sequence naturally induced by coloring, we
define matrix factorizations, and then we define a matrix factorization for
planar diagram obtained by gluing the essential colored planar diagrams as
tensor product of the matrix factorizations for the essential planar diagrams.
Moreover, we show that some matrix factorizations deribed from tensor product
of the essential matrix factorizations have homotopy equivalences corresponding
to MOY relations.

********************
Fundamental matrix factorization in the FJRW-theory revisited
********************
Alexander Polishchuk
2017-12-26T21:08:40Z
We present an improved construction of the fundamental matrix factorization
in the FJRW-theory given in arXiv:1105.2903. The revised construction is
coordinate-free and works for a possibly nonabelian finite group of symmetries.
One of the new ingrediants is the category of dg-matrix factorizations over a
dg-scheme.

********************
Matrix factorizations and double line in $\mathfrak{sl}_n$ quantum link
  invariant
********************
Yasuyoshi Yonezawa
2007-03-28T07:26:02Z
This article gives matrix factorizations for the trivalent diagrams and
double line appearing in $\mathfrak{sl}_n$ quantum link invariant.
  These matrix factorizations reconstruct Khovanov-Rozansky homology. And we
show that the Euler characteristic of the matrix factorization for a double
loop equals the quantum dimension of the representation $\land^2 V$ of $U_q
(\mathfrak{sl}_n)$ in Section \ref{sec3.3}.

********************
Finiteness of small factor analysis models
********************
Mathias Drton, Han Xiao
2009-08-12T15:42:31Z
We consider small factor analysis models with one or two factors. Fixing the
number of factors, we prove a finiteness result about the covariance matrix
parameter space when the size of the covariance matrix increases. According to
this result, there exists a distinguished matrix size starting at which one can
determine whether a given covariance matrix belongs to the parameter space by
determining whether all principal submatrices of the distinguished size belong
to the corresponding parameter space. We show that the distinguished matrix
size is equal to four in the one-factor model and six with two factors.

********************
Stochastic Matrix Factorization
********************
Christopher Adams
2016-09-19T15:19:44Z
This paper considers a restriction to non-negative matrix factorization in
which at least one matrix factor is stochastic. That is, the elements of the
matrix factors are non-negative and the columns of one matrix factor sum to 1.
This restriction includes topic models, a popular method for analyzing
unstructured data. It also includes a method for storing and finding pictures.
The paper presents necessary and sufficient conditions on the observed data
such that the factorization is unique. In addition, the paper characterizes
natural bounds on the parameters for any observed data and presents a
consistent least squares estimator. The results are illustrated using a topic
model analysis of PhD abstracts in economics and the problem of storing and
retrieving a set of pictures of faces.

********************
Simulated Annealing with Levy Distribution for Fast Matrix
  Factorization-Based Collaborative Filtering
********************
Mostafa A. Shehata, Mohammad Nassef, Amr A. Badr
2017-08-09T15:14:54Z
Matrix factorization is one of the best approaches for collaborative
filtering, because of its high accuracy in presenting users and items latent
factors. The main disadvantages of matrix factorization are its complexity, and
being very hard to be parallelized, specially with very large matrices. In this
paper, we introduce a new method for collaborative filtering based on Matrix
Factorization by combining simulated annealing with levy distribution. By using
this method, good solutions are achieved in acceptable time with low
computations, compared to other methods like stochastic gradient descent,
alternating least squares, and weighted non-negative matrix factorization.

********************
Primitive factorizations, Jucys-Murphy elements, and matrix models
********************
Sho Matsumoto, Jonathan Novak
2010-05-02T17:46:10Z
A factorization of a permutation into transpositions is called "primitive" if
its factors are weakly ordered. We discuss the problem of enumerating primitive
factorizations of permutations, and its place in the hierarchy of previously
studied factorization problems. Several formulas enumerating minimal primitive
and possibly non-minimal primitive factorizations are presented, and
interesting connections with Jucys-Murphy elements, symmetric group characters,
and matrix models are described.

********************
Localization of Matrix Factorizations
********************
Ilya Krishtal, Thomas Strohmer, Tim Wertz
2013-05-07T19:55:06Z
Matrices with off-diagonal decay appear in a variety of fields in mathematics
and in numerous applications, such as signal processing, statistics,
communications engineering, condensed matter physics, and quantum chemistry.
Numerical algorithms dealing with such matrices often take advantage
(implicitly or explicitly) of the empirical observation that this off-diagonal
decay property seems to be preserved when computing various useful matrix
factorizations, such as the Cholesky factorization or the QR-factorization.
There is a fairly extensive theory describing when the inverse of a matrix
inherits the localization properties of the original matrix. Yet, except for
the special case of band matrices, surprisingly very little theory exists that
would establish similar results for matrix factorizations. We will derive a
comprehensive framework to rigorously answer the question when and under which
conditions the matrix factors inherit the localization of the original matrix
for such fundamental matrix factorizations as the LU-, QR-, Cholesky, and Polar
factorization.

********************
Monotone thematic factorizations of matrix functions
********************
Alberto A. Condori
2009-08-28T20:29:13Z
We continue the study of the so-called thematic factorizations of admissible
very badly approximable matrix functions. These factorizations were introduced
by V.V. Peller and N.J. Young for studying superoptimal approximation by
bounded analytic matrix functions. Even though thematic indices associated with
a thematic factorization of an admissible very badly approximable matrix
function are not uniquely determined by the function itself, R.B. Alexeev and
V.V. Peller showed that the thematic indices of any monotone non-increasing
thematic factorization of an admissible very badly approximable matrix function
are uniquely determined. In this paper, we prove the existence of monotone
non-decreasing thematic factorizations for admissible very badly approximable
matrix functions. It is also shown that the thematic indices appearing in a
monotone non-decreasing thematic factorization are not uniquely determined by
the matrix function itself. Furthermore, we show that the monotone
non-increasing thematic factorization gives rise to a great number of other
thematic factorizations.

********************
Badly approximable matrix functions and canonical factorizations
********************
R. B. Alexeev, V. V. Peller
2001-01-26T22:08:33Z
We continue studying the problem of analytic approximation of matrix
functions. We introduce the notion of a partial canonical factorization of a
badly approximable matrix function $\Phi$ and the notion of a canonical
factorization of a very badly approximable matrix function $\Phi$. Such
factorizations are defined in terms of so-called balanced unitary-valued
functions which have many remarkable properties. Unlike the case of thematic
factorizations studied earlier in [PY1], [PY2], [PT], [AP1], the factors in
canonical factorizations (as well as partial canonical factorizations) are
uniquely determined by the matrix function $\Phi$ up to constant unitary
factors. We study many properties of canonical factorizations. In particular we
show that under certain natural assumptions on a function space $X$ the
condition $\Phi\in X$ implies that all factors in a canonical factorization of
$\Phi$ belong to the same space $X$. In the last section we characterize the
very badly approximable unitary-valued functions $U$ that satisfy the condition
$\|H_U\|_{\text e}<1$.

********************
Nonnegative Matrix Factorization Requires Irrationality
********************
Dmitry Chistikov, Stefan Kiefer, Ines Marušić, Mahsa Shirmohammadi, James Worrell
2017-03-22T22:03:17Z
Nonnegative matrix factorization (NMF) is the problem of decomposing a given
nonnegative $n \times m$ matrix $M$ into a product of a nonnegative $n \times
d$ matrix $W$ and a nonnegative $d \times m$ matrix $H$. A longstanding open
question, posed by Cohen and Rothblum in 1993, is whether a rational matrix $M$
always has an NMF of minimal inner dimension $d$ whose factors $W$ and $H$ are
also rational. We answer this question negatively, by exhibiting a matrix for
which $W$ and $H$ require irrational entries.

********************
Spectral Factorization of Rank-Deficient Polynomial Matrix-Functions
********************
Lasha Ephremidze, Edem Lagvilava
2010-08-18T16:04:13Z
A spectral factorization theorem is proved for polynomial rank-deficient
matrix-functions. The theorem is used to construct paraunitary matrix-functions
with first rows given.

********************
From-Below Approximations in Boolean Matrix Factorization: Geometry and
  New Algorithm
********************
Radim Belohlavek, Martin Trnecka
2013-06-20T15:19:22Z
We present new results on Boolean matrix factorization and a new algorithm
based on these results. The results emphasize the significance of
factorizations that provide from-below approximations of the input matrix.
While the previously proposed algorithms do not consider the possibly different
significance of different matrix entries, our results help measure such
significance and suggest where to focus when computing factors. An experimental
evaluation of the new algorithm on both synthetic and real data demonstrates
its good performance in terms of good coverage by the first k factors as well
as a small number of factors needed for exact decomposition and indicates that
the algorithm outperforms the available ones in these terms. We also propose
future research topics.

********************
Necessary And Sufficient Conditions For Existence of the LU
  Factorization of an Arbitrary Matrix
********************
Pavel Okunev, Charles R. Johnson
2005-06-19T23:10:13Z
If $A$ is an n-by-n matrix over a field $F$ ($A\in M_{n}(F)$), then $A$ is
said to ``have an LU factorization'' if there exists a lower triangular matrix
$L\in M_{n}(F)$ and an upper triangular matrix $U\in M_{n}(F)$ such that
$$A=LU.$$ We give necessary and sufficient conditions for LU factorability of a
matrix. Also simple algorithm for computing an LU factorization is given. It is
an extension of the Gaussian elimination algorithm to the case of not
necessarily invertible matrices. We consider possibilities to factors a matrix
that does not have an LU factorization as the product of an ``almost lower
triangular'' matrix and an ``almost upper triangular'' matrix. There are many
ways to formalize what almost means. We consider some of them and derive
necessary and sufficient conditions. Also simple algorithms for computing of an
``almost LU factorization'' are given.

In [9]:

# One row per retrieved article, one column per field of the result records.
articles_df = pd.DataFrame(results)

In [11]:

# NOTE(review): this rebinds `articles_df`, discarding the frame built from
# `results` in the previous cell; the execution counts around here are also
# out of order. Decide which source is authoritative - TODO confirm.
articles_df = pd.read_json('matrix_factorization_arxiv_query_result.json')

In [14]:

# List the available article fields.
articles_df.columns

Out[14]:

Index(['affiliation', 'arxiv_comment', 'arxiv_primary_category', 'arxiv_url',
       'author', 'author_detail', 'authors', 'doi', 'guidislink', 'id',
       'journal_reference', 'links', 'pdf_url', 'published',
       'published_parsed', 'summary', 'summary_detail', 'tags', 'title',
       'title_detail', 'updated', 'updated_parsed'],
      dtype='object')

In [11]:

# Peek at the first rows of the article table.
articles_df.head()

Out[11]:

	affiliation	arxiv_comment	arxiv_primary_category	arxiv_url	author	author_detail	authors	doi	guidislink	id	...	pdf_url	published	published_parsed	summary	summary_detail	tags	title	title_detail	updated	updated_parsed
0	None	22 pages, 4 figures, part of this work was pre...	{'term': 'eess.SP', 'scheme': 'http://arxiv.or...	http://arxiv.org/abs/1803.06234v1	Koujin Takeda	{'name': 'Koujin Takeda'}	[Ryota Kawasumi, Koujin Takeda]	None	True	http://arxiv.org/abs/1803.06234v1	...	http://arxiv.org/pdf/1803.06234v1	2018-03-14T13:54:23Z	(2018, 3, 14, 13, 54, 23, 2, 73, 0)	We derive analytical expression of matrix fact...	{'type': 'text/plain', 'language': None, 'base...	[{'term': 'eess.SP', 'scheme': 'http://arxiv.o...	Approximate Method of Variational Bayesian Mat...	{'type': 'text/plain', 'language': None, 'base...	2018-03-14T13:54:23Z	(2018, 3, 14, 13, 54, 23, 2, 73, 0)
1	None	23 pages	{'term': 'math.CV', 'scheme': 'http://arxiv.or...	http://arxiv.org/abs/0909.5361v1	Lasha Ephremidze	{'name': 'Lasha Ephremidze'}	[Gigla Janashia, Edem Lagvilava, Lasha Ephremi...	None	True	http://arxiv.org/abs/0909.5361v1	...	http://arxiv.org/pdf/0909.5361v1	2009-09-29T15:08:13Z	(2009, 9, 29, 15, 8, 13, 1, 272, 0)	A new method of matrix spectral factorization ...	{'type': 'text/plain', 'language': None, 'base...	[{'term': 'math.CV', 'scheme': 'http://arxiv.o...	A New Method of Matrix Spectral Factorization	{'type': 'text/plain', 'language': None, 'base...	2009-09-29T15:08:13Z	(2009, 9, 29, 15, 8, 13, 1, 272, 0)
2	None	6 pages	{'term': 'math.CA', 'scheme': 'http://arxiv.or...	http://arxiv.org/abs/1408.2611v1	Paul W. Y. Lee	{'name': 'Paul W. Y. Lee'}	[Paul W. Y. Lee]	None	True	http://arxiv.org/abs/1408.2611v1	...	http://arxiv.org/pdf/1408.2611v1	2014-08-12T03:29:00Z	(2014, 8, 12, 3, 29, 0, 1, 224, 0)	We give proofs of QR factorization, Cholesky's...	{'type': 'text/plain', 'language': None, 'base...	[{'term': 'math.CA', 'scheme': 'http://arxiv.o...	Matrix Factorizations via the Inverse Function...	{'type': 'text/plain', 'language': None, 'base...	2014-08-12T03:29:00Z	(2014, 8, 12, 3, 29, 0, 1, 224, 0)
3	None	None	{'term': 'math.CO', 'scheme': 'http://arxiv.or...	http://arxiv.org/abs/1405.6315v1	Thomas M. Richardson	{'name': 'Thomas M. Richardson'}	[Thomas M. Richardson]	None	True	http://arxiv.org/abs/1405.6315v1	...	http://arxiv.org/pdf/1405.6315v1	2014-05-24T16:16:58Z	(2014, 5, 24, 16, 16, 58, 5, 144, 0)	The reciprocal Pascal matrix is the Hadamard i...	{'type': 'text/plain', 'language': None, 'base...	[{'term': 'math.CO', 'scheme': 'http://arxiv.o...	The Reciprocal Pascal Matrix	{'type': 'text/plain', 'language': None, 'base...	2014-05-24T16:16:58Z	(2014, 5, 24, 16, 16, 58, 5, 144, 0)
4	None	20 pages	{'term': 'math.FA', 'scheme': 'http://arxiv.or...	http://arxiv.org/abs/math/0101182v2	V. V. Peller	{'name': 'V. V. Peller'}	[R. B. Alexeev, V. V. Peller]	None	True	http://arxiv.org/abs/math/0101182v2	...	http://arxiv.org/pdf/math/0101182v2	2001-01-22T23:32:55Z	(2001, 1, 22, 23, 32, 55, 0, 22, 0)	We study the problem of invariance of indices ...	{'type': 'text/plain', 'language': None, 'base...	[{'term': 'math.FA', 'scheme': 'http://arxiv.o...	Invariance properties of thematic factorizatio...	{'type': 'text/plain', 'language': None, 'base...	2001-01-26T21:54:16Z	(2001, 1, 26, 21, 54, 16, 4, 26, 0)

5 rows × 22 columns

arXiv categories¶

In [12]:

# Primary category term of each article, taken from the 'term' entry of the
# `arxiv_primary_category` record.
categories = articles_df['arxiv_primary_category'].map(itemgetter('term'))

# Top-level category: the part of the term before the first '.', and of that
# the part before the first '-'.
main_categories = categories.map(
    lambda term: term.partition('.')[0].partition('-')[0]
)

Top-level categories

In [13]:

# Number of articles per top-level category, sorted ascending before plotting
# as a horizontal bar chart.
main_categories_counts = main_categories.value_counts(ascending=True)
main_categories_counts.plot.barh()
plt.show()

Top-level categories with more than 200 papers

In [14]:

# Same chart restricted to categories with more than 200 articles.
# NOTE(review): 200 matches the default `threshold` of
# `filter_out_small_categories`, but that function keeps counts >= threshold
# while this plot uses a strict >.
main_categories_counts[main_categories_counts > 200].plot.barh()
plt.show()

In [15]:

# The 10 most frequent full (non-truncated) category terms: counts are sorted
# ascending, so the last 10 entries are the largest.
categories.value_counts(ascending=True)[-10:].plot.barh()
plt.show()

Extract keywords from summaries¶

In [16]:

%%time

# Extract keywords from every summary in parallel worker processes and store
# them as a new `summary_keywords` column (this modifies `articles_df` in place).
articles_df['summary_keywords'] = list(
    map_parallel(extract_keywords, articles_df['summary'])
)

CPU times: user 5.37 s, sys: 611 ms, total: 5.99 s
Wall time: 1min 40s

In [17]:

# Show the extracted keywords under each title for the first `n_examples` rows.
# The row index is not needed; it is bound to `_index` rather than `__`, which
# IPython uses for its output history.
for _index, row in itertools.islice(articles_df.iterrows(), n_examples):
    print(20 * '*')
    print(row['title'])
    print(20 * '*')
    print('keywords:', row['summary_keywords'])
    print()

********************
Approximate Method of Variational Bayesian Matrix
  Factorization/Completion with Sparse Prior
********************
keywords: ['matrix', 'analytical', 'bayes', 'completion']

********************
A New Method of Matrix Spectral Factorization
********************
keywords: ['spectral', 'method']

********************
Matrix Factorizations via the Inverse Function Theorem
********************
keywords: ['factorizations', 'function']

********************
The Reciprocal Pascal Matrix
********************
keywords: ['pascal matrix']

********************
Invariance properties of thematic factorizations of matrix functions
********************
keywords: ['thematic', 'results', 'superoptimal', 'matrix']

********************
Online Matrix Factorization via Broyden Updates
********************
keywords: ['matrix', 'algorithms', 'missing', 'objective', 'dataset']

********************
Matrix factorizations and intertwiners of the fundamental
  representations of quantum group U_q (sl_n)
********************
keywords: ['matrix', 'equivalences', 'link', 'paper', 'naturally', 'planar']

********************
Fundamental matrix factorization in the FJRW-theory revisited
********************
keywords: ['construction', 'matrix']

********************
Matrix factorizations and double line in $\mathfrak{sl}_n$ quantum link
  invariant
********************
keywords: ['ref', 'quantum link', 'line', 'loop']

********************
Finiteness of small factor analysis models
********************
keywords: ['matrix parameter', 'factors', 'size']

********************
Stochastic Matrix Factorization
********************
keywords: ['paper', 'unstructured', 'data', 'factors', 'model']

********************
Simulated Annealing with Levy Distribution for Fast Matrix
  Factorization-Based Collaborative Filtering
********************
keywords: ['methods', 'computations', 'simulated', 'filtering', 'non', 'latent', 'descent', 'good']

********************
Primitive factorizations, Jucys-Murphy elements, and matrix models
********************
keywords: ['primitive', 'factorizations', 'enumerating', 'minimal']

********************
Localization of Matrix Factorizations
********************
keywords: ['matrix', 'factors', 'decay', 'extensive theory', 'numerical', 'similar', 'matrices', 'properties']

********************
Monotone thematic factorizations of matrix functions
********************
keywords: ['thematic', 'matrix', 'approximation', 'indices', 'non']

********************
Badly approximable matrix functions and canonical factorizations
********************
keywords: ['unitary', 'factors', 'approximable', 'function', 'study']

********************
Nonnegative Matrix Factorization Requires Irrationality
********************
keywords: ['question', 'matrix', 'irrational']

********************
Spectral Factorization of Rank-Deficient Polynomial Matrix-Functions
********************
keywords: ['matrix', 'factorization']

********************
From-Below Approximations in Boolean Matrix Factorization: Geometry and
  New Algorithm
********************
keywords: ['matrix', 'algorithms', 'different', 'factors', 'new results', 'propose']

********************
Necessary And Sufficient Conditions For Existence of the LU
  Factorization of an Arbitrary Matrix
********************
keywords: ['triangular', 'invertible', 'elimination']

In [18]:

# Number of extracted keywords per article.
article_keyword_lengths = articles_df['summary_keywords'].apply(len)

In [19]:

# One histogram bin per possible keyword count; at least one bin is requested
# so the call does not fail when no article has any keywords.
article_keyword_lengths.plot.hist(
    bins=max(int(article_keyword_lengths.max()), 1),
    title='Number of summary keywords'
)
# Render the figure like the other plotting cells do, without echoing the Axes repr.
plt.show()

Out[19]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fd30d1fccf8>

Try to predict tags given summaries¶

In [20]:

# Keep only articles whose top-level category is large enough (default
# threshold of `filter_out_small_categories`), then show the remaining shape.
valid_examples, valid_example_categories = filter_out_small_categories(articles_df, main_categories)
valid_examples.shape

Out[20]:

(9403, 23)

In [21]:

# Bag-of-words features plus encoded labels, and the fitted encoders.
# NOTE(review): `vectorize_text` takes its labels from the global
# `valid_example_categories` defined in the previous cell, so that cell must
# have been run first.
vectorized_data, (ohe, le) = vectorize_text(valid_examples)

We'll use a Factorization Machine (FM) model. FMs are well suited to such high-dimensional sparse data. The implementation comes from the fastFM library.

Note the OneVsRestClassifier wrapper: we have to use it here because fastFM doesn't support multiclass classification.

In [22]:

# Binary factorization machine trained with SGD; the hyperparameters are
# passed straight through to the fastFM estimator.
fm = FMClassifier(
    rank=50,
    n_iter=10000,
    step_size=0.0001,
    l2_reg_w=0.01,
    l2_reg_V=0.01
)
# One binary FM per class, since the base estimator is binary only.
fm_multiclass = OneVsRestClassifier(fm)

In [23]:

# Split features, one-hot labels and integer labels together so the three
# stay row-aligned. 20% of the data is held out, stratified on the integer
# labels, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test, y_train_labels, y_test_labels = train_test_split(
    vectorized_data['features'],
    vectorized_data['labels_onehot'],
    vectorized_data['labels'],
    stratify=vectorized_data['labels'],
    test_size=0.2,
    random_state=0
)

In [24]:

%%time

# Train the one-vs-rest model on the one-hot encoded targets.
fm_multiclass.fit(x_train, y_train)

CPU times: user 14 s, sys: 171 ms, total: 14.1 s
Wall time: 12.5 s

Out[24]:

OneVsRestClassifier(estimator=FMClassifier(init_stdev=0.1, l2_reg=None, l2_reg_V=0.01, l2_reg_w=0.01,
       n_iter=10000, random_state=123, rank=50, step_size=0.0001),
          n_jobs=1)

In [25]:

# Predicted class index (argmax over per-class probabilities) for the test set.
y_test_pred = predict_ovr(fm_multiclass, x_test)

In [26]:

# Accuracy on the train and test splits. The adjacent literals '\n' and
# 'test score: ' are concatenated by Python into a single argument, which
# puts the test score on its own line.
print(
    'train score:', accuracy_score(y_train_labels, predict_ovr(fm_multiclass, x_train)), '\n'
    'test score: ', accuracy_score(y_test_labels, y_test_pred)
)

train score: 0.719090667376 
test score:  0.658692185008

Confusion matrix¶

In [27]:

# Per-class report and confusion matrix on the test split, with label names
# recovered through the fitted LabelEncoder.
report_classification_confusion_matrix(y_test_labels, y_test_pred, le)

             precision    recall  f1-score   support

       cond       0.52      0.61      0.56       176
         cs       0.53      0.60      0.56       309
        hep       0.82      0.86      0.84       579
       math       0.63      0.77      0.69       542
       nucl       0.62      0.30      0.41        50
    physics       0.33      0.02      0.03        57
      quant       0.53      0.16      0.25        62
       stat       0.54      0.07      0.12       106

avg / total       0.64      0.66      0.63      1881