from operator import itemgetter
from concurrent.futures import ProcessPoolExecutor
import os
import gensim
import arxiv
import pandas as pd
import itertools
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scikitplot
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from fastFM import sgd
lemmatizer = WordNetLemmatizer()


def stem(text):
    """
    Lemmatize every word of ``text`` with the shared WordNet lemmatizer.

    ``WordNetLemmatizer.lemmatize`` operates on a single word; handing it a
    whole sentence or abstract returns the input unchanged. Each alphabetic
    run is therefore lemmatized separately, while whitespace, digits and
    punctuation are left exactly where they were.

    :param text: a single word or a longer piece of text
    :return: ``text`` with every alphabetic token replaced by its lemma
    """
    import re  # local import so the block does not depend on file-level imports

    return re.sub(
        r"[^\W\d_]+",
        lambda match: lemmatizer.lemmatize(match.group(0)),
        text
    )
def map_parallel(f, iterable, **kwargs):
    """
    Apply ``f`` to every item of ``iterable`` in a pool of worker processes.

    Extra keyword arguments are forwarded to ``ProcessPoolExecutor.map``.
    The pool is shut down (waiting for outstanding work) before returning;
    the returned object is the iterator produced by ``map``.
    """
    executor = ProcessPoolExecutor()
    try:
        mapped = executor.map(f, iterable, **kwargs)
    finally:
        # Same effect as leaving a ``with`` block: wait for the workers.
        executor.shutdown(wait=True)
    return mapped
def retrieve_articles(start, chunksize=1000):
    """
    Query arXiv for one page of results.

    The query string is read from the module-level ``search_query``.

    :param start: offset of the first result to request
    :param chunksize: maximum number of results to request
    :return: whatever ``arxiv.query`` returns for that page
    """
    query_params = {
        'search_query': search_query,
        'start': start,
        'max_results': chunksize,
    }
    return arxiv.query(**query_params)
def vectorize_text(examples_df, example_categories=None):
    """
    Turn article summaries into bag-of-words features and encoded labels.

    :param examples_df: DataFrame with a ``'summary'`` text column
    :param example_categories: one category label per row of ``examples_df``.
        When omitted, the module-level ``valid_example_categories`` is used,
        which is what this function always did before the parameter existed.
    :return: ``(vectorized_data, (ohe, le))`` where ``vectorized_data`` is a
        dict with keys ``'features'`` (sparse count matrix), ``'labels'``
        (integer labels as a column vector) and ``'labels_onehot'`` (dense
        one-hot matrix), and ``ohe`` / ``le`` are the fitted encoders
    :raises ValueError: if the number of labels differs from the number of rows
    """
    if example_categories is None:
        # Backward-compatible fallback to the notebook-level variable.
        example_categories = valid_example_categories

    vectorizer = CountVectorizer(min_df=2)
    features = vectorizer.fit_transform(examples_df['summary'])

    le = LabelEncoder()
    ohe = OneHotEncoder()
    # OneHotEncoder needs a 2-D input, hence the reshape to a column vector.
    labels = le.fit_transform(example_categories).reshape(-1, 1)
    if labels.shape[0] != features.shape[0]:
        raise ValueError(
            'Got {} labels for {} examples'.format(
                labels.shape[0], features.shape[0]
            )
        )
    labels_ohe = ohe.fit_transform(labels).todense()

    vectorized_data = {
        'features': features,
        'labels': labels,
        'labels_onehot': labels_ohe
    }
    return vectorized_data, (ohe, le)
def extract_keywords(text):
    """
    Extract keywords from ``text`` with gensim's TextRank-based approach.

    Lemmatization is enabled and the keywords are requested as a list
    (``split=True``).
    """
    keyword_options = {'lemmatize': True, 'split': True}
    return gensim.summarization.keywords(text=text, **keyword_options)
def extract_mz_keywords(text):
    """
    Extract keywords with gensim's Montemurro-Zanette implementation.

    The text is passed through ``stem`` first; blocks of 32 words are used
    and the keywords are requested as a list (``split=True``).
    """
    preprocessed_text = stem(text)
    mz_options = {'blocksize': 32, 'split': True}
    return gensim.summarization.mz_keywords(text=preprocessed_text, **mz_options)
class FMClassifier(sgd.FMClassification):
    """
    Wrapper for fastFM estimator that makes it behave like sklearn ones

    Two adaptations are made:

    * ``fit`` accepts 0/1 targets and remaps the zeros to -1 before
      delegating to fastFM.
    * ``predict_proba`` returns a two-column array laid out the way sklearn
      expects from a binary classifier: column 0 is the negative-class
      probability, column 1 the positive-class probability.
    """

    def fit(self, X, y, *args):
        """
        Fit the model on 0/1 (or -1/1) targets.

        :param X: feature matrix, passed through unchanged
        :param y: binary targets; a copy is modified, the caller's array is
            left untouched
        :param args: forwarded to the parent ``fit``
        :return: whatever the parent ``fit`` returns
        """
        y = y.copy()
        y[y == 0] = -1
        return super().fit(X, y, *args)

    def predict_proba(self, X):
        """
        Return class probabilities with shape ``(n_samples, 2)``.

        The parent returns one probability per sample. It is placed in
        column 1 (where ``OneVsRestClassifier`` reads it) and its complement
        in column 0, so that each row sums to one.
        """
        positive = np.asarray(super().predict_proba(X)).ravel()
        return np.column_stack((1.0 - positive, positive))
def predict_ovr(model, X):
    """
    Predict a single class per sample from a one-vs-rest model.

    The standard OVR ``predict`` behaves as a multilabel prediction, so the
    class with the highest predicted probability is picked instead.

    :param model: fitted estimator exposing ``predict_proba``
    :param X: samples to classify
    :return: index of the most probable class for each row
    """
    class_probabilities = model.predict_proba(X)
    return np.argmax(class_probabilities, axis=1)
def filter_out_small_categories(df, categories, threshold=200):
    """
    Drop the examples whose category has fewer than ``threshold`` members.

    :param df: DataFrame of examples
    :param categories: Series of category labels aligned with ``df``
    :param threshold: minimum number of examples a category needs to be kept
    :return: ``(filtered_df, filtered_categories)``
    """
    counts_per_class = categories.value_counts()
    rare_classes = counts_per_class[counts_per_class < threshold].index
    keep_mask = ~categories.isin(rare_classes)
    return df[keep_mask], categories[keep_mask]
def report_classification_confusion_matrix(y, y_pred, label_encoder):
    """
    Print a classification report and show a confusion matrix plot.

    Both the true and the predicted integer labels are mapped back to their
    names with ``label_encoder`` before reporting.

    :param y: true encoded labels (flattened with ``reshape(-1)``)
    :param y_pred: predicted encoded labels
    :param label_encoder: fitted encoder providing ``inverse_transform``
    """
    decode = label_encoder.inverse_transform
    predicted_names = decode(y_pred)
    true_names = decode(y.reshape(-1))

    print(classification_report(true_names, predicted_names))

    plot_options = {'hide_zeros': True, 'x_tick_rotation': 90}
    scikitplot.metrics.plot_confusion_matrix(
        true_names, predicted_names, **plot_options
    )
    plt.show()
%%time
search_query = 'matrix factorization'
max_n_articles = 10000
chunksize = 1000


def retrieve_chunk(chunk_start):
    """
    Fetch one page of ``chunksize`` results starting at ``chunk_start``.

    Delegates to ``retrieve_articles`` so the query is built in one place.
    """
    return retrieve_articles(chunk_start, chunksize=chunksize)


# One request per page offset, issued in parallel worker processes.
result_chunks = list(
    map_parallel(
        retrieve_chunk,
        range(0, max_n_articles, chunksize)
    )
)
# Flatten the per-page lists into a single list of articles.
results = list(itertools.chain(*result_chunks))
print('Retrieved {} articles'.format(len(results)))
%store -r results
# Report how many articles are in `results` after the `%store -r` restore above.
print('Retrieved {} articles'.format(len(results)))
Retrieved 10000 articles
# Print title, authors, date and summary of the first few retrieved articles.
n_examples = 20
banner = 20 * '*'
for entry in results[:n_examples]:
    print(banner)
    print(entry['title'])
    print(banner)
    author_line = ', '.join(entry['authors'])
    print(author_line)
    print(entry['date'])
    print(entry['summary'])
    print()
******************** Approximate Method of Variational Bayesian Matrix Factorization/Completion with Sparse Prior ******************** Ryota Kawasumi, Koujin Takeda 2018-03-14T13:54:23Z We derive analytical expression of matrix factorization/completion solution by variational Bayes method, under the assumption that observed matrix is originally the product of low-rank dense and sparse matrices with additive noise. We assume the prior of sparse matrix is Laplace distribution by taking matrix sparsity into consideration. Then we use several approximations for derivation of matrix factorization/completion solution. By our solution, we also numerically evaluate the performance of sparse matrix reconstruction in matrix factorization, and completion of missing matrix element in matrix completion. ******************** A New Method of Matrix Spectral Factorization ******************** Gigla Janashia, Edem Lagvilava, Lasha Ephremidze 2009-09-29T15:08:13Z A new method of matrix spectral factorization is proposed which reliably computes an approximate spectral factor of any matrix spectral density that admits spectral factorization ******************** Matrix Factorizations via the Inverse Function Theorem ******************** Paul W. Y. Lee 2014-08-12T03:29:00Z We give proofs of QR factorization, Cholesky's factorization, and LDU factorization using the inverse function theorem. As a consequence, we obtain analytic dependence of these matrix factorizations which does not follow immediately using Gaussian elimination. ******************** The Reciprocal Pascal Matrix ******************** Thomas M. Richardson 2014-05-24T16:16:58Z The reciprocal Pascal matrix is the Hadamard inverse of the symmetric Pascal matrix. We show that the ordinary matrix inverse of the reciprocal Pascal matrix has integer elements. The proof uses two factorizations of the matrix of super Catalan numbers. 
******************** Invariance properties of thematic factorizations of matrix functions ******************** R. B. Alexeev, V. V. Peller 2001-01-26T21:54:16Z We study the problem of invariance of indices of thematic factorizations. Such factorizations were introduced in [PY1] for studying superoptimal approximation by bounded analytic matrix functions. As shown in [PY1], the indices may depend on the choice of a thematic factorization. We introduce the notion of a monotone thematic factorization. The main result shows that under natural assumptions a matrix function that admits a thematic factorization also admits a monotone thematic factorization and the indices of a monotone thematic factorization are uniquely determined by the matrix function itself. We obtain similar results for so-called partial thematic factorizations. ******************** Online Matrix Factorization via Broyden Updates ******************** Ömer Deniz Akyıldız 2015-06-26T07:11:17Z In this paper, we propose an online algorithm to compute matrix factorizations. Proposed algorithm updates the dictionary matrix and associated coefficients using a single observation at each time. The algorithm performs low-rank updates to dictionary matrix. We derive the algorithm by defining a simple objective function to minimize whenever an observation is arrived. We extend the algorithm further for handling missing data. We also provide a mini-batch extension which enables to compute the matrix factorization on big datasets. We demonstrate the efficiency of our algorithm on a real dataset and give comparisons with well-known algorithms such as stochastic gradient matrix factorization and nonnegative matrix factorization (NMF). 
******************** Matrix factorizations and intertwiners of the fundamental representations of quantum group U_q (sl_n) ******************** Yasuyoshi Yonezawa 2008-06-30T17:13:00Z We want to construct a homological link invariant whose Euler characteristic is MOY polynomial as Khovanov and Rozansky constructed a categorification of HOMFLY polynomial. The present paper gives the first step to construct a categorification of MOY polynomial. For the essential colored planar diagrams with additional data which is a sequence naturally induced by coloring, we define matrix factorizations, and then we define a matrix factorization for planar diagram obtained by gluing the essential colored planar diagrams as tensor product of the matrix factorizations for the essential planar diagrams. Moreover, we show that some matrix factorizations deribed from tensor product of the essential matrix factorizations have homotopy equivalences corresponding to MOY relations. ******************** Fundamental matrix factorization in the FJRW-theory revisited ******************** Alexander Polishchuk 2017-12-26T21:08:40Z We present an improved construction of the fundamental matrix factorization in the FJRW-theory given in arXiv:1105.2903. The revised construction is coordinate-free and works for a possibly nonabelian finite group of symmetries. One of the new ingrediants is the category of dg-matrix factorizations over a dg-scheme. ******************** Matrix factorizations and double line in $\mathfrak{sl}_n$ quantum link invariant ******************** Yasuyoshi Yonezawa 2007-03-28T07:26:02Z This article gives matrix factorizations for the trivalent diagrams and double line appearing in $\mathfrak{sl}_n$ quantum link invariant. These matrix factorizations reconstruct Khovanov-Rozansky homology. 
And we show that the Euler characteristic of the matrix factorization for a double loop equals the quantum dimension of the representation $\land^2 V$ of $U_q (\mathfrak{sl}_n)$ in Section \ref{sec3.3}. ******************** Finiteness of small factor analysis models ******************** Mathias Drton, Han Xiao 2009-08-12T15:42:31Z We consider small factor analysis models with one or two factors. Fixing the number of factors, we prove a finiteness result about the covariance matrix parameter space when the size of the covariance matrix increases. According to this result, there exists a distinguished matrix size starting at which one can determine whether a given covariance matrix belongs to the parameter space by determining whether all principal submatrices of the distinguished size belong to the corresponding parameter space. We show that the distinguished matrix size is equal to four in the one-factor model and six with two factors. ******************** Stochastic Matrix Factorization ******************** Christopher Adams 2016-09-19T15:19:44Z This paper considers a restriction to non-negative matrix factorization in which at least one matrix factor is stochastic. That is, the elements of the matrix factors are non-negative and the columns of one matrix factor sum to 1. This restriction includes topic models, a popular method for analyzing unstructured data. It also includes a method for storing and finding pictures. The paper presents necessary and sufficient conditions on the observed data such that the factorization is unique. In addition, the paper characterizes natural bounds on the parameters for any observed data and presents a consistent least squares estimator. The results are illustrated using a topic model analysis of PhD abstracts in economics and the problem of storing and retrieving a set of pictures of faces. 
******************** Simulated Annealing with Levy Distribution for Fast Matrix Factorization-Based Collaborative Filtering ******************** Mostafa A. Shehata, Mohammad Nassef, Amr A. Badr 2017-08-09T15:14:54Z Matrix factorization is one of the best approaches for collaborative filtering, because of its high accuracy in presenting users and items latent factors. The main disadvantages of matrix factorization are its complexity, and being very hard to be parallelized, specially with very large matrices. In this paper, we introduce a new method for collaborative filtering based on Matrix Factorization by combining simulated annealing with levy distribution. By using this method, good solutions are achieved in acceptable time with low computations, compared to other methods like stochastic gradient descent, alternating least squares, and weighted non-negative matrix factorization. ******************** Primitive factorizations, Jucys-Murphy elements, and matrix models ******************** Sho Matsumoto, Jonathan Novak 2010-05-02T17:46:10Z A factorization of a permutation into transpositions is called "primitive" if its factors are weakly ordered. We discuss the problem of enumerating primitive factorizations of permutations, and its place in the hierarchy of previously studied factorization problems. Several formulas enumerating minimal primitive and possibly non-minimal primitive factorizations are presented, and interesting connections with Jucys-Murphy elements, symmetric group characters, and matrix models are described. ******************** Localization of Matrix Factorizations ******************** Ilya Krishtal, Thomas Strohmer, Tim Wertz 2013-05-07T19:55:06Z Matrices with off-diagonal decay appear in a variety of fields in mathematics and in numerous applications, such as signal processing, statistics, communications engineering, condensed matter physics, and quantum chemistry. 
Numerical algorithms dealing with such matrices often take advantage (implicitly or explicitly) of the empirical observation that this off-diagonal decay property seems to be preserved when computing various useful matrix factorizations, such as the Cholesky factorization or the QR-factorization. There is a fairly extensive theory describing when the inverse of a matrix inherits the localization properties of the original matrix. Yet, except for the special case of band matrices, surprisingly very little theory exists that would establish similar results for matrix factorizations. We will derive a comprehensive framework to rigorously answer the question when and under which conditions the matrix factors inherit the localization of the original matrix for such fundamental matrix factorizations as the LU-, QR-, Cholesky, and Polar factorization. ******************** Monotone thematic factorizations of matrix functions ******************** Alberto A. Condori 2009-08-28T20:29:13Z We continue the study of the so-called thematic factorizations of admissible very badly approximable matrix functions. These factorizations were introduced by V.V. Peller and N.J. Young for studying superoptimal approximation by bounded analytic matrix functions. Even though thematic indices associated with a thematic factorization of an admissible very badly approximable matrix function are not uniquely determined by the function itself, R.B. Alexeev and V.V. Peller showed that the thematic indices of any monotone non-increasing thematic factorization of an admissible very badly approximable matrix function are uniquely determined. In this paper, we prove the existence of monotone non-decreasing thematic factorizations for admissible very badly approximable matrix functions. It is also shown that the thematic indices appearing in a monotone non-decreasing thematic factorization are not uniquely determined by the matrix function itself. 
Furthermore, we show that the monotone non-increasing thematic factorization gives rise to a great number of other thematic factorizations. ******************** Badly approximable matrix functions and canonical factorizations ******************** R. B. Alexeev, V. V. Peller 2001-01-26T22:08:33Z We continue studying the problem of analytic approximation of matrix functions. We introduce the notion of a partial canonical factorization of a badly approximable matrix function $\Phi$ and the notion of a canonical factorization of a very badly approximable matrix function $\Phi$. Such factorizations are defined in terms of so-called balanced unitary-valued functions which have many remarkable properties. Unlike the case of thematic factorizations studied earlier in [PY1], [PY2], [PT], [AP1], the factors in canonical factorizations (as well as partial canonical factorizations) are uniquely determined by the matrix function $\Phi$ up to constant unitary factors. We study many properties of canonical factorizations. In particular we show that under certain natural assumptions on a function space $X$ the condition $\Phi\in X$ implies that all factors in a canonical factorization of $\Phi$ belong to the same space $X$. In the last section we characterize the very badly approximable unitary-valued functions $U$ that satisfy the condition $\|H_U\|_{\text e}<1$. ******************** Nonnegative Matrix Factorization Requires Irrationality ******************** Dmitry Chistikov, Stefan Kiefer, Ines Marušić, Mahsa Shirmohammadi, James Worrell 2017-03-22T22:03:17Z Nonnegative matrix factorization (NMF) is the problem of decomposing a given nonnegative $n \times m$ matrix $M$ into a product of a nonnegative $n \times d$ matrix $W$ and a nonnegative $d \times m$ matrix $H$. A longstanding open question, posed by Cohen and Rothblum in 1993, is whether a rational matrix $M$ always has an NMF of minimal inner dimension $d$ whose factors $W$ and $H$ are also rational. 
We answer this question negatively, by exhibiting a matrix for which $W$ and $H$ require irrational entries. ******************** Spectral Factorization of Rank-Deficient Polynomial Matrix-Functions ******************** Lasha Ephremidze, Edem Lagvilava 2010-08-18T16:04:13Z A spectral factorization theorem is proved for polynomial rank-deficient matrix-functions. The theorem is used to construct paraunitary matrix-functions with first rows given. ******************** From-Below Approximations in Boolean Matrix Factorization: Geometry and New Algorithm ******************** Radim Belohlavek, Martin Trnecka 2013-06-20T15:19:22Z We present new results on Boolean matrix factorization and a new algorithm based on these results. The results emphasize the significance of factorizations that provide from-below approximations of the input matrix. While the previously proposed algorithms do not consider the possibly different significance of different matrix entries, our results help measure such significance and suggest where to focus when computing factors. An experimental evaluation of the new algorithm on both synthetic and real data demonstrates its good performance in terms of good coverage by the first k factors as well as a small number of factors needed for exact decomposition and indicates that the algorithm outperforms the available ones in these terms. We also propose future research topics. ******************** Necessary And Sufficient Conditions For Existence of the LU Factorization of an Arbitrary Matrix ******************** Pavel Okunev, Charles R. Johnson 2005-06-19T23:10:13Z If $A$ is an n-by-n matrix over a field $F$ ($A\in M_{n}(F)$), then $A$ is said to ``have an LU factorization'' if there exists a lower triangular matrix $L\in M_{n}(F)$ and an upper triangular matrix $U\in M_{n}(F)$ such that $$A=LU.$$ We give necessary and sufficient conditions for LU factorability of a matrix. Also simple algorithm for computing an LU factorization is given. 
It is an extension of the Gaussian elimination algorithm to the case of not necessarily invertible matrices. We consider possibilities to factors a matrix that does not have an LU factorization as the product of an ``almost lower triangular'' matrix and an ``almost upper triangular'' matrix. There are many ways to formalize what almost means. We consider some of them and derive necessary and sufficient conditions. Also simple algorithms for computing of an ``almost LU factorization'' are given.
# Build the DataFrame from the in-memory query results ...
articles_df = pd.DataFrame(results)
# ... and then rebind it to the copy stored on disk.
# NOTE(review): this overwrites the DataFrame built on the previous line;
# presumably only one of the two is meant to run in a session - confirm.
articles_df = pd.read_json('matrix_factorization_arxiv_query_result.json')
# Display the available columns.
articles_df.columns
Index(['affiliation', 'arxiv_comment', 'arxiv_primary_category', 'arxiv_url', 'author', 'author_detail', 'authors', 'doi', 'guidislink', 'id', 'journal_reference', 'links', 'pdf_url', 'published', 'published_parsed', 'summary', 'summary_detail', 'tags', 'title', 'title_detail', 'updated', 'updated_parsed'], dtype='object')
# Show the first rows of the articles table.
articles_df.head()
affiliation | arxiv_comment | arxiv_primary_category | arxiv_url | author | author_detail | authors | doi | guidislink | id | ... | pdf_url | published | published_parsed | summary | summary_detail | tags | title | title_detail | updated | updated_parsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | None | 22 pages, 4 figures, part of this work was pre... | {'term': 'eess.SP', 'scheme': 'http://arxiv.or... | http://arxiv.org/abs/1803.06234v1 | Koujin Takeda | {'name': 'Koujin Takeda'} | [Ryota Kawasumi, Koujin Takeda] | None | True | http://arxiv.org/abs/1803.06234v1 | ... | http://arxiv.org/pdf/1803.06234v1 | 2018-03-14T13:54:23Z | (2018, 3, 14, 13, 54, 23, 2, 73, 0) | We derive analytical expression of matrix fact... | {'type': 'text/plain', 'language': None, 'base... | [{'term': 'eess.SP', 'scheme': 'http://arxiv.o... | Approximate Method of Variational Bayesian Mat... | {'type': 'text/plain', 'language': None, 'base... | 2018-03-14T13:54:23Z | (2018, 3, 14, 13, 54, 23, 2, 73, 0) |
1 | None | 23 pages | {'term': 'math.CV', 'scheme': 'http://arxiv.or... | http://arxiv.org/abs/0909.5361v1 | Lasha Ephremidze | {'name': 'Lasha Ephremidze'} | [Gigla Janashia, Edem Lagvilava, Lasha Ephremi... | None | True | http://arxiv.org/abs/0909.5361v1 | ... | http://arxiv.org/pdf/0909.5361v1 | 2009-09-29T15:08:13Z | (2009, 9, 29, 15, 8, 13, 1, 272, 0) | A new method of matrix spectral factorization ... | {'type': 'text/plain', 'language': None, 'base... | [{'term': 'math.CV', 'scheme': 'http://arxiv.o... | A New Method of Matrix Spectral Factorization | {'type': 'text/plain', 'language': None, 'base... | 2009-09-29T15:08:13Z | (2009, 9, 29, 15, 8, 13, 1, 272, 0) |
2 | None | 6 pages | {'term': 'math.CA', 'scheme': 'http://arxiv.or... | http://arxiv.org/abs/1408.2611v1 | Paul W. Y. Lee | {'name': 'Paul W. Y. Lee'} | [Paul W. Y. Lee] | None | True | http://arxiv.org/abs/1408.2611v1 | ... | http://arxiv.org/pdf/1408.2611v1 | 2014-08-12T03:29:00Z | (2014, 8, 12, 3, 29, 0, 1, 224, 0) | We give proofs of QR factorization, Cholesky's... | {'type': 'text/plain', 'language': None, 'base... | [{'term': 'math.CA', 'scheme': 'http://arxiv.o... | Matrix Factorizations via the Inverse Function... | {'type': 'text/plain', 'language': None, 'base... | 2014-08-12T03:29:00Z | (2014, 8, 12, 3, 29, 0, 1, 224, 0) |
3 | None | None | {'term': 'math.CO', 'scheme': 'http://arxiv.or... | http://arxiv.org/abs/1405.6315v1 | Thomas M. Richardson | {'name': 'Thomas M. Richardson'} | [Thomas M. Richardson] | None | True | http://arxiv.org/abs/1405.6315v1 | ... | http://arxiv.org/pdf/1405.6315v1 | 2014-05-24T16:16:58Z | (2014, 5, 24, 16, 16, 58, 5, 144, 0) | The reciprocal Pascal matrix is the Hadamard i... | {'type': 'text/plain', 'language': None, 'base... | [{'term': 'math.CO', 'scheme': 'http://arxiv.o... | The Reciprocal Pascal Matrix | {'type': 'text/plain', 'language': None, 'base... | 2014-05-24T16:16:58Z | (2014, 5, 24, 16, 16, 58, 5, 144, 0) |
4 | None | 20 pages | {'term': 'math.FA', 'scheme': 'http://arxiv.or... | http://arxiv.org/abs/math/0101182v2 | V. V. Peller | {'name': 'V. V. Peller'} | [R. B. Alexeev, V. V. Peller] | None | True | http://arxiv.org/abs/math/0101182v2 | ... | http://arxiv.org/pdf/math/0101182v2 | 2001-01-22T23:32:55Z | (2001, 1, 22, 23, 32, 55, 0, 22, 0) | We study the problem of invariance of indices ... | {'type': 'text/plain', 'language': None, 'base... | [{'term': 'math.FA', 'scheme': 'http://arxiv.o... | Invariance properties of thematic factorizatio... | {'type': 'text/plain', 'language': None, 'base... | 2001-01-26T21:54:16Z | (2001, 1, 26, 21, 54, 16, 4, 26, 0) |
5 rows × 22 columns
def _top_level_category(term):
    """Return the part of a category term before the first '.' or '-'."""
    return term.partition('.')[0].partition('-')[0]


# Full primary category term of every article (e.g. 'math.CV').
primary_category_column = articles_df['arxiv_primary_category']
categories = primary_category_column.apply(itemgetter('term'))
# Top-level category: the term truncated at the first '.' and then at the first '-'.
main_categories = categories.apply(_top_level_category)
Top-level categories
# Number of articles per top-level category, sorted in ascending order,
# drawn as a horizontal bar chart.
main_categories_counts = main_categories.value_counts(ascending=True)
main_categories_counts.plot.barh()
plt.show()
Top-level categories with more than 200 papers
# Same chart restricted to top-level categories with more than 200 articles.
main_categories_counts[main_categories_counts > 200].plot.barh()
plt.show()
# The ten most frequent full category terms: counts are sorted ascending,
# so the last ten entries are the largest.
categories.value_counts(ascending=True)[-10:].plot.barh()
plt.show()
%%time
# Extract keywords from every summary in parallel worker processes and
# store the per-article keyword lists in a new column.
articles_df['summary_keywords'] = list(
map_parallel(extract_keywords, articles_df['summary'])
)
CPU times: user 5.37 s, sys: 611 ms, total: 5.99 s Wall time: 1min 40s
# Print the title and extracted keywords of the first `n_examples` articles.
title_rule = 20 * '*'
first_rows = itertools.islice(articles_df.iterrows(), n_examples)
for __, row in first_rows:
    print(title_rule)
    print(row['title'])
    print(title_rule)
    print('keywords:', row['summary_keywords'])
    print()
******************** Approximate Method of Variational Bayesian Matrix Factorization/Completion with Sparse Prior ******************** keywords: ['matrix', 'analytical', 'bayes', 'completion'] ******************** A New Method of Matrix Spectral Factorization ******************** keywords: ['spectral', 'method'] ******************** Matrix Factorizations via the Inverse Function Theorem ******************** keywords: ['factorizations', 'function'] ******************** The Reciprocal Pascal Matrix ******************** keywords: ['pascal matrix'] ******************** Invariance properties of thematic factorizations of matrix functions ******************** keywords: ['thematic', 'results', 'superoptimal', 'matrix'] ******************** Online Matrix Factorization via Broyden Updates ******************** keywords: ['matrix', 'algorithms', 'missing', 'objective', 'dataset'] ******************** Matrix factorizations and intertwiners of the fundamental representations of quantum group U_q (sl_n) ******************** keywords: ['matrix', 'equivalences', 'link', 'paper', 'naturally', 'planar'] ******************** Fundamental matrix factorization in the FJRW-theory revisited ******************** keywords: ['construction', 'matrix'] ******************** Matrix factorizations and double line in $\mathfrak{sl}_n$ quantum link invariant ******************** keywords: ['ref', 'quantum link', 'line', 'loop'] ******************** Finiteness of small factor analysis models ******************** keywords: ['matrix parameter', 'factors', 'size'] ******************** Stochastic Matrix Factorization ******************** keywords: ['paper', 'unstructured', 'data', 'factors', 'model'] ******************** Simulated Annealing with Levy Distribution for Fast Matrix Factorization-Based Collaborative Filtering ******************** keywords: ['methods', 'computations', 'simulated', 'filtering', 'non', 'latent', 'descent', 'good'] ******************** Primitive factorizations, Jucys-Murphy 
elements, and matrix models ******************** keywords: ['primitive', 'factorizations', 'enumerating', 'minimal'] ******************** Localization of Matrix Factorizations ******************** keywords: ['matrix', 'factors', 'decay', 'extensive theory', 'numerical', 'similar', 'matrices', 'properties'] ******************** Monotone thematic factorizations of matrix functions ******************** keywords: ['thematic', 'matrix', 'approximation', 'indices', 'non'] ******************** Badly approximable matrix functions and canonical factorizations ******************** keywords: ['unitary', 'factors', 'approximable', 'function', 'study'] ******************** Nonnegative Matrix Factorization Requires Irrationality ******************** keywords: ['question', 'matrix', 'irrational'] ******************** Spectral Factorization of Rank-Deficient Polynomial Matrix-Functions ******************** keywords: ['matrix', 'factorization'] ******************** From-Below Approximations in Boolean Matrix Factorization: Geometry and New Algorithm ******************** keywords: ['matrix', 'algorithms', 'different', 'factors', 'new results', 'propose'] ******************** Necessary And Sufficient Conditions For Existence of the LU Factorization of an Arbitrary Matrix ******************** keywords: ['triangular', 'invertible', 'elimination']
# Number of extracted keywords per article.
article_keyword_lengths = articles_df['summary_keywords'].apply(len)
# Histogram with as many bins as the largest keyword count.
article_keyword_lengths.plot.hist(bins=article_keyword_lengths.max(), title='Number of summary keywords')
<matplotlib.axes._subplots.AxesSubplot at 0x7fd30d1fccf8>
# Keep only articles whose top-level category passes the default size
# threshold of filter_out_small_categories.
valid_examples, valid_example_categories = filter_out_small_categories(articles_df, main_categories)
# Display the shape of the filtered table.
valid_examples.shape
(9403, 23)
# Vectorize the summaries and encode the labels; the labels are taken from
# the module-level `valid_example_categories` assigned earlier.
vectorized_data, (ohe, le) = vectorize_text(valid_examples)
We'll use a Factorization Machine (FM) model. FMs are well suited to such high-dimensional sparse data. The implementation comes from the fastFM library.
Note the `OneVsRestClassifier` wrapper: we have to use it here because fastFM doesn't support multiclass classification.
# Binary factorization machine; the hyperparameters are passed straight to
# the fastFM SGD classifier that FMClassifier extends.
fm = FMClassifier(
rank=50,
n_iter=10000,
step_size=0.0001,
l2_reg_w=0.01,
l2_reg_V=0.01
)
# Wrap the binary model in a one-vs-rest classifier to handle several classes.
fm_multiclass = OneVsRestClassifier(fm)
# Split features, one-hot labels and integer labels together so that the
# three stay aligned; 20% of the examples go to the test set, stratified
# by the integer label, with a fixed seed.
x_train, x_test, y_train, y_test, y_train_labels, y_test_labels = train_test_split(
vectorized_data['features'],
vectorized_data['labels_onehot'],
vectorized_data['labels'],
stratify=vectorized_data['labels'],
test_size=0.2,
random_state=0
)
%%time
# Fit the one-vs-rest model on the one-hot encoded training labels.
fm_multiclass.fit(x_train, y_train)
CPU times: user 14 s, sys: 171 ms, total: 14.1 s Wall time: 12.5 s
OneVsRestClassifier(estimator=FMClassifier(init_stdev=0.1, l2_reg=None, l2_reg_V=0.01, l2_reg_w=0.01, n_iter=10000, random_state=123, rank=50, step_size=0.0001), n_jobs=1)
# Most probable class per example, then accuracy on both splits.
y_test_pred = predict_ovr(fm_multiclass, x_test)
y_train_pred = predict_ovr(fm_multiclass, x_train)
train_accuracy = accuracy_score(y_train_labels, y_train_pred)
test_accuracy = accuracy_score(y_test_labels, y_test_pred)
print('train score:', train_accuracy, '\ntest score: ', test_accuracy)
train score: 0.719090667376 test score: 0.658692185008
# Per-class report and confusion matrix for the test split, with label names
# recovered through the fitted LabelEncoder.
report_classification_confusion_matrix(y_test_labels, y_test_pred, le)
precision recall f1-score support cond 0.52 0.61 0.56 176 cs 0.53 0.60 0.56 309 hep 0.82 0.86 0.84 579 math 0.63 0.77 0.69 542 nucl 0.62 0.30 0.41 50 physics 0.33 0.02 0.03 57 quant 0.53 0.16 0.25 62 stat 0.54 0.07 0.12 106 avg / total 0.64 0.66 0.63 1881