Analysis of Semantic Scholar's paper-author relations

(bipartite graph and collaboration complex)

Todo:

  • describe the projected networks, e.g., degree distribution, clustering coefficient, mean path length, diameter (see the sketch after this list)
  • correlation between author degree and #citations or #publications
  • facet (upper) degrees: for a k-facet (i.e., a paper), the number of incident (k+1)-simplices
    • many authors are in few groups, and very few authors are in many groups
    • high facet degrees should correlate with high vertex degree and high publication rank => such authors collaborate more with others, and with more distinct sets of people
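
A minimal sketch of the first item above, assuming the author projection (authors_adjacency.npz, loaded again below) fits in memory as a NetworkX graph. The clustering coefficient is feasible at this scale, but mean path length and diameter are expensive on ~740k nodes, so they are restricted to the largest connected component (in practice they would be estimated by sampling):

import numpy as np
import networkx as nx
from scipy import sparse

adjacency = sparse.load_npz('../data/s2_2_bipartite_graph/authors_adjacency.npz')
# from_scipy_sparse_matrix was renamed to from_scipy_sparse_array in NetworkX 3.
graph = nx.from_scipy_sparse_matrix(adjacency)

degrees = np.asarray((adjacency > 0).sum(axis=1)).squeeze()
print('mean degree: {:.2f}'.format(degrees.mean()))
print('average clustering: {:.4f}'.format(nx.average_clustering(graph)))

# Path-based statistics are only defined on a connected graph.
component = graph.subgraph(max(nx.connected_components(graph), key=len))
print('mean path length: {:.2f}'.format(nx.average_shortest_path_length(component)))
print('diameter: {}'.format(nx.diameter(component)))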
In [1]:
%matplotlib inline
In [2]:
import numpy as np
from scipy import sparse
import matplotlib as mpl
from matplotlib import pyplot as plt
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite as nxb
from IPython import display as ipd

import sys
sys.path.append('..')

from data.s2_5_bipartite_to_complex import build_features
#from data.s2_4_complex_to_matrices import load_matrices
In [3]:
plt.rcParams['figure.figsize'] = (17, 5)
In [4]:
def get_link(entity_id, entity='paper'):
    """Display links to the Semantic Scholar API and website for a paper or author."""
    api = 'https://api.semanticscholar.org/v1/{}/{}'
    webpage = 'https://www.semanticscholar.org/{}/{}'
    for base in [api, webpage]:
        link = base.format(entity, entity_id)
        txt = f'<a href="{link}">{link}</a>'
        ipd.display(ipd.HTML(txt))
In [5]:
def bins(array):
    """Histogram bins centered on the integers spanned by the array."""
    return np.arange(array.min(), array.max() + 2) - 0.5

1 Data loading

In [6]:
papers = pd.read_csv('../data/s2_2_bipartite_graph/papers.csv', index_col=0)
edges = pd.read_csv('../data/s2_2_bipartite_graph/paper_author_edges.csv')

print('paper table: {:,} papers, {:,} features'.format(*papers.shape))
print('edges table: {:,} edges'.format(edges.shape[0]))
paper table: 241,412 papers, 10 features
edges table: 1,805,826 edges
In [7]:
# Uncomment to do (part of) the analysis on the full dataset.
# papers, edges = load('../data/s2_2_bipartite_graph/paper_author_full.pickle')
In [8]:
biadjacency = sparse.load_npz('../data/s2_2_bipartite_graph/paper_author_biadjacency.npz')

print('biadjacency matrix: {:,} papers, {:,} authors, {:,} edges'.format(
    *biadjacency.shape, biadjacency.nnz))
biadjacency matrix: 241,412 papers, 741,665 authors, 1,805,826 edges
In [9]:
adjacency_papers = sparse.load_npz('../data/s2_2_bipartite_graph/papers_adjacency.npz')
adjacency_authors = sparse.load_npz('../data/s2_2_bipartite_graph/authors_adjacency.npz')

print('adjacency matrix: {:,} papers, {:,} edges'.format(adjacency_papers.shape[0], adjacency_papers.nnz // 2))
print('adjacency matrix: {:,} authors, {:,} edges'.format(adjacency_authors.shape[0], adjacency_authors.nnz // 2))
adjacency matrix: 241,412 papers, 10,276,283 edges
adjacency matrix: 741,665 authors, 18,874,369 edges
In [10]:
s_node = 150250
simplices = np.load(f'../data/s2_3_collaboration_complex/{s_node}_simplices.npy', allow_pickle=True)
cochains = np.load(f'../data/s2_3_collaboration_complex/{s_node}_cochains.npy', allow_pickle=True)
features = build_features(simplices, cochains)
In [11]:
laplacians = np.load(f'../data/s2_3_collaboration_complex/{s_node}_laplacians.npy', allow_pickle=True)
boundaries = np.load(f'../data/s2_3_collaboration_complex/{s_node}_boundaries.npy', allow_pickle=True)

2 Size of collaborations

In [12]:
papers_per_author = edges.groupby('author')['paper'].count()
authors_per_paper = edges.groupby('paper')['author'].count()

print('Paper with the most authors ({}):'.format(authors_per_paper.max()))
get_link(authors_per_paper.idxmax(), 'paper')

print('Author with the most papers ({}):'.format(papers_per_author.max()))
get_link(papers_per_author.idxmax(), 'author')

fig, ax = plt.subplots()
ax.hist(papers_per_author, bins=bins(papers_per_author), log=True);
ax.set_title('number of papers per author');
ax.set_xlabel('number of papers');
ax.set_ylabel('number of authors');

fig, ax = plt.subplots()
ax.hist(authors_per_paper, bins=bins(authors_per_paper), log=True);
ax.set_title('number of authors per paper');
ax.set_xlabel('number of authors');
ax.set_ylabel('number of papers');
In [13]:
fig, ax = plt.subplots()
ax.hist(adjacency_authors.data, bins=bins(adjacency_authors.data), log=True);
ax.set_title('collaboration between authors');
ax.set_xlabel('number of papers in common (edge weight)');

fig, ax = plt.subplots()
ax.hist(adjacency_papers.data, bins=bins(adjacency_papers.data), log=True);
ax.set_title('collaboration between papers');
ax.set_xlabel('number of authors in common (edge weight)');

3 Publication year

In [14]:
ax = papers['year'].hist(bins=bins(papers['year']))
ax.set_title('number of papers published per year ({} - {})'.format(papers['year'].min(), papers['year'].max()));

4 Citations

In [15]:
print('papers have at least {:,} citations (2019)'.format(papers['citations_2019'].min()))
print('papers have at most {:,} citations (2019)'.format(papers['citations_2019'].max()))
print('there are {:,} citations in total (2019)'.format(papers['citations_2019'].sum()))
print()

# print('authors have at least {:,} citations'.format(authors['citations_2019'].min()))
# print('authors have at most {:,} citations'.format(authors['citations_2019'].max()))
# print('there are {:,} in total'.format(authors['citations_2019'].sum()))
# print()

print('Most cited paper ({:,} citations):'.format(papers['citations_2019'].max()))
get_link(papers['citations_2019'].idxmax(), 'paper')

# print('Most cited author ({:,} citations):'.format(authors['citations_2019'].max()))
# get_link(authors['citations_2019'].idxmax(), 'author')
papers have at least 0 citations (2019)
papers have at most 234 citations (2019)
there are 443,283 citations in total (2019)

Most cited paper (234 citations):
In [16]:
years = [int(column[-4:]) for column in papers.columns if column.startswith('citations_')]
citations_per_year = np.zeros(len(years))
for i, year in enumerate(years):
    citations_per_year[i] = papers[f'citations_{year}'].sum()
fig, ax = plt.subplots()
ax.plot(years, citations_per_year, '.-')
ax.set_xticks(years)
ax.get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
ax.set_xlabel('year')
ax.set_ylabel('number of citations');
In [17]:
fig, ax = plt.subplots(figsize=(15, 5))
for year in years[::-1]:
    ax.hist(papers[f'citations_{year}'], bins=bins(papers['citations_2019']), log=True, label=f'Year {year}', alpha=0.4)
ax.set_xlabel('number of citations')
ax.set_ylabel('number of papers')
ax.legend();

5 References

In [18]:
print('papers have at least {:,} reference(s)'.format(papers['references'].min()))
print('papers have at most {:,} references'.format(papers['references'].max()))

print('most referencing paper ({:,} references):'.format(papers['references'].max()))
get_link(papers['references'].idxmax(), 'paper')
papers have at least 1 reference(s)
papers have at most 5,558 references
most referencing paper (5,558 references):
In [19]:
papers['references'].hist(bins=bins(papers['references']), log=True);

6 Collaboration complex between authors

In [20]:
dimension = len(simplices)
sizes = np.array([len(s) for s in simplices])

for k, size in enumerate(sizes):
    print(f'{size:,} {k}-simplices')
print('{:,} simplices in total'.format(np.sum(sizes)))
352 0-simplices
1,474 1-simplices
3,285 2-simplices
5,019 3-simplices
5,559 4-simplices
4,547 5-simplices
2,732 6-simplices
1,175 7-simplices
343 8-simplices
61 9-simplices
5 10-simplices
24,552 simplices in total
In [21]:
fig, ax = plt.subplots()
ax.plot(range(dimension), sizes, '.-')
ax.set_xlabel('simplex dimension')
ax.set_ylabel('number of simplices');

7 Operators (boundaries and Laplacians)

In [22]:
for dim, boundary in enumerate(boundaries):
    print('{}-boundary matrix: {:,} x {:,}, {:,} non-zeros ({:.2%})'.format(
        dim+1, *boundary.shape, boundary.nnz, boundary.nnz/np.prod(boundary.shape)))
1-boundary matrix: 352 x 1,474, 2,948 non-zeros (0.57%)
2-boundary matrix: 1,474 x 3,285, 9,855 non-zeros (0.20%)
3-boundary matrix: 3,285 x 5,019, 20,076 non-zeros (0.12%)
4-boundary matrix: 5,019 x 5,559, 27,795 non-zeros (0.10%)
5-boundary matrix: 5,559 x 4,547, 27,282 non-zeros (0.11%)
6-boundary matrix: 4,547 x 2,732, 19,124 non-zeros (0.15%)
7-boundary matrix: 2,732 x 1,175, 9,400 non-zeros (0.29%)
8-boundary matrix: 1,175 x 343, 3,087 non-zeros (0.77%)
9-boundary matrix: 343 x 61, 610 non-zeros (2.92%)
10-boundary matrix: 61 x 5, 55 non-zeros (18.03%)
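
If the stored Laplacians follow the usual Hodge convention, they can be reassembled from the boundaries as L_k = B_k^T B_k + B_{k+1} B_{k+1}^T, where boundaries[k] holds B_{k+1} (shapes above), the lower term is absent for k = 0, and the upper term is absent at the top dimension. A sketch of that consistency check, under this assumption:

for k, laplacian in enumerate(laplacians):
    hodge = sparse.csr_matrix(laplacian.shape)
    if k > 0:  # lower Laplacian B_k^T B_k
        hodge = hodge + boundaries[k-1].T @ boundaries[k-1]
    if k < len(boundaries):  # upper Laplacian B_{k+1} B_{k+1}^T
        hodge = hodge + boundaries[k] @ boundaries[k].T
    assert abs(laplacian - hodge).max() < 1e-9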
In [23]:
def get_spectrum(laplacian, lowest=False, shift_invert=True):
    # Shift-invert mode is much faster, but can raise "factor is exactly singular".
    largest = sparse.linalg.eigsh(laplacian, k=1, which='LA', return_eigenvectors=False)
    if lowest:
        if shift_invert:
            lowest = sparse.linalg.eigsh(laplacian, k=1, sigma=0, which='LM', return_eigenvectors=False)
        else:
            lowest = sparse.linalg.eigsh(laplacian, k=1, which='SA', return_eigenvectors=False)
        return lowest[0], largest[0]
    else:
        return largest[0]

spectrums = [get_spectrum(laplacian) for laplacian in laplacians]
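
If shift-invert does hit a singular factor, one possible fallback (a sketch, not part of the original code) is to catch the error and retry without it:

def get_lowest(laplacian):
    try:
        # Fast path: shift-invert around zero.
        return sparse.linalg.eigsh(laplacian, k=1, sigma=0, which='LM', return_eigenvectors=False)[0]
    except RuntimeError:
        # splu failed with "factor is exactly singular": use the slower direct mode.
        return sparse.linalg.eigsh(laplacian, k=1, which='SA', return_eigenvectors=False)[0]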
In [24]:
for dim, (laplacian, spectrum) in enumerate(zip(laplacians, spectrums)):
    print('{}-simplices: {:,} simplices, {:.2%} dense, spectrum in [0, {:.0f}]'.format(
        dim, laplacian.shape[0], laplacian.nnz/np.prod(laplacian.shape), spectrum))
    assert laplacian.shape == (len(simplices[dim]), len(simplices[dim]))
0-simplices: 352 simplices, 2.66% dense, spectrum in [0, 117]
1-simplices: 1,474 simplices, 1.17% dense, spectrum in [0, 117]
2-simplices: 3,285 simplices, 0.04% dense, spectrum in [0, 24]
3-simplices: 5,019 simplices, 0.02% dense, spectrum in [0, 16]
4-simplices: 5,559 simplices, 0.02% dense, spectrum in [0, 14]
5-simplices: 4,547 simplices, 0.02% dense, spectrum in [0, 11]
6-simplices: 2,732 simplices, 0.04% dense, spectrum in [0, 11]
7-simplices: 1,175 simplices, 0.09% dense, spectrum in [0, 11]
8-simplices: 343 simplices, 0.29% dense, spectrum in [0, 11]
9-simplices: 61 simplices, 1.64% dense, spectrum in [0, 11]
10-simplices: 5 simplices, 20.00% dense, spectrum in [0, 11]

8 Signals (cochains) on collaboration complex

In [25]:
# Rayleigh quotient between every pair of signals (for all dimensions):
# rayleigh[dim][i, j] = (f_i^T L_dim f_j) / (f_i^T f_j); the diagonal holds
# each signal's own Rayleigh quotient.
n_features = features[0].shape[1]
rayleigh = np.empty((len(features), n_features, n_features))
for dim in range(len(features)):
    rayleigh[dim] = features[dim].T @ laplacians[dim] @ features[dim]
    # Division by zero will occur if a signal is all zeros.
    rayleigh[dim] /= features[dim].T @ features[dim]

Which signals are "smooth"? The lower a signal's Rayleigh quotient, the smoother the signal.

In [26]:
# columns = ['citations_1994', 'citations_1999', 'citations_2004', 'citations_2009', 'citations_2014', 'citations_2019', 'references', 'year']
columns = ['citations_2019']
fig, ax = plt.subplots()
ax.set_title('smoothness of signals')
ax.set_ylabel('Rayleigh quotient')
ax.set_xlabel("signal's name")
ax.semilogy(np.array([np.diag(r) for r in rayleigh]).T, 'o-')
ax.set_xticks(range(len(columns)))
ax.set_xticklabels(columns)
ax.legend([rf'{dim}-cochains ($\lambda_{{max}} = {spectrums[dim]:.0f}$)' for dim in range(len(features))], loc='lower left')
fig.tight_layout();
# fig.savefig('smoothness.pdf');

Does one signal explain another? The off-diagonal entries are cross Rayleigh quotients between pairs of signals.

In [27]:
fig, axes = plt.subplots(1, len(rayleigh))
for dim, (r, ax) in enumerate(zip(rayleigh, axes)):
    im = ax.imshow(r)
    ax.set_title(f'dimension {dim}')

9 Laplacians' spectra (and Fourier transform)

In [28]:
eigenvalues, eigenvectors = np.linalg.eigh(laplacians[0].toarray())

The spectrum of the 0-Laplacian has a couple of very high eigenvalues.

In [29]:
plt.plot(eigenvalues);
print(eigenvalues[:10])
[6.7940750e-15 2.7748093e-02 5.9429258e-02 1.4139873e-01 1.8538801e-01
 2.2739482e-01 2.2824809e-01 2.6765519e-01 3.3016667e-01 3.6270031e-01]

Some eigenvalues have high multiplicity, probably due to the cliques formed by the higher-dimensional simplices: the graph Laplacian of a complete graph on n vertices has eigenvalue n with multiplicity n - 1.
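
A quick check of that fact on a toy clique:

# The Laplacian of the complete graph K_5 has eigenvalue 5 with multiplicity 4.
laplacian_k5 = nx.laplacian_matrix(nx.complete_graph(5)).toarray()
print(np.round(np.linalg.eigvalsh(laplacian_k5), 6))  # [0. 5. 5. 5. 5.]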

In [30]:
plt.plot(eigenvalues[:-100]);
In [31]:
# Count eigenvalues close to each integer (exact float comparison would miss them).
multiplicities = [np.sum(np.isclose(eigenvalues, e)) for e in range(20)]

fig, ax = plt.subplots()
ax.plot(multiplicities, 'o-')
ax.set_xticks(range(len(multiplicities)))
ax.set_title('eigenvalue multiplicities')
ax.set_ylabel('multiplicity')
ax.set_xlabel('eigenvalue');

Fourier transform of the 0-cochains. Their spectral content should tell how smooth they are: a smooth signal concentrates its energy on the eigenvectors with small eigenvalues.

In [32]:
# Project the 0-cochains, normalized to unit energy, onto the Laplacian eigenbasis.
fourier = eigenvectors.T @ (features[0] / np.linalg.norm(features[0], axis=0))
In [33]:
idx_max = -100

plt.semilogy(eigenvalues[:idx_max], np.abs(fourier)[:idx_max], '.', alpha=0.8)
plt.legend(columns);
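
A possible follow-up (not in the original notebook), reusing fourier, eigenvalues, and columns from above: summarize each signal's spectral content as its cumulative energy across the spectrum. A smooth signal accumulates most of its energy at small eigenvalues.

energy = np.cumsum(np.abs(fourier)**2, axis=0)
energy /= energy[-1]  # normalize each signal's total energy to 1

fig, ax = plt.subplots()
ax.plot(eigenvalues, energy)
ax.set_xlabel('eigenvalue')
ax.set_ylabel('cumulative energy fraction')
ax.legend(columns);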