Extract authors from PMC-OAI frontmatter `<article>` records

In [1]:

import pathlib

import pandas

from pubmedpy.xml import yield_etrees_from_zip
from pubmedpy.pmc_oai import extract_authors_from_article

In [2]:

# Locate the PMC-OAI frontmatter archives; sorting keeps the processing
# order stable between runs.
pmc_fm_dir = pathlib.Path('data/pmc/oai/pmc_fm')
zip_paths = sorted(pmc_fm_dir.glob('*.zip'))
zip_paths

Out[2]:

[PosixPath('data/pmc/oai/pmc_fm/bioinfo.zip'),
 PosixPath('data/pmc/oai/pmc_fm/bmcbioi.zip'),
 PosixPath('data/pmc/oai/pmc_fm/ploscomp.zip')]

In [3]:

# Flatten every author record from every article in every archive into one list.
# The record fields are whatever pubmedpy's extract_authors_from_article returns.
authors = [
    author
    for zip_path in zip_paths
    for _name, article in yield_etrees_from_zip(zip_path)
    for author in extract_authors_from_article(article)
]
author_df = pandas.DataFrame(authors).sort_values(['pmcid', 'position'])
# Split the affiliations column off into its own frame (it is exploded in a
# later cell) before removing it from the author table.
affiliation_df = author_df[["pmcid", "position", "affiliations"]]
author_df = author_df.drop(columns=['affiliations'])
author_df.tail()

Out[3]:

	pmcid	position	fore_name	last_name	corresponding	reverse_position
24041	PMC77394	2	Ferdinando Di	Cunto	0	2
24042	PMC77394	3	Paolo	Provero	1	1
24043	PMC90187	1	Jonas S	Almeida	1	2
24044	PMC90187	2	Susana	Vinga	0	1
24045	PMC99049	1	Harry J	Mangalam	1	1

In [4]:

# Build the affiliations table: one row per (pmcid, position, affiliation),
# with authors lacking an affiliation removed.
# NOTE: this cell rebinds affiliation_df and renames its 'affiliations' column,
# so it expects the frame produced by the author-extraction cell; re-run that
# cell before re-running this one.
affiliation_df = (
    affiliation_df
    .explode('affiliations')
    .rename(columns={"affiliations": "affiliation"})
    .dropna(subset=["affiliation"])
    .loc[:, ["pmcid", "position", "affiliation"]]
)
affiliation_df.head(2)

Out[4]:

	pmcid	position	affiliation
24046	PMC100321	1	1 University of Cologne, Institute of Genetics...
24047	PMC100321	2	1 University of Cologne, Institute of Genetics...

In [5]:

# Print 10 randomly sampled affiliations, one per line.
# random_state is fixed so the same sample is drawn on every run.
sampled_affiliations = affiliation_df.sample(10, random_state=0).affiliation
for affiliation_text in sampled_affiliations:
    print(affiliation_text)

6 Commissariat à l'énergie atomique, iBiTecS, Gif-sur-Yvette, France
1 Department of Computer Science, Princeton University, Princeton, NJ 08544, USA and 2 Lewis-Sigler Institute for Integrative Genomics, Princeton University, Princeton, NJ 08540, USA
1 Bioinformatics Institute (BII), Agency for Science Technology and Research (A*STAR), 30 Biopolis Street, #07-01, Matrix, 138671, 2 Institute of High Performance Computing (IHPC), Agency for Science Technology and Research (A*STAR), 1 Fusionopolis Way, #16-16 Connexis, 138632, 3 Department of Biological Sciences (DBS), National University of Singapore (NUS), 8 Medical Drive 4, 117597, 4 School of Computer Engineering (SCE), Nanyang Technological University (NTU), 50 Nanyang Drive, 637553 and 5 School of Biological Sciences (SBS), Nanyang Technological University (NTU), 60 Nanyang Drive, 637551, Singapore
2 Fogarty International Center, National Institutes of Health, Bethesda, MD, United States of America
2 Department of Mathematics, Rowland Hall, University of California, Irvine, California, United States of America
2 Center for Medical Informatics, Yale University, New Haven, CT 06520, USA
Department of Biology, Carleton University, Ottawa, ON, Canada
1 0000 0001 2106 9910 grid.65499.37 Department of Biostatistics and Computational Biology, Dana-Farber Cancer Institute, Boston, MA 02215 USA
5 Grossman Center for the Statistics of Mind and Center for Theoretical Neuroscience, Columbia University, New York, New York, United States of America
1 Computer Science Division, University of California, Berkeley, Berkeley, California, United States of America

In [6]:

# Count the distinct affiliation strings
affiliation_df["affiliation"].nunique()

Out[6]:

In [7]:

# Count the distinct articles (unique pmcid values) in the author table
author_df["pmcid"].nunique()

Out[7]:

In [8]:

# Per-article total of the `corresponding` flag, i.e. the number of
# corresponding authors on each paper.
n_corresponding = author_df.groupby("pmcid")["corresponding"].sum()
# Articles for which no author is flagged as corresponding.
pmcids_without_corresponding = set(n_corresponding.index[n_corresponding == 0])

In [9]:

# Share of authors at each position (counted from the front) who are
# corresponding, restricted to articles with at least one corresponding
# author and to non-last authors (reverse_position > 1).
has_corresponding = ~author_df["pmcid"].isin(pmcids_without_corresponding)
not_last_author = author_df["reverse_position"] > 1
(
    author_df[has_corresponding & not_last_author]
    .groupby("position")["corresponding"]
    .mean()
    .map(lambda share: f"{share:.1%}")
    .head()
)

Out[9]:

position
1    42.9%
2     7.6%
3     4.6%
4     4.6%
5     5.3%
Name: corresponding, dtype: object

In [10]:

# Share of authors at each reverse position (counted from the end) who are
# corresponding, restricted to articles with at least one corresponding
# author and to non-first authors (position > 1).
has_corresponding = ~author_df["pmcid"].isin(pmcids_without_corresponding)
not_first_author = author_df["position"] > 1
(
    author_df[has_corresponding & not_first_author]
    .groupby("reverse_position")["corresponding"]
    .mean()
    .map(lambda share: f"{share:.1%}")
    .head()
)

Out[10]:

reverse_position
1    61.9%
2    12.4%
3     4.1%
4     3.0%
5     3.5%
Name: corresponding, dtype: object

In [11]:

# Distribution of corresponding-author counts: number of articles having
# 0, 1, 2, ... corresponding authors, ordered by that count.
articles_per_count = n_corresponding.value_counts()
articles_per_count.sort_index()

Out[11]:

0       371
1     17529
2      3267
3       314
4        62
5        19
6         7
7         2
8         2
9         6
10        2
11        1
14        2
15        1
17        1
21        1
Name: corresponding, dtype: int64

In [12]:

# Sanity check: list a few articles that have no corresponding author.
# reset_index comes first so the displayed row labels are positions in the
# full per-article table.
counts_by_article = n_corresponding.reset_index()
counts_by_article[counts_by_article["corresponding"] == 0].head()

Out[12]:

	pmcid	corresponding
105	PMC1183510	0
106	PMC1183511	0
107	PMC1183512	0
119	PMC1185644	0
160	PMC1193992	0

In [13]:

# Sanity check: list the articles with 10 or more corresponding authors.
# reset_index comes first so the displayed row labels are positions in the
# full per-article table.
counts_by_article = n_corresponding.reset_index()
counts_by_article[counts_by_article["corresponding"] >= 10]

Out[13]:

	pmcid	corresponding
9078	PMC3463115	15
9349	PMC3509495	14
9393	PMC3519461	17
9583	PMC3546797	10
9719	PMC3570207	11
10363	PMC3694659	10
15564	PMC5001208	21
17344	PMC5647556	14

In [14]:

# Write the author and affiliation tables as tab-separated files without the
# index column. No explicit compression argument is passed; the '.xz' suffix
# is left for pandas to interpret.
for frame, output_path in [
    (author_df, 'data/pmc/authors.tsv.xz'),
    (affiliation_df, 'data/pmc/affiliations.tsv.xz'),
]:
    frame.to_csv(output_path, index=False, sep='\t')

Extract authors from PMC-OAI frontmatter <article> records¶

Extract authors from PMC-OAI frontmatter `<article>` records¶