<article>
records¶
import pathlib
import pandas
from pubmedpy.xml import yield_etrees_from_zip
from pubmedpy.pmc_oai import extract_authors_from_article
# Locate the zip archives to process, sorted so the processing order
# is deterministic across runs.
pmc_fm_dir = pathlib.Path('data/pmc/oai/pmc_fm')
zip_paths = sorted(pmc_fm_dir.glob('*.zip'))
zip_paths
[PosixPath('data/pmc/oai/pmc_fm/bioinfo.zip'), PosixPath('data/pmc/oai/pmc_fm/bmcbioi.zip'), PosixPath('data/pmc/oai/pmc_fm/ploscomp.zip')]
# Accumulate the author records extracted from every article
# in every zip archive.
authors = []
for zip_path in zip_paths:
    for name, article in yield_etrees_from_zip(zip_path):
        authors += extract_authors_from_article(article)

# One row per author record, ordered by article and then author position.
author_df = pandas.DataFrame(authors).sort_values(['pmcid', 'position'])

# Keep the affiliations aside in their own frame (keyed by pmcid/position)
# before removing that column from the author table.
affiliation_df = author_df[["pmcid", "position", "affiliations"]]
author_df = author_df.drop(columns=['affiliations'])
author_df.tail()
pmcid | position | fore_name | last_name | corresponding | reverse_position | |
---|---|---|---|---|---|---|
24041 | PMC77394 | 2 | Ferdinando Di | Cunto | 0 | 2 |
24042 | PMC77394 | 3 | Paolo | Provero | 1 | 1 |
24043 | PMC90187 | 1 | Jonas S | Almeida | 1 | 2 |
24044 | PMC90187 | 2 | Susana | Vinga | 0 | 1 |
24045 | PMC99049 | 1 | Harry J | Mangalam | 1 | 1 |
# Create the affiliations table.
# explode() gives each element of a list-like `affiliations` value its own
# row (presumably one affiliation string per element -- confirm against
# pubmedpy); empty list-likes become NaN and are removed by the dropna below.
affiliation_df = affiliation_df.explode('affiliations')
affiliation_df = affiliation_df.rename(columns={"affiliations": "affiliation"})
affiliation_df = affiliation_df[["pmcid", "position", "affiliation"]]
affiliation_df = affiliation_df.dropna(subset=["affiliation"])
affiliation_df.head(2)
pmcid | position | affiliation | |
---|---|---|---|
24046 | PMC100321 | 1 | 1 University of Cologne, Institute of Genetics... |
24047 | PMC100321 | 2 | 1 University of Cologne, Institute of Genetics... |
# Print 10 randomly sampled affiliations, one per line
# (fixed seed so the sample is reproducible).
print(*affiliation_df["affiliation"].sample(10, random_state=0), sep='\n')
6 Commissariat à l'énergie atomique, iBiTecS, Gif-sur-Yvette, France 1 Department of Computer Science, Princeton University, Princeton, NJ 08544, USA and 2 Lewis-Sigler Institute for Integrative Genomics, Princeton University, Princeton, NJ 08540, USA 1 Bioinformatics Institute (BII), Agency for Science Technology and Research (A*STAR), 30 Biopolis Street, #07-01, Matrix, 138671, 2 Institute of High Performance Computing (IHPC), Agency for Science Technology and Research (A*STAR), 1 Fusionopolis Way, #16-16 Connexis, 138632, 3 Department of Biological Sciences (DBS), National University of Singapore (NUS), 8 Medical Drive 4, 117597, 4 School of Computer Engineering (SCE), Nanyang Technological University (NTU), 50 Nanyang Drive, 637553 and 5 School of Biological Sciences (SBS), Nanyang Technological University (NTU), 60 Nanyang Drive, 637551, Singapore 2 Fogarty International Center, National Institutes of Health, Bethesda, MD, United States of America 2 Department of Mathematics, Rowland Hall, University of California, Irvine, California, United States of America 2 Center for Medical Informatics, Yale University, New Haven, CT 06520, USA Department of Biology, Carleton University, Ottawa, ON, Canada 1 0000 0001 2106 9910 grid.65499.37 Department of Biostatistics and Computational Biology, Dana-Farber Cancer Institute, Boston, MA 02215 USA 5 Grossman Center for the Statistics of Mind and Center for Theoretical Neuroscience, Columbia University, New York, New York, United States of America 1 Computer Science Division, University of California, Berkeley, Berkeley, California, United States of America
# Count the distinct affiliation values
affiliation_df["affiliation"].nunique()
52939
# Total number of articles (distinct pmcid values in the author table)
author_df["pmcid"].nunique()
21587
# Number of corresponding authors per article: sum of the `corresponding`
# column within each pmcid.
n_corresponding = author_df.groupby("pmcid")["corresponding"].sum()

# pmcids of the articles whose corresponding-author count is zero
pmcids_without_corresponding = {
    pmcid for pmcid, n_flagged in n_corresponding.items() if n_flagged == 0
}
# Probability that the author at a given position is corresponding,
# restricted to articles with at least one corresponding author
# and to authors who are not the last author (reverse_position > 1).
# Shown for the first five positions, formatted as percentages.
(
    author_df
    .loc[lambda df: ~df["pmcid"].isin(pmcids_without_corresponding)]
    .loc[lambda df: df["reverse_position"] > 1]
    .groupby("position")["corresponding"]
    .mean()
    .head()
    .map("{:.1%}".format)
)
position 1 42.9% 2 7.6% 3 4.6% 4 4.6% 5 5.3% Name: corresponding, dtype: object
# Probability that the author at a given reverse position is corresponding,
# restricted to articles with at least one corresponding author
# and to authors who are not the first author (position > 1).
# Shown for the first five reverse positions, formatted as percentages.
(
    author_df
    .loc[lambda df: ~df["pmcid"].isin(pmcids_without_corresponding)]
    .loc[lambda df: df["position"] > 1]
    .groupby("reverse_position")["corresponding"]
    .mean()
    .head()
    .map("{:.1%}".format)
)
reverse_position 1 61.9% 2 12.4% 3 4.1% 4 3.0% 5 3.5% Name: corresponding, dtype: object
# Distribution of corresponding-author counts: how many articles have
# 0, 1, 2, ... corresponding authors, ordered by that count.
n_corresponding.value_counts(sort=False).sort_index()
0 371 1 17529 2 3267 3 314 4 62 5 19 6 7 7 2 8 2 9 6 10 2 11 1 14 2 15 1 17 1 21 1 Name: corresponding, dtype: int64
# Testing: show a few articles that have no corresponding author at all
n_corresponding.reset_index().loc[lambda df: df["corresponding"] == 0].head()
pmcid | corresponding | |
---|---|---|
105 | PMC1183510 | 0 |
106 | PMC1183511 | 0 |
107 | PMC1183512 | 0 |
119 | PMC1185644 | 0 |
160 | PMC1193992 | 0 |
# Testing: show the articles with 10 or more corresponding authors
n_corresponding.reset_index().loc[lambda df: df["corresponding"] >= 10]
pmcid | corresponding | |
---|---|---|
9078 | PMC3463115 | 15 |
9349 | PMC3509495 | 14 |
9393 | PMC3519461 | 17 |
9583 | PMC3546797 | 10 |
9719 | PMC3570207 | 11 |
10363 | PMC3694659 | 10 |
15564 | PMC5001208 | 21 |
17344 | PMC5647556 | 14 |
# Export both tables as tab-separated files without the index column.
# The .xz suffix lets pandas infer xz compression for the output.

# Author table
author_df.to_csv('data/pmc/authors.tsv.xz', sep='\t', index=False)

# Affiliation table
affiliation_df.to_csv('data/pmc/affiliations.tsv.xz', sep='\t', index=False)