import os
print(os.getcwd())
/Users/ksb/computation/science/gender-comp-bio/notebooks
os.chdir("../data/pubdata")
os.listdir()
['arxiv-bio-data.xml', 'arxiv-data.xml', 'bio.xml', 'comp.xml']
Next, we'll need to parse the XML files. Since several of the data files are huge, we don't want to use the python xml module, which would require loading the entire contents of the file into memory. Instead, we'll use lxml.etree.iterparse(), which will allow us to grab one article at a time, grab its info, then clear it from memory.
import lxml.etree as ET
import datetime
# Demonstration: stream the first few PubmedArticle records from comp.xml.
# iterparse() fires an "end" event as each <PubmedArticle> element is
# completed, so the whole file is never held in memory; element.clear()
# releases each article once we've pulled its fields out.
i = 0
for event, element in ET.iterparse('comp.xml', tag="PubmedArticle", events=("end",)):
    i += 1
    pmid = element.xpath('.//PMID')[0].text
    # Assemble a datetime.date from the DateCreated year/month/day fields.
    pubdate = datetime.date(
        int(element.xpath('.//DateCreated/Year')[0].text),   # year
        int(element.xpath('.//DateCreated/Month')[0].text),  # month
        int(element.xpath('.//DateCreated/Day')[0].text),    # day
    )
    # Journal, title and abstract may be absent from a record; fall back
    # to None rather than raising.
    journal = element.xpath('.//Journal//ISOAbbreviation')
    journal = journal[0].text if journal else None
    title = element.xpath('.//Article/ArticleTitle')
    title = title[0].text if title else None
    abstract = element.xpath('.//Article/Abstract')
    abstract = abstract[0].text if abstract else None
    authors = []
    for name in element.xpath('.//Article/AuthorList/Author'):
        try:
            # (LastName, ForeName) pairs; skip collective or incomplete
            # author entries that lack both child elements.
            authors.append((name[0].text, name[1].text))
        except IndexError:
            pass
    print("{}, {}:{}".format(pmid, journal, authors))
    element.clear()
    if i > 5:
        break
26605382, IEEE/ACM Trans Comput Biol Bioinform:[('Huang', 'Yufei'), ('Chen', 'Yidong'), ('Qian', 'Xiaoning')] 26357062, IEEE/ACM Trans Comput Biol Bioinform:[('Wang', 'Haiying'), ('Zheng', 'Huiru')] 26357061, IEEE/ACM Trans Comput Biol Bioinform:[('Loohuis', 'Loes Olde'), ('Witzel', 'Andreas'), ('Mishra', 'Bud')] 26357060, IEEE/ACM Trans Comput Biol Bioinform:[('Kobayashi', 'Koichi'), ('Hiraishi', 'Kunihiko')] 26357059, IEEE/ACM Trans Comput Biol Bioinform:[('Ahmed', 'Hasin Afzal'), ('Mahanta', 'Priyakshi'), ('Bhattacharyya', 'Dhruba Kumar'), ('Kalita', 'Jugal Kumar')] 26357058, IEEE/ACM Trans Comput Biol Bioinform:[('Disanto', 'Filippo'), ('Rosenberg', 'Noah A')]
Just because I need the practice, I'm going to set up an Article
class to hold the data and make working with it easier, and an Author
class that we can use to deal with author names
class Article(object):
    """Lightweight record of a single publication's metadata.

    Holds the identifier, publication date, journal, title, abstract,
    and a sequence of author objects (or name tuples).
    """

    def __init__(self, article_id, pubdate, journal, title, abstract, authors):
        self.article_id = article_id
        self.pubdate = pubdate
        self.journal = journal
        self.title = title
        self.abstract = abstract
        self.authors = authors

    def __repr__(self):
        return "<Article ID: {}>".format(self.article_id)

    def get_authors(self):
        """Yield each author in list order."""
        yield from self.authors
class Author(object):
    """Normalized author name.

    Splits a free-form forename string into a first given name plus any
    remaining tokens ("initials"); both are None when no forename is given.
    """

    def __init__(self, last_name, first_name=None):
        # isinstance is the idiomatic type check (also accepts str subclasses).
        assert isinstance(last_name, str)
        self.last_name = last_name
        if first_name:
            assert isinstance(first_name, str)
            parts = first_name.split()
            self.first_name = parts[0]
            # BUG FIX: the original wrapped `" ".join(first_name.split()[1:])`
            # in `except IndexError`, but joining an empty slice returns ""
            # and never raises, so initials silently became "" instead of
            # the intended None for single-token forenames.
            self.initials = " ".join(parts[1:]) if len(parts) > 1 else None
        else:
            self.first_name = None
            self.initials = None
And... we can turn the code above into a generator function that yields an Article
for each document
from lxml.etree import iterparse
def _first_text(element, path):
    """Return the .text of the first node matching *path*, or None if absent."""
    hits = element.xpath(path)
    return hits[0].text if hits else None


def iter_parse_pubmed(xml_file):
    """Stream a PubMed XML dump, yielding one Article per PubmedArticle.

    Uses iterparse() so huge files are never fully loaded; each element is
    cleared from memory after its Article is built. The identical
    look-up-or-None logic the original repeated for journal/title/abstract
    is factored into _first_text().
    """
    for event, element in iterparse(xml_file, tag="PubmedArticle", events=("end",)):
        pmid = element.xpath('.//PMID')[0].text
        pubdate = datetime.date(
            int(element.xpath('.//DateCreated/Year')[0].text),   # year
            int(element.xpath('.//DateCreated/Month')[0].text),  # month
            int(element.xpath('.//DateCreated/Day')[0].text),    # day
        )
        journal = _first_text(element, './/Journal//ISOAbbreviation')
        title = _first_text(element, './/Article/ArticleTitle')
        abstract = _first_text(element, './/Article/Abstract')
        authors = []
        for name in element.xpath('.//Article/AuthorList/Author'):
            try:
                # Expect (LastName, ForeName) children; skip entries
                # (e.g. collective names) that lack both.
                authors.append(Author(name[0].text, name[1].text))
            except IndexError:
                pass
        element.clear()
        yield Article(pmid, pubdate, journal, title, abstract, authors)
iter_parse_pubmed('comp.xml')
<generator object iter_parse_pubmed at 0x10a7ee780>
Usage:
# Walk the first handful of articles and pretty-print each one's
# metadata, demonstrating the Article/Author API.
n_shown = 0
for article in iter_parse_pubmed('comp.xml'):
    n_shown += 1
    print(article)
    print(article.pubdate)
    for author in article.get_authors():
        print("{}, {} {}".format(author.last_name, author.first_name, author.initials))
    print()
    if n_shown > 5:
        break
<Article ID: 26605382> 2015-11-24 Huang, Yufei Chen, Yidong Qian, Xiaoning <Article ID: 26357062> 2015-09-11 Wang, Haiying Zheng, Huiru <Article ID: 26357061> 2015-09-11 Loohuis, Loes Olde Witzel, Andreas Mishra, Bud <Article ID: 26357060> 2015-09-11 Kobayashi, Koichi Hiraishi, Kunihiko <Article ID: 26357059> 2015-09-11 Ahmed, Hasin Afzal Mahanta, Priyakshi Bhattacharyya, Dhruba Kumar Kalita, Jugal Kumar <Article ID: 26357058> 2015-09-11 Disanto, Filippo Rosenberg, Noah A
Author position matters, but it matters in sort of a weird way - first author and last author are most important, then decreasing as you work your way in to the middle of the list. But practically, there's not much distinction between 3rd and 4th author (or 3rd from last and 4th from last), so we'll generate scores for first, second, last, penultimate and everyone else. The trick is to avoid index errors if the author list is smaller than 5, so we need to write up some special cases.
def score_authors(author_list):
    """Classify authors by position: first, last, second, penultimate, others.

    Each author is counted exactly once, so a role is only filled when the
    list is long enough for it to name a distinct person (e.g. with two
    authors there is a first and a last, but no "second"). Unfilled roles
    are None; `others` is the list of middle authors, or None when there
    are fewer than five.
    """
    first = author_list[0] if author_list else None
    second = penultimate = last = others = None
    n = len(author_list)
    if n > 1:
        last = author_list[-1]
    if n > 2:
        second = author_list[1]
    if n > 3:
        penultimate = author_list[-2]
    if n > 4:
        others = list(author_list[2:-2])
    return first, last, second, penultimate, others
import pandas as pd

# Build a small demo DataFrame: one row per scored author position for the
# first few articles in comp.xml.
# NOTE: the original version had a stray ')' inside the first pd.Series
# call (a syntax error), silenced every failure with bare `except:`, and
# used DataFrame.append, which was removed in pandas 2.0.
col_names = ["Date", "Journal", "Author Name", "Position"]
rows = []
i = 0
for article in iter_parse_pubmed('comp.xml'):
    i += 1
    first, last, second, penultimate, others = score_authors(article.authors)
    if not first:
        continue  # no authors at all -- nothing to record for this article
    date = str(article.pubdate)

    def add_row(author, position):
        # One labeled row per (article, author-position) pair; the Series
        # name becomes the DataFrame index (the article id).
        rows.append(pd.Series([date, article.journal, author.first_name, position],
                              name=article.article_id, index=col_names))

    add_row(first, "first")
    if last is not None:
        add_row(last, "last")
    if second is not None:
        add_row(second, "second")
    if penultimate is not None:
        add_row(penultimate, "penultimate")
    if others:
        for x in others:
            add_row(x, "other")
    if i > 5:
        break

df = pd.DataFrame(rows, columns=col_names) if rows else pd.DataFrame(columns=col_names)
print(df[1:10])
Date Journal Author Name \ 26605382 2015-11-24 IEEE/ACM Trans Comput Biol Bioinform Xiaoning 26605382 2015-11-24 IEEE/ACM Trans Comput Biol Bioinform Yidong 26357062 2015-09-11 IEEE/ACM Trans Comput Biol Bioinform Haiying 26357062 2015-09-11 IEEE/ACM Trans Comput Biol Bioinform Huiru 26357061 2015-09-11 IEEE/ACM Trans Comput Biol Bioinform Loes 26357061 2015-09-11 IEEE/ACM Trans Comput Biol Bioinform Bud 26357061 2015-09-11 IEEE/ACM Trans Comput Biol Bioinform Andreas 26357060 2015-09-11 IEEE/ACM Trans Comput Biol Bioinform Koichi 26357060 2015-09-11 IEEE/ACM Trans Comput Biol Bioinform Kunihiko Position 26605382 last 26605382 second 26357062 first 26357062 last 26357061 first 26357061 last 26357061 second 26357060 first 26357060 last
def iter_parse_arxiv(xml_file):
    """Stream an arXiv OAI metadata dump, yielding one Article per <record>.

    Mirrors iter_parse_pubmed(): iterparse() keeps memory bounded and each
    element is cleared once consumed. The Article's `journal` slot is
    reused to carry the record's list of arXiv categories.

    Changes from the original: removed the leftover debug print
    ("parsing!") and the unreachable `except IndexError` that wrapped the
    authors.append() call (nothing in that call raises IndexError).
    """
    ns = {
        "a": "http://arxiv.org/OAI/arXiv/",
        "o": "http://www.openarchives.org/OAI/2.0/"}
    for event, element in iterparse(xml_file, tag="{http://www.openarchives.org/OAI/2.0/}record", events=("end",)):
        ident = element.xpath('./o:header/o:identifier', namespaces=ns)[0].text
        # Datestamp is "YYYY-MM-DD"; unpack the parts into a date object.
        pubdate = element.xpath('.//o:datestamp', namespaces=ns)[0].text.split("-")
        pubdate = datetime.date(*[int(d) for d in pubdate])
        authors = []
        for name in element.xpath('.//o:metadata//a:authors/a:author', namespaces=ns):
            last_name = name.xpath('./a:keyname', namespaces=ns)[0].text
            try:
                first_name = name.xpath('./a:forenames', namespaces=ns)[0].text
            except IndexError:
                first_name = None  # some records carry only a keyname
            authors.append(Author(last_name, first_name))
        try:
            title = element.xpath('.//o:metadata//a:title', namespaces=ns)[0].text
        except IndexError:
            title = None
        try:
            abstract = element.xpath('.//o:metadata//a:abstract', namespaces=ns)[0].text
        except IndexError:
            abstract = None
        try:
            cat = element.xpath('.//o:metadata//a:categories', namespaces=ns)[0].text.split(" ")
        except IndexError:
            cat = None
        element.clear()
        yield Article(ident, pubdate, cat, title, abstract, authors)
Now the conclusion - I'll write a function that takes a pubmed xml, parses it using iter_parse_pubmed()
or iter_parse_arxiv()
and score_authors()
, puts the authors into a data frame as shown above, and writes a CSV file. Note: if you want to just parse a pubmed file without going through this notebook, you can use the included xml_parsing.py
script:
$ python xml_parsing.py --pubmed /path/to/pubmed.xml /path/to/output.csv
or
$ python xml_parsing.py --arxiv /path/to/arxiv.xml /path/to/output.csv
def _author_rows(article, col_names):
    """Return one labeled pd.Series per scored author position of *article*."""
    first, last, second, penultimate, others = score_authors(article.authors)
    if not first:
        return []  # articles with no authors contribute no rows
    date = str(article.pubdate)

    def make(author, position):
        # Series name becomes the CSV/DataFrame index (the article id).
        return pd.Series([date, article.journal, author.first_name, position],
                         name=article.article_id, index=col_names)

    rows = [make(first, "first")]
    if last is not None:
        rows.append(make(last, "last"))
    if second is not None:
        rows.append(make(second, "second"))
    if penultimate is not None:
        rows.append(make(penultimate, "penultimate"))
    if others:
        rows.extend(make(x, "other") for x in others)
    return rows


def write_names_to_file(in_file, out_file, pub_type="pubmed"):
    """Parse *in_file* ("pubmed" or "arxiv" XML) and write author rows to CSV.

    Writes the header once, then appends rows in batches of ~1000 articles
    so memory stays bounded on huge inputs.

    Changes from the original: bare `except:` clauses replaced with explicit
    None checks (in _author_rows), and the deprecated DataFrame.append
    (removed in pandas 2.0) replaced with batched DataFrame construction.
    """
    col_names = ["Date", "Journal", "Author Name", "Position"]
    # Write the CSV header once up front.
    with open(out_file, 'w+') as out:
        pd.DataFrame(columns=col_names).to_csv(out, columns=col_names)
    if pub_type == "arxiv":
        articles = iter_parse_arxiv(in_file)
    elif pub_type == "pubmed":
        articles = iter_parse_pubmed(in_file)
    else:
        # Kept as IndexError for backward compatibility with existing
        # callers, though ValueError would be the conventional choice.
        raise IndexError
    pending = []
    counter = 0
    for article in articles:
        rows = _author_rows(article, col_names)
        if not rows:
            continue  # skip author-less articles without advancing counter
        pending.extend(rows)
        # Flush every 1000 processed articles (progress printed as before).
        if counter % 1000 == 0:
            print(counter)
            with open(out_file, 'a+') as out:
                pd.DataFrame(pending, columns=col_names).to_csv(
                    out, columns=col_names, header=False)
            pending = []
        counter += 1
    # Flush whatever remains after the loop.
    with open(out_file, 'a+') as out:
        pd.DataFrame(pending, columns=col_names).to_csv(
            out, columns=col_names, header=False)
Now the tough part - getting genders.
I played around trying to get sexmachine
and GenderComputer
to work, but ran into some issues, and those projects don't seem to be actively maintained, so I thought I'd try genderize.io and gender-api.com. The trouble is that these are web APIs, which take more time than something run locally, and they have a limit on the number of requests you can make. The owners of both of these APIs generously provided me with enough requests to use them for free for this project, but I'll show how to use all three methods.
On to a new notebook...