import time
import urllib2
import datetime
from itertools import ifilter
from collections import Counter, defaultdict
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
import bibtexparser

pd.set_option('mode.chained_assignment', 'warn')

# Equivalent of the notebook magic `%matplotlib inline`, written as a plain
# call so the file is still valid Python when it is not run by IPython.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # get_ipython is only defined inside an IPython/Jupyter session

# XML namespace prefixes used by the OAI-PMH envelope and the arXiv payload.
OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"


def harvest(arxiv="physics:hep-ex", start="2010-01-01", until="2014-12-31"):
    """Download arXiv metadata for one OAI set and return it as a DataFrame.

    Parameters
    ----------
    arxiv : str
        Name of the OAI set to harvest.
    start, until : str
        Bounds of the date range, passed as the `from` and `until`
        arguments of the ListRecords request (``YYYY-MM-DD``).

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``title``, ``abstract``,
        ``categories`` (list of str), ``created`` (datetime), ``id`` and
        ``doi`` (first DOI of the record, or None).

    Raises
    ------
    urllib2.HTTPError
        For any HTTP error other than 503; a 503 is retried after the
        delay announced by the server.
    RuntimeError
        If a response contains no ``ListRecords`` element for a reason
        other than "no records match".
    """
    columns = ["title", "abstract", "categories", "created", "id", "doi"]
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from=%s&until=%s&" % (start, until) +
           "metadataPrefix=arXiv&set=%s" % arxiv)

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; appending to a DataFrame per record copies the whole frame each
    # time.
    rows = []

    while True:
        print("fetching %s" % url)
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            if e.code != 503:
                raise
            # The server asks clients to back off; honour its delay.
            to = int(e.hdrs.get("retry-after", 30))
            print("Got 503. Retrying after {0:d} seconds.".format(to))
            time.sleep(to)
            continue

        try:
            xml = response.read()
        finally:
            response.close()

        root = ET.fromstring(xml)
        list_records = root.find(OAI + "ListRecords")
        if list_records is None:
            # OAI-PMH reports problems in an <error> element instead of
            # <ListRecords>; an empty result set is one of those "errors".
            error = root.find(OAI + "error")
            if error is not None and error.get("code") == "noRecordsMatch":
                break
            raise RuntimeError(
                "OAI response has no ListRecords element: %s"
                % (error.text if error is not None else "unknown error"))

        for record in list_records.findall(OAI + "record"):
            meta = record.find(OAI + "metadata")
            if meta is None:
                # Nothing to extract (OAI-PMH delivers deleted records as
                # a bare header without metadata).
                continue
            info = meta.find(ARXIV + "arXiv")

            created = info.find(ARXIV + "created").text
            created = datetime.datetime.strptime(created, "%Y-%m-%d")
            categories = info.find(ARXIV + "categories").text

            # if there is more than one DOI use the first one
            # often the second one (if it exists at all) refers
            # to an erratum or similar
            doi = info.find(ARXIV + "doi")
            if doi is not None:
                doi = doi.text.split()[0]

            rows.append({
                'title': info.find(ARXIV + "title").text,
                'id': info.find(ARXIV + "id").text,
                'abstract': info.find(ARXIV + "abstract").text.strip(),
                'created': created,
                'categories': categories.split(),
                'doi': doi,
            })

        # The list of articles returned by the API comes in chunks of
        # 1000 articles. The presence of a resumptionToken tells us that
        # there is more to be fetched.
        token = list_records.find(OAI + "resumptionToken")
        if token is None or token.text is None:
            break
        url = base_url + "resumptionToken=%s" % token.text

    return pd.DataFrame(rows, columns=columns)


df = harvest()
df.head()


def bar_chart(items):
    """Make a bar chart showing the count associated with each key

    `items` is a list of (key, count) pairs.
    """
    # Materialise once so any iterable of pairs (including an empty one)
    # is accepted.
    items = list(items)
    labels = [key for key, _ in items]
    counts = [count for _, count in items]

    width = 0.5
    ind = np.arange(len(items))
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.bar(ind, counts, width, color='r')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(labels)
    fig.autofmt_xdate()
    plt.show()


edits_per_year = Counter(df.created.map(lambda x: x.year))
# Sort by year so the bars appear in chronological order rather than in
# dictionary order.
bar_chart(sorted(edits_per_year.items()))

new_articles = sum(edits_per_year[year]
                   for year in (2010, 2011, 2012, 2013, 2014))
print("Unique arXiv IDs edited between 2010 and 2014: %d"
      % len(df.id.unique()))
print("of which %i entries were created in that time period."
      % (new_articles))

# NOTE(review): the source is damaged at this point. Only the residue below
# survives; whatever followed `df.created` is missing -- apparently
# including the code that defined `word_bag` and `stops`, which the next
# statement uses. Restore it from the original notebook.
#
#     df[df.created"]

words = [w for w in word_bag.split() if w not in stops]
top_twenty = Counter(words).most_common(n=20)
bar_chart(top_twenty)

import nltk.stem as stem

porter = stem.PorterStemmer()
for w in ("measurement", "measurements", "measured", "measure"):
    print("%s -> %s" % (w, porter.stem(w)))

# Pair every word with its stem and remember which surface forms map to
# each stem.
word_stems = [(porter.stem(w), w) for w in words]
stem2words = defaultdict(set)
for stem_key, word in word_stems:  # not named `stem`: that is the nltk module
    stem2words[stem_key].add(word)

top_twenty = Counter(w[0] for w in word_stems).most_common(n=20)
bar_chart(top_twenty)

# list all words which correspond to each top twenty stem
for stem_key, _ in top_twenty:
    print("%s <- %s" % (stem_key, ", ".join(stem2words[stem_key])))

before_2014 = datetime.datetime(2014, 1, 1)

# NOTE(review): the final statement is damaged as well (unbalanced brackets,
# and it reads a `citation_count` column that `harvest` does not create).
# The residue is kept verbatim until it can be restored from the original
# notebook:
#
#     plt.hist(df[df.createdbefore_2014].citation_count.idxmax()]