import time
import urllib2
import datetime
from itertools import ifilter
from collections import Counter, defaultdict
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
import bibtexparser

pd.set_option('mode.chained_assignment','warn')

%matplotlib inline

OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"

def harvest(arxiv="physics:hep-ex"):
    df = pd.DataFrame(columns=("title", "abstract", "categories", "created", "id", "doi"))
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from=2010-01-01&until=2014-12-31&" +
           "metadataPrefix=arXiv&set=%s"%arxiv)
    
    while True:
        print "fetching", url
        try:
            response = urllib2.urlopen(url)
            
        except urllib2.HTTPError, e:
            if e.code == 503:
                to = int(e.hdrs.get("retry-after", 30))
                print "Got 503. Retrying after {0:d} seconds.".format(to)

                time.sleep(to)
                continue
                
            else:
                raise
            
        xml = response.read()

        root = ET.fromstring(xml)

        for record in root.find(OAI+'ListRecords').findall(OAI+"record"):
            arxiv_id = record.find(OAI+'header').find(OAI+'identifier')
            meta = record.find(OAI+'metadata')
            info = meta.find(ARXIV+"arXiv")
            created = info.find(ARXIV+"created").text
            created = datetime.datetime.strptime(created, "%Y-%m-%d")
            categories = info.find(ARXIV+"categories").text

            # if there is more than one DOI use the first one
            # often the second one (if it exists at all) refers
            # to an eratum or similar
            doi = info.find(ARXIV+"doi")
            if doi is not None:
                doi = doi.text.split()[0]
                
            contents = {'title': info.find(ARXIV+"title").text,
                        'id': info.find(ARXIV+"id").text,#arxiv_id.text[4:],
                        'abstract': info.find(ARXIV+"abstract").text.strip(),
                        'created': created,
                        'categories': categories.split(),
                        'doi': doi,
                        }

            df = df.append(contents, ignore_index=True)

        # The list of articles returned by the API comes in chunks of
        # 1000 articles. The presence of a resumptionToken tells us that
        # there is more to be fetched.
        token = root.find(OAI+'ListRecords').find(OAI+"resumptionToken")
        if token is None or token.text is None:
            break

        else:
            url = base_url + "resumptionToken=%s"%(token.text)
            
    return df
    

df = harvest()

df.head()

def bar_chart(items):
    """Make a bar chart showing the count associated with each key
    
    `items` is a list of (key, count) pairs.
    """
    width = 0.5
    ind = np.arange(len(items))
    fig, ax = plt.subplots(figsize=(8,8))
    rects1 = ax.bar(ind, zip(*items)[1], width, color='r')
    ax.set_xticks(ind+width)
    ax.set_xticklabels(zip(*items)[0])
    fig.autofmt_xdate()
    plt.show()

edits_per_year = Counter(df.created.map(lambda x: x.year))
bar_chart(edits_per_year.items())
new_articles = sum(edits_per_year[year] for year in (2010,2011,2012,2013,2014))
print "Unique arXiv IDs edited between 2010 and 2014:", len(df.id.unique())
print "of which %i entries were created in that time period."%(new_articles)

df[df.created<datetime.date(1995,1,1)]

def get_cites(arxiv_id):
    cites = []
    base_url = "http://inspirehep.net/search?p=refersto:%s&of=hx&rg=250&jrec=%i"
    offset = 1
    
    while True:
        print base_url%(arxiv_id, offset)
        response = urllib2.urlopen(base_url%(arxiv_id, offset))
        xml = response.read()
        soup = BeautifulSoup(xml)

        refs = "\n".join(cite.get_text() for cite in soup.findAll("pre"))

        bib_database = bibtexparser.loads(refs)
        if bib_database.entries:
            cites += bib_database.entries
            offset += 250
            
        else:
            break

    return cites

step = 1000
for N in range(0,17):
    print N
    cites = df['id'][N*step:(N+1)*step].map(get_cites)
    df.ix[N*step:(N+1)*step -1,'cited_by'] = cites

store = pd.HDFStore("/Users/thead/git/arxiv-experiments/hep-ex.h5")
#store['df'] = df
#df = store['df']
store.close()

word_bag = " ".join(df.abstract.apply(lambda t: t.lower()))

Counter(word_bag.split()).most_common(n=10)

from nltk.corpus import stopwords

stops = [word for word in stopwords.words('english')]
stops += ["=", "->"]
words = filter(lambda w: w not in stops,
               word_bag.split())
top_twenty = Counter(words).most_common(n=20)

bar_chart(top_twenty)

import nltk.stem as stem

porter = stem.PorterStemmer()
for w in ("measurement", "measurements", "measured", "measure"):
    print w, "->", porter.stem(w)

word_stems = map(lambda w: (porter.stem(w),w), words)
stem2words = defaultdict(set)
for stem, word in word_stems:
    stem2words[stem].add(word)

top_twenty = Counter(w[0] for w in word_stems).most_common(n=20)

bar_chart(top_twenty)

# list all words which correspond to each top twenty stem
for stem,count in top_twenty:
    print stem, "<-", ", ".join(stem2words[stem])

before_2014 = datetime.datetime(2014,1,1)
plt.hist(df[df.created<before_2014].cited_by.map(len),
         bins=200, normed=True, range=(0,200))
plt.xlabel("Number of citations")
plt.ylabel("Fraction")

df['citation_count'] = df.cited_by.map(len)
df[df.created<before_2014]['citation_count'].describe()

df.iloc[df.citation_count.idxmax()]

df.sort('citation_count', ascending=False).head(10)

df.iloc[df[df.created>before_2014].citation_count.idxmax()]