#!/usr/bin/env python
# coding: utf-8

# # Crossref Citation Stats
#
# ## Data Provenance
#
# The data was retrieved in August 2019 via the [Crossref API](http://api.crossref.org/works) (the download should include everything at least up to 05 Aug 2019, having started on the 06th and taking over a week). ([The raw responses LZMA zipped](https://doi.org/10.6084/m9.figshare.9751865) take about 49 GB disk space.)
#
# ## Data Preprocessing
#
# The data was then preprocessed to calculate some high-level stats over the data, which are then analysed in this notebook.
#
# The files are available here:
# * [crossref-works-summaries-stat.tsv.gz](https://storage.googleapis.com/elife-ml/citations/by-date/2019-08-06/crossref-works-summaries-stat.tsv.gz) - Overall summary stats
# * [crossref-works-summaries-by-type-and-publisher-stat.tsv.gz](https://storage.googleapis.com/elife-ml/citations/by-date/2019-08-06/crossref-works-summaries-by-type-and-publisher-stat.tsv.gz) - Summary stats grouped by _type_ and _publisher_
# * [crossref-works-reference-stat.tsv.gz](https://storage.googleapis.com/elife-ml/citations/by-date/2019-08-06/crossref-works-reference-stat.tsv.gz) - Stats relating to open access (oa) references
#
# Related, but not used for the stats in this notebook:
# * [citation links](https://doi.org/10.6084/m9.figshare.9751988)
#
# ## Notes
#
# The *reference\_count* and *referenced\_by\_count* are counts provided by Crossref. *num\_references* are the number of references that are actually accessible via the API (i.e. oa).
#
# ## Scripts
#
# The scripts can be found in [this repository](https://github.com/elifesciences/datacapsule-crossref), see the [README](https://github.com/elifesciences/datacapsule-crossref/blob/analysis/README.md). Using those scripts the data can be updated (but it will take some time to retrieve all of the Crossref data).
# # Link to this notebook: https://elifesci.org/crossref-data-notebook

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

import os
import json

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from IPython.display import Markdown, display

# In[3]:

plt.style.use('ggplot')


def printmd(string):
    """Render *string* as Markdown in the notebook output."""
    display(Markdown(string))


# In[4]:

data_path = '../data'
summary_stats_filename = 'crossref-works-summaries-stat.tsv.gz'
summary_by_type_and_publisher_stats_filename = 'crossref-works-summaries-by-type-and-publisher-stat.tsv.gz'
reference_stats_filename = 'crossref-works-reference-stat.tsv.gz'


def _to_numeric_where_possible(df):
    """Convert each column of *df* that parses cleanly to a numeric dtype.

    Behaviour-preserving replacement for
    ``df.apply(pd.to_numeric, errors='ignore')`` — ``errors='ignore'`` is
    deprecated (and later removed) in pandas. Columns that cannot be parsed
    (e.g. free-text columns) are left untouched. Mutates and returns *df*.
    """
    for column in df.columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            pass  # non-numeric column — keep as-is
    return df


# In[5]:

df_summary = pd.read_csv(os.path.join(data_path, summary_stats_filename), sep='\t')
# drop non-numeric stat value 'type' and convert to numeric values
df_summary = _to_numeric_where_possible(df_summary[df_summary['stat'] != 'type'])
df_summary = df_summary.set_index('stat')
df_summary

# In[6]:

print('total work count: {:,}'.format(int(df_summary['reference_count']['count'])))

# In[7]:

DEFAULT_NUMBER_FORMAT = '{:,.0f}'
DEFAULT_PERCENTAGE_FORMAT = '{:,.2f}%'


# plain functions instead of assigned lambdas (PEP 8, E731)
def format_count(c, number_format=DEFAULT_NUMBER_FORMAT):
    """Format a count with thousands separators (no decimals by default)."""
    return number_format.format(c)


def format_percentage(c, number_format=DEFAULT_PERCENTAGE_FORMAT):
    """Format a percentage with two decimals and a trailing '%'."""
    return number_format.format(c)


def apply_columns(df, columns, f):
    """Apply *f* element-wise to each of *columns* in *df* (mutates *df*)."""
    for c in columns:
        df[c] = df[c].apply(f)
    return df


def format_columns(df):
    """Format every column of *df* for display (mutates *df*).

    Columns named ``p_*`` are formatted as percentages, all others as counts.
    """
    for c in df.columns:
        if str(c).startswith('p_'):
            df[c] = df[c].apply(format_percentage)
        else:
            df[c] = df[c].apply(format_count)
    return df


def show_counts(counts, title='', number_format=DEFAULT_NUMBER_FORMAT, **kwargs):
    """Bar-plot *counts* with each bar annotated by its formatted value.

    *counts* is either a Series or a list of ``[label, count]`` pairs;
    extra keyword arguments are forwarded to ``Series.plot``.
    """
    if isinstance(counts, list):
        df = pd.DataFrame(counts, columns=['what', 'count']).set_index('what')['count']
    else:
        df = counts
    ax = df.plot(kind='bar', **kwargs)
    for p in ax.patches:
        ax.annotate(
            format_count(p.get_height(), number_format=number_format),
            xy=(p.get_x() + p.get_width() / 2, p.get_height()),
            xycoords="data", va="bottom", ha="center"
        )
    ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: format_count(y)))
    ax.axes.get_xaxis().set_label_text(title)
    # leave headroom so the bar annotations are not clipped at the top
    ax.margins(y=0.2)


# In[8]:

printmd('### Work counts')
printmd('Out of the *{:,}* works with references in Crossref, **{:.2f}%** are open (*{:,}*).'.format(
    int(df_summary['reference_count']['count_non_zero']),
    100.0 * df_summary['has_references']['count_non_zero'] / df_summary['reference_count']['count_non_zero'],
    int(df_summary['has_references']['count_non_zero'])
))
show_counts([
    ['extracted', df_summary['has_references']['count']],
    ['has ref count', df_summary['reference_count']['count_non_zero']],
    ['oa references', df_summary['has_references']['count_non_zero']]
], 'works')

# In[9]:

printmd('### Reference counts')
printmd('Out of the *{:,}* references in Crossref, **{:.2f}%** are open (*{:,}*).'.format(
    int(df_summary['reference_count']['sum']),
    100.0 * df_summary['num_references']['sum'] / df_summary['reference_count']['sum'],
    int(df_summary['num_references']['sum'])
))
show_counts([
    ['as per Crossref', df_summary['reference_count']['sum']],
    ['oa available', df_summary['num_references']['sum']],
    ['with dois', df_summary['num_references']['sum'] - df_summary['num_citations_without_doi']['sum']],
    ['missing dois', df_summary['num_citations_without_doi']['sum']],
    ['duplicate dois', df_summary['num_duplicate_citation_dois']['sum']]
], 'references')

# In[10]:

printmd('### Reference stats')
df_reference_stat = pd.read_csv(os.path.join(data_path, reference_stats_filename), sep='\t')
# 'examples' holds JSON-encoded pairs; second element is the value without its source
df_reference_stat['examples'] = df_reference_stat['examples'].map(json.loads)
df_reference_stat['examples_without_source'] = df_reference_stat['examples'].map(lambda x: [y[1] for y in x])
df_reference_stat = df_reference_stat.sort_values('count', ascending=False)
df_reference_stat.drop('examples', axis=1).head(10)

# In[11]:

printmd('### Reference stats are aggregated by different \'types\'')
print('\n'.join(sorted(set(df_reference_stat['type'].values))))

# In[12]:

printmd('### Count of referenced work by key combinations (populated fields in reference)')
df_reference_key_combination_stat = (
    df_reference_stat[df_reference_stat['type'] == 'key_combination']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_reference_key_combination_stat['count'].head(10), figsize=(20, 4))

# In[13]:

printmd('### Count of referenced work by year, sorted descending by year')
df_reference_year_stat = (
    df_reference_stat[df_reference_stat['type'] == 'year']
    .drop('type', axis=1)
    .sort_values('key', ascending=False)
    .set_index('key')
)
df_reference_year_stat = df_reference_year_stat[df_reference_year_stat['count'] > 1000]
show_counts(df_reference_year_stat['count'].head(150), figsize=(20, 4))

# In[14]:

set(df_reference_stat['type'])

# In[15]:

printmd('### Count of non-oa work by publisher, sorted descending')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
df_temp_total = (
    df_reference_stat[df_reference_stat['type'] == 'total_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
print((df_temp['count'].head(10) / df_temp_total['count']).dropna())
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# In[16]:

# NOTE(review): the heading says '>10k works' but the filter below keeps
# publishers with count > 100,000 — confirm which threshold is intended.
printmd('### Percentage of non-oa work by publisher (>10k works), sorted descending by percentage')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
df_temp2 = (
    df_reference_stat[df_reference_stat['type'] == 'total_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
df_temp = df_temp[df_temp['count'] > 100000]
df_temp3 = df_temp.copy()
# ratio of non-oa references to the publisher's total
df_temp3['count'] = df_temp3['count'] / df_temp2['count']
print(df_temp3['count'].sort_values(ascending=False).head())
show_counts(
    df_temp3['count'].sort_values(ascending=False).head(10),
    number_format='{:,.5f}',
    figsize=(20, 4)
)

# In[17]:

printmd('### Non-oa works by container title')
df_temp = (
    # 'countainer' is how the key is spelled in the data file — do not "fix"
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_countainer_title']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# In[18]:

printmd('### Non-oa works by first subject area')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_first_subject_area']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# In[19]:

printmd('### Non-oa works by creation date (year)')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_created']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# ## More detailed analysis on the summaries by type and publisher

# In[20]:

df_summary_by_type_and_publisher = pd.read_csv(
    os.path.join(data_path, summary_by_type_and_publisher_stats_filename), sep='\t'
).rename(columns={
    'reference_count': 'reference_count_crossref',
    'num_references': 'reference_count_oa'
})
# fill na type and publisher with blank (otherwise causes issues with groupby)
df_summary_by_type_and_publisher[['type', 'publisher']] = (
    df_summary_by_type_and_publisher[['type', 'publisher']].fillna('')
)
# drop non-numeric stat value 'type' and convert to numeric values
df_summary_by_type_and_publisher = _to_numeric_where_possible(
    df_summary_by_type_and_publisher[df_summary_by_type_and_publisher['stat'] != 'type']
)
df_summary_by_type_and_publisher.head(3)

# In[21]:

printmd('### Work counts by type and publisher')
df_summary_by_type_and_publisher_work_counts = df_summary_by_type_and_publisher[
    df_summary_by_type_and_publisher['stat'] == 'count'
].groupby(['type', 'publisher'])['doi'].sum().sort_values(ascending=False).to_frame('work_count')
df_summary_by_type_and_publisher_non_zero_counts = df_summary_by_type_and_publisher[
    df_summary_by_type_and_publisher['stat'] == 'count_non_zero'
].groupby(['type', 'publisher'])[[
    'reference_count_crossref', 'reference_count_oa'
]].sum().rename(columns={
    'reference_count_crossref': 'has_nonzero_reference_count_crossref',
    'reference_count_oa': 'has_nonzero_reference_count_oa'
})
df_summary_by_type_and_publisher_work_counts = df_summary_by_type_and_publisher_work_counts.merge(
    df_summary_by_type_and_publisher_non_zero_counts,
    how='outer', left_index=True, right_index=True
).astype(int)
# sanity check (`.iloc[0]` replaces the deprecated positional `sum()[0]`)
assert int(df_summary_by_type_and_publisher_work_counts.sum().iloc[0]) == int(df_summary['has_references']['count'])
df_summary_by_type_and_publisher_work_counts['p_oa'] = (
    100.0 * df_summary_by_type_and_publisher_work_counts['has_nonzero_reference_count_oa']
    / df_summary_by_type_and_publisher_work_counts['has_nonzero_reference_count_crossref']
)
format_columns(df_summary_by_type_and_publisher_work_counts.sort_values(
    'work_count', ascending=False
).head())

# In[22]:

printmd('### Total work counts')
(
    df_summary_by_type_and_publisher_work_counts.reset_index()
    .drop(['type', 'publisher'], axis=1)
    .sum(axis=0)
    .to_frame('total')
    .applymap(format_count)
)

# In[23]:

printmd('### Work counts by type')
df_temp = (
    df_summary_by_type_and_publisher_work_counts.reset_index().groupby('type')
    .sum().sort_values(by='work_count', ascending=False).head()
)


def show_oa_ref_work_percentages_by_type(df):
    """Print, per work type, the total with references and the open (oa) share."""
    for work_type, has_nonzero_reference_count_crossref, has_nonzero_reference_count_oa in zip(
        df.index,
        df['has_nonzero_reference_count_crossref'],
        df['has_nonzero_reference_count_oa']
    ):
        # skip types with no reference counts to avoid division by zero
        if not has_nonzero_reference_count_crossref:
            continue
        printmd('Out of the *{:,}* works of type *{:}* with references in Crossref, **{:.2f}%** are open (*{:,}*).'.format(
            int(has_nonzero_reference_count_crossref),
            work_type,
            100.0 * has_nonzero_reference_count_oa / has_nonzero_reference_count_crossref,
            int(has_nonzero_reference_count_oa)
        ))


df_temp['p_oa'] = (
    100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
)
show_oa_ref_work_percentages_by_type(df_temp)
format_columns(df_temp)

# In[24]:

printmd('### Journal vs non-journal-article type')


def type_to_journal_non_journal_article(df):
    """Return a copy of *df* with every type other than 'journal-article'
    collapsed into 'non-journal-article'."""
    df = df.copy()
    df['type'] = df['type'].map(lambda x: 'non-journal-article' if x != 'journal-article' else x)
    return df


df_temp = (
    type_to_journal_non_journal_article(
        df_summary_by_type_and_publisher_work_counts.reset_index()
    )
    .groupby('type')
    .sum().sort_values(by='work_count', ascending=False).head()
)
df_temp['p_oa'] = 100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
show_oa_ref_work_percentages_by_type(df_temp)
format_columns(df_temp)

# In[25]:

printmd('### By publisher')


def select_top_publishers(df, count=5):
    """Keep only rows belonging to the *count* publishers with the most works."""
    top_publishers = (
        df.groupby('publisher').sum()
        .sort_values(by='work_count', ascending=False)
        .index.values
    )
    return df[df['publisher'].isin(top_publishers[:count])]


df_temp = (
    type_to_journal_non_journal_article(
        select_top_publishers(df_summary_by_type_and_publisher_work_counts.reset_index())
    )
    .groupby(['type', 'publisher'])
    .sum().sort_values(by='work_count', ascending=False)
)
df_temp['p_oa'] = (
    100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
)
format_columns(df_temp)

# In[26]:

printmd('### By normalised publisher')


def normalise_publisher(df):
    """Merge the various Elsevier imprints into a single 'Elsevier (all)' publisher."""
    elsevier_names = [
        s for s in df['publisher'].unique()
        if not pd.isnull(s) and 'elsevier' in s.lower()
    ]
    return df.replace({
        'publisher': {s: 'Elsevier (all)' for s in elsevier_names}
    })


df_temp = (
    select_top_publishers(normalise_publisher(type_to_journal_non_journal_article(
        df_summary_by_type_and_publisher_work_counts.reset_index()
    )))
    .groupby(['type', 'publisher'])
    .sum().sort_values(by='work_count', ascending=False)
)
df_temp['p_oa'] = (
    100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
)
format_columns(df_temp)

# In[27]:

printmd('### Sanity check, the grouped summary should equal the ungrouped summary')
df_summary_by_type_and_publisher_sums = df_summary_by_type_and_publisher[
    df_summary_by_type_and_publisher['stat'] == 'sum'
]
total_reference_count_according_to_crossref = df_summary_by_type_and_publisher_sums['reference_count_crossref'].sum()
assert total_reference_count_according_to_crossref == df_summary['reference_count']['sum']
(
    df_summary_by_type_and_publisher_sums[['reference_count_crossref', 'reference_count_oa']]
    .sum()
    .map(format_count)
    .to_frame('sum')
)

# In[28]:

printmd('### Total references by type')
df_temp = (
    df_summary_by_type_and_publisher_sums.groupby('type')
    [['reference_count_crossref', 'reference_count_oa']]
    .sum()
    .sort_values('reference_count_crossref', ascending=False)
    .head()
)
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)

# In[29]:

printmd('### Publishers with the highest amount of references')
df_temp = (
    df_summary_by_type_and_publisher_sums.groupby('publisher')
    .sum()[['reference_count_crossref', 'reference_count_oa']]
    .sort_values(by='reference_count_crossref', ascending=False)
).head()
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)

# In[30]:

# typo fix in displayed heading: 'mutliple' -> 'multiple'
printmd('### Elsevier uses multiple names, treat them as one')
elsevier_names = [
    s for s in df_summary_by_type_and_publisher_sums['publisher'].unique()
    if not pd.isnull(s) and 'elsevier' in s.lower()
]
printmd('* %s' % '\n* '.join(elsevier_names))
# reuse normalise_publisher (In[26]) instead of duplicating the replace
# logic — it derives the same Elsevier name list internally
df_summary_by_type_and_publisher_sums_merged_publisher = normalise_publisher(
    df_summary_by_type_and_publisher_sums
)

# In[31]:

printmd('### Publishers with the highest amount of references - this time with merged publisher names')
df_temp = (
    df_summary_by_type_and_publisher_sums_merged_publisher.groupby('publisher')
    .sum()[['reference_count_crossref', 'reference_count_oa']]
    .sort_values(by='reference_count_crossref', ascending=False)
).head()
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)

# In[32]:

printmd('### Publishers with the highest amount of references (by type) - this time with merged publisher names')
df_temp = (
    df_summary_by_type_and_publisher_sums_merged_publisher.groupby(['type', 'publisher'])
    .sum()[['reference_count_crossref', 'reference_count_oa']]
    .sort_values(by='reference_count_crossref', ascending=False)
).head()
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)