#!/usr/bin/env python
# coding: utf-8

# # Crossref Citation Stats
#
# ## Data Provenance
#
# The data was retrieved in August 2019 via the [Crossref API](http://api.crossref.org/works) (the download should include everything at least up to 05 Aug 2019, having started on the 06th and taking over a week). ([The raw responses LZMA zipped](https://doi.org/10.6084/m9.figshare.9751865) take about 49 GB disk space.)
#
# ## Data Preprocessing
#
# The data was then preprocessed to calculate some high-level stats over the data, which are then analysed in this notebook.
#
# The files are available here:
# * [crossref-works-summaries-stat.tsv.gz](https://storage.googleapis.com/elife-ml/citations/by-date/2019-08-06/crossref-works-summaries-stat.tsv.gz) - Overall summary stats
# * [crossref-works-summaries-by-type-and-publisher-stat.tsv.gz](https://storage.googleapis.com/elife-ml/citations/by-date/2019-08-06/crossref-works-summaries-by-type-and-publisher-stat.tsv.gz) - Summary stats grouped by _type_ and _publisher_
# * [crossref-works-reference-stat.tsv.gz](https://storage.googleapis.com/elife-ml/citations/by-date/2019-08-06/crossref-works-reference-stat.tsv.gz) - Stats relating to open access (oa) references
#
# Related, but not used for the stats in this notebook:
# * [citation links](https://doi.org/10.6084/m9.figshare.9751988)
#
# ## Notes
#
# The *reference\_count* and *referenced\_by\_count* are counts provided by Crossref. *num\_references* are the number of references that are actually accessible via the API (i.e. oa).
#
# ## Scripts
#
# The scripts can be found in [this repository](https://github.com/elifesciences/datacapsule-crossref), see the [README](https://github.com/elifesciences/datacapsule-crossref/blob/analysis/README.md). Using those scripts the data can be updated (but it will take some time to retrieve all of the Crossref data).
# # Link to this notebook: https://elifesci.org/crossref-data-notebook

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

import os
import json

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from IPython.display import Markdown, display

# In[3]:

plt.style.use('ggplot')


def printmd(string):
    """Render *string* as Markdown in the notebook output."""
    display(Markdown(string))


# In[4]:

data_path = '../data'
summary_stats_filename = 'crossref-works-summaries-stat.tsv.gz'
summary_by_type_and_publisher_stats_filename = 'crossref-works-summaries-by-type-and-publisher-stat.tsv.gz'
reference_stats_filename = 'crossref-works-reference-stat.tsv.gz'


def _to_numeric_where_possible(df):
    """Convert each column of *df* that parses cleanly to a numeric dtype.

    Behaviour-preserving replacement for
    ``df.apply(pd.to_numeric, errors='ignore')`` — ``errors='ignore'`` is
    deprecated (and later removed) in pandas. Columns that cannot be parsed
    (e.g. free-text columns) are left untouched. Mutates and returns *df*.
    """
    for column in df.columns:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            pass  # non-numeric column — keep as-is
    return df


# In[5]:

df_summary = pd.read_csv(os.path.join(data_path, summary_stats_filename), sep='\t')
# drop non-numeric stat value 'type' and convert to numeric values
df_summary = _to_numeric_where_possible(df_summary[df_summary['stat'] != 'type'])
df_summary = df_summary.set_index('stat')
df_summary

# In[6]:

print('total work count: {:,}'.format(int(df_summary['reference_count']['count'])))

# In[7]:

DEFAULT_NUMBER_FORMAT = '{:,.0f}'
DEFAULT_PERCENTAGE_FORMAT = '{:,.2f}%'


# plain functions instead of assigned lambdas (PEP 8, E731)
def format_count(c, number_format=DEFAULT_NUMBER_FORMAT):
    """Format a count with thousands separators (no decimals by default)."""
    return number_format.format(c)


def format_percentage(c, number_format=DEFAULT_PERCENTAGE_FORMAT):
    """Format a percentage with two decimals and a trailing '%'."""
    return number_format.format(c)


def apply_columns(df, columns, f):
    """Apply *f* element-wise to each of *columns* in *df* (mutates *df*)."""
    for c in columns:
        df[c] = df[c].apply(f)
    return df


def format_columns(df):
    """Format every column of *df* for display (mutates *df*).

    Columns named ``p_*`` are formatted as percentages, all others as counts.
    """
    for c in df.columns:
        if str(c).startswith('p_'):
            df[c] = df[c].apply(format_percentage)
        else:
            df[c] = df[c].apply(format_count)
    return df


def show_counts(counts, title='', number_format=DEFAULT_NUMBER_FORMAT, **kwargs):
    """Bar-plot *counts* with each bar annotated by its formatted value.

    *counts* is either a Series or a list of ``[label, count]`` pairs;
    extra keyword arguments are forwarded to ``Series.plot``.
    """
    if isinstance(counts, list):
        df = pd.DataFrame(counts, columns=['what', 'count']).set_index('what')['count']
    else:
        df = counts
    ax = df.plot(kind='bar', **kwargs)
    for p in ax.patches:
        ax.annotate(
            format_count(p.get_height(), number_format=number_format),
            xy=(p.get_x() + p.get_width() / 2, p.get_height()),
            xycoords="data", va="bottom", ha="center"
        )
    ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: format_count(y)))
    ax.axes.get_xaxis().set_label_text(title)
    # leave headroom so the bar annotations are not clipped at the top
    ax.margins(y=0.2)


# In[8]:

printmd('### Work counts')
printmd('Out of the *{:,}* works with references in Crossref, **{:.2f}%** are open (*{:,}*).'.format(
    int(df_summary['reference_count']['count_non_zero']),
    100.0 * df_summary['has_references']['count_non_zero'] / df_summary['reference_count']['count_non_zero'],
    int(df_summary['has_references']['count_non_zero'])
))
show_counts([
    ['extracted', df_summary['has_references']['count']],
    ['has ref count', df_summary['reference_count']['count_non_zero']],
    ['oa references', df_summary['has_references']['count_non_zero']]
], 'works')

# In[9]:

printmd('### Reference counts')
printmd('Out of the *{:,}* references in Crossref, **{:.2f}%** are open (*{:,}*).'.format(
    int(df_summary['reference_count']['sum']),
    100.0 * df_summary['num_references']['sum'] / df_summary['reference_count']['sum'],
    int(df_summary['num_references']['sum'])
))
show_counts([
    ['as per Crossref', df_summary['reference_count']['sum']],
    ['oa available', df_summary['num_references']['sum']],
    ['with dois', df_summary['num_references']['sum'] - df_summary['num_citations_without_doi']['sum']],
    ['missing dois', df_summary['num_citations_without_doi']['sum']],
    ['duplicate dois', df_summary['num_duplicate_citation_dois']['sum']]
], 'references')

# In[10]:

printmd('### Reference stats')
df_reference_stat = pd.read_csv(os.path.join(data_path, reference_stats_filename), sep='\t')
# 'examples' holds JSON-encoded pairs; second element is the value without its source
df_reference_stat['examples'] = df_reference_stat['examples'].map(json.loads)
df_reference_stat['examples_without_source'] = df_reference_stat['examples'].map(lambda x: [y[1] for y in x])
df_reference_stat = df_reference_stat.sort_values('count', ascending=False)
df_reference_stat.drop('examples', axis=1).head(10)

# In[11]:

printmd('### Reference stats are aggregated by different \'types\'')
print('\n'.join(sorted(set(df_reference_stat['type'].values))))

# In[12]:

printmd('### Count of referenced work by key combinations (populated fields in reference)')
df_reference_key_combination_stat = (
    df_reference_stat[df_reference_stat['type'] == 'key_combination']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_reference_key_combination_stat['count'].head(10), figsize=(20, 4))

# In[13]:

printmd('### Count of referenced work by year, sorted descending by year')
df_reference_year_stat = (
    df_reference_stat[df_reference_stat['type'] == 'year']
    .drop('type', axis=1)
    .sort_values('key', ascending=False)
    .set_index('key')
)
df_reference_year_stat = df_reference_year_stat[df_reference_year_stat['count'] > 1000]
show_counts(df_reference_year_stat['count'].head(150), figsize=(20, 4))

# In[14]:

set(df_reference_stat['type'])

# In[15]:

printmd('### Count of non-oa work by publisher, sorted descending')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
df_temp_total = (
    df_reference_stat[df_reference_stat['type'] == 'total_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
print((df_temp['count'].head(10) / df_temp_total['count']).dropna())
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# In[16]:

# NOTE(review): the heading says '>10k works' but the filter below keeps
# publishers with count > 100,000 — confirm which threshold is intended.
printmd('### Percentage of non-oa work by publisher (>10k works), sorted descending by percentage')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
df_temp2 = (
    df_reference_stat[df_reference_stat['type'] == 'total_publisher']
    .drop('type', axis=1)
    .set_index('key')
)
df_temp = df_temp[df_temp['count'] > 100000]
df_temp3 = df_temp.copy()
# ratio of non-oa references to the publisher's total
df_temp3['count'] = df_temp3['count'] / df_temp2['count']
print(df_temp3['count'].sort_values(ascending=False).head())
show_counts(
    df_temp3['count'].sort_values(ascending=False).head(10),
    number_format='{:,.5f}',
    figsize=(20, 4)
)

# In[17]:

printmd('### Non-oa works by container title')
df_temp = (
    # 'countainer' is how the key is spelled in the data file — do not "fix"
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_countainer_title']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# In[18]:

printmd('### Non-oa works by first subject area')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_first_subject_area']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# In[19]:

printmd('### Non-oa works by creation date (year)')
df_temp = (
    df_reference_stat[df_reference_stat['type'] == 'non_oa_ref_created']
    .drop('type', axis=1)
    .set_index('key')
)
show_counts(df_temp['count'].head(10), figsize=(20, 4))

# ## More detailed analysis on the summaries by type and publisher

# In[20]:

df_summary_by_type_and_publisher = pd.read_csv(
    os.path.join(data_path, summary_by_type_and_publisher_stats_filename), sep='\t'
).rename(columns={
    'reference_count': 'reference_count_crossref',
    'num_references': 'reference_count_oa'
})
# fill na type and publisher with blank (otherwise causes issues with groupby)
df_summary_by_type_and_publisher[['type', 'publisher']] = (
    df_summary_by_type_and_publisher[['type', 'publisher']].fillna('')
)
# drop non-numeric stat value 'type' and convert to numeric values
df_summary_by_type_and_publisher = _to_numeric_where_possible(
    df_summary_by_type_and_publisher[df_summary_by_type_and_publisher['stat'] != 'type']
)
df_summary_by_type_and_publisher.head(3)

# In[21]:

printmd('### Work counts by type and publisher')
df_summary_by_type_and_publisher_work_counts = df_summary_by_type_and_publisher[
    df_summary_by_type_and_publisher['stat'] == 'count'
].groupby(['type', 'publisher'])['doi'].sum().sort_values(ascending=False).to_frame('work_count')
df_summary_by_type_and_publisher_non_zero_counts = df_summary_by_type_and_publisher[
    df_summary_by_type_and_publisher['stat'] == 'count_non_zero'
].groupby(['type', 'publisher'])[[
    'reference_count_crossref', 'reference_count_oa'
]].sum().rename(columns={
    'reference_count_crossref': 'has_nonzero_reference_count_crossref',
    'reference_count_oa': 'has_nonzero_reference_count_oa'
})
df_summary_by_type_and_publisher_work_counts = df_summary_by_type_and_publisher_work_counts.merge(
    df_summary_by_type_and_publisher_non_zero_counts,
    how='outer', left_index=True, right_index=True
).astype(int)
# sanity check (`.iloc[0]` replaces the deprecated positional `sum()[0]`)
assert int(df_summary_by_type_and_publisher_work_counts.sum().iloc[0]) == int(df_summary['has_references']['count'])
df_summary_by_type_and_publisher_work_counts['p_oa'] = (
    100.0 * df_summary_by_type_and_publisher_work_counts['has_nonzero_reference_count_oa']
    / df_summary_by_type_and_publisher_work_counts['has_nonzero_reference_count_crossref']
)
format_columns(df_summary_by_type_and_publisher_work_counts.sort_values(
    'work_count', ascending=False
).head())

# In[22]:

printmd('### Total work counts')
(
    df_summary_by_type_and_publisher_work_counts.reset_index()
    .drop(['type', 'publisher'], axis=1)
    .sum(axis=0)
    .to_frame('total')
    .applymap(format_count)
)

# In[23]:

printmd('### Work counts by type')
df_temp = (
    df_summary_by_type_and_publisher_work_counts.reset_index().groupby('type')
    .sum().sort_values(by='work_count', ascending=False).head()
)


def show_oa_ref_work_percentages_by_type(df):
    """Print, per work type, the total with references and the open (oa) share."""
    for work_type, has_nonzero_reference_count_crossref, has_nonzero_reference_count_oa in zip(
        df.index,
        df['has_nonzero_reference_count_crossref'],
        df['has_nonzero_reference_count_oa']
    ):
        # skip types with no reference counts to avoid division by zero
        if not has_nonzero_reference_count_crossref:
            continue
        printmd('Out of the *{:,}* works of type *{:}* with references in Crossref, **{:.2f}%** are open (*{:,}*).'.format(
            int(has_nonzero_reference_count_crossref),
            work_type,
            100.0 * has_nonzero_reference_count_oa / has_nonzero_reference_count_crossref,
            int(has_nonzero_reference_count_oa)
        ))


df_temp['p_oa'] = (
    100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
)
show_oa_ref_work_percentages_by_type(df_temp)
format_columns(df_temp)

# In[24]:

printmd('### Journal vs non-journal-article type')


def type_to_journal_non_journal_article(df):
    """Return a copy of *df* with every type other than 'journal-article'
    collapsed into 'non-journal-article'."""
    df = df.copy()
    df['type'] = df['type'].map(lambda x: 'non-journal-article' if x != 'journal-article' else x)
    return df


df_temp = (
    type_to_journal_non_journal_article(
        df_summary_by_type_and_publisher_work_counts.reset_index()
    )
    .groupby('type')
    .sum().sort_values(by='work_count', ascending=False).head()
)
df_temp['p_oa'] = 100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
show_oa_ref_work_percentages_by_type(df_temp)
format_columns(df_temp)

# In[25]:

printmd('### By publisher')


def select_top_publishers(df, count=5):
    """Keep only rows belonging to the *count* publishers with the most works."""
    top_publishers = (
        df.groupby('publisher').sum()
        .sort_values(by='work_count', ascending=False)
        .index.values
    )
    return df[df['publisher'].isin(top_publishers[:count])]


df_temp = (
    type_to_journal_non_journal_article(
        select_top_publishers(df_summary_by_type_and_publisher_work_counts.reset_index())
    )
    .groupby(['type', 'publisher'])
    .sum().sort_values(by='work_count', ascending=False)
)
df_temp['p_oa'] = (
    100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
)
format_columns(df_temp)

# In[26]:

printmd('### By normalised publisher')


def normalise_publisher(df):
    """Merge the various Elsevier imprints into a single 'Elsevier (all)' publisher."""
    elsevier_names = [
        s for s in df['publisher'].unique()
        if not pd.isnull(s) and 'elsevier' in s.lower()
    ]
    return df.replace({
        'publisher': {s: 'Elsevier (all)' for s in elsevier_names}
    })


df_temp = (
    select_top_publishers(normalise_publisher(type_to_journal_non_journal_article(
        df_summary_by_type_and_publisher_work_counts.reset_index()
    )))
    .groupby(['type', 'publisher'])
    .sum().sort_values(by='work_count', ascending=False)
)
df_temp['p_oa'] = (
    100.0 * df_temp['has_nonzero_reference_count_oa'] / df_temp['has_nonzero_reference_count_crossref']
)
format_columns(df_temp)

# In[27]:

printmd('### Sanity check, the grouped summary should equal the ungrouped summary')
df_summary_by_type_and_publisher_sums = df_summary_by_type_and_publisher[
    df_summary_by_type_and_publisher['stat'] == 'sum'
]
total_reference_count_according_to_crossref = df_summary_by_type_and_publisher_sums['reference_count_crossref'].sum()
assert total_reference_count_according_to_crossref == df_summary['reference_count']['sum']
(
    df_summary_by_type_and_publisher_sums[['reference_count_crossref', 'reference_count_oa']]
    .sum()
    .map(format_count)
    .to_frame('sum')
)

# In[28]:

printmd('### Total references by type')
df_temp = (
    df_summary_by_type_and_publisher_sums.groupby('type')
    [['reference_count_crossref', 'reference_count_oa']]
    .sum()
    .sort_values('reference_count_crossref', ascending=False)
    .head()
)
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)

# In[29]:

printmd('### Publishers with the highest amount of references')
df_temp = (
    df_summary_by_type_and_publisher_sums.groupby('publisher')
    .sum()[['reference_count_crossref', 'reference_count_oa']]
    .sort_values(by='reference_count_crossref', ascending=False)
).head()
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)

# In[30]:

# typo fix in displayed heading: 'mutliple' -> 'multiple'
printmd('### Elsevier uses multiple names, treat them as one')
elsevier_names = [
    s for s in df_summary_by_type_and_publisher_sums['publisher'].unique()
    if not pd.isnull(s) and 'elsevier' in s.lower()
]
printmd('* %s' % '\n* '.join(elsevier_names))
# reuse normalise_publisher (In[26]) instead of duplicating the replace
# logic — it derives the same Elsevier name list internally
df_summary_by_type_and_publisher_sums_merged_publisher = normalise_publisher(
    df_summary_by_type_and_publisher_sums
)

# In[31]:

printmd('### Publishers with the highest amount of references - this time with merged publisher names')
df_temp = (
    df_summary_by_type_and_publisher_sums_merged_publisher.groupby('publisher')
    .sum()[['reference_count_crossref', 'reference_count_oa']]
    .sort_values(by='reference_count_crossref', ascending=False)
).head()
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)

# In[32]:

printmd('### Publishers with the highest amount of references (by type) - this time with merged publisher names')
df_temp = (
    df_summary_by_type_and_publisher_sums_merged_publisher.groupby(['type', 'publisher'])
    .sum()[['reference_count_crossref', 'reference_count_oa']]
    .sort_values(by='reference_count_crossref', ascending=False)
).head()
df_temp['p_oa'] = (
    100.0 * df_temp['reference_count_oa'] / df_temp['reference_count_crossref']
)
format_columns(df_temp)