#!/usr/bin/env python
# coding: utf-8

# ## Analyze access color trends using Unpaywall data
#
# Reads the per-record access table, assigns every record a single access
# category, tabulates category counts per journal year, writes that table to
# disk and draws two stacked bar charts (absolute counts and proportions).

# In[1]:

import collections

import numpy
import pandas
import plotnine

import utils


# In[2]:

access_df = pandas.read_csv(
    'data/02.unpaywall-access.tsv.xz',
    sep='\t',
    # nrows=100_000,  # uncomment to work on a subset of rows
)
# Keep only the Crossref types returned by utils.get_crossref_types().
# The explicit .copy() detaches the filtered frame from the original so the
# column assignments below do not trigger SettingWithCopyWarning.
access_df = access_df.query("crossref_type in @utils.get_crossref_types()").copy()
# Year = first four characters of journal_date. Converting via float before
# the nullable Int64 dtype lets missing dates come through as <NA>.
access_df['journal_year'] = (
    access_df.journal_date.str.slice(stop=4).astype('float').astype('Int64')
)
access_df.head(2)


# In[3]:

# Every record starts as 'closed'; each later assignment overwrites the
# earlier ones, so the effective precedence is
# gold > hybrid/bronze > green > closed.
# NOTE(review): astype(bool) turns NaN into True -- confirm that these three
# flag columns never contain missing values.
access_df['journal_access_category'] = 'closed'
access_df.loc[access_df.unpaywall_access.astype(bool), 'journal_access_category'] = 'green'
access_df.loc[access_df.journal_access.astype(bool), 'journal_access_category'] = 'hybrid/bronze'
access_df.loc[access_df.journal_fully_oa.astype(bool), 'journal_access_category'] = 'gold'
access_df.head()


# In[4]:

access_df.journal_access_category.value_counts()


# In[5]:

access_df.crossref_type.value_counts()


# In[6]:

def summarize(df):
    """Count the rows of *df* in each access category, plus the row total.

    The categories reported are those obtained by iterating
    ``utils.access_categories_colors``; a category that does not occur in
    ``df.journal_access_category`` is reported as 0.

    Returns a pandas.Series indexed by the category names (in iteration
    order) followed by ``'total'``, which holds ``len(df)``.
    """
    counts = collections.Counter(df.journal_access_category)
    row = {category: counts[category] for category in utils.access_categories_colors}
    row['total'] = len(df)
    return pandas.Series(row)


# Select the one column summarize() reads, so the grouping column is not
# passed into apply (deprecated behaviour in recent pandas releases).
year_df = (
    access_df
    .groupby('journal_year')[['journal_access_category']]
    .apply(summarize)
    .reset_index()
)
year_df['journal_year'] = year_df['journal_year'].astype(int)
year_df = year_df.query("1950 <= journal_year <= 2018")
year_df.tail(10)


# In[7]:

year_df.to_csv('data/05.unpaywall-colors-by-year.tsv', sep='\t', index=False)


# In[9]:

# Long format: one row per (journal_year, access_category) with its count.
year_melt_df = year_df.melt(
    id_vars=['journal_year', 'total'],
    var_name='access_category',
    value_name='count',
)
# Category order is the reverse of utils.access_categories_colors. A concrete
# list is passed because a bare reversed() iterator can be consumed only once.
year_melt_df['access_category'] = pandas.Categorical(
    year_melt_df['access_category'],
    categories=list(utils.access_categories_colors)[::-1],
)
year_melt_df.head(2)


# In[10]:

# Stacked bars of absolute record counts per year, filled by access category.
gg_counts = (
    plotnine.ggplot(
        mapping=plotnine.aes(x='journal_year', y='count', fill='access_category'),
        data=year_melt_df,
    )
    + plotnine.geom_bar(stat='identity', size=0, color=None, width=1)
    + plotnine.scale_x_continuous(breaks=numpy.arange(1960, 2011, 10), expand=(0, 0), name='')
    # Thousands separators on the count axis.
    + plotnine.scale_y_continuous(
        labels=lambda array: [f'{x:,.0f}' for x in array],
        name='',
        expand=(0, 0),
    )
    + plotnine.scale_fill_manual(values=utils.access_categories_colors)
    + plotnine.theme_bw()
    + plotnine.theme(legend_position=(0.3, 0.7), legend_title=plotnine.element_blank())
)
gg_counts


# In[12]:

# Same data with position='fill': each year's bar shows category proportions.
gg_percents = (
    plotnine.ggplot(
        mapping=plotnine.aes(x='journal_year', y='count', fill='access_category'),
        data=year_melt_df,
    )
    + plotnine.geom_bar(stat='identity', position='fill', size=0, color=None, width=1)
    + plotnine.scale_fill_manual(values=utils.access_categories_colors)
    + plotnine.scale_x_continuous(breaks=numpy.arange(1960, 2011, 10), expand=(0, 0), name='')
    # Axis ticks at 10%, 30%, ..., 90%, labelled as percentages.
    + plotnine.scale_y_continuous(
        breaks=numpy.arange(0.1, 1, 0.2),
        labels=lambda array: [f'{x:.0%}' for x in array],
        expand=(0, 0),
        name='',
    )
    + plotnine.theme_bw()
    + plotnine.theme(legend_position=(0.3, 0.7), legend_title=plotnine.element_blank())
)
gg_percents