#!/usr/bin/env python
# coding: utf-8

# # Visualizing the topic and accessibility of scholarly articles cited in Wikipedia
#
# Building on a dataset we previously released of [citations with identifiers](https://doi.org/10.6084/m9.figshare.1299540.v10) across all Wikipedia language editions, we explore the distribution of [DOIs](https://en.wikipedia.org/wiki/Digital_object_identifier) cited in Wikipedia by *topic* and *accessibility*.
#
# ## Topic
# We assign a *topic* to each publication by looking at the main topic(s) of the Wikipedia article that cites it. Topics are determined by matching each article with its [WikiProject](https://en.wikipedia.org/wiki/Wikipedia:WikiProject), and assigning the corresponding top-level topic according to the [WikiProject hierarchy](https://figshare.com/articles/WikiProjects_Machine_Readable_Dataset/5503819/1). This is done using the [drafttopic tool](https://github.com/wiki-ai/drafttopic) from the Wikimedia Foundation's Scoring Platform team.
#
# ## Accessibility
# We determine the *accessibility* of each publication by looking up the DOI in data provided by [Unpaywall](http://unpaywall.org/data-format). The DOI is marked as:
# * *Open*: if the canonical version is open access at the source (journal);
# * *Available*: if the canonical version at the source is behind a paywall, but an open access copy is available at a different location;
# * *Closed*: if the canonical version at the source is behind a paywall and no open access copy was identified at a different location.
#
# Note that this analysis only takes into account the openness of the canonical version of a scholarly paper citation as identified by a DOI. Citation templates used in Wikipedia articles often complement a DOI with a link to an accessible version, when one has been identified.
# ## Dataset documentation # Further documentation on the format of the data can be found in the parent dataset: [https://doi.org/10.6084/m9.figshare.1299540.v10](https://doi.org/10.6084/m9.figshare.1299540.v10) # In[3]: ''' import useful libraries ''' import pandas as pd import numpy as np from bokeh.models import ColumnDataSource, LabelSet, HoverTool,Range1d, NumeralTickFormatter from bokeh.plotting import figure, show, output_file from bokeh.io import output_notebook from bokeh.transform import stack import math import operator import sys import warnings if not sys.warnoptions: warnings.simplefilter("ignore") ''' use input file provided or substitute with your own path ''' inputfile='data/all_data_forplot.tsv' # ### We define the three functions to discover relations between publication topic, language and accessibility # * Breakdown of total number of publications and percentage of open/available publications by language, for a specific topic or for all topics # * Breakdown of total number of publications and percentage of open/available publications by topic, for a specific language or for all languages # * Breakdown of publications by type and topic, for all topics # # In[4]: def generate_open_language_plot(dataframe,open_type,topic): ''' takes as input: dataframe - the pandas data frame containing the data open_type - 'open' if you want to visualize completely open access statistics; 'avaliable if you want to visualize statistics about publications having free copies available; topic - a string corresponding to one of the topics, or 'all' if you want to have a complete overview across topics. 
Choose between: 'Africa', 'Americas', 'Article improvement and grading', 'Arts', 'Biology', 'Bodies of water', 'Broadcasting', 'Business and economics', 'Chemistry', 'Cities', 'Contents systems','Countries', 'Crafts and hobbies', 'Economics', 'Education','Entertainment', 'Europe', 'Files', 'Food and drink', 'Geosciences','History and society', 'Information science', 'Internet culture', 'Landforms', 'Language and literature', 'Maintenance', 'Maps', 'Mathematics', 'Media', 'Medicine', 'Meteorology', 'Military and warfare', 'Music', 'Performing arts','Philosophy and religion', 'Physics', 'Plastic arts','Politics and government', 'Science', 'Space', 'Sports','Technology', 'Time', 'Transportation', 'all' ''' if open_type=='open': text='open access' elif open_type=='available': text='available open access' else: print('wrong accessibility type') return if topic == 'all': topic= 'all_topics' TITLE = "Percentage of "+text+" publications for all topics" else: TITLE = "Percentage of "+text+" publications for topic "+topic #load the data for one specific topic source=ColumnDataSource(dataframe.loc[dataframe['topic'] == topic]) #prepare interaction tools tools = "pan,wheel_zoom,box_zoom,reset,save".split(',') hover = HoverTool(tooltips=[ ("language", "@wiki"), ("total scholarly publications:", "@{total}{0}"), ("% open publications", "@{open}{0.00%}"), ("% open access available publications", "@{available}{0.00%}")]) tools.append(hover) #prepare the plot figure, depending on the quantity of data, go for log scale or linear scale if max(dataframe.loc[dataframe['topic'] == topic]['total']>500): p = figure(tools=tools, toolbar_location="above", logo="grey", plot_width=800, plot_height=600, title=TITLE,y_axis_type="log") else: p = figure(tools=tools, toolbar_location="above", logo="grey", plot_width=800, plot_height=600, title=TITLE) #prepare plot background, axes labels and line colors p.background_fill_color = "#ffffff" #change if you don't want white background 
p.xaxis.axis_label = "percentage of "+text+" publications" p.yaxis.axis_label = "total number of scholarly publications" p.grid.grid_line_color = "gray" #choose format for axes p.yaxis[0].formatter = NumeralTickFormatter(format="0") p.xaxis[0].formatter = NumeralTickFormatter(format="0.00%") #draw the circles; change colors here p.circle(open_type, "total", size=10, source=source, line_color="#005693", line_width=1, line_alpha=0.7, fill_alpha=0.5, fill_color="#23a3ff") labels = LabelSet(x=open_type, y="total", text="wiki",y_offset=8, text_font_size="8pt", text_color="#555555", source=source, text_align='center') p.add_layout(labels) #draws the plot output_notebook() show(p) def generate_open_topic_plot(df,topics, open_type,lan): ''' takes as input: df - the pandas data frame containing the data open_type - 'open' if you want to visualize completely open access statistics; 'avaliable if you want to visualize statistics about publications having free copies available; lan - a string corresponding to one of the languages for which we have data, or 'all' if you want to have a complete overview across all languages. 
Choose between: 'ace', 'af', 'ak', 'als', 'am', 'an', 'ang', 'ar', 'arz', 'as', 'ast', 'av', 'az', 'azb', 'ba', 'bar', 'bat_smg', 'bcl', 'be', 'be_x_old', 'bg', 'bh', 'bjn', 'bn', 'bo', 'bpy', 'br', 'bs', 'bxr', 'cbk_zam', 'cdo', 'ce', 'ceb', 'chr', 'ckb', 'co', 'cs', 'csb', 'cu', 'cv', 'cy', 'da', 'de', 'diq', 'dsb', 'dty', 'dv', 'ee', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'fi', 'fo', 'fr', 'frr', 'fy', 'ga', 'gag', 'gan', 'gd', 'gl', 'gn', 'gom', 'gu', 'gv', 'ha', 'hak', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ie', 'ig', 'ilo', 'io', 'is', 'it', 'ja', 'jam', 'jbo', 'jv', 'ka', 'kab', 'kk', 'km', 'kn', 'ko', 'koi', 'krc', 'ku', 'kv', 'ky', 'la', 'lad', 'lb', 'lez', 'lg', 'li', 'lij', 'lmo', 'ln', 'lo', 'lrc', 'lt', 'lv', 'mai', 'map_bms', 'mdf', 'mg', 'mhr', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl', 'my', 'myv', 'mzn', 'na', 'nah', 'nds', 'nds_nl', 'ne', 'new', 'nl', 'nn', 'no', 'nov', 'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pa', 'pam', 'pap', 'pcd', 'pfl', 'pi', 'pih', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'rm', 'ro', 'roa_tara', 'ru', 'rue', 'rw', 'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'se', 'sh', 'si', 'simple', 'sk', 'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'stq', 'su', 'sv', 'sw', 'szl', 'ta', 'tcy', 'te', 'test', 'test2', 'tet', 'tg', 'th', 'ti', 'tl', 'tn', 'tr', 'ts', 'tt', 'ug', 'uk', 'ur', 'uz', 'vec', 'vep', 'vi', 'vls', 'vo', 'war', 'wo', 'wuu', 'xh', 'xmf', 'yi', 'yo', 'za', 'zh', 'zh_classical', 'zh_min_nan', 'zh_yue','all' ''' #we are now going to generate a new dataframe gathering for each topic either the values for one language, #or the average/sum of values across languages dic={} counts={} if open_type=='open': text='open access' elif open_type=='available': text='available open access' else: print('wrong accessibility type') return if lan=='all': for topic in topics: if topic=='all_topics': continue 
dic[topic]=np.mean(df.loc[df['topic']==topic].loc[df['wiki']=='all_languages'][open_type]) counts[topic]=np.sum(df.loc[df['topic']==topic].loc[df['wiki']=='all_languages']['total']) TITLE = "Percentage of "+text+" publications by topic for all languages" else: for topic in topics: if topic=='all_topics': continue dic[topic]=np.mean(df.loc[df['topic']==topic].loc[df['wiki']==lan][open_type]) counts[topic]=np.sum(df.loc[df['topic']==topic].loc[df['wiki']==lan]['total']) TITLE = "Percentage of "+text+" publications for "+lan+".Wikipedia" source = pd.DataFrame(data={'topics':dic.keys(), 'counts':counts.values(), 'perc':dic.values()}) #prepare interaction tools tools = "pan,wheel_zoom,box_zoom,reset,save".split(',') hover = HoverTool(tooltips=[ ("topic", "@topics"), ("total scholarly publications:", "@{counts}{0}"), ("% "+text+" publications", "@{perc}{0.00%}")]) tools.append(hover) #prepare the plot figure, depending on the quantity of data, go for log scale or linear scale if max(source['counts']>200): p = figure(tools=tools, toolbar_location="above", logo="grey", plot_width=800, plot_height=600, title=TITLE,y_axis_type="log") else: p = figure(tools=tools, toolbar_location="above", logo="grey", plot_width=800, plot_height=600, title=TITLE) p.background_fill_color = "#ffffff" #change axes labels according to whether we analyze one language or all languages, prepare axes if lan=='all': p.xaxis.axis_label = "average percentage of "+text+" publications across languages" p.yaxis.axis_label = "sum of all scholarly publications across languages" else: p.xaxis.axis_label = "percentage of "+text+" publications" p.yaxis.axis_label = "total number of scholarly publications" p.x_range=Range1d(0,1.1) p.yaxis[0].formatter = NumeralTickFormatter(format="0") p.xaxis[0].formatter = NumeralTickFormatter(format="0.00%") p.grid.grid_line_color = "gray" #draw the circles; change colors here source = ColumnDataSource(source) p.circle("perc", "counts", size=10, source=source, 
line_color="#8B0A50", line_width=1,line_alpha=0.7, fill_alpha=0.5, fill_color="#cd1076") labels = LabelSet(x="perc", y="counts", text="topics",y_offset=8, text_font_size="8pt", text_color="#555555", source=source, text_align='center') p.add_layout(labels) #draw plot output_notebook() show(p) def generate_comparison_plot(dataframe,topics): ''' takes as input: dataframe - the pandas data frame containing the data ''' TITLE = "Percentage of open, available, and closed publications for all topics" header=['open','available','closed'] #prepare interaction tools tools = "pan,wheel_zoom,box_zoom,reset,save".split(',') hover = HoverTool(tooltips=[ ("topic", "@topic"), ("total scholarly publications:", "@{total}{0}"), ("% open publications", "@{open}{0.00%}"), ("% OA available publications", "@{available}{0.00%}"), ("% paywalled publications", "@{closed}{0.00%}")]) tools.append(hover) dic=df.loc[df['wiki']=='all_languages'] #print dic p = figure(y_range=topics,tools=tools, toolbar_location="above", logo="grey", plot_width=800, title=TITLE) #p.background_fill_color = "#ffffff" #change if you don't want white background #choose format for axes source = ColumnDataSource(dic) p.hbar_stack(header,y='topic',height=0.5,source=source, color=['#5F9E99','#ffb06e','#e72d66']) #p.hbar(right=stack(),left=stack('open'),y='topic',height=0.2,color='blue',source=source,name='open') #p.hbar(right=stack('open'),left=stack('open', 'available'), y='topic', height=0.2, color='red',source=source,name='available') p.legend.visible=True #draws the plot output_notebook() show(p) # We now read the input data and store set of languages and topics available (for future usage) # # In[11]: df = pd.DataFrame.from_csv(inputfile,sep='\t', index_col=None) wikis = list(np.sort(list(set(df['wiki'].tolist())))) topics =list(set(df['topic'].tolist())) topics=np.delete(topics,[topics.index('all_topics'),topics.index('Article improvement and grading')]) topics= np.sort(topics)[::-1] # We now generate the 
distribution of languages over the real accessibility ('open') and potential accessibility ('available') of their publications, for all topics # In[12]: generate_open_language_plot(df,'open','all') generate_open_language_plot(df,'available','all') # We can also look at the distribution of all topics over the accessibility of their publications, for all languages # # In[13]: generate_open_topic_plot(df,topics,'open','all') generate_open_topic_plot(df,topics,'available','all') # We can also look at the three levels of 'openness' together, for all topics # In[14]: generate_comparison_plot(df,topics) # We see from the language distribution plot above that among the languages with highest number of publications, "Belarussian" Wikipedia is very open; by contrast, we see that "Khmer" wikipedia is less open; let's see their breakdown by topic: # # In[17]: generate_open_topic_plot(df,topics,'open','bewiki') generate_open_topic_plot(df,topics,'open','kmwiki') # We see from the topic distribution plot above that "Space" is the most open topic based on scholarly articles cited, while, for example "Chemistry" has fewer open publications; let's see their breakdown by language: # In[23]: generate_open_language_plot(df,'open','Space') generate_open_language_plot(df,'open','Chemistry')