Geographic trends in academic conferences

Generic data preparation

The dataset, as extracted by our query from GraphDB, needs some minor patching to fix a few issues related to CSV parsing. (The commands below are commented out because they only need to run once, the first time.)

In [1]:
# One-off repair of the raw export, kept commented out on purpose.
# The first sed rewrites every `View\"` to `View`; the second rewrites
# `“Eduardo R. Caianiello\"` to `“Eduardo R. Caianiello”`. The patched file
# then replaces ./data/query-result.tsv.
# !cat ./data/query-result.tsv | sed 's/View\\"/View/g' > ./data/fixed.tsv
# !cat ./data/fixed.tsv | sed 's/“Eduardo R. Caianiello\\"/“Eduardo R. Caianiello”/g' > ./data/fixed2.tsv
# !rm ./data/fixed.tsv
# !mv ./data/fixed2.tsv ./data/query-result.tsv
In [2]:
import collections
import numpy as np
from hashlib import md5

import pandas as pd
from pandas import DataFrame, read_csv
import cufflinks as cf

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.io as pio

import matplotlib.pyplot as plt
import scipy
import scipy.stats
from scipy.optimize import leastsq
from scipy.stats import spearmanr
from scipy.stats import pearsonr

from sklearn import preprocessing

import pycountry_convert

# Initialise plotly's notebook mode before any figure is drawn.
# (Alternative spelling of the same call: plotly.offline.init_notebook_mode().)
init_notebook_mode(connected=True)

# Show every column of wide frames, but cap the text shown per cell at 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)
/Users/andrea/.virtualenvs/skm/lib/python3.6/site-packages/plotly/graph_objs/_deprecations.py:558: DeprecationWarning:

plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis


/Users/andrea/.virtualenvs/skm/lib/python3.6/site-packages/plotly/graph_objs/_deprecations.py:531: DeprecationWarning:

plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis


In [3]:
# Load the GraphDB export. Every column is read as a plain string (dtype=object)
# and the few typed columns are converted explicitly below.
df = read_csv('./data/query-result.tsv', delimiter='\t', dtype=object)

# SPARQL variable names (with their leading '?') -> column names used in this notebook.
newcols = {
    '?paper_doi': 'paper_doi',
    '?conf_id': 'conf_id',
    '?conf_seriesId': 'conf_seriesId',
    '?conf_acronym': 'conf_acronym',
    '?conf_name': 'conf_name',
    '?conf_country': 'conf_country',
    '?conf_city': 'conf_city',
    '?author_order': 'author_order',
    '?author_givenName': 'author_givenName',
    '?author_familyName': 'author_familyName',
    '?org_gridId': 'org_gridId',
    '?org_name': 'org_name',
    '?org_city': 'org_city',
    '?org_country': 'org_country',
    '?org_countrycode': 'org_countrycode',
    '?affiliationString': 'affiliation_string',
    '?paper_title': 'paper_title',
    '?book_doi': 'book_doi',
    '?book_title': 'book_title',
    '?book_subtitle': 'book_confSubtitle',
    '?rights': 'rights',
    '?year': 'year'
}
df = df.rename(columns=newcols)

# Strip the XSD datatype suffix that follows the year literal.
# regex=True is explicit because the pattern uses regex escapes (\^) and
# recent pandas versions treat the pattern as a literal string by default,
# which would leave the suffix in place and break the date parsing below.
df['year'] = df['year'].str.replace(r'\^\^<http://www.w3.org/2001/XMLSchema#gYear>', '', regex=True)
df['year'] = pd.to_datetime(df['year'], format="%Y")
df['author_order'] = pd.to_numeric(df['author_order'])
df.head()
Out[3]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year
0 10.1007/11678564_14 a462e5448db3b48f8ab0b392a20e4516 bpm BPM International Conference on Business Process M... Nancy France 1 Malu Castellanos grid.418547.b Hewlett-Packard (United States) Palo Alto United States US Hewlett-Packard Labs Preface (BPI 2005) 10.1007/11678564 Business Process Management Workshops BPM 2005 International Workshops, BPI, BPD, EN... Restricted 2006-01-01
1 10.1007/11678564_14 a462e5448db3b48f8ab0b392a20e4516 bpm BPM International Conference on Business Process M... Nancy France 2 Ton Weijters NaN NaN NaN NaN NaN U. of Eindhoven Preface (BPI 2005) 10.1007/11678564 Business Process Management Workshops BPM 2005 International Workshops, BPI, BPD, EN... Restricted 2006-01-01
2 10.1007/11678564_16 a462e5448db3b48f8ab0b392a20e4516 bpm BPM International Conference on Business Process M... Nancy France 1 Linh Thao Ly grid.6582.9 University of Ulm Ulm Germany DE University of Ulm Mining Staff Assignment Rules from Event-Based... 10.1007/11678564 Business Process Management Workshops BPM 2005 International Workshops, BPI, BPD, EN... Restricted 2006-01-01
3 10.1007/11678564_16 a462e5448db3b48f8ab0b392a20e4516 bpm BPM International Conference on Business Process M... Nancy France 2 Stefanie Rinderle grid.6582.9 University of Ulm Ulm Germany DE University of Ulm Mining Staff Assignment Rules from Event-Based... 10.1007/11678564 Business Process Management Workshops BPM 2005 International Workshops, BPI, BPD, EN... Restricted 2006-01-01
4 10.1007/11678564_16 a462e5448db3b48f8ab0b392a20e4516 bpm BPM International Conference on Business Process M... Nancy France 3 Peter Dadam grid.6582.9 University of Ulm Ulm Germany DE University of Ulm Mining Staff Assignment Rules from Event-Based... 10.1007/11678564 Business Process Management Workshops BPM 2005 International Workshops, BPI, BPD, EN... Restricted 2006-01-01

The following is a simple check in order to see if format problems are solved.

In [4]:
# Disabled sanity check: for every column except 'year', print the rows whose
# value still contains the string "w3.org" (missing values count as no match).
# for col in (df.columns):
#     if col != 'year':
#         print(df[df[col].str.contains("w3.org", na = False)])

Let's explore all contributions for a couple of papers

In [5]:
# One row per author contribution of the selected paper.
df.loc[df['paper_title'] == 'NIF Combinator: Combining NLP Tool Output']
Out[5]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year
583487 10.1007/978-3-642-33876-2_44 80f10d491a601c62fd8f2d6822a910d4 ekaw EKAW International Conference on Knowledge Engineer... Galway Ireland 1 Sebastian Hellmann grid.9647.c Leipzig University Leipzig Germany DE Universität Leipzig NIF Combinator: Combining NLP Tool Output 10.1007/978-3-642-33876-2 Knowledge Engineering and Knowledge Management 18th International Conference, EKAW 2012, Galw... Restricted 2012-01-01
583488 10.1007/978-3-642-33876-2_44 80f10d491a601c62fd8f2d6822a910d4 ekaw EKAW International Conference on Knowledge Engineer... Galway Ireland 2 Jens Lehmann grid.9647.c Leipzig University Leipzig Germany DE Universität Leipzig NIF Combinator: Combining NLP Tool Output 10.1007/978-3-642-33876-2 Knowledge Engineering and Knowledge Management 18th International Conference, EKAW 2012, Galw... Restricted 2012-01-01
583489 10.1007/978-3-642-33876-2_44 80f10d491a601c62fd8f2d6822a910d4 ekaw EKAW International Conference on Knowledge Engineer... Galway Ireland 3 Sören Auer grid.6810.f Chemnitz University of Technology Chemnitz Germany DE Technische Universität Chemnitz NIF Combinator: Combining NLP Tool Output 10.1007/978-3-642-33876-2 Knowledge Engineering and Knowledge Management 18th International Conference, EKAW 2012, Galw... Restricted 2012-01-01
583490 10.1007/978-3-642-33876-2_44 80f10d491a601c62fd8f2d6822a910d4 ekaw EKAW International Conference on Knowledge Engineer... Galway Ireland 4 Marcus Nitzschke grid.9647.c Leipzig University Leipzig Germany DE Universität Leipzig NIF Combinator: Combining NLP Tool Output 10.1007/978-3-642-33876-2 Knowledge Engineering and Knowledge Management 18th International Conference, EKAW 2012, Galw... Restricted 2012-01-01
In [6]:
# Same inspection, this time selecting the paper through its DOI.
df.loc[df['paper_doi'] == '10.1007/11926078_68']
Out[6]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year
1760120 10.1007/11926078_68 b02ea6fdadf64adbc6d95ddb5dca530b semweb ISWC International Semantic Web Conference Athens, GA USA 1 Markus Krötzsch grid.7892.4 Karlsruhe Institute of Technology Karlsruhe Germany DE AIFB, Universität Karlsruhe Semantic MediaWiki 10.1007/11926078 The Semantic Web - ISWC 2006 5th International Semantic Web Conference, ISW... OpenAccess 2006-01-01
1760121 10.1007/11926078_68 b02ea6fdadf64adbc6d95ddb5dca530b semweb ISWC International Semantic Web Conference Athens, GA USA 2 Denny Vrandečić grid.7892.4 Karlsruhe Institute of Technology Karlsruhe Germany DE AIFB, Universität Karlsruhe Semantic MediaWiki 10.1007/11926078 The Semantic Web - ISWC 2006 5th International Semantic Web Conference, ISW... OpenAccess 2006-01-01
1760122 10.1007/11926078_68 b02ea6fdadf64adbc6d95ddb5dca530b semweb ISWC International Semantic Web Conference Athens, GA USA 3 Max Völkel NaN NaN NaN NaN NaN FZI Karlsruhe Semantic MediaWiki 10.1007/11926078 The Semantic Web - ISWC 2006 5th International Semantic Web Conference, ISW... OpenAccess 2006-01-01
In [7]:
# Summary statistics for every column (include='all'), not only the numeric ones.
df.describe(include='all')
Out[7]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year
count 1770091 999822 999822 999822 999822 999822 999822 1.770091e+06 1767514 1770091 1356666 1356666 1356666 1356666 1356632 1737513 1770091 1770091 1770091 1770091 1503426 1770091
unique 506049 1028 977 1074 1213 1244 103 NaN 165052 247568 14802 14623 4203 160 159 191164 503994 12117 7130 11806 2 22
top 10.1007/978-3-540-49676-2_9 7a0232d303c8198116d174007f5faca1 hci MICCAI International Conference on Medical Image Comp... Beijing China NaN M. Wang grid.9227.e Chinese Academy of Sciences Beijing China CN Chinese Academy of Sciences Introduction 10.1007/978-3-540-36841-0 World Congress on Medical Physics and Biomedic... Volume 1 Restricted 2015-01-01 00:00:00
freq 240 27468 38971 21475 21475 26828 129061 NaN 20176 25630 10536 10536 54835 193013 193013 9222 409 5240 12404 7924 1428271 149612
first NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996-01-01 00:00:00
last NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2017-01-01 00:00:00
mean NaN NaN NaN NaN NaN NaN NaN 2.647722e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN 2.638346e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN 2.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN 3.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN 1.390000e+02 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Alas, there are non-trivial problems with the affiliations (see the check below). We need to fix them!

In [8]:
# Rows whose affiliation string says Oxford but whose matched organisation is in Israel.
df.loc[df['affiliation_string'].eq('University of Oxford') & df['org_countrycode'].eq('IL')]
Out[8]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year
1393283 10.1007/0-387-25515-X_15 NaN NaN NaN NaN NaN NaN 1 Derek J. Blake grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Glycosylation Defects and Muscular Dystrophy 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393284 10.1007/0-387-25515-X_15 NaN NaN NaN NaN NaN NaN 2 Christopher T. Esapa grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Glycosylation Defects and Muscular Dystrophy 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393297 10.1007/0-387-25515-X_18 NaN NaN NaN NaN NaN NaN 1 Simon A. Fry grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Differential Glycosylation of Gelatinase B fro... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393299 10.1007/0-387-25515-X_18 NaN NaN NaN NaN NaN NaN 3 Louise Royle grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Differential Glycosylation of Gelatinase B fro... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393300 10.1007/0-387-25515-X_18 NaN NaN NaN NaN NaN NaN 4 Mark R. Wormald grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Differential Glycosylation of Gelatinase B fro... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393303 10.1007/0-387-25515-X_18 NaN NaN NaN NaN NaN NaN 7 Pauline M. Rudd grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Differential Glycosylation of Gelatinase B fro... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393304 10.1007/0-387-25515-X_18 NaN NaN NaN NaN NaN NaN 8 Raymond A. Dwek grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Differential Glycosylation of Gelatinase B fro... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393305 10.1007/0-387-25515-X_21 NaN NaN NaN NaN NaN NaN 1 Frances M. Platt grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford New Developments in Treating Glycosphingolipid... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393306 10.1007/0-387-25515-X_21 NaN NaN NaN NaN NaN NaN 2 Mylvaganam Jeyakumar grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford New Developments in Treating Glycosphingolipid... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393307 10.1007/0-387-25515-X_21 NaN NaN NaN NaN NaN NaN 3 Ulrika Andersson grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford New Developments in Treating Glycosphingolipid... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393308 10.1007/0-387-25515-X_21 NaN NaN NaN NaN NaN NaN 4 Raymond A. Dwek grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford New Developments in Treating Glycosphingolipid... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393309 10.1007/0-387-25515-X_21 NaN NaN NaN NaN NaN NaN 5 Terry D. Butters grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford New Developments in Treating Glycosphingolipid... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393319 10.1007/0-387-25515-X_11 NaN NaN NaN NaN NaN NaN 1 Alan J. Wright grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Hyaluronan in Immune Processes 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393320 10.1007/0-387-25515-X_11 NaN NaN NaN NaN NaN NaN 2 Anthony J. Day grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Hyaluronan in Immune Processes 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393321 10.1007/0-387-25515-X_9 NaN NaN NaN NaN NaN NaN 1 James N. Arnold grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Human Immunoglobulin Glycosylation and the Lec... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393325 10.1007/0-387-25515-X_9 NaN NaN NaN NaN NaN NaN 5 Robert B. Sim grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Human Immunoglobulin Glycosylation and the Lec... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393352 10.1007/0-387-25515-X_2 NaN NaN NaN NaN NaN NaN 1 D. Pavlovic grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Long Alkylchain Iminosugars Block the HCV p7 I... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393353 10.1007/0-387-25515-X_2 NaN NaN NaN NaN NaN NaN 2 W. Fischer grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Long Alkylchain Iminosugars Block the HCV p7 I... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393354 10.1007/0-387-25515-X_2 NaN NaN NaN NaN NaN NaN 3 M. Hussey grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Long Alkylchain Iminosugars Block the HCV p7 I... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393358 10.1007/0-387-25515-X_2 NaN NaN NaN NaN NaN NaN 7 S. Woodhouse grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Long Alkylchain Iminosugars Block the HCV p7 I... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393359 10.1007/0-387-25515-X_2 NaN NaN NaN NaN NaN NaN 8 R. A. Dwek grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Long Alkylchain Iminosugars Block the HCV p7 I... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393360 10.1007/0-387-25515-X_2 NaN NaN NaN NaN NaN NaN 9 N. Zitzmann grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Long Alkylchain Iminosugars Block the HCV p7 I... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393374 10.1007/0-387-25515-X_12 NaN NaN NaN NaN NaN NaN 1 David A. Shore grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Glycosylation and the Function of the T Cell C... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393378 10.1007/0-387-25515-X_12 NaN NaN NaN NaN NaN NaN 3 Raymond A. Dwek grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Glycosylation and the Function of the T Cell C... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393379 10.1007/0-387-25515-X_12 NaN NaN NaN NaN NaN NaN 4 Pauline M. Rudd grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Glycosylation and the Function of the T Cell C... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393388 10.1007/0-387-25515-X_4 NaN NaN NaN NaN NaN NaN 1 Chris Scanlan grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Antibody Recognition of a Carbohydrate Epitope... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393396 10.1007/0-387-25515-X_4 NaN NaN NaN NaN NaN NaN 8 Raymond Dwek grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Antibody Recognition of a Carbohydrate Epitope... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393397 10.1007/0-387-25515-X_4 NaN NaN NaN NaN NaN NaN 9 Pauline Rudd grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Antibody Recognition of a Carbohydrate Epitope... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393410 10.1007/0-387-25515-X_13 NaN NaN NaN NaN NaN NaN 1 Yusuke Mimura grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Immunogenicity of Calreticulin-Bound Murine Le... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393413 10.1007/0-387-25515-X_13 NaN NaN NaN NaN NaN NaN 3 Yuka Mimura-Kimura grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Immunogenicity of Calreticulin-Bound Murine Le... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393414 10.1007/0-387-25515-X_13 NaN NaN NaN NaN NaN NaN 4 Raymond A. Dwek grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Immunogenicity of Calreticulin-Bound Murine Le... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
1393415 10.1007/0-387-25515-X_13 NaN NaN NaN NaN NaN NaN 5 Pauline M. Rudd grid.13992.30 Weizmann Institute of Science Rehovot Israel IL University of Oxford Immunogenicity of Calreticulin-Bound Murine Le... 10.1007/b135491 Glycobiology and Medicine Proceedings of the 7th Jenner Glycobiology and... Restricted 2005-01-01
In [9]:
# For each distinct affiliation string, collect the country codes it has been
# matched to. A missing code is kept as its own entry in the list.
aff_string_countries = (
    df[['affiliation_string', 'org_countrycode']]
    .drop_duplicates()
    .groupby('affiliation_string')
    .agg({'org_countrycode': lambda codes: list(codes)})
)
# Number of entries in that list; anything above 1 is a conflicting attribution
# (or one attribution plus a missing one).
aff_string_countries['attributed_countries'] = aff_string_countries['org_countrycode'].map(len)
In [10]:
# List-style selection keeps the result a one-row DataFrame.
aff_string_countries.loc[['University of Oxford']]
Out[10]:
org_countrycode attributed_countries
affiliation_string
University of Oxford [GB, nan, IL, US] 4
In [11]:
# List-style selection keeps the result a one-row DataFrame.
aff_string_countries.loc[['University of Oslo']]
Out[11]:
org_countrycode attributed_countries
affiliation_string
University of Oslo [NO, nan, IL] 3
In [12]:
# List-style selection keeps the result a one-row DataFrame.
aff_string_countries.loc[['University of London']]
Out[12]:
org_countrycode attributed_countries
affiliation_string
University of London [GB, nan, IT] 3
In [13]:
# List-style selection keeps the result a one-row DataFrame.
aff_string_countries.loc[['University of Leuven']]
Out[13]:
org_countrycode attributed_countries
affiliation_string
University of Leuven [BE, US, IL] 3
In [14]:
# First ten affiliation strings that carry exactly two attributions.
aff_string_countries.loc[aff_string_countries['attributed_countries'].eq(2)].head(10)
Out[14]:
org_countrycode attributed_countries
affiliation_string
(Lanzhou Jiaotong University), Ministry of Education [CN, nan] 2
(Northeastern University), Ministry of Education [CN, nan] 2
(Renmin University of China), MOE [CN, nan] 2
(U. Lyon, CNRS, ENS Lyon, INRIA, UCBL) [nan, FR] 2
1 University of New Mexico [US, nan] 2
21st Century Systems Inc. [US, nan] 2
A*STAR [SG, nan] 2
A*STAR-NUS [nan, SG] 2
A-Star [nan, SG] 2
A. D. Patel Institute of Technology [nan, IN] 2
In [15]:
def country_to_countrycode(country):
    """Translate a country name into its two-letter country code.

    Parameters
    ----------
    country : str or missing value
        Country name as understood by ``pycountry_convert``.

    Returns
    -------
    str or float
        The alpha-2 code, or ``np.nan`` when ``country`` is missing or is a
        name that ``pycountry_convert`` does not recognise.
    """
    if pd.isna(country):
        return np.nan
    try:
        return pycountry_convert.country_name_to_country_alpha2(country)
    except KeyError:
        # pycountry_convert reports an unknown name with KeyError. Catching only
        # that keeps the best-effort mapping while letting KeyboardInterrupt
        # and genuine programming errors surface instead of becoming NaN.
        return np.nan

def countrycode_to_continent(country_code):
    """Translate a two-letter country code into a continent code.

    Parameters
    ----------
    country_code : str or missing value
        Alpha-2 country code as understood by ``pycountry_convert``.

    Returns
    -------
    str or float
        The continent code, or ``np.nan`` when ``country_code`` is missing or
        is a code that ``pycountry_convert`` cannot place on a continent.
    """
    if pd.isna(country_code):
        return np.nan
    try:
        return pycountry_convert.country_alpha2_to_continent_code(country_code)
    except KeyError:
        # pycountry_convert reports an unmapped code with KeyError. Catching only
        # that keeps the best-effort mapping while letting KeyboardInterrupt
        # and genuine programming errors surface instead of becoming NaN.
        return np.nan

We will use the official Grid.ac dataset to fix all the cases where the affiliation_string matches a unique organisation name in Grid.ac

In [16]:
grid_df = pd.read_csv('./data/grid.csv')

# Normalise a few country spellings before deriving the country code
# (presumably to the spellings country_to_countrycode can resolve; verify
# against pycountry_convert). The result is assigned back to the column:
# an in-place replace on `grid_df.Country` mutates a temporary Series and is
# not guaranteed to reach the frame (it does not under pandas Copy-on-Write).
grid_df['Country'] = grid_df['Country'].replace({
    'State of Palestine': 'Palestine',
    'Reunion': 'Réunion',
    'Aland Islands': 'Åland Islands',
    'Curacao': 'Curaçao',
})
grid_df['Countrycode'] = grid_df['Country'].map(country_to_countrycode)
grid_df.head()
Out[16]:
ID Name City State Country Countrycode
0 grid.1001.0 Australian National University Canberra Australian Capital Territory Australia AU
1 grid.1002.3 Monash University Melbourne Victoria Australia AU
2 grid.1003.2 University of Queensland Brisbane Queensland Australia AU
3 grid.1004.5 Macquarie University Sydney New South Wales Australia AU
4 grid.1005.4 UNSW Australia Sydney New South Wales Australia AU
In [17]:
# Per-country tally of GRID records; count() reports non-null cells per column,
# so the 'ID' column carries the number of organisations.
data = grid_df.groupby('Country').count().reset_index()

# Single choropleth trace keyed by country name and coloured by that tally.
tmp = [{
    'type': 'choropleth',
    'locationmode': 'country names',
    'locations': data['Country'],
    'z': data['ID'],
    'text': data['ID'],
    'autocolorscale': True,
    'reversescale': False,
    'marker': {
        'line': {'color': 'rgb(180,180,180)', 'width': 0.5},
    },
    'colorbar': {'autotick': True, 'title': 'Title'},
}]

layout = {
    'title': 'gridID dataset coverage',
    'geo': {
        'showframe': False,
        'showcoastlines': True,
        'projection': {'type': 'equirectangular'},
    },
}

fig = {'data': tmp, 'layout': layout}
# validate=False: the figure is handed over as plain dicts without plotly's schema check.
iplot(fig, validate=False)

Let's add a column with the number of entries in grid.ac that share the same name.
To avoid ambiguity, we will only use names that are unique (i.e. count = 1).

In [18]:
# How many GRID records share each organisation name, most frequent first.
count_affiliation_names = (
    grid_df[['ID', 'Name']]
    .groupby('Name')
    .count()
    .sort_values(by='ID', ascending=False)
    .reset_index()
    .rename(columns={'ID': 'Count'})
)
count_affiliation_names.head(5)
Out[18]:
Name Count
0 Ministry of Health 54
1 Ministry of Education 19
2 Argosy University 18
3 Ministry of Justice 17
4 St. Luke's Hospital 12
In [19]:
# Attach the per-name frequency to every GRID record.
# NOTE: grid_df is rebound here, so re-running this cell merges the counts a second time.
grid_df = grid_df.merge(count_affiliation_names, on='Name')
grid_df.head()
Out[19]:
ID Name City State Country Countrycode Count
0 grid.1001.0 Australian National University Canberra Australian Capital Territory Australia AU 1
1 grid.1002.3 Monash University Melbourne Victoria Australia AU 1
2 grid.1003.2 University of Queensland Brisbane Queensland Australia AU 1
3 grid.1004.5 Macquarie University Sydney New South Wales Australia AU 1
4 grid.1005.4 UNSW Australia Sydney New South Wales Australia AU 1
In [20]:
# Bring in the GRID record whose (unique) name equals the affiliation string.
unambiguous_grid = grid_df.loc[grid_df['Count'] == 1]
df = df.merge(unambiguous_grid, how='left', left_on='affiliation_string', right_on='Name')

# Rows where the merge found a record; missing values never compare equal.
name_matched = df['affiliation_string'] == df['Name']

# Overwrite the organisation columns with the GRID values on those rows only.
for org_column, grid_column in [
    ('org_gridId', 'ID'),
    ('org_name', 'Name'),
    ('org_city', 'City'),
    ('org_country', 'Country'),
    ('org_countrycode', 'Countrycode'),
]:
    df.loc[name_matched, org_column] = df.loc[name_matched, grid_column]

# Remove the merged-in GRID columns. 'Count' is not in this list, so it stays in df.
df = df.drop(columns=['ID', 'Name', 'City', 'State', 'Country', 'Countrycode'])

Check whether University of Oxford is still attributed to Israel in some records.

In [21]:
# An empty result means every "University of Oxford" row now points at GB.
df.loc[df['affiliation_string'].eq('University of Oxford') & df['org_countrycode'].ne('GB')]
Out[21]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year Count

Now let's do the same with organisation aliases.

In [22]:
# Read the alias list and give each alias the location of the organisation it
# refers to. The organisation's own ID/Name/Count columns are dropped again.
alias_df = (
    pd.read_csv('./data/aliases.csv')
    .merge(grid_df, how='left', left_on='grid_id', right_on='ID')
    .drop(columns=['ID', 'Name', 'Count'])
)
alias_df.head()
Out[22]:
grid_id alias City State Country Countrycode
0 grid.1005.4 University of New South Wales Sydney New South Wales Australia AU
1 grid.1006.7 University of Newcastle upon Tyne Newcastle upon Tyne NaN United Kingdom GB
2 grid.1007.6 Wollongong University Wollongong New South Wales Australia AU
3 grid.1008.9 Melbourne University Melbourne Victoria Australia AU
4 grid.1010.0 Adelaide Uni Adelaide South Australia Australia AU
In [23]:
# How many organisations share each alias, most frequent first.
count_alias_names = (
    alias_df[['grid_id', 'alias']]
    .groupby('alias')
    .count()
    .sort_values(by='grid_id', ascending=False)
    .reset_index()
    .rename(columns={'grid_id': 'Count'})
)
count_alias_names.head(5)
Out[23]:
alias Count
0 Merck Sharp & Dohme 22
1 Biogen Idec 8
2 Hoffmann-La Roche 8
3 Valeant Pharmaceuticals 6
4 Royal Philips 6
In [24]:
# Attach the per-alias frequency to every alias record.
# NOTE: alias_df is rebound here, so re-running this cell merges the counts a second time.
alias_df = alias_df.merge(count_alias_names, on='alias')
alias_df.head()
Out[24]:
grid_id alias City State Country Countrycode Count
0 grid.1005.4 University of New South Wales Sydney New South Wales Australia AU 1
1 grid.1006.7 University of Newcastle upon Tyne Newcastle upon Tyne NaN United Kingdom GB 1
2 grid.1007.6 Wollongong University Wollongong New South Wales Australia AU 1
3 grid.1008.9 Melbourne University Melbourne Victoria Australia AU 1
4 grid.1010.0 Adelaide Uni Adelaide South Australia Australia AU 1
In [25]:
# Bring in the organisation whose (unique) alias equals the affiliation string.
unambiguous_aliases = alias_df.loc[alias_df['Count'] == 1]
df = df.merge(unambiguous_aliases, how='left', left_on='affiliation_string', right_on='alias')

# Rows where the merge found an alias; missing values never compare equal.
alias_matched = df['affiliation_string'] == df['alias']

# Overwrite the organisation columns on those rows only.
# org_name receives the alias text itself, as no canonical name is merged in here.
for org_column, alias_column in [
    ('org_gridId', 'grid_id'),
    ('org_name', 'alias'),
    ('org_city', 'City'),
    ('org_country', 'Country'),
    ('org_countrycode', 'Countrycode'),
]:
    df.loc[alias_matched, org_column] = df.loc[alias_matched, alias_column]

# Remove the merged-in alias columns. 'Count' is not in this list, so it stays in df.
df = df.drop(columns=['grid_id', 'alias', 'City', 'State', 'Country', 'Countrycode'])
In [26]:
# An empty result means every "University of Leuven" row now points at BE.
df.loc[df['affiliation_string'].eq('University of Leuven') & df['org_countrycode'].ne('BE')]
Out[26]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year Count_x Count_y

Let's fix some minor issues we found.

In [27]:
# Set the country code by hand for two countries.
# NOTE(review): Namibia's code 'NA' was presumably read as a missing value by
# read_csv; confirm against the raw file.
df.loc[df['org_country'].eq('Namibia'), 'org_countrycode'] = 'NA'
df.loc[df['org_country'].eq('Kosovo'), 'org_countrycode'] = 'XK'
In [28]:
# Continent of the author's organisation; stays NaN where the code is missing or unmapped.
df['org_continent'] = df['org_countrycode'].map(countrycode_to_continent)
In [29]:
# Assign the continent for country code 'TL' by hand.
df.loc[df['org_countrycode'].eq('TL'), 'org_continent'] = 'AS'

Let's now fix the geolocation of the conferences.

In [30]:
# Normalise conference country spellings before deriving country codes.
# The result is assigned back to the column: an in-place replace on
# `df.conf_country` mutates a temporary Series and is not guaranteed to reach
# the frame (it does not under pandas Copy-on-Write).
df['conf_country'] = df['conf_country'].replace(to_replace={
    'The Netherlands': 'Netherlands',
    'UK': 'United Kingdom',
    'Korea (Republic of)': 'South Korea',
})
In [31]:
# Two-letter code of the hosting country; stays NaN where the name is missing or unknown.
df['conf_countrycode'] = df['conf_country'].map(country_to_countrycode)
In [32]:
# Continent of the hosting country; stays NaN where the code is missing or unmapped.
df['conf_continent'] = df['conf_countrycode'].map(countrycode_to_continent)

Let's prepare the four datasets for our analysis (macro + 3 micro). The three micro are dumped as they will be manually refined and later re-loaded.

In [33]:
# Macro dataset: every contribution except those dated 2017.
macro = df.loc[df['year'] != '2017-01-01']

For ISWC, the 2007 and 2015 editions have to be added manually.

In [34]:
# ISWC rows: the 'semweb' series plus three proceedings volumes selected by
# book DOI. Those extra rows get the series metadata filled in where it is missing.
iswc = (
    df.loc[
        (df['conf_seriesId'] == 'semweb')
        | df['book_doi'].isin([
            '10.1007/978-3-540-76298-0',
            '10.1007/978-3-319-25010-6',
            '10.1007/978-3-319-25007-6',
        ])
    ]
    .fillna({
        'conf_seriesId': 'semweb',
        'conf_acronym': 'ISWC',
        'conf_name': 'International Semantic Web Conference',
    })
)
# Dump for the manual refinement step.
iswc.to_csv('./data/iswc_dump.csv')

Let's prepare the dataset for ESWC. The 2007 edition is apparently missing!

In [35]:
# Look the volume up by book DOI; an empty result means it is not in the dataset.
df.loc[df['book_doi'] == '10.1007/978-3-540-72667-8']
Out[35]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year Count_x Count_y org_continent conf_countrycode conf_continent
In [36]:
# ESWC rows are identified by the 'esws' series id; dump them for manual refinement.
is_eswc = df.conf_seriesId == 'esws'
eswc = df.loc[is_eswc]
eswc.to_csv('./data/eswc_dump.csv')

Now it's the turn of TPDL.

In [37]:
# TPDL rows: the 'ercimdl' series plus one proceedings volume matched by book DOI.
is_tpdl = (df.conf_seriesId == 'ercimdl') | (df.book_doi == '10.1007/978-3-319-24592-8')

# Rows matched only by DOI may have no series metadata: fill it in.
tpdl = df[is_tpdl].fillna({'conf_seriesId': 'ercimdl',
                           'conf_acronym': 'TPDL',
                           'conf_name': 'International Conference on Theory and Practice of Digital Libraries'})
tpdl.to_csv('./data/tpdl_dump.csv')
In [38]:
# Preview the first rows of the TPDL subset.
tpdl.head()
Out[38]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year Count_x Count_y org_continent conf_countrycode conf_continent
300382 10.1007/978-3-540-74851-9_1 d756335bdf230885dd112341d2d131da ercimdl ECDL International Conference on Theory and Practic... Budapest Hungary 1 Jörg Diederich NaN NaN NaN NaN NaN L3S Research Center and Leibniz Universität Ha... The Semantic GrowBag Algorithm: Automatically ... 10.1007/978-3-540-74851-9 Research and Advanced Technology for Digital L... 11th European Conference, ECDL 2007, Budapest,... Restricted 2007-01-01 NaN NaN NaN HU EU
300383 10.1007/978-3-540-74851-9_1 d756335bdf230885dd112341d2d131da ercimdl ECDL International Conference on Theory and Practic... Budapest Hungary 2 Wolf-Tilo Balke NaN NaN NaN NaN NaN L3S Research Center and Leibniz Universität Ha... The Semantic GrowBag Algorithm: Automatically ... 10.1007/978-3-540-74851-9 Research and Advanced Technology for Digital L... 11th European Conference, ECDL 2007, Budapest,... Restricted 2007-01-01 NaN NaN NaN HU EU
300384 10.1007/978-3-540-74851-9_2 d756335bdf230885dd112341d2d131da ercimdl ECDL International Conference on Theory and Practic... Budapest Hungary 1 Stephan Bloehdorn grid.7892.4 Karlsruhe Institute of Technology Karlsruhe Germany DE Institute AIFB, University of Karlsruhe, D-761... Ontology-Based Question Answering for Digital ... 10.1007/978-3-540-74851-9 Research and Advanced Technology for Digital L... 11th European Conference, ECDL 2007, Budapest,... Restricted 2007-01-01 NaN NaN EU HU EU
300385 10.1007/978-3-540-74851-9_2 d756335bdf230885dd112341d2d131da ercimdl ECDL International Conference on Theory and Practic... Budapest Hungary 2 Philipp Cimiano grid.7892.4 Karlsruhe Institute of Technology Karlsruhe Germany DE Institute AIFB, University of Karlsruhe, D-761... Ontology-Based Question Answering for Digital ... 10.1007/978-3-540-74851-9 Research and Advanced Technology for Digital L... 11th European Conference, ECDL 2007, Budapest,... Restricted 2007-01-01 NaN NaN EU HU EU
300386 10.1007/978-3-540-74851-9_2 d756335bdf230885dd112341d2d131da ercimdl ECDL International Conference on Theory and Practic... Budapest Hungary 3 Alistair Duke grid.1453.3 BT Group (United Kingdom) London United Kingdom GB British Telecom, Adastral Park, Ipswich IP5 3RE Ontology-Based Question Answering for Digital ... 10.1007/978-3-540-74851-9 Research and Advanced Technology for Digital L... 11th European Conference, ECDL 2007, Budapest,... Restricted 2007-01-01 NaN NaN EU HU EU

Finally, let's load another dataset providing a lot of information about countries, including their centroids.

In [39]:
centroids = pd.read_csv('./data/country_centroids_az8.csv')

# Hand-entered centroids (ISO alpha-2 code, latitude, longitude) appended to the
# ones read from the CSV file.
# Two entries were corrected:
#   * GP used 29.030833, -118.28, which is Guadalupe Island (Mexico), not Guadeloupe;
#   * MQ used latitude 4.666667, whereas Martinique lies at about 14.67 N.
additional_centroids = pd.DataFrame(data=[['GP', 16.25, -61.583333],      # Guadeloupe
                                          ['RE', -21.114444, 55.5325],    # Reunion
                                          ['BQ', 12.183333, -68.25],      # Bonaire
                                          ['MQ', 14.666667, -61.0],       # Martinique
                                          ['TV', -8.520833, 179.198611],  # Tuvalu
                                          ['NA', -22.0, 17.0],            # Namibia
                                          ['GF', 4.0, -53.0],             # French Guiana
                                          ['XK', 42.583333, 21.0]],       # Kosovo
                           columns=['iso_a2', 'Latitude', 'Longitude'])

# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat gives
# the same result (the columns missing from the extra rows are filled with NaN).
centroids = pd.concat([centroids, additional_centroids], sort=False)
In [40]:
# Preview the first five rows of the centroid table.
centroids.head(n=5)
Out[40]:
FID the_geom FID_ scalerank featurecla labelrank sovereignt sov_a3 adm0_dif level type admin adm0_a3 geou_dif geounit gu_a3 su_dif subunit su_a3 brk_diff name name_long brk_a3 brk_name brk_group abbrev postal formal_en formal_fr note_adm0 note_brk name_sort name_alt mapcolor7 mapcolor8 mapcolor9 mapcolor13 pop_est gdp_md_est pop_year lastcensus gdp_year economy income_grp wikipedia fips_10 iso_a2 iso_a3 iso_n3 un_a3 wb_a2 wb_a3 woe_id adm0_a3_is adm0_a3_us adm0_a3_un adm0_a3_wb continent region_un subregion region_wb name_len long_len abbrev_len tiny homepart Longitude Latitude
0 country_centroids_az8.1 POINT (-69.9826771125 12.5208803838) 0.0 3.0 Admin-0 country 5.0 Netherlands NL1 1.0 2.0 Country Aruba ABW 0.0 Aruba ABW 0.0 Aruba ABW 0.0 Aruba Aruba ABW Aruba NaN Aruba AW Aruba NaN Neth. NaN Aruba NaN 4.0 2.0 2.0 9.0 103065.0 2258.0 -99.0 2010.0 -99.0 6. Developing region 2. High income: nonOECD -99.0 NaN AW ABW 533.0 533.0 AW ABW -99.0 ABW ABW -99.0 -99.0 North America Americas Caribbean Latin America & Caribbean 5.0 5.0 5.0 4.0 -99.0 -69.982677 12.520880
1 country_centroids_az8.2 POINT (66.0047336558 33.8352307278) 1.0 1.0 Admin-0 country 3.0 Afghanistan AFG 0.0 2.0 Sovereign country Afghanistan AFG 0.0 Afghanistan AFG 0.0 Afghanistan AFG 0.0 Afghanistan Afghanistan AFG Afghanistan NaN Afg. AF Islamic State of Afghanistan NaN NaN NaN Afghanistan NaN 5.0 6.0 8.0 7.0 28400000.0 22270.0 -99.0 1979.0 -99.0 7. Least developed region 5. Low income -99.0 NaN AF AFG 4.0 4.0 AF AFG -99.0 AFG AFG -99.0 -99.0 Asia Asia Southern Asia South Asia 11.0 11.0 4.0 -99.0 1.0 66.004734 33.835231
2 country_centroids_az8.3 POINT (17.5373676815 -12.2933605438) 2.0 1.0 Admin-0 country 3.0 Angola AGO 0.0 2.0 Sovereign country Angola AGO 0.0 Angola AGO 0.0 Angola AGO 0.0 Angola Angola AGO Angola NaN Ang. AO People's Republic of Angola NaN NaN NaN Angola NaN 3.0 2.0 6.0 1.0 12799293.0 110300.0 -99.0 1970.0 -99.0 7. Least developed region 3. Upper middle income -99.0 NaN AO AGO 24.0 24.0 AO AGO -99.0 AGO AGO -99.0 -99.0 Africa Africa Middle Africa Sub-Saharan Africa 6.0 6.0 4.0 -99.0 1.0 17.537368 -12.293361
3 country_centroids_az8.4 POINT (-63.0649892654 18.2239595023) 3.0 1.0 Admin-0 country 6.0 United Kingdom GB1 1.0 2.0 Dependency Anguilla AIA 0.0 Anguilla AIA 0.0 Anguilla AIA 0.0 Anguilla Anguilla AIA Anguilla NaN Ang. AI NaN NaN U.K. NaN Anguilla NaN 6.0 6.0 6.0 3.0 14436.0 108.9 -99.0 -99.0 -99.0 6. Developing region 3. Upper middle income -99.0 NaN AI AIA 660.0 660.0 -99 -99 -99.0 AIA AIA -99.0 -99.0 North America Americas Caribbean Latin America & Caribbean 8.0 8.0 4.0 -99.0 -99.0 -63.064989 18.223959
4 country_centroids_az8.5 POINT (20.0498339611 41.1424498947) 4.0 1.0 Admin-0 country 6.0 Albania ALB 0.0 2.0 Sovereign country Albania ALB 0.0 Albania ALB 0.0 Albania ALB 0.0 Albania Albania ALB Albania NaN Alb. AL Republic of Albania NaN NaN NaN Albania NaN 1.0 4.0 1.0 6.0 3639453.0 21810.0 -99.0 2001.0 -99.0 6. Developing region 4. Lower middle income -99.0 NaN AL ALB 8.0 8.0 AL ALB -99.0 ALB ALB -99.0 -99.0 Europe Europe Southern Europe Europe & Central Asia 7.0 7.0 4.0 -99.0 1.0 20.049834 41.142450
In [41]:
def latlon(countrycode):
    """Return the centroid of a country as a ``(latitude, longitude)`` tuple.

    The coordinates are looked up in the module-level ``centroids`` frame by
    matching ``countrycode`` against its ``iso_a2`` column.

    Parameters
    ----------
    countrycode : str or missing value
        Two-letter country code; NaN/None is accepted.

    Returns
    -------
    tuple of float
        ``(latitude, longitude)``. ``(nan, nan)`` is returned when the code is
        missing, when it does not match exactly one row of ``centroids``, or
        when the stored coordinates cannot be converted to ``float``; in the
        last two cases the offending code is also printed so it can be fixed.
    """
    missing = (np.nan, np.nan)
    if pd.isna(countrycode):
        return missing

    # Filter once instead of once per coordinate.
    match = centroids.loc[centroids.iso_a2 == countrycode, ['Latitude', 'Longitude']]
    if len(match) == 1:
        try:
            return (float(match['Latitude'].iloc[0]),
                    float(match['Longitude'].iloc[0]))
        except (TypeError, ValueError):
            # Fall through to the common "unresolvable code" handling below.
            pass

    # Unknown or ambiguous code: report it, but keep the return type consistent
    # with the missing-value case instead of implicitly returning None.
    print(countrycode)
    return missing

Let's free some memory now

In [42]:
# Drop the notebook-level reference to the full frame; the analysis continues
# on the subsets built from it (macro, iswc, eswc, tpdl).
del df

Macro analysis

Data preparation

In [43]:
# Summary statistics for every column of the macro dataset (include='all'
# covers the non-numeric columns as well).
macro.describe(include='all')
Out[43]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year Count_x Count_y org_continent conf_countrycode conf_continent
count 1664733 946165 946165 946165 946165 946165 946165 1.664733e+06 1663075 1664733 1369841 1369841 1369831 1369831 1369831 1632282 1664733 1664733 1664733 1664733 1398076 1664733 712848.0 50397.0 1369831 946165 946165
unique 477921 1016 969 1064 1193 1201 100 NaN 157022 235440 15423 16043 4305 168 166 182169 475894 11335 6726 11056 2 21 NaN NaN 6 100 6
top 10.1007/978-3-540-49676-2_4 7a0232d303c8198116d174007f5faca1 hci MICCAI International Conference on Medical Image Comp... Beijing China NaN M. Wang grid.9227.e Chinese Academy of Sciences Beijing China CN Chinese Academy of Sciences Introduction 10.1007/978-3-540-36841-0 World Congress on Medical Physics and Biomedic... Volume 1 Restricted 2015-01-01 00:00:00 NaN NaN EU CN EU
freq 240 27468 34390 19683 19683 25829 124473 NaN 19506 24014 10650 10650 55488 197746 197746 8792 392 5240 12404 7605 1324725 149612 NaN NaN 637889 124473 515033
first NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996-01-01 00:00:00 NaN NaN NaN NaN NaN
last NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-01-01 00:00:00 NaN NaN NaN NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN 2.646140e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN 2.676850e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN 2.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN 3.000000e+00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN 1.390000e+02 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
In [44]:
# Same summary, restricted to the rows that carry a conference id.
macro[macro.conf_id.notna()].describe(include='all')
Out[44]:
paper_doi conf_id conf_seriesId conf_acronym conf_name conf_city conf_country author_order author_givenName author_familyName org_gridId org_name org_city org_country org_countrycode affiliation_string paper_title book_doi book_title book_confSubtitle rights year Count_x Count_y org_continent conf_countrycode conf_continent
count 946165 946165 946165 946165 946165 946165 946165 946165.000000 945899 946165 809939 809939 809933 809933 809933 942749 946165 946165 946165 946165 888267 946165 427988.0 30214.0 809933 946165 946165
unique 264534 1016 969 1064 1193 1201 100 NaN 96616 128098 9578 10003 3247 150 149 98905 263653 6405 2706 6373 2 20 NaN NaN 6 100 6
top 10.1007/978-3-540-49676-2_9 7a0232d303c8198116d174007f5faca1 hci MICCAI International Conference on Medical Image Comp... Beijing China NaN Michael Wang grid.9227.e Chinese Academy of Sciences Beijing China CN Chinese Academy of Sciences Formal Derivation of Finite State Machines for... 10.1007/978-3-540-74958-5 Electronic Government 18th European Conference on Machine Learning, ... Restricted 2013-01-01 00:00:00 NaN NaN EU CN EU
freq 240 27468 34390 19683 19683 25829 124473 NaN 5657 13952 7328 7328 33704 111188 111188 6085 240 3780 9127 3780 827838 88827 NaN NaN 381029 124473 515033
first NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1997-01-01 00:00:00 NaN NaN NaN NaN NaN
last NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-01-01 00:00:00 NaN NaN NaN NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN 2.448613 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN 1.982875 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN 1.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN 1.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN 2.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN 3.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN 139.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN
In [45]:
# Number of distinct (continent, conf_id) pairs per continent.
# NOTE(review): the displayed counts add up to 1751, more than the 1016 unique
# conf_id values reported by describe() above, so some conf_id values occur
# under more than one continent - confirm this is expected.
macro[['conf_continent', 'conf_id']].drop_duplicates().groupby('conf_continent').count()
Out[45]:
conf_id
conf_continent
AF 28
AS 410
EU 800
NA 343
OC 104
SA 66
In [46]:
# Number of distinct conferences per year: one line for the whole dataset and
# one line per continent hosting them.
conferences_per_year = macro.groupby('year')['conf_id'].nunique()
trace1 = go.Scatter(x=conferences_per_year.index,
                    y=conferences_per_year.values,
                    name='All conferences')
traces = [trace1]

# Rows without conference information have a NaN continent. `unique()` would
# return that NaN too, and since `== NaN` never matches it would add an empty
# 'nan conferences' series to the legend, hence the dropna().
for continent in macro['conf_continent'].dropna().unique():
    # Compute the per-continent series once and reuse it for both axes.
    continent_per_year = macro[macro.conf_continent == continent].groupby('year')['conf_id'].nunique()
    trace = go.Scatter(x=continent_per_year.index,
                       y=continent_per_year.values,
                       name=str(continent) + ' conferences')
    traces.append(trace)

layout = go.Layout(title="Number of conferences per year",
                   legend=dict(orientation='h'))

fig = go.Figure(data=traces, layout=layout)
plotly.offline.iplot(fig)
In [47]:
# Contribution rows per year: all of them, those whose organisation has a
# GRID id, and those without one.
has_grid_id = macro['org_gridId'].notna()
macro_contributions = macro.groupby('year')['paper_doi'].count()
macro_contributions_with_gridId = macro[has_grid_id].groupby('year')['paper_doi'].count()
macro_contributions_without_gridId = macro[~has_grid_id].groupby('year')['paper_doi'].count()

# One line+marker series per count, each with its own legend label and symbol.
trace1, trace2, trace3 = [
    go.Scatter(x=counts.index,
               y=counts.values,
               name=label,
               marker=dict(symbol=symbol, size=8),
               mode='lines+markers')
    for counts, label, symbol in [
        (macro_contributions, 'contributions', 'circle'),
        (macro_contributions_with_gridId, 'contributions (w/ gridID)', 'cross'),
        (macro_contributions_without_gridId, 'contributions (w/o gridID)', 'triangle-up'),
    ]
]

layout = go.Layout(title='Trends of contributions',
                   legend=dict(orientation='h'))

fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
plotly.offline.iplot(fig)
pio.write_image(fig, 'reports/figures/macro_contributions.png')
In [48]:
# Number of distinct paper DOIs and distinct GRID ids per year.
# The columns are selected with a list (double brackets): selecting several
# GroupBy columns with a bare tuple (`[...]['a', 'b']`) is deprecated and no
# longer accepted by pandas 2.
macro_ids = macro.groupby(['year'])[['paper_doi', 'org_gridId']].nunique()

trace1 = go.Scatter(x=macro_ids.index,
                    y=macro_ids['paper_doi'],
                    name='DOIs',
                    marker=dict(symbol='circle', size=8),
                    mode='lines+markers')

trace2 = go.Scatter(x=macro_ids.index,
                    y=macro_ids['org_gridId'],
                    name='gridIDs',
                    marker=dict(symbol='cross', size=8),
                    mode='lines+markers')

layout = go.Layout(title='Trends of unique identifiers',
                   legend=dict(orientation='h'))

fig = go.Figure(data=[trace1, trace2], layout=layout)
plotly.offline.iplot(fig)
pio.write_image(fig, 'reports/figures/macro_identifiers.png')
In [49]:
# Correlation between the yearly numbers of distinct DOIs and distinct GRID ids.
macro_ids.corr()
Out[49]:
paper_doi org_gridId
paper_doi 1.000000 0.987086
org_gridId 0.987086 1.000000

Let's validate our hypothesis on authors' order

In [50]:
# One row per (paper, author position, family name), sorted so that each
# paper's authors appear in byline order ...
author_rows = macro[['paper_doi', 'author_order', 'author_familyName']].drop_duplicates()
author_rows = author_rows.sort_values(by=['paper_doi', 'author_order'])

# ... then collapsed into one list of family names per paper.
author_lists = author_rows.groupby('paper_doi').agg({'author_familyName': lambda names: list(names)})

def isOrdered(lst):
    """Tell whether a list of names is in non-decreasing (alphabetical) order.

    Lists with fewer than two elements are reported as *not* ordered, because
    ordering carries no information for them. This now includes the empty
    list, which previously fell through to a vacuously true ``all(...)``.

    Parameters
    ----------
    lst : list
        Values comparable with ``<=`` (here: author family names). The
        comparison is the plain, case-sensitive string comparison.

    Returns
    -------
    bool
        True if ``len(lst) >= 2`` and every element is ``<=`` its successor.
    """
    if len(lst) < 2:
        return False
    # Compare each element with its successor; equal neighbours count as ordered.
    return all(current <= following for current, following in zip(lst, lst[1:]))

# Per paper: whether the byline is alphabetical, and how many authors it has.
author_lists['isOrdered'] = author_lists['author_familyName'].map(isOrdered)
author_lists['n_authors'] = author_lists['author_familyName'].map(len)

Let's double check we did not miss any paper. 477921 is the number of unique DOIs we are expecting to see

In [51]:
# Number of papers (rows) in author_lists.
author_lists.shape[0]
Out[51]:
477921
In [52]:
# Preview the first ten papers with their author lists and derived flags.
author_lists.head(n=10)
Out[52]:
author_familyName isOrdered n_authors
paper_doi
10.1007/0-387-25515-X_1 [Zitzmann, Block, Methta, Rudd, Burton, Wilson... False 9
10.1007/0-387-25515-X_10 [Steen, Grillet, Opdenakker] False 3
10.1007/0-387-25515-X_11 [Wright, Day] False 2
10.1007/0-387-25515-X_12 [Shore, Wilson, Dwek, Rudd] False 4
10.1007/0-387-25515-X_13 [Mimura, Golgher, Mimura-Kimura, Dwek, Rudd, E... False 6
10.1007/0-387-25515-X_14 [Hooper] False 1
10.1007/0-387-25515-X_15 [Blake, Esapa, Martin-Rendon, McIlhinney] True 4
10.1007/0-387-25515-X_16 [Shi, Williams, Kurniawan, Lu, Stanley] False 5
10.1007/0-387-25515-X_17 [Baldwin, Allen, Bourke, Hounsell, Calvert] False 5
10.1007/0-387-25515-X_18 [Fry, Steen, Royle, Wormald, Leathem, Opdenakk... False 8

The fraction of papers whose authors are listed in alphabetical order is:

In [53]:
# Share of papers whose author list is alphabetically ordered.
ordered_papers = author_lists[author_lists.isOrdered == True]
ordered_papers.shape[0] / author_lists.shape[0]
Out[53]:
0.26809661010920216

Counting only the alphabetically ordered papers with at least three authors, their share of all papers drops to:

In [54]:
# Numerator: alphabetically ordered papers with at least three authors
# (n_authors >= 3). Denominator: ALL papers, not only those with >= 3 authors.
# NOTE(review): if the intended figure is the share of ordered papers among
# papers with >= 3 authors, the denominator should be restricted too - confirm.
author_lists[(author_lists.isOrdered == True) & (author_lists.n_authors >= 3)].shape[0]/author_lists.shape[0]
Out[54]:
0.1125311505458015
In [55]:
# Author counts of the alphabetically ordered papers only.
ordered_n_authors = author_lists.loc[author_lists.isOrdered == True, 'n_authors']

# One bin per author count, from 1 up to the largest observed value.
author_bins = dict(start=1, size=1, end=ordered_n_authors.max())
trace = go.Histogram(x=ordered_n_authors,
                     histnorm='probability',
                     xbins=author_bins)

layout = go.Layout(title="Probability distribution of n_authors for authors ordered alphabetically")

fig = go.Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)
In [56]:
# Per (paper, year): the set of GRID ids and the set of country codes found
# among the paper's author affiliations (missing values stay in the sets).
paper_stats = (macro[['paper_doi', 'org_gridId', 'org_countrycode', 'year']]
               .groupby(['paper_doi', 'year'])
               .agg({'org_gridId': lambda values: set(values),
                     'org_countrycode': lambda values: set(values)}))
def len_of_set_with_nan(x):
    """Count the known values in a set that may also contain missing values.

    Missing values (NaN/None) are not counted, except that a non-empty set
    made only of missing values counts as 1.

    Missing values are detected with ``pd.isna`` on each element rather than
    with ``np.nan in x``: set membership compares NaN by object identity, so
    NaN objects other than the ``np.nan`` singleton (or several NaN objects in
    the same set) were previously counted as regular values.

    Parameters
    ----------
    x : set
        Scalar values, possibly including NaN/None.

    Returns
    -------
    int
        Number of non-missing elements; 1 if the set is non-empty but holds
        only missing values; 0 for an empty set.
    """
    known = sum(1 for value in x if not pd.isna(value))
    if known == 0 and len(x) > 0:
        return 1
    return known

# Number of distinct organisations and countries per paper.
paper_stats['n_gridIds'] = paper_stats['org_gridId'].map(len_of_set_with_nan)
paper_stats['n_countries'] = paper_stats['org_countrycode'].map(len_of_set_with_nan)

# Attach the per-paper author information and flatten the index into columns.
paper_stats = author_lists.join(paper_stats).reset_index()
paper_stats.head(10)
Out[56]:
paper_doi year author_familyName isOrdered n_authors org_gridId org_countrycode n_gridIds n_countries
0 10.1007/0-387-25515-X_1 2005-01-01 [Zitzmann, Block, Methta, Rudd, Burton, Wilson... False 9 {grid.4991.5, grid.251075.4, grid.214007.0} {US, GB} 3 2
1 10.1007/0-387-25515-X_10 2005-01-01 [Steen, Grillet, Opdenakker] False 3 {grid.5596.f} {BE} 1 1
2 10.1007/0-387-25515-X_11 2005-01-01 [Wright, Day] False 2 {grid.4991.5} {GB} 1 1
3 10.1007/0-387-25515-X_12 2005-01-01 [Shore, Wilson, Dwek, Rudd] False 4 {grid.4991.5, grid.251075.4, grid.214007.0} {US, GB} 3 2
4 10.1007/0-387-25515-X_13 2005-01-01 [Mimura, Golgher, Mimura-Kimura, Dwek, Rudd, E... False 6 {grid.4991.5, grid.5491.9} {GB} 2 1
5 10.1007/0-387-25515-X_14 2005-01-01 [Hooper] False 1 {grid.9909.9} {GB} 1 1
6 10.1007/0-387-25515-X_15 2005-01-01 [Blake, Esapa, Martin-Rendon, McIlhinney] True 4 {grid.4991.5, grid.251075.4} {US, GB} 2 2
7 10.1007/0-387-25515-X_16 2005-01-01 [Shi, Williams, Kurniawan, Lu, Stanley] False 5 {grid.13992.30} {IL} 1 1
8 10.1007/0-387-25515-X_17 2005-01-01 [Baldwin, Allen, Bourke, Hounsell, Calvert] False 5 {grid.4464.2, grid.42629.3b, grid.266842.c, gr... {AU, GB} 4 2
9 10.1007/0-387-25515-X_18 2005-01-01 [Fry, Steen, Royle, Wormald, Leathem, Opdenakk... False 8 {grid.4991.5, grid.5596.f, grid.83440.3b} {GB, BE} 3 2
In [57]:
# Number of organisations per paper, split by whether the byline is alphabetical.
unordered_n_gridIds = paper_stats.loc[paper_stats.isOrdered == False, 'n_gridIds']
ordered_n_gridIds = paper_stats.loc[paper_stats.isOrdered == True, 'n_gridIds']

# Each histogram gets unit-wide bins from 1 up to its own maximum.
trace1 = go.Histogram(x=unordered_n_gridIds,
                      histnorm='probability',
                      name='unordered',
                      xbins=dict(start=1, size=1, end=unordered_n_gridIds.max()))

trace2 = go.Histogram(x=ordered_n_gridIds,
                      histnorm='probability',
                      name='ordered',
                      xbins=dict(start=1, size=1, end=ordered_n_gridIds.max()))

layout = go.Layout(title="Probability distribution of n_gridids")

fig = go.Figure(data=[trace1, trace2], layout=layout)
plotly.offline.iplot(fig)