Quick and dirty code to address the question "Research publications per capita?" on Academia Stack Exchange.

In [14]:

import zlib

import pandas as pd

In [15]:

# The two "Data" sheets are read with identical options: column names come
# from the sheet row at zero-based position 2, and the rows are keyed by the
# "Country Name" column.
_data_sheet_kwargs = dict(header=2, index_col="Country Name")

population = pd.read_excel("sp.pop.totl_Indicator_en_excel_v2.xls", "Data",
                           **_data_sheet_kwargs)
publications = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls", "Data",
                             **_data_sheet_kwargs)

# Per-country metadata lives on a second sheet of the publications workbook;
# there the column names are on the very first row.
metac = pd.read_excel("IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls",
                      "Metadata - Countries",
                      header=0, index_col="Country Name")

In [16]:

# Keep the names of the metadata rows that have a 'Region' value: those are
# the actual countries, as opposed to regions.
okCountries = metac.dropna(subset=['Region']).index

In [17]:

# Restrict both tables to the rows listed in okCountries (all columns kept),
# in the order given by okCountries.
population = population.loc[okCountries, :]
publications = publications.loc[okCountries, :]

In [18]:

# Ratio of the '2009' publications column to the '2009' population column,
# matched row by row on the country-name index; entries whose ratio is NaN
# are dropped.
# There is no data for publications for 2010, 2011, 2012 or 2013.
pubs_per_cap = publications['2009'].div(population['2009']).dropna()

In [19]:

# Rank the countries from the highest to the lowest ratio.
# The in-place Series.sort() no longer exists in current pandas releases;
# sort_values() returns a new, sorted Series, so the name is rebound. Rebinding
# also keeps this cell idempotent when it is re-run.
pubs_per_cap = pubs_per_cap.sort_values(ascending=False)

In [20]:

# Echo the Series so the notebook renders it as this cell's output.
pubs_per_cap

Out[20]:

Country Name
Switzerland       0.001223
Sweden            0.001019
Denmark           0.000961
Finland           0.000927
Norway            0.000919
Netherlands       0.000899
Australia         0.000869
Canada            0.000860
Israel            0.000842
Singapore         0.000839
Iceland           0.000814
New Zealand       0.000739
United Kingdom    0.000739
United States     0.000680
Belgium           0.000669
...
Korea, Dem. Rep.       3.405566e-07
Angola                 3.275804e-07
Guinea                 3.209592e-07
Congo, Dem. Rep.       3.157741e-07
Burundi                2.912615e-07
Chad                   2.022631e-07
Myanmar                1.979027e-07
Turkmenistan           1.004225e-07
Somalia                7.462007e-08
Liberia                0.000000e+00
St. Lucia              0.000000e+00
Tuvalu                 0.000000e+00
Antigua and Barbuda    0.000000e+00
Samoa                  0.000000e+00
Kiribati               0.000000e+00
Name: 2009, Length: 188, dtype: float64

In [21]:

# Plain arrays/lists for the plotting cell, all in the order of pubs_per_cap.
vals = pubs_per_cap.values
# '2009' population of exactly those countries, divided by one million.
pops = population.loc[pubs_per_cap.index, '2009'].values / 1000000
# Running total of pops with a leading 0: entry i is the summed population of
# the first i countries, so this array has one more entry than pops.
poss = np.concatenate(([0.], pops.cumsum()))
countries = list(pubs_per_cap.index)

In [22]:

# Number of leading entries of pubs_per_cap to draw.
n = 50

# Give every country a pseudo-random but reproducible colour from "Set1".
# The built-in hash() of a string is salted per interpreter process on
# Python 3, so colours derived from it change on every kernel restart; a CRC32
# of the UTF-8 encoded name is stable across runs. The mask keeps the checksum
# non-negative on every Python version.
cmap = get_cmap("Set1")
colors = [cmap(((zlib.crc32(name.encode("utf-8")) & 0xffffffff) / 1234567.) % 1)
          for name in pubs_per_cap.index[:n]]
# or alternatively something like:
# https://gist.github.com/endolith/2719900#file_accent.py
# colors = [get_cmap("Paired")(rand()) for i in range(n)]

figure(num=None, figsize=(14, 8), dpi=80, facecolor='w', edgecolor='k')
# Variable-width bars: each bar is as wide as the country's population (in
# millions) and starts where the previous one ends, i.e. `poss` holds LEFT
# edges. align='edge' has to be explicit because matplotlib >= 2.0 centres
# bars on x by default, which would shift every bar by half its width and
# make neighbouring bars overlap.
rects = bar(poss[:n], vals[:n], width=pops[:n], align='edge',
            linewidth=0, color=colors)
# Label each bar with its country name, anchored at the bar's horizontal
# middle and slightly (5%) above its top.
for rect, country in zip(rects, countries):
    bar_middle = rect.get_x() + rect.get_width() / 2.
    text(bar_middle, 1.05 * rect.get_height(), country,
         ha='left', va='bottom', rotation=45)
xlabel("Population [in millions]")
ylabel("Publications per capita per year (2009)")
show()

In [22]: