In [14]:
import zlib

import numpy as np
import pandas as pd
In [15]:
# Country metadata sits on its own sheet of the publications workbook; its
# column labels are on the first row of that sheet.
metac = pd.read_excel(
    "IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls",
    "Metadata - Countries",
    header=0,
    index_col="Country Name",
)

# Both "Data" sheets are read the same way: column labels come from the third
# row of the sheet (header=2) and rows are keyed by the "Country Name" column.
publications = pd.read_excel(
    "IP.JRN.ARTC.SC_Indicator_en_excel_v2.xls",
    "Data",
    header=2,
    index_col="Country Name",
)
population = pd.read_excel(
    "sp.pop.totl_Indicator_en_excel_v2.xls",
    "Data",
    header=2,
    index_col="Country Name",
)
In [16]:
# Index labels of the metadata rows that have a 'Region' value, i.e. actual
# countries as opposed to regions.
okCountries = metac.index[metac['Region'].notnull().values]
In [17]:
# Keep only the rows for real countries (all columns) in both tables.
publications = publications.loc[okCountries, :]
population = population.loc[okCountries, :]
In [18]:
# Publications divided by population for 2009, aligned on country name.
# 2009 is used because there is no publications data for 2010, 2011, 2012 or 2013.
pubs_per_cap = publications['2009'] / population['2009']
# Drop countries for which the ratio is missing.
pubs_per_cap = pubs_per_cap.dropna()
In [19]:
# Order countries from the highest to the lowest publications-per-capita value.
# The in-place `Series.sort` no longer exists in pandas; `sort_values` returns
# a new sorted Series, so rebind the name for the cells below.
pubs_per_cap = pubs_per_cap.sort_values(ascending=False)
In [20]:
# Display the per-capita series (sorted in descending order by the preceding cell).
pubs_per_cap
Out[20]:
Country Name
Switzerland       0.001223
Sweden            0.001019
Denmark           0.000961
Finland           0.000927
Norway            0.000919
Netherlands       0.000899
Australia         0.000869
Canada            0.000860
Israel            0.000842
Singapore         0.000839
Iceland           0.000814
New Zealand       0.000739
United Kingdom    0.000739
United States     0.000680
Belgium           0.000669
...
Korea, Dem. Rep.       3.405566e-07
Angola                 3.275804e-07
Guinea                 3.209592e-07
Congo, Dem. Rep.       3.157741e-07
Burundi                2.912615e-07
Chad                   2.022631e-07
Myanmar                1.979027e-07
Turkmenistan           1.004225e-07
Somalia                7.462007e-08
Liberia                0.000000e+00
St. Lucia              0.000000e+00
Tuvalu                 0.000000e+00
Antigua and Barbuda    0.000000e+00
Samoa                  0.000000e+00
Kiribati               0.000000e+00
Name: 2009, Length: 188, dtype: float64
In [21]:
# Plain arrays/lists for the plotting cell, all in the order of pubs_per_cap.
countries = list(pubs_per_cap.index)   # country name for each entry
vals = pubs_per_cap.values             # publications per capita (bar heights)
# 2009 population of the same countries, scaled to millions (bar widths).
pops = population['2009'].loc[pubs_per_cap.index].values / 1000000
# Running total of the populations with a leading 0: entry i is where bar i starts.
poss = np.hstack(([0.], np.cumsum(pops)))
In [22]:
n = 50  # number of top-ranked countries to draw

# One colour per country, derived from a checksum of its name so that a given
# country always gets the same colour. The built-in hash() is randomised per
# process for strings on Python 3, which made the colours change on every
# kernel restart; crc32 is stable (masked so it is non-negative everywhere).
set1 = get_cmap("Set1")
colors = [set1(((zlib.crc32(name.encode("utf-8")) & 0xffffffff) / 1234567.) % 1)
          for name in pubs_per_cap.index[:n]]
# or alternatively something like:
# https://gist.github.com/endolith/2719900#file_accent.py
# colors = [get_cmap("Paired")(rand()) for i in range(n)]

figure(num=None, figsize=(14, 8), dpi=80, facecolor='w', edgecolor='k')
# poss holds the left edge of every bar and pops its width, so the x axis is
# cumulative population. Edge alignment is requested explicitly because
# matplotlib >= 2.0 centres bars on x by default, which would shift every bar
# left by half its width and make neighbours overlap.
rects = bar(poss[:n], vals[:n], width=pops[:n], align='edge',
            linewidth=0, color=colors)
# Label each bar with its country name, anchored slightly above the bar top at
# the horizontal middle of the bar. zip stops at the shorter sequence, i.e. at
# the number of bars actually drawn.
for rect, country in zip(rects, countries):
    text(rect.get_x() + rect.get_width() / 2., 1.05 * rect.get_height(), country,
         ha='left', va='bottom', rotation=45)
xlabel("Population [in millions]")
ylabel("Publications per capita per year (2009)")
show()
In [22]: