In [1]:
# This is my notebook for exploring data about economic inequality in Cambodia.

%matplotlib inline
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import qgrid
from pylab import *
import seaborn as sb

# Hey, good news! We can remotely access the World Bank's World Development Indicators Database
# directly from pandas!

from pandas.io import wb
In [2]:
# Search the World Bank database for poverty-related indicators (pattern 'pov.*%')
# and keep only the first two columns of the result (id and name).
# qgrid is not used here: it would not display the id column correctly, although
# it would look nicer if it worked.

pov = wb.search('pov.*%')
pov = pov.iloc[:, 0:2]
pov
Out[2]:
id name
5453 IN.POV.HCR.EST.RURL Poverty HCR Estimates (%) - Rural
5454 IN.POV.HCR.EST.TOTL Poverty HCR Estimates (%) - Total
5455 IN.POV.HCR.EST.URBN Poverty HCR Estimates (%) - Urban
7529 SI.POV.25DAY Poverty headcount ratio at $2.5 a day (PPP) (%...
7530 SI.POV.2DAY Poverty headcount ratio at $2 a day (PPP) (% o...
7531 SI.POV.4DAY Poverty headcount ratio at $4 a day (PPP) (% o...
7532 SI.POV.5DAY Poverty headcount ratio at $5 a day (PPP) (% o...
7533 SI.POV.DDAY Poverty headcount ratio at $1.25 a day (PPP) (...
7534 SI.POV.GAP2 Poverty gap at $2 a day (PPP) (%)
7535 SI.POV.GAP25 Poverty gap at $2.5 a day (PPP) (%)
7536 SI.POV.GAP4 Poverty gap at $4 a day (PPP) (%)
7537 SI.POV.GAP5 Poverty gap at $5 a day (PPP) (%)
7538 SI.POV.GAPS Poverty gap at $1.25 a day (PPP) (%)
7540 SI.POV.NAGP Poverty gap at national poverty lines (%)
7541 SI.POV.NAHC Poverty headcount ratio at national poverty li...
7547 SI.POV.RUGP Rural poverty gap at national poverty lines (%)
7548 SI.POV.RUHC Rural poverty headcount ratio at national pove...
7549 SI.POV.URGP Urban poverty gap at national poverty lines (%)
7550 SI.POV.URHC Urban poverty headcount ratio at national pove...
10162 ccx_povchi_40_fem Poverty headcount of children (below bottom 40...
10163 ccx_povchi_40_mal Poverty headcount of children (below bottom 40...
10164 ccx_povchi_40_rur Poverty headcount of children (below bottom 40...
10165 ccx_povchi_40_tot Poverty headcount of children (below bottom 40%)
10166 ccx_povchi_40_urb Poverty headcount of children (below bottom 40...
10167 ccx_poveld_40_fem Poverty headcount of the elderly (below bottom...
10168 ccx_poveld_40_mal Poverty headcount of the elderly (below bottom...
10169 ccx_poveld_40_rur Poverty headcount of the elderly (below bottom...
10170 ccx_poveld_40_tot Poverty headcount of the elderly (below bottom...
10171 ccx_poveld_40_urb Poverty headcount of the elderly (below bottom...
10172 ccx_povwka_40_fem Poverty headcount of working age adults (below...
... ... ...
12761 per_si_allsi_p1_ep_preT_tot Poverty Gap reduction (%) - All Social Insura...
12762 per_si_allsi_p1_ep_tot Poverty Gap reduction (%) - All Social Insura...
12763 per_si_allsi_p1_preT_tot Poverty Gap reduction (%) - All Social Insura...
12764 per_si_allsi_p1_rur Poverty Gap reduction (%) - All Social Insura...
12765 per_si_allsi_p1_tot Poverty Gap reduction (%) - All Social Insura...
12766 per_si_allsi_p1_urb Poverty Gap reduction (%) - All Social Insura...
12901 per_si_oa_p0_ep_preT_tot Poverty Headcount reduction (%) - Old Age Con...
12902 per_si_oa_p0_ep_tot Poverty Headcount reduction (%) - Old Age Con...
12903 per_si_oa_p0_preT_tot Poverty Headcount reduction (%) - Old Age Con...
12904 per_si_oa_p0_rur Poverty Headcount reduction (%) - Old Age Con...
12905 per_si_oa_p0_tot Poverty Headcount reduction (%) - Old Age Con...
12906 per_si_oa_p0_urb Poverty Headcount reduction (%) - Old Age Con...
12907 per_si_oa_p1_ep_preT_tot Poverty Gap reduction (%) - Old Age Contribut...
12908 per_si_oa_p1_ep_tot Poverty Gap reduction (%) - Old Age Contribut...
12909 per_si_oa_p1_preT_tot Poverty Gap reduction (%) - Old Age Contribut...
12910 per_si_oa_p1_rur Poverty Gap reduction (%) - Old Age Contribut...
12911 per_si_oa_p1_tot Poverty Gap reduction (%) - Old Age Contribut...
12912 per_si_oa_p1_urb Poverty Gap reduction (%) - Old Age Contribut...
13047 per_si_ss_p0_ep_preT_tot Poverty Headcount reduction (%) - Other Socia...
13048 per_si_ss_p0_ep_tot Poverty Headcount reduction (%) - Other Socia...
13049 per_si_ss_p0_preT_tot Poverty Headcount reduction (%) - Other Socia...
13050 per_si_ss_p0_rur Poverty Headcount reduction (%) - Other Socia...
13051 per_si_ss_p0_tot Poverty Headcount reduction (%) - Other Socia...
13052 per_si_ss_p0_urb Poverty Headcount reduction (%) - Other Socia...
13053 per_si_ss_p1_ep_preT_tot Poverty Gap reduction (%) - Other Social Insu...
13054 per_si_ss_p1_ep_tot Poverty Gap reduction (%) - Other Social Insu...
13055 per_si_ss_p1_preT_tot Poverty Gap reduction (%) - Other Social Insu...
13056 per_si_ss_p1_rur Poverty Gap reduction (%) - Other Social Insu...
13057 per_si_ss_p1_tot Poverty Gap reduction (%) - Other Social Insu...
13058 per_si_ss_p1_urb Poverty Gap reduction (%) - Other Social Insu...

231 rows × 2 columns

In [3]:
# Keep only the "SI.POV." poverty series. Selecting on the id prefix replaces the
# previous hard-coded row-label slice (7529:7550): those labels come from the
# search result's index and are not guaranteed to stay the same if the
# indicator catalogue changes.

pov = pov[pov['id'].str.startswith('SI.POV.', na=False)]

# Store the English labels in a separate list

povnames = pov['name'].tolist()

# Keep only the id's under the original pov name

pov = pov['id'].tolist()

# Take a look

povnames
Out[3]:
[u'Poverty headcount ratio at $2.5 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $2 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $4 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $5 a day (PPP) (% of population)',
 u'Poverty headcount ratio at $1.25 a day (PPP) (% of population)',
 u'Poverty gap at $2 a day (PPP) (%)',
 u'Poverty gap at $2.5 a day (PPP) (%)',
 u'Poverty gap at $4 a day (PPP) (%)',
 u'Poverty gap at $5 a day (PPP) (%)',
 u'Poverty gap at $1.25 a day (PPP) (%)',
 u'Poverty gap at national poverty lines (%)',
 u'Poverty headcount ratio at national poverty lines (% of population)',
 u'Rural poverty gap at national poverty lines (%)',
 u'Rural poverty headcount ratio at national poverty lines (% of rural population)',
 u'Urban poverty gap at national poverty lines (%)',
 u'Urban poverty headcount ratio at national poverty lines (% of urban population)']
In [4]:
# Display the poverty indicator ids that were kept
pov
Out[4]:
[u'SI.POV.25DAY',
 u'SI.POV.2DAY',
 u'SI.POV.4DAY',
 u'SI.POV.5DAY',
 u'SI.POV.DDAY',
 u'SI.POV.GAP2',
 u'SI.POV.GAP25',
 u'SI.POV.GAP4',
 u'SI.POV.GAP5',
 u'SI.POV.GAPS',
 u'SI.POV.NAGP',
 u'SI.POV.NAHC',
 u'SI.POV.RUGP',
 u'SI.POV.RUHC',
 u'SI.POV.URGP',
 u'SI.POV.URHC']
In [5]:
# Map each poverty indicator id to its descriptive name
povdict = {indicator_id: label for indicator_id, label in zip(pov, povnames)}
povdict
Out[5]:
{u'SI.POV.25DAY': u'Poverty headcount ratio at $2.5 a day (PPP) (% of population)',
 u'SI.POV.2DAY': u'Poverty headcount ratio at $2 a day (PPP) (% of population)',
 u'SI.POV.4DAY': u'Poverty headcount ratio at $4 a day (PPP) (% of population)',
 u'SI.POV.5DAY': u'Poverty headcount ratio at $5 a day (PPP) (% of population)',
 u'SI.POV.DDAY': u'Poverty headcount ratio at $1.25 a day (PPP) (% of population)',
 u'SI.POV.GAP2': u'Poverty gap at $2 a day (PPP) (%)',
 u'SI.POV.GAP25': u'Poverty gap at $2.5 a day (PPP) (%)',
 u'SI.POV.GAP4': u'Poverty gap at $4 a day (PPP) (%)',
 u'SI.POV.GAP5': u'Poverty gap at $5 a day (PPP) (%)',
 u'SI.POV.GAPS': u'Poverty gap at $1.25 a day (PPP) (%)',
 u'SI.POV.NAGP': u'Poverty gap at national poverty lines (%)',
 u'SI.POV.NAHC': u'Poverty headcount ratio at national poverty lines (% of population)',
 u'SI.POV.RUGP': u'Rural poverty gap at national poverty lines (%)',
 u'SI.POV.RUHC': u'Rural poverty headcount ratio at national poverty lines (% of rural population)',
 u'SI.POV.URGP': u'Urban poverty gap at national poverty lines (%)',
 u'SI.POV.URHC': u'Urban poverty headcount ratio at national poverty lines (% of urban population)'}
In [6]:
# Next, search for the income-share indicators (pattern 'income.*share.*%')
# and keep only the first two columns of the result (id and name).

inc = wb.search('income.*share.*%')
inc = inc.iloc[:, 0:2]
inc
Out[6]:
id name
7522 SI.DST.02ND.20 Income share held by second 20%
7523 SI.DST.03RD.20 Income share held by third 20%
7524 SI.DST.04TH.20 Income share held by fourth 20%
7525 SI.DST.05TH.20 Income share held by highest 20%
7526 SI.DST.10TH.10 Income share held by highest 10%
7527 SI.DST.FRST.10 Income share held by lowest 10%
7528 SI.DST.FRST.20 Income share held by lowest 20%
In [7]:
# Same split as for the poverty indicators: the labels go into incnames,
# and inc is reduced to the list of ids.

incnames = inc['name'].tolist()
inc = inc['id'].tolist()
incnames
Out[7]:
[u'Income share held by second 20%',
 u'Income share held by third 20%',
 u'Income share held by fourth 20%',
 u'Income share held by highest 20%',
 u'Income share held by highest 10%',
 u'Income share held by lowest 10%',
 u'Income share held by lowest 20%']
In [8]:
# Display the income-share indicator ids
inc
Out[8]:
[u'SI.DST.02ND.20',
 u'SI.DST.03RD.20',
 u'SI.DST.04TH.20',
 u'SI.DST.05TH.20',
 u'SI.DST.10TH.10',
 u'SI.DST.FRST.10',
 u'SI.DST.FRST.20']
In [9]:
# Map each income-share indicator id to its descriptive name

incdict = {indicator_id: label for indicator_id, label in zip(inc, incnames)}
incdict
Out[9]:
{u'SI.DST.02ND.20': u'Income share held by second 20%',
 u'SI.DST.03RD.20': u'Income share held by third 20%',
 u'SI.DST.04TH.20': u'Income share held by fourth 20%',
 u'SI.DST.05TH.20': u'Income share held by highest 20%',
 u'SI.DST.10TH.10': u'Income share held by highest 10%',
 u'SI.DST.FRST.10': u'Income share held by lowest 10%',
 u'SI.DST.FRST.20': u'Income share held by lowest 20%'}
In [10]:
# Master list of every indicator to download: poverty ids followed by income ids

idx = list(pov)
idx.extend(inc)
idx
Out[10]:
[u'SI.POV.25DAY',
 u'SI.POV.2DAY',
 u'SI.POV.4DAY',
 u'SI.POV.5DAY',
 u'SI.POV.DDAY',
 u'SI.POV.GAP2',
 u'SI.POV.GAP25',
 u'SI.POV.GAP4',
 u'SI.POV.GAP5',
 u'SI.POV.GAPS',
 u'SI.POV.NAGP',
 u'SI.POV.NAHC',
 u'SI.POV.RUGP',
 u'SI.POV.RUHC',
 u'SI.POV.URGP',
 u'SI.POV.URHC',
 u'SI.DST.02ND.20',
 u'SI.DST.03RD.20',
 u'SI.DST.04TH.20',
 u'SI.DST.05TH.20',
 u'SI.DST.10TH.10',
 u'SI.DST.FRST.10',
 u'SI.DST.FRST.20']
In [11]:
# Fetch every indicator in idx for Cambodia (2004 through 2012) as a DataFrame

khm = wb.download(country='KHM', indicator=idx, start=2004, end=2012)
khm
Out[11]:
SI.POV.25DAY SI.POV.2DAY SI.POV.4DAY SI.POV.5DAY SI.POV.DDAY SI.POV.GAP2 SI.POV.GAP25 SI.POV.GAP4 SI.POV.GAP5 SI.POV.GAPS ... SI.POV.RUHC SI.POV.URGP SI.POV.URHC SI.DST.02ND.20 SI.DST.03RD.20 SI.DST.04TH.20 SI.DST.05TH.20 SI.DST.10TH.10 SI.DST.FRST.10 SI.DST.FRST.20
country year
Cambodia 2012 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 20.8 NaN 6.4 NaN NaN NaN NaN NaN NaN NaN
2011 59.00 41.26 85.70 91.83 10.05 10.25 18.29 39.60 49.51 1.43 ... 23.6 NaN 8.7 12.46 16.11 21.24 41.20 26.91 4.04 8.99
2010 57.59 40.88 84.06 90.63 11.25 10.59 18.37 39.00 48.74 1.70 ... 25.3 NaN 8.5 12.02 15.80 21.18 42.49 28.01 3.80 8.51
2009 56.25 40.74 82.20 89.25 12.93 11.21 18.71 38.50 48.02 2.08 ... 27.5 NaN 8.0 11.66 15.68 21.48 43.15 28.20 3.55 8.03
2008 65.90 51.05 87.33 92.48 20.89 16.27 24.78 45.05 54.08 4.39 ... 38.5 NaN 15.1 11.60 15.67 21.43 43.45 28.57 3.44 7.85
2007 71.05 59.39 87.65 92.07 30.82 21.92 30.64 49.51 57.63 7.24 ... 51.4 NaN 18.3 10.05 13.95 20.16 48.89 33.99 3.12 6.95
2006 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2005 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2004 76.54 64.43 91.55 94.95 32.77 23.64 33.09 52.85 60.96 7.79 ... 54.2 NaN 28.5 11.43 15.37 21.22 44.00 29.09 3.56 7.98

9 rows × 23 columns

In [12]:
# Remove the country level from the index, flip the rows so the years ascend,
# and discard every indicator column that contains only NA's

khm.index = khm.index.droplevel(0)
khm = khm.iloc[::-1].dropna(axis=1, how='all')
qgrid.show_grid(khm, remote_js=True)
In [13]:
# Compare the percent of total income earned by the highest 10% with that
# earned by the lowest 10%

# Keep the first eight rows only: there is no data for 2012, so it is omitted
incframe = khm[['SI.DST.10TH.10', 'SI.DST.FRST.10']].iloc[:8]
# Swap the indicator ids for their descriptive names from incdict
incframe = incframe.rename(columns=incdict)
incframe
Out[13]:
Income share held by highest 10% Income share held by lowest 10%
year
2004 29.09 3.56
2005 NaN NaN
2006 NaN NaN
2007 33.99 3.12
2008 28.57 3.44
2009 28.20 3.55
2010 28.01 3.80
2011 26.91 4.04
In [14]:
# Short legend labels for the plot
incframe.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']

# DataFrame.plot draws on a figure of its own when no ax is passed, so the
# previous plt.figure() call only left an extra, empty figure in the output.
# Label the Axes that plot() returns instead; the trailing ';' hides the repr.
ax = incframe.plot(title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');
Out[14]:
<matplotlib.text.Text at 0x10d85eb10>
<matplotlib.figure.Figure at 0x10d9f4910>
In [15]:
# Income share of each quintile, ordered from lowest to highest
incframe1 = khm[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20'
                 ]]

# Look up each indicator id in incdict to get the descriptive column names.
# Building the list directly avoids assigning into a range() object, which
# only works on Python 2 (range is immutable on Python 3).
newcolumns = [incdict[col_id] for col_id in incframe1.columns]
incframe1.columns = newcolumns

incframe1 = incframe1[0:8] #Omit 2012, no data
incframe1
Out[15]:
Income share held by lowest 20% Income share held by second 20% Income share held by third 20% Income share held by fourth 20% Income share held by highest 20%
year
2004 7.98 11.43 15.37 21.22 44.00
2005 NaN NaN NaN NaN NaN
2006 NaN NaN NaN NaN NaN
2007 6.95 10.05 13.95 20.16 48.89
2008 7.85 11.60 15.67 21.43 43.45
2009 8.03 11.66 15.68 21.48 43.15
2010 8.51 12.02 15.80 21.18 42.49
2011 8.99 12.46 16.11 21.24 41.20
In [16]:
# Change column names to short legend labels
incframe1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']

# DataFrame.plot draws on a figure of its own when no ax is passed, so the
# previous plt.figure() call only left an extra, empty figure in the output.
# Work on the Axes that plot() returns; the trailing ';' hides the repr.
ax = incframe1.plot(title='Income Share in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)')
ax.set_ylim([0, 50]);
Out[16]:
(0, 50)
<matplotlib.figure.Figure at 0x10d9f4850>
In [17]:
# Stacked bar graph of the quintile income shares

# stacked must be the boolean True: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have stacked the bars as well).
# No plt.figure() beforehand: DataFrame.plot draws on a figure of its own, and
# the extra call only left an empty figure in the output.
ax = incframe1.plot(title='Income Share in Cambodia', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');
Out[17]:
<matplotlib.text.Text at 0x10d816c90>
<matplotlib.figure.Figure at 0x10d967250>
In [18]:
# For comparison, download the same income-share indicators for the US

usa = wb.download(country='USA', start=2004, end=2011,
                  indicator=['SI.DST.10TH.10', 'SI.DST.FRST.10', 'SI.DST.FRST.20', 'SI.DST.02ND.20',
                             'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20'])

# Drop the country level of the index and reverse the rows so the years ascend
usa.index = usa.index.droplevel(0)
usa = usa.iloc[::-1]
qgrid.show_grid(usa, remote_js=True)
In [19]:
# There isn't nearly as much data for the US during this time period, but it's still worth looking at.

usainc = usa[['SI.DST.10TH.10', 'SI.DST.FRST.10']]

# Short legend labels for the plot
usainc.columns = ['Highest 10% of Income Earners', 'Lowest 10% of Income Earners']

# DataFrame.plot draws on a figure of its own when no ax is passed, so the
# previous plt.figure() call only left an extra, empty figure in the output.
# Label the Axes that plot() returns instead; the trailing ';' hides the repr.
ax = usainc.plot(title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');
Out[19]:
<matplotlib.text.Text at 0x10aed8710>
<matplotlib.figure.Figure at 0x10d9dd210>
In [20]:
# Income share of each quintile in the US, ordered from lowest to highest
usainc1 = usa[['SI.DST.FRST.20', 'SI.DST.02ND.20', 'SI.DST.03RD.20', 'SI.DST.04TH.20', 'SI.DST.05TH.20']]

# Short legend labels for the plot
usainc1.columns = ['Lowest 20%', 'Second Lowest 20%', 'Middle 20%', 'Second Highest 20%', 'Highest 20%']

# DataFrame.plot draws on a figure of its own when no ax is passed, so the
# previous plt.figure() call only left an extra, empty figure in the output.
# Label the Axes that plot() returns instead; the trailing ';' hides the repr.
ax = usainc1.plot(title='Income Share in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');
Out[20]:
<matplotlib.text.Text at 0x10d876950>
<matplotlib.figure.Figure at 0x10d9ff550>
In [21]:
# Stacked bar graph of the US quintile income shares.
# stacked must be the boolean True: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have stacked the bars as well).
# No plt.figure() beforehand: DataFrame.plot draws on a figure of its own, and
# the extra call only left an empty figure in the output.
ax = usainc1.plot(title='Income Share in the USA', kind='bar', stacked=True)
ax.set_xlabel('Year')
ax.set_ylabel('Share of National Income (%)');
Out[21]:
<matplotlib.text.Text at 0x10d69ed50>
<matplotlib.figure.Figure at 0x10d926610>
In [27]:
# So the US isn't much better (perhaps worse). Gross National Income per capita
# for each country should give a better look at the differences; start with Cambodia.

khmgni = wb.download(country='KHM', indicator=['NY.GNP.PCAP.CD'], start=2004, end=2011)

# Drop the country level of the index and reverse the rows so the years ascend
khmgni.index = khmgni.index.droplevel(0)
khmgni = khmgni.iloc[::-1]

qgrid.show_grid(khmgni, remote_js=True)
In [28]:
# Plot Cambodia's GNI per capita over time.
# DataFrame.plot draws on a figure of its own when no ax is passed, so the
# previous plt.figure() call only left an extra, empty figure in the output.
# Label the Axes that plot() returns instead; the trailing ';' hides the repr.
ax = khmgni.plot(title='Gross National Income per Capita in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)');
Out[28]:
<matplotlib.text.Text at 0x10a84bfd0>
<matplotlib.figure.Figure at 0x10d067850>
In [29]:
# Gross National Income per capita for the US over the same years
usagni = wb.download(country='USA', indicator=['NY.GNP.PCAP.CD'], start=2004, end=2011)

# Drop the country level of the index and reverse the rows so the years ascend
usagni.index = usagni.index.droplevel(0)
usagni = usagni.iloc[::-1]

qgrid.show_grid(usagni, remote_js=True)
In [30]:
# Plot the US GNI per capita over time.
# DataFrame.plot draws on a figure of its own when no ax is passed, so the
# previous plt.figure() call only left an extra, empty figure in the output.
# Label the Axes that plot() returns instead; the trailing ';' hides the repr.
ax = usagni.plot(title='Gross National Income per Capita in the USA', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('Income ($)');
Out[30]:
<matplotlib.text.Text at 0x109ca6810>
<matplotlib.figure.Figure at 0x109d0b310>
In [24]:
# Explore how close people are to the poverty boundary and how it is changing over time

povline = khm[['SI.POV.25DAY', 'SI.POV.2DAY', 'SI.POV.4DAY', 'SI.POV.5DAY', 'SI.POV.DDAY', 'SI.POV.NAHC']]

# Look up each indicator id in povdict to get the descriptive column names.
# Building the list directly avoids assigning into a range() object, which
# only works on Python 2 (range is immutable on Python 3).
newcolumns = [povdict[col_id] for col_id in povline.columns]
povline.columns = newcolumns

povline
Out[24]:
Poverty headcount ratio at $2.5 a day (PPP) (% of population) Poverty headcount ratio at $2 a day (PPP) (% of population) Poverty headcount ratio at $4 a day (PPP) (% of population) Poverty headcount ratio at $5 a day (PPP) (% of population) Poverty headcount ratio at $1.25 a day (PPP) (% of population) Poverty headcount ratio at national poverty lines (% of population)
year
2004 76.54 64.43 91.55 94.95 32.77 50.2
2005 NaN NaN NaN NaN NaN NaN
2006 NaN NaN NaN NaN NaN NaN
2007 71.05 59.39 87.65 92.07 30.82 45.0
2008 65.90 51.05 87.33 92.48 20.89 34.0
2009 56.25 40.74 82.20 89.25 12.93 23.9
2010 57.59 40.88 84.06 90.63 11.25 22.1
2011 59.00 41.26 85.70 91.83 10.05 20.5
2012 NaN NaN NaN NaN NaN 17.7
In [31]:
# Change titles of columns to short legend labels, then plot
povline.columns = ['<$2.50 a day', '<$2 a day', '<$4 a day', '<$5 a day', '<$1.25 a day', '<National poverty lines']

# DataFrame.plot draws on a figure of its own when no ax is passed, so the
# previous plt.figure() call only left an extra, empty figure in the output.
# Label the Axes that plot() returns instead; the trailing ';' hides the repr.
ax = povline.plot(title='Poverty Headcount Ratio at Different Incomes in Cambodia', style='o-')
ax.set_xlabel('Year')
ax.set_ylabel('% of population');
Out[31]:
<matplotlib.text.Text at 0x10dfbee10>
<matplotlib.figure.Figure at 0x109cc6c90>
In [26]:
# The World Bank does not have most of this data for the USA, so I will find US Census data later.