from pandas import DataFrame, Series
import pandas as pd
import os
import codecs
import matplotlib.pyplot as plt  # used by the plotting cells below

# Verify the existence of, and read in, the two datasets:
# World Bank projects & operations, and the Freedom Index.
DATA_FILES = {"projdict": "data/projects_operations_api.csv",
              "fredict": "data/FreedomIndex.csv"}

def file_path(key):
    return os.path.join(os.pardir, DATA_FILES[key])

for file_key in DATA_FILES.keys():
    abs_fname = file_path(file_key)
    print abs_fname, os.path.exists(abs_fname)

f = codecs.open(file_path("projdict"), encoding='iso-8859-1')
initial_proj_df = pd.read_csv(f)
initial_proj_df.columns

is_africa = initial_proj_df['regionname'] == 'AFRICA'
initial_proj_df[is_africa]['countryname'][:5]
initial_proj_df[is_africa][['countryname', 'totalamt']][:5]

# The totalamt value is not properly formatted. Clean it up by stripping out
# the semicolon separators, then cast to float.
initial_proj_df['totalamt'] = initial_proj_df['totalamt'].str.replace(';', '')
initial_proj_df[is_africa]['totalamt'][:5]
initial_proj_df['totalamt'] = initial_proj_df['totalamt'].astype('float32')
sum(initial_proj_df[is_africa]['totalamt'][:5])
initial_proj_df[['regionname', 'countryname', 'projectstatusdisplay', 'totalamt']][:2]

# Data cleaning: remove the semicolons from the remaining money columns.
initial_proj_df['lendprojectcost'] = initial_proj_df['lendprojectcost'].str.replace(';', '')
initial_proj_df['lendprojectcost'] = initial_proj_df['lendprojectcost'].astype('float32')
initial_proj_df['ibrdcommamt'] = initial_proj_df['ibrdcommamt'].str.replace(';', '')
initial_proj_df['ibrdcommamt'] = initial_proj_df['ibrdcommamt'].astype('float32')
initial_proj_df['idacommamt'] = initial_proj_df['idacommamt'].str.replace(';', '')
initial_proj_df['idacommamt'] = initial_proj_df['idacommamt'].astype('float32')
initial_proj_df['grantamt'] = initial_proj_df['grantamt'].str.replace(';', '')
initial_proj_df['grantamt'] = initial_proj_df['grantamt'].astype('float32')
initial_proj_df[is_africa][['countryname', 'project_name', 'boardapprovaldate', 'status', 'lendprojectcost', 'grantamt']][:10]

# Work on a copy and drop the columns we will not analyze.
projcp_df = initial_proj_df.copy()
projcp_df = projcp_df.drop(['lendinginstrtype', 'envassesmentcategorycode', 'productlinetype',
                            'closingdate', 'url', 'sector2', 'sector3', 'sector4', 'sector5',
                            'sector', 'mjsector1', 'mjsector2', 'mjsector3', 'mjsector4',
                            'mjsector5', 'mjsector', 'theme1', 'theme2', 'theme3', 'theme4',
                            'theme5', 'financier', 'mjtheme2name', 'mjtheme3name',
                            'mjtheme4name', 'mjtheme5name'], axis=1)
del projcp_df['projectstatusdisplay']
projcp_df2 = projcp_df.drop(['prodline', 'supplementprojectflg', 'goal', 'mjtheme1name', 'location'], axis=1)
projcp_df2.columns
projcp_df2[is_africa][:5]

grouped = projcp_df2.groupby('regionname')

# Function to calculate the total amount awarded by the World Bank
# per country or regional operating body.
def func(x):
    totalamt = x['totalamt'].sum()
    return Series([totalamt], index=['totalamt'])

# Result dataframe.
result = grouped.apply(func)

# Create a new column to hold the year taken from the board approval date.
projcp_df2['year'] = projcp_df2['boardapprovaldate'].str[:4]
projcp_df2['year'][:2]

# Group the data by region name and year.
grouped3 = projcp_df2.groupby(['regionname', 'year'])
# Statistics on the Bank's lending commitments to different regions over time.
grouped3['totalamt'].describe()

grouped4 = projcp_df2.groupby(['regionname', 'year', 'board_approval_month'])
result4 = grouped4.apply(func)
result4.unstack('regionname')[:5]

result5 = grouped3.apply(func).unstack('regionname').fillna(0)
result5[:5]
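# The five strip-and-cast blocks above all follow the same pattern; a loop over
# the column names keeps the cleaning in one place. A minimal sketch, meant to
# run instead of (not after) those blocks, since .str only works while the
# columns are still strings:
money_cols = ['totalamt', 'lendprojectcost', 'ibrdcommamt', 'idacommamt', 'grantamt']
for money_col in money_cols:
    initial_proj_df[money_col] = initial_proj_df[money_col].str.replace(';', '').astype('float32')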
# python-us-cpi is a tool for parsing the latest US Consumer Price Index, and
# it also provides an inflation-calculator API. We use this API to convert
# loan commitments from other years into today's dollars for a better comparison.
from uscpi import UsCpi
cpi = UsCpi()  # downloads the latest CPI data
# $100 in 2012 is worth how much in 1980?
cpi.value_with_inflation(100, 2012, 1980)

projcpi = projcp_df2[['regionname', 'countryname', 'project_name', 'totalamt',
                      'grantamt', 'sector1', 'year']].copy()

# Convert monetary values from any year between 1914 and 2013 into
# 2013 dollars using the CPI API.
def fun2(y):
    totalamts = y['totalamt']
    year = int(y['year'])
    if 1914 <= year <= 2013:
        totalamts = cpi.value_with_inflation(totalamts, year, 2013)
    return Series([y['regionname'], y['countryname'], y['project_name'], totalamts,
                   y['grantamt'], y['sector1'], year],
                  index=['regionname', 'countryname', 'project_name', 'totalamt',
                         'grantamt', 'sector1', 'year'])

resultcpi = projcpi.fillna(0).apply(fun2, axis=1)

# Data cleaning: strip the unwanted suffixes from the sector1 and countryname strings.
resultcpi['sectorMain'] = resultcpi['sector1'].str.split("!").str[0]
resultcpi['country'] = resultcpi['countryname'].str.split(";").str[0]
# Validate that the data is cleaned.
resultcpi[:4]
resultcpi['year'] = resultcpi['year'].astype(int)

# The dataset is big, which makes it difficult to analyze. Construct a boolean
# mask to extract only those projects funded from 2000 to 2013.
is_bv = (resultcpi['year'] >= 2000) & (resultcpi['year'] <= 2013)
resultcpi2 = resultcpi[is_bv]
# Verify that the data is formatted the way we want to analyze it.
resultcpi2[:4]

# Total Bank commitments to Africa from 2000 to 2013, with all pre-2013 loans
# converted to 2013 dollars via the CPI function.
ggroup_africa = resultcpi2[resultcpi2['regionname'] == 'AFRICA'].groupby('year').apply(func)
ggroup_africa.plot(kind='bar', title='Bank lending commitments to Africa, 2000 - 2013');
plt.tight_layout()
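# Row-wise apply is slow on a frame this size. Since the adjustment factor
# depends only on the year, one factor per year can be precomputed and
# multiplied in; a sketch, assuming cpi.value_with_inflation(1.0, y, 2013)
# returns the scale factor for year y (projcpi_fast is a hypothetical name):
factors = dict((y, cpi.value_with_inflation(1.0, y, 2013)) for y in range(1914, 2014))
projcpi_fast = projcpi.fillna(0).copy()
projcpi_fast['totalamt'] = [amt * factors.get(int(yr), 1.0)
                            for amt, yr in zip(projcpi_fast['totalamt'], projcpi_fast['year'])]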
# Total Bank commitments per region from 2000 to 2013, again in 2013 dollars.
amtByRegion = resultcpi2.groupby(['regionname', 'year']).apply(func).unstack('regionname')
amtByRegion[:2]
amtByRegion.plot(kind='bar', figsize=(16, 8),
                 title='Lending commitments by the Bank, 2000 - 2013');
plt.legend(loc='best')

# Count the number of World Bank projects per country from 2000 to 2013.
numOfproj_by_country = resultcpi2.groupby('country').size().order(na_last=True, ascending=False, kind='mergesort')
numOfproj_by_country[:5]

# From the counts above, I observed that the most-funded nations are the BRICS,
# so the list below filters them out for further observation and analysis.
listBRICS = ['Federative Republic of Brazil', 'Russian Federation', 'Republic of India',
             'People\'s Republic of China', 'Republic of South Africa']
brics_nations = resultcpi2[resultcpi2['country'].isin(listBRICS)].groupby(['country', 'year']).size()

# The graph shows the number of projects funded by the World Bank
# per country per year from 2000 to 2013.
brics_nations.unstack('country').fillna(0).plot(subplots=True, figsize=(8, 8), kind='bar');
plt.legend(loc='best'); plt.tight_layout()

# Project counts per BRICS country, broken down by main sector
# (sectorMain, since the raw sector columns were dropped earlier).
df_of_BRICS = resultcpi2[resultcpi2['country'].isin(listBRICS)].groupby(['country', 'sectorMain']).size().order(na_last=True, ascending=False, kind='mergesort')
df_of_BRICS.unstack('country').fillna(0)

# Import the Freedom Index CSV for the comparison analysis.
f = codecs.open(file_path("fredict"), encoding='iso-8859-1')
free_df = pd.read_csv(f)
free_df[:2]

# As with the projects and operations dataset, restrict the analysis to data from 2000 onward.
free_df2 = free_df[free_df['index year'] >= 2000].copy()
free_df2.columns

# Extract the BRICS to observe them further.
free_df2 = free_df2[free_df2['name'].isin(['China', 'India', 'Russia', 'Brazil', 'South Africa'])]
free_df2[:5]
free_df3 = free_df2[['name', 'index year', 'overall score']].copy()
free_df3[:2]
free_df3['overall score'] = free_df3['overall score'].astype(float)
free_df3.pivot_table(['overall score'], rows=['index year'], cols='name').plot(
    kind='line', title='Freedom Index per BRICS country', figsize=(10, 10))
free_df3.pivot_table(['overall score'], rows=['index year'], cols='name').plot(
    subplots=True, figsize=(8, 8));
plt.legend(loc='best'); plt.tight_layout(); plt.ylabel('Freedom Index');

Findings:
1. In 2010, World Bank funding increased (compared to the previous year) in all the BRICS except Russia, and GDP in these countries also dropped. Is there a relationship? The financial crisis?

Challenges:
1. The World Bank data has many facets that can be useful for analysis, but not all variables are properly documented.
2. While analyzing the data, I noticed that after 1970 the World Bank changed the format of its reporting, which made munging the data really difficult.
f = codecs.open(file_path("fredict"), encoding='iso-8859-1')
free_df = pd.read_csv(f)

# Because I'm looking at the contribution of funds over a period of time, I
# want to look at the current (2013) Freedom Index for these countries to
# analyze their current state.
free_df2 = free_df[free_df['index year'] == 2013].copy()
# For simplicity, ignore countries that have not been scored.
free_df2 = free_df2[free_df2['overall score'] != 'N/A'].copy()
free_df2 = free_df2[free_df2['freedom from corruption'] != 'N/A'].copy()
free_df2.columns

# Cast the score columns to float so the sorts below are numeric rather than
# lexicographic (assuming 'N/A' was the only non-numeric sentinel).
free_df2['overall score'] = free_df2['overall score'].astype(float)
free_df2['freedom from corruption'] = free_df2['freedom from corruption'].astype(float)

low_freedom = free_df2.sort(['overall score'], ascending=True)
low_freedom = low_freedom[:10]
low_freedom[['name', 'overall score']]

high_freedom = free_df2.sort(['overall score'], ascending=False)
high_freedom = high_freedom[:10]
high_freedom[['name', 'overall score']]

# High corruption (lowest "freedom from corruption" scores).
high_corruption = free_df2.sort(['freedom from corruption'], ascending=True)
high_corruption = high_corruption[:10]
high_corruption[['name', 'freedom from corruption']]

# Low corruption.
low_corruption = free_df2.sort(['freedom from corruption'], ascending=False)
low_corruption = low_corruption[:10]
low_corruption[['name', 'freedom from corruption']]

numOfproj_by_country[:10]

# Recall resultcpi, the projects funded converted to 2013 dollars; sort by country.
country_cpi = resultcpi.sort(columns='country', ascending=True)
country_cpi[['country', 'totalamt', 'grantamt', 'year']][:2]

# There are a few problems with this data. First, continents and other regional
# aggregates appear in countryname (e.g. 'Africa', 'Central America',
# 'Latin America', 'Europe', 'East Asia and Pacific', 'Europe and Central Asia',
# 'World', 'Asia', 'Middle East and North Africa',
# 'South Eastern Europe and Balkans'); these could be filtered out explicitly.
country_cpi = country_cpi.dropna()

# Because of the differing naming conventions of the Freedom Index and the
# World Bank, we have to manually input the countries we are looking for in
# order to compare their freedom index with their World Bank funding.
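# A lookup table is a lighter-weight alternative to retyping full official
# names; the entries below are a hypothetical sketch covering a few of the
# countries used later ('wb_name' is an invented column name):
FREEDOM_TO_WB = {
    'Zimbabwe': 'Republic of Zimbabwe',
    'Haiti': 'Republic of Haiti',
    'Norway': 'Kingdom of Norway',
    'Denmark': 'Kingdom of Denmark',
}
free_df2['wb_name'] = free_df2['name'].map(FREEDOM_TO_WB)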
low_Freedom_list = ['Belize', 'Turkmenistan', 'Republic of Zimbabwe', 'Republic of Uzbekistan',
                    'Republic of Haiti', 'Republic of Burundi', 'Republic of Equatorial Guinea',
                    'People\'s Republic of Angola', 'Republica Bolivariana de Venezuela']
high_Freedom_list = ['Kingdom of Norway', 'New Zealand', 'Kingdom of Denmark', 'Republic of Finland',
                     'Republic of Sweden', 'Kingdom of The Netherlands', 'Commonwealth of Australia']

# Now we can see how many projects were committed to each country in the low-freedom list.
low_Freedom_nations = country_cpi[country_cpi['country'].isin(low_Freedom_list)].groupby(['country']).size()
low_Freedom_nations
low_Freedom_nations.plot(kind='bar', title='Number of projects in countries with a low Freedom Index');
plt.tight_layout()

import pandas as pd
import wikipydia as wk
import mwparserfromhell
from wikitools import wiki
from wikitools import api
from wikitools import category
from wikitools import page
import itertools
import re

wikisite = "http://en.wikipedia.org/w/api.php"
wikiObject = wiki.Wiki(wikisite)

projectsAPI = pd.read_csv('../data/projects_operations_api.csv')
wikipediadf = pd.read_csv('../data/matchcountries.csv')

# Some cleaning on the datasets.
wikipediadf.index = wikipediadf['countryname']
projectsAPI['countryname'] = [str(country).split(";")[0] for country in projectsAPI['countryname']]

projects = pd.merge(projectsAPI, wikipediadf, on='countryname', how='left')
# Drop rows whose countryname is NaN (a float) rather than a string.
projects = projects[projects['countryname'].map(type) != type(0.0)]
projectsAPI = projectsAPI[projectsAPI['countryname'].map(type) != type(0.0)]

projects['totalamt'] = projects['totalamt'].str.replace(';', '')
projects['totalamt'] = projects['totalamt'].astype('float32')
print projects.columns

projects['year'] = [str(x)[0:4] for x in projects['boardapprovaldate']]
# Backfill missing approval years from the closing date (assigning into the
# 'year' column, not the whole row).
projects['year'][projects.year == 'nan'] = [str(x)[0:4] for x in projects['closingdate'][projects.year == 'nan']]
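# A quick sanity check on the left merge: countries without a match in
# matchcountries.csv come through with NaN in the Wikipedia columns, so
# listing a few makes it easy to extend the mapping file.
unmatched = projects[projects['wikiname'].isnull()]['countryname'].unique()
print unmatched[:10]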
import matplotlib.pyplot as plt
import matplotlib.colors as col
import numpy as np

def color_variant(hex_color, brightness_offset=1):
    # Lighten or darken a '#rrggbb' colour by brightness_offset per channel.
    if len(hex_color) != 7:
        raise Exception("Passed %s into color_variant(), needs to be in #87c95f format." % hex_color)
    rgb_hex = [hex_color[x:x + 2] for x in [1, 3, 5]]
    new_rgb_int = [int(hex_value, 16) + brightness_offset for hex_value in rgb_hex]
    # Make sure the new values are between 0 and 255.
    new_rgb_int = [min([255, max([0, i])]) for i in new_rgb_int]
    # Format each channel back as two hex digits.
    return "#" + "".join(["%02x" % i for i in new_rgb_int])

def drawBarCharReference(Color, targetlist, field, title, labels):
    # Bar chart of `field`, darkening the bar colour every decile and recording
    # the colour per row so the map drawing below can reuse it.
    fig = plt.figure(num=None, figsize=(24, 8), dpi=700, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ColorBase = Color
    changeRange = 0.10
    i = 0
    for x in targetlist.sort(columns=field, ascending=True).index:
        if i / float(len(targetlist.index)) > changeRange:
            ColorBase = color_variant(ColorBase, 20)
            changeRange = changeRange + 0.10
        targetlist['color'][x] = ColorBase
        ax.bar(i, float(targetlist[field][x]), 1, color=col.colorConverter.to_rgb(ColorBase))
        i += 1
    ax.set_xticklabels([x[1] for x in targetlist.sort(columns=field, ascending=True).index])
    plt.xticks(np.arange(0.5, i + 1, 1))
    plt.setp(ax.get_xticklabels(), fontsize=9, rotation='vertical')
    plt.setp(ax.get_yticklabels(), fontsize=10)
    plt.title(title)
    plt.xlabel(labels[0], fontsize=18)
    plt.ylabel(labels[1], fontsize=18)
    plt.show()
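# A quick illustration of color_variant, which the bar charts use to step
# their palette every decile:
base = '#87c95f'
print color_variant(base, 20)   # slightly lighter green
print color_variant(base, -40)  # darker variant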
# BaseMap example adapted from geophysique.be, tutorial 10:
# http://www.geophysique.be/2013/02/12/matplotlib-basemap-tutorial-10-shapefiles-unleached-continued/
import os
import inspect
import numpy as np
import matplotlib.pyplot as plt
from itertools import izip
from mpl_toolkits.basemap import Basemap

def zip_filter_by_state(records, shapes, included_states=None):
    # included_states is a list of country FIPS codes; yield only the
    # (record, shape) pairs whose code appears in it.
    for (record, state) in izip(records, shapes):
        if record[1] in included_states:
            yield (record, state)

def draw_global_map(colors, indexlist, titles):
    ### PARAMETERS FOR MATPLOTLIB:
    import matplotlib as mpl
    mpl.rcParams['font.size'] = 14.
    mpl.rcParams['font.family'] = 'Serif'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 40.
    mpl.rcParams['ytick.labelsize'] = 20.

    fig = plt.figure(figsize=(11.7, 8.3))
    # Custom adjustment of the subplots.
    plt.subplots_adjust(left=0.05, right=0.95, top=0.90, bottom=0.05, wspace=0.15, hspace=0.05)
    ax = plt.subplot(111)

    # Create a world basemap in the Mercator projection.
    x1 = -179.
    x2 = 179.
    y1 = -60.
    y2 = 80.
    i = 0
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=y1, urcrnrlat=y2, llcrnrlon=x1, urcrnrlon=x2, lat_ts=(y1 + y2) / 2)
    m.drawcountries(linewidth=0.5)
    m.drawcoastlines(linewidth=0.5)
    m.drawparallels(np.arange(y1, y2, 20.), labels=[1, 0, 0, 0], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)  # draw parallels
    m.drawmeridians(np.arange(x1, x2, 20.), labels=[0, 0, 0, 1], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)  # draw meridians

    from matplotlib.collections import LineCollection
    import shapefile

    shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    shapes = shpf.shapes()
    records = shpf.records()

    # First pass: draw every country in the neutral base colours.
    for record, shape in zip(records, shapes):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        lines.set_facecolors(colors[0])
        lines.set_edgecolors(colors[1])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    # Second pass: recolour the countries present in indexlist with the colours
    # assigned by drawBarCharReference.
    for record, shape in zip_filter_by_state(records, shapes, [x[1] for x in indexlist.index]):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        i = i + 1
        x_color = None
        for w in indexlist.index:
            if record[1] in w[1]:
                x_color = w
                break
        lines.set_facecolors(indexlist['color'][x_color])
        lines.set_edgecolors(indexlist['color'][x_color])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    plt.title(titles[0])
    plt.savefig('tutorial10.png', dpi=300)
    plt.show()

# Accumulate total lending commitments per country since 2001, then draw the
# bar chart (which assigns the colours) and the world map.
heatmapfounding = pd.DataFrame(projects[projects.type == 'Country'],
                               columns=['wikiname', 'mapname', 'totalamt', 'year'])
heatmapfounding = pd.DataFrame(heatmapfounding[heatmapfounding.year >= '2001'],
                               columns=['wikiname', 'mapname', 'totalamt', 'year'])
heatmapfounding = heatmapfounding.groupby(['wikiname', 'mapname']).sum()
# Placeholder colour; overwritten per row by drawBarCharReference.
heatmapfounding['color'] = pd.Series(["hola" for x in heatmapfounding.index], index=heatmapfounding.index)
drawBarCharReference('#C73F2A', heatmapfounding, "totalamt",
                     "Total World Bank Lending Commitments Accumulated 2001-2013", ['Country', 'US$'])
draw_global_map(['#ffffff', '#000000'], heatmapfounding,
                ['Total World Bank Lending Commitments Accumulated 2001-2013'])
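# A peek at the aggregated frame shows which countries dominate the
# accumulated commitments:
print heatmapfounding.sort(columns='totalamt', ascending=False)['totalamt'][:5]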
def cleanFloatnumber(x):
    # Pull the first float out of a messy Wikipedia infobox value.
    if type(x) is float:
        return float(x)
    elif type(x) is str:
        if len(x) == 0:
            return None
        x = re.sub(r'<.*?>', '', x)  # strip HTML-ish tags such as <ref> markers
        x = x.strip()
        delimiterRegex = re.compile(r'[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?')
        Numbers = re.findall(delimiterRegex, x)
        if len(Numbers) > 0:
            return float(Numbers[0])
        else:
            return None
    else:
        return None

def cleanIntNumber(x):
    # Pull the first integer out of a messy infobox value, dropping thousands separators.
    if type(x) is float:
        return float(x)
    elif type(x) is str:
        if len(x) == 0:
            return None
        x = re.sub(r'<.*?>', '', x)  # strip HTML-ish tags
        x = re.sub(',', '', x)
        x = x.strip()
        delimiterRegex = re.compile(r'[0-9]+')
        Numbers = re.findall(delimiterRegex, x)
        if len(Numbers) > 0:
            return float(Numbers[0])
        else:
            return None
    else:
        return None

def get_infobox_from_wikipedia(countryname):
    # Fetch a country's Wikipedia page and pull a handful of indicators out of
    # its "Infobox country" template.
    country_found = False
    hdi = None
    gini = None
    GDP = None
    GDP_nominal_per_capita = None
    population = None
    if str(countryname).strip() == "" or countryname is None or str(countryname).strip() == 'nan':
        return hdi, gini, GDP, GDP_nominal_per_capita, population
    try:
        wikipage = page.Page(wikiObject, title=countryname)
    except Exception as inst:
        print "No results from Wikipedia: " + str(countryname)
        return hdi, gini, GDP, GDP_nominal_per_capita, population
    wikiraw = wikipage.getWikiText()
    wikiraw = wikiraw.decode('UTF-8')
    parsedWikiText = mwparserfromhell.parse(wikiraw)
    for x in parsedWikiText.nodes:
        if "template" in str(type(x)) and "Infobox country" in str(x.name):
            country_found = True
            if x.has_param('population_census'):
                population = cleanIntNumber(str(x.get('population_census').value))
            if population is None:
                if x.has_param('population_estimate'):
                    population = cleanIntNumber(str(x.get('population_estimate').value))
            if x.has_param('HDI'):
                hdi = cleanFloatnumber(str(x.get('HDI').value))
            if x.has_param('Gini'):
                gini = cleanFloatnumber(str(x.get('Gini').value))
            if x.has_param('GDP'):
                GDP = x.get('GDP').value
            if x.has_param('GDP_nominal_per_capita'):
                GDP_nominal_per_capita = str(x.get('GDP_nominal_per_capita').value)
            break
    if not country_found:
        print "No Infobox: " + str(countryname)
    return hdi, gini, GDP, GDP_nominal_per_capita, population

wikipediadf["HDI"], wikipediadf["gini"], wikipediadf['GDP'], wikipediadf['GDP_nominal_per_capita'], wikipediadf['population'] = \
    zip(*wikipediadf['wikiname'].map(get_infobox_from_wikipedia))

# It was not possible to process some of this data from Wikipedia, so I decided
# to filter it (Ignacio). Note: the break stops each pass after the first
# offending row is dropped.
for i in wikipediadf[wikipediadf.type == 'Country'].index:
    typeFound = type(wikipediadf['population'][i])
    if typeFound is not float:
        print "deleted", i
        wikipediadf = wikipediadf.drop([i])
        break
for i in wikipediadf[wikipediadf.type == 'Country'].index:
    typeFound = type(wikipediadf['GDP_nominal_per_capita'][i])
    if typeFound is not float:
        print "deleted", i
        wikipediadf = wikipediadf.drop([i])
        break

# Re-merge so the Wikipedia indicators join the projects table, repeating the
# earlier cleaning steps on the fresh merge.
projects = pd.merge(projectsAPI, wikipediadf, on='countryname', how='left')
projects = projects[projects['countryname'].map(type) != type(0.0)]
projectsAPI = projectsAPI[projectsAPI['countryname'].map(type) != type(0.0)]
projects['totalamt'] = projects['totalamt'].str.replace(';', '')
projects['totalamt'] = projects['totalamt'].astype('float32')
print projects.columns
projects['year'] = [str(x)[0:4] for x in projects['boardapprovaldate']]
projects['year'][projects.year == 'nan'] = [str(x)[0:4] for x in projects['closingdate'][projects.year == 'nan']]
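# Spot-check of the number cleaners on typical infobox fragments
# (illustrative inputs):
print cleanFloatnumber('0.745 (2013)')        # -> 0.745
print cleanIntNumber('1,210,193,422 (2011)')  # -> 1210193422.0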
def drawBarCharReference2(Color, targetlist, field, title, labels):
    # Same decile-shaded bar chart as drawBarCharReference, but the tick labels
    # come from the 'mapname' column instead of the index.
    fig = plt.figure(num=None, figsize=(24, 8), dpi=700, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ColorBase = Color
    changeRange = 0.10
    i = 0
    for x in targetlist.sort(columns=field, ascending=True).index:
        if i / float(len(targetlist.index)) > changeRange:
            ColorBase = color_variant(ColorBase, 20)
            changeRange = changeRange + 0.10
        targetlist['color'][x] = ColorBase
        ax.bar(i, float(targetlist[field][x]), 1, color=col.colorConverter.to_rgb(ColorBase))
        i += 1
    ax.set_xticklabels([targetlist['mapname'][x] for x in targetlist.sort(columns=field, ascending=True).index])
    plt.xticks(np.arange(0.5, i + 1, 1))
    plt.setp(ax.get_xticklabels(), fontsize=9, rotation='vertical')
    plt.setp(ax.get_yticklabels(), fontsize=10)
    plt.title(title)
    plt.xlabel(labels[0], fontsize=18)
    plt.ylabel(labels[1], fontsize=18)
    plt.show()

def zip_filter_by_state2(records, shapes, included_states=None):
    # included_states is a list of country FIPS codes; yield only matching pairs.
    for (record, state) in izip(records, shapes):
        if record[1] in included_states:
            yield (record, state)

def draw_global_map2(colors, indexlist, titles):
    # Same world map as draw_global_map, but countries are matched through the
    # 'mapname' column rather than the frame's index.
    ### PARAMETERS FOR MATPLOTLIB:
    import matplotlib as mpl
    mpl.rcParams['font.size'] = 14.
    mpl.rcParams['font.family'] = 'Serif'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 40.
    mpl.rcParams['ytick.labelsize'] = 20.

    fig = plt.figure(figsize=(11.7, 8.3))
    plt.subplots_adjust(left=0.05, right=0.95, top=0.90, bottom=0.05, wspace=0.15, hspace=0.05)
    ax = plt.subplot(111)

    # Create a world basemap in the Mercator projection.
    x1 = -179.
    x2 = 179.
    y1 = -60.
    y2 = 80.
    i = 0
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=y1, urcrnrlat=y2, llcrnrlon=x1, urcrnrlon=x2, lat_ts=(y1 + y2) / 2)
    m.drawcountries(linewidth=0.5)
    m.drawcoastlines(linewidth=0.5)
    m.drawparallels(np.arange(y1, y2, 20.), labels=[1, 0, 0, 0], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)
    m.drawmeridians(np.arange(x1, x2, 20.), labels=[0, 0, 0, 1], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)

    from matplotlib.collections import LineCollection
    import shapefile

    shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    shapes = shpf.shapes()
    records = shpf.records()

    # First pass: draw every country in the neutral base colours.
    for record, shape in zip(records, shapes):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        lines.set_facecolors(colors[0])
        lines.set_edgecolors(colors[1])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    # Second pass: recolour the countries present in indexlist.
    for record, shape in zip_filter_by_state2(records, shapes,
                                              [indexlist['mapname'][x] for x in indexlist.index]):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        i = i + 1
        x_color = None
        for (w, x) in [(indexlist['mapname'][x], x) for x in indexlist.index]:
            if type(w) is str and record[1] in w:
                x_color = x
                break
        lines.set_facecolors(indexlist['color'][x_color])
        lines.set_edgecolors(indexlist['color'][x_color])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    plt.title(titles[0])
    plt.savefig('tutorial10.png', dpi=300)
    plt.show()
# Columns added from Wikipedia: HDI, gini, GDP, GDP_nominal_per_capita, population.

heatmapHDI = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname', 'mapname', 'HDI'])
heatmapHDI = heatmapHDI.fillna(0)
heatmapHDI = heatmapHDI.drop_duplicates()
# Placeholder colour; overwritten per row by drawBarCharReference2.
heatmapHDI['color'] = pd.Series(["hola" for x in heatmapHDI.index], index=heatmapHDI.index)
drawBarCharReference2('#425910', heatmapHDI, 'HDI', "Human Development Index (Wikipedia)", ['Country', 'HDI Index'])
draw_global_map2(['#ffffff', '#000000'], heatmapHDI, ['Human Development Index Map (Wikipedia)'])

The Gini index is an example of data that is not fully populated on Wikipedia.

heatmapGini = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname', 'mapname', 'gini'])
heatmapGini = heatmapGini.fillna(0)
heatmapGini = heatmapGini.drop_duplicates()
heatmapGini['color'] = pd.Series(["hola" for x in heatmapGini.index], index=heatmapGini.index)
drawBarCharReference2('#0E1B5A', heatmapGini, 'gini', "Inequality Index (Wikipedia)", ['Country', 'Gini Index'])
draw_global_map2(['#ffffff', '#000000'], heatmapGini, ['Inequality Index (Wikipedia)'])

heatmapPopulation = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname', 'mapname', 'population'])
heatmapPopulation = heatmapPopulation.fillna(0)
heatmapPopulation = heatmapPopulation.drop_duplicates()
heatmapPopulation['color'] = pd.Series(["hola" for x in heatmapPopulation.index], index=heatmapPopulation.index)
drawBarCharReference2('#3D0A0E', heatmapPopulation, 'population', "Population per country (Wikipedia)", ['Country', 'Population'])
draw_global_map2(['#ffffff', '#000000'], heatmapPopulation, ['Population per country Map (Wikipedia)'])

In these graphs we are interested in analyzing lending per capita (lending divided by population for each country).

heatmapfoundingPercapita = pd.DataFrame(projects[projects.type == 'Country'],
                                        columns=['wikiname', 'mapname', 'totalamt', 'year', 'population'])
heatmapfoundingPercapita = pd.DataFrame(heatmapfoundingPercapita[heatmapfoundingPercapita.year >= '2001'],
                                        columns=['wikiname', 'mapname', 'totalamt', 'year', 'population'])
# Note: .sum() also adds up the per-project copies of 'population' within a country.
heatmapfoundingPercapita = heatmapfoundingPercapita.groupby(['wikiname', 'mapname']).sum()
for x in heatmapfoundingPercapita.sort(columns='totalamt', ascending=True).index:
    heatmapfoundingPercapita['totalamt'][x] = heatmapfoundingPercapita['totalamt'][x] / heatmapfoundingPercapita['population'][x]
    # Cap outliers at 400 so a few extreme values do not dominate the colour scale.
    if heatmapfoundingPercapita['totalamt'][x] >= 400:
        heatmapfoundingPercapita['totalamt'][x] = 400
heatmapfoundingPercapita['color'] = pd.Series(["hola" for x in heatmapfoundingPercapita.index],
                                              index=heatmapfoundingPercapita.index)
drawBarCharReference('#7A1138', heatmapfoundingPercapita, "totalamt",
                     "World Bank Lending per capita, commitments Accumulated 2001-2013", ['Country', 'US$'])
draw_global_map(['#ffffff', '#000000'], heatmapfoundingPercapita,
                ['World Bank Lending per capita, commitments Accumulated 2001-2013'])
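# The per-row loop above can also be written as a single vectorized division;
# a minimal sketch against the same heatmapfoundingPercapita frame:
percap = heatmapfoundingPercapita['totalamt'] / heatmapfoundingPercapita['population']
heatmapfoundingPercapita['totalamt'] = percap.clip_upper(400)  # cap outliers at 400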
""+str(heatmapfoundingPercapita['totalamt'][x]/heatmapfoundingPercapita['population'][x]) heatmapfoundingPercapita['totalamt'][x]=heatmapfoundingPercapita['totalamt'][x]/heatmapfoundingPercapita['population'][x] ## filtering outliers to if heatmapfoundingPercapita['totalamt'][x]>=400: heatmapfoundingPercapita['totalamt'][x] =400 heatmapfoundingPercapita['color'] = pd.Series(["hola" for x in heatmapfoundingPercapita.index], index=heatmapfoundingPercapita.index) drawBarCharReference( '#7A1138',heatmapfoundingPercapita, "totalamt","World Bank Lending per capita, commitments Accumulated 2001-2013",['Country','US$']) draw_global_map(['#ffffff','#000000'],heatmapfoundingPercapita, ['World Bank Lending per capita, commitments Accumulated 2001-2013'])clusion In general, we believe that Wikipedia has a great potential to help and improve global studies. In particular, the amount of information that was not posible to retrieve from wikipedia was not imposible of managing manually. The markdown parsing was difficult to understand when we started, but at the endi, it facilitated the retrieval of information. But the fact that a markdown document in Wikipedia can be manually processed and cleaned by people around the world, gives us a great idea of the potential of Wikipedia as a platform for Open Data.