from pandas import DataFrame, Series
import pandas as pd
import os
import codecs
import matplotlib.pyplot as plt  # used by the plotting cells below

# Verify the existence of, and read in, the two datasets:
# World Bank projects & operations, and the Freedom Index.
DATA_FILES = {"projdict": "data/projects_operations_api.csv",
              "fredict": "data/FreedomIndex.csv"}

def file_path(key):
    return os.path.join(os.pardir, DATA_FILES[key])

for file_key in DATA_FILES.keys():
    abs_fname = file_path(file_key)
    print abs_fname, os.path.exists(abs_fname)

f = codecs.open(file_path("projdict"), encoding='iso-8859-1')
initial_proj_df = pd.read_csv(f)
initial_proj_df.columns

is_africa = initial_proj_df['regionname'] == 'AFRICA'
initial_proj_df[is_africa]['countryname'][:5]
initial_proj_df[is_africa][['countryname', 'totalamt']][:5]

# The totalamt value is not properly formatted. Clean it up by stripping out
# the semicolon separators, then cast to float.
initial_proj_df['totalamt'] = initial_proj_df['totalamt'].str.replace(';', '')
initial_proj_df[is_africa]['totalamt'][:5]
initial_proj_df['totalamt'] = initial_proj_df['totalamt'].astype('float32')
sum(initial_proj_df[is_africa]['totalamt'][:5])
initial_proj_df[['regionname', 'countryname', 'projectstatusdisplay', 'totalamt']][:2]

# Data cleaning: remove the semicolons from the remaining money columns.
initial_proj_df['lendprojectcost'] = initial_proj_df['lendprojectcost'].str.replace(';', '')
initial_proj_df['lendprojectcost'] = initial_proj_df['lendprojectcost'].astype('float32')
initial_proj_df['ibrdcommamt'] = initial_proj_df['ibrdcommamt'].str.replace(';', '')
initial_proj_df['ibrdcommamt'] = initial_proj_df['ibrdcommamt'].astype('float32')
initial_proj_df['idacommamt'] = initial_proj_df['idacommamt'].str.replace(';', '')
initial_proj_df['idacommamt'] = initial_proj_df['idacommamt'].astype('float32')
initial_proj_df['grantamt'] = initial_proj_df['grantamt'].str.replace(';', '')
initial_proj_df['grantamt'] = initial_proj_df['grantamt'].astype('float32')
initial_proj_df[is_africa][['countryname', 'project_name', 'boardapprovaldate', 'status', 'lendprojectcost', 'grantamt']][:10]

# Work on a copy and drop the columns we will not analyze.
projcp_df = initial_proj_df.copy()
projcp_df = projcp_df.drop(['lendinginstrtype', 'envassesmentcategorycode', 'productlinetype',
                            'closingdate', 'url', 'sector2', 'sector3', 'sector4', 'sector5',
                            'sector', 'mjsector1', 'mjsector2', 'mjsector3', 'mjsector4',
                            'mjsector5', 'mjsector', 'theme1', 'theme2', 'theme3', 'theme4',
                            'theme5', 'financier', 'mjtheme2name', 'mjtheme3name',
                            'mjtheme4name', 'mjtheme5name'], axis=1)
del projcp_df['projectstatusdisplay']
projcp_df2 = projcp_df.drop(['prodline', 'supplementprojectflg', 'goal', 'mjtheme1name', 'location'], axis=1)
projcp_df2.columns
projcp_df2[is_africa][:5]

grouped = projcp_df2.groupby('regionname')

# Function to calculate the total amount awarded by the World Bank
# per country or regional operating body.
def func(x):
    totalamt = x['totalamt'].sum()
    return Series([totalamt], index=['totalamt'])

# Result dataframe.
result = grouped.apply(func)

# Create a new column to hold the year taken from the board approval date.
projcp_df2['year'] = projcp_df2['boardapprovaldate'].str[:4]
projcp_df2['year'][:2]

# Group the data by region name and year.
grouped3 = projcp_df2.groupby(['regionname', 'year'])
# Statistics on the Bank's lending commitments to different regions over time.
grouped3['totalamt'].describe()

grouped4 = projcp_df2.groupby(['regionname', 'year', 'board_approval_month'])
result4 = grouped4.apply(func)
result4.unstack('regionname')[:5]

result5 = grouped3.apply(func).unstack('regionname').fillna(0)
result5[:5]
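# The five strip-and-cast blocks above all follow the same pattern; a loop over
# the column names keeps the cleaning in one place. A minimal sketch, meant to
# run instead of (not after) those blocks, since .str only works while the
# columns are still strings:
money_cols = ['totalamt', 'lendprojectcost', 'ibrdcommamt', 'idacommamt', 'grantamt']
for money_col in money_cols:
    initial_proj_df[money_col] = initial_proj_df[money_col].str.replace(';', '').astype('float32')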
# python-us-cpi is a tool for parsing the latest US Consumer Price Index, and
# it also provides an inflation-calculator API. We use this API to convert
# loan commitments from other years into today's dollars for a better comparison.
from uscpi import UsCpi
cpi = UsCpi()  # downloads the latest CPI data
# $100 in 2012 is worth how much in 1980?
cpi.value_with_inflation(100, 2012, 1980)

projcpi = projcp_df2[['regionname', 'countryname', 'project_name', 'totalamt',
                      'grantamt', 'sector1', 'year']].copy()

# Convert monetary values from any year between 1914 and 2013 into
# 2013 dollars using the CPI API.
def fun2(y):
    totalamts = y['totalamt']
    year = int(y['year'])
    if 1914 <= year <= 2013:
        totalamts = cpi.value_with_inflation(totalamts, year, 2013)
    return Series([y['regionname'], y['countryname'], y['project_name'], totalamts,
                   y['grantamt'], y['sector1'], year],
                  index=['regionname', 'countryname', 'project_name', 'totalamt',
                         'grantamt', 'sector1', 'year'])

resultcpi = projcpi.fillna(0).apply(fun2, axis=1)

# Data cleaning: strip the unwanted suffixes from the sector1 and countryname strings.
resultcpi['sectorMain'] = resultcpi['sector1'].str.split("!").str[0]
resultcpi['country'] = resultcpi['countryname'].str.split(";").str[0]
# Validate that the data is cleaned.
resultcpi[:4]
resultcpi['year'] = resultcpi['year'].astype(int)

# The dataset is big, which makes it difficult to analyze. Construct a boolean
# mask to extract only those projects funded from 2000 to 2013.
is_bv = (resultcpi['year'] >= 2000) & (resultcpi['year'] <= 2013)
resultcpi2 = resultcpi[is_bv]
# Verify that the data is formatted the way we want to analyze it.
resultcpi2[:4]

# Total Bank commitments to Africa from 2000 to 2013, with all pre-2013 loans
# converted to 2013 dollars via the CPI function.
ggroup_africa = resultcpi2[resultcpi2['regionname'] == 'AFRICA'].groupby('year').apply(func)
ggroup_africa.plot(kind='bar', title='Bank lending commitments to Africa, 2000 - 2013');
plt.tight_layout()
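# Row-wise apply is slow on a frame this size. Since the adjustment factor
# depends only on the year, one factor per year can be precomputed and
# multiplied in; a sketch, assuming cpi.value_with_inflation(1.0, y, 2013)
# returns the scale factor for year y (projcpi_fast is a hypothetical name):
factors = dict((y, cpi.value_with_inflation(1.0, y, 2013)) for y in range(1914, 2014))
projcpi_fast = projcpi.fillna(0).copy()
projcpi_fast['totalamt'] = [amt * factors.get(int(yr), 1.0)
                            for amt, yr in zip(projcpi_fast['totalamt'], projcpi_fast['year'])]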
# Total Bank commitments per region from 2000 to 2013, again in 2013 dollars.
amtByRegion = resultcpi2.groupby(['regionname', 'year']).apply(func).unstack('regionname')
amtByRegion[:2]
amtByRegion.plot(kind='bar', figsize=(16, 8),
                 title='Lending commitments by the Bank, 2000 - 2013');
plt.legend(loc='best')

# Count the number of World Bank projects per country from 2000 to 2013.
numOfproj_by_country = resultcpi2.groupby('country').size().order(na_last=True, ascending=False, kind='mergesort')
numOfproj_by_country[:5]

# From the counts above, I observed that the most-funded nations are the BRICS,
# so the list below filters them out for further observation and analysis.
listBRICS = ['Federative Republic of Brazil', 'Russian Federation', 'Republic of India',
             'People\'s Republic of China', 'Republic of South Africa']
brics_nations = resultcpi2[resultcpi2['country'].isin(listBRICS)].groupby(['country', 'year']).size()

# The graph shows the number of projects funded by the World Bank
# per country per year from 2000 to 2013.
brics_nations.unstack('country').fillna(0).plot(subplots=True, figsize=(8, 8), kind='bar');
plt.legend(loc='best'); plt.tight_layout()

# Project counts per BRICS country, broken down by main sector
# (sectorMain, since the raw sector columns were dropped earlier).
df_of_BRICS = resultcpi2[resultcpi2['country'].isin(listBRICS)].groupby(['country', 'sectorMain']).size().order(na_last=True, ascending=False, kind='mergesort')
df_of_BRICS.unstack('country').fillna(0)

# Import the Freedom Index CSV for the comparison analysis.
f = codecs.open(file_path("fredict"), encoding='iso-8859-1')
free_df = pd.read_csv(f)
free_df[:2]

# As with the projects and operations dataset, restrict the analysis to data from 2000 onward.
free_df2 = free_df[free_df['index year'] >= 2000].copy()
free_df2.columns

# Extract the BRICS to observe them further.
free_df2 = free_df2[free_df2['name'].isin(['China', 'India', 'Russia', 'Brazil', 'South Africa'])]
free_df2[:5]
free_df3 = free_df2[['name', 'index year', 'overall score']].copy()
free_df3[:2]
free_df3['overall score'] = free_df3['overall score'].astype(float)
free_df3.pivot_table(['overall score'], rows=['index year'], cols='name').plot(
    kind='line', title='Freedom Index per BRICS country', figsize=(10, 10))
free_df3.pivot_table(['overall score'], rows=['index year'], cols='name').plot(
    subplots=True, figsize=(8, 8));
plt.legend(loc='best'); plt.tight_layout(); plt.ylabel('Freedom Index');

Findings:
1. In 2010, World Bank funding increased (compared to the previous year) in all the BRICS except Russia, and GDP in these countries also dropped. Is there a relationship? The financial crisis?

Challenges:
1. The World Bank data has many facets that can be useful for analysis, but not all variables are properly documented.
2. While analyzing the data, I noticed that after 1970 the World Bank changed the format of its reporting, which made munging the data really difficult.
f = codecs.open(file_path("fredict"), encoding='iso-8859-1')
free_df = pd.read_csv(f)

# Because I'm looking at the contribution of funds over a period of time, I
# want to look at the current (2013) Freedom Index for these countries to
# analyze their current state.
free_df2 = free_df[free_df['index year'] == 2013].copy()
# For simplicity, ignore countries that have not been scored.
free_df2 = free_df2[free_df2['overall score'] != 'N/A'].copy()
free_df2 = free_df2[free_df2['freedom from corruption'] != 'N/A'].copy()
free_df2.columns

# Cast the score columns to float so the sorts below are numeric rather than
# lexicographic (assuming 'N/A' was the only non-numeric sentinel).
free_df2['overall score'] = free_df2['overall score'].astype(float)
free_df2['freedom from corruption'] = free_df2['freedom from corruption'].astype(float)

low_freedom = free_df2.sort(['overall score'], ascending=True)
low_freedom = low_freedom[:10]
low_freedom[['name', 'overall score']]

high_freedom = free_df2.sort(['overall score'], ascending=False)
high_freedom = high_freedom[:10]
high_freedom[['name', 'overall score']]

# High corruption (lowest "freedom from corruption" scores).
high_corruption = free_df2.sort(['freedom from corruption'], ascending=True)
high_corruption = high_corruption[:10]
high_corruption[['name', 'freedom from corruption']]

# Low corruption.
low_corruption = free_df2.sort(['freedom from corruption'], ascending=False)
low_corruption = low_corruption[:10]
low_corruption[['name', 'freedom from corruption']]

numOfproj_by_country[:10]

# Recall resultcpi, the projects funded converted to 2013 dollars; sort by country.
country_cpi = resultcpi.sort(columns='country', ascending=True)
country_cpi[['country', 'totalamt', 'grantamt', 'year']][:2]

# There are a few problems with this data. First, continents and other regional
# aggregates appear in countryname (e.g. 'Africa', 'Central America',
# 'Latin America', 'Europe', 'East Asia and Pacific', 'Europe and Central Asia',
# 'World', 'Asia', 'Middle East and North Africa',
# 'South Eastern Europe and Balkans'); these could be filtered out explicitly.
country_cpi = country_cpi.dropna()

# Because of the differing naming conventions of the Freedom Index and the
# World Bank, we have to manually input the countries we are looking for in
# order to compare their freedom index with their World Bank funding.
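# A lookup table is a lighter-weight alternative to retyping full official
# names; the entries below are a hypothetical sketch covering a few of the
# countries used later ('wb_name' is an invented column name):
FREEDOM_TO_WB = {
    'Zimbabwe': 'Republic of Zimbabwe',
    'Haiti': 'Republic of Haiti',
    'Norway': 'Kingdom of Norway',
    'Denmark': 'Kingdom of Denmark',
}
free_df2['wb_name'] = free_df2['name'].map(FREEDOM_TO_WB)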
low_Freedom_list = ['Belize', 'Turkmenistan', 'Republic of Zimbabwe', 'Republic of Uzbekistan',
                    'Republic of Haiti', 'Republic of Burundi', 'Republic of Equatorial Guinea',
                    'People\'s Republic of Angola', 'Republica Bolivariana de Venezuela']
high_Freedom_list = ['Kingdom of Norway', 'New Zealand', 'Kingdom of Denmark', 'Republic of Finland',
                     'Republic of Sweden', 'Kingdom of The Netherlands', 'Commonwealth of Australia']

# Now we can see how many projects were committed to each country in the low-freedom list.
low_Freedom_nations = country_cpi[country_cpi['country'].isin(low_Freedom_list)].groupby(['country']).size()
low_Freedom_nations
low_Freedom_nations.plot(kind='bar', title='Number of projects in countries with a low Freedom Index');
plt.tight_layout()

import pandas as pd
import wikipydia as wk
import mwparserfromhell
from wikitools import wiki
from wikitools import api
from wikitools import category
from wikitools import page
import itertools
import re

wikisite = "http://en.wikipedia.org/w/api.php"
wikiObject = wiki.Wiki(wikisite)

projectsAPI = pd.read_csv('../data/projects_operations_api.csv')
wikipediadf = pd.read_csv('../data/matchcountries.csv')

# Some cleaning on the datasets.
wikipediadf.index = wikipediadf['countryname']
projectsAPI['countryname'] = [str(country).split(";")[0] for country in projectsAPI['countryname']]

projects = pd.merge(projectsAPI, wikipediadf, on='countryname', how='left')
# Drop rows whose countryname is NaN (a float) rather than a string.
projects = projects[projects['countryname'].map(type) != type(0.0)]
projectsAPI = projectsAPI[projectsAPI['countryname'].map(type) != type(0.0)]

projects['totalamt'] = projects['totalamt'].str.replace(';', '')
projects['totalamt'] = projects['totalamt'].astype('float32')
print projects.columns

projects['year'] = [str(x)[0:4] for x in projects['boardapprovaldate']]
# Backfill missing approval years from the closing date (assigning into the
# 'year' column, not the whole row).
projects['year'][projects.year == 'nan'] = [str(x)[0:4] for x in projects['closingdate'][projects.year == 'nan']]
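# A quick sanity check on the left merge: countries without a match in
# matchcountries.csv come through with NaN in the Wikipedia columns, so
# listing a few makes it easy to extend the mapping file.
unmatched = projects[projects['wikiname'].isnull()]['countryname'].unique()
print unmatched[:10]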
import matplotlib.pyplot as plt
import matplotlib.colors as col
import numpy as np

def color_variant(hex_color, brightness_offset=1):
    # Lighten or darken a '#rrggbb' colour by brightness_offset per channel.
    if len(hex_color) != 7:
        raise Exception("Passed %s into color_variant(), needs to be in #87c95f format." % hex_color)
    rgb_hex = [hex_color[x:x + 2] for x in [1, 3, 5]]
    new_rgb_int = [int(hex_value, 16) + brightness_offset for hex_value in rgb_hex]
    # Make sure the new values are between 0 and 255.
    new_rgb_int = [min([255, max([0, i])]) for i in new_rgb_int]
    # Format each channel back as two hex digits.
    return "#" + "".join(["%02x" % i for i in new_rgb_int])

def drawBarCharReference(Color, targetlist, field, title, labels):
    # Bar chart of `field`, darkening the bar colour every decile and recording
    # the colour per row so the map drawing below can reuse it.
    fig = plt.figure(num=None, figsize=(24, 8), dpi=700, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ColorBase = Color
    changeRange = 0.10
    i = 0
    for x in targetlist.sort(columns=field, ascending=True).index:
        if i / float(len(targetlist.index)) > changeRange:
            ColorBase = color_variant(ColorBase, 20)
            changeRange = changeRange + 0.10
        targetlist['color'][x] = ColorBase
        ax.bar(i, float(targetlist[field][x]), 1, color=col.colorConverter.to_rgb(ColorBase))
        i += 1
    ax.set_xticklabels([x[1] for x in targetlist.sort(columns=field, ascending=True).index])
    plt.xticks(np.arange(0.5, i + 1, 1))
    plt.setp(ax.get_xticklabels(), fontsize=9, rotation='vertical')
    plt.setp(ax.get_yticklabels(), fontsize=10)
    plt.title(title)
    plt.xlabel(labels[0], fontsize=18)
    plt.ylabel(labels[1], fontsize=18)
    plt.show()
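# A quick illustration of color_variant, which the bar charts use to step
# their palette every decile:
base = '#87c95f'
print color_variant(base, 20)   # slightly lighter green
print color_variant(base, -40)  # darker variant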
# BaseMap example adapted from geophysique.be, tutorial 10:
# http://www.geophysique.be/2013/02/12/matplotlib-basemap-tutorial-10-shapefiles-unleached-continued/
import os
import inspect
import numpy as np
import matplotlib.pyplot as plt
from itertools import izip
from mpl_toolkits.basemap import Basemap

def zip_filter_by_state(records, shapes, included_states=None):
    # included_states is a list of country FIPS codes; yield only the
    # (record, shape) pairs whose code appears in it.
    for (record, state) in izip(records, shapes):
        if record[1] in included_states:
            yield (record, state)

def draw_global_map(colors, indexlist, titles):
    ### PARAMETERS FOR MATPLOTLIB:
    import matplotlib as mpl
    mpl.rcParams['font.size'] = 14.
    mpl.rcParams['font.family'] = 'Serif'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 40.
    mpl.rcParams['ytick.labelsize'] = 20.

    fig = plt.figure(figsize=(11.7, 8.3))
    # Custom adjustment of the subplots.
    plt.subplots_adjust(left=0.05, right=0.95, top=0.90, bottom=0.05, wspace=0.15, hspace=0.05)
    ax = plt.subplot(111)

    # Create a world basemap in the Mercator projection.
    x1 = -179.
    x2 = 179.
    y1 = -60.
    y2 = 80.
    i = 0
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=y1, urcrnrlat=y2, llcrnrlon=x1, urcrnrlon=x2, lat_ts=(y1 + y2) / 2)
    m.drawcountries(linewidth=0.5)
    m.drawcoastlines(linewidth=0.5)
    m.drawparallels(np.arange(y1, y2, 20.), labels=[1, 0, 0, 0], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)  # draw parallels
    m.drawmeridians(np.arange(x1, x2, 20.), labels=[0, 0, 0, 1], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)  # draw meridians

    from matplotlib.collections import LineCollection
    import shapefile

    shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    shapes = shpf.shapes()
    records = shpf.records()

    # First pass: draw every country in the neutral base colours.
    for record, shape in zip(records, shapes):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        lines.set_facecolors(colors[0])
        lines.set_edgecolors(colors[1])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    # Second pass: recolour the countries present in indexlist with the colours
    # assigned by drawBarCharReference.
    for record, shape in zip_filter_by_state(records, shapes, [x[1] for x in indexlist.index]):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        i = i + 1
        x_color = None
        for w in indexlist.index:
            if record[1] in w[1]:
                x_color = w
                break
        lines.set_facecolors(indexlist['color'][x_color])
        lines.set_edgecolors(indexlist['color'][x_color])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    plt.title(titles[0])
    plt.savefig('tutorial10.png', dpi=300)
    plt.show()

# Accumulate total lending commitments per country since 2001, then draw the
# bar chart (which assigns the colours) and the world map.
heatmapfounding = pd.DataFrame(projects[projects.type == 'Country'],
                               columns=['wikiname', 'mapname', 'totalamt', 'year'])
heatmapfounding = pd.DataFrame(heatmapfounding[heatmapfounding.year >= '2001'],
                               columns=['wikiname', 'mapname', 'totalamt', 'year'])
heatmapfounding = heatmapfounding.groupby(['wikiname', 'mapname']).sum()
# Placeholder colour; overwritten per row by drawBarCharReference.
heatmapfounding['color'] = pd.Series(["hola" for x in heatmapfounding.index], index=heatmapfounding.index)
drawBarCharReference('#C73F2A', heatmapfounding, "totalamt",
                     "Total World Bank Lending Commitments Accumulated 2001-2013", ['Country', 'US$'])
draw_global_map(['#ffffff', '#000000'], heatmapfounding,
                ['Total World Bank Lending Commitments Accumulated 2001-2013'])
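# A peek at the aggregated frame shows which countries dominate the
# accumulated commitments:
print heatmapfounding.sort(columns='totalamt', ascending=False)['totalamt'][:5]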
def cleanFloatnumber(x):
    # Pull the first float out of a messy Wikipedia infobox value.
    if type(x) is float:
        return float(x)
    elif type(x) is str:
        if len(x) == 0:
            return None
        x = re.sub(r'<.*?>', '', x)  # strip HTML-ish tags such as <ref> markers
        x = x.strip()
        delimiterRegex = re.compile(r'[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?')
        Numbers = re.findall(delimiterRegex, x)
        if len(Numbers) > 0:
            return float(Numbers[0])
        else:
            return None
    else:
        return None

def cleanIntNumber(x):
    # Pull the first integer out of a messy infobox value, dropping thousands separators.
    if type(x) is float:
        return float(x)
    elif type(x) is str:
        if len(x) == 0:
            return None
        x = re.sub(r'<.*?>', '', x)  # strip HTML-ish tags
        x = re.sub(',', '', x)
        x = x.strip()
        delimiterRegex = re.compile(r'[0-9]+')
        Numbers = re.findall(delimiterRegex, x)
        if len(Numbers) > 0:
            return float(Numbers[0])
        else:
            return None
    else:
        return None

def get_infobox_from_wikipedia(countryname):
    # Fetch a country's Wikipedia page and pull a handful of indicators out of
    # its "Infobox country" template.
    country_found = False
    hdi = None
    gini = None
    GDP = None
    GDP_nominal_per_capita = None
    population = None
    if str(countryname).strip() == "" or countryname is None or str(countryname).strip() == 'nan':
        return hdi, gini, GDP, GDP_nominal_per_capita, population
    try:
        wikipage = page.Page(wikiObject, title=countryname)
    except Exception as inst:
        print "No results from Wikipedia: " + str(countryname)
        return hdi, gini, GDP, GDP_nominal_per_capita, population
    wikiraw = wikipage.getWikiText()
    wikiraw = wikiraw.decode('UTF-8')
    parsedWikiText = mwparserfromhell.parse(wikiraw)
    for x in parsedWikiText.nodes:
        if "template" in str(type(x)) and "Infobox country" in str(x.name):
            country_found = True
            if x.has_param('population_census'):
                population = cleanIntNumber(str(x.get('population_census').value))
            if population is None:
                if x.has_param('population_estimate'):
                    population = cleanIntNumber(str(x.get('population_estimate').value))
            if x.has_param('HDI'):
                hdi = cleanFloatnumber(str(x.get('HDI').value))
            if x.has_param('Gini'):
                gini = cleanFloatnumber(str(x.get('Gini').value))
            if x.has_param('GDP'):
                GDP = x.get('GDP').value
            if x.has_param('GDP_nominal_per_capita'):
                GDP_nominal_per_capita = str(x.get('GDP_nominal_per_capita').value)
            break
    if not country_found:
        print "No Infobox: " + str(countryname)
    return hdi, gini, GDP, GDP_nominal_per_capita, population

wikipediadf["HDI"], wikipediadf["gini"], wikipediadf['GDP'], wikipediadf['GDP_nominal_per_capita'], wikipediadf['population'] = \
    zip(*wikipediadf['wikiname'].map(get_infobox_from_wikipedia))

# It was not possible to process some of this data from Wikipedia, so I decided
# to filter it (Ignacio). Note: the break stops each pass after the first
# offending row is dropped.
for i in wikipediadf[wikipediadf.type == 'Country'].index:
    typeFound = type(wikipediadf['population'][i])
    if typeFound is not float:
        print "deleted", i
        wikipediadf = wikipediadf.drop([i])
        break
for i in wikipediadf[wikipediadf.type == 'Country'].index:
    typeFound = type(wikipediadf['GDP_nominal_per_capita'][i])
    if typeFound is not float:
        print "deleted", i
        wikipediadf = wikipediadf.drop([i])
        break

# Re-merge so the Wikipedia indicators join the projects table, repeating the
# earlier cleaning steps on the fresh merge.
projects = pd.merge(projectsAPI, wikipediadf, on='countryname', how='left')
projects = projects[projects['countryname'].map(type) != type(0.0)]
projectsAPI = projectsAPI[projectsAPI['countryname'].map(type) != type(0.0)]
projects['totalamt'] = projects['totalamt'].str.replace(';', '')
projects['totalamt'] = projects['totalamt'].astype('float32')
print projects.columns
projects['year'] = [str(x)[0:4] for x in projects['boardapprovaldate']]
projects['year'][projects.year == 'nan'] = [str(x)[0:4] for x in projects['closingdate'][projects.year == 'nan']]
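# Spot-check of the number cleaners on typical infobox fragments
# (illustrative inputs):
print cleanFloatnumber('0.745 (2013)')        # -> 0.745
print cleanIntNumber('1,210,193,422 (2011)')  # -> 1210193422.0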
def drawBarCharReference2(Color, targetlist, field, title, labels):
    # Same decile-shaded bar chart as drawBarCharReference, but the tick labels
    # come from the 'mapname' column instead of the index.
    fig = plt.figure(num=None, figsize=(24, 8), dpi=700, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ColorBase = Color
    changeRange = 0.10
    i = 0
    for x in targetlist.sort(columns=field, ascending=True).index:
        if i / float(len(targetlist.index)) > changeRange:
            ColorBase = color_variant(ColorBase, 20)
            changeRange = changeRange + 0.10
        targetlist['color'][x] = ColorBase
        ax.bar(i, float(targetlist[field][x]), 1, color=col.colorConverter.to_rgb(ColorBase))
        i += 1
    ax.set_xticklabels([targetlist['mapname'][x] for x in targetlist.sort(columns=field, ascending=True).index])
    plt.xticks(np.arange(0.5, i + 1, 1))
    plt.setp(ax.get_xticklabels(), fontsize=9, rotation='vertical')
    plt.setp(ax.get_yticklabels(), fontsize=10)
    plt.title(title)
    plt.xlabel(labels[0], fontsize=18)
    plt.ylabel(labels[1], fontsize=18)
    plt.show()

def zip_filter_by_state2(records, shapes, included_states=None):
    # included_states is a list of country FIPS codes; yield only matching pairs.
    for (record, state) in izip(records, shapes):
        if record[1] in included_states:
            yield (record, state)

def draw_global_map2(colors, indexlist, titles):
    # Same world map as draw_global_map, but countries are matched through the
    # 'mapname' column rather than the frame's index.
    ### PARAMETERS FOR MATPLOTLIB:
    import matplotlib as mpl
    mpl.rcParams['font.size'] = 14.
    mpl.rcParams['font.family'] = 'Serif'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 40.
    mpl.rcParams['ytick.labelsize'] = 20.

    fig = plt.figure(figsize=(11.7, 8.3))
    plt.subplots_adjust(left=0.05, right=0.95, top=0.90, bottom=0.05, wspace=0.15, hspace=0.05)
    ax = plt.subplot(111)

    # Create a world basemap in the Mercator projection.
    x1 = -179.
    x2 = 179.
    y1 = -60.
    y2 = 80.
    i = 0
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=y1, urcrnrlat=y2, llcrnrlon=x1, urcrnrlon=x2, lat_ts=(y1 + y2) / 2)
    m.drawcountries(linewidth=0.5)
    m.drawcoastlines(linewidth=0.5)
    m.drawparallels(np.arange(y1, y2, 20.), labels=[1, 0, 0, 0], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)
    m.drawmeridians(np.arange(x1, x2, 20.), labels=[0, 0, 0, 1], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)

    from matplotlib.collections import LineCollection
    import shapefile

    shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    shapes = shpf.shapes()
    records = shpf.records()

    # First pass: draw every country in the neutral base colours.
    for record, shape in zip(records, shapes):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        lines.set_facecolors(colors[0])
        lines.set_edgecolors(colors[1])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    # Second pass: recolour the countries present in indexlist.
    for record, shape in zip_filter_by_state2(records, shapes,
                                              [indexlist['mapname'][x] for x in indexlist.index]):
        lons, lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        i = i + 1
        x_color = None
        for (w, x) in [(indexlist['mapname'][x], x) for x in indexlist.index]:
            if type(w) is str and record[1] in w:
                x_color = x
                break
        lines.set_facecolors(indexlist['color'][x_color])
        lines.set_edgecolors(indexlist['color'][x_color])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)

    plt.title(titles[0])
    plt.savefig('tutorial10.png', dpi=300)
    plt.show()
# Columns added from Wikipedia: HDI, gini, GDP, GDP_nominal_per_capita, population.

heatmapHDI = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname', 'mapname', 'HDI'])
heatmapHDI = heatmapHDI.fillna(0)
heatmapHDI = heatmapHDI.drop_duplicates()
# Placeholder colour; overwritten per row by drawBarCharReference2.
heatmapHDI['color'] = pd.Series(["hola" for x in heatmapHDI.index], index=heatmapHDI.index)
drawBarCharReference2('#425910', heatmapHDI, 'HDI', "Human Development Index (Wikipedia)", ['Country', 'HDI Index'])
draw_global_map2(['#ffffff', '#000000'], heatmapHDI, ['Human Development Index Map (Wikipedia)'])

The Gini index is an example of data that is not fully populated on Wikipedia.

heatmapGini = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname', 'mapname', 'gini'])
heatmapGini = heatmapGini.fillna(0)
heatmapGini = heatmapGini.drop_duplicates()
heatmapGini['color'] = pd.Series(["hola" for x in heatmapGini.index], index=heatmapGini.index)
drawBarCharReference2('#0E1B5A', heatmapGini, 'gini', "Inequality Index (Wikipedia)", ['Country', 'Gini Index'])
draw_global_map2(['#ffffff', '#000000'], heatmapGini, ['Inequality Index (Wikipedia)'])

heatmapPopulation = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname', 'mapname', 'population'])
heatmapPopulation = heatmapPopulation.fillna(0)
heatmapPopulation = heatmapPopulation.drop_duplicates()
heatmapPopulation['color'] = pd.Series(["hola" for x in heatmapPopulation.index], index=heatmapPopulation.index)
drawBarCharReference2('#3D0A0E', heatmapPopulation, 'population', "Population per country (Wikipedia)", ['Country', 'Population'])
draw_global_map2(['#ffffff', '#000000'], heatmapPopulation, ['Population per country Map (Wikipedia)'])

In these graphs we are interested in analyzing lending per capita (lending divided by population for each country).

heatmapfoundingPercapita = pd.DataFrame(projects[projects.type == 'Country'],
                                        columns=['wikiname', 'mapname', 'totalamt', 'year', 'population'])
heatmapfoundingPercapita = pd.DataFrame(heatmapfoundingPercapita[heatmapfoundingPercapita.year >= '2001'],
                                        columns=['wikiname', 'mapname', 'totalamt', 'year', 'population'])
# Note: .sum() also adds up the per-project copies of 'population' within a country.
heatmapfoundingPercapita = heatmapfoundingPercapita.groupby(['wikiname', 'mapname']).sum()
for x in heatmapfoundingPercapita.sort(columns='totalamt', ascending=True).index:
    heatmapfoundingPercapita['totalamt'][x] = heatmapfoundingPercapita['totalamt'][x] / heatmapfoundingPercapita['population'][x]
    # Cap outliers at 400 so a few extreme values do not dominate the colour scale.
    if heatmapfoundingPercapita['totalamt'][x] >= 400:
        heatmapfoundingPercapita['totalamt'][x] = 400
heatmapfoundingPercapita['color'] = pd.Series(["hola" for x in heatmapfoundingPercapita.index],
                                              index=heatmapfoundingPercapita.index)
drawBarCharReference('#7A1138', heatmapfoundingPercapita, "totalamt",
                     "World Bank Lending per capita, commitments Accumulated 2001-2013", ['Country', 'US$'])
draw_global_map(['#ffffff', '#000000'], heatmapfoundingPercapita,
                ['World Bank Lending per capita, commitments Accumulated 2001-2013'])
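# The per-row loop above can also be written as a single vectorized division;
# a minimal sketch against the same heatmapfoundingPercapita frame:
percap = heatmapfoundingPercapita['totalamt'] / heatmapfoundingPercapita['population']
heatmapfoundingPercapita['totalamt'] = percap.clip_upper(400)  # cap outliers at 400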
""+str(heatmapfoundingPercapita['totalamt'][x]/heatmapfoundingPercapita['population'][x]) heatmapfoundingPercapita['totalamt'][x]=heatmapfoundingPercapita['totalamt'][x]/heatmapfoundingPercapita['population'][x] ## filtering outliers to if heatmapfoundingPercapita['totalamt'][x]>=400: heatmapfoundingPercapita['totalamt'][x] =400 heatmapfoundingPercapita['color'] = pd.Series(["hola" for x in heatmapfoundingPercapita.index], index=heatmapfoundingPercapita.index) drawBarCharReference( '#7A1138',heatmapfoundingPercapita, "totalamt","World Bank Lending per capita, commitments Accumulated 2001-2013",['Country','US$']) draw_global_map(['#ffffff','#000000'],heatmapfoundingPercapita, ['World Bank Lending per capita, commitments Accumulated 2001-2013'])clusion In general, we believe that Wikipedia has a great potential to help and improve global studies. In particular, the amount of information that was not posible to retrieve from wikipedia was not imposible of managing manually. The markdown parsing was difficult to understand when we started, but at the endi, it facilitated the retrieval of information. But the fact that a markdown document in Wikipedia can be manually processed and cleaned by people around the world, gives us a great idea of the potential of Wikipedia as a platform for Open Data.