#Load in some libraries to help us work with the data import pandas as pd from ggplot import * #Data source - download the data to the data/ directory #wget https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/267640/pfi_current_projects_list_march_2013_published_dec_2013.xlsx -P data #Import the data we've just downloaded df=pd.read_excel('data/pfi_current_projects_list_march_2013_published_dec_2013.xlsx',skiprows=1) #Preview the first few rows df[:3] #Check the column headings df.columns.tolist() #Set up some convenenience groupings of columns basecols=['Unique HMT Project ID', 'Project Name', 'Department', 'Procuring authority', 'Sector', 'Constituency', 'Region', 'Project Status', 'Date Of OJEU', 'Date of preferred bidder', 'Date of financial close', 'Date of construction completion', 'First date of operations', 'Operational period of contract (years)', 'On / Off balance sheet under IFRS', 'On / Off balance sheet under ESA 95', 'On / Off balance sheet under UK GAAP', 'Capital Value (£m)','SPV name', 'SPV company number', 'SPV address'] financecols=['On / Off balance sheet under IFRS', 'On / Off balance sheet under ESA 95', 'On / Off balance sheet under UK GAAP'] corecols=['Unique HMT Project ID','Project Name','Procuring authority','Sector','Capital Value (£m)'] datecols=['Unitary charge payment 1992-93 (£m)', 'Unitary charge payment 1993-94 (£m)', 'Unitary charge payment 1994-95 (£m)', 'Unitary charge payment 1995-96 (£m)', 'Unitary charge payment 1996-97 (£m)', 'Unitary charge payment 1997-98 (£m)', 'Unitary charge payment 1998-99 (£m)', 'Unitary charge payment 1999-00 (£m)', 'Unitary charge payment 2000-01 (£m)', 'Unitary charge payment 2001-02 (£m)', 'Unitary charge payment 2002-03 (£m)', 'Unitary charge payment 2003-04 (£m)', 'Unitary charge payment 2004-05 (£m)', 'Unitary charge payment 2005-06 (£m)', 'Unitary charge payment 2006-07 (£m)', 'Unitary charge payment 2007-08 (£m)', 'Unitary charge payment 2008-09 (£m)', 'Unitary charge payment 2009-10 (£m)', 'Unitary charge payment 2010-11 (£m)', 'Unitary charge payment 2011-12 (£m)', 'Unitary charge payment 2012-13 (£m)', 'Estimated unitary charge payment 2013-14 (£m)', 'Estimated unitary charge payment 2014-15 (£m)', 'Estimated unitary charge payment 2015-16 (£m)', 'Estimated unitary charge payment 2016-17 (£m)', 'Estimated unitary charge payment 2017-18 (£m)', 'Estimated unitary charge payment 2018-19 (£m)', 'Estimated unitary charge payment 2019-20 (£m)', 'Estimated unitary charge payment 2020-21 (£m)', 'Estimated unitary charge payment 2021-22 (£m)', 'Estimated unitary charge payment 2022-23 (£m)', 'Estimated unitary charge payment 2023-24 (£m)', 'Estimated unitary charge payment 2024-25 (£m)', 'Estimated unitary charge payment 2025-26 (£m)', 'Estimated unitary charge payment 2026-27 (£m)', 'Estimated unitary charge payment 2027-28 (£m)', 'Estimated unitary charge payment 2028-29 (£m)', 'Estimated unitary charge payment 2029-30 (£m)', 'Estimated unitary charge payment 2030-31 (£m)', 'Estimated unitary charge payment 2031-32 (£m)', 'Estimated unitary charge payment 2032-33 (£m)', 'Estimated unitary charge payment 2033-34 (£m)', 'Estimated unitary charge payment 2034-35 (£m)', 'Estimated unitary charge payment 2035-36 (£m)', 'Estimated unitary charge payment 2036-37 (£m)', 'Estimated unitary charge payment 2037-38 (£m)', 'Estimated unitary charge payment 2038-39 (£m)', 'Estimated unitary charge payment 2039-40 (£m)', 'Estimated unitary charge payment 2040-41 (£m)', 'Estimated unitary charge payment 2041-42 (£m)', 'Estimated unitary charge payment 2042-43 (£m)', 'Estimated unitary charge payment 2043-44 (£m)', 'Estimated unitary charge payment 2044-45 (£m)', 'Estimated unitary charge payment 2045-46 (£m)', 'Estimated unitary charge payment 2046-47 (£m)', 'Estimated unitary charge payment 2047-48 (£m)', 'Estimated unitary charge payment 2048-49 (£m)', 'Estimated unitary charge payment 2049-50 (£m)', 'Estimated unitary charge payment 2050-51 (£m)', 'Estimated unitary charge payment 2051-52 (£m)', 'Estimated unitary charge payment 2052-53 (£m)', 'Estimated unitary charge payment 2053-54 (£m)', 'Estimated unitary charge payment 2054-55 (£m)', 'Estimated unitary charge payment 2055-56 (£m)', 'Estimated unitary charge payment 2056-57 (£m)', 'Estimated unitary charge payment 2057-58 (£m)', 'Estimated unitary charge payment 2058-59 (£m)', 'Estimated unitary charge payment 2059-60 (£m)'] equitycols=['Equity holder 1: Name', 'Equity holder 1: Equity share (%)', 'Equity holder 1: change of ownership since March 2011? \n(Yes / No)', 'Equity holder 2: Name', 'Equity holder 2: Equity share (%)', 'Equity holder 2: change of ownership since March 2011? \n(Yes / No)', 'Equity holder 3: Name', 'Equity holder 3: Equity share (%)', 'Equity holder 3: change of ownership since March 2011? \n(Yes / No)', 'Equity holder 4: Name', 'Equity holder 4: Equity share (%)', 'Equity holder 4: change of ownership since March 2011? \n(Yes / No)', 'Equity holder 5: Name', 'Equity holder 5: Equity share (%)', 'Equity holder 5: change of ownership since March 2011? \n(Yes / No)', 'Equity holder 6: Name', 'Equity holder 6: Equity share (%)', 'Equity holder 6: change of ownership since March 2011? \n(Yes / No)'] df['Department'].unique() df['Sector'].unique() #Group by sector and count the number of contracts in each group df.groupby('Sector').size().order(ascending=False) #Use the same pattern as before, but this time group by procuring authority and the ten with the most contracts df.groupby('Procuring authority').size().order(ascending=False)[:10] #The Capital Value column may contain things that aren't numbers. If so, ignore them so they don't break the sum calculation tmp=df[df['Capital Value (£m)'].apply(lambda x: isinstance(x, (int, float)))] #We can also force the type tmp['Capital Value (£m)']=tmp['Capital Value (£m)'].astype(float) #Now use the filtered dataframe to calculate the total Capital Value associated with each sector tmp[['Sector','Capital Value (£m)']].groupby('Sector').sum().sort('Capital Value (£m)',ascending=False) #With data in a dataframe, we can chart various elements of it #For example, we can generate a histogram that shows a count of projects with a particular oprsational duration df['Operational period of contract (years)'].hist() df['Estimated unitary charge payment 2014-15 (£m)'].sum() df['Estimated unitary charge payment 2014-15 (£m)'].sum().round(2) def spendBy_X(df,column, item, year=2014): ''' Display the total spend for a particular year, filtered by a particular value in a particular column ''' if year<2013: year='Unitary charge payment {0}-{1} (£m)'.format(year,year-1999) else: year='Estimated unitary charge payment {0}-{1} (£m)'.format(year,year-1999) print('{0} [total] for the {1} {2} is £{3}m across {4} contracts.'.format( year, item, column, df[df[column]==item][year].sum().round(2), df[df[column]==item][year].count() )) spendBy_X(df,'Procuring authority', 'Isle of Wight') spendBy_X(df,'Department', 'DEPARTMENT FOR EDUCATION (GROUP)') spendBy_X(df,'Sector', 'Emergency Services',2012) #Generate a view of the data that just has the core and date related columns dfTime=df[corecols+datecols] #Generate a long version of this data by melting the date related columns dfTimeLong=pd.melt(dfTime, id_vars=corecols, value_vars= datecols ) #Now we'll do some tidying, using a differnt method to the previous one to ensure we have a numeric datatype for the value #Turn the values to numerics dfTimeLong['value']=dfTimeLong['value'].convert_objects(convert_numeric=True) #Get rid of any non-numeric values dfTimeLong=dfTimeLong.dropna(subset=['value']) #Get rid of any zero values dfTimeLong=dfTimeLong[dfTimeLong['value']!=0] #Preview the data - note how the variable column identifies the year, and the value column the payment for that year dfTimeLong[:3] #Preview data for a particular project dfTimeLong[dfTimeLong['Unique HMT Project ID']==523] #Let's do some more tidying - pull out the start year that a value relates to dfTimeLong['year']=dfTimeLong['variable'].apply(lambda x: x.split('payment')[1].split('-')[0]) dfTimeLong['year']=dfTimeLong['year'].astype(int) #Identify whether the value was an actual payment or an estimated one dfTimeLong['typ']=dfTimeLong['variable'].apply(lambda x: 'estimated' if x.startswith('Estimated') else 'actual') dfTimeLong[:3] totalexposure=dfTimeLong[['year','value']].groupby(['year']).sum() totalexposure.reset_index(inplace=True) totalexposure[:3] ggplot(totalexposure,aes(x='year',y='value')) \ + geom_line() \ + ggtitle('Total UK PFI estimated spend') \ + ylab('Spend £m') #What's the exposure of a particular procuring authority to contracted PFI spend each year? exposure=dfTimeLong[['Procuring authority','year','value']].groupby(['Procuring authority','year']).sum() exposure.reset_index(inplace=True) exposure[:3] #Have a peek at the data for a particular authority ggplot(exposure[exposure['Procuring authority']=='Manchester'],aes(x='year',y='value'))+geom_line() #Let's bundle that up into a function def procAuthExposureChart(_df,procAuth): g=ggplot(_df[_df['Procuring authority']==procAuth],aes(x='year',y='value')) \ + geom_line() + ylab('(Epected) Unitary Charge Payment (£m)') \ + ggtitle(procAuth) return g procAuthExposureChart(exposure,'Isle of Wight') groupingrows= ['Unique HMT Project ID','Project Name','Procuring authority','year'] projexposure=dfTimeLong[groupingrows+['value']].groupby(groupingrows).sum() projexposure.reset_index(inplace=True) projexposure[:3] #If assign the colour aeshetic to the project, we can chart each project with a separately coloured line ggplot(projexposure[projexposure['Procuring authority']=='Manchester'],aes(x='year',y='value',colour='Project Name')) \ + geom_line() ggplot(projexposure[projexposure['Procuring authority']=='Manchester'],aes(x='year',y='value')) \ + geom_line() \ + facet_wrap('Project Name', scales = "fixed") #Grab the row for the Temple Primary School project and melt the date columns, retaining years where a value is set pd.melt(df[df['Project Name']=='Temple Primary School'][datecols]).dropna() #Let projcodeTotal represent summed total estimated unitary values for each project projcodeTotal=projexposure[['Project Name','value']].groupby('Project Name').sum() projcodeTotal.reset_index(inplace=True) projcodeTotal.sort('value',ascending=False)[:10] projexposure[['Procuring authority','value']].groupby('Procuring authority').sum().sort('value',ascending=False)[:10] #Create a view of the data over some core data for each project and the summed unitary spend for each project projReview=pd.merge(df[['Unique HMT Project ID','Project Name','Capital Value (£m)','Operational period of contract (years)']], projcodeTotal, on='Project Name') #Make sure we're dealing with numbers... projReview['Capital Value (£m)']=projReview['Capital Value (£m)'].convert_objects(convert_numeric=True) #Let's start to generate some comparative metrics #I'm not sure if these are in any way meaningful, but they may reveal some sort of signal... #...if we can work out how to interpet them?! #First, the extent to which the summed total unitary value is a multiple of the capital amount projReview['multiplier']=projReview['value'].divide(projReview['Capital Value (£m)']) #Secondly, the difference betweem the summed total unitary value is a multiple of the capital amount projReview['excess']= projReview['value']-projReview['Capital Value (£m)'] #Thirdly, that difference on average over the operational period of the contract projReview['avexcess']=projReview['excess']/projReview['Operational period of contract (years)'] #Now let's start to make some rankings... #First, let's rank according to the multiplier... projReview.sort('multiplier',ascending=False)[:5] def interpreter1(_df): #txt='The {0} project has capital value £{1}m and estimated total unitary spend of £{2}m, ({3} times more than the capital amount), with an operational contract period of {4} years.'.format(_df['Project Name'],_df['Capital Value (£m)'], _df['value'], _df['multiplier']) txt='' if _df['Capital Value (£m)']!=0: txt='''The {0} project has capital value £{1}m and estimated total unitary spend of £{2}m, \ ({3} times more than the capital amount), with an operational contract period of {4} years.\n'''.format(_df['Project Name'], round(_df['Capital Value (£m)'],2), round(_df['value'],2), round(_df['multiplier'],2), _df['Operational period of contract (years)']) print(txt) projReview.sort('multiplier',ascending=False)[:5].apply(interpreter1,axis=1 ) projReview.sort('excess',ascending=False)[:5] projReview.dropna(subset=['excess']).sort('excess')[:15] projReview.sort('avexcess',ascending=False)[:5] equitycols #Use the equity cols list to filter the columns we want, keying each row by project ID df[['Unique HMT Project ID']+equitycols][:2] #Create a dummy dataframe to hold the long format data dfequity=pd.DataFrame() #There are six possible equity partner column groupings for i in range(1, 6): #Calculate the index of the left most column in the current grouping lower=(i-1)*3 #Generate a data frame from the project id and the three columns in the current equity grouping tmp=df[['Unique HMT Project ID','Project Name','SPV name','SPV company number']+equitycols][[0,1,2,3]+np.arange(lower+4,lower+6).tolist()] #Rename the columns tmp.columns=['Unique HMT Project ID','Project Name','SPV name','SPV company number','equityName','equityShare'] #make sure the equity chart converts to a numeric tmp['equityShare']=tmp['equityShare'].convert_objects(convert_numeric=True) #Only keep rows where we have an equity share recorded tmp=tmp.dropna(subset=['equityShare']) #Add the data from the current equity grouping to the growing long format dataframe dfequity=pd.concat([dfequity,tmp]) #Let's see what our dataframe looks like dfequity[:3] def showPartner(partner): txt='\n * {0} ({1}% equity share)'.format(partner[1]['equityName'],100*partner[1]['equityShare']) return txt def equityPartnersInProject(_df, project,typ='name',endstr='\n'): if typ=='name':pp=_df[_df['Project Name']==project] elif typ=='id':pp=_df[_df['Unique HMT Project ID']==project] else: return txt='The {0} project (project id: {1}) has {2} equity partner(s):'.format(pp['Project Name'].iloc[0],pp['Unique HMT Project ID'].iloc[0],len(pp)) for partner in pp.iterrows(): txt+=showPartner(partner) txt+=endstr print(txt) equityPartnersInProject(dfequity, 'Lambeth Street Lighting') def equityPartnersBySPV(_df,spv,typ='name',endstr='\n'): if typ=='name':pp=_df[_df['SPV name'].notnull & _df['SPV name'].str.contains(spv)] elif typ=='id':pp=_df[_df['SPV company number']==spv] else: return if len(pp)==0: txt='No SPV with that {0} found.'.format(typ) else: txt='The {0} SPV has {1} equity partner(s):'.format(pp['SPV name'].iloc[0],len(pp)) for partner in pp.iterrows(): txt+=showPartner(partner) txt+=endstr print(txt) equityPartnersBySPV(dfequity,'Hounslow') equityPartnersBySPV(dfequity,'Island Roads') def showPartnerInProj(partner): txt='\n * {0} ({1}% equity share in {2} (project id: {3}), SPV: {4})'.format(partner[1]['equityName'],100*partner[1]['equityShare'], partner[1]['Project Name'],partner[1]['Unique HMT Project ID'], partner[1]['SPV name']) return txt def equitySearch(_df,equity,endstr='\n'): pp=_df[_df['equityName'].str.contains(equity)] if len(pp)==0: txt='No equity partner with that name found.' else: txt='That equity is associated with:' spv=[] for partner in pp.iterrows(): txt+=showPartnerInProj(partner) if partner[1]['SPV name'] not in spv: spv.append(partner[1]['SPV name']) txt+=','.join(spv)+endstr print(txt) equitySearch(dfequity,'G4S') projexposure[:3] projexco=pd.merge(dfequity,projexposure,on=['Unique HMT Project ID','Project Name']) projexco[:3] #Do any rows have equity share sums > 100% ? test=projexco[['Unique HMT Project ID','year','equityShare']].groupby(['Unique HMT Project ID','year']).sum().sort(ascending=False) test.reset_index(inplace=True) #May get floating point errors test[test['equityShare']>1.00000000000001]['Unique HMT Project ID'].unique() equityPartnersInProject(dfequity, 853,typ='id') equityPartnersInProject(dfequity, 8,typ='id') projexco['valueShare']=projexco['value']*projexco['equityShare'] projexco.reset_index(inplace=True) projexco[123:129] projexcoAnnual=projexco[['equityName','year','valueShare']].groupby(['equityName','year']).sum() projexcoAnnual.reset_index(inplace=True) projexcoAnnual[projexcoAnnual['year']==2014].sort('valueShare',ascending=False)[:10] projexco[(projexco['equityName'].str.contains('Trillium')) & (projexco['year']==2014)] trillium=projexco[(projexco['equityName'].str.contains('Trillium')) & (projexco['year']==2014)] for project in trillium['Project Name']: equityPartnersInProject(dfequity, project) projReview[projReview['Project Name']==project].apply(interpreter1,axis=1 ) def projDesc(_df,startstr='',endstr='\n'): if startstr=='': txt='The {0} project'.format(_df['Project Name']) else: txt=startstr if _df['Capital Value (£m)']!=0: txt+='''has capital value £{0}m and estimated total unitary spend of £{1}m, \ ({2} times more than the capital amount), with an operational contract period of {3} years.'''.format( round(_df['Capital Value (£m)'],2), round(_df['value'],2), round(_df['multiplier'],2), _df['Operational period of contract (years)']) txt+=endstr print(txt) for project in trillium['Project Name'][:1]: equityPartnersInProject(dfequity, project) projDesc(projReview[projReview['Project Name']==project].iloc[0],'It ') def projReportbyPartnerAndYear(partner,year): #Note that the search is case sensitive... projects=projexco[(projexco['equityName'].str.contains(partner)) & (projexco['year']==year)] for project in projects['Project Name']: equityPartnersInProject(dfequity, project,endstr='') projDesc(projReview[projReview['Project Name']==project].iloc[0],'It ') projReportbyPartnerAndYear('G4S',2015) projexco[(projexco['equityName'].str.contains('Sodexo')) & (projexco['year']==2014)] #Generate a data view containing rows where the name of the equity partner includes the word Sodexo sodexo=projexco[(projexco['equityName'].str.contains('Sodexo'))][['equityName','year','valueShare']] #For each name, generate the sum total of value shares going to that partner each year sodexo=sodexo.groupby(['equityName','year']).sum() sodexo.reset_index(inplace=True) ggplot(sodexo,aes(x='year',y='valueShare')) \ + geom_line() \ + facet_wrap('equityName',scales='fixed') \ + ggtitle('Sodexo') #Hmm - two takes on Sodexo? How come? sodexo['equityName'].unique()