#Load in some libraries to help us work with the data
import pandas as pd
from ggplot import *

#Data source - download the data to the data/ directory
#wget https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/267640/pfi_current_projects_list_march_2013_published_dec_2013.xlsx -P data

#Import the data we've just downloaded
df=pd.read_excel('data/pfi_current_projects_list_march_2013_published_dec_2013.xlsx',skiprows=1)

#Preview the first few rows
df[:3]

#Check the column headings
df.columns.tolist()

#Set up some convenenience groupings of columns
basecols=['Unique HMT Project ID', 'Project Name', 'Department', 'Procuring authority', 'Sector', 'Constituency', 
          'Region', 'Project Status', 'Date Of OJEU', 'Date of preferred bidder', 'Date of financial close',
          'Date of construction completion', 'First date of operations', 'Operational period of contract (years)',
          'On / Off balance sheet under IFRS', 'On / Off balance sheet under ESA 95', 'On / Off balance sheet under UK GAAP',
          'Capital Value (£m)','SPV name', 'SPV company number', 'SPV address']
financecols=['On / Off balance sheet under IFRS', 'On / Off balance sheet under ESA 95', 'On / Off balance sheet under UK GAAP']
corecols=['Unique HMT Project ID','Project Name','Procuring authority','Sector','Capital Value (£m)']
datecols=['Unitary charge payment 1992-93 (£m)', 'Unitary charge payment 1993-94 (£m)', 'Unitary charge payment 1994-95 (£m)',
          'Unitary charge payment 1995-96 (£m)', 'Unitary charge payment 1996-97 (£m)', 'Unitary charge payment 1997-98 (£m)',
          'Unitary charge payment 1998-99 (£m)', 'Unitary charge payment 1999-00 (£m)', 'Unitary charge payment 2000-01 (£m)',
          'Unitary charge payment 2001-02 (£m)', 'Unitary charge payment 2002-03 (£m)', 'Unitary charge payment 2003-04 (£m)',
          'Unitary charge payment 2004-05 (£m)', 'Unitary charge payment 2005-06 (£m)', 'Unitary charge payment 2006-07 (£m)',
          'Unitary charge payment 2007-08 (£m)', 'Unitary charge payment 2008-09 (£m)', 'Unitary charge payment 2009-10 (£m)',
          'Unitary charge payment 2010-11 (£m)', 'Unitary charge payment 2011-12 (£m)', 'Unitary charge payment 2012-13 (£m)',
          'Estimated unitary charge payment 2013-14 (£m)', 'Estimated unitary charge payment 2014-15 (£m)',
          'Estimated unitary charge payment 2015-16 (£m)', 'Estimated unitary charge payment 2016-17 (£m)',
          'Estimated unitary charge payment 2017-18 (£m)', 'Estimated unitary charge payment 2018-19 (£m)',
          'Estimated unitary charge payment 2019-20 (£m)', 'Estimated unitary charge payment 2020-21 (£m)',
          'Estimated unitary charge payment 2021-22 (£m)', 'Estimated unitary charge payment 2022-23 (£m)',
          'Estimated unitary charge payment 2023-24 (£m)', 'Estimated unitary charge payment 2024-25 (£m)',
          'Estimated unitary charge payment 2025-26 (£m)', 'Estimated unitary charge payment 2026-27 (£m)',
          'Estimated unitary charge payment 2027-28 (£m)', 'Estimated unitary charge payment 2028-29 (£m)',
          'Estimated unitary charge payment 2029-30 (£m)', 'Estimated unitary charge payment 2030-31 (£m)',
          'Estimated unitary charge payment 2031-32 (£m)', 'Estimated unitary charge payment 2032-33 (£m)',
          'Estimated unitary charge payment 2033-34 (£m)', 'Estimated unitary charge payment 2034-35 (£m)',
          'Estimated unitary charge payment 2035-36 (£m)', 'Estimated unitary charge payment 2036-37 (£m)',
          'Estimated unitary charge payment 2037-38 (£m)', 'Estimated unitary charge payment 2038-39 (£m)',
          'Estimated unitary charge payment 2039-40 (£m)', 'Estimated unitary charge payment 2040-41 (£m)',
          'Estimated unitary charge payment 2041-42 (£m)', 'Estimated unitary charge payment 2042-43 (£m)',
          'Estimated unitary charge payment 2043-44 (£m)', 'Estimated unitary charge payment 2044-45 (£m)',
          'Estimated unitary charge payment 2045-46 (£m)', 'Estimated unitary charge payment 2046-47 (£m)',
          'Estimated unitary charge payment 2047-48 (£m)', 'Estimated unitary charge payment 2048-49 (£m)',
          'Estimated unitary charge payment 2049-50 (£m)', 'Estimated unitary charge payment 2050-51 (£m)',
          'Estimated unitary charge payment 2051-52 (£m)', 'Estimated unitary charge payment 2052-53 (£m)',
          'Estimated unitary charge payment 2053-54 (£m)', 'Estimated unitary charge payment 2054-55 (£m)',
          'Estimated unitary charge payment 2055-56 (£m)', 'Estimated unitary charge payment 2056-57 (£m)',
          'Estimated unitary charge payment 2057-58 (£m)', 'Estimated unitary charge payment 2058-59 (£m)',
          'Estimated unitary charge payment 2059-60 (£m)']
equitycols=['Equity holder 1: Name', 'Equity holder 1: Equity share (%)',
            'Equity holder 1: change of ownership since March 2011? \n(Yes / No)',
            'Equity holder 2: Name', 'Equity holder 2: Equity share (%)',
            'Equity holder 2: change of ownership since March 2011? \n(Yes / No)',
            'Equity holder 3: Name', 'Equity holder 3: Equity share (%)',
            'Equity holder 3: change of ownership since March 2011? \n(Yes / No)',
            'Equity holder 4: Name', 'Equity holder 4: Equity share (%)',
            'Equity holder 4: change of ownership since March 2011? \n(Yes / No)',
            'Equity holder 5: Name', 'Equity holder 5: Equity share (%)',
            'Equity holder 5: change of ownership since March 2011? \n(Yes / No)',
            'Equity holder 6: Name', 'Equity holder 6: Equity share (%)',
            'Equity holder 6: change of ownership since March 2011? \n(Yes / No)']

df['Department'].unique()

df['Sector'].unique()

#Group by sector and count the number of contracts in each group
df.groupby('Sector').size().order(ascending=False)

#Use the same pattern as before, but this time group by procuring authority and the ten with the most contracts
df.groupby('Procuring authority').size().order(ascending=False)[:10]

#The Capital Value column may contain things that aren't numbers. If so, ignore them so they don't break the sum calculation
tmp=df[df['Capital Value (£m)'].apply(lambda x: isinstance(x, (int, float)))]
#We can also force the type
tmp['Capital Value (£m)']=tmp['Capital Value (£m)'].astype(float)
#Now use the filtered dataframe to calculate the total Capital Value associated with each sector
tmp[['Sector','Capital Value (£m)']].groupby('Sector').sum().sort('Capital Value (£m)',ascending=False)

#With data in a dataframe, we can chart various elements of it
#For example, we can generate a histogram that shows a count of projects with a particular oprsational duration
df['Operational period of contract (years)'].hist()

df['Estimated unitary charge payment 2014-15 (£m)'].sum()

df['Estimated unitary charge payment 2014-15 (£m)'].sum().round(2)

def spendBy_X(df,column, item, year=2014):
    ''' Display the total spend for a particular year, filtered by a particular value in a particular column '''
    
    if year<2013:  year='Unitary charge payment {0}-{1} (£m)'.format(year,year-1999)
    else: year='Estimated unitary charge payment {0}-{1} (£m)'.format(year,year-1999)

    print('{0} [total] for the {1} {2} is £{3}m across {4} contracts.'.format( 
            year,
            item,
            column,
            df[df[column]==item][year].sum().round(2),
            df[df[column]==item][year].count()
            ))


spendBy_X(df,'Procuring authority', 'Isle of Wight')
spendBy_X(df,'Department', 'DEPARTMENT FOR EDUCATION (GROUP)')
spendBy_X(df,'Sector', 'Emergency Services',2012)

#Generate a view of the data that just has the core and date related columns
dfTime=df[corecols+datecols]

#Generate a long version of this data by melting the date related columns
dfTimeLong=pd.melt(dfTime, id_vars=corecols, value_vars= datecols )

#Now we'll do some tidying, using a differnt method to the previous one to ensure we have a numeric datatype for the value
#Turn the values to numerics
dfTimeLong['value']=dfTimeLong['value'].convert_objects(convert_numeric=True)
#Get rid of any non-numeric values
dfTimeLong=dfTimeLong.dropna(subset=['value'])
#Get rid of any zero values
dfTimeLong=dfTimeLong[dfTimeLong['value']!=0]

#Preview the data - note how the variable column identifies the year, and the value column the payment for that year
dfTimeLong[:3]

#Preview data for a particular project
dfTimeLong[dfTimeLong['Unique HMT Project ID']==523]

#Let's do some more tidying - pull out the start year that a value relates to
dfTimeLong['year']=dfTimeLong['variable'].apply(lambda x: x.split('payment')[1].split('-')[0])
dfTimeLong['year']=dfTimeLong['year'].astype(int)

#Identify whether the value was an actual payment or an estimated one
dfTimeLong['typ']=dfTimeLong['variable'].apply(lambda x: 'estimated' if x.startswith('Estimated') else 'actual')
dfTimeLong[:3]

totalexposure=dfTimeLong[['year','value']].groupby(['year']).sum()
totalexposure.reset_index(inplace=True)
totalexposure[:3]

ggplot(totalexposure,aes(x='year',y='value')) \
    + geom_line() \
    + ggtitle('Total UK PFI estimated spend') \
    + ylab('Spend £m')

#What's the exposure of a particular procuring authority to contracted PFI spend each year?
exposure=dfTimeLong[['Procuring authority','year','value']].groupby(['Procuring authority','year']).sum()
exposure.reset_index(inplace=True)
exposure[:3]

#Have a peek at the data for a particular authority
ggplot(exposure[exposure['Procuring authority']=='Manchester'],aes(x='year',y='value'))+geom_line()

#Let's bundle that up into a function
def procAuthExposureChart(_df,procAuth):
    g=ggplot(_df[_df['Procuring authority']==procAuth],aes(x='year',y='value')) \
        + geom_line() + ylab('(Epected) Unitary Charge Payment (£m)') \
        + ggtitle(procAuth)
    return g

procAuthExposureChart(exposure,'Isle of Wight')

groupingrows= ['Unique HMT Project ID','Project Name','Procuring authority','year']
projexposure=dfTimeLong[groupingrows+['value']].groupby(groupingrows).sum()
projexposure.reset_index(inplace=True)
projexposure[:3]

#If assign the colour aeshetic to the project, we can chart each project with a separately coloured line
ggplot(projexposure[projexposure['Procuring authority']=='Manchester'],aes(x='year',y='value',colour='Project Name')) \
    + geom_line()

ggplot(projexposure[projexposure['Procuring authority']=='Manchester'],aes(x='year',y='value')) \
    + geom_line() \
    + facet_wrap('Project Name', scales = "fixed")

#Grab the row for the Temple Primary School project and melt the date columns, retaining years where a value is set
pd.melt(df[df['Project Name']=='Temple Primary School'][datecols]).dropna()

#Let projcodeTotal represent summed total estimated unitary values for each project
projcodeTotal=projexposure[['Project Name','value']].groupby('Project Name').sum()
projcodeTotal.reset_index(inplace=True)
projcodeTotal.sort('value',ascending=False)[:10]

projexposure[['Procuring authority','value']].groupby('Procuring authority').sum().sort('value',ascending=False)[:10]

#Create a view of the data over some core data for each project and the summed unitary spend for each project
projReview=pd.merge(df[['Unique HMT Project ID','Project Name','Capital Value (£m)','Operational period of contract (years)']],
                    projcodeTotal, on='Project Name')

#Make sure we're dealing with numbers...
projReview['Capital Value (£m)']=projReview['Capital Value (£m)'].convert_objects(convert_numeric=True)

#Let's start to generate some comparative metrics
#I'm not sure if these are in any way meaningful, but they may reveal some sort of signal...
#...if we can work out how to interpet them?!
#First, the extent to which the summed total unitary value is a multiple of the capital amount
projReview['multiplier']=projReview['value'].divide(projReview['Capital Value (£m)'])
#Secondly, the difference betweem the summed total unitary value is a multiple of the capital amount
projReview['excess']= projReview['value']-projReview['Capital Value (£m)']
#Thirdly, that difference on average over the operational period of the contract
projReview['avexcess']=projReview['excess']/projReview['Operational period of contract (years)']

#Now let's start to make some rankings...
#First, let's rank according to the multiplier...
projReview.sort('multiplier',ascending=False)[:5]

def interpreter1(_df):
    #txt='The {0} project has capital value £{1}m and estimated total unitary spend  of £{2}m, ({3} times more than the capital amount),  with an operational contract period of {4} years.'.format(_df['Project Name'],_df['Capital Value (£m)'], _df['value'], _df['multiplier'])
    txt=''
    if _df['Capital Value (£m)']!=0:
        txt='''The {0} project has capital value £{1}m and estimated total unitary spend of £{2}m, \
({3} times more than the capital amount), with an operational contract period of {4} years.\n'''.format(_df['Project Name'],
                           round(_df['Capital Value (£m)'],2),
                           round(_df['value'],2),
                           round(_df['multiplier'],2),
                           _df['Operational period of contract (years)'])
    print(txt)
    
projReview.sort('multiplier',ascending=False)[:5].apply(interpreter1,axis=1 )

projReview.sort('excess',ascending=False)[:5]

projReview.dropna(subset=['excess']).sort('excess')[:15]

projReview.sort('avexcess',ascending=False)[:5]

equitycols

#Use the equity cols list to filter the columns we want, keying each row by project ID
df[['Unique HMT Project ID']+equitycols][:2]

#Create a dummy dataframe to hold the long format data
dfequity=pd.DataFrame()
#There are six possible equity partner column groupings
for i in range(1, 6):
    #Calculate the index of the left most column in the current grouping
    lower=(i-1)*3
    #Generate a data frame from the project id and the three columns in the current equity grouping
    tmp=df[['Unique HMT Project ID','Project Name','SPV name','SPV company number']+equitycols][[0,1,2,3]+np.arange(lower+4,lower+6).tolist()]
    #Rename the columns
    tmp.columns=['Unique HMT Project ID','Project Name','SPV name','SPV company number','equityName','equityShare']
    #make sure the equity chart converts to a numeric
    tmp['equityShare']=tmp['equityShare'].convert_objects(convert_numeric=True)
    #Only keep rows where we have an equity share recorded
    tmp=tmp.dropna(subset=['equityShare'])
    #Add the data from the current equity grouping to the growing long format dataframe
    dfequity=pd.concat([dfequity,tmp])

#Let's see what our dataframe looks like
dfequity[:3]

def showPartner(partner):
    txt='\n * {0} ({1}% equity share)'.format(partner[1]['equityName'],100*partner[1]['equityShare'])
    return txt

def equityPartnersInProject(_df, project,typ='name',endstr='\n'):
    if typ=='name':pp=_df[_df['Project Name']==project]
    elif typ=='id':pp=_df[_df['Unique HMT Project ID']==project]
    else: return
    
    txt='The {0} project (project id: {1}) has {2} equity partner(s):'.format(pp['Project Name'].iloc[0],pp['Unique HMT Project ID'].iloc[0],len(pp))
    for partner in pp.iterrows():
        txt+=showPartner(partner)
    txt+=endstr
    print(txt)

equityPartnersInProject(dfequity, 'Lambeth Street Lighting')

def equityPartnersBySPV(_df,spv,typ='name',endstr='\n'):
    if typ=='name':pp=_df[_df['SPV name'].notnull & _df['SPV name'].str.contains(spv)]
    elif typ=='id':pp=_df[_df['SPV company number']==spv]
    else: return
    
    if len(pp)==0:
        txt='No SPV with that {0} found.'.format(typ)
    else:
        txt='The {0} SPV has {1} equity partner(s):'.format(pp['SPV name'].iloc[0],len(pp))
    for partner in pp.iterrows():
        txt+=showPartner(partner)
    txt+=endstr
    print(txt)

equityPartnersBySPV(dfequity,'Hounslow')
equityPartnersBySPV(dfequity,'Island Roads')

def showPartnerInProj(partner):
    txt='\n * {0} ({1}% equity share in {2} (project id: {3}), SPV: {4})'.format(partner[1]['equityName'],100*partner[1]['equityShare'],
                                                     partner[1]['Project Name'],partner[1]['Unique HMT Project ID'],
                                                     partner[1]['SPV name'])
    return txt

def equitySearch(_df,equity,endstr='\n'):
    pp=_df[_df['equityName'].str.contains(equity)]
    
    if len(pp)==0:
        txt='No equity partner with that name found.'
    else:
        txt='That equity is associated with:'
    spv=[]
    for partner in pp.iterrows():
        txt+=showPartnerInProj(partner)
        if partner[1]['SPV name'] not in spv: spv.append(partner[1]['SPV name'])
    txt+=','.join(spv)+endstr
    print(txt)

equitySearch(dfequity,'G4S')

projexposure[:3]

projexco=pd.merge(dfequity,projexposure,on=['Unique HMT Project ID','Project Name'])
projexco[:3]

#Do any rows have equity share sums  > 100% ?
test=projexco[['Unique HMT Project ID','year','equityShare']].groupby(['Unique HMT Project ID','year']).sum().sort(ascending=False)
test.reset_index(inplace=True)
#May get floating point errors
test[test['equityShare']>1.00000000000001]['Unique HMT Project ID'].unique()

equityPartnersInProject(dfequity, 853,typ='id')
equityPartnersInProject(dfequity, 8,typ='id')

projexco['valueShare']=projexco['value']*projexco['equityShare']
projexco.reset_index(inplace=True)
projexco[123:129]

projexcoAnnual=projexco[['equityName','year','valueShare']].groupby(['equityName','year']).sum()
projexcoAnnual.reset_index(inplace=True)

projexcoAnnual[projexcoAnnual['year']==2014].sort('valueShare',ascending=False)[:10]

projexco[(projexco['equityName'].str.contains('Trillium')) & (projexco['year']==2014)]

trillium=projexco[(projexco['equityName'].str.contains('Trillium')) & (projexco['year']==2014)]
for project in trillium['Project Name']:
    equityPartnersInProject(dfequity, project)
    projReview[projReview['Project Name']==project].apply(interpreter1,axis=1 )

def projDesc(_df,startstr='',endstr='\n'):
    if startstr=='': txt='The {0} project'.format(_df['Project Name'])
    else: txt=startstr
    if _df['Capital Value (£m)']!=0:
        txt+='''has capital value £{0}m and estimated total unitary spend of £{1}m, \
({2} times more than the capital amount), with an operational contract period of {3} years.'''.format(
                           round(_df['Capital Value (£m)'],2),
                           round(_df['value'],2),
                           round(_df['multiplier'],2),
                           _df['Operational period of contract (years)'])
    txt+=endstr
    print(txt)
    

for project in trillium['Project Name'][:1]:
    equityPartnersInProject(dfequity, project)
    projDesc(projReview[projReview['Project Name']==project].iloc[0],'It ')

def projReportbyPartnerAndYear(partner,year):
    #Note that the search is case sensitive...
    projects=projexco[(projexco['equityName'].str.contains(partner)) & (projexco['year']==year)]
    for project in projects['Project Name']:
        equityPartnersInProject(dfequity, project,endstr='')
        projDesc(projReview[projReview['Project Name']==project].iloc[0],'It ')

projReportbyPartnerAndYear('G4S',2015)

projexco[(projexco['equityName'].str.contains('Sodexo')) & (projexco['year']==2014)]

#Generate a data view containing rows where the name of the equity partner includes the word Sodexo
sodexo=projexco[(projexco['equityName'].str.contains('Sodexo'))][['equityName','year','valueShare']]
#For each name, generate the sum total of value shares going to that partner each year
sodexo=sodexo.groupby(['equityName','year']).sum()
sodexo.reset_index(inplace=True)

ggplot(sodexo,aes(x='year',y='valueShare')) \
    + geom_line() \
    + facet_wrap('equityName',scales='fixed') \
    + ggtitle('Sodexo')

#Hmm - two takes on Sodexo? How come?
sodexo['equityName'].unique()