#Load in some libraries to help us work with the data
import pandas as pd
import networkx as nx

#Data source - download the PFI summary data to the data/ directory
#wget https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/267640/pfi_current_projects_list_march_2013_published_dec_2013.xlsx -P data

#Import the data we've just downloaded - by insprection, the first row is ignorable
df=pd.read_excel('data/pfi_current_projects_list_march_2013_published_dec_2013.xlsx',skiprows=1)

df[:3]

#Look to see if there are any obvious 'not known' style beacons in the SPV company number column
df['SPV company number'].value_counts()

#To make life easier, produce a view over the data that contains things that might have SPV company numbers available
spv=df.dropna(subset=['SPV company number'])
spv=spv[~spv['SPV company number'].isin(['N/K',0,'0','not known'])]

#Preview the first few rows
spv[:3]

#We're going to grab some JSON data from the OpenCorporates API - so load in some helper libraries
import requests, json

def getOCdata(cid):
    ''' Get company data from OpenCorporates given OpenCorporates id '''
    
    apiurl='https://api.opencorporates.com/v0.3'
    r=requests.get(apiurl+cid)
    return json.loads(r.text)['results']

def getOCdataFromCompanyNumber(cid,territory='gb'):
    return getOCdata('/'.join(['/companies',territory,cid]))

def companyReport(ocdata):
    c=ocdata['company']
    txt='{0} (company number: {1}) of {2}, is an {3} company originally incorporated on {4}.'.format(c['name'],c['company_number'], c['registered_address_in_full'],c['current_status'],c['incorporation_date'])
    print(txt)

pyramid=getOCdataFromCompanyNumber('5232548')
pyramid

companyReport(pyramid)

#We can tunnel into the OpenCorporates data to get directors information
def getCurrentOfficers(ocd,role=[]):
    ''' Get current officers from OpenCorporates company data '''
    
    curr=[]
    for o in ocd['company']['officers']:
        if o['officer']['end_date']==None:
            if role!=[]:
                if o['officer']['position'] in role:
                    curr.append(o['officer'])
            else: curr.append(o['officer'])
    return curr
#getCurrentOfficers(ocd)

pyramiddf=pd.DataFrame.from_dict(getCurrentOfficers(pyramid,'director'))
pyramiddf

def directorsSummary(cdata):
    cd=pd.DataFrame.from_dict(getCurrentOfficers(cdata,'director'))
    txt= '{0} currently has {1} active directors ({2}).'.format(cdata['company']['name'],len(cd),', '.join(cd['name']))
    print(txt)

directorsSummary(pyramid)

APISTUB='http://api.opencorporates.com/v0.3'

#To make calling the OpenCorporates API easier, we can register an account and get an API key
#I store my key in a key file and load it in as required
def getKey():
    json_data=open('data/keys.json')
    return json.load(json_data)['ockey']
ockey=getKey()

def deslash(x): return x.strip('/')
def signed(url,sig=True):
    if sig: return url+'?api_token='+ockey
    return url
 
def occStrip(ocURL):
    return deslash(ocURL.replace('http://opencorporates.com/companies',''))
 
    
#Old code - needs revising to use requests library more idiomatically
def buildURL(items):
    url=APISTUB
    for i in items:
        url='/'.join([url,deslash(i)])
    return signed(url)


def getOCofficerData(ocid):
    ''' Function to get data from the OpenCorporates API about a director, by director ID''' 
    
    ocurl=buildURL(['officers',ocid])
    r=requests.get(ocurl)
    ocdata= json.loads(r.text)
    return ocdata['results']
    

getOCofficerData('97067020')

required=['director']
def getOCofficerCompaniesSearch(name,page=1,cidnames=pd.DataFrame(columns=['oname','ocurl','cname'])):
    url=APISTUB+'/officers/search'
    r=requests.get(url,params={'api_token':getKey(),'q':'"'+name+'"','per_page':100,'page':page})
    #print(r.url)
    ocdata= json.loads(r.text)['results']
    for officer in ocdata['officers']:
        if name==officer['officer']['name']:
            if required!=[] and officer['officer']['position']in required:
                #print 'Possible new company for',name,officer['officer']['company']['name']
                #would a nominated secretary be interesting to search on? eg FIRST SECRETARIES LIMITED
                #cidnames.append( ( occStrip(officer['officer']['company']['opencorporates_url']), occStrip(officer['officer']['company']['name']) ) )
                cidnames=pd.concat([cidnames, pd.DataFrame([(name,occStrip(officer['officer']['company']['opencorporates_url']), occStrip(officer['officer']['company']['name']))],columns=['oname','ocurl','cname'])])
    if page < ocdata['total_pages']:
        #If there are lots of pages, eg for a nominee, we maybe need to warn the user....?
        page=page+1
        cidnames=getOCofficerCompaniesSearch(name,page,cidnames)
    #http://api.opencorporates.com/v0.2/officers/search?q=john+smith
    return cidnames

on=pd.DataFrame()
for name in pyramiddf['name']:
    on=pd.concat( [on, getOCofficerCompaniesSearch(name) ])
    
tmp=on.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)
tmp

tmp[tmp==len(pyramiddf)].reset_index()

def voodooMagic(cnum):
    cdata=getOCdataFromCompanyNumber(cnum)
    companyReport(cdata)
    directorsSummary(cdata)
    directors=pd.DataFrame.from_dict(getCurrentOfficers(cdata,'director'))
    directorCos=pd.DataFrame()
    for name in directors['name']:
        directorCos=pd.concat( [directorCos, getOCofficerCompaniesSearch(name) ])

    sharedCos=directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)
    sharedCos=sharedCos[sharedCos==len(directors)].reset_index()
    return directors,sharedCos,directorCos

directors,sharedCos,directoCos=voodooMagic('5232548')
print(directors)
print(sharedCos)

directors,sharedCos,directorCos=voodooMagic('4934390')
print(directors)
print(sharedCos)
directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)[:15]

directors,sharedCos,directorCos=voodooMagic('07417700')
print(directors)
print(sharedCos)
directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)[:15]

directors,sharedCos,directorCos=voodooMagic('8169805')
print(directors)
print(sharedCos)
directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)[:15]