#Load in some libraries to help us work with the data import pandas as pd import networkx as nx #Data source - download the PFI summary data to the data/ directory #wget https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/267640/pfi_current_projects_list_march_2013_published_dec_2013.xlsx -P data #Import the data we've just downloaded - by insprection, the first row is ignorable df=pd.read_excel('data/pfi_current_projects_list_march_2013_published_dec_2013.xlsx',skiprows=1) df[:3] #Look to see if there are any obvious 'not known' style beacons in the SPV company number column df['SPV company number'].value_counts() #To make life easier, produce a view over the data that contains things that might have SPV company numbers available spv=df.dropna(subset=['SPV company number']) spv=spv[~spv['SPV company number'].isin(['N/K',0,'0','not known'])] #Preview the first few rows spv[:3] #We're going to grab some JSON data from the OpenCorporates API - so load in some helper libraries import requests, json def getOCdata(cid): ''' Get company data from OpenCorporates given OpenCorporates id ''' apiurl='https://api.opencorporates.com/v0.3' r=requests.get(apiurl+cid) return json.loads(r.text)['results'] def getOCdataFromCompanyNumber(cid,territory='gb'): return getOCdata('/'.join(['/companies',territory,cid])) def companyReport(ocdata): c=ocdata['company'] txt='{0} (company number: {1}) of {2}, is an {3} company originally incorporated on {4}.'.format(c['name'],c['company_number'], c['registered_address_in_full'],c['current_status'],c['incorporation_date']) print(txt) pyramid=getOCdataFromCompanyNumber('5232548') pyramid companyReport(pyramid) #We can tunnel into the OpenCorporates data to get directors information def getCurrentOfficers(ocd,role=[]): ''' Get current officers from OpenCorporates company data ''' curr=[] for o in ocd['company']['officers']: if o['officer']['end_date']==None: if role!=[]: if o['officer']['position'] in role: curr.append(o['officer']) else: curr.append(o['officer']) return curr #getCurrentOfficers(ocd) pyramiddf=pd.DataFrame.from_dict(getCurrentOfficers(pyramid,'director')) pyramiddf def directorsSummary(cdata): cd=pd.DataFrame.from_dict(getCurrentOfficers(cdata,'director')) txt= '{0} currently has {1} active directors ({2}).'.format(cdata['company']['name'],len(cd),', '.join(cd['name'])) print(txt) directorsSummary(pyramid) APISTUB='http://api.opencorporates.com/v0.3' #To make calling the OpenCorporates API easier, we can register an account and get an API key #I store my key in a key file and load it in as required def getKey(): json_data=open('data/keys.json') return json.load(json_data)['ockey'] ockey=getKey() def deslash(x): return x.strip('/') def signed(url,sig=True): if sig: return url+'?api_token='+ockey return url def occStrip(ocURL): return deslash(ocURL.replace('http://opencorporates.com/companies','')) #Old code - needs revising to use requests library more idiomatically def buildURL(items): url=APISTUB for i in items: url='/'.join([url,deslash(i)]) return signed(url) def getOCofficerData(ocid): ''' Function to get data from the OpenCorporates API about a director, by director ID''' ocurl=buildURL(['officers',ocid]) r=requests.get(ocurl) ocdata= json.loads(r.text) return ocdata['results'] getOCofficerData('97067020') required=['director'] def getOCofficerCompaniesSearch(name,page=1,cidnames=pd.DataFrame(columns=['oname','ocurl','cname'])): url=APISTUB+'/officers/search' r=requests.get(url,params={'api_token':getKey(),'q':'"'+name+'"','per_page':100,'page':page}) #print(r.url) ocdata= json.loads(r.text)['results'] for officer in ocdata['officers']: if name==officer['officer']['name']: if required!=[] and officer['officer']['position']in required: #print 'Possible new company for',name,officer['officer']['company']['name'] #would a nominated secretary be interesting to search on? eg FIRST SECRETARIES LIMITED #cidnames.append( ( occStrip(officer['officer']['company']['opencorporates_url']), occStrip(officer['officer']['company']['name']) ) ) cidnames=pd.concat([cidnames, pd.DataFrame([(name,occStrip(officer['officer']['company']['opencorporates_url']), occStrip(officer['officer']['company']['name']))],columns=['oname','ocurl','cname'])]) if page < ocdata['total_pages']: #If there are lots of pages, eg for a nominee, we maybe need to warn the user....? page=page+1 cidnames=getOCofficerCompaniesSearch(name,page,cidnames) #http://api.opencorporates.com/v0.2/officers/search?q=john+smith return cidnames on=pd.DataFrame() for name in pyramiddf['name']: on=pd.concat( [on, getOCofficerCompaniesSearch(name) ]) tmp=on.groupby(['cname','ocurl']).oname.nunique().order(ascending=False) tmp tmp[tmp==len(pyramiddf)].reset_index() def voodooMagic(cnum): cdata=getOCdataFromCompanyNumber(cnum) companyReport(cdata) directorsSummary(cdata) directors=pd.DataFrame.from_dict(getCurrentOfficers(cdata,'director')) directorCos=pd.DataFrame() for name in directors['name']: directorCos=pd.concat( [directorCos, getOCofficerCompaniesSearch(name) ]) sharedCos=directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False) sharedCos=sharedCos[sharedCos==len(directors)].reset_index() return directors,sharedCos,directorCos directors,sharedCos,directoCos=voodooMagic('5232548') print(directors) print(sharedCos) directors,sharedCos,directorCos=voodooMagic('4934390') print(directors) print(sharedCos) directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)[:15] directors,sharedCos,directorCos=voodooMagic('07417700') print(directors) print(sharedCos) directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)[:15] directors,sharedCos,directorCos=voodooMagic('8169805') print(directors) print(sharedCos) directorCos.groupby(['cname','ocurl']).oname.nunique().order(ascending=False)[:15]