In [1]:
"""
conda install pandas numpy seaborn folium basemap
pip install ckanapi cufflinks
"""
%matplotlib nbagg
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import matplotlib.ticker as tick
from mpl_toolkits.basemap import Basemap
import folium

import pandas as pd

from ckanapi import RemoteCKAN

import cufflinks
import seaborn


graph_figsize = (10,6)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

Data Science with Python

AKA How am I expected to follow that?

Andrew Bolster

Points of Order

Soz Colin, Python's older

In February 1991, van Rossum published the code (labeled version 0.9.0) to alt.sources - Wikipedia

Python reached version 1.0 in January 1994. - Wikipedia

PS Don't even think about using anything < Python 3.5 for new projects, it's awesome

Python and R share a massive amount

  • including a ggplot mapping
  • pandas in particular basically lifted the DataFrame structure
In [2]:
# Pandas does reads
df = pd.read_csv("https://www.jumpingrivers.com/data/movie.txt") # It Grap
display(df.head())
Title Year Length Budget Rating Votes r1 r2 r3 r4 ... r9 r10 mpaa Action Animation Comedy Drama Documentary Romance Short
0 A.k.a. Cassius Clay 1970 85 -1.0 5.7 43 4.5 0.0 4.5 14.5 ... 4.5 14.5 PG 0 0 0 0 1 0 0
1 AKA 2002 123 -1.0 6.0 335 24.5 4.5 4.5 4.5 ... 4.5 14.5 R 0 0 0 1 0 0 0
2 AVP: Alien Vs. Predator 2004 102 45000000.0 5.4 14651 4.5 4.5 4.5 4.5 ... 4.5 4.5 PG-13 1 0 0 0 0 0 0
3 Abandon 2002 99 25000000.0 4.7 2364 4.5 4.5 4.5 14.5 ... 4.5 4.5 PG-13 0 0 0 1 0 0 0
4 Abendland 1999 146 -1.0 5.0 46 14.5 4.5 4.5 4.5 ... 4.5 24.5 R 0 0 0 0 0 0 0

5 rows × 24 columns

In [3]:
df.Rating.mean() # it means
Out[3]:
5.522715081493712
In [4]:
f,ax = plt.subplots()
df.Rating.plot.hist(ax=ax) # it histz
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f06c9e3c908>
In [5]:
df.describe() # it describz
Out[5]:
Year Length Budget Rating Votes r1 r2 r3 r4 r5 ... r8 r9 r10 Action Animation Comedy Drama Documentary Romance Short
count 4847.000000 4847.000000 4.847000e+03 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 ... 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000 4847.000000
mean 1998.302249 100.878069 1.028689e+07 5.522715 4013.529606 9.599443 5.912214 6.249845 7.177326 9.592016 ... 12.715081 8.000722 13.317103 0.191252 0.013823 0.335053 0.487105 0.024758 0.152465 0.003301
std 6.110453 17.341517 2.303971e+07 1.451864 10620.208393 10.117409 4.328182 4.571213 4.977764 5.999638 ... 8.393503 6.236248 10.379292 0.393328 0.116768 0.472058 0.499885 0.155402 0.359509 0.057365
min 1934.000000 2.000000 -1.000000e+00 1.000000 5.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1997.000000 90.000000 -1.000000e+00 4.600000 111.000000 4.500000 4.500000 4.500000 4.500000 4.500000 ... 4.500000 4.500000 4.500000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1999.000000 97.000000 -1.000000e+00 5.700000 504.000000 4.500000 4.500000 4.500000 4.500000 4.500000 ... 14.500000 4.500000 14.500000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 2002.000000 107.000000 8.000000e+06 6.600000 2991.500000 14.500000 4.500000 4.500000 14.500000 14.500000 ... 14.500000 14.500000 14.500000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000
max 2005.000000 251.000000 2.000000e+08 9.100000 157608.000000 84.500000 44.500000 34.500000 44.500000 45.500000 ... 45.500000 45.500000 74.500000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 22 columns

  • pip -> cran
  • 114304 packages with tests, documentation, and binary bundles (wheels so if you don't want to build, you can bin)
  • the anaconda distribution is even nicer and my preferred versionn

Zee Plan

  • (Honorary Mention) CKANApi
  • Pandas
  • Seaborn
  • Patsy (If I don't run massively over time) I ran massively over time and got distracted by
  • Cufflinks
  • Basemap

Pandas

  • Data Import / Export (Not really shown but ask me about it)
  • Actually Decent Datetime operation
  • Boiled in Stats and Grouping

Seaborn

  • Pretty Pretty Graphs
  • Fantastic "I have no idea what I'm looking at" explorations

Patsy

  • R-like statistical modelling building on statsmodels and pandas
  • IANASIJP1OTV - I Am Not A Statisician I Just Play 1 On TV
  • "Prettier, Interactive, Seaborn" - Me 2016

Basemap

  • Everything looks better on a map

ckanapi

  • Wrapper to talk to CKAN datasets 'easily'
  • I discovered this 3 days ago so have only scratched the surface...

Getting a list of dataset groups

In [6]:
ua = 'ItsBolster/29.5 (+http://farsetlab.org.uk/)'
demo = RemoteCKAN('https://www.opendatani.gov.uk/', user_agent=ua)
groups = demo.action.group_list(id='data-explorer')
print(groups)
['economy', 'education', 'environment', 'finance', 'health', 'population', 'property', 'tourism', 'transport']

Getting stats on Datasets per group

In [7]:
group_data = demo.action.group_list(id='data-explorer', all_fields=True)
group_data
Out[7]:
[{'approval_status': 'approved',
  'created': '2015-07-28T16:24:55.207729',
  'description': '',
  'display_name': 'Economy, industry & employment',
  'id': '39dae6c3-e452-4eed-bee9-4dd056d6a8de',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-160744.768275economy.png',
  'image_url': '2015-11-20-160744.768275economy.png',
  'is_organization': False,
  'name': 'economy',
  'num_followers': 0,
  'package_count': 35,
  'revision_id': 'bfc12df7-e28e-4be5-8879-ad2d9cd459a7',
  'state': 'active',
  'title': 'Economy, industry & employment',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-07-28T16:24:53.911198',
  'description': '',
  'display_name': 'Education',
  'id': '040dacc9-7871-433e-ada9-646f086832cf',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162541.954816education.png',
  'image_url': '2015-11-20-162541.954816education.png',
  'is_organization': False,
  'name': 'education',
  'num_followers': 0,
  'package_count': 23,
  'revision_id': 'e747dc9a-3ee8-4ee5-adfc-dbe8740fd0ed',
  'state': 'active',
  'title': 'Education',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-07-28T16:24:54.727141',
  'description': '',
  'display_name': 'Environment & agriculture',
  'id': '53255e5c-2c7e-4792-a0f6-d4ede189a614',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162610.730369environment.png',
  'image_url': '2015-11-20-162610.730369environment.png',
  'is_organization': False,
  'name': 'environment',
  'num_followers': 0,
  'package_count': 62,
  'revision_id': 'fff7a193-d2ab-4cf6-96d9-2d8e23b5c169',
  'state': 'active',
  'title': 'Environment & agriculture',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-07-28T16:24:54.188726',
  'description': '',
  'display_name': 'Finance',
  'id': '2940449f-07d9-4c02-9a49-d45982b97155',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162631.313020finance.png',
  'image_url': '2015-11-20-162631.313020finance.png',
  'is_organization': False,
  'name': 'finance',
  'num_followers': 0,
  'package_count': 34,
  'revision_id': 'bae4b69a-3aa3-439e-a860-92d72e07be2b',
  'state': 'active',
  'title': 'Finance',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-07-28T16:24:53.601881',
  'description': '',
  'display_name': 'Health',
  'id': 'ec045f21-9af2-4bc0-aa04-e6e3df296590',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162619.537575health.png',
  'image_url': '2015-11-20-162619.537575health.png',
  'is_organization': False,
  'name': 'health',
  'num_followers': 0,
  'package_count': 37,
  'revision_id': 'b5f6e4df-ab0b-4ce1-82f3-4b0215df0557',
  'state': 'active',
  'title': 'Health',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-07-28T16:24:55.720122',
  'description': '',
  'display_name': 'Population & society',
  'id': 'e7198ce2-5ef0-4b2a-9a2e-ece09b59c6e7',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162646.501471population.png',
  'image_url': '2015-11-20-162646.501471population.png',
  'is_organization': False,
  'name': 'population',
  'num_followers': 0,
  'package_count': 77,
  'revision_id': '0fa9ebfe-1df7-4345-bd72-415d9b7b2861',
  'state': 'active',
  'title': 'Population & society',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-07-28T16:24:54.982063',
  'description': '',
  'display_name': 'Property & land',
  'id': '26561b26-fbf2-47ea-8da8-b2edf32fd514',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162714.698062property.png',
  'image_url': '2015-11-20-162714.698062property.png',
  'is_organization': False,
  'name': 'property',
  'num_followers': 0,
  'package_count': 90,
  'revision_id': 'ea0cb342-609e-4266-bd1f-9ca5b2c0bf88',
  'state': 'active',
  'title': 'Property & land',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-07-28T16:24:55.473908',
  'description': '',
  'display_name': 'Tourism, leisure, culture & arts',
  'id': '2806dcbc-26fe-48f0-8e0f-40b4eb38d42c',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162704.641811tourism.png',
  'image_url': '2015-11-20-162704.641811tourism.png',
  'is_organization': False,
  'name': 'tourism',
  'num_followers': 0,
  'package_count': 27,
  'revision_id': '1a5ec38c-085a-41d5-8719-257f24e18d1d',
  'state': 'active',
  'title': 'Tourism, leisure, culture & arts',
  'type': 'group'},
 {'approval_status': 'approved',
  'created': '2015-05-11T11:52:50.764482',
  'description': '',
  'display_name': 'Transport',
  'id': '3efef411-492a-4ccc-830d-f7ab351698e5',
  'image_display_url': 'https://www.opendatani.gov.uk/uploads/group/2015-11-20-162726.184495transport.png',
  'image_url': '2015-11-20-162726.184495transport.png',
  'is_organization': False,
  'name': 'transport',
  'num_followers': 0,
  'package_count': 44,
  'revision_id': 'f5c647ca-e6e4-4c89-8888-d6fc9d44c674',
  'state': 'active',
  'title': 'Transport',
  'type': 'group'}]

JSON is ugly as sin pandas can help with that

In [8]:
pd.DataFrame(group_data).head()
Out[8]:
approval_status created description display_name id image_display_url image_url is_organization name num_followers package_count revision_id state title type
0 approved 2015-07-28T16:24:55.207729 Economy, industry & employment 39dae6c3-e452-4eed-bee9-4dd056d6a8de https://www.opendatani.gov.uk/uploads/group/20... 2015-11-20-160744.768275economy.png False economy 0 35 bfc12df7-e28e-4be5-8879-ad2d9cd459a7 active Economy, industry & employment group
1 approved 2015-07-28T16:24:53.911198 Education 040dacc9-7871-433e-ada9-646f086832cf https://www.opendatani.gov.uk/uploads/group/20... 2015-11-20-162541.954816education.png False education 0 23 e747dc9a-3ee8-4ee5-adfc-dbe8740fd0ed active Education group
2 approved 2015-07-28T16:24:54.727141 Environment & agriculture 53255e5c-2c7e-4792-a0f6-d4ede189a614 https://www.opendatani.gov.uk/uploads/group/20... 2015-11-20-162610.730369environment.png False environment 0 62 fff7a193-d2ab-4cf6-96d9-2d8e23b5c169 active Environment & agriculture group
3 approved 2015-07-28T16:24:54.188726 Finance 2940449f-07d9-4c02-9a49-d45982b97155 https://www.opendatani.gov.uk/uploads/group/20... 2015-11-20-162631.313020finance.png False finance 0 34 bae4b69a-3aa3-439e-a860-92d72e07be2b active Finance group
4 approved 2015-07-28T16:24:53.601881 Health ec045f21-9af2-4bc0-aa04-e6e3df296590 https://www.opendatani.gov.uk/uploads/group/20... 2015-11-20-162619.537575health.png False health 0 37 b5f6e4df-ab0b-4ce1-82f3-4b0215df0557 active Health group
In [9]:
pd.DataFrame(group_data).keys()
Out[9]:
Index(['approval_status', 'created', 'description', 'display_name', 'id',
       'image_display_url', 'image_url', 'is_organization', 'name',
       'num_followers', 'package_count', 'revision_id', 'state', 'title',
       'type'],
      dtype='object')

Basic Graphing with matplotlib and pandas

In [10]:
df = pd.DataFrame(group_data).set_index('title')['package_count'].sort_values()
df
Out[10]:
title
Education                           23
Tourism, leisure, culture & arts    27
Finance                             34
Economy, industry & employment      35
Health                              37
Transport                           44
Environment & agriculture           62
Population & society                77
Property & land                     90
Name: package_count, dtype: int64
In [11]:
f,ax = plt.subplots(figsize=graph_figsize)
_=df.plot.barh(ax=ax,rot=45, title='Number of Open Datasets by Category')

Acquiring Datastores from OpenDataNI directly

I've not explored the datastore query language in ckanapi so these are magic numbers taken from the OpenDataNI Website

In [12]:
# Contracts awarded by CPD Construction Division - 
construction_resource_id = '6cc96ec3-9ec5-426d-9e5d-f67a9423d0ab'
# Contracts awarded by CPD Supplies and Services 
supplies_resource_id = 'd5b993f2-4f6f-4e94-9f45-c77b98201438'

Lets start with the Construction Division (Smaller dataset (71))

Also: Fun Python Language Construct:

Generator Expressions

  • Lazy evaluation of iterables
In [13]:
def dataset_generator(resource_id): 
    """A Generator that yields records from a given dataset resource id"""
    offset=0
    while True:
        datastore_page = demo.action.datastore_search(resource_id=resource_id, offset=offset)
        if not datastore_page['records']:
            raise StopIteration
        for record in datastore_page['records']:
            yield record   ## Execution is passed back to the caller here
            offset+=1
In [14]:
df = pd.DataFrame.from_records(dataset_generator(construction_resource_id))
df.head()
Out[14]:
Awarded value Basis for DAC Award City Company name Contract Number Contract awarded by Date of Award Latitude Longitude Postal code Street Title _id
0 40976 N/A Belfast Atkins Ltd CPDC-100 Department of Agriculture Environment and Rura... 2016-04-26T00:00:00 54.6050193 -5.9093978 BT3 9DE 71 Old Channel Road DARD - Flood Risk Management Investment Strategy 1
1 529740 N/A Belfast McAdam Design Ltd CPDC-28 Department of Education NI 2016-04-27T00:00:00 54.5795073 -5.8760029 BT5 6BQ 1C Montgomery House 478 Castlereagh Road DENI - Appointment of Integrated Design Team f... 2
2 43914.7 N/A Belfast Atkins Ltd CPDC-68 Department of Agriculture Environment and Rura... 2016-12-05T00:00:00 54.6050193 -5.9093978 BT3 9DE 71 Old Channel Road DAERA - Killynure Armco Culvert Feasibility Study 3
3 17275 N/A Belfast AECOM Infrastructure & Environment UK Ltd CPDC-109 Department of Agriculture Environment and Rura... 2016-05-25T00:00:00 54.5479762 -5.9111546 BT8 7RP Beechill House 40 Beechill Road DAERA - Linen Green Moygashel Feasibility Study 4
4 110000 N/A Belfast Ove Arup & Partners International Ltd CPDC-45 Department of Agriculture Environment and Rura... 2016-05-25T00:00:00 54.5943264 -5.930001 BT2 8BG The Linenhall 32-38 Linenhall Street NI Fisheries and Harbour Authority - Framework... 5

How much?

ckanapi unfortunately doesn't do any data-type introspection so everything appears as strings initially

Easily fixed with pandas

In [15]:
# Floatify monies
df['Awarded value']=df['Awarded value'].astype(float)
df['Awarded value'].mean()
Out[15]:
643731.4957142854

Pandas has built in stats capability to easily describe columns

In [16]:
(df['Awarded value']/1000).describe() # NOTE in £k
Out[16]:
count       70.000000
mean       643.731496
std       1813.213222
min          1.896380
25%         29.296310
50%        177.180895
75%        397.129900
max      12642.728600
Name: Awarded value, dtype: float64
In [17]:
f,ax = plt.subplots(figsize=graph_figsize)
_=(df['Awarded value']/1000).plot.hist(ax=ax)
In [18]:
f,ax = plt.subplots(figsize=graph_figsize)
_=(df['Awarded value']/1000).plot.hist(ax=ax, logy=True)
In [19]:
f,ax = plt.subplots(figsize=graph_figsize)
_=(df['Awarded value']/1000).plot.hist(ax=ax, logy=True, cumulative=True)

Open Datasets (esp. Gov) notoriously bad at "Coding" entries, so end up with variations in:

  • Case (NI vs ni, Ltd vs LTD, WITHHELD vs Withheld)
  • Abbreviations (NI vs Northern Ireland)
In [20]:
# Note withheld, RPS Ireland, WYG
company_list = df['Company name'].unique()
print("There are {} unique company names".format(len(company_list)), sorted(company_list))
There are 27 unique company names ['AECOM Infrastructure & Environment', 'AECOM Infrastructure & Environment UK Ltd', 'AG Wilson Ltd', 'Active Maintenance Solutions Ltd', 'Atkins Ltd', 'Cleary Contracting Ltd', 'Doran Consulting Ltd', 'Doran HMK', 'Doran HMK JV', 'Fox Building & Engineering Ltd', 'Hamilton Architects', 'John McQuillan (Contracts) Ltd', 'Lowry Construction Ltd', 'McAdam Design', 'McAdam Design Ltd', 'McMackin Contracts Ltd', 'Ove Arup & Partners International Ltd', 'Quinn Automatic Ltd', 'RPS Ireland Ltd (t/a RPS Consulting Engineers)', 'RPS Ireland Ltd (t/a RPS Consulting engineers)', 'Sirius Geotechnical & Environmental Ltd', 'WITHHELD', 'WYG Management Services', 'WYG Management Services (NI) Ltd', 'WYG Management Services Ltd', 'Withheld', 'Woodvale Construction Company']

So we perform some (greedy, niave, probably broken) input cleaning

In [21]:
text_fields = ['City','Basis for DAC Award','Company name','Contract awarded by','Title','Street']
interesting_fields = None
for text_field in text_fields:
    df[text_field]=df[text_field].str.lower().str.strip() 
    df[text_field]=df[text_field].str.replace(' ltd','')
    df[text_field]=df[text_field].str.replace(' uk','')
    df[text_field]=df[text_field].str.replace(' \(ni\)','')
    
company_list = df['Company name'].unique()
print("There are now {} unique company names".format(len(company_list)), sorted(company_list))
There are now 21 unique company names ['active maintenance solutions', 'aecom infrastructure & environment', 'ag wilson', 'atkins', 'cleary contracting', 'doran consulting', 'doran hmk', 'doran hmk jv', 'fox building & engineering', 'hamilton architects', 'john mcquillan (contracts)', 'lowry construction', 'mcadam design', 'mcmackin contracts', 'ove arup & partners international', 'quinn automatic', 'rps ireland (t/a rps consulting engineers)', 'sirius geotechnical & environmental', 'withheld', 'woodvale construction company', 'wyg management services']

In some cases, there are columns you either don't care about or doesn't contain any meaningful data for analysis; so drop it

In [22]:
df['Basis for DAC Award'].describe()
Out[22]:
count      71
unique      1
top       n/a
freq       71
Name: Basis for DAC Award, dtype: object
In [23]:
df.drop('Basis for DAC Award', axis=1, inplace=True)

Who gave out what?

As well as buggering the coding for Company name, department names are also buggered with abbrevs. and inconsistencies

In [24]:
dept_list = df['Contract awarded by'].unique()
print("There are {} unique dept. names".format(len(dept_list)), sorted(dept_list))
There are 26 unique dept. names ['', 'daera', 'dard', 'de', 'deni', 'department for communities', 'department for infrastructure', 'department for infrastructure ni', 'department for the economy ni', 'department of agriculture environment and rural affairs ni', 'department of education', 'department of education ni', 'department of finance', 'department of health ni', 'department of justice ni courts & tribunals service', 'dfc', 'dfi', 'dof properties division', 'lisburn & castlereagh city council', 'ni policing board', 'nips', 'northern ireland office', 'northern ireland prison service', 'psni', 'teo', 'the executive office ni']
In [25]:
df['Contract awarded by'].str.endswith(' ni').mean()
Out[25]:
0.25352112676056338

How do you solve a problem like gov. depts.

  • Department names are long, ugly and inconsistent
  • Department abbreviations are gobbledegook and inconsistent too (DARD/DAERA)
  • Set "Optimistic" conversions between abbrevs. and depts.
In [26]:
df['Contract awarded by']=df['Contract awarded by'].str.replace(' ni','')
# Edge case for misues of DE/DENI
df.replace(to_replace={'Contract awarded by': {'deni':'de'}}, inplace=True)

department_replacements = {
    'department for communities': 'dfc',
    'department for infrastructure': 'dfi',
    'department for infrastructure transport': 'dfi',
    'department for employment and learning': 'dfe/dfc',
    'department for the economy': 'dfe',
    'department of agriculture environment and rural affairs': 'daera',
    'department of education': 'de',
    'department of finance': 'df',
    'department of health': 'dh',
    'department of justice': 'doj',
    'department of justice courts & tribunals service': 'doj',
    'intertrade ireland': 'iti',
    'invest northern ireland': 'ini',
    'northern ireland office': 'nio',
    'northern ireland prison service': 'nips',
    'ni public health agency': 'pha',
    'police service of northern ireland': 'psni',
    'the executive office': 'teo'
}
df.replace(to_replace={'Contract awarded by':department_replacements})['Contract awarded by'].unique()
Out[26]:
array(['daera', 'de', 'ni policing board', 'df', 'dfc', 'dfi', 'dh', 'nio',
       'dfe', 'teo', 'dof properties division', 'doj', '',
       'lisburn & castlereagh city council', 'nips', 'dard', 'psni'], dtype=object)
In [27]:
df.replace(to_replace={'Contract awarded by':department_replacements}, inplace=True)
df.head()
Out[27]:
Awarded value City Company name Contract Number Contract awarded by Date of Award Latitude Longitude Postal code Street Title _id
0 40976.0 belfast atkins CPDC-100 daera 2016-04-26T00:00:00 54.6050193 -5.9093978 BT3 9DE 71 old channel road dard - flood risk management investment strategy 1
1 529740.0 belfast mcadam design CPDC-28 de 2016-04-27T00:00:00 54.5795073 -5.8760029 BT5 6BQ 1c montgomery house 478 castlereagh road deni - appointment of integrated design team f... 2
2 43914.7 belfast atkins CPDC-68 daera 2016-12-05T00:00:00 54.6050193 -5.9093978 BT3 9DE 71 old channel road daera - killynure armco culvert feasibility study 3
3 17275.0 belfast aecom infrastructure & environment CPDC-109 daera 2016-05-25T00:00:00 54.5479762 -5.9111546 BT8 7RP beechill house 40 beechill road daera - linen green moygashel feasibility study 4
4 110000.0 belfast ove arup & partners international CPDC-45 daera 2016-05-25T00:00:00 54.5943264 -5.930001 BT2 8BG the linenhall 32-38 linenhall street ni fisheries and harbour authority - framework... 5
In [28]:
construction_df = df.copy()
construction_df['Division'] = 'construction'
In [29]:
construction_df.head()
Out[29]:
Awarded value City Company name Contract Number Contract awarded by Date of Award Latitude Longitude Postal code Street Title _id Division
0 40976.0 belfast atkins CPDC-100 daera 2016-04-26T00:00:00 54.6050193 -5.9093978 BT3 9DE 71 old channel road dard - flood risk management investment strategy 1 construction
1 529740.0 belfast mcadam design CPDC-28 de 2016-04-27T00:00:00 54.5795073 -5.8760029 BT5 6BQ 1c montgomery house 478 castlereagh road deni - appointment of integrated design team f... 2 construction
2 43914.7 belfast atkins CPDC-68 daera 2016-12-05T00:00:00 54.6050193 -5.9093978 BT3 9DE 71 old channel road daera - killynure armco culvert feasibility study 3 construction
3 17275.0 belfast aecom infrastructure & environment CPDC-109 daera 2016-05-25T00:00:00 54.5479762 -5.9111546 BT8 7RP beechill house 40 beechill road daera - linen green moygashel feasibility study 4 construction
4 110000.0 belfast ove arup & partners international CPDC-45 daera 2016-05-25T00:00:00 54.5943264 -5.930001 BT2 8BG the linenhall 32-38 linenhall street ni fisheries and harbour authority - framework... 5 construction

And what about services?

In [30]:
df = pd.DataFrame.from_records(dataset_generator(supplies_resource_id))
df.head() # Helpful renaming of the "Basis for DAC Award Column", well done DF...
Out[30]:
Awarded value Basis of Award City Company name Contract awarded by Contract end date Date of Award Latitude Longitude Postal code Street Title _id
0 £77,000.00 LONDON WILMINGTON PUBLISHING &amp; INFORMATION LTD Department of Agriculture Environment and Rura... 2021-03-04T00:00:00 2016-04-04T00:00:00 51.5299906 -0.0911146 N1 7JQ 6-14 UNDERWOOD STREET CFT223170 - DOE –... 1
1 £35,400.00 BELFAST PERCEPTIVE INSIGHT MARKET RESEARCH LTD Department for Communities NI 2016-04-08T00:00:00 2016-04-04T00:00:00 54.5942121 -5.8886505 BT5 5AB 109 Bloomfield Avenue [275750- SC1] - DSD – Consultancy to Carry Out... 2
2 £430,700.00 Ballyvourney Industrial Estate, Co. Cork DGP INTELSIUS LTD Department of Agriculture Environment and Rura... 2020-04-04T00:00:00 2016-05-04T00:00:00 51.944811 -9.161592 P12 A527 Unit 8 [123894- SC1] - DARD - Supply and Delivery of ... 3
3 £430,700.00 BELFAST PREMIER SCIENTIFIC LTD Department of Agriculture Environment and Rura... 2020-04-04T00:00:00 2016-05-04T00:00:00 54.6392358 -5.9014562 BT3 9LA Unit 3 Scientific Ltd [123894- SC2] - DARD - Supply and Delivery of ... 4
4 1* COALVILLE ANTALIS LTD Department of Finance 2016-12-31T00:00:00 2016-06-04T00:00:00 52.6972651 -1.3422078 LE67 1LE Gateway House [272312- SC1] - Pan Goverment Collaborative Ar... 5

Everything old is new again

Do it all over again, and fix column naming inconsistency

In [31]:
df.rename(columns={'Basis of Award':'Basis for DAC Award'}, inplace=True)
text_fields = ['City','Basis for DAC Award','Company name','Contract awarded by','Title','Street']
interesting_fields = None
for text_field in text_fields:
    df[text_field]=df[text_field].str.lower().str.strip()
    df[text_field]=df[text_field].str.replace(' ltd','')
    df[text_field]=df[text_field].str.replace(' uk','')
    df[text_field]=df[text_field].str.replace(' \(ni\)','')

df['Contract awarded by']=df['Contract awarded by'].str.replace(' ni','') # Drop ni
df.replace(to_replace={'Contract awarded by': {'deni':'de'}}, inplace=True) # Edge case for misues of DE/DENI
df.replace(to_replace={'Contract awarded by':department_replacements}, inplace=True) # Depts > Abbrevs
In [32]:
df.head() # WTF is 1*?
Out[32]:
Awarded value Basis for DAC Award City Company name Contract awarded by Contract end date Date of Award Latitude Longitude Postal code Street Title _id
0 £77,000.00 london wilmington publishing &amp; information daera 2021-03-04T00:00:00 2016-04-04T00:00:00 51.5299906 -0.0911146 N1 7JQ 6-14 underwood street cft223170 - doe – access to pension scheme leg... 1
1 £35,400.00 belfast perceptive insight market research dfc 2016-04-08T00:00:00 2016-04-04T00:00:00 54.5942121 -5.8886505 BT5 5AB 109 bloomfield avenue [275750- sc1] - dsd – consultancy to carry out... 2
2 £430,700.00 ballyvourney industrial estate, co. cork dgp intelsius daera 2020-04-04T00:00:00 2016-05-04T00:00:00 51.944811 -9.161592 P12 A527 unit 8 [123894- sc1] - dard - supply and delivery of ... 3
3 £430,700.00 belfast premier scientific daera 2020-04-04T00:00:00 2016-05-04T00:00:00 54.6392358 -5.9014562 BT3 9LA unit 3 scientific [123894- sc2] - dard - supply and delivery of ... 4
4 1* coalville antalis df 2016-12-31T00:00:00 2016-06-04T00:00:00 52.6972651 -1.3422078 LE67 1LE gateway house [272312- sc1] - pan goverment collaborative ar... 5

WTF is 1*?

In [33]:
# Boolean operators yield Series objects!
(df['Awarded value']=='1*').mean()
Out[33]:
0.11654135338345864

11.7% of Service contracts are 'withheld' as per the guidance on the ODNI page:

For certain types of contract it is difficult to predict the estimated contract value, so this has been recorded as 1*.

Yey for commuinicating the meaning, boo for hiding and hand flapping.

Plan: Replace 1* with None and hope pandas can cope (it can)

Cynical Hypothesis: 1*'s are more likely to be direct (unadvertised) awards

WTF is DAC

Direct Award Contract, or "We asked for these guys and noone else"

In [34]:
df['Basis for DAC Award'].describe()
# Lets keep those, but what do they look like?
Out[34]:
count     532
unique     15
top          
freq      456
Name: Basis for DAC Award, dtype: object
In [35]:
df[df['Basis for DAC Award'].str.len()>0]['Basis for DAC Award'].unique()
# regulation32 basically means "it was a direct tender". Interesting but not relevant
Out[35]:
array(['basis of award: regulation 32',
       'basis of award: in the public interest',
       'basis of award regulation 32',
       'this procurement is for the collection of eu referendum poll cards and delivery to the electorate of northern ireland. [framework value £319714]',
       'travel management services [framework value £60000000]',
       'basis of award:in compliance with regulation 32 of the public contracts regulations 2015',
       'basis of award: in compliance with regulation 32 of the public contracts regulations 2015',
       'framework for disposal services for it equipment, electronic and electrical equipment [framework value £1,000,000]',
       'basis of award public contracts regulations 2015, “as for reasons of extreme urgency brought about by events unforeseeable by the department.',
       'basis of award: in compliance with regulation 32 of the public contracts regulations 2015 regulation 32',
       'the framework agreement (fa) will be used to provide a range of external generic learning and development (l and d) services and support the provision of future services. the objectives of the fa are to provide a variety of learning delivery channels including classroom and digital/e-learning interventions by incorporating the latest tools and techniques within l and d interventions to ensure the best learning outcomes. the fa will also promote collaborative work with contractors, departments and professions to identify, design develop and deliver current and future learning interventions and improve utilisation of evaluation and management information. [framework value £5,000,000]',
       'supply and delivery of nearly new and used vehicles [framework value £2000000]',
       'to establish a collaborative framework agreement for the provision of a travel management company (tmc) to provide a fully managed service for all and international travel, accommodation and other travel associated services within northern ireland on behalf of northern ireland civil service (nics) departments, agencies, non departmental public bodies and other participating bodies or other nominated individuals working on behalf of the client framework value [£60000000]',
       'dac 01/17 - award basis - in compliance with regulation 32 of the public contracts regulation 2015'], dtype=object)

Sidebar: WTF was extreme urgency?

In [36]:
urgency = df[df['Basis for DAC Award'].str.contains('extreme').fillna(False)][['Awarded value','Company name','Title','Contract awarded by','Date of Award']]
urgency.iloc[0],urgency.iloc[0]['Title']
Out[36]:
(Awarded value                                              £2,500,000.00
 Company name                        microsoft ireland operations limited
 Title                  [667055- sc1] - dac 206/16-dfc-welfare reform ...
 Contract awarded by                                                  dfc
 Date of Award                                        2016-06-12T00:00:00
 Name: 285, dtype: object,
 '[667055- sc1] - dac 206/16-dfc-welfare reform mitigation measures')

So M$FT got £2.5m to fix welfare reform

“as for reasons of extreme urgency brought about by events unforeseeable by the department.'

Question: How many 1*'s were Direct Tenders and is this different from the overall ratio

In [37]:
direct_awards_df = df[df['Basis for DAC Award'].str.len()>0]
direct_awards_df.shape[0], (direct_awards_df['Awarded value']=='1*').mean()
Out[37]:
(76, 0.092105263157894732)

Cynical Hypothesis was wrong; if anything DAC tenders are slightly less likely to be withheld

Nothing to be gained by keeping the 1* value so we replace it with None

In [38]:
df.replace(to_replace={'Awarded value':{'1*':None}}, inplace=True)
df.head()
# Now we can follow the money
Out[38]:
Awarded value Basis for DAC Award City Company name Contract awarded by Contract end date Date of Award Latitude Longitude Postal code Street Title _id
0 £77,000.00 london wilmington publishing &amp; information daera 2021-03-04T00:00:00 2016-04-04T00:00:00 51.5299906 -0.0911146 N1 7JQ 6-14 underwood street cft223170 - doe – access to pension scheme leg... 1
1 £35,400.00 belfast perceptive insight market research dfc 2016-04-08T00:00:00 2016-04-04T00:00:00 54.5942121 -5.8886505 BT5 5AB 109 bloomfield avenue [275750- sc1] - dsd – consultancy to carry out... 2
2 £430,700.00 ballyvourney industrial estate, co. cork dgp intelsius daera 2020-04-04T00:00:00 2016-05-04T00:00:00 51.944811 -9.161592 P12 A527 unit 8 [123894- sc1] - dard - supply and delivery of ... 3
3 £430,700.00 belfast premier scientific daera 2020-04-04T00:00:00 2016-05-04T00:00:00 54.6392358 -5.9014562 BT3 9LA unit 3 scientific [123894- sc2] - dard - supply and delivery of ... 4
4 None coalville antalis df 2016-12-31T00:00:00 2016-06-04T00:00:00 52.6972651 -1.3422078 LE67 1LE gateway house [272312- sc1] - pan goverment collaborative ar... 5

Following the Money

Need to get rid of £ and , so we can safely convert the string to a float

In [39]:
df['Awarded value']=df['Awarded value'].replace('[£,]','', regex=True).astype(float)
df.head()
Out[39]:
Awarded value Basis for DAC Award City Company name Contract awarded by Contract end date Date of Award Latitude Longitude Postal code Street Title _id
0 77000.0 london wilmington publishing &amp; information daera 2021-03-04T00:00:00 2016-04-04T00:00:00 51.5299906 -0.0911146 N1 7JQ 6-14 underwood street cft223170 - doe – access to pension scheme leg... 1
1 35400.0 belfast perceptive insight market research dfc 2016-04-08T00:00:00 2016-04-04T00:00:00 54.5942121 -5.8886505 BT5 5AB 109 bloomfield avenue [275750- sc1] - dsd – consultancy to carry out... 2
2 430700.0 ballyvourney industrial estate, co. cork dgp intelsius daera 2020-04-04T00:00:00 2016-05-04T00:00:00 51.944811 -9.161592 P12 A527 unit 8 [123894- sc1] - dard - supply and delivery of ... 3
3 430700.0 belfast premier scientific daera 2020-04-04T00:00:00 2016-05-04T00:00:00 54.6392358 -5.9014562 BT3 9LA unit 3 scientific [123894- sc2] - dard - supply and delivery of ... 4
4 NaN coalville antalis df 2016-12-31T00:00:00 2016-06-04T00:00:00 52.6972651 -1.3422078 LE67 1LE gateway house [272312- sc1] - pan goverment collaborative ar... 5
In [40]:
df.sort_values(by='Awarded value', ascending=False).head()
Out[40]:
Awarded value Basis for DAC Award City Company name Contract awarded by Contract end date Date of Award Latitude Longitude Postal code Street Title _id
5 1.000000e+08 withheld withheld psni 2023-04-30T00:00:00 2016-08-04T00:00:00 None None Withheld withheld [42197- sc1] - doj - psni - core it managed se... 6
292 4.500000e+07 withheld withheld doj 2024-06-30T00:00:00 2016-12-13T00:00:00 None None Withheld withheld [449719- sc1] - doj - nicts - security and anc... 288
123 3.000000e+07 marlow whistl df 2019-07-31T00:00:00 2016-07-19T00:00:00 51.5718457 -0.7664076 SL7 1TB meridian house, fieldhouse lane [372743- sc1] - provision of postal services f... 120
226 1.268955e+07 ballycastle rathlin island ferry dfi 2026-12-31T00:00:00 2016-02-11T00:00:00 55.2058251 -6.2416245 BT54 6BT 18 bayview road [498825- sc1] - department for infrastructure ... 222
497 1.240690e+07 antrim firmus energy (supply) limited df 2019-03-31T00:00:00 2017-03-13T00:00:00 54.722412 -6.2244468 BT41 4LZ 5 kilbegs road fergusons way [871163- sc1] - supply of natural gas - lot 4 498
In [41]:
services_df = df.copy()
services_df['Division'] = 'services'

Graphs : Still on Services Only

In [42]:
f,ax = plt.subplots(figsize = graph_figsize)
_=services_df.groupby('Contract awarded by')['Awarded value'].sum().sort_values().plot.pie(ax=ax)
In [43]:
def make_autopct(total):
    def my_autopct(pct):
        val = int(round(pct*total/100.0))
        return '{p:.1f}%  (£{v:,.1f}m)'.format(p=pct,v=val/1000000)
    return my_autopct
In [44]:
f,ax = plt.subplots(figsize = graph_figsize)
_=services_df.groupby('Contract awarded by')['Awarded value'].sum().sort_values().plot.pie(
    ax=ax, 
    autopct=make_autopct(services_df['Awarded value'].sum()),
    title='Who Gave Out How Much?[Services]'
)
In [45]:
f,ax = plt.subplots(figsize = graph_figsize)
_=services_df.groupby('Company name')['Awarded value'].sum().sort_values().plot.pie(
    ax=ax,
    autopct=make_autopct(services_df['Awarded value'].sum()),
    title='Who Got How Much?[Services]'
)

Graphing: Construction

Same as before but on the Construction Dataset

In [46]:
f,ax = plt.subplots(figsize = graph_figsize)
_=construction_df.groupby('Contract awarded by')['Awarded value'].sum().sort_values().plot.pie(
    ax=ax, autopct=make_autopct(construction_df['Awarded value'].sum()),
    title='Who Gave Out How Much?[Construction]'
)