This notebook represents a test data conversation with the UK Office of National Statistcs (ONS) API.
I can't guarantee that all the mappings are correct...
Several of the call responses include URLs for downloadable datasets (though I got corrupt/non-openable zip files on several downloads I tried?
It may be worth returning the URL responses from queries so that whole datasets can be downloaded directly?
import requests
import pandas as pd
class ONS_CONFIG:
ONS_STUB= 'http://data.ons.gov.uk/ons/api/data/'
DEFAULT_CONTEXT='Census'
with open('onspwd.txt','r') as f:
APIKEY=f.read()
def __init__(self):
self.url=ONS_STUB
self.params={'jsontype':'json-stat','apikey':APIKEY}
def _ONS_SetParam(self,param,value):
self.params[param]=value
def _ONS_SetContext(self,context):
self.params['context']=context
def _ONS_Census(self):
self._ONS_SetContext('Census')
def _ONS_SetConcept(self,concept):
self.params['concept']=concept
def _ONS_SetGeog(self,geog):
self.params['geog']=geog
def _ONS_SetConcepts(self,context=DEFAULT_CONTEXT):
self.url=ONS_STUB+'concepts.json'
#context is required?
self._ONS_SetContext(context)
def _ONS_SetCollections(self):
self.url=ONS_STUB+'collections.json'
def _ONS_SetCollectionDetails(self,collId):
self.url=ONS_STUB+'collectiondetails/'+collId+'.json'
def _ONS_SetDatasetDetails(self,datasetId,geog,context=DEFAULT_CONTEXT):
self.url=ONS_STUB+'datasetdetails/'+datasetId+'.json'
#context is required?
self._ONS_SetContext(context)
#geog is required?
self._ONS_SetGeog(geog)
def _ONS_SetGeohierarchies(self,geohier):
self.url=ONS_STUB+'hierarchies/hierarchy/'+geohier+'.json'
def _ONS_SetDataset(self,dataset,geohier,context,dims):
self.url=ONS_STUB+'dataset/'+dataset+'.json'
self._ONS_SetContext(context)
self._ONS_SetGeog(geohier)
for param in dims:
self._ONS_SetParam(param,dims[param])
def _ONS_getConcepts(d,context='Census',lang='en'):
d._ONS_SetConcepts(context)
#Context MUST be defined
r = requests.get(d.url,params=d.params)
items={}
for _concept in r.json()['ons']['conceptList']['concept']:
for concept in _concept['names']['name']:
if concept['@xml.lang']==lang:
items[_concept['id']]=concept['$']
return items
concepts=_ONS_getConcepts(ONS_CONFIG())
df=pd.DataFrame.from_dict(concepts,orient='index')
df[:5]
0 | |
---|---|
1 | Age |
21 | Demography |
22 | Dependent children |
23 | Household composition |
24 | Household deprivation dimensions |
def _getName(d,lang='en'):
for n in d['names']['name']:
if n['@xml.lang']==lang:
return n['$']
def _getLangItemFromList(d,attr,lang='en'):
for n in d[attr]:
if n['@xml.lang']==lang:
return n['$']
def _getGeographicalTypes(d,lang='en'):
geotypes=[]
dh=d['geographicalHierarchies']['geographicalHierarchy']
if not(isinstance(dh,list)):
dh=[dh]
for geohier in dh:
for geotype in geohier['geographicalType']:
if geotype['@xml.lang']==lang:
geotypes.append(geotype['$'])
return geotypes
#collections_url='http://data.ons.gov.uk/ons/api/data/collections.json?apikey='+APIKEY+'&context=Census'
def _ONS_getCollections(d,concept='',context='Census',lang='en'):
d._ONS_SetCollections()
d._ONS_SetContext(context)
d._ONS_SetConcept(concept)
r = requests.get(d.url,params=d.params)
data=r.json()['ons']['collectionList']['collection']
if not(isinstance(data,list)): data=[data]
items=[]
for data_el in data:
item={'description':data_el['description'],
'name':_getName(data_el),
'id':data_el['id'],
'geographicalTypes':_getGeographicalTypes(data_el)
}
items.append(item)
return items
#List all collections
data=_ONS_getCollections(ONS_CONFIG())
df=pd.DataFrame(data)
df[:5]
description | geographicalTypes | id | name | |
---|---|---|---|---|
0 | collection CT0010 | [Ward Admin, Parliamentary Constituency] | CT0010 | Ethnic group (write-in responses) |
1 | collection DC2101EW | [Census Merged Wards] | DC2101EW | Ethnic group by sex by age |
2 | collection DC2102EW | [Census Merged Wards] | DC2102EW | National identity by sex by age |
3 | collection DC2103EW | [Census Merged Wards] | DC2103EW | Country of birth by sex by age |
4 | collection DC2104EW | [Census Merged Wards] | DC2104EW | Main language by sex by age |
#Display a particular collection
_ONS_getCollections(ONS_CONFIG(),60)
[{'id': 'QS501EW', 'description': 'collection QS501EW', 'name': 'Highest level of qualification', 'geographicalTypes': ['Ward Admin', 'Parliamentary Constituency']}]
#purl='collectiondetails/QS501EW.json?context=Census&apikey=**'
def _ONS_parse_dimensions(d):
dimensions=[]
for dimension in d['dimensions']['dimension']:
dimensions.append({
'id':dimension['dimensionId'],
'title': _getLangItemFromList(dimension['dimensionTitles'],'dimensionTitle')
})
return dimensions
def _ONS_parse_areas(data):
dl=data['geographicalHierarchies']['geographicalHierarchy']
metaitems=[]
if not(isinstance(dl,list)):
dl=[dl]
for metaitem in dl:
items={'area':[]}
for el in ['year','id']:
if el in metaitem:
items[el]=metaitem[el]
items['name']=_getLangItemFromList(metaitem['types'],'geographicalType')
if not(isinstance(metaitem['areaTypes']['areaType'],list)):
metaitem['areaTypes']['areaType']=[metaitem['areaTypes']['areaType']]
for item in metaitem['areaTypes']['areaType']:
tmp={'areaTypeCodeName':item['codename'],
'areaTypeCodeAbbrev':item['abbreviation'],
'areaTypeLevel':item['level']
}
items['area'].append(tmp)
metaitems.append(items)
return metaitems
def _datasetDescription(d,lang='en'):
return _getLangItemFromList(d['refMetadata']['refMetadataItem']['descriptions'],'description')
def _ONS_getCollectionDetails(d,collId,context='Census',lang='en'):
d._ONS_SetCollectionDetails(collId)
#Context required?
d._ONS_SetContext(context)
r = requests.get(d.url,params=d.params)
data=r.json()['ons']['collectionDetail']
item={'description':_datasetDescription(data),
'dimensions':_ONS_parse_dimensions(data),
'id':data['id'],
'publicationDate': data['publicationDate'],
'areas':_ONS_parse_areas(data)
}
return item
_ONS_getCollectionDetails(ONS_CONFIG(),'QS501EW')
{'dimensions': [{'id': 'CL_0000052', 'title': 'Highest level of qualification (T008A)'}, {'id': '2011WARDH', 'title': '2011 Administrative Hierarchy'}, {'id': '2011PCONH', 'title': '2011 Westminster Parliamentary Constituency Hierarchy'}], 'id': 'QS501EW', 'description': '<p>\r\n\tThis dataset provides 2011 Census estimates that classify usual residents aged 16 and over in England and Wales by their highest level of qualification. The estimates are as at census day, 27 March 2011.</p>\r\n<p>\r\n\tThis information identifies educational achievement across the population to help government resource allocation and policy making, especially in relation to disadvantaged population groups and educationally deprived areas.</p>\r\n<p>\r\n\tSimilar estimates from the 2001 Census were provided in table UV24 however the category "Apprenticeship" is identified separately in 2011 Census estimates to provide more detail.</p>\r\n<p>\r\n\tFor more information about qualifications please see Part 4 of the 2011 Census User Guide at <a href="http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html">http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html</a></p>', 'areas': [{'area': [{'areaTypeCodeAbbrev': 'NAT ', 'areaTypeLevel': 0, 'areaTypeCodeName': 'England and Wales'}, {'areaTypeCodeAbbrev': 'CTRY', 'areaTypeLevel': 1, 'areaTypeCodeName': 'Country'}, {'areaTypeCodeAbbrev': 'WPC', 'areaTypeLevel': 3, 'areaTypeCodeName': 'Westminster Parliamentary Constituency'}, {'areaTypeCodeAbbrev': 'RGN', 'areaTypeLevel': 2, 'areaTypeCodeName': 'Region'}], 'year': 2011, 'name': 'Parliamentary Constituency', 'id': '2011PCONH'}, {'area': [{'areaTypeCodeAbbrev': 'UA', 'areaTypeLevel': 5, 'areaTypeCodeName': 'Unitary Authority'}, {'areaTypeCodeAbbrev': 'RGN', 'areaTypeLevel': 4, 'areaTypeCodeName': 'Region'}, {'areaTypeCodeAbbrev': 'NMD', 'areaTypeLevel': 6, 'areaTypeCodeName': 'Non-metropolitan District'}, {'areaTypeCodeAbbrev': 'CTY', 'areaTypeLevel': 5, 'areaTypeCodeName': 'County'}, {'areaTypeCodeAbbrev': 'MD', 'areaTypeLevel': 6, 'areaTypeCodeName': 'Metropolitan District '}, {'areaTypeCodeAbbrev': 'WD', 'areaTypeLevel': 7, 'areaTypeCodeName': 'Electoral Ward/Division '}, {'areaTypeCodeAbbrev': 'WD', 'areaTypeLevel': 7, 'areaTypeCodeName': 'Electoral Division'}, {'areaTypeCodeAbbrev': 'MCTY', 'areaTypeLevel': 5, 'areaTypeCodeName': 'Metropolitan County'}, {'areaTypeCodeAbbrev': 'NAT ', 'areaTypeLevel': 2, 'areaTypeCodeName': 'England and Wales'}, {'areaTypeCodeAbbrev': 'LONB', 'areaTypeLevel': 6, 'areaTypeCodeName': 'London Borough '}, {'areaTypeCodeAbbrev': 'IOL', 'areaTypeLevel': 5, 'areaTypeCodeName': 'Inner and Outer London'}, {'areaTypeCodeAbbrev': 'CTRY', 'areaTypeLevel': 3, 'areaTypeCodeName': 'Country'}], 'year': 2011, 'name': 'Ward Admin', 'id': '2011WARDH'}], 'publicationDate': '2013-10-19+01:00'}
def _ONS_getDatasetDetails(d,datasetId,geog,context='Census',lang='en'):
d._ONS_SetDatasetDetails(datasetId,geog,context)
r = requests.get(d.url,params=d.params)
data=r.json()['ons']['datasetDetail']
item={'description':_datasetDescription(data),
'dimensions':_ONS_parse_dimensions(data),
'id':data['id'],
'publicationDate': data['publicationDate'],
'areas':_ONS_parse_areas(data)
}
return item
_ONS_getDatasetDetails(ONS_CONFIG(),'QS501EW','2011PCONH')
{'dimensions': [{'id': 'CL_0000052', 'title': 'Highest level of qualification (T008A)'}, {'id': '2011PCONH', 'title': '2011 Westminster Parliamentary Constituency Hierarchy'}], 'id': 'QS501EW', 'description': '<p>\tThis dataset provides 2011 Census estimates that classify usual residents aged 16 and over in England and Wales by their highest level of qualification. The estimates are as at census day, 27 March 2011.</p><p>\tThis information identifies educational achievement across the population to help government resource allocation and policy making, especially in relation to disadvantaged population groups and educationally deprived areas.</p><p>\tSimilar estimates from the 2001 Census were provided in table UV24 however the category "Apprenticeship" is identified separately in 2011 Census estimates to provide more detail.</p><p>\tFor more information about qualifications please see Part 4 of the 2011 Census User Guide at <a href="http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html">http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html</a></p>', 'areas': [{'area': [{'areaTypeCodeAbbrev': 'WPC', 'areaTypeLevel': 3, 'areaTypeCodeName': 'Westminster Parliamentary Constituency'}, {'areaTypeCodeAbbrev': 'CTRY', 'areaTypeLevel': 1, 'areaTypeCodeName': 'Country'}, {'areaTypeCodeAbbrev': 'NAT ', 'areaTypeLevel': 0, 'areaTypeCodeName': 'England and Wales'}, {'areaTypeCodeAbbrev': 'RGN', 'areaTypeLevel': 2, 'areaTypeCodeName': 'Region'}], 'name': 'Parliamentary Constituency', 'id': '2011PCONH'}], 'publicationDate': '2013-10-19+01:00'}
def _ONS_getHierarchies(d,geohier,context='',lang='en'):
d._ONS_SetGeohierarchies(geohier)
if context!='':
d._ONS_SetContext(context)
r = requests.get(url=d.url,params=d.params)
data=r.json()['ons']['geographyList']
items=[]
for item in data['items']['item']:
tmp={'label':_getLangItemFromList(item['labels'],'label'),
'itemCode':item['itemCode'],
'areaTypeCodeName':item['areaType']['codename'],
'areaTypeCodeAbbrev':item['areaType']['abbreviation'],
'areaTypeLevel':item['areaType']['level'],
'parentCode':''
}
if 'parentCode' in item:
tmp['parentCode']=item['parentCode']
items.append(tmp)
return pd.DataFrame(items)
_ONS_getHierarchies(ONS_CONFIG(),'2011PCONH')[:5]#,'Census') #2011WARDH, 2011PCONH
areaTypeCodeAbbrev | areaTypeCodeName | areaTypeLevel | itemCode | label | parentCode | |
---|---|---|---|---|---|---|
0 | WPC | Westminster Parliamentary Constituency | 3 | E14000842 | North East Cambridgeshire | E12000006 |
1 | WPC | Westminster Parliamentary Constituency | 3 | E14000962 | St Helens North | E12000002 |
2 | WPC | Westminster Parliamentary Constituency | 3 | E14001035 | West Worcestershire | E12000005 |
3 | WPC | Westminster Parliamentary Constituency | 3 | E14000865 | Nottingham East | E12000004 |
4 | WPC | Westminster Parliamentary Constituency | 3 | E14000539 | Banbury | E12000008 |
The following is where we actually start to get some data!
The fucntion currently includes a slightly modified version of code from ONS blog.
Ideally, we need to start developing something here to construct and return a pandas
dataframe in an efficient and understandable way.
def _ONS_SetDataset(d,dataset,geohier,context,dims,lang='en'):
d._ONS_SetDataset(dataset,geohier,context,dims)
r = requests.get(url=d.url,params=d.params)
data=r.json()
#-------
#Via http://digitalpublishing.ons.gov.uk/2014/08/07/ons-api-just-the-numbers/
#First we create an empty dict object
datax = {'vals':{}}
#We need to specify the dataset name
dataset = "QS501EW"
geog=''
# We get the actual observation values from the JSON-STAT as a list
values = obj[dataset]['value']
# Then we get the index of the observations and its associated categories as a dict
##TH: Original uses index 0 - but this desonlt respond correctly for multiple areas?
index = obj[dataset]['dimension'][obj[dataset]['dimension']['id'][0]]['category']['index']
#and finally the labels for the categories as another dict
##TH: Original uses index 0 - but this doesn't respond correctly for multiple areas?
labels = obj[dataset]['dimension'][obj[dataset]['dimension']['id'][0]]['category']['label']
#What's the measure?
key=obj[dataset]['dimension'][obj[dataset]['dimension']['id'][1]]['category']['label']
ix=obj[dataset]['dimension'][obj[dataset]['dimension']['id'][1]]['category']['index']
print(key,ix)
datax['keys']=[]
for ixl in ix:
datax['keys'].append({'label':key[ixl],'id':ixl})
for l in labels: #Now we can iterate through the labels
num = index[l] # get the position in the values dict of the specific label
count = values[str(num)] #get that value from the values list
datax['vals'][labels[l]] = count #create a new object in data dict for the value with the label as its name
#print(datax) #All done!
return datax
_ONS_SetDataset(ONS_CONFIG(),'QS501EW','2011WARDH','Census',{'dm/2011WARDH':'E05008481,E05003606'})
{'CI_0000368': 'All categories: Highest level of qualification'} {'CI_0000368': 0}
{'keys': [{'id': 'CI_0000368', 'label': 'All categories: Highest level of qualification'}], 'vals': {'Kingsteignton East': 4841, 'Brading, St Helens and Bembridge': 6031}}
Maybe the best way to handle the json-stat data is to use a library to parse it?
!pip3 install pyjstat
Downloading/unpacking pyjstat Downloading pyjstat-0.1.5.tar.gz (157kB): 157kB downloaded Running setup.py (path:/tmp/pip_build_root/pyjstat/setup.py) egg_info for package pyjstat Requirement already satisfied (use --upgrade to upgrade): pandas in /usr/local/lib/python3.4/dist-packages (from pyjstat) Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2 in /usr/local/lib/python3.4/dist-packages (from pandas->pyjstat) Requirement already satisfied (use --upgrade to upgrade): pytz>=2011k in /usr/local/lib/python3.4/dist-packages (from pandas->pyjstat) Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0b2 in /usr/lib/python3/dist-packages (from pandas->pyjstat) Requirement already satisfied (use --upgrade to upgrade): six in /usr/local/lib/python3.4/dist-packages (from python-dateutil>=2->pandas->pyjstat) Installing collected packages: pyjstat Running setup.py install for pyjstat Successfully installed pyjstat Cleaning up...
from pyjstat import pyjstat
#Use example from ONS blog post
d=ONS_CONFIG()
d._ONS_SetDataset('QS104EW','2011WARDH','Census',{'dm/2011WARDH':'K04000001','totals':'false'})
r = requests.get(url=d.url,params=d.params)
data=r.json()
data
{'QS104EW': {'source': 'Office for National Statistics', 'label': 'Sex', 'value': {'0': 56075912, '2': 28502536, '1': 27573376}, 'updated': '17/10/2013 16:51:00', 'dimension': {'CL_0000137': {'category': {'label': {'CI_0000001': '2011'}, 'index': {'CI_0000001': 0}}, 'label': 'Time Dimension'}, 'size': [8965, 3, 1, 1], 'CL_0000035': {'category': {'label': {'CI_0000070': 'Females', 'CI_0000071': 'Males', 'CI_0000121': 'All categories: Sex'}, 'index': {'CI_0000070': 2, 'CI_0000071': 1, 'CI_0000121': 0}}, 'label': 'Sex (T003A)'}, '2011WARDH': {'category': {'label': {'K04000001': 'England and Wales'}, 'index': {'K04000001': 0}}, 'label': '2011 Administrative Hierarchy'}, 'id': ['2011WARDH', 'CL_0000035', 'Att_000001', 'CL_0000137'], 'role': {'metric': ['Att_000001'], 'geo': ['2011WARDH'], 'time': ['CL_0000137']}, 'Att_000001': {'category': {'unit': {'Segment_1': {'multiplier': 'Units', 'type': 'Count', 'unit': 'Number', 'label': 'All usual residents', 'base': 'Person'}}, 'index': {'Segment_1': 0}}, 'label': 'Measures'}}}}
results = pyjstat.from_json_stat(data)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-395-88b8c10d54a7> in <module>() ----> 1 results = pyjstat.from_json_stat(data) /usr/local/lib/python3.4/dist-packages/pyjstat/pyjstat.py in from_json_stat(datasets, naming) 241 values = [] 242 js_dict = datasets[dataset] --> 243 dimensions, dim_names = get_dimensions(js_dict, naming) 244 values = get_values(js_dict) 245 output = pd.DataFrame(columns=dim_names + [unicode('value', 'utf-8')], /usr/local/lib/python3.4/dist-packages/pyjstat/pyjstat.py in get_dimensions(js_dict, naming) 72 dim_name = dim 73 if (naming == 'label'): ---> 74 dim_label = get_dim_label(js_dict, dim) 75 dimensions.append(dim_label) 76 dim_names.append(dim_name) /usr/local/lib/python3.4/dist-packages/pyjstat/pyjstat.py in get_dim_label(js_dict, dim) 106 dim_label.values()), 107 index=dim_label.keys(), --> 108 columns=['id', 'label']) 109 return dim_label 110 /usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy) 255 copy=copy) 256 elif isinstance(data, collections.Iterator): --> 257 raise TypeError("data argument can't be an iterator") 258 else: 259 try: TypeError: data argument can't be an iterator
Hmm... so what's not working? The library or the format used to return the data from the API call?
FWIW: