Notebook

ONS Scratchpad¶

This notebook represents a test data conversation with the UK Office of National Statistcs (ONS) API.

I can't guarantee that all the mappings are correct...

Several of the call responses include URLs for downloadable datasets (though I got corrupt/non-openable zip files on several downloads I tried?

It may be worth returning the URL responses from queries so that whole datasets can be downloaded directly?

In [4]:

import requests
import pandas as pd

In [374]:

class ONS_CONFIG:
    ONS_STUB= 'http://data.ons.gov.uk/ons/api/data/'
    DEFAULT_CONTEXT='Census'
    
    with open('onspwd.txt','r') as f:
        APIKEY=f.read()

    def __init__(self):
        
        self.url=ONS_STUB
        self.params={'jsontype':'json-stat','apikey':APIKEY}
 
    def _ONS_SetParam(self,param,value):
        self.params[param]=value

    def _ONS_SetContext(self,context):
        self.params['context']=context

    def _ONS_Census(self):
        self._ONS_SetContext('Census')

    def _ONS_SetConcept(self,concept):
        self.params['concept']=concept

    def _ONS_SetGeog(self,geog):
        self.params['geog']=geog
        
    def _ONS_SetConcepts(self,context=DEFAULT_CONTEXT):
        self.url=ONS_STUB+'concepts.json'
        #context is required?
        self._ONS_SetContext(context) 
        
    def _ONS_SetCollections(self):
        self.url=ONS_STUB+'collections.json'

    def _ONS_SetCollectionDetails(self,collId):
        self.url=ONS_STUB+'collectiondetails/'+collId+'.json'

    def _ONS_SetDatasetDetails(self,datasetId,geog,context=DEFAULT_CONTEXT):
        self.url=ONS_STUB+'datasetdetails/'+datasetId+'.json'
        #context is required?
        self._ONS_SetContext(context)
        #geog is required?
        self._ONS_SetGeog(geog)
        
    def _ONS_SetGeohierarchies(self,geohier):
        self.url=ONS_STUB+'hierarchies/hierarchy/'+geohier+'.json'
        
    def _ONS_SetDataset(self,dataset,geohier,context,dims):
        self.url=ONS_STUB+'dataset/'+dataset+'.json'
        self._ONS_SetContext(context)
        self._ONS_SetGeog(geohier)
        for param in dims:
            self._ONS_SetParam(param,dims[param])

In [310]:

def _ONS_getConcepts(d,context='Census',lang='en'):

    d._ONS_SetConcepts(context)
    #Context MUST be defined
    
    r = requests.get(d.url,params=d.params)
    
    items={}
    for _concept in r.json()['ons']['conceptList']['concept']:
        for concept in _concept['names']['name']:
            if concept['@xml.lang']==lang:
                items[_concept['id']]=concept['$']
    return items

concepts=_ONS_getConcepts(ONS_CONFIG())
df=pd.DataFrame.from_dict(concepts,orient='index')
df[:5]

Out[310]:

	0
1	Age
21	Demography
22	Dependent children
23	Household composition
24	Household deprivation dimensions

In [311]:

def _getName(d,lang='en'):
    for n in d['names']['name']:
        if n['@xml.lang']==lang:
            return n['$']

def _getLangItemFromList(d,attr,lang='en'):
    for n in d[attr]:
        if n['@xml.lang']==lang:
            return n['$']

In [376]:

def _getGeographicalTypes(d,lang='en'):
    geotypes=[]
    dh=d['geographicalHierarchies']['geographicalHierarchy']
    if not(isinstance(dh,list)):
        dh=[dh]
    for geohier in dh:
        for geotype in geohier['geographicalType']:
            if geotype['@xml.lang']==lang:
                geotypes.append(geotype['$'])
    return geotypes

In [377]:

#collections_url='http://data.ons.gov.uk/ons/api/data/collections.json?apikey='+APIKEY+'&context=Census'

def _ONS_getCollections(d,concept='',context='Census',lang='en'):

    d._ONS_SetCollections()
    d._ONS_SetContext(context)
    d._ONS_SetConcept(concept)
    r = requests.get(d.url,params=d.params)
    
    data=r.json()['ons']['collectionList']['collection']
    if not(isinstance(data,list)): data=[data]
    items=[]
    for data_el in data:
        item={'description':data_el['description'],
              'name':_getName(data_el),
              'id':data_el['id'],
              'geographicalTypes':_getGeographicalTypes(data_el)
        }
        items.append(item)
    return items

#List all collections
data=_ONS_getCollections(ONS_CONFIG())
df=pd.DataFrame(data)
df[:5]

Out[377]:

	description	geographicalTypes	id	name
0	collection CT0010	[Ward Admin, Parliamentary Constituency]	CT0010	Ethnic group (write-in responses)
1	collection DC2101EW	[Census Merged Wards]	DC2101EW	Ethnic group by sex by age
2	collection DC2102EW	[Census Merged Wards]	DC2102EW	National identity by sex by age
3	collection DC2103EW	[Census Merged Wards]	DC2103EW	Country of birth by sex by age
4	collection DC2104EW	[Census Merged Wards]	DC2104EW	Main language by sex by age

In [314]:

#Display a particular collection
_ONS_getCollections(ONS_CONFIG(),60)

Out[314]:

[{'id': 'QS501EW',
  'description': 'collection QS501EW',
  'name': 'Highest level of qualification',
  'geographicalTypes': ['Ward Admin', 'Parliamentary Constituency']}]

In [342]:

#purl='collectiondetails/QS501EW.json?context=Census&apikey=**'

def _ONS_parse_dimensions(d):
    dimensions=[]
    for dimension in d['dimensions']['dimension']:
        dimensions.append({
            'id':dimension['dimensionId'],
            'title': _getLangItemFromList(dimension['dimensionTitles'],'dimensionTitle')
        })
    return dimensions

def _ONS_parse_areas(data):
    dl=data['geographicalHierarchies']['geographicalHierarchy']
    metaitems=[]
    if not(isinstance(dl,list)):
        dl=[dl]
    for metaitem in dl:
        items={'area':[]}
        for el in ['year','id']:
            if el in metaitem:
                items[el]=metaitem[el]
        items['name']=_getLangItemFromList(metaitem['types'],'geographicalType')
        if not(isinstance(metaitem['areaTypes']['areaType'],list)):
            metaitem['areaTypes']['areaType']=[metaitem['areaTypes']['areaType']]
        for item in metaitem['areaTypes']['areaType']:
            tmp={'areaTypeCodeName':item['codename'],
                  'areaTypeCodeAbbrev':item['abbreviation'],
                  'areaTypeLevel':item['level']
                  }
            items['area'].append(tmp)
        metaitems.append(items)
    return metaitems

def _datasetDescription(d,lang='en'):
    return _getLangItemFromList(d['refMetadata']['refMetadataItem']['descriptions'],'description')
        
def _ONS_getCollectionDetails(d,collId,context='Census',lang='en'):
    d._ONS_SetCollectionDetails(collId)
    #Context required?
    d._ONS_SetContext(context)
    
    r = requests.get(d.url,params=d.params)
    
    data=r.json()['ons']['collectionDetail']
    item={'description':_datasetDescription(data),
          'dimensions':_ONS_parse_dimensions(data),
          'id':data['id'],
          'publicationDate': data['publicationDate'],
          'areas':_ONS_parse_areas(data)
          }
    return item
_ONS_getCollectionDetails(ONS_CONFIG(),'QS501EW')

Out[342]:

{'dimensions': [{'id': 'CL_0000052',
   'title': 'Highest level of qualification (T008A)'},
  {'id': '2011WARDH', 'title': '2011 Administrative Hierarchy'},
  {'id': '2011PCONH',
   'title': '2011 Westminster Parliamentary Constituency Hierarchy'}],
 'id': 'QS501EW',
 'description': '<p>\r\n\tThis dataset provides 2011 Census estimates that classify usual residents aged 16 and over in England and Wales by their highest level of qualification.&nbsp; The estimates are as at census day, 27 March 2011.</p>\r\n<p>\r\n\tThis information identifies educational achievement across the population to help government resource allocation and policy making, especially in relation to disadvantaged population groups and educationally deprived areas.</p>\r\n<p>\r\n\tSimilar estimates from the 2001 Census were provided in table UV24 however the category "Apprenticeship" is identified separately in 2011 Census estimates&nbsp; to provide more detail.</p>\r\n<p>\r\n\tFor more information about qualifications please see Part 4 of the 2011 Census User Guide at <a href="http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html">http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html</a></p>',
 'areas': [{'area': [{'areaTypeCodeAbbrev': 'NAT ',
     'areaTypeLevel': 0,
     'areaTypeCodeName': 'England and Wales'},
    {'areaTypeCodeAbbrev': 'CTRY',
     'areaTypeLevel': 1,
     'areaTypeCodeName': 'Country'},
    {'areaTypeCodeAbbrev': 'WPC',
     'areaTypeLevel': 3,
     'areaTypeCodeName': 'Westminster Parliamentary Constituency'},
    {'areaTypeCodeAbbrev': 'RGN',
     'areaTypeLevel': 2,
     'areaTypeCodeName': 'Region'}],
   'year': 2011,
   'name': 'Parliamentary Constituency',
   'id': '2011PCONH'},
  {'area': [{'areaTypeCodeAbbrev': 'UA',
     'areaTypeLevel': 5,
     'areaTypeCodeName': 'Unitary Authority'},
    {'areaTypeCodeAbbrev': 'RGN',
     'areaTypeLevel': 4,
     'areaTypeCodeName': 'Region'},
    {'areaTypeCodeAbbrev': 'NMD',
     'areaTypeLevel': 6,
     'areaTypeCodeName': 'Non-metropolitan District'},
    {'areaTypeCodeAbbrev': 'CTY',
     'areaTypeLevel': 5,
     'areaTypeCodeName': 'County'},
    {'areaTypeCodeAbbrev': 'MD',
     'areaTypeLevel': 6,
     'areaTypeCodeName': 'Metropolitan District '},
    {'areaTypeCodeAbbrev': 'WD',
     'areaTypeLevel': 7,
     'areaTypeCodeName': 'Electoral Ward/Division '},
    {'areaTypeCodeAbbrev': 'WD',
     'areaTypeLevel': 7,
     'areaTypeCodeName': 'Electoral Division'},
    {'areaTypeCodeAbbrev': 'MCTY',
     'areaTypeLevel': 5,
     'areaTypeCodeName': 'Metropolitan County'},
    {'areaTypeCodeAbbrev': 'NAT ',
     'areaTypeLevel': 2,
     'areaTypeCodeName': 'England and Wales'},
    {'areaTypeCodeAbbrev': 'LONB',
     'areaTypeLevel': 6,
     'areaTypeCodeName': 'London Borough '},
    {'areaTypeCodeAbbrev': 'IOL',
     'areaTypeLevel': 5,
     'areaTypeCodeName': 'Inner and Outer London'},
    {'areaTypeCodeAbbrev': 'CTRY',
     'areaTypeLevel': 3,
     'areaTypeCodeName': 'Country'}],
   'year': 2011,
   'name': 'Ward Admin',
   'id': '2011WARDH'}],
 'publicationDate': '2013-10-19+01:00'}

In [363]:

def _ONS_getDatasetDetails(d,datasetId,geog,context='Census',lang='en'):
    d._ONS_SetDatasetDetails(datasetId,geog,context)

    r = requests.get(d.url,params=d.params)
    
    data=r.json()['ons']['datasetDetail']
    item={'description':_datasetDescription(data),
          'dimensions':_ONS_parse_dimensions(data),
          'id':data['id'],
          'publicationDate': data['publicationDate'],
          'areas':_ONS_parse_areas(data)
          }
    return item

_ONS_getDatasetDetails(ONS_CONFIG(),'QS501EW','2011PCONH')

Out[363]:

{'dimensions': [{'id': 'CL_0000052',
   'title': 'Highest level of qualification (T008A)'},
  {'id': '2011PCONH',
   'title': '2011 Westminster Parliamentary Constituency Hierarchy'}],
 'id': 'QS501EW',
 'description': '<p>\tThis dataset provides 2011 Census estimates that classify usual residents aged 16 and over in England and Wales by their highest level of qualification.&nbsp; The estimates are as at census day, 27 March 2011.</p><p>\tThis information identifies educational achievement across the population to help government resource allocation and policy making, especially in relation to disadvantaged population groups and educationally deprived areas.</p><p>\tSimilar estimates from the 2001 Census were provided in table UV24 however the category "Apprenticeship" is identified separately in 2011 Census estimates&nbsp; to provide more detail.</p><p>\tFor more information about qualifications please see Part 4 of the 2011 Census User Guide at <a href="http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html">http://www.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html</a></p>',
 'areas': [{'area': [{'areaTypeCodeAbbrev': 'WPC',
     'areaTypeLevel': 3,
     'areaTypeCodeName': 'Westminster Parliamentary Constituency'},
    {'areaTypeCodeAbbrev': 'CTRY',
     'areaTypeLevel': 1,
     'areaTypeCodeName': 'Country'},
    {'areaTypeCodeAbbrev': 'NAT ',
     'areaTypeLevel': 0,
     'areaTypeCodeName': 'England and Wales'},
    {'areaTypeCodeAbbrev': 'RGN',
     'areaTypeLevel': 2,
     'areaTypeCodeName': 'Region'}],
   'name': 'Parliamentary Constituency',
   'id': '2011PCONH'}],
 'publicationDate': '2013-10-19+01:00'}

In [364]:

def _ONS_getHierarchies(d,geohier,context='',lang='en'):
    d._ONS_SetGeohierarchies(geohier)
    if context!='':
       d._ONS_SetContext(context)
    r = requests.get(url=d.url,params=d.params)
    data=r.json()['ons']['geographyList']
    
    items=[]
    for item in data['items']['item']:
        tmp={'label':_getLangItemFromList(item['labels'],'label'),
              'itemCode':item['itemCode'],
              'areaTypeCodeName':item['areaType']['codename'],
              'areaTypeCodeAbbrev':item['areaType']['abbreviation'],
              'areaTypeLevel':item['areaType']['level'],
              'parentCode':''
            }
        if 'parentCode' in item:
            tmp['parentCode']=item['parentCode']
        items.append(tmp)
    
    return pd.DataFrame(items)
    
_ONS_getHierarchies(ONS_CONFIG(),'2011PCONH')[:5]#,'Census') #2011WARDH, 2011PCONH

Out[364]:

	areaTypeCodeAbbrev	areaTypeCodeName	areaTypeLevel	itemCode	label	parentCode
0	WPC	Westminster Parliamentary Constituency	3	E14000842	North East Cambridgeshire	E12000006
1	WPC	Westminster Parliamentary Constituency	3	E14000962	St Helens North	E12000002
2	WPC	Westminster Parliamentary Constituency	3	E14001035	West Worcestershire	E12000005
3	WPC	Westminster Parliamentary Constituency	3	E14000865	Nottingham East	E12000004
4	WPC	Westminster Parliamentary Constituency	3	E14000539	Banbury	E12000008

The following is where we actually start to get some data!

The fucntion currently includes a slightly modified version of code from ONS blog.

Ideally, we need to start developing something here to construct and return a pandas dataframe in an efficient and understandable way.

In [397]:

def _ONS_SetDataset(d,dataset,geohier,context,dims,lang='en'):
    d._ONS_SetDataset(dataset,geohier,context,dims)
    r = requests.get(url=d.url,params=d.params)
    data=r.json()

    #-------
    #Via http://digitalpublishing.ons.gov.uk/2014/08/07/ons-api-just-the-numbers/

    #First we create an empty dict object
    datax = {'vals':{}}


    #We need to specify the dataset name
    dataset = "QS501EW"
    geog=''

    # We get the actual observation values from the JSON-STAT as a list
    values  =  obj[dataset]['value']

    # Then we get the index of the observations and its associated categories as a dict
    ##TH: Original uses index 0 - but this desonlt respond correctly for multiple areas?
    index = obj[dataset]['dimension'][obj[dataset]['dimension']['id'][0]]['category']['index']

    #and finally the labels for the categories as another dict 
    ##TH: Original uses index 0 - but this doesn't respond correctly for multiple areas?
    labels = obj[dataset]['dimension'][obj[dataset]['dimension']['id'][0]]['category']['label']

    #What's the measure?
    key=obj[dataset]['dimension'][obj[dataset]['dimension']['id'][1]]['category']['label']
    ix=obj[dataset]['dimension'][obj[dataset]['dimension']['id'][1]]['category']['index']
    print(key,ix)
    datax['keys']=[]
    for ixl in ix:
        datax['keys'].append({'label':key[ixl],'id':ixl})


    for l in labels:    #Now we can iterate through the labels 
        num = index[l]    # get the position in the values dict of the specific label
        count = values[str(num)] #get that  value from the values  list
        datax['vals'][labels[l]] = count  #create a new object in  data dict for the value with the label as its name

    #print(datax)  #All  done!
    
    
    return datax

_ONS_SetDataset(ONS_CONFIG(),'QS501EW','2011WARDH','Census',{'dm/2011WARDH':'E05008481,E05003606'})

{'CI_0000368': 'All categories: Highest level of qualification'} {'CI_0000368': 0}

Out[397]:

{'keys': [{'id': 'CI_0000368',
   'label': 'All categories: Highest level of qualification'}],
 'vals': {'Kingsteignton East': 4841,
  'Brading, St Helens and Bembridge': 6031}}

Maybe the best way to handle the json-stat data is to use a library to parse it?

In [378]:

!pip3 install pyjstat

Downloading/unpacking pyjstat
  Downloading pyjstat-0.1.5.tar.gz (157kB): 157kB downloaded
  Running setup.py (path:/tmp/pip_build_root/pyjstat/setup.py) egg_info for package pyjstat
    
Requirement already satisfied (use --upgrade to upgrade): pandas in /usr/local/lib/python3.4/dist-packages (from pyjstat)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2 in /usr/local/lib/python3.4/dist-packages (from pandas->pyjstat)
Requirement already satisfied (use --upgrade to upgrade): pytz>=2011k in /usr/local/lib/python3.4/dist-packages (from pandas->pyjstat)
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0b2 in /usr/lib/python3/dist-packages (from pandas->pyjstat)
Requirement already satisfied (use --upgrade to upgrade): six in /usr/local/lib/python3.4/dist-packages (from python-dateutil>=2->pandas->pyjstat)
Installing collected packages: pyjstat
  Running setup.py install for pyjstat
    
Successfully installed pyjstat
Cleaning up...

In [379]:

from pyjstat import pyjstat

In [393]:

#Use example from ONS blog post
d=ONS_CONFIG()
d._ONS_SetDataset('QS104EW','2011WARDH','Census',{'dm/2011WARDH':'K04000001','totals':'false'})
r = requests.get(url=d.url,params=d.params)
data=r.json()

In [394]:

data

Out[394]:

{'QS104EW': {'source': 'Office for National Statistics',
  'label': 'Sex',
  'value': {'0': 56075912, '2': 28502536, '1': 27573376},
  'updated': '17/10/2013 16:51:00',
  'dimension': {'CL_0000137': {'category': {'label': {'CI_0000001': '2011'},
     'index': {'CI_0000001': 0}},
    'label': 'Time Dimension'},
   'size': [8965, 3, 1, 1],
   'CL_0000035': {'category': {'label': {'CI_0000070': 'Females',
      'CI_0000071': 'Males',
      'CI_0000121': 'All categories: Sex'},
     'index': {'CI_0000070': 2, 'CI_0000071': 1, 'CI_0000121': 0}},
    'label': 'Sex (T003A)'},
   '2011WARDH': {'category': {'label': {'K04000001': 'England and Wales'},
     'index': {'K04000001': 0}},
    'label': '2011 Administrative Hierarchy'},
   'id': ['2011WARDH', 'CL_0000035', 'Att_000001', 'CL_0000137'],
   'role': {'metric': ['Att_000001'],
    'geo': ['2011WARDH'],
    'time': ['CL_0000137']},
   'Att_000001': {'category': {'unit': {'Segment_1': {'multiplier': 'Units',
       'type': 'Count',
       'unit': 'Number',
       'label': 'All usual residents',
       'base': 'Person'}},
     'index': {'Segment_1': 0}},
    'label': 'Measures'}}}}

In [395]:

results = pyjstat.from_json_stat(data)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-395-88b8c10d54a7> in <module>()
----> 1 results = pyjstat.from_json_stat(data)

/usr/local/lib/python3.4/dist-packages/pyjstat/pyjstat.py in from_json_stat(datasets, naming)
    241         values = []
    242         js_dict = datasets[dataset]
--> 243         dimensions, dim_names = get_dimensions(js_dict, naming)
    244         values = get_values(js_dict)
    245         output = pd.DataFrame(columns=dim_names + [unicode('value', 'utf-8')],

/usr/local/lib/python3.4/dist-packages/pyjstat/pyjstat.py in get_dimensions(js_dict, naming)
     72             dim_name = dim
     73         if (naming == 'label'):
---> 74             dim_label = get_dim_label(js_dict, dim)
     75             dimensions.append(dim_label)
     76             dim_names.append(dim_name)

/usr/local/lib/python3.4/dist-packages/pyjstat/pyjstat.py in get_dim_label(js_dict, dim)
    106                                      dim_label.values()),
    107                                  index=dim_label.keys(),
--> 108                                  columns=['id', 'label'])
    109     return dim_label
    110 

/usr/local/lib/python3.4/dist-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    255                                          copy=copy)
    256         elif isinstance(data, collections.Iterator):
--> 257             raise TypeError("data argument can't be an iterator")
    258         else:
    259             try:

TypeError: data argument can't be an iterator

Hmm... so what's not working? The library or the format used to return the data from the API call?

FWIW:

example notebook using pyjstat on data from Statistics Norway http://nbviewer.ipython.org/gist/predicador37/9736709e086f685939eb

In [ ]: