In [9]:
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import ConjunctiveGraph, Namespace, Literal, RDF, RDFS, BNode, URIRef, XSD, Variable
import operator
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import urllib2, StringIO, csv
import re
%matplotlib inline
# Template for the named-graph URI of one cube: the literal 'CUBE' is
# substituted with the cube name before the graph is queried.
NG_TEMPLATE = 'http://lod.cedar-project.nl/resource/r1/CUBE'
# SPARQL endpoint that the queries in this notebook are sent to.
END_POINT = 'http://lod.cedar-project.nl:8080/sparql/cedar'
In [10]:
# Download the list of cube names, one name per line.
url = 'https://raw.githubusercontent.com/cgueret/Harmonize/master/cubes.txt'
response = urllib2.urlopen(url)
try:
    # Blank lines are dropped: an empty name cannot be split into
    # "<type>-<year>" by the cells that iterate over `cubes`.
    cubes = [line.strip() for line in response.read().splitlines() if line.strip()]
finally:
    # urllib2 responses are not context managers, so close explicitly.
    response.close()

What is the evolution of the number of teachers over time?

Teachers have the HISCO code 13000. This question is a simplified version of the one found on slide 30 of this presentation.

In [28]:
# Sum of cedar:populationSize over the observations whose occupation is
# HISCO 13000, computed separately for every BRT cube.
# Result: data_years[i] is the year of the i-th BRT cube and data_count[i]
# the matching total (0 when the endpoint returns no total).
data_years = []
data_count = []

# The endpoint client and the query text do not depend on the cube, so they
# are built once; only the named graph changes between iterations.
sparql = SPARQLWrapper(END_POINT)
sparql.setReturnFormat(JSON)
query_template = """
    prefix cedar: <http://cedar.example.org/ns#>
    prefix qb: <http://purl.org/linked-data/cube#>
    
    select (sum(?s) as ?total) from <GRAPH> where {
    ?o a qb:Observation.
    ?o cedar:occupation cedar:hisco-13000.
    ?o cedar:populationSize ?s.
    }
    """

for cube in cubes:
    # Cube names have the form "<type>-<year>". partition() never raises,
    # so blank or malformed names are filtered out by the type check.
    cube_type, _sep, cube_year = cube.partition('-')
    if cube_type != 'BRT':
        continue
    year = int(cube_year)
    named_graph = NG_TEMPLATE.replace('CUBE', cube)
    sparql.setQuery(query_template.replace('GRAPH', named_graph))
    bindings = sparql.query().convert()["results"]["bindings"]
    # The aggregate can come back with ?total unbound (or with no row at
    # all) when nothing matches; both cases count as 0.
    total = 0
    if bindings and 'total' in bindings[0]:
        total = int(bindings[0]['total']['value'])
    # Record the pair only after the query succeeded so both lists keep
    # the same length even if a request fails part-way through.
    data_years.append(year)
    data_count.append(total)
    
In [29]:
# Show the collected years next to their totals (Python 2 print statement).
print data_years, data_count
[1889, 1899, 1909, 1920, 1930, 1947] [17653, 9170, 267, 0, 0, 0]
In [31]:
# Draw the totals as a line with a marker on every data point.
fig, ax = plt.subplots()
ax.plot(data_years, data_count, alpha=0.5)
ax.scatter(data_years, data_count, alpha=0.5)
# Label the figure so it can be read without the surrounding cells.
ax.set_xlabel('Year')
ax.set_ylabel('Sum of cedar:populationSize')
ax.set_title('Teachers (HISCO 13000) per BRT cube')
plt.show()

What is the evolution of the number of married women?

In [32]:
# Sum of cedar:populationSize over the observations with marital status
# cedar:marital-Married and sex sdmx-code:sex-V, computed separately for
# every VT cube.
# Result: data_years[i] is the year of the i-th VT cube and data_count[i]
# the matching total (0 when the endpoint returns no total).
data_years = []
data_count = []

# The endpoint client and the query text do not depend on the cube, so they
# are built once; only the named graph changes between iterations.
sparql = SPARQLWrapper(END_POINT)
sparql.setReturnFormat(JSON)
query_template = """
    prefix cedar: <http://cedar.example.org/ns#>
    prefix qb: <http://purl.org/linked-data/cube#>
    prefix sdmx-dimension: <http://purl.org/linked-data/sdmx/2009/dimension#>
    prefix sdmx-code: <http://purl.org/linked-data/sdmx/2009/code#>
    
    select (sum(?s) as ?total) from <GRAPH> where {
    ?o a qb:Observation.
    ?o cedar:maritalStatus cedar:marital-Married.
    ?o sdmx-dimension:sex sdmx-code:sex-V.
    ?o cedar:populationSize ?s.
    }
    """

for cube in cubes:
    # Cube names have the form "<type>-<year>". partition() never raises,
    # so blank or malformed names are filtered out by the type check.
    cube_type, _sep, cube_year = cube.partition('-')
    if cube_type != 'VT':
        continue
    year = int(cube_year)
    named_graph = NG_TEMPLATE.replace('CUBE', cube)
    sparql.setQuery(query_template.replace('GRAPH', named_graph))
    bindings = sparql.query().convert()["results"]["bindings"]
    # The aggregate can come back with ?total unbound (or with no row at
    # all) when nothing matches; both cases count as 0.
    total = 0
    if bindings and 'total' in bindings[0]:
        total = int(bindings[0]['total']['value'])
    # Record the pair only after the query succeeded so both lists keep
    # the same length even if a request fails part-way through.
    data_years.append(year)
    data_count.append(total)
In [36]:
# Show the collected years next to their totals (Python 2 print statement).
print data_years, data_count
[1795, 1830, 1840, 1849, 1859, 1869, 1879, 1889, 1899, 1909, 1920, 1930, 1947, 1971] [0, 3782337, 3630140, 5070644, 7513441, 6504906, 3541892, 1263520, 14269943, 10292988, 12812929, 22087751, 12975842, 0]
In [34]:
# Draw the totals as a line with a marker on every data point.
fig, ax = plt.subplots()
ax.plot(data_years, data_count, alpha=0.5)
ax.scatter(data_years, data_count, alpha=0.5)
# Label the figure so it can be read without the surrounding cells.
ax.set_xlabel('Year')
ax.set_ylabel('Sum of cedar:populationSize')
ax.set_title('Married women per VT cube')
plt.show()

What is the evolution of the population of Amsterdam?

In [37]:
# Sum of cedar:populationSize over the observations whose city is
# cedar:ac-11150, computed separately for every VT cube.
# Result: data_years[i] is the year of the i-th VT cube and data_count[i]
# the matching total (0 when the endpoint returns no total).
# NOTE(review): the totals this query produces look too large for a single
# city; presumably observations broken down along several dimensions are
# all added together. Confirm against the structure of the cubes.
data_years = []
data_count = []

# The endpoint client and the query text do not depend on the cube, so they
# are built once; only the named graph changes between iterations.
sparql = SPARQLWrapper(END_POINT)
sparql.setReturnFormat(JSON)
query_template = """
    prefix cedar: <http://cedar.example.org/ns#>
    prefix qb: <http://purl.org/linked-data/cube#>
    prefix sdmx-dimension: <http://purl.org/linked-data/sdmx/2009/dimension#>
    prefix sdmx-code: <http://purl.org/linked-data/sdmx/2009/code#>
    
    select (sum(?s) as ?total) from <GRAPH> where {
    ?o a qb:Observation.
    ?o cedar:city cedar:ac-11150.
    ?o cedar:populationSize ?s.
    }
    """

for cube in cubes:
    # Cube names have the form "<type>-<year>". partition() never raises,
    # so blank or malformed names are filtered out by the type check.
    cube_type, _sep, cube_year = cube.partition('-')
    if cube_type != 'VT':
        continue
    year = int(cube_year)
    named_graph = NG_TEMPLATE.replace('CUBE', cube)
    sparql.setQuery(query_template.replace('GRAPH', named_graph))
    bindings = sparql.query().convert()["results"]["bindings"]
    # The aggregate can come back with ?total unbound (or with no row at
    # all) when nothing matches; both cases count as 0.
    total = 0
    if bindings and 'total' in bindings[0]:
        total = int(bindings[0]['total']['value'])
    # Record the pair only after the query succeeded so both lists keep
    # the same length even if a request fails part-way through.
    data_years.append(year)
    data_count.append(total)
In [38]:
# Show the collected years next to their totals (Python 2 print statement).
print data_years, data_count
[1795, 1830, 1840, 1849, 1859, 1869, 1879, 1889, 1899, 1909, 1920, 1930, 1947, 1971] [2389519, 606525, 0, 1470085, 8212669, 5137603, 4978346, 6423682, 11996651, 7260189, 16510509, 23924726, 2186887, 7358562]
In [40]:
# Draw the totals as a line with a marker on every data point.
fig, ax = plt.subplots()
ax.plot(data_years, data_count, alpha=0.5)
ax.scatter(data_years, data_count, alpha=0.5)
# Label the figure so it can be read without the surrounding cells.
ax.set_xlabel('Year')
ax.set_ylabel('Sum of cedar:populationSize')
ax.set_title('Population of Amsterdam (cedar:ac-11150) per VT cube')
plt.show()
In [ ]: