Introduction

This first part is the usual uninteresting setup: we first load the libraries we will use later on.

In [27]:
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import ConjunctiveGraph, Namespace, Literal, RDF, RDFS, BNode, URIRef, XSD, Variable
import operator
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import urllib.request, csv
import re
%matplotlib inline
# Template for the named graph that stores each table
NG_TEMPLATE = 'http://lod.cedar-project.nl/resource/v2/TABLE'
# The CEDAR SPARQL endpoint
END_POINT = 'http://lod.cedar-project.nl:8080/sparql/cedar'
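
To make the template concrete, here is the named graph URI built for a made-up table identifier (VT_1859_01_H1 is only an illustration, not necessarily a real table name):

In [ ]:
print(NG_TEMPLATE.replace('TABLE', 'VT_1859_01_H1'))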

This function will be used to clean up the header labels

In [9]:
def clean_string(text):
    # Remove dots, turn underscores into spaces, and lowercase
    text_clean = text.replace('.', '').replace('_', ' ').lower()
    # Collapse consecutive whitespace into a single space
    text_clean = re.sub(r'\s+', ' ', text_clean)
    # Remove leading and trailing whitespace
    text_clean = text_clean.strip()
    return text_clean
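
As a quick sanity check, here is what clean_string does to a made-up label (the input string is just an illustration, not a real header):

In [ ]:
print(clean_string('Totaal._der_Mannen  in de  Provincie.'))
# prints: totaal der mannen in de provincie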

We then get the list of tables

In [10]:
url = 'https://raw.githubusercontent.com/cgueret/Harmonize/master/tables.txt'
tables = [table.strip() for table in urllib.request.urlopen(url).read().decode('utf-8').splitlines()]
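
A quick peek at what was loaded; the exact number of tables depends on the current contents of tables.txt:

In [ ]:
print(len(tables))
print(tables[:5])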

Some queries over the column headers

We first get the raw data out of the SPARQL endpoint (this takes a while!)

In [13]:
headers = {}
for table in tables:
    # Each table is stored in its own named graph
    named_graph = NG_TEMPLATE.replace('TABLE', table)
    sparql = SPARQLWrapper(END_POINT)
    query = """
    prefix tablink: <http://example.org/ns#>
    prefix skos: <http://www.w3.org/2004/02/skos/core#>
    select distinct ?header ?label ?parent from <GRAPH> where {
    ?header a tablink:ColumnHeader;
            tablink:subColHeaderOf ?parent;
            skos:prefLabel ?label.
    } 
    """.replace('GRAPH',named_graph)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    table_headers = {}
    for result in results["results"]["bindings"]:
        h = URIRef(result['header']['value'])
        p = URIRef(result['parent']['value'])
        l = Literal(result['label']['value'])
        table_headers[h] = {'parent':p, 'label':l}
    headers[table] = table_headers
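
Since this harvest is slow, it can be worth caching the result locally so the notebook can be re-run without querying the endpoint again. A minimal sketch using pickle (the file name is an arbitrary choice):

In [ ]:
import pickle

# Save the harvested headers to disk
with open('headers.pickle', 'wb') as f:
    pickle.dump(headers, f)

# On a later run, reload them instead of querying the endpoint:
# with open('headers.pickle', 'rb') as f:
#     headers = pickle.load(f)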

Which keywords are most commonly found as column headers?

We first build the frequency table of the keywords

In [17]:
keyword_count = {}
for (table, table_headers) in headers.items():
    for (header, data) in table_headers.items():
        clean_label = clean_string(data['label'])
        for label_part in clean_label.split(' '):
            if label_part not in keyword_count:
                keyword_count[label_part] = 0
            keyword_count[label_part] = keyword_count[label_part] + 1
sorted_keyword_count = sorted(keyword_count.items(), key=operator.itemgetter(1), reverse=True)
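
As a side note, the same counting can be written more compactly with collections.Counter; the result is equivalent to the loop above:

In [ ]:
from collections import Counter

keyword_counter = Counter()
for table_headers in headers.values():
    for data in table_headers.values():
        keyword_counter.update(clean_string(data['label']).split(' '))
# most_common() returns (keyword, count) pairs sorted by decreasing count,
# the same content as sorted_keyword_count
sorted_by_count = keyword_counter.most_common()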

We print the top 15 results

In [18]:
for (k, v) in sorted_keyword_count[:15]:
    print('{0:<8} => {1}'.format(v, k))
9642     => m
9509     => v
4775     => totaal
4673     => gehuwd
3630     => in
2841     => vrouwen
2720     => mannen
2565     => en
2545     => ongehuwd
2320     => geweest
1825     => der
1768     => van
1529     => de
1389     => o
1332     => provincie

We can also plot the complete distribution to see how the frequency evolves with the rank of the keyword

In [19]:
values = [v for (k, v) in sorted_keyword_count]
plt.semilogy(range(len(values)), values, 'ro')
# The y axis is log-scaled, so its lower bound must be positive
plt.axis([0, len(values), 1, values[0]])
plt.xlabel('Rank of the keyword')
plt.ylabel('Frequency')
plt.show()
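
To see how heavy the tail is, we can count how many keywords occur only once:

In [ ]:
singletons = [k for (k, v) in sorted_keyword_count if v == 1]
print(len(singletons), 'of the', len(sorted_keyword_count), 'keywords appear only once')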

Now let's generate a set of keywords per table

In [22]:
keyword_sets = {}
for (table, table_headers) in headers.items():
    keyword_sets[table] = set()
    for (header, data) in table_headers.items():
        clean_label = clean_string(data['label'])
        for label_part in clean_label.split(' '):
            keyword_sets[table].add(label_part)
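
A quick look at how large these keyword sets are:

In [ ]:
sizes = [len(s) for s in keyword_sets.values()]
print('min:', min(sizes), 'max:', max(sizes), 'mean:', sum(sizes) / len(sizes))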

We then use these sets to create a network based on the size of the set intersections

In [42]:
# Create the edges
edges = []
sorted_tb_names = sorted(keyword_sets.keys())
maximum_size = 0.0
for i in range(0, len(sorted_tb_names)):
    for j in range(i+1, len(sorted_tb_names)):
        table_a = sorted_tb_names[i]
        table_b = sorted_tb_names[j]
        size = float(len(keyword_sets[table_a] & keyword_sets[table_b]))
        if size > 0:
            edges.append((table_a, table_b, size))
            if size > maximum_size:
                maximum_size = size
# Normalize the weights
for i in range(0, len(edges)):
    (a,b,s) = edges[i]
    edges[i] = (a, b, s/maximum_size)        
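
An alternative weighting, not used here, is the Jaccard index, which normalizes each intersection by the size of the corresponding union instead of by the global maximum. A sketch of that variant (jaccard_edges is a name introduced for the illustration):

In [ ]:
jaccard_edges = []
for i in range(len(sorted_tb_names)):
    for j in range(i + 1, len(sorted_tb_names)):
        table_a, table_b = sorted_tb_names[i], sorted_tb_names[j]
        intersection = keyword_sets[table_a] & keyword_sets[table_b]
        if intersection:
            union = keyword_sets[table_a] | keyword_sets[table_b]
            # Jaccard similarity is already in [0, 1], no further normalization needed
            jaccard_edges.append((table_a, table_b, len(intersection) / len(union)))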

Now we can load this network into networkx and plot it :-)

In [59]:
# Create the graph
G = nx.Graph()
# Add the nodes
for node in sorted_tb_names:
    G.add_node(node)
# Add the edges
for (src, dst, weight) in edges:
    if weight > 0.025: # Filter a bit to get fewer edges
        G.add_edge(src, dst, weight=weight)
# Position the nodes
pos = nx.spring_layout(G, iterations=100)
# Color the nodes according to their type
# (beroep/occupations in blue, volk/population in green and woning/housing in red)
color_map = {'BRT': 'b', 'VT': 'g', 'WT': 'r'}
typed_nodes = {}
for node in sorted_tb_names:
    node_type = node.split('_')[0]
    typed_nodes.setdefault(node_type, []).append(node)
# Plot the network
plt.figure(figsize=(15, 15))
plt.axis('off')
nx.draw_networkx_edges(G, pos, alpha=0.4, width=1)
for node_type in color_map.keys():
    nx.draw_networkx_nodes(G, pos, alpha=0.6, node_size=80,
                           nodelist=typed_nodes[node_type],
                           node_color=color_map[node_type])
plt.show()
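
As a quick follow-up, networkx can report some basic statistics on the resulting graph:

In [ ]:
print(G.number_of_nodes(), 'nodes and', G.number_of_edges(), 'edges')
print('connected components:', nx.number_connected_components(G))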