The goal of this notebook is to learn the basics of SPARQL. Only the READ part of SPARQL will be exposed.
This notebook assumes you've created a project within the AWS deployment of Nexus. If not follow the Blue Brain Nexus Quick Start tutorial.
You'll work through the following steps:
Every project in Blue Brain Nexus comes with a SparqlView enabling to navigate the data as a graph and to query it using the W3C SPARQL Language. The address of such SparqlView is https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/%5C$PROJECTLABEL/graph/sparql for a project withe label $PROJECTLABEL. The address of a SparqlView is also called a SPARQL endpoint.
#Configuration for the Nexus deployment
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"
token= "your token here"
org ="tutorialnexus"
project ="$PROJECTLABEL"
headers = {}
#Let install sparqlwrapper which a python wrapper around sparql client
!pip install git+https://github.com/RDFLib/sparqlwrapper
# Utility functions to create sparql wrapper around a sparql endpoint
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests
def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format= JSON, token=None):
sparql_client = SPARQLWrapper(sparql_endpoint)
#sparql_client.addCustomHttpHeader("Content-Type", "application/sparql-query")
if token:
sparql_client.addCustomHttpHeader("Authorization","Bearer {}".format(token))
sparql_client.setMethod(http_query_method)
sparql_client.setReturnFormat(result_format)
if http_query_method == POST:
sparql_client.setRequestMethod(POSTDIRECTLY)
return sparql_client
# Utility functions
import pandas as pd
pd.set_option('display.max_colwidth', -1)
# Convert SPARQL results into a Pandas data frame
def sparql2dataframe(json_sparql_results):
cols = json_sparql_results['head']['vars']
out = []
for row in json_sparql_results['results']['bindings']:
item = []
for c in cols:
item.append(row.get(c, {}).get('value'))
out.append(item)
return pd.DataFrame(out, columns=cols)
# Send a query using a sparql wrapper
def query_sparql(query, sparql_client):
sparql_client.setQuery(query)
result_object = sparql_client.query()
if sparql_client.returnFormat == JSON:
return result_object._convertJSON()
return result_object.convert()
# Let create a sparql wrapper around the project sparql view
sparqlview_endpoint = nexus_deployment+"/views/"+org+"/"+project+"/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,http_query_method= POST, result_format=JSON)
Let write our first query.
select_all_query = """
SELECT ?s ?p ?o
WHERE
{
?s ?p ?o
}
OFFSET 0
LIMIT 5
"""
nexus_results = query_sparql(select_all_query,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
Most SPARQL queries you'll see will have the anotomy above with:
Multiple triples can be provided as graph pattern to match but each triple should end with a period. As an example, let retrieve 5 movies (?movie) along with their titles (?title).
movie_with_title = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movie ?title
WHERE {
?movie a vocab:Movie.
?movie vocab:title ?title.
} LIMIT 5
"""%(org,project)
nexus_results = query_sparql(movie_with_title,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
Note PREFIX clauses. It is way to shorten URIS within a SPARQL query. Without them we would have to use full URI for all properties.
The ?movie variable is bound to a URI (the internal Nexus id). Let retrieve the movieId just like in the MovieLens csv files for simplicity.
movie_with_title = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
#
?movie vocab:title ?title.
} LIMIT 5
"""%(org,project)
nexus_results = query_sparql(movie_with_title,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
In the above query movies are things (or entities) of type vocab:Movie. This is a typical instance query where entities are filtered by their type(s) and then some of their properties are retrieved (here ?title).
Let retrieve everything that is linked (outgoing) to the movies. The * character in the SELECT clause indicates to retreve all variables: ?movie, ?p, ?o
movie_with_properties = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select *
WHERE {
?movie a vocab:Movie.
?movie ?p ?o.
} LIMIT 20
"""%(org,project)
nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)
As a little exercise, write a query retrieving incoming entities to movies. You can copy past the query above and modify it.
Hints: ?s ?p ?o can be read as: ?o is linked to ?s with an outgoing link.
Do you have results ?
#Your query here
Let retrieve the movie ratings
movie_with_properties = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?userId ?movieId ?rating ?timestamp
WHERE {
?movie a vocab:Movie.
?movie vocab:movieId ?movieId.
?ratingNode vocab:movieId ?ratingmovieId.
?ratingNode vocab:rating ?rating.
?ratingNode vocab:userId ?userId.
?ratingNode vocab:timestamp ?timestamp.
# Somehow pandas is movieId as double for rating
FILTER(xsd:integer(?ratingmovieId) = ?movieId)
} LIMIT 20
"""%(org,project)
nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)
As a little exercise, write a query retrieving the movie tags along with the user id and timestamp. You can copy and past the query above and modify it.
#Your query here
Aggregates apply some operations over a group of solutions. Available aggregates are: COUNT, SUM, MIN, MAX, AVG, GROUP_CONCAT, and SAMPLE.
We will not see them all but we'll look at some examples.
The next query will compute the average rating score for 'funny' movies.
tag_value = "funny"
movie_avg_ratings = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ( AVG(?ratingvalue) AS ?score)
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
?tag vocab:movieId ?movieId.
?tag vocab:tag ?tagvalue.
FILTER(?tagvalue = "%s").
# Keep movies with ratings
?rating vocab:movieId ?ratingmovidId.
FILTER(xsd:integer(?ratingmovidId) = xsd:integer(?movieId))
?rating vocab:rating ?ratingvalue.
}
""" %(org,project,tag_value)
nexus_results = query_sparql(movie_avg_ratings,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
display(nexus_df.head(20))
nexus_df=nexus_df.astype(float)
Retrieve the number of tags per movie. Can be a little bit slow depending on the size of your data.
nbr_tags_per_movie = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?title (COUNT(?tagvalue) as ?tagnumber)
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
?tag a vocab:Tag.
?tag vocab:movieId ?tagmovieId.
FILTER(?tagmovieId = ?movieId)
?movie vocab:title ?title.
?tag vocab:tag ?tagvalue.
}
GROUP BY ?title
ORDER BY DESC(?tagnumber)
LIMIT 10
""" %(org,project)
nexus_results = query_sparql(nbr_tags_per_movie,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
display(nexus_df.head(20))
#Let plot the result
nexus_df.tagnumber = pd.to_numeric(nexus_df.tagnumber)
nexus_df.plot(x="title",y="tagnumber",kind="bar")
The next query will retrieve movies along with users that tagged them separated by a comma
# Group Concat
movie_tag_users = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId (group_concat(DISTINCT ?userId;separator=",") as ?users)
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
?tag vocab:movieId ?movieId.
?tag vocab:userId ?userId.
}
GROUP BY ?movieId
LIMIT 10
"""%(org,project)
nexus_results = query_sparql(movie_tag_users,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)