In this notebook you'll experience how to consume structured data available on Wikidata to extend and enrich data managed within Nexus.
This notebook assumes you've created a project within the AWS deployment of Blue Brain Nexus. If not, follow the Blue Brain Nexus Quick Start tutorial first.
You'll work through the following steps:
!pip install git+https://github.com/BlueBrain/nexus-cli
import nexussdk as nexus
# Nexus deployment to talk to, and the bearer token authenticating every call.
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"
token = "your token here"
nexus.config.set_environment(nexus_deployment)
nexus.config.set_token(token)
# Organization and project labels addressing your Nexus project.
org ="tutorialnexus"
#Provide your project name here.
project ="your_project_here"
Every project in Blue Brain Nexus comes with a SparqlView enabling you to navigate the data as a graph and to query it using the W3C SPARQL Language. The address of such a SparqlView is https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/$PROJECTLABEL/graph/sparql for a project with the label $PROJECTLABEL. The address of a SparqlView is also called a sparql endpoint.
!pip install git+https://github.com/RDFLib/sparqlwrapper
# Utility functions to create sparql wrapper around a sparql endpoint
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests
def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format= JSON, token=None):
    """Build a configured SPARQLWrapper client for a sparql endpoint.

    :param sparql_endpoint: URL of the sparql endpoint to wrap.
    :param http_query_method: HTTP verb used for queries (POST or GET).
    :param result_format: format the endpoint should return (e.g. JSON, CSV).
    :param token: optional bearer token added as an Authorization header.
    :return: a ready-to-use SPARQLWrapper instance.
    """
    client = SPARQLWrapper(sparql_endpoint)
    if token:
        client.addCustomHttpHeader("Authorization","Bearer {}".format(token))
    client.setMethod(http_query_method)
    client.setReturnFormat(result_format)
    # For POST, send the query directly as the request body.
    if http_query_method == POST:
        client.setRequestMethod(POSTDIRECTLY)
    return client
# Utility functions
import pandas as pd
# Show full (untruncated) cell contents in displayed frames.
# Fix: -1 was a legacy sentinel that modern pandas rejects; None is the
# documented "no limit" value for display.max_colwidth.
pd.set_option('display.max_colwidth', None)
# Convert SPARQL results into a Pandas data frame
def sparql2dataframe(json_sparql_results):
    """Flatten a SPARQL JSON result set into a pandas DataFrame.

    One column per projected variable; a variable missing from a given
    binding yields None in that row.
    """
    columns = json_sparql_results['head']['vars']
    rows = [
        [binding.get(var, {}).get('value') for var in columns]
        for binding in json_sparql_results['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=columns)
# Send a query using a sparql wrapper
def query_sparql(query, sparql_client):
    """Run `query` through `sparql_client` and return the converted result.

    When the client is configured for JSON, the wrapper's JSON converter is
    used directly; any other format goes through the generic convert().
    """
    sparql_client.setQuery(query)
    response = sparql_client.query()
    if sparql_client.returnFormat != JSON:
        return response.convert()
    return response._convertJSON()
# Let's create a sparql wrapper around the project's SparqlView endpoint.
# Nexus requires the auth token, POST queries and JSON results here.
sparqlview_endpoint = nexus_deployment+"/views/"+org+"/"+project+"/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,http_query_method= POST, result_format=JSON)
Let's test that the SparqlView wrapper works by running a simple SPARQL query to get 5 movies along with their titles.
# Fetch 5 movies with their Nexus id, revision and movie metadata.
# tmdbId is OPTIONAL so movies lacking that id are still returned.
# The %s placeholders are filled with the org and project labels below.
five_movie_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movie_nexus_Id ?movieId ?title ?genres ?imdbId ?tmdbId ?revision
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id nxv:rev ?revision.
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
} LIMIT 5
""" %(org, project)
nexus_results = query_sparql(five_movie_query,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
# Wikidata's public sparql endpoint: no token, queried here with GET.
wikidata_sparql_endpoint = "https://query.wikidata.org/sparql"
wikidata_sparql_wrapper = create_sparql_client(sparql_endpoint=wikidata_sparql_endpoint,http_query_method= GET, result_format=JSON)
Let's test that the wrapper works by running a query that fetches the logo URL for the movie with tmdbId 862 (Toy Story). You can run the following query in the Wikidata query playground: Try It.
In the query below:
# wdt:P4947 is the wikidata property for tmdbId
# wdt:P154 is the wikidata logo-image property; it is OPTIONAL so the
# movie row is returned even when no logo is recorded.
movie_logo_query = """
SELECT *
WHERE
{
?movie wdt:P4947 "%s".
OPTIONAL{
?movie wdt:P154 ?logo.
}
}
""" % (862)
wiki_results = query_sparql(movie_logo_query,wikidata_sparql_wrapper)
wiki_df =sparql2dataframe(wiki_results)
wiki_df.head()
Let's display the logo of the Toy Story movie. This part might take some time, but you can skip it.
from IPython.display import SVG, display
# Render the logo image inline as SVG.
# NOTE(review): assumes the first result row carries a 'logo' binding;
# a movie without a logo would make this lookup fail — confirm.
movie_logo_url = wiki_df.at[0,'logo']
display(SVG(movie_logo_url))
For every movie retrieved from the Nexus SparqlView, we will get:
Try it for the movie Toy Story.
from functools import reduce
import json
def panda_merge(df,df2, on):
cols_to_use = df2.columns.difference(df.columns)
dfNew = pd.merge(df, df2[cols_to_use], left_index=True, right_index=True, how='outer')
return dfNew
def panda_concatenate(dfs):
    """Stack a sequence of DataFrames vertically into a single frame."""
    return pd.concat(dfs)
# Query wikidata once per movie retrieved from Nexus and collect each
# result as a DataFrame. (Fix: a stray, unused variable `imdbdId` that
# re-read row['tmdbId'] has been removed.)
wiki_dataframes = []
for index, row in nexus_df.iterrows():
    # wdt:P4947 is the tmdbId property; the other properties pulled in are
    # logo, cast members with their given/family names, original language
    # and the (US-qualified) publication date — each OPTIONAL so partial
    # records still come back.
    movie_logo_query = """
SELECT ?tmdbId ?movie ?logo ?nativelanguage ?publication_date ?cast ?givenName ?familyName
WHERE
{
?movie wdt:P4947 ?tmdbId.
FILTER (?tmdbId = "%s").
OPTIONAL{
?movie wdt:P154 ?logo.
}
OPTIONAL{
?movie wdt:P725|wdt:P161 ?cast.
?cast wdt:P735/wdt:P1705 ?givenName.
?cast wdt:P734/wdt:P1705 ?familyName.
}
OPTIONAL{
?movie wdt:P364/wdt:P1705 ?nativelanguage.
}
OPTIONAL {
?movie p:P577 ?publication_date_node.
?publication_date_node ps:P577 ?publication_date. # publication date statement
?publication_date_node pq:P291 wd:Q30. # qualifier on the release date
}
}
""" % (row['tmdbId'])
    wiki_results = query_sparql(movie_logo_query,wikidata_sparql_wrapper)
    wiki_df =sparql2dataframe(wiki_results)
    print("""Display metadata (from wikidata) for %s""" %(nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'movie_nexus_Id'].iloc[0]))
    display(wiki_df.head())
    wiki_dataframes.append(wiki_df)
# Let's concatenate all dataframes obtained from wikidata.
result_wiki_dataframes = panda_concatenate(wiki_dataframes)
#display(result_wiki_dataframes.head())
merge_wiki_dataframes = panda_merge(result_wiki_dataframes,nexus_df,"tmdbId")
#display(merge_wiki_dataframes.head())
# Group per movie and nest the casting columns as a list of dicts under a
# 'casting' key, then serialize the whole frame to a JSON array.
# Fix: to_dict('r') was a deprecated alias removed in pandas 2.0; the
# documented orientation name is 'records'.
wiki_dataframes_tojson = (merge_wiki_dataframes.apply(lambda x: x.dropna(), 1).groupby(['tmdbId','movie','nativelanguage'], as_index=False)
    .apply(lambda x: x[['cast','givenName','familyName']].to_dict('records'))
    .reset_index()
    .rename(columns={0:'casting'})
    .to_json(orient='records'))
updated_movies_json = json.loads(wiki_dataframes_tojson)
# We obtained a json array to be loaded
from urllib.parse import urlencode, quote_plus
def update_in_nexus(row):
    """Merge the Nexus-side metadata for one movie into `row` (mutated in
    place) and push the updated resource back to Nexus.

    :param row: a wikidata-enriched movie dict carrying at least 'tmdbId'.
    """
    row["@type"]= "Movie"
    # Fix: look up the matching nexus_df row once instead of re-scanning the
    # whole frame for every field (the original also bound an unused `rev`).
    match = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId']]
    _id = match['movie_nexus_Id'].iloc[0]
    for col in ('title', 'genres', 'movieId', 'imdbId'):
        row[col] = match[col].iloc[0]
    # Re-fetch the resource so the update targets its current revision.
    data = nexus.resources.fetch(org_label=org,project_label=project,resource_id=_id,schema_id="_")
    current_revision = data["_rev"]
    url = nexus_deployment+"/resources/"+org+"/"+project+"/"+"_/"+quote_plus(_id)
    row["_self"] = url
    nexus.resources.update(resource=row, rev=current_revision)
# Push every wikidata-enriched movie back into Nexus.
for item in updated_movies_json:
    update_in_nexus(item)
The data can now be listed with the casting metadata obtained from wikidata.
# List movies with their casting names separated by a comma.
# group_concat folds all cast names of a movie into one ?casting value;
# every non-aggregated projected variable therefore appears in Group By.
movie_acting_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId (group_concat(DISTINCT ?castname;separator=", ") as ?casting)
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id vocab:casting ?cast.
?cast vocab:givenName ?givenName.
?cast vocab:familyName ?familyName.
BIND (CONCAT(?givenName, " ", ?familyName) AS ?castname).
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
}
Group By ?movieId ?title ?genres ?imdbId ?tmdbId
LIMIT 100
""" %(org,project)
nexus_updated_results = query_sparql(movie_acting_query,sparqlview_wrapper)
nexus_df_updated=sparql2dataframe(nexus_updated_results)
nexus_df_updated.head(100)
# Pick an actor and find movies within which they acted
# The given and family names are taken from the first cast entry of the
# first updated movie, then interpolated into the query as literals.
actor = updated_movies_json[0]["casting"][0]
given_name = actor["givenName"]
family_name = actor["familyName"]
query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id vocab:casting ?cast.
?cast vocab:familyName "%s".
?cast vocab:givenName "%s".
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
} LIMIT 5
""" % (org, project, family_name, given_name)
nexus_results = query_sparql(query,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()