In this notebook you'll experience how to consume structured data available on Wikidata to extend and enrich data managed within Nexus.
This notebook assumes you've created a project within the AWS deployment of Blue Brain Nexus. If not, follow the Blue Brain Nexus Quick Start tutorial first.
You'll work through the following steps:
!pip install git+https://github.com/BlueBrain/nexus-cli
import nexussdk as nexus
# Nexus deployment to talk to, and the bearer token authenticating every call.
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"
token = "your token here"
nexus.config.set_environment(nexus_deployment)
nexus.config.set_token(token)
# Organization and project labels addressing your Nexus project.
org ="tutorialnexus"
#Provide your project name here.
project ="your_project_here"
Every project in Blue Brain Nexus comes with a SparqlView enabling you to navigate the data as a graph and to query it using the W3C SPARQL Language. The address of such a SparqlView is https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/$PROJECTLABEL/graph/sparql for a project with the label $PROJECTLABEL. The address of a SparqlView is also called a sparql endpoint.
!pip install git+https://github.com/RDFLib/sparqlwrapper
# Utility functions to create sparql wrapper around a sparql endpoint
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests
def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format= JSON, token=None):
    """Build a configured SPARQLWrapper client for a sparql endpoint.

    :param sparql_endpoint: URL of the sparql endpoint to wrap.
    :param http_query_method: HTTP verb used for queries (POST or GET).
    :param result_format: format the endpoint should return (e.g. JSON, CSV).
    :param token: optional bearer token added as an Authorization header.
    :return: a ready-to-use SPARQLWrapper instance.
    """
    client = SPARQLWrapper(sparql_endpoint)
    if token:
        client.addCustomHttpHeader("Authorization","Bearer {}".format(token))
    client.setMethod(http_query_method)
    client.setReturnFormat(result_format)
    # For POST, send the query directly as the request body.
    if http_query_method == POST:
        client.setRequestMethod(POSTDIRECTLY)
    return client
# Utility functions
import pandas as pd
# Show full (untruncated) cell contents in displayed frames.
# Fix: -1 was a legacy sentinel that modern pandas rejects; None is the
# documented "no limit" value for display.max_colwidth.
pd.set_option('display.max_colwidth', None)
# Convert SPARQL results into a Pandas data frame
def sparql2dataframe(json_sparql_results):
    """Flatten a SPARQL JSON result set into a pandas DataFrame.

    One column per projected variable; a variable missing from a given
    binding yields None in that row.
    """
    columns = json_sparql_results['head']['vars']
    rows = [
        [binding.get(var, {}).get('value') for var in columns]
        for binding in json_sparql_results['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=columns)
# Send a query using a sparql wrapper
def query_sparql(query, sparql_client):
    """Run `query` through `sparql_client` and return the converted result.

    When the client is configured for JSON, the wrapper's JSON converter is
    used directly; any other format goes through the generic convert().
    """
    sparql_client.setQuery(query)
    response = sparql_client.query()
    if sparql_client.returnFormat != JSON:
        return response.convert()
    return response._convertJSON()
# Let's create a sparql wrapper around the project's SparqlView endpoint.
# Nexus requires the auth token, POST queries and JSON results here.
sparqlview_endpoint = nexus_deployment+"/views/"+org+"/"+project+"/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,http_query_method= POST, result_format=JSON)
Let's test that the SparqlView wrapper works by running a simple SPARQL query to get 5 movies along with their titles.
# Fetch 5 movies with their Nexus id, revision and movie metadata.
# tmdbId is OPTIONAL so movies lacking that id are still returned.
# The %s placeholders are filled with the org and project labels below.
five_movie_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movie_nexus_Id ?movieId ?title ?genres ?imdbId ?tmdbId ?revision
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id nxv:rev ?revision.
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
} LIMIT 5
""" %(org, project)
nexus_results = query_sparql(five_movie_query,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
# Wikidata's public sparql endpoint: no token, queried here with GET.
wikidata_sparql_endpoint = "https://query.wikidata.org/sparql"
wikidata_sparql_wrapper = create_sparql_client(sparql_endpoint=wikidata_sparql_endpoint,http_query_method= GET, result_format=JSON)
Let's test that the wrapper works by running a query that fetches the logo URL for the movie with tmdbId 862 (Toy Story). You can run the following query in the Wikidata query playground: Try It.
In the query below:
# wdt:P4947 is the wikidata property for tmdbId
# wdt:P154 is the wikidata logo-image property; it is OPTIONAL so the
# movie row is returned even when no logo is recorded.
movie_logo_query = """
SELECT *
WHERE
{
?movie wdt:P4947 "%s".
OPTIONAL{
?movie wdt:P154 ?logo.
}
}
""" % (862)
wiki_results = query_sparql(movie_logo_query,wikidata_sparql_wrapper)
wiki_df =sparql2dataframe(wiki_results)
wiki_df.head()
Let's display the logo of the Toy Story movie. This part might take some time, but you can skip it.
from IPython.display import SVG, display
# Render the logo image inline as SVG.
# NOTE(review): assumes the first result row carries a 'logo' binding;
# a movie without a logo would make this lookup fail — confirm.
movie_logo_url = wiki_df.at[0,'logo']
display(SVG(movie_logo_url))
For every movie retrieved from the Nexus SparqlView, we will get:
Try it for the movie Toy Story.
from functools import reduce
import json
def panda_merge(df,df2, on):
cols_to_use = df2.columns.difference(df.columns)
dfNew = pd.merge(df, df2[cols_to_use], left_index=True, right_index=True, how='outer')
return dfNew
def panda_concatenate(dfs):
    """Stack a sequence of DataFrames vertically into a single frame."""
    return pd.concat(dfs)
# Query wikidata once per movie retrieved from Nexus and collect each
# result as a DataFrame. (Fix: a stray, unused variable `imdbdId` that
# re-read row['tmdbId'] has been removed.)
wiki_dataframes = []
for index, row in nexus_df.iterrows():
    # wdt:P4947 is the tmdbId property; the other properties pulled in are
    # logo, cast members with their given/family names, original language
    # and the (US-qualified) publication date — each OPTIONAL so partial
    # records still come back.
    movie_logo_query = """
SELECT ?tmdbId ?movie ?logo ?nativelanguage ?publication_date ?cast ?givenName ?familyName
WHERE
{
?movie wdt:P4947 ?tmdbId.
FILTER (?tmdbId = "%s").
OPTIONAL{
?movie wdt:P154 ?logo.
}
OPTIONAL{
?movie wdt:P725|wdt:P161 ?cast.
?cast wdt:P735/wdt:P1705 ?givenName.
?cast wdt:P734/wdt:P1705 ?familyName.
}
OPTIONAL{
?movie wdt:P364/wdt:P1705 ?nativelanguage.
}
OPTIONAL {
?movie p:P577 ?publication_date_node.
?publication_date_node ps:P577 ?publication_date. # publication date statement
?publication_date_node pq:P291 wd:Q30. # qualifier on the release date
}
}
""" % (row['tmdbId'])
    wiki_results = query_sparql(movie_logo_query,wikidata_sparql_wrapper)
    wiki_df =sparql2dataframe(wiki_results)
    print("""Display metadata (from wikidata) for %s""" %(nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'movie_nexus_Id'].iloc[0]))
    display(wiki_df.head())
    wiki_dataframes.append(wiki_df)
# Let's concatenate all dataframes obtained from wikidata.
result_wiki_dataframes = panda_concatenate(wiki_dataframes)
#display(result_wiki_dataframes.head())
merge_wiki_dataframes = panda_merge(result_wiki_dataframes,nexus_df,"tmdbId")
#display(merge_wiki_dataframes.head())
# Group per movie and nest the casting columns as a list of dicts under a
# 'casting' key, then serialize the whole frame to a JSON array.
# Fix: to_dict('r') was a deprecated alias removed in pandas 2.0; the
# documented orientation name is 'records'.
wiki_dataframes_tojson = (merge_wiki_dataframes.apply(lambda x: x.dropna(), 1).groupby(['tmdbId','movie','nativelanguage'], as_index=False)
    .apply(lambda x: x[['cast','givenName','familyName']].to_dict('records'))
    .reset_index()
    .rename(columns={0:'casting'})
    .to_json(orient='records'))
updated_movies_json = json.loads(wiki_dataframes_tojson)
# We obtained a json array to be loaded
from urllib.parse import urlencode, quote_plus
def update_in_nexus(row):
    """Merge the Nexus-side metadata for one movie into `row` (mutated in
    place) and push the updated resource back to Nexus.

    :param row: a wikidata-enriched movie dict carrying at least 'tmdbId'.
    """
    row["@type"]= "Movie"
    # Fix: look up the matching nexus_df row once instead of re-scanning the
    # whole frame for every field (the original also bound an unused `rev`).
    match = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId']]
    _id = match['movie_nexus_Id'].iloc[0]
    for col in ('title', 'genres', 'movieId', 'imdbId'):
        row[col] = match[col].iloc[0]
    # Re-fetch the resource so the update targets its current revision.
    data = nexus.resources.fetch(org_label=org,project_label=project,resource_id=_id,schema_id="_")
    current_revision = data["_rev"]
    url = nexus_deployment+"/resources/"+org+"/"+project+"/"+"_/"+quote_plus(_id)
    row["_self"] = url
    nexus.resources.update(resource=row, rev=current_revision)
# Push every wikidata-enriched movie back into Nexus.
for item in updated_movies_json:
    update_in_nexus(item)
The data can now be listed with the casting metadata obtained from wikidata.
# List movies with their casting names separated by a comma.
# group_concat folds all cast names of a movie into one ?casting value;
# every non-aggregated projected variable therefore appears in Group By.
movie_acting_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId (group_concat(DISTINCT ?castname;separator=", ") as ?casting)
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id vocab:casting ?cast.
?cast vocab:givenName ?givenName.
?cast vocab:familyName ?familyName.
BIND (CONCAT(?givenName, " ", ?familyName) AS ?castname).
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
}
Group By ?movieId ?title ?genres ?imdbId ?tmdbId
LIMIT 100
""" %(org,project)
nexus_updated_results = query_sparql(movie_acting_query,sparqlview_wrapper)
nexus_df_updated=sparql2dataframe(nexus_updated_results)
nexus_df_updated.head(100)
# Pick an actor and find movies within which they acted
# The given and family names are taken from the first cast entry of the
# first updated movie, then interpolated into the query as literals.
actor = updated_movies_json[0]["casting"][0]
given_name = actor["givenName"]
family_name = actor["familyName"]
query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id vocab:casting ?cast.
?cast vocab:familyName "%s".
?cast vocab:givenName "%s".
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
} LIMIT 5
""" % (org, project, family_name, given_name)
nexus_results = query_sparql(query,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()