#!/usr/bin/env python
# coding: utf-8

# # Loading the Cora dataset into a Neo4j database
# This notebook demonstrates how to load the Cora dataset into a Neo4j graph database.

# In[1]:


# install StellarGraph if running on Google Colab
import sys

if "google.colab" in sys.modules:
    get_ipython().run_line_magic('pip', 'install -q stellargraph[demos]==1.2.1')


# In[2]:


# verify that we're using the correct version of StellarGraph for this notebook
import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed. Please see ."
    ) from None


# In[3]:


import pandas as pd
import os

from stellargraph import datasets
from IPython.display import display, HTML


# ## Load the Cora dataset
#
# (See [the "Loading from Pandas" demo](../../basics/loading-pandas.ipynb) for details on how data can be loaded.)

# In[4]:


dataset = datasets.Cora()
display(HTML(dataset.description))
dataset.download()


# In[5]:


edge_list = pd.read_csv(
    os.path.join(dataset.data_directory, "cora.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)
edge_list["label"] = "cites"


# In[6]:


display(edge_list.head(5))


# In[7]:


feature_names = ["w_{}".format(ii) for ii in range(1433)]
column_names = feature_names + ["subject"]
node_list = pd.read_csv(
    os.path.join(dataset.data_directory, "cora.content"),
    sep="\t",
    header=None,
    names=column_names,
)


# ## Preprocess data

# In[8]:


# gather all features into lists under a single 'features' column
node_list["features"] = node_list[feature_names].values.tolist()
node_list = node_list.drop(columns=feature_names)
node_list["id"] = node_list.index
node_list.head(5)


# ## Ingest data into the Neo4j database
#
# We define the graph schema as follows:
#
# - Each vertex represents a paper:
#   + subject (String): the class the paper belongs to; there are seven classes in total.
#   + features (List[int]): a binary vector indicating the presence of each word in the dictionary.
#   + ID (int): the id of each paper. (**Note**: this ID attribute is different from the Neo4j id, i.e., the id that Neo4j automatically assigns to each node or relationship.)
#
# - Each *directed* edge represents a citation; the edge points to the paper being cited.
#
# As the Cora dataset is small, we can use Cypher queries and execute the transactions via a Python driver.
#
# For bigger datasets, this loading job might take very long, so it is more convenient to use the ```neo4j-admin import``` tool, [tutorial here](https://neo4j.com/docs/operations-manual/current/tutorial/import-tool/); a sketch of that approach is shown after the cleanup cells below.

# In[9]:


import time


# In[10]:


import py2neo

default_host = os.environ.get("STELLARGRAPH_NEO4J_HOST")

# Create the Neo4j Graph database object; the arguments can be edited to specify location and authentication
graph = py2neo.Graph(host=default_host, port=None, user=None, password=None)


# Delete the existing nodes and relationships in the current database.

# In[11]:


empty_db_query = """
    MATCH (n)
    DETACH DELETE n
    """

tx = graph.begin(autocommit=True)
tx.evaluate(empty_db_query)


# Delete any existing constraints or indexes in the current database.

# In[12]:


constraints = graph.run("CALL db.constraints").data()
for constraint in constraints:
    graph.run(f"DROP CONSTRAINT {constraint['name']}")

indexes = graph.run("CALL db.indexes").data()
for index in indexes:
    graph.run(f"DROP INDEX {index['name']}")
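# As mentioned above, for larger graphs it can be much faster to bulk-load CSV files with the ```neo4j-admin import``` tool instead of running Cypher transactions. The cell below is a minimal, hypothetical sketch of how the two dataframes could be exported in the header style the import tool expects; the file names are illustrative, the exact header syntax should be checked against the import-tool documentation for your Neo4j version, and this cell is not required for the rest of the notebook.

# In[ ]:


# hypothetical CSV export for `neo4j-admin import`; adjust paths and headers as needed
nodes_for_import = node_list.rename(columns={"id": "ID:ID", "subject": "subject:string"})
# flatten each feature list to a `;`-separated string, matching the import
# tool's default array delimiter for a `long[]` column
nodes_for_import["features:long[]"] = node_list["features"].apply(
    lambda fs: ";".join(str(f) for f in fs)
)
nodes_for_import[":LABEL"] = "paper"
nodes_for_import.drop(columns=["features"]).to_csv("papers.csv", index=False)

edges_for_import = edge_list.rename(
    columns={"source": ":START_ID", "target": ":END_ID", "label": ":TYPE"}
)
edges_for_import.to_csv("cites.csv", index=False)

# then, with the database stopped, run something like:
#   neo4j-admin import --nodes=papers.csv --relationships=cites.csv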
# Load all nodes into the graph database.

# In[13]:


loading_node_query = """
    UNWIND $node_list AS node
    CREATE (e:paper {
        ID: toInteger(node.id),
        subject: node.subject,
        features: node.features
    })
    """

# for efficient loading, we send the nodes to Neo4j in batches
batch_len = 500

for batch_start in range(0, len(node_list), batch_len):
    batch_end = batch_start + batch_len
    # turn the node dataframe slice into a list of records
    records = node_list.iloc[batch_start:batch_end].to_dict("records")
    tx = graph.begin(autocommit=True)
    tx.evaluate(loading_node_query, parameters={"node_list": records})


# Load all edges into the graph database.

# In[14]:


loading_edge_query = """
    UNWIND $edge_list AS edge
    MATCH (source:paper {ID: toInteger(edge.source)})
    MATCH (target:paper {ID: toInteger(edge.target)})
    MERGE (source)-[r:cites]->(target)
    """

batch_len = 500

for batch_start in range(0, len(edge_list), batch_len):
    batch_end = batch_start + batch_len
    # turn the edge dataframe slice into a list of records
    records = edge_list.iloc[batch_start:batch_end].to_dict("records")
    tx = graph.begin(autocommit=True)
    tx.evaluate(loading_edge_query, parameters={"edge_list": records})


# Ensure node IDs are unique. Creating this constraint also automatically creates an index, which improves the performance of querying nodes by ID.

# In[15]:


node_id_constraint = """
    CREATE CONSTRAINT
    ON (n:paper)
    ASSERT n.ID IS UNIQUE
    """

tx = graph.begin(autocommit=True)
tx.evaluate(node_id_constraint)
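# After loading, it is worth checking that the counts in the database match the dataframes. Below is a minimal sanity check using the same `graph` connection; note that ```MERGE``` deduplicates repeated citation rows, so the relationship count can be lower than the length of the edge list.

# In[ ]:


# compare database counts against the source dataframes
node_count = graph.evaluate("MATCH (n:paper) RETURN count(n)")
edge_count = graph.evaluate("MATCH (:paper)-[r:cites]->(:paper) RETURN count(r)")

print(f"nodes: {node_count} (expected {len(node_list)})")
print(f"relationships: {edge_count} (expected at most {len(edge_list)})")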
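# As a quick illustration of the constraint's index at work, the sketch below fetches a single paper by its `ID` property, taking the first ID from the dataframe.

# In[ ]:


# fetch one paper by its ID property; this lookup uses the index created
# by the uniqueness constraint above
sample_id = int(node_list["id"].iloc[0])
result = graph.run(
    "MATCH (n:paper {ID: $id}) RETURN n.ID AS id, n.subject AS subject",
    parameters={"id": sample_id},
).data()
print(result)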