This notebook demonstrates how to load the Cora dataset into a Neo4j graph database.
# install StellarGraph if running on Google Colab
# (the %pip line is IPython magic and only works inside a notebook kernel)
import sys
if 'google.colab' in sys.modules:
    %pip install -q stellargraph[demos]==1.2.1
# verify that we're using the correct version of StellarGraph for this notebook
import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    # Older StellarGraph releases do not have validate_notebook_version at all,
    # so an AttributeError here means the installed version predates 1.2.1.
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed. Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None  # suppress the AttributeError chain; the ValueError message says it all
import pandas as pd
import os
from stellargraph import datasets
from IPython.display import display, HTML
(See the "Loading from Pandas" demo for details on how data can be loaded.)
# Download the Cora dataset and read its citation edges into a DataFrame.
dataset = datasets.Cora()
display(HTML(dataset.description))
dataset.download()

# cora.cites is tab-separated with no header; per the schema described below,
# each edge points at the paper being cited, so the first column is the edge
# target and the second the edge source.
cites_path = os.path.join(dataset.data_directory, "cora.cites")
edge_list = pd.read_csv(cites_path, sep="\t", header=None, names=["target", "source"])

# every citation edge carries the same relationship type
edge_list["label"] = "cites"
display(edge_list.head(5))
target | source | label | |
---|---|---|---|
0 | 35 | 1033 | cites |
1 | 35 | 103482 | cites |
2 | 35 | 103515 | cites |
3 | 35 | 1050679 | cites |
4 | 35 | 1103960 | cites |
# cora.content holds one row per paper: 1433 word-feature columns followed by
# the subject label; the row index is the paper id.
feature_names = [f"w_{i}" for i in range(1433)]
column_names = feature_names + ["subject"]

content_path = os.path.join(dataset.data_directory, "cora.content")
node_list = pd.read_csv(content_path, sep="\t", header=None, names=column_names)

# collapse the 1433 word columns into a single list-valued 'features' column,
# then drop the now-redundant per-word columns
node_list["features"] = node_list[feature_names].values.tolist()
node_list = node_list.drop(columns=feature_names)

# expose the paper id (the index) as an ordinary column so it can be sent to Neo4j
node_list["id"] = node_list.index
node_list.head(5)
subject | features | id | |
---|---|---|---|
31336 | Neural_Networks | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 31336 |
1061127 | Rule_Learning | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... | 1061127 |
1106406 | Reinforcement_Learning | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 1106406 |
13195 | Reinforcement_Learning | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 13195 |
37879 | Probabilistic_Methods | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 37879 |
We define the graph schema as below:
Each vertex represents a paper
Each directed edge represents a citation. Each edge points to the paper being cited.
As the Cora dataset is small, we can use Cypher queries and execute the transactions via a Python driver.
For bigger datasets, this loading job might take a very long time, so it is more convenient to use the
neo4j-admin import tool (see its tutorial).
import time
import py2neo

# Host is taken from the environment so the notebook works against any Neo4j
# instance; None means py2neo's default host will be used.
default_host = os.environ.get("STELLARGRAPH_NEO4J_HOST")

# Create the Neo4j Graph database object; the arguments can be edited to specify location and authentication
# (port/user/password left as None here, i.e. py2neo defaults / no auth —
# adjust for a secured deployment)
graph = py2neo.Graph(host=default_host, port=None, user=None, password=None)
Delete the existing nodes and relationships in the current database.
# Cypher that matches every node and DETACH DELETEs it, i.e. removes each
# node together with all of its relationships, leaving the database empty.
empty_db_query = """
MATCH(n) DETACH
DELETE(n)
"""

# autocommit transaction: the statement is committed as soon as it runs
tx = graph.begin(autocommit=True)
tx.evaluate(empty_db_query)
Delete any existing constraints or indexes in the current database.
# Drop every constraint first, then whatever indexes remain afterwards
# (a constraint's backing index goes away with the constraint itself).
for record in graph.run("CALL db.constraints").data():
    graph.run(f"DROP CONSTRAINT {record['name']}")

for record in graph.run("CALL db.indexes").data():
    graph.run(f"DROP INDEX {record['name']}")
Load all nodes to the graph database.
# Cypher that creates one :paper node per entry of the $node_list parameter.
loading_node_query = """
UNWIND $node_list as node
CREATE( e: paper {
ID: toInteger(node.id),
subject: node.subject,
features: node.features
})
"""

# For efficient loading, send the nodes to Neo4j in batches rather than one
# giant transaction or one node at a time.
batch_len = 500
n_nodes = len(node_list)
for start in range(0, n_nodes, batch_len):
    # turn this slice of the dataframe into a list of plain dict records
    batch_records = node_list.iloc[start : start + batch_len].to_dict("records")
    tx = graph.begin(autocommit=True)
    tx.evaluate(loading_node_query, parameters={"node_list": batch_records})
Load all edges to the graph database.
# Cypher that looks up both endpoints by ID and MERGEs a :cites relationship,
# so re-running the load does not duplicate edges.
loading_edge_query = """
UNWIND $edge_list as edge
MATCH(source: paper {ID: toInteger(edge.source)})
MATCH(target: paper {ID: toInteger(edge.target)})
MERGE (source)-[r:cites]->(target)
"""

# Batch the edge inserts the same way as the node inserts above.
batch_len = 500
n_edges = len(edge_list)
for start in range(0, n_edges, batch_len):
    # turn this slice of the dataframe into a list of plain dict records
    batch_records = edge_list.iloc[start : start + batch_len].to_dict("records")
    tx = graph.begin(autocommit=True)
    tx.evaluate(loading_edge_query, parameters={"edge_list": batch_records})
Ensure node IDs are unique. Creating this constraint also automatically creates an index which will improve performance of querying nodes by ID.
# Uniqueness constraint on paper IDs; Neo4j also creates a backing index,
# which speeds up the MATCH-by-ID lookups used during edge loading.
# NOTE(review): this `CREATE CONSTRAINT ON ... ASSERT` form is the legacy
# (Neo4j 3.x/4.x) syntax; Neo4j 5 requires `FOR (n:paper) REQUIRE n.ID IS UNIQUE`.
node_id_constraint = """
CREATE CONSTRAINT
ON (n:paper)
ASSERT n.ID IS UNIQUE
"""

tx = graph.begin(autocommit=True)
tx.evaluate(node_id_constraint)