#!/usr/bin/env python
# coding: utf-8
# # Loading Cora dataset into Neo4j database
#
#
# Run the latest release of this notebook:
# This notebook demonstrates how to load Cora dataset into Neo4j graph database.
#
# In[1]:
# On Google Colab the StellarGraph package is not preinstalled, so add it on the fly.
import sys

running_in_colab = 'google.colab' in sys.modules
if running_in_colab:
    get_ipython().run_line_magic('pip', 'install -q stellargraph[demos]==1.2.1')
# In[2]:
# Verify that the correct version of StellarGraph is installed for this notebook;
# older releases lack `validate_notebook_version`, which surfaces as AttributeError.
import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    # Fix: the original message was truncated ("Please see ."); complete the sentence
    # so the user gets an actionable hint.
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed. "
        "Please see the StellarGraph installation documentation for how to install the correct version."
    ) from None
# In[3]:
import pandas as pd
import os
from stellargraph import datasets
from IPython.display import display, HTML
# ## Load Cora dataset
# (See [the "Loading from Pandas" demo](../../basics/loading-pandas.ipynb) for details on how data can be loaded.)
# In[4]:
# Instantiate the Cora dataset helper, render its description in the notebook,
# and fetch the raw files into `dataset.data_directory`.
dataset = datasets.Cora()
display(HTML(dataset.description))
dataset.download()
# In[5]:
# Citation edges: each tab-separated row is "target<TAB>source", i.e. `source` cites `target`.
cites_path = os.path.join(dataset.data_directory, "cora.cites")
edge_list = pd.read_csv(cites_path, sep="\t", header=None, names=["target", "source"])
# Single relationship type for every edge.
edge_list["label"] = "cites"
# In[6]:
# Peek at the first few citation edges.
display(edge_list.head(5))
# In[7]:
# Node table: 1433 binary word-occurrence columns (w_0 .. w_1432) followed by
# the subject label for each paper.
feature_names = [f"w_{idx}" for idx in range(1433)]
column_names = feature_names + ["subject"]
content_path = os.path.join(dataset.data_directory, "cora.content")
node_list = pd.read_csv(content_path, sep="\t", header=None, names=column_names)
# ## Preprocess data
# In[8]:
# Collapse the per-word indicator columns into a single list-valued 'features'
# column, and expose the paper id (the dataframe index) as an explicit column.
feature_lists = node_list[feature_names].values.tolist()
node_list = node_list.drop(columns=feature_names)
node_list["features"] = feature_lists
node_list["id"] = node_list.index
node_list.head(5)
# ## Ingest data into Neo4j database
#
# We define the graph schema as below:
#
# - Each vertex represents a paper
# + subject (String): the class where each subject belongs to. There are seven classes in total.
# + features (List[int]): 1D-vector represents the presence of each words in the dictionary.
# + ID (int): id of each paper. (**Note**: this ID attribute is different from the Neo4j id, i.e., the id of each node or relationship which Neo4j automatically assigns with).
#
# - Each *directed* edge represents a citation. Each edge points to the paper being cited.
#
# As the Cora dataset is small, we could use Cypher queries and execute the transactions via a Python-supported driver.
#
# For bigger dataset, this loading job might take very long, so it is more convenient to use ```neo4j-admin import ``` tool, [tutorial here](https://neo4j.com/docs/operations-manual/current/tutorial/import-tool/).
# In[9]:
import time
# In[10]:
import py2neo

# Neo4j server location comes from the environment when set; None falls back
# to py2neo's defaults. Edit these arguments for a non-default host or auth.
default_host = os.environ.get("STELLARGRAPH_NEO4J_HOST")
graph = py2neo.Graph(
    host=default_host,
    port=None,
    user=None,
    password=None,
)
# Delete the existing edges and relationships in the current database.
# In[11]:
# Wipe the database: detach-delete every node (and with it, every relationship).
delete_everything_query = """
MATCH(n) DETACH
DELETE(n)
"""
graph.begin(autocommit=True).evaluate(delete_everything_query)
# Delete any existing constraints or indexes in the current database.
# In[12]:
# Remove every pre-existing constraint, then every pre-existing index,
# so the load below starts from a clean schema.
for row in graph.run("CALL db.constraints").data():
    graph.run("DROP CONSTRAINT " + row["name"])
for row in graph.run("CALL db.indexes").data():
    graph.run("DROP INDEX " + row["name"])
# Load all nodes to the graph database.
# In[13]:
# Cypher template: one CREATE per record in the $node_list parameter.
loading_node_query = """
UNWIND $node_list as node
CREATE( e: paper {
ID: toInteger(node.id),
subject: node.subject,
features: node.features
})
"""

# Load nodes in slices of 500 so each autocommit transaction stays small.
batch_len = 500
start = 0
while start < len(node_list):
    # turn this slice of the node dataframe into a list of plain dicts
    records = node_list.iloc[start : start + batch_len].to_dict("records")
    graph.begin(autocommit=True).evaluate(
        loading_node_query, parameters={"node_list": records}
    )
    start += batch_len
# Load all edges to the graph database.
# In[14]:
# Cypher template: match both endpoint papers by ID and MERGE the citation
# edge (MERGE keeps the load idempotent if re-run).
loading_edge_query = """
UNWIND $edge_list as edge
MATCH(source: paper {ID: toInteger(edge.source)})
MATCH(target: paper {ID: toInteger(edge.target)})
MERGE (source)-[r:cites]->(target)
"""

# Load edges in slices of 500, one autocommit transaction per slice.
batch_len = 500
for offset in range(0, len(edge_list), batch_len):
    batch_records = edge_list.iloc[offset : offset + batch_len].to_dict("records")
    graph.begin(autocommit=True).evaluate(
        loading_edge_query, parameters={"edge_list": batch_records}
    )
# Ensure node IDs are unique. Creating this constraint also automatically creates an index which will improve performance of querying nodes by ID.
# In[15]:
# Enforce unique paper IDs; creating the constraint also builds a backing
# index, which speeds up ID lookups.
unique_id_constraint = """
CREATE CONSTRAINT
ON (n:paper)
ASSERT n.ID IS UNIQUE
"""
graph.begin(autocommit=True).evaluate(unique_id_constraint)
# Run the latest release of this notebook: