Data for these notebooks can be found here: https://github.com/VHRanger/Graph-Data
# Imports: stdlib, scientific stack, and the local nodevectors/csrgraph
# packages (loaded from the parent directory via sys.path below).
# NOTE(review): several imports (gc, scipy, sklearn.cluster, linear_model,
# train_test_split, OneVsRestClassifier, methods, GraRep/NodeSketch) are not
# used in this notebook's visible cells — kept for interactive experimentation.
import gc
import networkx as nx
import numpy as np
import os
import pandas as pd
import time
import scipy
import sklearn
from sklearn import cluster, linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sys
import warnings # Silence perf warning
# Make the repo root importable so the local (non-pip) packages resolve
sys.path.append(os.path.realpath('..'))
import nodevectors
import csrgraph as cg
from csrgraph import methods
from nodevectors.evaluation import link_pred
from nodevectors.evaluation import graph_eval
# From the related karateclub lib (on pip)
# https://github.com/benedekrozemberczki/KarateClub
from karateclub.node_embedding.neighbourhood import GraRep, NodeSketch, Walklets
# UMAP to test (on pip)
import umap
# Silence sklearn/scipy performance warnings that clutter notebook output
warnings.simplefilter("ignore")
def nx_node_weights(G, method, **kwargs):
    """Node Weights through networkX API.

    Runs a networkX per-node scoring function (e.g. nx.pagerank) on `G`
    and unpacks its {node: score} dict into a dense array indexed by node.
    Assumes nodes are integer labels in [0, len(G)) — true for the graphs
    used in this notebook.
    """
    scores = method(G, **kwargs)
    weights = np.zeros(len(G))
    for node in G.nodes:
        weights[node] = scores[node]
    return weights
#### CONFIG
N_COMPONENTS = 6 # resulting embedding dim
SEED = 42 # RNG Seed
TEST_SIZE = 0.2 # fraction of edges held out for link prediction / label tests
# For resampling tests
RESAMPLE_WALKS = 30 # number of random walks per node when building virtual edges
RESAMPLE_LEN = 5 # length of each resampled random walk
#### GRAPHS
#### Uncomment one to choose which graph to run evaluation on
#### Artificial random graphs
# G = nx.binomial_graph(700, 0.6)
# G, labels = graph_eval.make_cluster_graph(n_nodes=820, n_clusters=18, connections=1000, drop_pct=0.5)
# Currently active: weighted synthetic cluster graph (500 nodes, 6 clusters).
# Loaders return either (G, labels) single-label, (G, mlabels) multilabel,
# or just G — the try/except cell below adapts to whichever was set.
G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)
#### Social graphs
# G, labels = graph_eval.make_blogcatalog(dedupe=True)
# G, mlabels = graph_eval.make_blogcatalog(dedupe=False)
# G, labels = graph_eval.make_email()
# G, labels = graph_eval.get_karateclub("facebook") # twitch, github, facebook, wikipedia
# G = graph_eval.get_from_snap(url="http://snap.stanford.edu/data/facebook_combined.txt.gz", sep=' ', header=None, comment='#')
#### Biology Graphs
# G, mlabels = graph_eval.get_n2v_ppi("../data/bioNEV/node2vec_PPI")
#### Needs OutOfBounds Nodes support from CSRGraphs to work
# G = graph_eval.get_drugbank_ddi("../data/bioNEV/DrugBank_DDI")
# G, mlabels = graph_eval.get_mashup_ppi("../data/bioNEV/Mashup_PPI")
#### For Link Prediction: Split graph into train and test edge sets
#### (All nodes are still present in both)
# G_train keeps all nodes but has TEST_SIZE of edges removed;
# testing_pos_edges holds those removed (positive) edges for evaluation.
G_train, testing_pos_edges = link_pred.split_train_test_graph(G, testing_ratio=TEST_SIZE)
#### Lazy way to set up evaluation
# Probe which label variable the chosen graph loader produced:
# single-label graphs define `labels`, multilabel graphs define `mlabels`,
# and some graphs have neither. The original used bare `except:` clauses,
# which also swallowed unrelated bugs — narrowed to the two exceptions the
# probing actually relies on (NameError if the variable is undefined,
# AttributeError if it lacks the expected column).
try:
    y = labels.label
    n_clusters = y.nunique()
    HAS_LABELS = True
    print(f"clusters: {n_clusters}")
except (NameError, AttributeError):
    try:  # Multilabels
        y = MultiLabelBinarizer().fit_transform(mlabels.mlabels)
        HAS_LABELS = True
        print(f"multilabels: {y.shape[1]}")
    except (NameError, AttributeError):  # No Labels
        HAS_LABELS = False
        print("No Labels")
NNODES = len(G)
print(f"Nodes: {NNODES}\nEdges: {len(G.edges)}\nconnected: {nx.is_connected(G_train)}")
clusters: 6 Nodes: 500 Edges: 17668 connected: True
#### GGVec directly on the graph ####
ggvec_params = {
    "n_components": N_COMPONENTS,
    "order": 1,
    "tol": 0.1,
    "tol_samples": 100,
    "max_epoch": 6_000,
    "learning_rate": 0.1,
    "negative_ratio": 0.05,
    "exponent": 0.33,
    "verbose": True,
}
t0 = time.time()
w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)  # let the tqdm progress bar flush before the next cell prints
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Loss: 0.4954 : 2%|▏ | 101/6000 [00:02<02:51, 34.48it/s]
Converged! Loss: 0.4951 Time: 2.9698 Link Prediction: (logit) AUC-ROC: 0.516, AUC-PR: 0.507, Acc: 0.511, F1: 0.512 (lgbm) AUC-ROC: 0.734, AUC-PR: 0.707, Acc: 0.673, F1: 0.688
Loss: 0.4937 : 2%|▏ | 101/6000 [00:00<00:16, 352.27it/s]
Converged! Loss: 0.4954 MI: 0.18, RAND 0.30, FM: 0.30 Label Prediction: (logit) Acc: 0.540, F1 micro: 0.540, F1 macro: 0.540 (lgbm) Acc: 0.450, F1 micro: 0.450, F1 macro: 0.450
#### Node2Vec (random walks + word2vec) ####
n2v_params = {
    "n_components": N_COMPONENTS,
    "epochs": 5,
    "walklen": 30,
    "return_weight": 1.,
    "neighbor_weight": 1.,
    "w2vparams": {
        "window": 3,
        "negative": 5,
        "iter": 2,
        "batch_words": 128,
    },
}
t0 = time.time()
w_train = nodevectors.Node2Vec(**n2v_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.Node2Vec(**n2v_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Making walks... Done, T=1.92 Mapping Walk Names... Done, T=0.10 Training W2V... Done, T=0.32 Time: 2.3881 Link Prediction: (logit) AUC-ROC: 0.552, AUC-PR: 0.482, Acc: 0.539, F1: 0.534 (lgbm) AUC-ROC: 0.948, AUC-PR: 0.930, Acc: 0.918, F1: 0.920 Making walks... Done, T=0.01 Mapping Walk Names... Done, T=0.13 Training W2V... Done, T=0.31 MI: 0.93, RAND 0.86, FM: 0.86 Label Prediction: (logit) Acc: 0.940, F1 micro: 0.940, F1 macro: 0.940 (lgbm) Acc: 0.950, F1 micro: 0.950, F1 macro: 0.950
#### ProNE (spectral propagation) ####
pne_params = {
    "n_components": N_COMPONENTS,
    "step": 5,
    "mu": 0.2,
    "theta": 0.5,
}
t0 = time.time()
pne = nodevectors.ProNE(**pne_params)
w_train = pne.fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Fresh model re-fit on the full graph for the labeled tests
    pne = nodevectors.ProNE(**pne_params)
    w = pne.fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Time: 0.0773 Link Prediction: (logit) AUC-ROC: 0.528, AUC-PR: 0.463, Acc: 0.538, F1: 0.540 (lgbm) AUC-ROC: 0.951, AUC-PR: 0.940, Acc: 0.928, F1: 0.928 MI: 0.87, RAND 0.82, FM: 0.82 Label Prediction: (logit) Acc: 0.980, F1 micro: 0.980, F1 macro: 0.980 (lgbm) Acc: 0.990, F1 micro: 0.990, F1 macro: 0.990
#### GraRep with a TruncatedSVD embedder ####
grarep_params = {
    "n_components": N_COMPONENTS,
    "order": 2,
    "embedder": TruncatedSVD(n_iter=10, random_state=42),
    # Combine the per-order embeddings by elementwise sum
    "merger": lambda x: np.sum(x, axis=0),
}
t0 = time.time()
w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)  # let the tqdm progress bar flush before the next cell prints
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.GraRep(**grarep_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
100%|██████████| 2/2 [00:00<00:00, 17.00it/s]
Time: 0.2583 Link Prediction: (logit) AUC-ROC: 0.515, AUC-PR: 0.453, Acc: 0.565, F1: 0.599 (lgbm) AUC-ROC: 0.957, AUC-PR: 0.939, Acc: 0.941, F1: 0.940
100%|██████████| 2/2 [00:00<00:00, 17.91it/s]
MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
##### GraRep + GGVec ####
# GraRep where each order's matrix is embedded by GGVec instead of SVD.
grarep_params = {
    "n_components": N_COMPONENTS,
    "order": 2,
    "embedder": nodevectors.GGVec(
        n_components=N_COMPONENTS,
        tol=0.1,
        tol_samples=200,
        max_epoch=6_000,
        learning_rate=0.02,
        negative_ratio=0.6,
        exponent=0.33,
        verbose=True,
    ),
    "verbose": False,
    # Combine the per-order embeddings by elementwise sum
    "merger": lambda x: np.sum(x, axis=0),
}
t0 = time.time()
w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)  # let the tqdm progress bar flush before the next cell prints
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.GraRep(**grarep_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Loss: 0.0229 : 4%|▍ | 228/6000 [00:02<01:11, 80.47it/s] Loss: 0.0241 : 0%| | 7/6000 [00:00<01:27, 68.87it/s]
Converged! Loss: 0.0225
Loss: 0.0158 : 4%|▎ | 216/6000 [00:03<01:23, 69.54it/s]
Converged! Loss: 0.0156 Time: 6.0436 Link Prediction: (logit) AUC-ROC: 0.534, AUC-PR: 0.465, Acc: 0.513, F1: 0.513 (lgbm) AUC-ROC: 0.953, AUC-PR: 0.939, Acc: 0.931, F1: 0.932
Loss: 0.0229 : 4%|▎ | 218/6000 [00:02<01:07, 85.74it/s] Loss: 0.0243 : 0%| | 7/6000 [00:00<01:32, 64.95it/s]
Converged! Loss: 0.0229
Loss: 0.0155 : 4%|▎ | 214/6000 [00:03<01:27, 66.29it/s]
Converged! Loss: 0.0155 MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
#### UMAP on the (normalized) adjacency via SKLearnEmbedder ####
ump_params = {
    "embedder": umap.UMAP,
    "n_neighbors": 3,
    "min_dist": 0.,
    "metric": 'cosine',
    "normalize_graph": True,
    "n_components": N_COMPONENTS,
}
t0 = time.time()
w_train = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Time: 3.8006 Link Prediction: (logit) AUC-ROC: 0.541, AUC-PR: 0.472, Acc: 0.534, F1: 0.537 (lgbm) AUC-ROC: 0.952, AUC-PR: 0.938, Acc: 0.939, F1: 0.939 MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
### GLoVe with random walks ###
# GLoVe factorizes co-occurrence counts, so feed it a graph of "virtual
# edges" produced by resampling random walks on the original graph.
glove_params = {
    "n_components": N_COMPONENTS,
    "tol": 0.0005,
    "max_epoch": 6_000,
    "learning_rate": 0.02,
    "max_loss": 10.,
    "max_count": 50,
    "exponent": 0.5,
}
t0 = time.time()
wg = cg.csrgraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
w_train = nodevectors.Glove(**glove_params).fit_transform(wg)
print(f"Time: {time.time() - t0 :.4f}")
print(f"Virtual edges: {wg.dst.size}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Resample walks on the full graph and re-fit for the labeled tests
    wg = cg.csrgraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
    w = nodevectors.Glove(**glove_params).fit_transform(wg)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
1%|▏ | 76/6000 [00:02<02:54, 33.95it/s]
Time: 2.9679 Virtual edges: 53851 Link Prediction: (logit) AUC-ROC: 0.535, AUC-PR: 0.472, Acc: 0.527, F1: 0.525 (lgbm) AUC-ROC: 0.944, AUC-PR: 0.936, Acc: 0.904, F1: 0.906
5%|▌ | 327/6000 [00:01<00:24, 236.32it/s]
MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
### GGVec with random walks ###
# Same virtual-edge trick as the GLoVe cell, but embedded with GGVec
# called directly on the resampled csrgraph.
ggvec_params = {
    "n_components": N_COMPONENTS,
    "tol": 0.02,
    "tol_samples": 200,
    "max_epoch": 6_000,
    "learning_rate": 0.02,
    "negative_ratio": 0.3,
    "exponent": 0.35,
    "verbose": True,
}
t0 = time.time()
wg = cg.csrgraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
w_train = wg.ggvec(**ggvec_params)
print(f"Time: {time.time() - t0 :.4f}")
print(f"Virtual edges: {wg.dst.size}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Resample walks on the full graph and re-fit for the labeled tests
    wg = cg.csrgraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
    w = wg.ggvec(**ggvec_params)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Loss: 0.2861 : 16%|█▌ | 967/6000 [00:05<00:26, 188.58it/s]
Converged! Loss: 0.2859 Time: 5.6420 Virtual edges: 54151 Link Prediction: (logit) AUC-ROC: 0.534, AUC-PR: 0.485, Acc: 0.527, F1: 0.530 (lgbm) AUC-ROC: 0.958, AUC-PR: 0.944, Acc: 0.937, F1: 0.937
Loss: 0.2796 : 15%|█▌ | 911/6000 [00:03<00:18, 270.13it/s]
Converged! Loss: 0.2795 MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 0.990, F1 micro: 0.990, F1 macro: 0.990
###### Slooooowwwwwww ########
# walklets_params = dict(
# walk_number=10,
# walk_length=30,
# dimensions=N_COMPONENTS,
# window_size=4,
# epochs=1,
# learning_rate=0.05
# )
# try: # Karateclub models don't handle certain graphs
# start_t = time.time()
# model = Walklets(**walklets_params)
# model.fit(G_train)
# print(f"Time: {time.time() - start_t :.3f}")
# w_train = model.get_embedding()
# result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# if HAS_LABELS:
# model = Walklets(**walklets_params)
# model.fit(G)
# w = model.get_embedding()
# graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
# except: pass
### Completely random baseline ###
# Random embeddings establish a floor for the metrics above.
w = np.random.randn(len(G), N_COMPONENTS)
result = link_pred.LinkPrediction(w, G, G_train, testing_pos_edges)
# The original wrapped this in a bare `try/except: pass` to skip unlabeled
# graphs, which also hid real failures. Use the HAS_LABELS flag set earlier,
# consistent with every other cell in this notebook.
if HAS_LABELS:
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)