import gc
import networkx as nx
import numpy as np
import os
import pandas as pd
import time
import scipy
import sklearn
from sklearn import cluster, linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sys
import warnings # Silence perf warning
sys.path.append(os.path.realpath('..'))
import nodevectors
import csrgraph as cg
from csrgraph import methods
from nodevectors.evaluation import link_pred
from nodevectors.evaluation import graph_eval
# UMAP to test (on pip)
import umap
warnings.simplefilter("ignore")
def nx_node_weights(G, method, **kwargs):
    """Node Weights through networkX API.

    Runs `method(G, **kwargs)` (e.g. nx.pagerank) and unpacks the resulting
    {node: score} dict into a dense array indexed by node id.
    Assumes node ids are integers in range(len(G)).
    """
    scores = method(G, **kwargs)
    weights = np.zeros(len(G))
    for node in G.nodes:
        weights[node] = scores[node]
    return weights
Data for these notebooks can be found here: https://github.com/VHRanger/Graph-Data. Just download it and point the graph generation methods below to it.
The data is kept in a separate repo to avoid polluting the pip package.
#### CONFIG
# Fraction of edges held out for link-prediction testing; also used as the
# test split fraction for label prediction.
TEST_SIZE = 0.2
OUT_FILE = 'email.csv'  # label-prediction results; link-pred results go to "linkpred_" + OUT_FILE
SEED = 42  # random seed for the label-prediction train/test split
# Embedding dimensions swept over by every evaluation loop below.
ALL_COMPONENTS = [1, 2, 4, 8, 16, 32, 64, 128, 256]
#### GRAPHS
#### Uncomment one to choose which graph to run evaluation on
#### Artificial random graphs
# G = nx.binomial_graph(700, 0.6)
G, labels = graph_eval.make_cluster_graph(n_nodes=820, n_clusters=18, connections=1000, drop_pct=0.5)
# G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)
#### Social graphs
# G, labels = graph_eval.make_blogcatalog(dedupe=True)
# G, mlabels = graph_eval.make_blogcatalog(dedupe=False)
# G, labels = graph_eval.make_email()
# G, labels = graph_eval.get_karateclub("facebook") # twitch, github, facebook, wikipedia
# G = graph_eval.get_from_snap(url="http://snap.stanford.edu/data/facebook_combined.txt.gz", sep=' ', header=None, comment='#')
#### Biology Graphs
# G, mlabels = graph_eval.get_n2v_ppi("../data/bioNEV/node2vec_PPI")
#### Needs OutOfBounds Nodes support from CSRGraphs to work
# G = graph_eval.get_drugbank_ddi("../data/bioNEV/DrugBank_DDI")
# G, mlabels = graph_eval.get_mashup_ppi("../data/bioNEV/Mashup_PPI")
#### For Link Prediction: Split graph into train and test edge sets
#### (All nodes are still present in both)
# Hold out TEST_SIZE of the edges as positive test examples; every node
# remains present in G_train so embeddings cover the full node set.
G_train, testing_pos_edges = link_pred.split_train_test_graph(G, testing_ratio=TEST_SIZE)
#### Lazy way to set up evaluation
# Detect which kind of labels (if any) the chosen graph loader produced.
# Catch only the expected failures: NameError when `labels`/`mlabels` was
# never defined by the loader, AttributeError when the expected column is
# missing.  The original bare `except:` also swallowed KeyboardInterrupt
# and genuine bugs.
try:
    # Single-label case: `labels` is a DataFrame with a `label` column.
    y = labels.label
    n_clusters = y.nunique()
    HAS_LABELS = True
    print(f"clusters: {n_clusters}")
except (NameError, AttributeError):
    try:  # Multilabel case: `mlabels.mlabels` holds label lists per node.
        y = MultiLabelBinarizer().fit_transform(mlabels.mlabels)
        HAS_LABELS = True
        print(f"multilabels: {y.shape[1]}")
    except (NameError, AttributeError):  # No labels at all
        HAS_LABELS = False
        print("No Labels")
NNODES = len(G)
print(f"Nodes: {NNODES}\nEdges: {len(G.edges)}\nconnected: {nx.is_connected(G_train)}")
clusters: 18 Nodes: 820 Edges: 9658 connected: True
### GGVEC ####
# Evaluate GGVec (order 1) across all embedding dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    ggvec_params = dict(
        n_components=N_COMPONENTS,
        order=1,
        tol=0.05,
        tol_samples=75,
        max_epoch=6_000,
        learning_rate=0.05,
        negative_ratio=0.33,
        exponent=0.33,
        verbose=True,
    )
    start_t = time.time()
    time.sleep(0.3)  # let the previous tqdm bar flush before training output
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'ggvec'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'ggvec'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1--------
Loss: 0.1498 : 1%|▏ | 88/6000 [00:02<03:04, 32.13it/s]
Converged! Loss: 0.1484 Time: 3.0665 Link Prediction: (logit) AUC-ROC: 0.517, AUC-PR: 0.463, Acc: 0.509, F1: 0.488 (lgbm) AUC-ROC: 0.772, AUC-PR: 0.700, Acc: 0.735, F1: 0.784
Loss: 0.1475 : 1%|▏ | 86/6000 [00:00<00:10, 562.90it/s]
Converged! Loss: 0.1490 Label Prediction: (logit) Acc: 0.091, F1 micro: 0.091, F1 macro: 0.091 (lgbm) Acc: 0.165, F1 micro: 0.165, F1 macro: 0.165 MI: 0.37, RAND 0.32, FM: 0.32
### GGVEC - 2 ####
# Evaluate GGVec (order 2: includes 2-hop neighborhoods) across dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    ggvec_params = dict(
        n_components=N_COMPONENTS,
        order=2,
        tol=0.1,
        tol_samples=10,
        max_epoch=500,
        learning_rate=0.1,
        negative_ratio=0.1,
        exponent=0.33,
        verbose=True,
    )
    start_t = time.time()
    time.sleep(0.3)  # let the previous tqdm bar flush before training output
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'ggvec2'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'ggvec2'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1--------
Loss: 0.0341 : 3%|▎ | 15/500 [00:00<00:01, 265.23it/s]
Converged! Loss: 0.0339 Time: 1.2681 Link Prediction: (logit) AUC-ROC: 0.466, AUC-PR: 0.436, Acc: 0.486, F1: 0.504 (lgbm) AUC-ROC: 0.819, AUC-PR: 0.776, Acc: 0.760, F1: 0.783
Loss: 0.0335 : 3%|▎ | 16/500 [00:00<00:01, 257.96it/s]
Converged! Loss: 0.0336 Label Prediction: (logit) Acc: 0.177, F1 micro: 0.177, F1 macro: 0.177 (lgbm) Acc: 0.293, F1 micro: 0.293, F1 macro: 0.293 MI: 0.31, RAND 0.30, FM: 0.30
### N2V ####
# Evaluate Node2Vec (random walks + word2vec) across embedding dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    n2v_params = dict(
        n_components=N_COMPONENTS,
        epochs=20,
        walklen=60,
        return_weight=1.,
        neighbor_weight=1.,
        w2vparams={
            "window":3,
            "negative":5,
            "iter":2,
            "batch_words":128}
    )
    start_t = time.time()
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.Node2Vec(**n2v_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'node2vec'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.Node2Vec(**n2v_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'node2vec'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1-------- Making walks... Done, T=1.21 Mapping Walk Names... Done, T=1.01 Training W2V... Done, T=3.06 Time: 5.3080 Link Prediction: (logit) AUC-ROC: 0.520, AUC-PR: 0.471, Acc: 0.510, F1: 0.505 (lgbm) AUC-ROC: 0.781, AUC-PR: 0.729, Acc: 0.718, F1: 0.759 Making walks... Done, T=0.13 Mapping Walk Names... Done, T=1.16 Training W2V... Done, T=2.81 Label Prediction: (logit) Acc: 0.110, F1 micro: 0.110, F1 macro: 0.110 (lgbm) Acc: 0.183, F1 micro: 0.183, F1 macro: 0.183 MI: -0.00, RAND 0.23, FM: 0.23
### ProNE ####
# Evaluate ProNE (spectral propagation) across embedding dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    pne_params = dict(
        n_components=N_COMPONENTS,
        step=5,
        mu=0.2,
        theta=0.5,
    )
    start_t = time.time()
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.ProNE(**pne_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'prone'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.ProNE(**pne_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'prone'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1-------- Time: 0.0400 Link Prediction: (logit) AUC-ROC: 0.555, AUC-PR: 0.552, Acc: 0.592, F1: 0.656 (lgbm) AUC-ROC: 0.720, AUC-PR: 0.653, Acc: 0.675, F1: 0.752 Label Prediction: (logit) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024 (lgbm) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024 MI: -0.00, RAND 0.23, FM: 0.23
### GRaRep ####
# Evaluate GraRep (factorized k-step transition matrices) across dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    grarep_params = dict(
        n_components=N_COMPONENTS,
        order=1,
        embedder=TruncatedSVD(
            n_iter=10,
            random_state=42),
        merger=(lambda x : np.sum(x, axis=0)),
    )
    start_t = time.time()
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'grarep'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.GraRep(**grarep_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'grarep'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
100%|██████████| 1/1 [00:00<00:00, 37.95it/s]
-------N: 1-------- Time: 0.0507 Link Prediction:
(logit) AUC-ROC: 0.519, AUC-PR: 0.572, Acc: 0.517, F1: 0.540 (lgbm) AUC-ROC: 0.895, AUC-PR: 0.865, Acc: 0.821, F1: 0.829
100%|██████████| 1/1 [00:00<00:00, 54.70it/s]
Label Prediction: (logit) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024 (lgbm) Acc: 0.354, F1 micro: 0.354, F1 macro: 0.354 MI: -0.00, RAND 0.23, FM: 0.23
### GLoVe with random walks ###
# Evaluate GLoVe trained on a random-walk-resampled co-occurrence graph.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    glove_params = dict(
        n_components=N_COMPONENTS,
        tol=0.001,
        max_epoch=6_000,
        learning_rate=0.01,
        max_loss=10.,
        max_count=50,
        exponent=0.5,
    )
    start_t = time.time()
    # --- Link prediction: resample walks from the train split, then embed ---
    wg = cg.csrgraph(G_train).random_walk_resample(walklen=7, epochs=30)
    w_train = nodevectors.Glove(**glove_params).fit_transform(wg)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'glove'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: resample the full graph, then embed ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        wg = cg.csrgraph(G).random_walk_resample(walklen=7, epochs=30)
        w = nodevectors.Glove(**glove_params).fit_transform(wg)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'glove'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1--------
1%|▏ | 83/6000 [00:02<03:00, 32.78it/s]
Time: 4.1126 Link Prediction: (logit) AUC-ROC: 0.502, AUC-PR: 0.449, Acc: 0.504, F1: 0.502 (lgbm) AUC-ROC: 0.797, AUC-PR: 0.730, Acc: 0.756, F1: 0.797
2%|▏ | 116/6000 [00:00<00:40, 144.66it/s]
Label Prediction: (logit) Acc: 0.189, F1 micro: 0.189, F1 macro: 0.189 (lgbm) Acc: 0.244, F1 micro: 0.244, F1 macro: 0.244 MI: 0.34, RAND 0.31, FM: 0.31