Data for these notebooks can be found here: https://github.com/VHRanger/Graph-Data
# Imports: stdlib, scientific stack, and the local nodevectors/csrgraph
# packages (loaded from the parent directory via sys.path below).
# NOTE(review): several imports (gc, scipy, sklearn.cluster, linear_model,
# train_test_split, OneVsRestClassifier, methods, GraRep/NodeSketch) are not
# used in this notebook's visible cells — kept for interactive experimentation.
import gc
import networkx as nx
import numpy as np
import os
import pandas as pd
import time
import scipy
import sklearn
from sklearn import cluster, linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sys
import warnings # Silence perf warning
# Make the repo root importable so the local (non-pip) packages resolve
sys.path.append(os.path.realpath('..'))
import nodevectors
import csrgraph as cg
from csrgraph import methods
from nodevectors.evaluation import link_pred
from nodevectors.evaluation import graph_eval
# From the related karateclub lib (on pip)
# https://github.com/benedekrozemberczki/KarateClub
from karateclub.node_embedding.neighbourhood import GraRep, NodeSketch, Walklets
# UMAP to test (on pip)
import umap
# Silence sklearn/scipy performance warnings that clutter notebook output
warnings.simplefilter("ignore")
def nx_node_weights(G, method, **kwargs):
    """Node Weights through networkX API.

    Runs a networkX per-node scoring function (e.g. nx.pagerank) on `G`
    and unpacks its {node: score} dict into a dense array indexed by node.
    Assumes nodes are integer labels in [0, len(G)) — true for the graphs
    used in this notebook.
    """
    scores = method(G, **kwargs)
    weights = np.zeros(len(G))
    for node in G.nodes:
        weights[node] = scores[node]
    return weights
#### CONFIG
N_COMPONENTS = 6 # resulting embedding dim
SEED = 42 # RNG Seed
TEST_SIZE = 0.2 # fraction of edges held out for link prediction / label tests
# For resampling tests
RESAMPLE_WALKS = 30 # number of random walks per node when building virtual edges
RESAMPLE_LEN = 5 # length of each resampled random walk
#### GRAPHS
#### Uncomment one to choose which graph to run evaluation on
#### Artificial random graphs
# G = nx.binomial_graph(700, 0.6)
# G, labels = graph_eval.make_cluster_graph(n_nodes=820, n_clusters=18, connections=1000, drop_pct=0.5)
# Currently active: weighted synthetic cluster graph (500 nodes, 6 clusters).
# Loaders return either (G, labels) single-label, (G, mlabels) multilabel,
# or just G — the try/except cell below adapts to whichever was set.
G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)
#### Social graphs
# G, labels = graph_eval.make_blogcatalog(dedupe=True)
# G, mlabels = graph_eval.make_blogcatalog(dedupe=False)
# G, labels = graph_eval.make_email()
# G, labels = graph_eval.get_karateclub("facebook") # twitch, github, facebook, wikipedia
# G = graph_eval.get_from_snap(url="http://snap.stanford.edu/data/facebook_combined.txt.gz", sep=' ', header=None, comment='#')
#### Biology Graphs
# G, mlabels = graph_eval.get_n2v_ppi("../data/bioNEV/node2vec_PPI")
#### Needs OutOfBounds Nodes support from CSRGraphs to work
# G = graph_eval.get_drugbank_ddi("../data/bioNEV/DrugBank_DDI")
# G, mlabels = graph_eval.get_mashup_ppi("../data/bioNEV/Mashup_PPI")
#### For Link Prediction: Split graph into train and test edge sets
#### (All nodes are still present in both)
# G_train keeps all nodes but has TEST_SIZE of edges removed;
# testing_pos_edges holds those removed (positive) edges for evaluation.
G_train, testing_pos_edges = link_pred.split_train_test_graph(G, testing_ratio=TEST_SIZE)
#### Lazy way to set up evaluation
# Probe which label variable the chosen graph loader produced:
# single-label graphs define `labels`, multilabel graphs define `mlabels`,
# and some graphs have neither. The original used bare `except:` clauses,
# which also swallowed unrelated bugs — narrowed to the two exceptions the
# probing actually relies on (NameError if the variable is undefined,
# AttributeError if it lacks the expected column).
try:
    y = labels.label
    n_clusters = y.nunique()
    HAS_LABELS = True
    print(f"clusters: {n_clusters}")
except (NameError, AttributeError):
    try:  # Multilabels
        y = MultiLabelBinarizer().fit_transform(mlabels.mlabels)
        HAS_LABELS = True
        print(f"multilabels: {y.shape[1]}")
    except (NameError, AttributeError):  # No Labels
        HAS_LABELS = False
        print("No Labels")
NNODES = len(G)
print(f"Nodes: {NNODES}\nEdges: {len(G.edges)}\nconnected: {nx.is_connected(G_train)}")
clusters: 6 Nodes: 500 Edges: 17668 connected: True
#### GGVec directly on the graph ####
ggvec_params = {
    "n_components": N_COMPONENTS,
    "order": 1,
    "tol": 0.1,
    "tol_samples": 100,
    "max_epoch": 6_000,
    "learning_rate": 0.1,
    "negative_ratio": 0.05,
    "exponent": 0.33,
    "verbose": True,
}
t0 = time.time()
w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)  # let the tqdm progress bar flush before the next cell prints
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Loss: 0.4954 : 2%|▏ | 101/6000 [00:02<02:51, 34.48it/s]
Converged! Loss: 0.4951 Time: 2.9698 Link Prediction: (logit) AUC-ROC: 0.516, AUC-PR: 0.507, Acc: 0.511, F1: 0.512 (lgbm) AUC-ROC: 0.734, AUC-PR: 0.707, Acc: 0.673, F1: 0.688
Loss: 0.4937 : 2%|▏ | 101/6000 [00:00<00:16, 352.27it/s]
Converged! Loss: 0.4954 MI: 0.18, RAND 0.30, FM: 0.30 Label Prediction: (logit) Acc: 0.540, F1 micro: 0.540, F1 macro: 0.540 (lgbm) Acc: 0.450, F1 micro: 0.450, F1 macro: 0.450
#### Node2Vec (random walks + word2vec) ####
n2v_params = {
    "n_components": N_COMPONENTS,
    "epochs": 5,
    "walklen": 30,
    "return_weight": 1.,
    "neighbor_weight": 1.,
    "w2vparams": {
        "window": 3,
        "negative": 5,
        "iter": 2,
        "batch_words": 128,
    },
}
t0 = time.time()
w_train = nodevectors.Node2Vec(**n2v_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.Node2Vec(**n2v_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Making walks... Done, T=1.92 Mapping Walk Names... Done, T=0.10 Training W2V... Done, T=0.32 Time: 2.3881 Link Prediction: (logit) AUC-ROC: 0.552, AUC-PR: 0.482, Acc: 0.539, F1: 0.534 (lgbm) AUC-ROC: 0.948, AUC-PR: 0.930, Acc: 0.918, F1: 0.920 Making walks... Done, T=0.01 Mapping Walk Names... Done, T=0.13 Training W2V... Done, T=0.31 MI: 0.93, RAND 0.86, FM: 0.86 Label Prediction: (logit) Acc: 0.940, F1 micro: 0.940, F1 macro: 0.940 (lgbm) Acc: 0.950, F1 micro: 0.950, F1 macro: 0.950
#### ProNE (spectral propagation) ####
pne_params = {
    "n_components": N_COMPONENTS,
    "step": 5,
    "mu": 0.2,
    "theta": 0.5,
}
t0 = time.time()
pne = nodevectors.ProNE(**pne_params)
w_train = pne.fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Fresh model re-fit on the full graph for the labeled tests
    pne = nodevectors.ProNE(**pne_params)
    w = pne.fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Time: 0.0773 Link Prediction: (logit) AUC-ROC: 0.528, AUC-PR: 0.463, Acc: 0.538, F1: 0.540 (lgbm) AUC-ROC: 0.951, AUC-PR: 0.940, Acc: 0.928, F1: 0.928 MI: 0.87, RAND 0.82, FM: 0.82 Label Prediction: (logit) Acc: 0.980, F1 micro: 0.980, F1 macro: 0.980 (lgbm) Acc: 0.990, F1 micro: 0.990, F1 macro: 0.990
#### GraRep with a TruncatedSVD embedder ####
grarep_params = {
    "n_components": N_COMPONENTS,
    "order": 2,
    "embedder": TruncatedSVD(n_iter=10, random_state=42),
    # Combine the per-order embeddings by elementwise sum
    "merger": lambda x: np.sum(x, axis=0),
}
t0 = time.time()
w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)  # let the tqdm progress bar flush before the next cell prints
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.GraRep(**grarep_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
100%|██████████| 2/2 [00:00<00:00, 17.00it/s]
Time: 0.2583 Link Prediction: (logit) AUC-ROC: 0.515, AUC-PR: 0.453, Acc: 0.565, F1: 0.599 (lgbm) AUC-ROC: 0.957, AUC-PR: 0.939, Acc: 0.941, F1: 0.940
100%|██████████| 2/2 [00:00<00:00, 17.91it/s]
MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
##### GraRep + GGVec ####
# GraRep where each order's matrix is embedded by GGVec instead of SVD.
grarep_params = {
    "n_components": N_COMPONENTS,
    "order": 2,
    "embedder": nodevectors.GGVec(
        n_components=N_COMPONENTS,
        tol=0.1,
        tol_samples=200,
        max_epoch=6_000,
        learning_rate=0.02,
        negative_ratio=0.6,
        exponent=0.33,
        verbose=True,
    ),
    "verbose": False,
    # Combine the per-order embeddings by elementwise sum
    "merger": lambda x: np.sum(x, axis=0),
}
t0 = time.time()
w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)  # let the tqdm progress bar flush before the next cell prints
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.GraRep(**grarep_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Loss: 0.0229 : 4%|▍ | 228/6000 [00:02<01:11, 80.47it/s] Loss: 0.0241 : 0%| | 7/6000 [00:00<01:27, 68.87it/s]
Converged! Loss: 0.0225
Loss: 0.0158 : 4%|▎ | 216/6000 [00:03<01:23, 69.54it/s]
Converged! Loss: 0.0156 Time: 6.0436 Link Prediction: (logit) AUC-ROC: 0.534, AUC-PR: 0.465, Acc: 0.513, F1: 0.513 (lgbm) AUC-ROC: 0.953, AUC-PR: 0.939, Acc: 0.931, F1: 0.932
Loss: 0.0229 : 4%|▎ | 218/6000 [00:02<01:07, 85.74it/s] Loss: 0.0243 : 0%| | 7/6000 [00:00<01:32, 64.95it/s]
Converged! Loss: 0.0229
Loss: 0.0155 : 4%|▎ | 214/6000 [00:03<01:27, 66.29it/s]
Converged! Loss: 0.0155 MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
#### UMAP on the (normalized) adjacency via SKLearnEmbedder ####
ump_params = {
    "embedder": umap.UMAP,
    "n_neighbors": 3,
    "min_dist": 0.,
    "metric": 'cosine',
    "normalize_graph": True,
    "n_components": N_COMPONENTS,
}
t0 = time.time()
w_train = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G_train)
print(f"Time: {time.time() - t0 :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Re-fit on the full graph for the clustering / label-prediction tests
    w = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Time: 3.8006 Link Prediction: (logit) AUC-ROC: 0.541, AUC-PR: 0.472, Acc: 0.534, F1: 0.537 (lgbm) AUC-ROC: 0.952, AUC-PR: 0.938, Acc: 0.939, F1: 0.939 MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
### GLoVe with random walks ###
# GLoVe factorizes co-occurrence counts, so feed it a graph of "virtual
# edges" produced by resampling random walks on the original graph.
glove_params = {
    "n_components": N_COMPONENTS,
    "tol": 0.0005,
    "max_epoch": 6_000,
    "learning_rate": 0.02,
    "max_loss": 10.,
    "max_count": 50,
    "exponent": 0.5,
}
t0 = time.time()
wg = cg.csrgraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
w_train = nodevectors.Glove(**glove_params).fit_transform(wg)
print(f"Time: {time.time() - t0 :.4f}")
print(f"Virtual edges: {wg.dst.size}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Resample walks on the full graph and re-fit for the labeled tests
    wg = cg.csrgraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
    w = nodevectors.Glove(**glove_params).fit_transform(wg)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
1%|▏ | 76/6000 [00:02<02:54, 33.95it/s]
Time: 2.9679 Virtual edges: 53851 Link Prediction: (logit) AUC-ROC: 0.535, AUC-PR: 0.472, Acc: 0.527, F1: 0.525 (lgbm) AUC-ROC: 0.944, AUC-PR: 0.936, Acc: 0.904, F1: 0.906
5%|▌ | 327/6000 [00:01<00:24, 236.32it/s]
MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000
### GGVec with random walks ###
# Same virtual-edge trick as the GLoVe cell, but embedded with GGVec
# called directly on the resampled csrgraph.
ggvec_params = {
    "n_components": N_COMPONENTS,
    "tol": 0.02,
    "tol_samples": 200,
    "max_epoch": 6_000,
    "learning_rate": 0.02,
    "negative_ratio": 0.3,
    "exponent": 0.35,
    "verbose": True,
}
t0 = time.time()
wg = cg.csrgraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
w_train = wg.ggvec(**ggvec_params)
print(f"Time: {time.time() - t0 :.4f}")
print(f"Virtual edges: {wg.dst.size}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
if HAS_LABELS:
    # Resample walks on the full graph and re-fit for the labeled tests
    wg = cg.csrgraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
    w = wg.ggvec(**ggvec_params)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
Loss: 0.2861 : 16%|█▌ | 967/6000 [00:05<00:26, 188.58it/s]
Converged! Loss: 0.2859 Time: 5.6420 Virtual edges: 54151 Link Prediction: (logit) AUC-ROC: 0.534, AUC-PR: 0.485, Acc: 0.527, F1: 0.530 (lgbm) AUC-ROC: 0.958, AUC-PR: 0.944, Acc: 0.937, F1: 0.937
Loss: 0.2796 : 15%|█▌ | 911/6000 [00:03<00:18, 270.13it/s]
Converged! Loss: 0.2795 MI: 1.00, RAND 1.00, FM: 1.00 Label Prediction: (logit) Acc: 1.000, F1 micro: 1.000, F1 macro: 1.000 (lgbm) Acc: 0.990, F1 micro: 0.990, F1 macro: 0.990
###### Slooooowwwwwww ########
# walklets_params = dict(
# walk_number=10,
# walk_length=30,
# dimensions=N_COMPONENTS,
# window_size=4,
# epochs=1,
# learning_rate=0.05
# )
# try: # Karateclub models don't handle certain graphs
# start_t = time.time()
# model = Walklets(**walklets_params)
# model.fit(G_train)
# print(f"Time: {time.time() - start_t :.3f}")
# w_train = model.get_embedding()
# result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# if HAS_LABELS:
# model = Walklets(**walklets_params)
# model.fit(G)
# w = model.get_embedding()
# graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
# except: pass
### Completely random baseline ###
# Random embeddings establish a floor for the metrics above.
w = np.random.randn(len(G), N_COMPONENTS)
result = link_pred.LinkPrediction(w, G, G_train, testing_pos_edges)
# The original wrapped this in a bare `try/except: pass` to skip unlabeled
# graphs, which also hid real failures. Use the HAS_LABELS flag set earlier,
# consistent with every other cell in this notebook.
if HAS_LABELS:
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)