import gc
import networkx as nx
import numpy as np
import os
import pandas as pd
import time
import scipy
import sklearn
from sklearn import cluster, linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sys
import warnings # Silence perf warning
sys.path.append(os.path.realpath('..'))
import nodevectors
import csrgraph as cg
from csrgraph import methods
from nodevectors.evaluation import link_pred
from nodevectors.evaluation import graph_eval
# UMAP to test (on pip)
import umap
warnings.simplefilter("ignore")
def nx_node_weights(G, method, **kwargs):
    """Node Weights through networkX API.

    Runs `method(G, **kwargs)` (e.g. nx.pagerank) and unpacks the resulting
    {node: score} dict into a dense array indexed by node id.
    Assumes node ids are integers in range(len(G)).
    """
    scores = method(G, **kwargs)
    weights = np.zeros(len(G))
    for node in G.nodes:
        weights[node] = scores[node]
    return weights
Data for these notebooks can be found here: https://github.com/VHRanger/Graph-Data. Just download it and point the graph generation methods below to it.
The data is kept in a separate repo to avoid polluting the pip package.
#### CONFIG
# Fraction of edges held out for link-prediction testing; also used as the
# test split fraction for label prediction.
TEST_SIZE = 0.2
OUT_FILE = 'email.csv'  # label-prediction results; link-pred results go to "linkpred_" + OUT_FILE
SEED = 42  # random seed for the label-prediction train/test split
# Embedding dimensions swept over by every evaluation loop below.
ALL_COMPONENTS = [1, 2, 4, 8, 16, 32, 64, 128, 256]
#### GRAPHS
#### Uncomment one to choose which graph to run evaluation on
#### Artificial random graphs
# G = nx.binomial_graph(700, 0.6)
G, labels = graph_eval.make_cluster_graph(n_nodes=820, n_clusters=18, connections=1000, drop_pct=0.5)
# G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)
#### Social graphs
# G, labels = graph_eval.make_blogcatalog(dedupe=True)
# G, mlabels = graph_eval.make_blogcatalog(dedupe=False)
# G, labels = graph_eval.make_email()
# G, labels = graph_eval.get_karateclub("facebook") # twitch, github, facebook, wikipedia
# G = graph_eval.get_from_snap(url="http://snap.stanford.edu/data/facebook_combined.txt.gz", sep=' ', header=None, comment='#')
#### Biology Graphs
# G, mlabels = graph_eval.get_n2v_ppi("../data/bioNEV/node2vec_PPI")
#### Needs OutOfBounds Nodes support from CSRGraphs to work
# G = graph_eval.get_drugbank_ddi("../data/bioNEV/DrugBank_DDI")
# G, mlabels = graph_eval.get_mashup_ppi("../data/bioNEV/Mashup_PPI")
#### For Link Prediction: Split graph into train and test edge sets
#### (All nodes are still present in both)
# Hold out TEST_SIZE of the edges as positive test examples; every node
# remains present in G_train so embeddings cover the full node set.
G_train, testing_pos_edges = link_pred.split_train_test_graph(G, testing_ratio=TEST_SIZE)
#### Lazy way to set up evaluation
# Detect which kind of labels (if any) the chosen graph loader produced.
# Catch only the expected failures: NameError when `labels`/`mlabels` was
# never defined by the loader, AttributeError when the expected column is
# missing.  The original bare `except:` also swallowed KeyboardInterrupt
# and genuine bugs.
try:
    # Single-label case: `labels` is a DataFrame with a `label` column.
    y = labels.label
    n_clusters = y.nunique()
    HAS_LABELS = True
    print(f"clusters: {n_clusters}")
except (NameError, AttributeError):
    try:  # Multilabel case: `mlabels.mlabels` holds label lists per node.
        y = MultiLabelBinarizer().fit_transform(mlabels.mlabels)
        HAS_LABELS = True
        print(f"multilabels: {y.shape[1]}")
    except (NameError, AttributeError):  # No labels at all
        HAS_LABELS = False
        print("No Labels")
NNODES = len(G)
print(f"Nodes: {NNODES}\nEdges: {len(G.edges)}\nconnected: {nx.is_connected(G_train)}")
clusters: 18 Nodes: 820 Edges: 9658 connected: True
### GGVEC ####
# Evaluate GGVec (order 1) across all embedding dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    ggvec_params = dict(
        n_components=N_COMPONENTS,
        order=1,
        tol=0.05,
        tol_samples=75,
        max_epoch=6_000,
        learning_rate=0.05,
        negative_ratio=0.33,
        exponent=0.33,
        verbose=True,
    )
    start_t = time.time()
    time.sleep(0.3)  # let the previous tqdm bar flush before training output
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'ggvec'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'ggvec'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1--------
Loss: 0.1498 : 1%|▏ | 88/6000 [00:02<03:04, 32.13it/s]
Converged! Loss: 0.1484 Time: 3.0665 Link Prediction: (logit) AUC-ROC: 0.517, AUC-PR: 0.463, Acc: 0.509, F1: 0.488 (lgbm) AUC-ROC: 0.772, AUC-PR: 0.700, Acc: 0.735, F1: 0.784
Loss: 0.1475 : 1%|▏ | 86/6000 [00:00<00:10, 562.90it/s]
Converged! Loss: 0.1490 Label Prediction: (logit) Acc: 0.091, F1 micro: 0.091, F1 macro: 0.091 (lgbm) Acc: 0.165, F1 micro: 0.165, F1 macro: 0.165 MI: 0.37, RAND 0.32, FM: 0.32
### GGVEC - 2 ####
# Evaluate GGVec (order 2: includes 2-hop neighborhoods) across dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    ggvec_params = dict(
        n_components=N_COMPONENTS,
        order=2,
        tol=0.1,
        tol_samples=10,
        max_epoch=500,
        learning_rate=0.1,
        negative_ratio=0.1,
        exponent=0.33,
        verbose=True,
    )
    start_t = time.time()
    time.sleep(0.3)  # let the previous tqdm bar flush before training output
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'ggvec2'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'ggvec2'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1--------
Loss: 0.0341 : 3%|▎ | 15/500 [00:00<00:01, 265.23it/s]
Converged! Loss: 0.0339 Time: 1.2681 Link Prediction: (logit) AUC-ROC: 0.466, AUC-PR: 0.436, Acc: 0.486, F1: 0.504 (lgbm) AUC-ROC: 0.819, AUC-PR: 0.776, Acc: 0.760, F1: 0.783
Loss: 0.0335 : 3%|▎ | 16/500 [00:00<00:01, 257.96it/s]
Converged! Loss: 0.0336 Label Prediction: (logit) Acc: 0.177, F1 micro: 0.177, F1 macro: 0.177 (lgbm) Acc: 0.293, F1 micro: 0.293, F1 macro: 0.293 MI: 0.31, RAND 0.30, FM: 0.30
### N2V ####
# Evaluate Node2Vec (random walks + word2vec) across embedding dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    n2v_params = dict(
        n_components=N_COMPONENTS,
        epochs=20,
        walklen=60,
        return_weight=1.,
        neighbor_weight=1.,
        w2vparams={
            "window":3,
            "negative":5,
            "iter":2,
            "batch_words":128}
    )
    start_t = time.time()
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.Node2Vec(**n2v_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'node2vec'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.Node2Vec(**n2v_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'node2vec'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1-------- Making walks... Done, T=1.21 Mapping Walk Names... Done, T=1.01 Training W2V... Done, T=3.06 Time: 5.3080 Link Prediction: (logit) AUC-ROC: 0.520, AUC-PR: 0.471, Acc: 0.510, F1: 0.505 (lgbm) AUC-ROC: 0.781, AUC-PR: 0.729, Acc: 0.718, F1: 0.759 Making walks... Done, T=0.13 Mapping Walk Names... Done, T=1.16 Training W2V... Done, T=2.81 Label Prediction: (logit) Acc: 0.110, F1 micro: 0.110, F1 macro: 0.110 (lgbm) Acc: 0.183, F1 micro: 0.183, F1 macro: 0.183 MI: -0.00, RAND 0.23, FM: 0.23
### ProNE ####
# Evaluate ProNE (spectral propagation) across embedding dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    pne_params = dict(
        n_components=N_COMPONENTS,
        step=5,
        mu=0.2,
        theta=0.5,
    )
    start_t = time.time()
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.ProNE(**pne_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'prone'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.ProNE(**pne_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'prone'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1-------- Time: 0.0400 Link Prediction: (logit) AUC-ROC: 0.555, AUC-PR: 0.552, Acc: 0.592, F1: 0.656 (lgbm) AUC-ROC: 0.720, AUC-PR: 0.653, Acc: 0.675, F1: 0.752 Label Prediction: (logit) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024 (lgbm) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024 MI: -0.00, RAND 0.23, FM: 0.23
### GRaRep ####
# Evaluate GraRep (factorized k-step transition matrices) across dimensions.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    grarep_params = dict(
        n_components=N_COMPONENTS,
        order=1,
        embedder=TruncatedSVD(
            n_iter=10,
            random_state=42),
        merger=(lambda x : np.sum(x, axis=0)),
    )
    start_t = time.time()
    # --- Link prediction: embed the train split only ---
    w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'grarep'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: embed the full graph ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        w = nodevectors.GraRep(**grarep_params).fit_transform(G)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'grarep'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
100%|██████████| 1/1 [00:00<00:00, 37.95it/s]
-------N: 1-------- Time: 0.0507 Link Prediction:
(logit) AUC-ROC: 0.519, AUC-PR: 0.572, Acc: 0.517, F1: 0.540 (lgbm) AUC-ROC: 0.895, AUC-PR: 0.865, Acc: 0.821, F1: 0.829
100%|██████████| 1/1 [00:00<00:00, 54.70it/s]
Label Prediction: (logit) Acc: 0.024, F1 micro: 0.024, F1 macro: 0.024 (lgbm) Acc: 0.354, F1 micro: 0.354, F1 macro: 0.354 MI: -0.00, RAND 0.23, FM: 0.23
### GLoVe with random walks ###
# Evaluate GLoVe trained on a random-walk-resampled co-occurrence graph.
for N_COMPONENTS in ALL_COMPONENTS:
    print(f"\n\n-------N: {N_COMPONENTS}--------")
    glove_params = dict(
        n_components=N_COMPONENTS,
        tol=0.001,
        max_epoch=6_000,
        learning_rate=0.01,
        max_loss=10.,
        max_count=50,
        exponent=0.5,
    )
    start_t = time.time()
    # --- Link prediction: resample walks from the train split, then embed ---
    wg = cg.csrgraph(G_train).random_walk_resample(walklen=7, epochs=30)
    w_train = nodevectors.Glove(**glove_params).fit_transform(wg)
    print(f"Time: {time.time() - start_t :.4f}")
    lpred = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
    lpred['algorithm'] = 'glove'
    lpred['dim'] = N_COMPONENTS
    lpred['time'] = f"{time.time() - start_t :.1f}"  # f-string is already str; str() was redundant
    lpred = pd.DataFrame([pd.Series(lpred)])
    time.sleep(0.3)
    LPRED_FILE = "linkpred_" + OUT_FILE
    # Append to the results file; write the header only when creating it.
    file_exists = os.path.isfile(LPRED_FILE)
    lpred.to_csv(LPRED_FILE, mode='a' if file_exists else 'w',
                 header=not file_exists, float_format='%.3f')
    # --- Label prediction / clustering: resample the full graph, then embed ---
    # Guarded on HAS_LABELS: `y` is undefined for unlabeled graphs and the
    # unconditional original would raise NameError there.
    if HAS_LABELS:
        wg = cg.csrgraph(G).random_walk_resample(walklen=7, epochs=30)
        w = nodevectors.Glove(**glove_params).fit_transform(wg)
        labelpred = graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
        labelpred['algorithm'] = 'glove'
        labelpred['dim'] = N_COMPONENTS
        labelpred['time'] = f"{time.time() - start_t :.1f}"
        labelpred = pd.DataFrame([pd.Series(labelpred)])
        time.sleep(0.3)
        file_exists = os.path.isfile(OUT_FILE)
        labelpred.to_csv(OUT_FILE, mode='a' if file_exists else 'w',
                         header=not file_exists, float_format='%.3f')
-------N: 1--------
1%|▏ | 83/6000 [00:02<03:00, 32.78it/s]
Time: 4.1126 Link Prediction: (logit) AUC-ROC: 0.502, AUC-PR: 0.449, Acc: 0.504, F1: 0.502 (lgbm) AUC-ROC: 0.797, AUC-PR: 0.730, Acc: 0.756, F1: 0.797
2%|▏ | 116/6000 [00:00<00:40, 144.66it/s]
Label Prediction: (logit) Acc: 0.189, F1 micro: 0.189, F1 macro: 0.189 (lgbm) Acc: 0.244, F1 micro: 0.244, F1 macro: 0.244 MI: 0.34, RAND 0.31, FM: 0.31