# Load libraries
# Math
import numpy as np
# Visualization
%matplotlib notebook
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import ndimage
# Print output of LFR code
import subprocess
# Sparse matrix
import scipy.sparse
import scipy.sparse.linalg
# 3D visualization
import pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot
# Import data
import scipy.io
# Import functions in lib folder
import sys
sys.path.insert(1, 'lib')
# Import helper functions
%load_ext autoreload
%autoreload 2
from lib.utils import compute_ncut
from lib.utils import reindex_W_with_classes
from lib.utils import nldr_visualization
from lib.utils import construct_knn_graph
from lib.utils import compute_purity
# Import distance function
import sklearn.metrics.pairwise
# Remove warnings
import warnings
warnings.filterwarnings("ignore")
# Load 10 classes of 4,000 text documents
mat = scipy.io.loadmat('datasets/20news_5classes_raw_data.mat')
X = mat['X']
n = X.shape[0]
d = X.shape[1]
Cgt = mat['Cgt'] - 1; Cgt = Cgt.squeeze()
nc = len(np.unique(Cgt))
print('Number of data =',n)
print('Data dimensionality =',d);
print('Number of classes =',nc);
Number of data = 2000 Data dimensionality = 7939 Number of classes = 5
Question 1a: Compute the k-NN graph (k=10) with L2/Euclidean distance
Hint: You may use the function W=construct_knn_graph(X,k,'euclidean')
# Your code here
Question 1b: Plot the adjacency matrix of the graph.
Hint: Use function plt.spy(W, markersize=1)
# Your code here
Question 1c: Reindex the adjacency matrix of the graph w.r.t. ground
truth communities. Plot the reindexed adjacency matrix of the graph.
Hint: You may use the function [W_reindex,C_classes_reindex]=reindex_W_with_classes(W,C_classes).
# Your code here
# Your code here
Question 1d: Perform graph clustering with NCut technique. What is
the clustering accuracy of the NCut solution? What is the clustering
accuracy of a random partition? Reindex the adjacency matrix of the
graph w.r.t. NCut communities. Plot the reindexed adjacency matrix of
the graph.
Hint: You may use function C_ncut, accuracy = compute_ncut(W,C_solution,n_clusters) that performs Ncut clustering.
Hint: You may use function accuracy = compute_purity(C_computed,C_solution,n_clusters) that returns the
accuracy of a computed partition w.r.t. the ground truth partition. A
random partition can be generated with the function np.random.randint.
# Your code here
Question 2a: Compute the k-NN graph (k=10) with Cosine distance.
Answer to questions 1b-1d for this graph.
Hint: You may use function W=construct_knn_graph(X,10,'cosine').
# Reload data matrix
X = mat['X']
# Your code here
# Compute the k-NN graph with Cosine distance
# Your code here
# Your code here
# Your code here
Question 2b: Visualize the adjacency matrix with the non-linear reduction
technique in 2D and 3D.
Hint: You may use function [X,Y,Z] = nldr_visualization(W).
Hint: You may use function plt.scatter(X,Y,c=Cncut) for 2D visualization and ax.scatter(X,Y,Z,c=Cncut) for 3D visualization.
# Your code here