Title: Distances and Neighbors in High Dimensions
Author: Thomas Breuel
Institution: UniKL
from scipy.spatial.distance import cdist
from numpy import sort as asort
# assumes a %pylab environment (rand, dot, eig, plot, etc. in the global namespace)
def random_rotation(d):
    # eigenvectors of a random symmetric matrix form a random orthogonal matrix
    M = rand(d,d); _,M = eig(dot(M.T,M)); return M
Distribution of Vector Lengths:
# lengths of uniform random vectors in [0,1]^d
for d in [2,4,8,16,32]:
    vs = rand(100000,d)
    ds = sum(vs**2,axis=1)**.5
    plot(linspace(0,10,100),histogram(ds,100,range=(0,10))[0])
Distribution of Relative Vector Lengths:
for d in [2,4,8,16,32]:
    vs = rand(100000,d)
    ds = sum(vs**2,axis=1)**.5
    ds /= mean(ds)
    plot(linspace(0,4,100),histogram(ds,100,range=(0,4))[0])
Distribution of Relative Vector Lengths (large dims):
for d in [10,100,1000]:
    vs = rand(100000,d)
    ds = sum(vs**2,axis=1)**.5
    ds /= mean(ds)
    plot(linspace(0,4,100),histogram(ds,100,range=(0,4))[0])
As the dimension grows, the relative vector lengths concentrate ever more tightly around their mean.
Pairwise Distances:
for d in [10,100,1000]:
    vs = rand(10000,d)
    dists = cdist(vs,vs)
    fill_diagonal(dists,inf)   # exclude self-distances
    md = amin(dists,axis=1)    # distance to the nearest neighbor
    md /= mean(md)
    plot(linspace(0,2,100),histogram(md,100,range=(0,2))[0])
Nearest-neighbor distances concentrate in the same way: in high dimensions, most points lie at nearly the same distance from their nearest neighbor.
Fraction of Neighbors within 10% of Nearest Neighbor:
r = 1000
eps = 0.1
dims = [2,5,10,20,50,100,200,500,1000,2000]
fracs = []
for d in dims:
    vs = rand(r,d)
    dists = cdist(vs,vs)
    dists = asort(dists,axis=1)
    f = mean(sum(dists<((1+eps)*dists[:,1][:,newaxis]),axis=1)-1)*1.0/r
    fracs.append(f)
# probability that a randomly chosen vector has distance within 10% of nearest neighbor
gca().set_xscale("log"); plot(dims,fracs)
Nearest Neighbor Classification and Approximate Nearest Neighbor Algorithms:
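With exact search, nearest-neighbor classification is just an index lookup. Tree-based indexes such as k-d trees are exact, but their speed advantage over brute force erodes as the intrinsic dimension of the data grows, which is what motivates approximate nearest-neighbor algorithms. A minimal sketch using scikit-learn (the dataset and split here are arbitrary choices for illustration):
from sklearn.neighbors import KNeighborsClassifier
# 1-NN classification; algorithm="kd_tree" builds an exact spatial index,
# but in high dimensions it degenerates toward brute-force search
X = rand(1000,3)
y = (X[:,0]<0.5)*1
knn = KNeighborsClassifier(n_neighbors=1, algorithm="kd_tree")
knn.fit(X[:800], y[:800])
print(mean(knn.predict(X[800:])==y[800:]))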
Intrinsic Dimension:
l = rand(10000)
c = (l<0.5)*1
v = c_[cos(15*l),sin(15*l),l]   # a 1D curve (a spiral) embedded in 3D
figsize(8,8)
from mpl_toolkits.mplot3d import Axes3D
gcf().add_subplot(111, projection='3d')
gca().scatter(v[:,0], v[:,1],v[:,2],c=array(["r","b"])[c],marker='o',alpha=0.5)
v2 = v+0.05*randn(*v.shape)   # the same curve perturbed by Gaussian noise
figsize(8,8)
gcf().add_subplot(111, projection='3d')
gca().scatter(v2[:,0],v2[:,1],v2[:,2],c=array(["r","b"])[c],marker='o',alpha=0.5)
# mean distance to the k-th nearest neighbor, averaged over all points
vdists = mean(asort(cdist(v,v),axis=1),axis=0)
n = len(vdists)
plot(vdists,arange(n))
v2dists = mean(asort(cdist(v2,v2),axis=1),axis=0)
v3 = randn(*v.shape)/2   # an intrinsically 3D comparison dataset
v3dists = mean(asort(cdist(v3,v3),axis=1),axis=0)
plot(vdists,arange(n))
plot(v2dists,arange(n))
plot(v3dists,arange(n))
At small to moderate scales (above the noise level), the number of neighbors grows linearly with distance for both the noise-free and the noisy intrinsically 1D datasets.
xlim((0,0.7)); ylim((0,1000))
plot(vdists,arange(n))
plot(v2dists,arange(n))
plot(v3dists,arange(n))
v3z = randn(*v.shape)*0.2   # a smaller-scale intrinsically 3D dataset for comparison near the origin
v3zdists = mean(asort(cdist(v3z,v3z),axis=1),axis=0)
Zooming in toward the origin, we see that at small distances the error-free, intrinsically 1D dataset still shows linear growth in the number of neighbors, whereas the dataset perturbed by Gaussian noise grows cubically, just like the intrinsically 3D dataset.
xlim((0,0.2)); ylim((0,300))
plot(vdists,arange(n))
plot(v2dists,arange(n))
plot(v3zdists,arange(n))
On a log-log plot, the neighbor count grows with slope equal to the intrinsic dimension, so we can read off the intrinsic dimensionality of the data at different scales.
plot(log(vdists[1:]),log(arange(1,n)))
plot(log(v2dists[1:]),log(arange(1,n)))
plot(log(v3dists[1:]),log(arange(1,n)))
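To make this quantitative, we can fit a line to a portion of the log-log curve; the fitted slope estimates the intrinsic dimension at that scale. A minimal sketch (the fitting range 10:200 is an arbitrary choice of scale):
# slope of log(count) vs. log(distance) estimates the intrinsic dimension;
# the index range selects the scale and is an arbitrary choice here
lo,hi = 10,200
for dists in [vdists,v2dists,v3dists]:
    slope,_ = polyfit(log(dists[lo:hi]),log(arange(lo,hi)),1)
    print(slope)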
Covering dimension:
There is a closely related measure of the dimension of a dataset, namely the covering dimension.
We ask: how many $\epsilon$-balls around randomly picked samples does it take to cover the dataset, and how does this number grow with $\epsilon$?
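A simple way to estimate this is greedy: repeatedly pick an uncovered sample as a new ball center and mark everything within $\epsilon$ of it as covered. A minimal sketch (the greedy count is only an upper bound on the true covering number, and the $\epsilon$ values are arbitrary):
def covering_number(vs,eps):
    # greedy cover: pick an uncovered point, cover its eps-ball, repeat
    uncovered = ones(len(vs),dtype=bool)
    count = 0
    while uncovered.any():
        i = flatnonzero(uncovered)[0]
        uncovered &= (cdist(vs[i:i+1],vs)[0] > eps)
        count += 1
    return count
for eps in [0.4,0.2,0.1,0.05]:
    print(eps, covering_number(v2,eps))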
Determining intrinsic dimensionality:
# an essentially 1D "needle" dataset, randomly rotated in 3D
vs = c_[randn(100,1),0.02*randn(100,1),0.02*randn(100,1)]
M = random_rotation(3)
vsr = dot(vs,M.T)
figsize(8,8)
gcf().add_subplot(111, projection='3d')
gca().scatter(vs[:,0],vs[:,1],vs[:,2])
gca().scatter(vsr[:,0],vsr[:,1],vsr[:,2],color='r')
# PCA by hand: the eigenvectors of the scatter matrix recover the principal axes
e,Md = eig(dot(vsr.T,vsr))
vsm = dot(vsr,Md)
gcf().add_subplot(111, projection='3d')
gca().scatter(vs[:,0],vs[:,1],vs[:,2],s=3)
gca().scatter(vsr[:,0],vsr[:,1],vsr[:,2],color='r')
gca().scatter(vsm[:,0],vsm[:,1],vsm[:,2],color='g',alpha=0.5,s=80)
from sklearn.decomposition import PCA
pca = PCA(2)
vp = pca.fit_transform(vsr)
xlim((-2,2)); ylim((-2,2))
scatter(vp[:,0],vp[:,1])
Metric Multidimensional Scaling
Given a set of vectors $v_i$, find another set of corresponding vectors $u_i$ such that the following error is minimized:
$$ E = \sum_{ij} \left( d(v_i,v_j) - d(u_i,u_j) \right)^2 $$
# Multidimensional Scaling
from sklearn import manifold
mds = manifold.MDS(2)
vl = mds.fit_transform(v[::50])
scatter(vl[:,0],vl[:,1],c=array(["r","b"])[c[::50]])
Locally Linear Embedding
# Locally Linear Embedding
lle = manifold.LocallyLinearEmbedding()
vl = lle.fit_transform(v[::50])
scatter(vl[:,0],vl[:,1],c=array(["r","b"])[c[::50]])
Isomap
# Isomap
iso = manifold.Isomap()
vl = iso.fit_transform(v[::50])
scatter(vl[:,0],vl[:,1],c=array(["r","b"])[c[::50]])
# Isomap with a small neighborhood: large neighborhoods can "short-circuit"
# across nearby turns of the spiral, so we restrict n_neighbors
iso = manifold.Isomap(n_neighbors=2)
vl = iso.fit_transform(v[::50])
scatter(vl[:,0],vl[:,1],c=array(["r","b"])[c[::50]])
Nearest-neighbor methods generally depend only on "the" intrinsic dimension of the data, not on the dimension of the space the data is embedded in. The quotes are deliberate: as the experiments above show, "the" intrinsic dimension itself depends on the scale at which we examine the data.