import numpy as np
import matplotlib.pyplot as plt
import umap
# Generate 400 points, uniformly distributed in a 4-D cube
# (np.random.rand draws each coordinate from [0, 1)).
# The 4 dimensions of each point are reused below as its (R,G,B,a) color.
data = np.random.rand(400, 4)
# Fit UMAP using its default constructor arguments.
fit = umap.UMAP()
# NOTE: `%time` is an IPython/Jupyter magic that reports how long the statement
# takes to run; this line is not valid in a plain Python script.
%time u = fit.fit_transform(data)
# Plot the dots at positions (x,y) = columns 0 and 1 of the UMAP embedding,
# coloring every dot with its own row of `data`.
plt.scatter(u[:,0], u[:,1], c=data)
plt.show()
CPU times: user 7.27 s, sys: 105 ms, total: 7.38 s Wall time: 7.35 s
# let's define a utility that computes the UMAP and plots it
def draw_umap(n_neighbors=15, min_dist=0.1, n_components=2, metric='euclidean'):
    """Fit UMAP on the module-level ``data`` array and show a scatter plot.

    Every dot is colored with its own row of ``data`` (passed to matplotlib
    as the ``c`` argument).

    Parameters
    ----------
    n_neighbors, min_dist, n_components, metric
        Forwarded unchanged to ``umap.UMAP``.

    Notes
    -----
    How the embedding is drawn depends on ``n_components``:

    * 1 -- x is the embedding value, y is the index of the point in ``data``.
    * 2 -- x and y are embedding columns 0 and 1.
    * 3 -- x and y are embedding columns 0 and 1; the marker size is
      ``100 * |column 2|``.

    For any other value nothing is drawn before the figure is shown.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    fit = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
    )
    u = fit.fit_transform(data)

    # The three layouts are mutually exclusive, hence a single elif chain.
    if n_components == 1:
        plt.scatter(u[:, 0], range(len(u)), c=data)
    elif n_components == 2:
        plt.scatter(u[:, 0], u[:, 1], c=data)
    elif n_components == 3:
        # Marker sizes must be non-negative, so use the magnitude of the
        # third coordinate; computed in one vectorized step rather than a
        # Python-level loop over the array.
        sizes = 100 * np.abs(u[:, 2])
        plt.scatter(u[:, 0], u[:, 1], s=sizes, c=data)
    return plt.show()
# n_neighbors (default=15): draw one embedding per value
neighbor_counts = (2, 3, 4, 5, 8, 12, 20, 40, 100, 200)
for n in neighbor_counts:
    # Label the plot that follows with the value being tried.
    print(n, 'neighbors')
    draw_umap(n_neighbors=n)
2 neighbors
3 neighbors
4 neighbors
5 neighbors
8 neighbors
12 neighbors
20 neighbors
40 neighbors
100 neighbors
200 neighbors
# min_dist (default=0.1): draw one embedding per value
min_dists = (0., 0.1, 0.2, 0.3, 0.5, 0.8, 0.99)
for d in min_dists:
    # Label the plot that follows with the value being tried.
    print(d, 'min_dist')
    draw_umap(min_dist=d)
0.0 min_dist
0.1 min_dist
0.2 min_dist
0.3 min_dist
0.5 min_dist
0.8 min_dist
0.99 min_dist
# We can reduce our dataset further, to n_components = 1 dimension (default 2)
# - x = UMAP value
# - y = index of the point in the dataset (not random); this spreads the dots
#   out along the y axis, so as to make them visible
draw_umap(n_components=1)
# Or we can reduce it to dimension 3
# - x, y = UMAP values 0 and 1
# - size = 100 * |UMAP value 2|, think "nearer" or "further"
draw_umap(n_components=3)
# Change the metric
import numba
# compare only red channel
@numba.njit()
def Rmetric(a, b):
    """Return the absolute difference between component 0 (red) of a and b."""
    red_a = a[0]
    red_b = b[0]
    return np.abs(red_a - red_b)
# compare red + blue, ignore green
@numba.njit()
def RBmetric(a, b):
    """Return |a[0] - b[0]| + |a[2] - b[2]| (red and blue); component 1 is ignored."""
    red_gap = np.abs(a[0] - b[0])
    blue_gap = np.abs(a[2] - b[2])
    return red_gap + blue_gap
# luminance, approximated as the unweighted sum R + G + B
@numba.njit()
def Lmetric(a, b):
    """Return the absolute difference between the sums of components 0-2 of a and b."""
    total_a = a[0] + a[1] + a[2]
    total_b = b[0] + b[1] + b[2]
    return np.abs(total_a - total_b)
# Draw one 2-D embedding per metric: the built-in "euclidean" name first,
# then each of the custom numba-compiled metrics.
metrics = ("euclidean", Rmetric, RBmetric, Lmetric)
for m in metrics:
    # Print the metric so the plot that follows can be identified.
    print(m)
    draw_umap(n_components=2, metric=m)
euclidean
CPUDispatcher(<function Rmetric at 0x1513ed6158>)
CPUDispatcher(<function RBmetric at 0x1513ed6510>)
CPUDispatcher(<function Lmetric at 0x1513ed6ae8>)