Tamás Gál (tamas.gal@fau.de)
The latest version of this notebook is available at https://github.com/escape2020/school2021
import numpy as np
import sys
print(f"Python version: {sys.version}\n"
f"NumPy version: {np.__version__}")
rng = np.random.default_rng(42) # initialise our random number generator
Python version: 3.9.4 | packaged by conda-forge | (default, May 10 2021, 22:10:52) [Clang 11.1.0 ] NumPy version: 1.20.3
N = 500
n_dims = 3
points = rng.random((N, n_dims))
points
array([[0.77395605, 0.43887844, 0.85859792], [0.69736803, 0.09417735, 0.97562235], [0.7611397 , 0.78606431, 0.12811363], ..., [0.82221355, 0.64818373, 0.78175705], [0.42816986, 0.63793674, 0.856229 ], [0.63106544, 0.34767363, 0.66252959]])
N = 4
n_dims = 3
points = rng.random((N, n_dims))
points
array([[0.67185419, 0.96058696, 0.37091232], [0.42508177, 0.81212296, 0.50576231], [0.73657309, 0.45970946, 0.21549514], [0.74520384, 0.13115517, 0.19858366]])
distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)
distances[np.diag_indices_from(distances)] = np.inf
np.argmin(distances, axis=1)
array([1, 0, 3, 2])
points
array([[0.67185419, 0.96058696, 0.37091232], [0.42508177, 0.81212296, 0.50576231], [0.73657309, 0.45970946, 0.21549514], [0.74520384, 0.13115517, 0.19858366]])
points.shape
(4, 3)
reshaped_points = points.reshape(N, 1, n_dims)
reshaped_points
array([[[0.67185419, 0.96058696, 0.37091232]], [[0.42508177, 0.81212296, 0.50576231]], [[0.73657309, 0.45970946, 0.21549514]], [[0.74520384, 0.13115517, 0.19858366]]])
reshaped_points.shape
(4, 1, 3)
differences = reshaped_points - points
differences
array([[[ 0. , 0. , 0. ], [ 0.24677242, 0.148464 , -0.13484999], [-0.0647189 , 0.5008775 , 0.15541718], [-0.07334965, 0.82943179, 0.17232866]], [[-0.24677242, -0.148464 , 0.13484999], [ 0. , 0. , 0. ], [-0.31149133, 0.3524135 , 0.29026717], [-0.32012208, 0.68096779, 0.30717865]], [[ 0.0647189 , -0.5008775 , -0.15541718], [ 0.31149133, -0.3524135 , -0.29026717], [ 0. , 0. , 0. ], [-0.00863075, 0.3285543 , 0.01691148]], [[ 0.07334965, -0.82943179, -0.17232866], [ 0.32012208, -0.68096779, -0.30717865], [ 0.00863075, -0.3285543 , -0.01691148], [ 0. , 0. , 0. ]]])
print(reshaped_points.shape, " - ", points.shape, " => ", differences.shape)
(4, 1, 3) - (4, 3) => (4, 4, 3)
distances = np.linalg.norm(differences, axis=2)
distances
array([[0. , 0.31799797, 0.52841395, 0.85031432], [0.31799797, 0. , 0.55269987, 0.81274473], [0.52841395, 0.55269987, 0. , 0.32910244], [0.85031432, 0.81274473, 0.32910244, 0. ]])
distances.shape
(4, 4)
distances
array([[0. , 0.31799797, 0.52841395, 0.85031432], [0.31799797, 0. , 0.55269987, 0.81274473], [0.52841395, 0.55269987, 0. , 0.32910244], [0.85031432, 0.81274473, 0.32910244, 0. ]])
# get rid of self-distances first ;)
distances[np.diag_indices_from(distances)] = np.inf
distances
array([[ inf, 0.31799797, 0.52841395, 0.85031432], [0.31799797, inf, 0.55269987, 0.81274473], [0.52841395, 0.55269987, inf, 0.32910244], [0.85031432, 0.81274473, 0.32910244, inf]])
nearest_points = np.argmin(distances, axis=1)
nearest_points
array([1, 0, 3, 2])
scikit-learn
¶nearest_points
array([1, 0, 3, 2])
from sklearn.neighbors import NearestNeighbors
_, indices = NearestNeighbors().fit(points).kneighbors(points, 2)
indices[:, 1]
array([1, 0, 3, 2])
%timeit _, indices = NearestNeighbors().fit(points).kneighbors(points, 2)
84 µs ± 1.67 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%%timeit
distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)
distances[np.diag_indices_from(distances)] = np.inf
np.argmin(distances, axis=1)
11.6 µs ± 229 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
...but it also requires much more memory O(N^2) due to the blow-up-trick.
N = 1000
n_dims = 3
points = rng.random((N, n_dims))
points
array([[0.62682498, 0.7472698 , 0.89468789], [0.2725865 , 0.11072426, 0.95604666], [0.15442309, 0.19766698, 0.29132945], ..., [0.46039559, 0.13064526, 0.39747408], [0.4116557 , 0.18771592, 0.75630711], [0.32140068, 0.58898729, 0.51573018]])
%timeit _, indices = NearestNeighbors().fit(points).kneighbors(points, 2)
892 µs ± 7.72 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%%timeit
distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)
distances[np.diag_indices_from(distances)] = np.inf
np.argmin(distances, axis=1)
22.9 ms ± 1.57 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)