# Standard-library imports.
import math
import sys
import time

# Third-party imports.
import nmslib
import numpy
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Report interpreter and library versions for reproducibility.
print(sys.version)
print("NMSLIB version:", nmslib.__version__)
# Read the whole dataset, then carve out a 10% held-out query set; the
# remaining 90% becomes the set of data points to index.
full_matrix = numpy.loadtxt('../../sample_data/final128_10K.txt')
data_matrix, query_matrix = train_test_split(full_matrix, test_size=0.1)
print("# of queries %d, # of data points %d" % (query_matrix.shape[0], data_matrix.shape[0]))
# Typical output: 1000 queries, 9000 data points.
# Set index-time parameters. These are the most important ones.
# NOTE(review): per the NMSLIB/HNSW docs, M bounds the number of graph
# neighbors per node and efConstruction sizes the build-time candidate
# list — confirm against the nmslib manual for your version.
M = 15
efC = 100
num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
print('Index-time parameters', index_time_params)
# Output: Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}
# Number of nearest neighbors to retrieve per query.
K = 100
# Space name must match the metric used later for the brute-force
# gold-standard search ('l2' in both places).
space_name = 'l2'
# Initialize the library, specify the space and the vector type, and add the data points.
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)
index.addDataPointBatch(data_matrix)
# Output: 9000  (number of points added)
# Build the HNSW index and time it.
# NOTE(review): this dict omits the 'post': 0 entry used in the earlier
# parameter dict, so createIndex sees only M/indexThreadQty/efConstruction —
# confirm dropping 'post' here is intentional.
start = time.time()
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params)
end = time.time()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))
# Output: Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100} Indexing time = 0.246434
# Set query-time parameters: efSearch controls the size of the search-time
# candidate list (larger -> higher recall, slower queries).
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)
# Output: Setting query-time parameters {'efSearch': 100}
# Run the batched approximate k-NN queries and time them.
query_qty = query_matrix.shape[0]
start = time.time()
nbrs = index.knnQueryBatch(query_matrix, k=K, num_threads=num_threads)
end = time.time()
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))
# Output: kNN time total=0.026962 (sec), per query=0.000027 (sec), per query adjusted for thread number=0.000108 (sec)
# Compute the gold-standard (exact) neighbors with brute-force search so the
# approximate results can be scored for recall.
print('Computing gold-standard data')
start = time.time()
sindx = NearestNeighbors(n_neighbors=K, metric='l2', algorithm='brute').fit(data_matrix)
end = time.time()
print('Brute-force preparation time %f' % (end - start))
start = time.time()
# gs is a (distances, indices) pair of arrays, one row per query.
gs = sindx.kneighbors(query_matrix)
end = time.time()
print('brute-force kNN time total=%f (sec), per query=%f (sec)' %
      (end-start, float(end-start)/query_qty))
# Output: Computing gold-standard data Brute-force preparation time 0.001275 brute-force kNN time total=0.242228 (sec), per query=0.000242 (sec)
# Compute recall: for each query, the fraction of exact neighbors that the
# approximate search also returned, averaged over all queries.
recall = 0.0
for i in range(query_qty):
    correct_set = set(gs[1][i])   # exact neighbor ids for query i
    ret_set = set(nbrs[i][0])     # approximate neighbor ids for query i
    recall += len(correct_set.intersection(ret_set)) / float(len(correct_set))
recall = recall / query_qty
print('kNN recall %f' % recall)
# Output: kNN recall 0.993460
# Persist only the meta index to disk — no raw data is saved.
index_file = 'dense_index_optim.bin'
index.saveIndex(index_file, save_data=False)

# Re-initialize the library with the same space and vector type.
newIndex = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)

# For an optimized L2 index there is no need to re-add the data points; a
# non-optimized index, or any method other than HNSW (other methods save only
# meta indices), would require this step:
# newIndex.addDataPointBatch(data_matrix)

# Re-load the saved index so queries can be re-run against it.
newIndex.loadIndex(index_file)
# Apply the same query-time parameters to the re-loaded index and re-run the queries.
print('Setting query-time parameters', query_time_params)
newIndex.setQueryTimeParams(query_time_params)
query_qty = query_matrix.shape[0]
start = time.time()
new_nbrs = newIndex.knnQueryBatch(query_matrix, k=K, num_threads=num_threads)
end = time.time()
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))
# Output: Setting query-time parameters {'efSearch': 100} kNN time total=0.026182 (sec), per query=0.000026 (sec), per query adjusted for thread number=0.000105 (sec)
# Compute recall for the re-loaded index's results; it should match the
# recall of the original index.
recall = 0.0
for i in range(query_qty):
    correct_set = set(gs[1][i])       # exact neighbor ids for query i
    ret_set = set(new_nbrs[i][0])     # approximate neighbor ids for query i
    recall += len(correct_set.intersection(ret_set)) / float(len(correct_set))
recall = recall / query_qty
print('kNN recall %f' % recall)
# Output: kNN recall 0.993460