import numpy as np
import pandas as pd
import cudf
import os
from sklearn.neighbors import NearestNeighbors as skKNN
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN
import gzip
def load_data(nrows, ncols, cached='data/mortgage.npy.gz', source='mortgage'):
    """Build a feature table with ``nrows`` rows as a pandas DataFrame.

    If ``source`` is ``'mortgage'`` and the gzip-compressed ``.npy`` file
    ``cached`` exists, ``nrows`` rows are drawn from it at random (with
    replacement) and the first ``ncols`` columns are kept.  In every other
    case a float32 matrix of uniform random values is generated instead.

    Parameters
    ----------
    nrows : int
        Number of rows in the returned frame.
    ncols : int
        Number of columns to generate, or the maximum number of leading
        columns to keep from the cached array.
    cached : str
        Path of the gzip-compressed NumPy array to sample from.
    source : str
        Only the value ``'mortgage'`` enables reading ``cached``.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``fea0``, ``fea1``, ...; missing values are
        replaced by 0.
    """
    if os.path.exists(cached) and source == 'mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive: passing the row count itself
        # makes every row (including the last) eligible and keeps a
        # single-row cache from raising "low >= high".
        # Assumes the cached array is 2-D (rows x features) -- TODO confirm.
        sampled_rows = np.random.randint(0, X.shape[0], nrows)
        X = X[sampled_rows, :ncols]
    else:
        print('use random data')
        X = np.random.random((nrows, ncols)).astype('float32')
    df = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])}).fillna(0)
    return df
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=1e-3,with_sign=True,metric='mse'):
    """Report whether ``a`` and ``b`` agree within ``threshold``.

    Both inputs are first converted with ``to_nparray``.

    Parameters
    ----------
    a, b : array-like
        Values to compare; anything ``to_nparray`` accepts.
    threshold : float
        Tolerance whose meaning depends on ``metric``.
    with_sign : bool
        When falsy, the comparison is done on ``abs(a)`` and ``abs(b)``.
    metric : {'mse', 'abs', 'acc'}
        ``'mse'``: the mean squared error must be below ``threshold``.
        ``'abs'``: no element may differ by more than ``threshold`` in
        absolute value.
        ``'acc'``: the fraction of elements that are not exactly equal must
        be below ``threshold``.

    Returns
    -------
    bool
        True when the inputs are considered equal under ``metric``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    if metric == 'mse':
        return mean_squared_error(a, b) < threshold
    if metric == 'abs':
        # Use the magnitude of the difference: a signed test would accept
        # elements of ``a`` that are far *below* their counterpart in ``b``.
        return not np.any(np.abs(a - b) > threshold)
    if metric == 'acc':
        # The mean of the mismatch mask is the mismatch fraction for any rank.
        return np.mean(a != b) < threshold
    raise ValueError("metric must be 'mse', 'abs' or 'acc', got %r" % (metric,))
def accuracy(a,b, threshold=1e-4):
    """Report whether ``a`` and ``b`` match closely enough element-wise.

    Both inputs are converted with ``to_nparray``.  An element counts as a
    mismatch when the two values differ by more than 1 in either direction;
    the result is True when the fraction of mismatching elements is below
    ``threshold``.

    Parameters
    ----------
    a, b : array-like
        Values to compare; anything ``to_nparray`` accepts.
    threshold : float
        Upper bound (exclusive) on the tolerated mismatch fraction.

    Returns
    -------
    bool
        True when the mismatch fraction is below ``threshold``.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    # Compare magnitudes so that deviations in both directions are counted;
    # a signed difference would ignore every element where a < b.
    # NOTE(review): the tolerance of 1 is kept from the original -- confirm
    # whether exact equality was intended for index comparisons.
    deviation = np.abs(a - b)
    # ``size`` equals rows * cols for 2-D input and also covers other ranks.
    mismatch_fraction = np.count_nonzero(deviation > 1) / deviation.size
    return mismatch_fraction < threshold
def to_nparray(x):
    """Convert the supported container types to a NumPy array.

    * ``numpy.ndarray`` / ``pandas.DataFrame`` -> a new array built with
      ``np.array``.
    * ``numpy.float64`` scalar -> a one-element array.
    * ``cudf.DataFrame`` / ``cudf.Series`` -> the values of the pandas
      conversion.

    Any other object is handed back unchanged.
    """
    host_types = (np.ndarray, pd.DataFrame)
    if isinstance(x, host_types):
        return np.array(x)

    if isinstance(x, np.float64):
        # Wrap the scalar so callers always get something indexable.
        return np.array([x])

    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        as_pandas = x.to_pandas()
        return as_pandas.values

    return x
%%time
# Problem size for the benchmark: 2**15 rows by 40 feature columns.
nrows = 2**15
ncols = 40
# load_data samples the cached mortgage file when it exists and otherwise
# generates random float32 data; the result is a pandas DataFrame.
X = load_data(nrows,ncols)
print('data',X.shape)
use random data data (32768, 40) CPU times: user 34.2 ms, sys: 3.49 ms, total: 37.6 ms Wall time: 35.9 ms
# Number of neighbours requested from both KNN implementations below.
n_neighbors = 10
%%time
# CPU baseline: scikit-learn NearestNeighbors with the squared-Euclidean
# metric, fitted on the pandas DataFrame and queried with the same rows.
knn_sk = skKNN(metric = 'sqeuclidean', )
knn_sk.fit(X)
# D_sk holds the neighbour distances, I_sk the neighbour row indices.
D_sk,I_sk = knn_sk.kneighbors(X,n_neighbors)
CPU times: user 31.5 s, sys: 4.52 s, total: 36 s Wall time: 36 s
%%time
# Convert the pandas DataFrame to a cuDF DataFrame for cuML; X is rebound,
# so the pandas version is no longer available under this name.
X = cudf.DataFrame.from_pandas(X)
CPU times: user 361 ms, sys: 71.5 ms, total: 432 ms Wall time: 430 ms
%%time
# cuML NearestNeighbors with default constructor arguments, fitted on the
# cuDF DataFrame and queried with the same rows.
# NOTE(review): the sklearn model above is built with metric='sqeuclidean';
# confirm cuML's default yields distances on the same scale before comparing.
knn_cuml = cumlKNN()
knn_cuml.fit(X)
D_cuml,I_cuml = knn_cuml.kneighbors(X,n_neighbors)
CPU times: user 6.75 s, sys: 250 ms, total: 7 s Wall time: 2.71 s
# Compare the sklearn and cuML distance results with array_equal (default
# threshold) and print the verdict.
passed = array_equal(D_sk,D_cuml, metric='abs') # metric used can be 'acc', 'mse', or 'abs'
message = 'compare knn: cuml vs sklearn distances %s'%('equal'if passed else 'NOT equal')
print(message)
compare knn: cuml vs sklearn distances NOT equal
# Compare the neighbour indices with accuracy(); threshold=1e-1 is the bound
# on the fraction of entries that accuracy() counts as mismatching.
passed = accuracy(I_sk, I_cuml, threshold=1e-1)
message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')
print(message)
compare knn: cuml vs sklearn indexes equal