import numpy as np
from numba import cuda
@cuda.jit('void( float64 [ : , : ] , float64 [ : , : ] , float64 [ : , : ] , int32 )')
def cu_matmul(a , b, c , n) :
x, y = cuda.grid (2)
if (x >= n) or (y >= n) :
return
c[x, y] = 0
for i in range(n) :
c[x, y] += a[x, i ] * b[ i , y]
device = cuda.get_current_device()
tpb = device.WARP_SIZE
n = 320
bpg = (n+tpb-1)//tpb
grid_dim = (bpg, bpg)
block_dim = (tpb , tpb)
A = np.random.random((n, n ) ).astype (np. float64 )
B = np.random.random((n, n ) ).astype (np. float64 )
C = np.empty((n, n) , dtype=np.float64 )
dev_A = cuda.to_device(A)
dev_B = cuda.to_device(B)
dev_C = cuda.to_device(C, copy=False )
result_cuda = %timeit -o cu_matmul[grid_dim , block_dim](dev_A, dev_B, dev_C, n)
dev_C. copy_to_host(C)
assert (np. allclose (np. dot(A, B) , C))
14.7 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit C = np.dot(A, B)
525 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)