# Print the Julia version and platform details for the session that produced these timings.
versioninfo()
Julia Version 1.7.0-DEV.1112 Commit 8a0a9a7388 (2021-05-18 18:42 UTC) Platform Info: OS: Windows (x86_64-w64-mingw32) CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz WORD_SIZE: 64 LIBM: libopenlibm LLVM: libLLVM-11.0.1 (ORCJIT, skylake) Environment: JULIA_NUM_THREADS = 12 JULIA_PYTHONCALL_EXE = C:\Users\genkuroki\.julia\conda\3\python.exe
using LoopVectorization, SLEEFPirates, BenchmarkTools
"""
    sin_ccall(x::Float64)

Return the sine of `x` by calling the C symbol `sin` directly through `ccall`.
"""
function sin_ccall(x::Float64)
    # Cdouble is Julia's alias for the C `double` type, i.e. Float64.
    return ccall(:sin, Cdouble, (Cdouble,), x)
end
"""
    f_ccall(N)

Accumulate `sin(n)/n` for `n = 1, …, N`, evaluating the sine with `sin_ccall`, and
return `2 * sum + 1`. The series sums to `(π - 1)/2`, so the result approaches `π`
for large `N`.
"""
function f_ccall(N)
    total = 0.0
    for k in 1:N
        # sin_ccall only accepts Float64, so convert the integer index first.
        total += sin_ccall(Float64(k)) / k
    end
    return 2total + 1
end
# Time f_ccall with N = 10^6 terms.
@btime f_ccall(10^6)
31.395 ms (0 allocations: 0 bytes)
3.141590588883843
"""
    f_naive(N)

Accumulate `sin(n)/n` for `n = 1, …, N` with a plain sequential loop and Base's `sin`,
then return `2 * sum + 1`. The series sums to `(π - 1)/2`, so the result approaches
`π` for large `N`.
"""
function f_naive(N)
    total = 0.0
    for k in 1:N
        total += sin(k) / k
    end
    return 2total + 1
end
# Time f_naive with N = 10^6 terms.
@btime f_naive(10^6)
11.739 ms (0 allocations: 0 bytes)
3.141590588883843
"""
    f_simd(N)

Accumulate `sin(n)/n` for `n = 1, …, N` with the loop annotated by `@simd`, then
return `2 * sum + 1`. The result approaches `π` for large `N`.
"""
function f_simd(N)
    total = 0.0
    # @simd allows the compiler to reorder the floating-point accumulation.
    @simd for k in 1:N
        total += sin(k) / k
    end
    return 2total + 1
end
# Time f_simd with N = 10^6 terms.
@btime f_simd(10^6)
11.932 ms (0 allocations: 0 bytes)
3.141590588883843
"""
    f_sleef(N)

Accumulate `sin(n)/n` for `n = 1, …, N` with a plain loop, evaluating the sine with
`SLEEFPirates.sin_fast`, then return `2 * sum + 1`.
"""
function f_sleef(N)
    total = 0.0
    for k in 1:N
        # sin_fast is called on the floating-point value of the index.
        total += SLEEFPirates.sin_fast(float(k)) / k
    end
    return 2total + 1
end
# Time f_sleef with N = 10^6 terms.
@btime f_sleef(10^6)
6.512 ms (0 allocations: 0 bytes)
3.1415905888838433
"""
    f_simd_sleef(N)

Accumulate `sin(n)/n` for `n = 1, …, N` in a `@simd` loop, evaluating the sine with
`SLEEFPirates.sin_fast`, then return `2 * sum + 1`.
"""
function f_simd_sleef(N)
    total = 0.0
    # @simd allows the compiler to reorder the floating-point accumulation.
    @simd for k in 1:N
        total += SLEEFPirates.sin_fast(float(k)) / k
    end
    return 2total + 1
end
# Time f_simd_sleef with N = 10^6 terms.
@btime f_simd_sleef(10^6)
3.026 ms (0 allocations: 0 bytes)
3.141590588883785
"""
    f_avx(N)

Accumulate `sin(n)/n` for `n = 1, …, N` in a loop wrapped by LoopVectorization's
`@avx` macro, then return `2 * sum + 1`.
"""
function f_avx(N)
    total = 0.0
    # NOTE(review): later LoopVectorization releases spell this macro `@turbo`;
    # confirm against the installed version before renaming.
    @avx for k in 1:N
        total += sin(k) / k
    end
    return 2total + 1
end
# Time f_avx with N = 10^6 terms.
@btime f_avx(10^6)
1.609 ms (0 allocations: 0 bytes)
3.1415905888837843
"""
    f_avx_sleef(N)

Accumulate `sin(n)/n` for `n = 1, …, N` in an `@avx` loop, calling
`SLEEFPirates.sin_fast` explicitly for the sine, then return `2 * sum + 1`.
"""
function f_avx_sleef(N)
    total = 0.0
    @avx for k in 1:N
        total += SLEEFPirates.sin_fast(float(k)) / k
    end
    return 2total + 1
end
# Time f_avx_sleef with N = 10^6 terms.
@btime f_avx_sleef(10^6)
2.347 ms (0 allocations: 0 bytes)
3.141590588883785
"""
    f_avxt(N)

Accumulate `sin(n)/n` for `n = 1, …, N` in a loop wrapped by LoopVectorization's
`@avxt` macro, then return `2 * sum + 1`.
"""
function f_avxt(N)
    total = 0.0
    # NOTE(review): `@avxt` is presumably the multithreaded counterpart of `@avx`
    # (later releases: `@tturbo`) — confirm against the installed version.
    @avxt for k in 1:N
        total += sin(k) / k
    end
    return 2total + 1
end
# Time f_avxt with N = 10^6 terms.
@btime f_avxt(10^6)
375.300 μs (24 allocations: 464 bytes)
3.141590588883795
"""
    f_avxt_sleef(N)

Accumulate `sin(n)/n` for `n = 1, …, N` in an `@avxt` loop, calling
`SLEEFPirates.sin_fast` explicitly for the sine, then return `2 * sum + 1`.
"""
function f_avxt_sleef(N)
    total = 0.0
    @avxt for k in 1:N
        total += SLEEFPirates.sin_fast(float(k)) / k
    end
    return 2total + 1
end
# Time f_avxt_sleef with N = 10^6 terms.
@btime f_avxt_sleef(10^6)
476.200 μs (24 allocations: 464 bytes)
3.14159058888379
# Re-run every f_* variant at N = 10^6 in one cell so the timings appear side by side.
# Each `print` emits a label (without a newline) ahead of the line printed by `@btime`.
print("f_ccall(10^6): ")
@btime f_ccall(10^6)
print("f_naive(10^6): ")
@btime f_naive(10^6)
print("f_simd(10^6): ")
@btime f_simd(10^6)
print("f_sleef(10^6): ")
@btime f_sleef(10^6)
print("f_simd_sleef(10^6): ")
@btime f_simd_sleef(10^6)
print("f_avx(10^6): ")
@btime f_avx(10^6)
print("f_avx_sleef(10^6): ")
@btime f_avx_sleef(10^6)
print("f_avxt(10^6): ")
@btime f_avxt(10^6)
print("f_avxt_sleef(10^6): ")
# The trailing `;` keeps the cell from displaying the last returned value.
@btime f_avxt_sleef(10^6);
f_ccall(10^6): 31.138 ms (0 allocations: 0 bytes) f_naive(10^6): 11.924 ms (0 allocations: 0 bytes) f_simd(10^6): 11.913 ms (0 allocations: 0 bytes) f_sleef(10^6): 6.439 ms (0 allocations: 0 bytes) f_simd_sleef(10^6): 2.974 ms (0 allocations: 0 bytes) f_avx(10^6): 1.613 ms (0 allocations: 0 bytes) f_avx_sleef(10^6): 2.349 ms (0 allocations: 0 bytes) f_avxt(10^6): 353.200 μs (24 allocations: 464 bytes) f_avxt_sleef(10^6): 566.400 μs (24 allocations: 464 bytes)
"""
    log_ccall(x::Float64)

Return the natural logarithm of `x` by calling the C symbol `log` directly through
`ccall`.
"""
function log_ccall(x::Float64)
    # Cdouble is Julia's alias for the C `double` type, i.e. Float64.
    return ccall(:log, Cdouble, (Cdouble,), x)
end
"""
    g_naive(N)

Return the sum of `log(n)` for `n = 1, …, N` (that is, `log(N!)`), accumulated with
a plain sequential loop and Base's `log`.
"""
function g_naive(N)
    total = 0.0
    for k in 1:N
        total += log(k)
    end
    return total
end
"""
    g_ccall(N)

Return the sum of `log(n)` for `n = 1, …, N`, evaluating each logarithm with
`log_ccall`.
"""
function g_ccall(N)
    total = 0.0
    for k in 1:N
        # log_ccall only accepts Float64, so convert the integer index first.
        total += log_ccall(float(k))
    end
    return total
end
"""
    g_sleef(N)

Return the sum of `log(n)` for `n = 1, …, N` using a plain loop and
`SLEEFPirates.log_fast` for each logarithm.
"""
function g_sleef(N)
    total = 0.0
    for k in 1:N
        total += SLEEFPirates.log_fast(float(k))
    end
    return total
end
"""
    g_simd_sleef(N)

Return the sum of `log(n)` for `n = 1, …, N` using a `@simd` loop and
`SLEEFPirates.log_fast` for each logarithm.
"""
function g_simd_sleef(N)
    total = 0.0
    # @simd allows the compiler to reorder the floating-point accumulation.
    @simd for k in 1:N
        total += SLEEFPirates.log_fast(float(k))
    end
    return total
end
"""
    g_avx(N)

Return the sum of `log(n)` for `n = 1, …, N`, with the loop wrapped by
LoopVectorization's `@avx` macro.
"""
function g_avx(N)
    total = 0.0
    # NOTE(review): later LoopVectorization releases spell this macro `@turbo`;
    # confirm against the installed version before renaming.
    @avx for k in 1:N
        total += log(k)
    end
    return total
end
"""
    g_avx_sleef(N)

Return the sum of `log(n)` for `n = 1, …, N` using an `@avx` loop that calls
`SLEEFPirates.log_fast` explicitly.
"""
function g_avx_sleef(N)
    total = 0.0
    @avx for k in 1:N
        total += SLEEFPirates.log_fast(float(k))
    end
    return total
end
"""
    g_avxt(N)

Return the sum of `log(n)` for `n = 1, …, N`, with the loop wrapped by
LoopVectorization's `@avxt` macro.
"""
function g_avxt(N)
    total = 0.0
    # NOTE(review): `@avxt` is presumably the multithreaded counterpart of `@avx`
    # (later releases: `@tturbo`) — confirm against the installed version.
    @avxt for k in 1:N
        total += log(k)
    end
    return total
end
"""
    g_avxt_sleef(N)

Return the sum of `log(n)` for `n = 1, …, N` using an `@avxt` loop that calls
`SLEEFPirates.log_fast` explicitly.
"""
function g_avxt_sleef(N)
    total = 0.0
    @avxt for k in 1:N
        total += SLEEFPirates.log_fast(float(k))
    end
    return total
end
"""
    g_stirling(N)

Approximate `log(N!)` with Stirling's series truncated after the `1/(12N)` term:
`N*log(N) - N + log(N)/2 + log(2π)/2 + 1/(12N)`.
"""
function g_stirling(N)
    logN = log(N)
    leading = N * logN - N
    halflog = logN / 2
    halflog2pi = log(2π) / 2
    correction = 1 / (12N)
    # Summed left to right, matching the order of the closed-form expression.
    return leading + halflog + halflog2pi + correction
end
# First display the value each g_* variant returns at N = 10^6 so they can be compared
# with one another and with the g_stirling value.
@show g_naive(10^6)
@show g_ccall(10^6)
@show g_sleef(10^6)
@show g_simd_sleef(10^6)
@show g_avx(10^6)
@show g_avx_sleef(10^6)
@show g_avxt(10^6)
@show g_avxt_sleef(10^6)
@show g_stirling(10^6)
# Blank line separating the values above from the timings below.
println()
# Then time every variant; each `print` emits a label (without a newline) ahead of the
# line printed by `@btime`.
print("g_naive(10^6): ")
@btime g_naive(10^6)
print("g_ccall(10^6): ")
@btime g_ccall(10^6)
print("g_sleef(10^6): ")
@btime g_sleef(10^6)
print("g_simd_sleef(10^6): ")
@btime g_simd_sleef(10^6)
print("g_avx(10^6): ")
@btime g_avx(10^6)
print("g_avx_sleef(10^6): ")
@btime g_avx_sleef(10^6)
print("g_avxt(10^6): ")
@btime g_avxt(10^6)
print("g_avxt_sleef(10^6): ")
@btime g_avxt_sleef(10^6)
print("g_stirling(10^6): ")
# The trailing `;` keeps the cell from displaying the last returned value.
@btime g_stirling(10^6);
g_naive(10 ^ 6) = 1.2815518384657895e7 g_ccall(10 ^ 6) = 1.2815518384657895e7 g_sleef(10 ^ 6) = 1.2815518384657895e7 g_simd_sleef(10 ^ 6) = 1.2815518384658162e7 g_avx(10 ^ 6) = 1.281551838465816e7 g_avx_sleef(10 ^ 6) = 1.2815518384658225e7 g_avxt(10 ^ 6) = 1.2815518384658162e7 g_avxt_sleef(10 ^ 6) = 1.2815518384658169e7 g_stirling(10 ^ 6) = 1.2815518384658167e7 g_naive(10^6): 10.776 ms (0 allocations: 0 bytes) g_ccall(10^6): 8.716 ms (0 allocations: 0 bytes) g_sleef(10^6): 4.236 ms (0 allocations: 0 bytes) g_simd_sleef(10^6): 1.539 ms (0 allocations: 0 bytes) g_avx(10^6): 2.289 ms (0 allocations: 0 bytes) g_avx_sleef(10^6): 1.859 ms (0 allocations: 0 bytes) g_avxt(10^6): 464.400 μs (24 allocations: 464 bytes) g_avxt_sleef(10^6): 346.100 μs (24 allocations: 464 bytes) g_stirling(10^6): 22.992 ns (0 allocations: 0 bytes)