黒木玄
2018-01-28
2つの整数の組 $(a,b)$ が互いに素になる「確率」(最大公約数が $1$ になる「確率」)は $\zeta(2)=\pi^2/6$ の逆数になることがよく知られている. このノートブックではJulia言語の gcd
函数のテストをしながら, そのよく知られている事実を数値的に確認してみよう.
注意: このノートブックのコードは極めて効率が悪い. gcd
函数のテストをせずに効率のよい計算法を採用すればものすごく速く計算できる. その方法については次のリンク先を参照して欲しい.
versioninfo()
Julia Version 0.6.2 Commit d386e40c17* (2017-12-13 18:08 UTC) Platform Info: OS: Windows (x86_64-w64-mingw32) CPU: Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz WORD_SIZE: 64 BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell) LAPACK: libopenblas64_ LIBM: libopenlibm LLVM: libLLVM-3.9.1 (ORCJIT, haswell)
using BenchmarkTools
N = 2^12
4096
function calc_pi_by_gcd(N)
s = 0
for a in 1:N
for b in 1:N
s += ifelse(gcd(a,b)==1, 1, 0)
end
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd(N)
@benchmark calc_pi_by_gcd(N)
calc_pi_by_gcd(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 601.057 ms (0.00% GC) median time: 623.556 ms (0.00% GC) mean time: 628.945 ms (0.00% GC) maximum time: 677.749 ms (0.00% GC) -------------- samples: 8 evals/sample: 1
ほんの少しだけ速くなる.
# https://github.com/JuliaLang/julia/blob/d386e40c17d43b79fc89d3e579fc04547241787c/base/intfuncs.jl#L28-L49
#
# binary GCD (aka Stein's) algorithm
# about 1.7x (2.1x) faster for random Int64s (Int128s)
@inline function gcd_inline(a::T, b::T) where T<:Union{Int64,UInt64,Int128,UInt128}
a == 0 && return abs(b)
b == 0 && return abs(a)
za = trailing_zeros(a)
zb = trailing_zeros(b)
k = min(za, zb)
u = unsigned(abs(a >> za))
v = unsigned(abs(b >> zb))
while u != v
if u > v
u, v = v, u
end
v -= u
v >>= trailing_zeros(v)
end
r = u << k
# T(r) would throw InexactError; we want OverflowError instead
r > typemax(T) && throw(OverflowError())
r % T
end
function calc_pi_by_gcd_inline(N)
s = 0
for a in 1:N
for b in 1:N
s += ifelse(gcd_inline(a,b)==1, 1, 0)
end
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd_inline(N)
@benchmark calc_pi_by_gcd_inline(N)
calc_pi_by_gcd_inline(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 564.928 ms (0.00% GC) median time: 567.645 ms (0.00% GC) mean time: 571.927 ms (0.00% GC) maximum time: 593.906 ms (0.00% GC) -------------- samples: 9 evals/sample: 1
非常に遅い.
# Euclidean algorithm
@inline function gcd_euclid(a::T, b::T) where T<:Integer
while b != 0
t = b
b = rem(a, b)
a = t
end
abs(a)
end
function calc_pi_by_gcd_euclid(N)
s = 0
for a in 1:N
for b in 1:N
s += ifelse(gcd_euclid(a,b)==1, 1, 0)
end
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd_euclid(N)
@benchmark calc_pi_by_gcd_euclid(N)
calc_pi_by_gcd_euclid(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 1.528 s (0.00% GC) median time: 1.531 s (0.00% GC) mean time: 1.539 s (0.00% GC) maximum time: 1.564 s (0.00% GC) -------------- samples: 4 evals/sample: 1
@show workers();
workers() = [2, 3, 4, 5, 6, 7, 8, 9]
function calc_pi_by_gcd_parallel(N)
s = @parallel (+) for a in 1:N
s = 0
for b in 1:N
s += ifelse(gcd(a,b)==1, 1, 0)
end
s
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd_parallel(N)
@benchmark calc_pi_by_gcd_parallel(N)
calc_pi_by_gcd_parallel(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 89.94 KiB allocs estimate: 995 -------------- minimum time: 261.367 ms (0.00% GC) median time: 263.501 ms (0.00% GC) mean time: 264.096 ms (0.00% GC) maximum time: 268.579 ms (0.00% GC) -------------- samples: 19 evals/sample: 1
以下の C_code は実質的に
https://gist.github.com/Toru3/7c2fbb64b5a46d5e87273696c5d2e594
からのコピー&ペーストです。Toru3さんにはいつもお世話になっています。
C_code = raw"""
//gcc pi_gcd.c -O3 -march=native -lm -fopenmp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
inline int ctz(long x){
// GCCの組み込み関数
return __builtin_ctzl(x);
}
inline int min(int a, int b){
return a<b ? a : b;
}
long gcd(long a, long b){
if(a==0) return b;
if(b==0) return a;
a = labs(a);
b = labs(b);
int ta = ctz(a);
int tb = ctz(b);
int shift = min(ta,tb);
a >>= ta;
b >>= tb;
if(a<b){
long t=a;
a=b;
b=t;
}
do{
a-=b;
a >>= ctz(a);
if(a<b){
long t=a;
a=b;
b=t;
}
}while(b!=0);
return a << shift;
}
double pi_gcd_omp(long n){
long count=0;
#pragma omp parallel for reduction(+:count) collapse(2)
for(long i=1; i<=n; i++){
for(long j=1; j<=n; j++){
if(gcd(i,j)==1){
count++;
}
}
}
return sqrt(6.0*n*n/count);
}
double pi_gcd(long n){
long count=0;
for(long i=1; i<=n; i++){
for(long j=1; j<=n; j++){
if(gcd(i,j)==1){
count++;
}
}
}
return sqrt(6.0*n*n/count);
}
int main(int argc, const char* argv[]){
long n=20000;
if(argc>=2){
n=atol(argv[1]);
}
printf("%.15f\n", pi_gcd_omp(n));
}
"""
@everywhere file = "calc_pi_by_gcd_gcc_gcc"
filedl = file * "." * Libdl.dlext
open(`gcc -Wall -O3 -march=native -lm -fopenmp -xc -shared -o $filedl -`, "w") do f
print(f, C_code)
end
run(`ls -l $filedl`)
@everywhere const calc_pi_by_gcd_gcc_lib = file
@everywhere calc_pi_by_gcd_gcc_gcc(n::Int64) = ccall((:pi_gcd, calc_pi_by_gcd_gcc_lib), Float64, (Int64,), n)
@everywhere calc_pi_by_gcd_gcc_gcc_omp(n::Int64) = ccall((:pi_gcd_omp, calc_pi_by_gcd_gcc_lib), Float64, (Int64,), n)
@everywhere gcd_gcc(m::Int64, n::Int64) = ccall((:gcd, calc_pi_by_gcd_gcc_lib), Int64, (Int64, Int64,), m, n)
@show calc_pi_by_gcd_gcc_gcc(N)
@show calc_pi_by_gcd_gcc_gcc_omp(N);
-rwxr-xr-x 1 genkuroki genkuroki 120962 Jan 30 15:19 calc_pi_by_gcd_gcc_gcc.dll calc_pi_by_gcd_gcc_gcc(N) = 3.1414825885490334 calc_pi_by_gcd_gcc_gcc_omp(N) = 3.1414825885490334
@benchmark calc_pi_by_gcd_gcc_gcc(N)
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 451.784 ms (0.00% GC) median time: 481.064 ms (0.00% GC) mean time: 480.601 ms (0.00% GC) maximum time: 512.907 ms (0.00% GC) -------------- samples: 11 evals/sample: 1
function calc_pi_by_gcd_gcc_julia(N)
s = 0
for a in 1:N
for b in 1:N
s += ifelse(gcd_gcc(a,b)==1, 1, 0)
end
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd_gcc_julia(N)
@benchmark calc_pi_by_gcd_gcc_julia(N)
calc_pi_by_gcd_gcc_julia(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 456.009 ms (0.00% GC) median time: 472.454 ms (0.00% GC) mean time: 473.144 ms (0.00% GC) maximum time: 496.123 ms (0.00% GC) -------------- samples: 11 evals/sample: 1
このようにJulia版で gcd を gcc 版の gcd で置き換えると gcc 版とほぼ同じ速さになる.
Julia版が遅くなっているのは gcd が遅いことが原因である.
@benchmark calc_pi_by_gcd_gcc_gcc_omp(N)
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 77.485 ms (0.00% GC) median time: 86.072 ms (0.00% GC) mean time: 92.076 ms (0.00% GC) maximum time: 125.016 ms (0.00% GC) -------------- samples: 55 evals/sample: 1
function calc_pi_by_gcd_gcc_julia_parallel(N)
s = @parallel (+) for a in 1:N
s = 0
for b in 1:N
s += ifelse(gcd_gcc(a,b)==1, 1, 0)
end
s
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd_gcc_julia_parallel(N)
@benchmark calc_pi_by_gcd_gcc_julia_parallel(N)
calc_pi_by_gcd_gcc_julia_parallel(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 91.39 KiB allocs estimate: 1030 -------------- minimum time: 236.239 ms (0.00% GC) median time: 238.812 ms (0.00% GC) mean time: 239.103 ms (0.00% GC) maximum time: 244.510 ms (0.00% GC) -------------- samples: 21 evals/sample: 1
gcc omp 版が非常に速いことがわかる!
# gcc版 (omp parallel を使用**しない**場合)
@btime calc_pi_by_gcd_gcc_gcc(20000)
12.605 s (0 allocations: 0 bytes)
3.1415283804654663
# julia版
@btime calc_pi_by_gcd(20000)
16.953 s (0 allocations: 0 bytes)
3.1415283804654663
# Julia版 (gcd を gcc 版の gcd に置き換えた場合)
@btime calc_pi_by_gcd_gcc_julia(20000)
13.052 s (0 allocations: 0 bytes)
3.1415283804654663
gcd を gcc 版 gcd_gcc に置き換えたら、純粋 gcc 版とほぼ同じ速さになった.
Julia版が遅くなっているのは gcd が遅いことが原因である.
Juliaの単純forループが遅いというようなことにはなっていない.
# gcc版 (omp parallel を使用**した**場合)
@btime calc_pi_by_gcd_gcc_gcc_omp(20000)
2.141 s (0 allocations: 0 bytes)
3.1415283804654663
# julia版 (@parallel を使った場合)
@btime calc_pi_by_gcd_parallel(20000)
6.835 s (1118 allocations: 94.50 KiB)
3.1415283804654663
# julia版 (gcd を gcc 版に置き換えて、@parallel を使った場合)
@btime calc_pi_by_gcd_gcc_julia_parallel(20000)
6.109 s (1149 allocations: 96.28 KiB)
3.1415283804654663
# https://github.com/JuliaLang/julia/blob/d386e40c17d43b79fc89d3e579fc04547241787c/base/intfuncs.jl#L28-L49
#
# binary GCD (aka Stein's) algorithm
# about 1.7x (2.1x) faster for random Int64s (Int128s)
@inline function gcd_inline_nounsigned(a::T, b::T) where T<:Union{Int64,UInt64,Int128,UInt128}
a == 0 && return abs(b)
b == 0 && return abs(a)
za = trailing_zeros(a)
zb = trailing_zeros(b)
k = min(za, zb)
# u = unsigned(abs(a >> za))
# v = unsigned(abs(b >> zb))
u = abs(a >> za)
v = abs(b >> zb)
while u != v
if u > v
u, v = v, u
end
v -= u
v >>= trailing_zeros(v)
end
r = u << k
# T(r) would throw InexactError; we want OverflowError instead
r > typemax(T) && throw(OverflowError())
r % T
end
function calc_pi_by_gcd_inline_nounsigned(N)
s = 0
for a in 1:N
for b in 1:N
s += ifelse(gcd_inline_nounsigned(a,b)==1, 1, 0)
end
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd_inline_nounsigned(N)
@benchmark calc_pi_by_gcd_inline_nounsigned(N)
calc_pi_by_gcd_inline_nounsigned(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 453.976 ms (0.00% GC) median time: 462.135 ms (0.00% GC) mean time: 469.617 ms (0.00% GC) maximum time: 502.549 ms (0.00% GC) -------------- samples: 11 evals/sample: 1
function calc_pi_by_gcd_inline_nounsigned_UInt64(N)
s = 0
for a in UnitRange{UInt64}(1:N)
for b in UnitRange{UInt64}(1:N)
s += ifelse(gcd_inline_nounsigned(a,b)==1, 1, 0)
end
end
sqrt(6N^2/s)
end
@show calc_pi_by_gcd_inline_nounsigned_UInt64(N)
@benchmark calc_pi_by_gcd_inline_nounsigned_UInt64(N)
calc_pi_by_gcd_inline_nounsigned_UInt64(N) = 3.1414825885490334
BenchmarkTools.Trial: memory estimate: 16 bytes allocs estimate: 1 -------------- minimum time: 559.848 ms (0.00% GC) median time: 562.741 ms (0.00% GC) mean time: 567.686 ms (0.00% GC) maximum time: 587.242 ms (0.00% GC) -------------- samples: 9 evals/sample: 1
# 純粋 Julia 版 (gcd を Int64 で計算する)
@btime calc_pi_by_gcd_inline_nounsigned(20000)
12.773 s (0 allocations: 0 bytes)
3.1415283804654663
結論: Julia言語のgcd函数をInt64で計算するように変更すると gcc や Rust と同等の速さになる.
# オーバーフローエラー (これは意図通りの挙動)
gcd(2^63,2^63)
OverflowError() Stacktrace: [1] gcd(::Int64, ::Int64) at .\intfuncs.jl:47 [2] include_string(::String, ::String) at .\loading.jl:522
# これはエラーにならず、負の Int64 の値を返す!
@show gcd(2^63,0);
gcd(2 ^ 63, 0) = -9223372036854775808
# すべて Int64 で計算するとこうなる.
@show gcd_inline_nounsigned(2^63, 2^63)
@show gcd_inline_nounsigned(2^63, 0);
gcd_inline_nounsigned(2 ^ 63, 2 ^ 63) = -9223372036854775808 gcd_inline_nounsigned(2 ^ 63, 0) = -9223372036854775808
# Uint64(2^63) は InexactError() になる.
UInt64(2^63)
InexactError() Stacktrace: [1] UInt64(::Int64) at .\sysimg.jl:77 [2] include_string(::String, ::String) at .\loading.jl:522
# 2^63 % UInt64 ならばエラーにならない.
@show a = 2^63 % UInt64
@show gcd(a, a)
@show gcd(a, 0);
a = 2 ^ 63 % UInt64 = 0x8000000000000000 gcd(a, a) = 0x8000000000000000 gcd(a, 0) = 0x8000000000000000
# Int128 の場合
@show b = Int128(2)^127
b = Int128(2) ^ 127 = -170141183460469231731687303715884105728
-170141183460469231731687303715884105728
gcd(b, b)
OverflowError() Stacktrace: [1] gcd(::Int128, ::Int128) at .\intfuncs.jl:47 [2] include_string(::String, ::String) at .\loading.jl:522
gcd(b, zero(b))
-170141183460469231731687303715884105728
@show c = b % UInt128
@show gcd(c, c)
@show gcd(c, zero(c));
c = b % UInt128 = 0x80000000000000000000000000000000 gcd(c, c) = 0x80000000000000000000000000000000 gcd(c, zero(c)) = 0x80000000000000000000000000000000
@inline function mygcd(a::T, b::T) where T<:Union{Int64,UInt64,Int128,UInt128}
# When T is Int64 or Int128, then abs(typemin(T)) throws InexactError().
# We want OverflowError instead.
a == typemin(T) < 0 && throw(OverflowError())
b == typemin(T) < 0 && throw(OverflowError())
a, b = abs(a), abs(b)
a == 0 && return b
b == 0 && return a
za = trailing_zeros(a)
zb = trailing_zeros(b)
k = min(za, zb)
u = a >> za
v = b >> zb
while u != v
if u > v
u, v = v, u
end
v -= u
v >>= trailing_zeros(v)
end
u << k
end
mygcd (generic function with 1 method)
mygcd(0,2^63)
OverflowError() Stacktrace: [1] mygcd(::Int64, ::Int64) at .\In[31]:5 [2] include_string(::String, ::String) at .\loading.jl:522
mygcd(2^63, 0)
OverflowError() Stacktrace: [1] mygcd(::Int64, ::Int64) at .\In[31]:4 [2] include_string(::String, ::String) at .\loading.jl:522
mygcd(2^63+1, 2^63+1)
9223372036854775807
a = 2^63 % UInt64
0x8000000000000000
mygcd(a, zero(a))
0x8000000000000000
mygcd(zero(a), a)
0x8000000000000000
b = Int128(2)^127
-170141183460469231731687303715884105728
mygcd(b, b)
OverflowError() Stacktrace: [1] mygcd(::Int128, ::Int128) at .\In[31]:4 [2] include_string(::String, ::String) at .\loading.jl:522
mygcd(b, zero(0))
MethodError: no method matching mygcd(::Array{Int64,1}, ::Int64) Closest candidates are: mygcd(::T<:Union{Int128, Int64, UInt128, UInt64}, ::T<:Union{Int128, Int64, UInt128, UInt64}) where T<:Union{Int128, Int64, UInt128, UInt64} at In[31]:4 Stacktrace: [1] include_string(::String, ::String) at .\loading.jl:522
c = b % UInt128
0x80000000000000000000000000000000
mygcd(c, c)
0x80000000000000000000000000000000
mygcd(c, zero(c))
0x80000000000000000000000000000000
srand(2018)
a = rand(Int64, 10^6)
b = rand(Int64, 10^6);
bm_gcd = @benchmark gcd.(a,b)
BenchmarkTools.Trial: memory estimate: 7.63 MiB allocs estimate: 28 -------------- minimum time: 188.958 ms (0.00% GC) median time: 191.370 ms (0.00% GC) mean time: 196.693 ms (1.86% GC) maximum time: 271.267 ms (26.56% GC) -------------- samples: 26 evals/sample: 1
bm_mygcd = @benchmark mygcd.(a,b)
BenchmarkTools.Trial: memory estimate: 7.63 MiB allocs estimate: 28 -------------- minimum time: 134.023 ms (0.00% GC) median time: 139.579 ms (0.00% GC) mean time: 141.028 ms (1.84% GC) maximum time: 196.875 ms (31.47% GC) -------------- samples: 36 evals/sample: 1
bm_gcd.times[1]/bm_mygcd.times[1]
1.4098880519825778
srand(2018)
c = rand(Int128, 10^6)
d = rand(Int128, 10^6);
bm_gcd128 = @benchmark gcd.(c,d)
BenchmarkTools.Trial: memory estimate: 15.26 MiB allocs estimate: 28 -------------- minimum time: 828.751 ms (0.00% GC) median time: 835.910 ms (0.03% GC) mean time: 844.289 ms (1.39% GC) maximum time: 890.303 ms (6.87% GC) -------------- samples: 6 evals/sample: 1
bm_mygcd128 = @benchmark mygcd.(c,d)
BenchmarkTools.Trial: memory estimate: 15.26 MiB allocs estimate: 28 -------------- minimum time: 792.539 ms (0.00% GC) median time: 797.836 ms (0.00% GC) mean time: 805.527 ms (1.28% GC) maximum time: 856.856 ms (7.33% GC) -------------- samples: 7 evals/sample: 1
bm_gcd128.times[1]/bm_mygcd128.times[1]
1.04569075745307