Benchmark test of Monte Carlo calculation of pi (Julia v1.0.0)

gcc Allied Forces vs. Julia Imperial Military

Gen Kuroki

2017-08-20~2018-09-08

In [1]:
using Libdl
using Distributed
In [2]:
versioninfo()
Julia Version 1.0.0
Commit 5d4eaca0c9 (2018-08-08 20:58 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.0 (ORCJIT, haswell)
Environment:
  JULIA_NUM_THREADS = 4
  JULIA_PKGDIR = C:\JuliaPkg
In [3]:
# julia -p auto

addprocs(8)
nworkers()
Out[3]:
8
In [4]:
run(`gcc --version`)
gcc (Rev3, Built by MSYS2 project) 6.3.0
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

Out[4]:
Process(`gcc --version`, ProcessExited(0))
In [5]:
ENV["OMP_NUM_THREADS"]
Out[5]:
"8"
In [6]:
using BenchmarkTools
In [7]:
function findpi_julia(n)
    s = 0
    for j in 1:n
        s += ifelse(rand()^2+rand()^2  1, 1, 0)
    end
    return 4.0 * s / n
end
findpi_julia(10^5)
Out[7]:
3.14456
In [8]:
@time findpi_julia(10^8)
  0.457286 seconds (1.95 k allocations: 116.855 KiB)
Out[8]:
3.141572
In [9]:
# julia -p auto

function findpi_julia_parallel(n)
    s = @distributed (+) for j in 1:n
        ifelse(rand()^2+rand()^2  1, 1, 0)
    end
    return 4.0 * s / n
end
findpi_julia_parallel(10^5)
Out[9]:
3.13984
In [10]:
@time findpi_julia_parallel(10^8)
  0.157769 seconds (831 allocations: 76.969 KiB)
Out[10]:
3.14158908
In [11]:
C_code= raw"""
#include <time.h>
#include <stdlib.h>

double findpi(unsigned long n){
    double x,y;
    srand((unsigned)time(NULL));
    double R = (double) RAND_MAX;
    unsigned long count = 0;
    for(unsigned long j=0; j < n; j++){
        x = ((double) rand())/R;
        y = ((double) rand())/R;
        if(x*x + y*y <= 1){
            count++;
        }
    }
    return ((double) 4.0)*((double) count)/((double) n);
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 328961 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_F837.tmp.dll
Out[11]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_F837.tmp.dll'`, ProcessExited(0))
In [12]:
const findpi_gcc_rand_lib = filename
findpi_gcc_rand(n::Int64) = ccall((:findpi, findpi_gcc_rand_lib), Float64, (Int64,), n)
findpi_gcc_rand(10^5)
Out[12]:
3.14904
In [13]:
@time findpi_gcc_rand(10^8)
  3.067088 seconds (6 allocations: 192 bytes)
Out[13]:
3.1415064
In [14]:
C_code= raw"""
#include <time.h>
#include <stdlib.h>

double findpi(unsigned long n){
    unsigned long x,y;
    srand((unsigned)time(NULL));
    double RR = (unsigned long) RAND_MAX*RAND_MAX;
    unsigned long count = 0;
    for(unsigned long j=0; j < n; j++){
        x = (unsigned long) rand();
        y = (unsigned long) rand();
        if(x*x + y*y <= RR){
            count++;
        }
    }
    return ((double) 4.0)*((double) count)/((double) n);
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 328961 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_1E85.tmp.dll
Out[14]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_1E85.tmp.dll'`, ProcessExited(0))
In [15]:
const findpi_gcc_rand_int_lib = filename
findpi_gcc_rand_int(n::Int64) = ccall((:findpi, findpi_gcc_rand_int_lib), Float64, (Int64,), n)
findpi_gcc_rand_int(10^5)
Out[15]:
3.13724
In [16]:
@time findpi_gcc_rand_int(10^8)
  2.251415 seconds (6 allocations: 192 bytes)
Out[16]:
3.1416166
In [17]:
# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.sep.tgz
#
# Extract
#   mt19937ar.sep/mt19937ar.h
#   mt19937ar.sep/mt19937ar.c

C_code= raw"""
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "mt19937ar.sep/mt19937ar.h"
#include "mt19937ar.sep/mt19937ar.c"

double findpi(unsigned long n){
    srand((unsigned)time(NULL));
    init_genrand(rand());
    unsigned long count = 0;
    double x,y;
    for(unsigned long j = 0; j < n; j++){
        x = genrand_real1();
        y = genrand_real1();
        if(x*x + y*y <= 1){
            count++;
        }
    }
    return ((double)4.0)*((double)count)/((double)n);
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 330305 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_8283.tmp.dll
Out[17]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_8283.tmp.dll'`, ProcessExited(0))
In [18]:
const findpi_gcc_MT_lib = filename
findpi_gcc_MT(n::Int64) = ccall((:findpi, findpi_gcc_MT_lib), Float64, (Int64,), n)
findpi_gcc_MT(10^5)
Out[18]:
3.13688
In [19]:
@time findpi_gcc_MT(10^8)
  1.116780 seconds (6 allocations: 192 bytes)
Out[19]:
3.14138972
In [20]:
# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/mt19937-64stdint.tgz
#
# Extract
#   mt19937-64/mt64.h
#   mt19937-64/mt19937-64.c

C_code= raw"""
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define HAVE_SSE2
#include "mt19937-64/mt64.h"
#include "mt19937-64/mt19937-64.c"

double findpi(uint64_t n){
    srand((unsigned)time(NULL));
    uint64_t init[4]={(uint64_t)rand(), (uint64_t)rand(), (uint64_t)rand(), (uint64_t)rand()};
    uint64_t length = 4;
    init_by_array64(init, length);
    uint64_t count = 0;
    double x,y;
    for(uint64_t j = 0; j < n; j++){
        x = genrand64_real2();
        y = genrand64_real2();
        if(x*x + y*y <= 1){
            count++;
        }
    }
    return ((double)4.0)*((double)count)/((double)n);
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 332335 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_498.tmp.dll
Out[20]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_498.tmp.dll'`, ProcessExited(0))
In [21]:
const findpi_gcc_MT64_lib = filename
findpi_gcc_MT64(n::Int64) = ccall((:findpi, findpi_gcc_MT64_lib), Float64, (Int64,), n)
findpi_gcc_MT64(10^5)
Out[21]:
3.1388
In [22]:
@time findpi_gcc_MT64(10^8)
  1.584506 seconds (6 allocations: 192 bytes)
Out[22]:
3.14151604
In [23]:
# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz
#
# Extract
#   dSFMT-src-2.2.3/dSFMT.h
#   dSFMT-src-2.2.3/dSFMT.c

C_code= raw"""
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define HAVE_SSE2
#define DSFMT_MEXP 521
#include "dSFMT-src-2.2.3/dSFMT.h"
#include "dSFMT-src-2.2.3/dSFMT.c"

double findpi(unsigned long n){
    srand((unsigned)time(NULL));
    dsfmt_t dsfmt;
    dsfmt_init_gen_rand(&dsfmt, rand());
    unsigned long count = 0;
    double x,y;
    for(unsigned long j = 0; j < n; j++){
        x = dsfmt_genrand_close_open(&dsfmt);
        y = dsfmt_genrand_close_open(&dsfmt);
        if(x*x + y*y <= 1){
            count++;
        }
    }
    return ((double)4.0)*((double)count)/((double)n);
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -Wall -O3 -march=native -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 364419 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_9801.tmp.dll
Out[23]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_9801.tmp.dll'`, ProcessExited(0))
In [24]:
const findpi_gcc_dSFMT_lib = filename
findpi_gcc_dSFMT(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_lib), Float64, (Int64,), n)
findpi_gcc_dSFMT(10^5)
Out[24]:
3.14672
In [25]:
@time findpi_gcc_dSFMT(10^8)
  0.360705 seconds (6 allocations: 192 bytes)
Out[25]:
3.1418504
In [26]:
# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz
#
# Extract
#   dSFMT-src-2.2.3/dSFMT.h
#   dSFMT-src-2.2.3/dSFMT.c

# https://gist.github.com/Toru3/fba81b9106d8fa88e120ce85ab1d98f7
# findpi_dSMFT_openmp.c

C_code= raw"""
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <omp.h>
#define HAVE_SSE2
#define DSFMT_MEXP 521
#include "dSFMT-src-2.2.3/dSFMT.h"
#include "dSFMT-src-2.2.3/dSFMT.c"

double findpi(unsigned long n){
    srand((unsigned)time(NULL));
    unsigned long threads = omp_get_num_procs();
    n = (n+threads-1)/threads*threads;
    const unsigned long size = n/threads;

    dsfmt_t dsfmt[threads];
    for(unsigned long i=0; i<threads; i++){
        dsfmt_init_gen_rand(dsfmt+i, rand());
    }

    unsigned long count = 0;
#pragma omp parallel for reduction(+:count)
    for(unsigned long i=0; i<threads; i++){
        for(unsigned long j=0; j<size; j++){
            double x = dsfmt_genrand_close_open(dsfmt+i);
            double y = dsfmt_genrand_close_open(dsfmt+i);
            if(x*x+y*y<=1){
                count++;
            }
        }
    }
    return 4.0*count/n;
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -Wall -O3 -march=native -fopenmp -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 365820 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_AD2E.tmp.dll
Out[26]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_AD2E.tmp.dll'`, ProcessExited(0))
In [27]:
const findpi_gcc_dSFMT_OMP_lib = filename
findpi_gcc_dSFMT_OMP(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_OMP_lib), Float64, (Int64,), n)
findpi_gcc_dSFMT_OMP(10^5)
Out[27]:
3.14084
In [28]:
@time findpi_gcc_dSFMT_OMP(10^8)
  0.144327 seconds (6 allocations: 192 bytes)
Out[28]:
3.14162164
In [29]:
# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz
#
# Extract
#   dSFMT-src-2.2.3/dSFMT.h
#   dSFMT-src-2.2.3/dSFMT.c

# https://gist.github.com/Toru3/fba81b9106d8fa88e120ce85ab1d98f7
# findpi_dSMFT_openmp_sse.c

C_code= raw"""
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <omp.h>
#include <emmintrin.h>
#define HAVE_SSE2
#define DSFMT_MEXP 521
#include "dSFMT-src-2.2.3/dSFMT.h"
#include "dSFMT-src-2.2.3/dSFMT.c"

double findpi(unsigned long n){
    srand((unsigned)time(NULL));
    unsigned long threads = omp_get_num_procs();
    const unsigned long N = 8;
    unsigned long width = threads * N;
    n = (n+width-1)/width*width;
    const unsigned long size = n/threads;

    dsfmt_t dsfmt[threads];
    for(unsigned long i=0; i<threads; i++){
        dsfmt_init_gen_rand(dsfmt+i, rand());
    }

    unsigned long count = 0;
#pragma omp parallel for reduction(+:count)
    for(unsigned long i=0; i<threads; i++){
        __m128d one;
        {
            double one_d = 1;
            one = _mm_loadl_pd(one, &one_d);
            one = _mm_loadh_pd(one, &one_d);
        }
        double s[2*N];
        double t[N/2][2];
        __m128d x0, x1, x2, x3;
        __m128d y0, y1, y2, y3;
        __m128d c0, c1, c2, c3;
        for(unsigned long j=0; j<size; j+=N){
            dsfmt_fill_array_close1_open2(dsfmt+i, s, 2*N);
            x0 = _mm_loadu_pd(s+0);
            y0 = _mm_loadu_pd(s+2);
            x1 = _mm_loadu_pd(s+4);
            y1 = _mm_loadu_pd(s+6);
            x2 = _mm_loadu_pd(s+8);
            y2 = _mm_loadu_pd(s+10);
            x3 = _mm_loadu_pd(s+12);
            y3 = _mm_loadu_pd(s+14);

            x0 = _mm_sub_pd(x0, one);
            y0 = _mm_sub_pd(y0, one);
            x1 = _mm_sub_pd(x1, one);
            y1 = _mm_sub_pd(y1, one);
            x2 = _mm_sub_pd(x2, one);
            y2 = _mm_sub_pd(y2, one);
            x3 = _mm_sub_pd(x3, one);
            y3 = _mm_sub_pd(y3, one);

            x0 = _mm_mul_pd(x0, x0);
            y0 = _mm_mul_pd(y0, y0);
            x1 = _mm_mul_pd(x1, x1);
            y1 = _mm_mul_pd(y1, y1);
            x2 = _mm_mul_pd(x2, x2);
            y2 = _mm_mul_pd(y2, y2);
            x3 = _mm_mul_pd(x3, x3);
            y3 = _mm_mul_pd(y3, y3);

            x0 = _mm_add_pd(x0, y0);
            x1 = _mm_add_pd(x1, y1);
            x2 = _mm_add_pd(x2, y2);
            x3 = _mm_add_pd(x3, y3);
            c0 = _mm_cmple_pd(x0, one);
            c1 = _mm_cmple_pd(x1, one);
            c2 = _mm_cmple_pd(x2, one);
            c3 = _mm_cmple_pd(x3, one);

            _mm_storeu_pd(t[0], c0);
            _mm_storeu_pd(t[1], c1);
            _mm_storeu_pd(t[2], c2);
            _mm_storeu_pd(t[3], c3);

            count +=
                 (t[0][0]!=0)+(t[0][1]!=0)
                +(t[1][0]!=0)+(t[1][1]!=0)
                +(t[2][0]!=0)+(t[2][1]!=0)
                +(t[3][0]!=0)+(t[3][1]!=0);
        }
    }
    return 4.0*count/n;
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -O3 -march=native -fopenmp -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 366332 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_1E7E.tmp.dll
Out[29]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_1E7E.tmp.dll'`, ProcessExited(0))
In [30]:
const findpi_gcc_dSFMT_OMP_SSE_lib = filename
findpi_gcc_dSFMT_OMP_SSE(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_OMP_SSE_lib), Float64, (Int64,), n)
findpi_gcc_dSFMT_OMP_SSE(10^5)
Out[30]:
3.151191618682022
In [31]:
@time findpi_gcc_dSFMT_OMP_SSE(10^8)
  0.122259 seconds (6 allocations: 192 bytes)
Out[31]:
3.14140916
In [32]:
# Download http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/SFMT/dSFMT-src-2.2.3.tar.gz
#
# Extract
#   dSFMT-src-2.2.3/dSFMT.h
#   dSFMT-src-2.2.3/dSFMT.c

# https://gist.github.com/Toru3/fba81b9106d8fa88e120ce85ab1d98f7
# findpi_dSMFT_openmp_fma.c

C_code= raw"""
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <omp.h>
#include <immintrin.h>
#define HAVE_SSE2
#define DSFMT_MEXP 521
#include "dSFMT-src-2.2.3/dSFMT.h"
#include "dSFMT-src-2.2.3/dSFMT.c"

double findpi(unsigned long n){
    srand((unsigned)time(NULL));
    unsigned long threads = omp_get_num_procs();
    const unsigned long N = 32;
    unsigned long width = threads * N;
    n = (n+width-1)/width*width;
    const unsigned long size = n/threads;

    unsigned long seed[threads];
    for(unsigned long i=0; i<threads; i++) seed[i] = rand();

    unsigned long count = 0;
#pragma omp parallel for reduction(+:count)
    for(unsigned long i=0; i<threads; i++){
        dsfmt_t dsfmt;
        dsfmt_init_gen_rand(&dsfmt, seed[i]);
        __m256d one;
        {
            double one_d = 1;
            one = _mm256_broadcast_sd(&one_d);
        }
        double s[2*N];
        long long t[N/4][4];
        __m256d x0, x1, x2, x3, x4, x5, x6, x7;
        __m256d y0, y1, y2, y3, y4, y5, y6, y7;
        __m256d c0, c1, c2, c3, c4, c5, c6, c7;
        for(unsigned long j=0; j<size; j+=N){
            dsfmt_fill_array_close1_open2(&dsfmt, s, 2*N);

            x0 = _mm256_loadu_pd(s+ 0);
            x1 = _mm256_loadu_pd(s+ 4);
            x2 = _mm256_loadu_pd(s+ 8);
            x3 = _mm256_loadu_pd(s+12);
            x4 = _mm256_loadu_pd(s+16);
            x5 = _mm256_loadu_pd(s+20);
            x6 = _mm256_loadu_pd(s+24);
            x7 = _mm256_loadu_pd(s+28);

            y0 = _mm256_loadu_pd(s+32);
            y1 = _mm256_loadu_pd(s+36);
            y2 = _mm256_loadu_pd(s+40);
            y3 = _mm256_loadu_pd(s+44);
            y4 = _mm256_loadu_pd(s+48);
            y5 = _mm256_loadu_pd(s+52);
            y6 = _mm256_loadu_pd(s+56);
            y7 = _mm256_loadu_pd(s+60);

            x0 = _mm256_sub_pd(x0, one);
            x1 = _mm256_sub_pd(x1, one);
            x2 = _mm256_sub_pd(x2, one);
            x3 = _mm256_sub_pd(x3, one);
            x4 = _mm256_sub_pd(x4, one);
            x5 = _mm256_sub_pd(x5, one);
            x6 = _mm256_sub_pd(x6, one);
            x7 = _mm256_sub_pd(x7, one);

            y0 = _mm256_sub_pd(y0, one);
            y1 = _mm256_sub_pd(y1, one);
            y2 = _mm256_sub_pd(y2, one);
            y3 = _mm256_sub_pd(y3, one);
            y4 = _mm256_sub_pd(y4, one);
            y5 = _mm256_sub_pd(y5, one);
            y6 = _mm256_sub_pd(y6, one);
            y7 = _mm256_sub_pd(y7, one);

            x0 = _mm256_mul_pd(x0, x0);
            x1 = _mm256_mul_pd(x1, x1);
            x2 = _mm256_mul_pd(x2, x2);
            x3 = _mm256_mul_pd(x3, x3);
            x4 = _mm256_mul_pd(x4, x4);
            x5 = _mm256_mul_pd(x5, x5);
            x6 = _mm256_mul_pd(x6, x6);
            x7 = _mm256_mul_pd(x7, x7);

            x0 = _mm256_fmadd_pd(y0, y0, x0);
            x1 = _mm256_fmadd_pd(y1, y1, x1);
            x2 = _mm256_fmadd_pd(y2, y2, x2);
            x3 = _mm256_fmadd_pd(y3, y3, x3);
            x4 = _mm256_fmadd_pd(y4, y4, x4);
            x5 = _mm256_fmadd_pd(y5, y5, x5);
            x6 = _mm256_fmadd_pd(y6, y6, x6);
            x7 = _mm256_fmadd_pd(y7, y7, x7);

            c0 = _mm256_cmp_pd(x0, one, _CMP_LE_OQ);
            c1 = _mm256_cmp_pd(x1, one, _CMP_LE_OQ);
            c2 = _mm256_cmp_pd(x2, one, _CMP_LE_OQ);
            c3 = _mm256_cmp_pd(x3, one, _CMP_LE_OQ);
            c4 = _mm256_cmp_pd(x4, one, _CMP_LE_OQ);
            c5 = _mm256_cmp_pd(x5, one, _CMP_LE_OQ);
            c6 = _mm256_cmp_pd(x6, one, _CMP_LE_OQ);
            c7 = _mm256_cmp_pd(x7, one, _CMP_LE_OQ);

            _mm256_storeu_pd((double*)t[0], c0);
            _mm256_storeu_pd((double*)t[1], c1);
            _mm256_storeu_pd((double*)t[2], c2);
            _mm256_storeu_pd((double*)t[3], c3);
            _mm256_storeu_pd((double*)t[4], c4);
            _mm256_storeu_pd((double*)t[5], c5);
            _mm256_storeu_pd((double*)t[6], c6);
            _mm256_storeu_pd((double*)t[7], c7);

            unsigned int f =
                ((t[0][0]&1 | t[0][1]&2 | t[0][2]&4 | t[0][3]&8) <<  0) |
                ((t[1][0]&1 | t[1][1]&2 | t[1][2]&4 | t[1][3]&8) <<  4) |
                ((t[2][0]&1 | t[2][1]&2 | t[2][2]&4 | t[2][3]&8) <<  8) |
                ((t[3][0]&1 | t[3][1]&2 | t[3][2]&4 | t[3][3]&8) << 12) |
                ((t[4][0]&1 | t[4][1]&2 | t[4][2]&4 | t[4][3]&8) << 16) |
                ((t[5][0]&1 | t[5][1]&2 | t[5][2]&4 | t[5][3]&8) << 20) |
                ((t[6][0]&1 | t[6][1]&2 | t[6][2]&4 | t[6][3]&8) << 24) |
                ((t[7][0]&1 | t[7][1]&2 | t[7][2]&4 | t[7][3]&8) << 28) ;

            f = (f&0x55555555) + ((f>> 1)&0x55555555);
            f = (f&0x33333333) + ((f>> 2)&0x33333333);
            f = (f&0x0f0f0f0f) + ((f>> 4)&0x0f0f0f0f);
            f = (f&0x00ff00ff) + ((f>> 8)&0x00ff00ff);
            f = (f&0x0000ffff) + ((f>>16)&0x0000ffff);
            count += f;
        }
    }
    return 4.0*count/n;
}
"""

filename = tempname()
filenamedl = filename * "." * Libdl.dlext

open(`gcc -O3 -march=native -fopenmp -xc -shared -o $filenamedl -`, "w") do f
     print(f, C_code)
end

run(`ls -l $filenamedl`)
-rwxr-xr-x 1 genkuroki genkuroki 366844 Sep  8 12:45 C:\Users\GENKUR~1\AppData\Local\Temp\jl_DBC8.tmp.dll
Out[32]:
Process(`ls -l 'C:\Users\GENKUR~1\AppData\Local\Temp\jl_DBC8.tmp.dll'`, ProcessExited(0))
In [33]:
const findpi_gcc_dSFMT_OMP_FMA_lib = filename
findpi_gcc_dSFMT_OMP_FMA(n::Int64) = ccall((:findpi, findpi_gcc_dSFMT_OMP_FMA_lib), Float64, (Int64,), n)
findpi_gcc_dSFMT_OMP_FMA(10^5)
Out[33]:
3.147218670076726
In [34]:
@time findpi_gcc_dSFMT_OMP_FMA(10^8)
  0.088022 seconds (6 allocations: 192 bytes)
Out[34]:
3.14142828
In [35]:
@time findpi_julia(10^9)
@time findpi_gcc_rand(10^9)
@time findpi_gcc_rand_int(10^9)
@time findpi_gcc_MT(10^9)
@time findpi_gcc_dSFMT(10^9)
  4.025338 seconds (6 allocations: 192 bytes)
 28.596264 seconds (6 allocations: 192 bytes)
 20.276883 seconds (6 allocations: 192 bytes)
 10.277577 seconds (6 allocations: 192 bytes)
  3.236910 seconds (6 allocations: 192 bytes)
Out[35]:
3.141573236
In [36]:
@btime findpi_julia(10^9)
@btime findpi_gcc_MT64(10^9)
@btime findpi_gcc_MT(10^9)
@btime findpi_gcc_dSFMT(10^9)
  3.889 s (0 allocations: 0 bytes)
  12.614 s (0 allocations: 0 bytes)
  10.343 s (0 allocations: 0 bytes)
  3.115 s (0 allocations: 0 bytes)
Out[36]:
3.14157694
In [37]:
@btime findpi_julia_parallel(10^9)
@btime findpi_gcc_dSFMT_OMP(10^9)
@btime findpi_gcc_dSFMT_OMP_SSE(10^9)
@btime findpi_gcc_dSFMT_OMP_FMA(10^9)
  1.161 s (842 allocations: 75.89 KiB)
  1.275 s (0 allocations: 0 bytes)
  1.151 s (0 allocations: 0 bytes)
  644.210 ms (0 allocations: 0 bytes)
Out[37]:
3.141652768
In [ ]: