# C source for a plain summation loop: c_sum adds the n doubles stored at X,
# in index order, and returns the total.
C_code = """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
"""
# Compile C_code to a shared library by piping it to gcc's standard input
# (only works if you have gcc installed).
# Clib is a fresh temporary path WITHOUT a file extension; the compiled library
# is written to that path plus "." and the platform extension Libdl.dlext.
const Clib = tempname()
using Libdl
# -xc declares the input language as C and the trailing "-" makes gcc read the
# source from stdin, which is the pipe written to in the do-block below.
open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code)
end
"""
    c_sum(X::Array{Float64})

Sum the elements of `X` by calling the C function `c_sum` from the shared
library named by `Clib`.

The element count `length(X)` is passed as a `Csize_t` and `X` itself as a
`Ptr{Float64}`; the C return value comes back as a `Float64`.
"""
function c_sum(X::Array{Float64})
    n = length(X)
    return ccall(("c_sum", Clib), Float64, (Csize_t, Ptr{Float64}), n, X)
end

"""
    relerr(x, y)

Return the relative (fractional) error between `x` and `y`, defined as
`|x - y| / mean(|x|, |y|)`, i.e. `2|x - y| / (|x| + |y|)`.

When `x` and `y` are both zero the denominator vanishes. The two values agree
exactly in that case, so a zero of the result type is returned instead of the
`NaN` that `0/0` would produce.
"""
function relerr(x, y)
    scale = abs(x) + abs(y)
    err = abs(x - y) * 2 / scale
    # `err` is still computed when `scale` is zero so that `zero(err)` has the
    # same type as the ordinary result, keeping the function type-stable.
    return iszero(scale) ? zero(err) : err
end

# Test data shared by every benchmark in this script.
a = rand(10^7) # array of random numbers in [0,1)
# Sanity check: relative difference between the C result and Julia's built-in sum.
relerr(c_sum(a), sum(a))

using BenchmarkTools

# `$a` interpolates the global array into the benchmark expression, so the
# timing is not affected by accessing an untyped global variable.
# NOTE(review): @btime prints the timing and returns the value of the
# expression, so c_bench holds the sum rather than timing data; confirm that
# this is what later code expects.
c_bench = @btime c_sum($a)

using PyCall
# Show which Python version PyCall is linked against.
PyCall.pyversion

# call a low-level PyCall function to get a Python list, because
# by default PyCall will convert to a NumPy array instead (we benchmark NumPy below):
# NOTE(review): array2py is not exported by PyCall, so its signature may differ
# between PyCall releases; verify this call against the installed version.
apy_list = PyCall.array2py(a, 1, 1)
# get the Python built-in "sum" function:
pysum = pybuiltin("sum")

# Sanity check: Python's built-in sum on the list versus Julia's sum.
relerr(pysum(apy_list), sum(a))

# Benchmark Python's built-in sum on a Python list.
# NOTE(review): @btime returns the value of the expression, not timing data.
py_list_bench = @btime $pysum($apy_list)

# Benchmark numpy.sum on a NumPy array built from the same data.
# NOTE(review): newer PyCall releases deprecate the o["name"] attribute lookup
# in favor of o."name" / o.name; confirm against the installed PyCall version.
numpy_sum = pyimport("numpy")["sum"]
apy_numpy = PyObject(a) # converts to a numpy array by default
py_numpy_bench = @btime $numpy_sum($apy_numpy)

# Define a hand-written summation loop in Python (via PyCall's py"..." string
# macro) so that an explicit Python loop can be benchmarked as well.
py"""
def mysum(a):
    s = 0.0
    for x in a:
        s = s + x
    return s
"""
# Fetch the Python function object defined above so it can be called from Julia.
mysum_py = py"mysum"

# Sanity check: the Python loop versus Julia's sum.
relerr(mysum_py(apy_list), sum(a))

# Benchmark the Python loop on a Python list ...
@btime $mysum_py($apy_list)

# ... and on the NumPy array holding the same data.
@btime $mysum_py($apy_numpy)

# Benchmark Julia's built-in sum on the Float64 array.
# NOTE(review): @btime returns the value of the expression, not timing data.
j_bench = @btime sum($a)

# Display the concrete array type of the test data.
typeof(a)

# Same values stored in a Vector{Any}, i.e. a container whose element type is
# not concrete, to benchmark the built-in sum on untyped storage.
a_any = Vector{Any}(a)
j_bench_any = @btime sum($a_any)

"""
    mysum1(A)

Add up the elements of `A` one at a time with a plain `for` loop, in iteration
order.

The running total starts at `zero(eltype(A))`, i.e. a zero of `A`'s element
type; that zero is what is returned when `A` is empty.
"""
function mysum1(A)
    T = eltype(A)
    total = zero(T) # start from a zero of A's element type
    for x in A
        total = total + x
    end
    return total
end
# Sanity check: the hand-written Julia loop versus the built-in sum.
relerr(mysum1(a), sum(a))

# Benchmark the hand-written loop.
# NOTE(review): @btime returns the value of the expression, not timing data.
j2_bench = @btime mysum1($a)

"""
    mysum(A)

Add up the elements of `A` with a `for` loop annotated with `@simd`.

The running total starts at `zero(eltype(A))`, which is also the result for an
empty `A`. `@simd` permits the compiler to reorder the additions on the
accumulator, so floating-point results may differ in the last bits from a
strictly sequential sum.
"""
function mysum(A)
    T = eltype(A)
    total = zero(T) # start from a zero of A's element type
    @simd for x in A
        total = total + x
    end
    return total
end
# Sanity check: the @simd loop versus the built-in sum.
relerr(mysum(a), sum(a))

# Benchmark the @simd loop on the Float64 array.
# NOTE(review): @btime returns the value of the expression, not timing data.
j3_bench = @btime mysum($a)

# The same generic function also works on complex numbers: benchmark it on a
# Complex{Float64} array of the same length.
z = rand(Complex{Float64}, length(a));
@btime mysum($z)