versioninfo()

# Pkg.add("BenchmarkTools")
#
using BenchmarkTools

# http://nbviewer.jupyter.org/gist/genkuroki/81de23edcae631a995e19a2ecf946a4f
# の第1.6.1節を見よ.
#
# ENV["PYTHON"]="C:\\Anaconda3\\python.exe" # ← 自分の環境の合わせて変える
# Pkg.add("PyPlot")
#
using PyPlot

# 非常に遅い.

N = 10^8
h = 2π/N
y = 1.0 + 0.0im           # ← y を複素数の 1 で初期化
@time for i in 1:N
    y += h*im*y           # Julia言語では虚数単位を im と書く
end
y

# 函数の中に入れるだけで大幅に速くなる.

function test_of_integration()
    N = 10^8
    h = 2π/N
    y = 1.0 + 0.0im           # ← y を複素数の 1 で初期化
    for i in 1:N
        y += h*im*y
    end
    y
end

test_of_integration()
@show @time test_of_integration()

# コードを函数の中に入れたら, 常に @code_warntype で確認するようにする.
# 注意が必要な部分は赤字で表示される.
#println()
#@code_warntype test_of_integration()

# y = 1.0 + 0.0im を y = 1 に書き換えただけで大幅に遅くなる.

function test_of_integration_slow1()
    N = 10^8
    h = 2π/N                  # ← 厳密には h = Complex{Float64}(2π/N) と書くべきだがそうする必要はない.
    y = 1                      # ← y = 1 は y を整数 1 にするという意味. y = 1.0 + 0.0im と書くべき.
    for i in 1:N
        y += h*im*y            # ← y は整数変数だったはずなのに複素数を足している.
    end
    y
end

test_of_integration_slow1()
@show @time test_of_integration_slow1()

# y が複素変数なのか整数変数なのかわからない状態になってしまっている.
println()
@code_warntype test_of_integration_slow1()

# y = 1.0 でも同様の理由で遅い.

function test_of_integration_slow2()
    N = 10^8
    h = 2π/N                  # ← 厳密には h = Complex{Float64}(2π/N) と書くべきだがそうする必要はない.
    y = 1.0                    # ← y = 1.0 は y を浮動小数点数(実数)の 1 にするという意味. y = 1.0 + 0.0im と書くべき.
    for i in 1:N
        y += h*im*y            # ← y は実変数だったはずなのに複素数を足している.
    end
    y
end

test_of_integration_slow2()
@show @time test_of_integration_slow2()

# y が複素変数なのか実変数なのかわからない状態になってしまっている.
println()
@code_warntype test_of_integration_slow2()

function test_of_integration_array0()
    N = 10^8
    h = 2π/N                        # ← 厳密には h = Complex{Float64}(2π/N) と書きたくなるがその必要はない
    y = Array{Complex{Float64}}(N+1)
    y[1] = 1.0 + 0.0im
    for i in 1:N
        y[i+1] = (1 + h*im)*y[i]    # ← 厳密には 1 を one(Complex{Float64}) と書きたくなるがその必要はない
    end
    y[N+1]
end

test_of_integration_array0()
@show @time test_of_integration_array0()

#println()
#@code_warntype test_of_integration_array0()

function test_of_integration_array1()
    N = 10^8
    h = 2π/N                        # ← 厳密には h = Complex{Float64}(2π)/N と書きたくなるがその必要はない
    y = Array{Complex{Float64}}(N+1)
    y[1] = 1.0                       # ← 厳密には y[1] = 1.0 + 0.0im と書きたくなるがその必要はない.
    for i in 1:N
        y[i+1] = (1 + h*im)*y[i]     # ← 厳密には 1 を one(Complex{Float64}) と書きたくなるがその必要はない
    end
    y[N+1]
end

test_of_integration_array0()
@show @time test_of_integration_array1()

#println()
#@code_warntype test_of_integration_array1()

function test_of_integration_array2()
    N = 10^8
    h = 2π/N                        # ← 厳密には h = Complex{Float64}(2π/N) と書きたくなるがその必要はない
    y = Array{Complex{Float64}}(N+1)
    y[1] = 1                         # ← 厳密には y[1] = 1.0 + 0.0im と書きたくなるがその必要はない.
    for i in 1:N
        y[i+1] = (1 + h*im)*y[i]    # ← 厳密には 1 を one(Complex{Float64}) と書きたくなるがその必要はない
    end
    y[N+1]
end

test_of_integration_array0()
@show @time test_of_integration_array2()

#println()
#@code_warntype test_of_integration_array2()

# 非常に遅い.

L = 10^6
@show a = []
@benchmark for i in 1:L
    push!(a, rand())
end

# 函数の中に入れてもかなり遅い.

function test_of_push_Any_array(; L = 10^6)
    a = []
    for i in 1:L
        push!(a, rand())
    end
end

@benchmark test_of_push_Any_array()

# a を Float64 型の要素を持つ配列に型指定するだけでかなり速くなる.

function test_of_push_Float64_array(; L = 10^6)
    a = Float64[]
    for i in 1:L
        push!(a, rand()) # ← rand() の値は Float64 型
    end
    return a
end

@benchmark a = test_of_push_Float64_array()

# push! の繰り返しよりも, まとめて配列を確保して代入した方が速い.

function test_of_Array_Float64(; L = 10^6)
    a = Array{Float64}(L)
    for i in 1:L
        a[i] = rand()
    end
    return a
end

@benchmark a = test_of_Array_Float64()

# 組み込み函数は非常に速い.

@benchmark a = rand(10^6)

function myexp(x; N=10^6)
    h = x/N
    y = one(h)        # h と同じ型の1で y を初期化.  y は h と同じ型の変数になる.
    for i in 1:N
        y += h*y
    end
    y
end

@show x = 1
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = 1.0
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = Int8(1)
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = log(BigFloat(2))
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = log(2)
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = log(Float32(2))
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = 2*BigFloat(π)*im
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = 2π*im
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

@show x = 2*Float32(π)*im
@show typeof(x)
@time y = myexp(x)
y, typeof(y)

x = (π/6)*Float64[
    0 -1
    1  0
]
@show x
@show typeof(x)
@time y = myexp(x)
y

function myexp_orbit(x; N=10^3)
    h = x/N
    y = Array{typeof(h)}(N+1)       # hと同じ型の要素を持つ配列を作成
    y[1] = one(h)                   # 行列対応にするためには one(h) を 1 で置き換えてはいけない.
    for i in 1:N
        y[i+1] = (one(h) + h)*y[i]  # 行列対応にするためには one(h) を 1 で置き換えてはいけない.
    end
    y
end

N = 400
x = 3
y = myexp_orbit(x; N=N)
xs = linspace(0,x,N+1)

figure(figsize=(5,3.5))
plot(xs, y, label="y = myexp(x)")
plot(xs, exp.(xs), label="y = exp(x)", ls="--")
grid(ls=":")
xlabel("x")
ylabel("y")
legend()
title("exponential of real numbers")

# 複素数でも使える

N = 400
z = π*im
w = myexp_orbit(z; N=N)

figure(figsize=(5,3.5))
plot(linspace(0,π,N+1), real(w), label="real part")
plot(linspace(0,π,N+1), imag(w), label="imaginary part")
xlabel("arg")
grid(ls=":")
legend()
title("exponential of pure imaginary numbers")

# 行列でも使える

N = 10^4
Z = (5π/3)*Float64[
    0 -1
    1  0
]
W = myexp_orbit(Z; N=N)
x = [W[k][1,1] for k in 1:N+1]
y = [W[k][2,1] for k in 1:N+1]
plot(x, y)
plot([0.0, x[1]], [0.0, y[1]], color="k", lw=1)
plot([0.0, x[N+1]], [0.0, y[N+1]], color="k", lw=1)
grid()
axes()[:set_aspect]("equal")

# テストデータの作成

N = 10^8
srand(2018)
w = randn(N)
x = w.^2;

# 函数の直接呼出しは速い

weighted_mean(x,w) = mean(w[i]*x[i] for i in eachindex(w)) # w を後でパラメーターだとみなす.

weighted_mean(x,w)
@show @time weighted_mean(x,w)

# println()
# @code_warntype weighted_mean(x,w)

# 以下の場合には大域変数を含んでいるように見えても**速い**.
# 
# function_containing_global_var1(x) を実行しようとすると, 
# Julia言語は weighted_mean(x,w) を実行しようとする.
# そして, weighted_mean 函数を引数 x, w の型に合わせてコンパイルして実行する.
# Julia言語が引数の型に合わせて十分に型推定できる場合には計算が速くなる.

function_containing_global_var1(x) = weighted_mean(x,w)

function_containing_global_var1(x)
@show @time function_containing_global_var1(x)

println()

# 計算速度的には問題無かったが, @code_warntype で確認すると赤字の警告が出ているので避けた方が無難.
@code_warntype function_containing_global_var1(x)

# 以下のように書くと大幅に遅くなる.
# 
# function_containing_global_var1(x) を実行しようとすると, 
# Julia言語は mean(w[i]*x[i] for i in eachindex(w)) を実行しようとする.
# そして, mean(w[i]*x[i] for i in eachindex(w)) を引数 x　の型に合わせてコンパイルして実行する.
# そのとき w は大域変数なので型は不明であるとする.
# Julia言語が引数の型に合わせて十分に型推定できない場合には計算は遅くなる.

function_containing_global_var2(x) = mean(w[i]*x[i] for i in eachindex(w))

function_containing_global_var2(x)
@show @time function_containing_global_var2(x)

println()

# w の型が Any になっている
@code_warntype function_containing_global_var2(x)

# 以下のように大域変数を含んでいる場合であっても, 型宣言を追加すれば速くなる.

function_containing_global_var3(x) = (mean((w::Array{Float64,1})[i]*x[i] for i in eachindex(w::Array{Float64,1})))

function_containing_global_var3(x)
@show @time function_containing_global_var3(x)

println()

# 赤字の警告がすべて消えている.
@code_warntype function_containing_global_var3(x)

# 大域変数だった w の定義も含めて, 全体を函数の中に入れると速くなる.
#
# この解決法が最もシンプルでわかりやすいが, 柔軟性に欠けた不便な解決法でもある.

function test_of_function_containing_local_var()
    N = 10^8
    srand(2018)
    w = randn(N)
    x = w.^2
    
    function_containing_local_var(x) = mean(w[i]*x[i] for i in eachindex(w))

    function_containing_local_var(x)
    @show @time function_containing_local_var(x)
end

test_of_function_containing_local_var()

# println()
# @code_warntype test_of_function_containing_local_var()

# closureを使っても速くなる.

function make_function_closure(w)
    function_closure(x) = mean(w[i]*x[i] for i in eachindex(w))
    return function_closure
end

N = 10^8
srand(2018)
w = randn(N)
x = w.^2

function_closure = make_function_closure(w)
function_closure(x)
@show @time function_closure(x)

# println()
# @code_warntype function_closure(x)

# 大域変数 w を定数にすると速くなる.
#
# この方法も単純だが, 柔軟性に欠けた不便な解決法でもある.

N = 10^8
srand(2018)
const w_const = randn(N)

function_containing_const(x) = mean(w_const[i]*x[i] for i in eachindex(w_const))

function_containing_const(x)
@show @time function_containing_const(x)

# println()
# @code_warntype function_containing_const()

# パラメーター w の型が確定している function-like object は速い

mutable struct WeightedMean{T} w::T end
(f::WeightedMean)(x) = mean(f.w[i]*x[i] for i in eachindex(f.w))

function_like_object = WeightedMean(w)
function_like_object(x)
@show @time function_like_object(x)

println()

# パラメーター w の型 T が確定している.
@show typeof(function_like_object)

# println()
# @code_warntype function_like_object(x)

# 上と全く同じ内容を改行を増やして書くと以下のようになる.

mutable struct WeightedMean{T}
    w::T
end

function (f::WeightedMean)(x)
    mean(f.w[i]*x[i] for i in eachindex(f.w))
end
 
function_like_object = WeightedMean(w)
function_like_object(x)
@show @time function_like_object(x)

println()

# パラメーター w の型 T が Array{Float64,1} に確定している.
@show typeof(function_like_object)

# println()
# @code_warntype function_like_object(x)

# 次のような書き方もできる. 
# パラメーター w の意味的にはこちらの方が好ましいと思われる.

mutable struct WeightedMeanArrayOnly{T}
    w::Array{T}
end

function (f::WeightedMeanArrayOnly)(x)
    mean(f.w[i]*x[i] for i in eachindex(f.w))
end
 
function_like_object_array_only = WeightedMeanArrayOnly(w)
function_like_object_array_only(x)
@show @time function_like_object_array_only(x)

println()

# 型 T が Float64 に確定している.
@show typeof(function_like_object_array_only)

# println()
# @code_warntype function_like_object_array_only(x)

# パラメーターの w の型が不明の function-like object は遅い

mutable struct WeightedMeanSlow w end
(f::WeightedMeanSlow)(x) = mean(f.w[i]*x[i] for i in eachindex(f.w))

function_like_object_slow = WeightedMeanSlow(w)
function_like_object_slow(x)
@show @time function_like_object_slow(x)

println()
@show typeof(function_like_object_slow)

println()

# w の型が Any になってしまっている
@code_warntype function_like_object_slow(x)

# function-like object の函数の定義で f.w を誤って w と書いてしまうと大幅に遅くなる.
# w が大域変数として定義されているとエラーが出ないので要注意!

mutable struct WeightedMeanTypo{T} w::T end
(f::WeightedMeanTypo)(x) = mean(w[i]*x[i] for i in eachindex(f.w))

function_like_object_typo = WeightedMeanTypo(w)
function_like_object_typo(x)
@show @time function_like_object_typo(x)

println()

@code_warntype function_like_object_typo(x)

mutable struct AffineTransformation{T}
    A::Array{T,2}
    b::Array{T,1}
end

function (f::AffineTransformation)(x)
    f.A * x + f.b
end

θ = π/3
A = [
    cos(θ) -sin(θ)
    sin(θ)  cos(θ)
]
b = [
    5.0
    -2.0
]
v = [
    1.0
    0.0
]
Φ = AffineTransformation(A, b)

@show Φ
@show Φ(v)
@show Φ.A = eye(2)
@show Φ.b = zeros(2)
@show Φ
@show Φ(v);

# テストデータの作成

N = 10^8
srand(2018)
a = randn(N)
b = randn(N)
c = randn(N)
d = randn(N);

# 次のセルよりかなり遅い.  メモリ消費量の巨大さにも注目!

s = Array{Float64}(N)

function mysum_slow1(a, b, c, d)
    a + b + c + d
end

@benchmark s = mysum_slow1(a, b, c, d)

# 次のセルより遅い. 
# 前もって配列 s を確保してあるのに, 新たに同じサイズのメモリが消費されてしまっている.
# その理由は s = mysum_slow2(a, b, c, d) の = の右辺の分の配列が新たに確保されているからである.

s = Array{Float64}(N)

function mysum_slow2(a, b, c, d)
    a .+ b .+ c .+ d
end

@benchmark s = mysum_slow2(a, b, c, d)

# in-place 計算の使用によって代入操作も dot operator 化すると効率がよくなる.
#
# s .= a .+ b .+ c .+ d の代わりに @. s = a + b + c + d と書ける.

s = Array{Float64}(N)

function mysum_atdot1!(s, a, b, c, d)
    @. s = a + b + c + d
end

mysum_atdot1!(s, a, b, c, d)
@show s == a + b + c + d

@benchmark mysum_atdot1!(s, a, b, c, d)

# 上と本質的に同じ

s = Array{Float64}(N)

function mysum_atdot2!(s, a, b, c, d)
    s .= a .+ b .+ c .+ d
end

mysum_atdot2!(s, a, b, c, d)
@show s == a + b + c + d

@benchmark mysum_atdot2!(s, a, b, c, d)

@show a = 0
@show b = Array{typeof(a),1}(2)
@show b[1] = a
@show b
@show a = 1
@show b[2] = a
@show b
println("\n以上は「意図した通り」だろう.\n")

@show a = [0,0]
@show b = Array{typeof(a),1}(2)
@show b[1] = a
@show b
@show a = [1, 2]
@show b[2] = a
@show b
println("\n以上も「意図した通り」だろう.\n")

@show a = [0,0]
@show b = Array{typeof(a),1}(2)
@show b[1] = a
@show b
@show a[1], a[2] = 1, 2
@show b[2] = a
@show b

println("\nb の要素がどちらも [1,2] になってしまった!\n")

@show a = [0,0]
@show b = Array{typeof(a),1}(2)
@show b[1] = copy(a)
@show b
@show a[1], a[2] = 1, 2
@show b[2] = a
@show b

println("\nこのように b[1] = a を b[1] = copy(a) に置き換えると「意図した通り」になる.\n")

println("\n次のセルのように2次元配列を使っても「意図した通り」になる.")

a = [0,0]
b = Array{eltype(a),2}(2,2)
display(b)
b[:,1] = a
display(b)
a[1], a[2] = 1, 2
b[:,2] = a
display(b)

a = [1.2, 3.4]
s = zeros(2)
@show a
@show s

function f(s, a)
    s = a
end
f(s, a)

println()
@show s
println("函数内の s = a によって s の内容は変化していない.\n")

function g!(s, a)
    s .= a
end
g!(s, a)

@show s
println("函数内の s .= a によって s の内容が変化している.")

# 計算結果確認用のプロット函数

function plot2pcolormesh(u, v; cmap="CMRmap")
    sleep(0.1)
    d = size(u)[1]
    figure(figsize=(8,2))
    for i in 1:d
        fig = subplot(div(2d+2,4), 4, mod1(2i-1,4))
        pcolormesh(x, y, @view(u[i,:,:]), cmap=cmap)
        #colorbar()
        fig[:set_aspect]("equal")
        fig = subplot(div(2d+2,4), 4, mod1(2i,4))
        pcolormesh(x, y, @view(v[i,:,:]), cmap=cmap)
        #colorbar()
        fig[:set_aspect]("equal")
        tight_layout()
    end
end

# テストデータの作成
# 3次元配列をベクトルの2次元配列であるかのように扱う

n = 1000
x = y = linspace(-5, 5, n)
f(x, y) = x^2 - y^2 # ラプラシアンを作用させると0になる.
g(x, y) = exp(-(x-2)^2-(y-2)^2) + exp(-(x+2)^2-(y+2)^2) # 混合ガウス分布
u = Array{Float64, 3}(2, n, n)
u[1,:,:] .= f.(x', y)
u[2,:,:] .= g.(x', y);

# 平面全体での離散ラプラシアン
#
# laplacian_local! は離散ラプラシアンの値を局所的に計算する函数である.
# このように, 局所的なラプラシアンの計算を別の函数として分離しておく.
# u[:,i,j] が平面座標 (i,j) における u の値(ベクトル値になる)を表現していると仮定する.
# v に離散ラプラシアンを施した結果を返す.
#
function laplacian!(v, u, laplacian_local!)
    m, n = size(u)[end-1:end]
    for i in 1:m
       for j in 1:n
            laplacian_local!(v, u, m, n, i, j)
       end
    end
end

# 何も工夫していない laplacian_local_naive! を使用すると非常に遅い.
#
function laplacian_local_naive!(v, u, m, n, i, j)
    v[:,i,j] =
    u[:, ifelse(i+1 ≤ m, i+1, 1), j] + u[:, ifelse(i-1 ≥ 1, i-1, m), j] +
    u[:, i, ifelse(j+1 ≤ n, j+1, 1)] + u[:, i, ifelse(j-1 ≥ 1, j-1, n)] -
    4u[:, i, j]
end

v = Array{Float64,3}(2, n, n)
laplacian!(v, u, laplacian_local_naive!)
plot2pcolormesh(u, v)
@benchmark laplacian!(v, u, laplacian_local_naive!)

# @view を使うと少しだけ速くなる.

function laplacian_local_view!(v, u, m, n, i, j)
    v[:,i,j] =
    @view(u[:, ifelse(i+1 ≤ m, i+1, 1), j]) + @view(u[:, ifelse(i-1 ≥ 1, i-1, m), j]) +
    @view(u[:, i, ifelse(j+1 ≤ n, j+1, 1)]) + @view(u[:, i, ifelse(j-1 ≥ 1, j-1, n)]) -
    4*@view(u[:, i, j])
end

v = Array{Float64,3}(2, n, n)
laplacian!(v, u, laplacian_local_view!)
plot2pcolormesh(u, v)
@benchmark laplacian!(v, u, laplacian_local_view!)

# 上と同じ. @views を使った方が簡潔に書ける.

function laplacian_local_views!(v, u, m, n, i, j)
    @views(
        v[:,i,j] =
        u[:, ifelse(i+1 ≤ m, i+1, 1), j] + u[:, ifelse(i-1 ≥ 1, i-1, m), j] +
        u[:, i, ifelse(j+1 ≤ n, j+1, 1)] + u[:, i, ifelse(j-1 ≥ 1, j-1, n)] -
        4u[:, i, j]
    )
end

v = Array{Float64,3}(2, n, n)
laplacian!(v, u, laplacian_local_views!)
plot2pcolormesh(u, v)
@benchmark laplacian!(v, u, laplacian_local_views!)

# さらに @. を付けるとかなり速くなる.
# しかし, なぜかメモリが結構使用されている.

function laplacian_local_atdot_views!(v, u, m, n, i, j)
    @. @views(
        v[:,i,j] =
        u[:, ifelse(i+1 ≤ m, i+1, 1), j] + u[:, ifelse(i-1 ≥ 1, i-1, m), j] +
        u[:, i, ifelse(j+1 ≤ n, j+1, 1)] + u[:, i, ifelse(j-1 ≥ 1, j-1, n)] -
        4u[:, i, j]
    )
end

v = Array{Float64,3}(2, n, n)
laplacian!(v, u, laplacian_local_atdot_views!)
plot2pcolormesh(u, v)
@benchmark laplacian!(v, u, laplacian_local_atdot_views!)

# さらに @inline をつけるとかなり速くなる.
# メモリの使用量がほぼゼロになったことにも注目！

@inline function laplacian_local_inline_atdot_views!(v, u, m, n, i, j)
    @. @views(
        v[:,i,j] =
        u[:, ifelse(i+1 ≤ m, i+1, 1), j] + u[:, ifelse(i-1 ≥ 1, i-1, m), j] +
        u[:, i, ifelse(j+1 ≤ n, j+1, 1)] + u[:, i, ifelse(j-1 ≥ 1, j-1, n)] -
        4u[:, i, j]
    )
end

v = Array{Float64,3}(2, n, n)
laplacian!(v, u, laplacian_local_inline_atdot_views!)
plot2pcolormesh(u, v)
@benchmark laplacian!(v, u, laplacian_local_inline_atdot_views!)

# forループに展開するともっと速い.

@inline function laplacian_local_inline_for!(v, u, m, n, i, j)
    for k in 1:size(u)[1]
        v[k, i, j] =
        u[k, ifelse(i+1 ≤ m, i+1, 1), j] + u[k, ifelse(i-1 ≥ 1, i-1, m), j] +
        u[k, i, ifelse(j+1 ≤ n, j+1, 1)] + u[k, i, ifelse(j-1 ≥ 1, j-1, n)] -
        4u[k, i, j]
    end
end

v = Array{Float64,3}(2, n, n)
laplacian!(v, u, laplacian_local_inline_for!)
plot2pcolormesh(u, v)
@benchmark laplacian!(v, u, laplacian_local_inline_for!)

# 2N個の和をまとめて sum(a) で計算すると非常に速い

N = 10^8
srand(2018)
a = rand(2N)
@benchmark sum(a)

# sumの中に配列の和a+bを入れると遅くなり, 大量にメモリを消費する.

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark sum(a+b)

# a+b を a.+b に変えても効果無し

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark sum(a.+b)

# forループに展開すると速い

function mysum(a,b)
    s = zero(eltype(a))
    for i in 1:endof(a)
        s += a[i] + b[i]
    end
    s
end

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark mysum(a,b)

# forループはマクロにすると短く書ける

macro sum(init, i, iter, f)
    quote
        begin
            s = $(esc(init))
            for $(esc(i)) in $(esc(iter))
                s += $(esc(f))
            end
            s
        end
    end
end

function mysum_macro(a,b)
    @sum(zero(eltype(a)), i, 1:endof(a), a[i]+b[i])
end

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark mysum_macro(a,b)

# @sum マクロがどのように展開されるか

@macroexpand @sum(0, k, 1:10, k)

# マクロで正しく計算されていることを確認

@show @sum(0.0, i, 1:3, @sum(0, j, 1:2, 2.0^(i*j)))
@show sum(2.0^(i*j) for i in 1:3 for j in 1:2);

# sum(i->a[i]+b[i], eachindex(a)) と書き換えると速くなる. 
# この方法は mean でも使用可能

function mysum_sum_of_forin(a,b)
    sum(i->a[i]+b[i], eachindex(a))
end

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark mysum_sum_of_forin(a,b)

# sum(a+b) を sum(a[i]+b[i] for i in eachindex(a)) に書き換えても速くなる.
# この方法は mean でも使用可能

function mysum_sum_of_f_itr(a,b)
    sum(a[i]+b[i] for i in eachindex(a))
end

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark mysum_sum_of_f_itr(a,b)

# 上と同様の趣旨

function mysum_reduce(a,b)
    reduce(+, zero(eltype(a)), (a[i]+b[i] for i in eachindex(a)))
end

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark mysum_reduce(a,b)

# 実は以下のように sum を先に取れば速い.

function plus_sum(a,b)
    sum(a) + sum(b)
end

N = 10^8
srand(2018)
a = rand(N)
b = rand(N)
@benchmark plus_sum(a,b)