# Array, Matrix, and Vector requires types
A = Array{Float64}(3,4)
A = Matrix{Float64}(3,4)
y = Vector{Float64}(3)
3-element Array{Float64,1}: 2.25716e-314 2.26195e-314 2.26195e-314
# instead, use zeros, ones, rand, randn, similar...
A = zeros(3,4)
3x4 Array{Float64,2}: 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
# provided a typed vector y,
y = rand(3)
3-element Array{Float64,1}: 0.107162 0.403169 0.440116
# construct x as a new vector (uninitialized) with the same type
x = similar(y)
x = zero(y) # zeros(y)
3-element Array{Float64,1}: 0.0 0.0 0.0
A = Array{Any}(1000,1000)
A[5,100] = 0.0
@show typeof(A)
@show typeof(A[5,100])
typeof(A) = Array{Any,2} typeof(A[5,100]) = Float64
Float64
# avoid changing the type of a variable
function A_notype()
A = Array{Any}(1000,1000)
A[5,100] = 0.0
end
function A_type()
A = Array{Float64}(1000,1000)
A[5,100] = 0.0
end
@time A_notype()
@time A_type()
@time for i=1:100 A_notype(); end
@time for i=1:100 A_type(); end
0.006544 seconds (524 allocations: 7.659 MB) 0.003649 seconds (523 allocations: 7.659 MB) 0.327970 seconds (200 allocations: 762.946 MB, 47.92% gc time) 0.159583 seconds (200 allocations: 762.946 MB, 61.42% gc time)
soft(v,a) = sign(v).*max(0, abs(v)-a)
soft (generic function with 1 method)
function soft_t{T}(v::Array{T}, a::T)
return( sign(v).*max(0, abs(v)-a) )
end
soft_t (generic function with 1 method)
v = randn(1_000);
a = .5;
# jit-compliation if necessary
@time soft(v,a)
@time soft_t(v,a)
@time [soft(v,a) for i=1:10_000];
@time [soft_t(v,a) for i=1:10_000];
0.003094 seconds (1.86 k allocations: 132.383 KB) 0.002665 seconds (1.87 k allocations: 132.823 KB) 0.424714 seconds (140.00 k allocations: 388.107 MB, 44.54% gc time) 0.268319 seconds (140.00 k allocations: 388.107 MB, 29.71% gc time)
pos(x) = x < 0 ? 0: x
pos (generic function with 1 method)
pos_another(x) = x < zero(x) ? zero(x) : x
pos_another (generic function with 1 method)
n=1000000
x = rand(1)[1]
@time pos(x)
@time pos_another(x)
@time for i=1:n pos(x); end
@time for i=1:n pos_another(x); end
0.001758 seconds (408 allocations: 21.981 KB) 0.001559 seconds (434 allocations: 25.483 KB) 0.216706 seconds (3.00 M allocations: 61.023 MB, 3.83% gc time) 0.181200 seconds (4.00 M allocations: 76.282 MB, 5.43% gc time)
type MyAmbiguousType
a
end
type MyType{T<:AbstractFloat}
a::T
end
type MyStillAmbiguousType
a::AbstractFloat
end
m = MyType(3.2);
t = MyStillAmbiguousType(3.2);
typeof(m)
MyType{Float64}
typeof(t)
MyStillAmbiguousType
t.a = 4.5f0 # Float32
typeof(t.a)
Float32
m.a = 4.5f0 # Float32
typeof(m.a)
Float64
func(m::MyType) = m.a + 1
func (generic function with 1 method)
code_llvm(func, (MyType{Float64},))
define double @julia_func_22373(%jl_value_t*) { top: %1 = bitcast %jl_value_t* %0 to double* %2 = load double* %1, align 16 %3 = fadd double %2, 1.000000e+00 ret double %3 }
code_llvm(func, (MyType{AbstractFloat},))
define %jl_value_t* @julia_func_22374(%jl_value_t*, %jl_value_t**, i32) { top: %3 = alloca [4 x %jl_value_t*], align 8 %.sub = getelementptr inbounds [4 x %jl_value_t*]* %3, i64 0, i64 0 %4 = getelementptr [4 x %jl_value_t*]* %3, i64 0, i64 2 store %jl_value_t* inttoptr (i64 4 to %jl_value_t*), %jl_value_t** %.sub, align 8 %5 = load %jl_value_t*** @jl_pgcstack, align 8 %6 = getelementptr [4 x %jl_value_t*]* %3, i64 0, i64 1 %.c = bitcast %jl_value_t** %5 to %jl_value_t* store %jl_value_t* %.c, %jl_value_t** %6, align 8 store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8 store %jl_value_t* null, %jl_value_t** %4, align 8 %7 = getelementptr [4 x %jl_value_t*]* %3, i64 0, i64 3 store %jl_value_t* null, %jl_value_t** %7, align 8 %8 = load %jl_value_t** %1, align 8 %9 = getelementptr inbounds %jl_value_t* %8, i64 0, i32 0 %10 = load %jl_value_t** %9, align 8 store %jl_value_t* %10, %jl_value_t** %4, align 8 store %jl_value_t* inttoptr (i64 4535263360 to %jl_value_t*), %jl_value_t** %7, align 8 %11 = call %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 4543654352 to %jl_value_t*), %jl_value_t** %4, i32 2) %12 = load %jl_value_t** %6, align 8 %13 = getelementptr inbounds %jl_value_t* %12, i64 0, i32 0 store %jl_value_t** %13, %jl_value_t*** @jl_pgcstack, align 8 ret %jl_value_t* %11 }
code_llvm(func, (MyType,))
define %jl_value_t* @julia_func_22375(%jl_value_t*, %jl_value_t**, i32) { top: %3 = alloca [4 x %jl_value_t*], align 8 %.sub = getelementptr inbounds [4 x %jl_value_t*]* %3, i64 0, i64 0 %4 = getelementptr [4 x %jl_value_t*]* %3, i64 0, i64 2 store %jl_value_t* inttoptr (i64 4 to %jl_value_t*), %jl_value_t** %.sub, align 8 %5 = load %jl_value_t*** @jl_pgcstack, align 8 %6 = getelementptr [4 x %jl_value_t*]* %3, i64 0, i64 1 %.c = bitcast %jl_value_t** %5 to %jl_value_t* store %jl_value_t* %.c, %jl_value_t** %6, align 8 store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8 store %jl_value_t* null, %jl_value_t** %4, align 8 %7 = getelementptr [4 x %jl_value_t*]* %3, i64 0, i64 3 store %jl_value_t* null, %jl_value_t** %7, align 8 %8 = icmp eq i32 %2, 1 br i1 %8, label %ifcont, label %else else: ; preds = %top call void @jl_error(i8* getelementptr inbounds ([26 x i8]* @_j_str27, i64 0, i64 0)) unreachable ifcont: ; preds = %top %9 = load %jl_value_t** %1, align 8 store %jl_value_t* %9, %jl_value_t** %4, align 8 store %jl_value_t* inttoptr (i64 13141928520 to %jl_value_t*), %jl_value_t** %7, align 8 %10 = call %jl_value_t* @jl_f_get_field(%jl_value_t* null, %jl_value_t** %4, i32 2) store %jl_value_t* %10, %jl_value_t** %4, align 8 store %jl_value_t* inttoptr (i64 4535263360 to %jl_value_t*), %jl_value_t** %7, align 8 %11 = call %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 4543654352 to %jl_value_t*), %jl_value_t** %4, i32 2) %12 = load %jl_value_t** %6, align 8 %13 = getelementptr inbounds %jl_value_t* %12, i64 0, i32 0 store %jl_value_t** %13, %jl_value_t*** @jl_pgcstack, align 8 ret %jl_value_t* %11 }
WARNING: Returned code may not match what actually runs.
# the same best practice for container types
type MySimpleContainer{A<:AbstractVector}
a::A
end
type MyStillAmbiguousContainer{T}
a::AbstractVector{T}
end
c = MySimpleContainer(1:3);
typeof(c)
MySimpleContainer{UnitRange{Int64}}
c = MySimpleContainer([1:3;]);
typeof(c)
MySimpleContainer{Array{Int64,1}}
b = MyStillAmbiguousContainer(1:3);
typeof(b)
MyStillAmbiguousContainer{Int64}
b = MyStillAmbiguousContainer([1:3;]);
typeof(b)
MyStillAmbiguousContainer{Int64}
# this also can help doing different things depending on the type
function sumfoo(c::MySimpleContainer)
s = 0
for x in c.a
s += foo(x)
end
return s
end
foo(x::Integer) = x
foo(x::AbstractFloat) = round(x)
foo (generic function with 4 methods)
@show sumfoo(MySimpleContainer(rand(100)))
sumfoo(MySimpleContainer(rand(100))) = 44.0
44.0
# we can write separate functions for each type instead,
function myfoo{T<:AbstractFloat}(c::MySimpleContainer{Vector{T}})
...
end
function myfoo{T<:Integer}(c::MySimpleContainer{Vector{T}})
...
end
function foo(a::Array{Any,1})
x = a[1]::Float64 # this helps complier to optimize
y = a[3]::Float64
b = x+y
end
a = [3.0, "Dortmund", 2.5, "NRW"];
foo(a)
5.5
#function with_keyword(x; name::Int = 1)
# passing dynamic list of keywords f(x; keywords...) can be slow
function foo(x; name::Int=1, height::Int=0, weight::Int=0)
height + weight
end
function foo2(x, name::Int, height::Int, weight::Int)
height + weight
end
@time for i=1:100000 foo(1, weight=12, height=10); end
@time for i=1:100000 foo2(1, 1, 10, 12); end
0.855028 seconds (10.00 M allocations: 915.541 MB, 6.34% gc time) 0.000000 seconds
function norm(A)
if isa(A, Vector)
return sqrt(real(dot(A,A)))
elseif isa(A, Matrix)
return maximum(svd(A)[2])
else
error("norm: invalid argument")
end
end
norm(x::Vector) = sqrt(real(dot(x,x)))
norm(A::Matrix) = maximum(svd(A)[2])
function twos(n)
a = Array(rand(Bool) ? Int64 : Float64, n)
for i=1:n
a[i] = 2
end
return a
end
twos (generic function with 1 method)
function twos_another(n)
a = Array(rand(Bool) ? Int64 : Float64, n)
fill_twos!(a)
return a
end
function twos_another2(n)
a = Array(rand(Bool) ? Int64 : Float64, n)
fill_twos2!(a)
return a
end
function fill_twos!(a)
for i=1:length(a)
a[i] = 2
end
end
function fill_twos2!(a)
map(x -> 2, a)
end
fill_twos2! (generic function with 1 method)
n=1000
@time twos(n)
@time twos_another(n)
@time for i=1:1000 twos(n); end
@time for i=1:1000 twos_another(n); end
@time for i=1:1000 twos_another2(n); end
0.000054 seconds (495 allocations: 15.703 KB) 0.008073 seconds (4.16 k allocations: 223.432 KB) 0.037601 seconds (491.00 k allocations: 15.182 MB, 4.92% gc time) 0.004284 seconds (2.12 k allocations: 7.727 MB) 0.018826 seconds (658.31 k allocations: 25.487 MB, 12.15% gc time)
function copy_cols{T}(x::Vector{T})
n = size(x, 1)
out = Array(eltype(x), n, n)
for i=1:n
out[:, i] = x
end
out
end
function copy_rows{T}(x::Vector{T})
n = size(x, 1)
out = Array(eltype(x), n, n)
for i=1:n
out[i, :] = x
end
out
end
function copy_col_row{T}(x::Vector{T})
n = size(x, 1)
out = Array(T, n, n)
for col=1:n, row=1:n
out[row, col] = x[row]
end
out
end
function copy_row_col{T}(x::Vector{T})
n = size(x, 1)
out = Array(T, n, n)
for row=1:n, col=1:n
out[row, col] = x[col]
end
out
end
copy_row_col (generic function with 1 method)
x = randn(10000);
fmt(f) = println(rpad(string(f)*": ", 14, ' '), @elapsed f(x))
map(fmt, Any[copy_cols, copy_rows, copy_col_row, copy_row_col]);
copy_cols: 0.422205443 copy_rows: 3.275927589 copy_col_row: 0.596654092 copy_row_col: 2.905686352
f = open("./tmp.txt", "w")
@time for i=1:100000
print(f, "$i $(.5) $(.2) $(i*2) $(i*4) $(i*.5)\n")
end
close(f)
0.328030 seconds (2.70 M allocations: 100.466 MB, 5.42% gc time)
f = open("./tmp.txt", "w")
@time for i=1:100000
@printf(f, "%d %.3f %.3f %d %d %.3f\n", i, .5, .2, i*2, i*4, i*.5)
end
close(f)
0.181660 seconds (777.47 k allocations: 21.019 MB, 2.27% gc time)
f = open("./tmp.txt", "w")
@time for i=1:100000
print(f, i, " ", .5, " ", .2, " ", i*2, " ", i*4, " ", i*.5, "\n")
end
close(f)
0.259693 seconds (1.90 M allocations: 61.022 MB, 4.39% gc time)
x = zeros(10);
y = x; # reference copy, not content copy
x[1] = -1;
y
10-element Array{Float64,1}: -1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
# why this works?
x = rand(1000)
y = rand(1000)
xPrev = x
@time for i=1:10
x = x + y; # x += y;
print(norm(x-xPrev), " : ", norm(y), "\n")
xPrev = x;
end
577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 577.409076405291 : 577.409076405291 0.108320 seconds (1.53 k allocations: 152.630 MB, 17.86% gc time)
# "more" correct version, but using more memory
xPrev = copy(x)
@time for i=1:10
x = x + y;
print(norm(x-xPrev), "\n")
xPrev = copy(x);
end
577.409076405291 577.409076405291 577.409076405291 577.409076405291 577.409076405291 577.409076405291 577.409076405291 577.409076405291 577.409076405291 577.409076405291 0.131592 seconds (4.08 k allocations: 229.079 MB, 23.99% gc time)
x = rand(100000)
y = rand(100000)
# any alternative?
@time for i=1:100
x = x+y;
end
@time for i=1:100
x[:] = x[:] + y[:];
end
@time for i=1:100
for j=1:length(x)
x[j] = x[j] + y[j];
end
end
@time for i=1:100
@simd for j=1:length(x)
@inbounds x[j] = x[j] + y[j];
end
end
function addtwo!(x::Array{Float64},y::Array{Float64})
for j=1:length(x)
x[j] = x[j] + y[j];
end
end
function addtwo_simd!(x::Array{Float64},y::Array{Float64})
@simd for j=1:length(x)
@inbounds x[j] = x[j] + y[j];
end
end
@time addtwo!(x,y);
@time addtwo_simd!(x,y);
@time for i=1:100
addtwo!(x,y);
end
@time for i=1:100
addtwo_simd!(x,y);
end
0.041157 seconds (400 allocations: 76.303 MB, 27.27% gc time) 0.139487 seconds (800 allocations: 228.903 MB, 17.99% gc time) 2.679650 seconds (59.90 M allocations: 1.042 GB, 4.07% gc time) 2.374955 seconds (49.90 M allocations: 761.394 MB, 2.13% gc time) 0.004888 seconds (2.16 k allocations: 108.103 KB) 0.006478 seconds (4.42 k allocations: 219.590 KB) 0.014174 seconds 0.005766 seconds
WARNING: could not attach metadata for @simd loop.
soft(v,a) = sign(v) .* max(0, abs(v) - a)
soft (generic function with 1 method)
function soft!(out, v, a)
out = sign(v) .* max(0, abs(v) - a);
return out
end
function soft2!(out, v, a)
for i=1:length(out)
out[i] = sign(v[i]) * max(0, abs(v[i]) - a);
end
return out
end
soft2! (generic function with 1 method)
out = [1.:4.;]
out2 = soft!(out, rand(4), .5);
@show out;
@show out2;
out = [1.0,2.0,3.0,4.0] out2 = [0.16987111663632604,0.4412477591980364,0.0,0.0]
out = [1.:4.;]
out2 = soft2!(out, rand(4), .5);
@show out;
@show out2;
out = [0.370447854977658,0.446340020406677,0.3884473629974061,0.19135612895460263] out2 = [0.370447854977658,0.446340020406677,0.3884473629974061,0.19135612895460263]
v = rand(1000)
a = .3
out = similar(v)
@time soft(v,a)
@time soft!(out,v,a)
@time soft2!(out,v,a)
@time for i=1:10000 soft(v,a); end
@time for i=1:10000 soft!(out, v,a); end
@time for i=1:10000 soft2!(out, v,a); end
0.002848 seconds (1.86 k allocations: 132.384 KB) 0.000031 seconds (18 allocations: 39.891 KB) 0.000009 seconds (4 allocations: 160 bytes) 0.186347 seconds (140.00 k allocations: 388.031 MB, 23.29% gc time) 0.193155 seconds (140.00 k allocations: 388.031 MB, 25.24% gc time) 0.038781 seconds
# A_mul_B!(OUT, A, B): OUT = A*B
# At_mul_B!(OUT, A, B): OUT = A'*B
# A_mul_Bt!
# At_mul_Bt!
#http://docs.julialang.org/en/release-0.4/stdlib/math/
n = 100;
A = rand(n,n);
B = rand(n,n);
OUT = zeros(n,n);
@time OUT = A*B;
@time A_mul_B!(OUT, A, B);
@time for i=1:1000 OUT = A*B; end
@time for i=1:1000 A_mul_B!(OUT, A, B); end
0.000126 seconds (7 allocations: 78.375 KB) 0.000119 seconds (4 allocations: 160 bytes) 0.090347 seconds (3.00 k allocations: 76.385 MB, 5.16% gc time) 0.082449 seconds
# scale!(s, A): A = s .* A
n = 1000;
A = rand(n,n);
B = copy(A);
s = rand(Float64);
@time A = s*A;
@time scale!(s,A);
@time for i=1:1000 A = s*A; end
A = B;
@time for i=1:1000 scale!(s,A); end
0.012666 seconds (6 allocations: 7.630 MB) 0.000791 seconds (4 allocations: 160 bytes) 3.918843 seconds (2.00 k allocations: 7.451 GB, 30.65% gc time) 0.946773 seconds
# BLAS operations
# http://docs.julialang.org/en/release-0.4/stdlib/linalg/#module-Base.LinAlg.BLAS
# BLAS.axpy!(a, X, Y); #Y = Y + a*X
n = 10000;
x = rand(n);
y = rand(n);
a = rand(Float64);
@time y = y + a*x;
@time BLAS.axpy!(a,x,y);
@time for i=1:10000 y = y + a*x; end
@time for i=1:10000 BLAS.axpy!(a,x,y); end
0.000030 seconds (10 allocations: 156.563 KB) 0.008656 seconds (4 allocations: 160 bytes) 0.533453 seconds (60.00 k allocations: 1.492 GB, 29.87% gc time) 0.056907 seconds
# Julia comes with OpenBLAS, which is multi-threaded
flops = zeros(4);
for i=1:4
blas_set_num_threads(i)
for j=1:1000
flops[i] += peakflops(100, parallel=true)
end
end
print("best no cores = $(indmax(flops))\n");
@show (flops./1000)
blas_set_num_threads(2)
best no cores = 2 flops ./ 1000 = [2.4384632284675438e10,3.2255511575664085e10,3.0021219894029892e10,2.9959863346103878e10]
tmp = y - Dalp + ρ*(D'*z);
x = L'\(L\tmp);
Dx = D*x;
z = soft(Dx + (1/ρ)*α, λ/ρ)
r = Dx-z
α = α + ρ*r
Dalp = D'*α
At_mul_B!(Dz, D, z); #D'z
scale!(ρ,Dz)
# x[:] = y[:] - Dalp[:] + Dz[:];
x = y - Dalp + Dz;
x = L'\(L\x);
A_mul_B!(Dx, D, x); # Dx = D*x
soft!(z, Dx + scale(1/ρ,α), λ/ρ) # z = soft(Dx + (1/ρ)*α, λ/ρ)
r = Dx-z;
BLAS.axpy!(ρ, r, α); #α = α + ρ*r
At_mul_B!(Dalp, D, α); #Dalp = D'*α
n = 1000;
sumN(n) = for i=1:n, j=1:n 1.5+2.5; end
mulN(n) = for i=1:n, j=1:n 1.5*2.5; end
@time for i=1:1000 A=Array{Float64}(n,n); end
@time for i=1:1000 sumN(n); end
@time for i=1:1000 mulN(n); end
1.370818 seconds (8.00 k allocations: 7.451 GB, 60.28% gc time) 0.002636 seconds (3.14 k allocations: 153.031 KB) 0.011886 seconds (3.13 k allocations: 152.531 KB)