using PyPlot using Distributions # xlog(x,y) = if x == 0 then 0 else x*log(x/y) # # Warning: # # Don't use x*log(x/y) instead of xlog(x,y). # Because log(0) = -Inf and 0*Inf = NaN. # function xlog(x::Float64, y::Float64) if x == zero(x) return zero(x) elseif y == zero(y) return Inf else return x*log(x/y) end end xlog(x,y) = xlog(Float64(x),Float64(y)) x, y = 0, 1 @show x, y @show x*log(x/y) @show xlog(x,y); prodprob(p::Float64, q::Float64) = [p*q, p*(1-q), (1-p)*q, (1-p)*(1-q)] function expected(a::AbstractArray{T,1}) where T n = sum(a) return [(a[1]+a[2])*(a[1]+a[3])/n, (a[1]+a[2])*(a[2]+a[4])/n, (a[3]+a[4])*(a[1]+a[3])/n, (a[3]+a[4])*(a[2]+a[4])/n] end function chisqtest(a::AbstractArray{Int64,1}) mu = expected(a) chisq = sum((a .- mu).^2 ./mu) pval = ccdf(Chisq(1), chisq) return pval end function gtest(a::AbstractArray{Int64,1}) mu = expected(a) g = 2*sum(xlog.(a,mu)) # Don't use a.*log(a./mu) pval = ccdf(Chisq(1), g) return pval end function pvaluehg(d::Hypergeometric, k::Int64) c = params(d) amax = min(c[1],c[3]) p0 = pdf(d, k) p1 = 0.0 pval = 0.0 for j in 0:amax p1 = pdf(d, j) pval += ifelse(p1 ≤ p0, p1, 0.0) end return min(pval, 1.0) end function fishertest(a::AbstractArray{Int64,1}) d = Hypergeometric(a[1]+a[2], a[3]+a[4], a[1]+a[3]) return pvaluehg(d, a[1]) end ecdf(pval::AbstractArray{Float64,1}, x::Float64) = count(p -> p ≤ x, pval)/length(pval) ecdf(pval, x) = ecdf(pval, Float64(x)) function randPoisson(n::Int64, p::AbstractArray{Float64}, N::Int64) return [ rand(Poisson(n*p[1]),N)'; rand(Poisson(n*p[2]),N)'; rand(Poisson(n*p[3]),N)'; rand(Poisson(n*p[4]),N)'; ] end function randMultinomial(n::Int64, p::AbstractArray{Float64}, N::Int64) return rand(Multinomial(n,p), N) end function randBinomial(n::Int64, p::AbstractArray{Float64}, N::Int64) m = Int64(round(n*(p[1]+p[2]))) q1 = p[1]/(p[1]+p[2]) q2 = p[3]/(p[3]+p[4]) return [rand(Multinomial(m, [q1, 1.0-q1]), N); rand(Multinomial(n-m, [q2, 1.0-q2]), N)] end function randHypergeometric(n::Int64, p::AbstractArray{Float64}, N::Int64) m1 = Int64(round(n*(p[1]+p[2]))) m2 = Int64(round(n*(p[3]+p[4]))) n1 = Int64(round(n*(p[1]+p[3]))) a = rand(Hypergeometric(m1, m2, n1), N)' b = m1 .- a c = n1 .- a d = m2 .- c return [a; b; c; d] end function pvaluesby(sampler::T, n::Int64; N = 10^4, alpha = 0.05) where T<:Function px = collect(0.05:0.05:0.50) py = px np = length(px) prob_chisq = Array{Float64,2}(np,np) prob_g = Array{Float64,2}(np,np) prob_fisher = Array{Float64,2}(np,np) a = Array{Float64,2}(4,N) for i in 1:np for j in 1:np if i > j prob_chisq[i,j] = prob_chisq[j,i] prob_g[i,j] = prob_g[j,i] prob_fisher[i,j] = prob_fisher[j,i] else a = sampler(n, prodprob(px[i],py[j]), N) prob_chisq[i,j] = ecdf([chisqtest(a[:,i]) for i in 1:size(a,2)], alpha) prob_g[i,j] = ecdf([gtest(a[:,i]) for i in 1:size(a,2)], alpha) prob_fisher[i,j] = ecdf([fishertest(a[:,i]) for i in 1:size(a,2)], alpha) end end end return alpha, px, prob_chisq, prob_g, prob_fisher end function plotcomparisontest(sampler, n; N = 10^4, alpha = 0.05) alpha, px, prob_chisq, prob_g, prob_fisher = pvaluesby(sampler, n; N = N, alpha = alpha) py = px np = length(px) ps = [0.0;px] vmin = 0.0 vmax = 2*alpha cmap = "RdBu_r" figure(figsize=(8,6.4)) ax1 = subplot2grid((16,20), (0,0), rowspan=7, colspan=8) pcolormesh(ps, ps, prob_chisq, cmap=cmap, vmin=vmin, vmax=vmax) colorbar(label="probability of p-value \$\\leqq\$ $alpha") title("\$\\chi^2\$-test") xlabel("marginal probability", fontsize=8) ylabel("marginal probability", fontsize=8) ax2 = subplot2grid((16,20), (0,12), rowspan=7, colspan=8) pcolormesh(ps, ps, prob_g, cmap=cmap, vmin=vmin, vmax=vmax) colorbar(label="probability of p-value \$\\leqq\$ $alpha") title("G-test") xlabel("marginal probability", fontsize=8) ylabel("marginal probability", fontsize=8) ax3 = subplot2grid((16,20), (9,0), rowspan=7, colspan=8) pcolormesh(ps, ps, prob_fisher, cmap=cmap, vmin=vmin, vmax=vmax) colorbar(label="probability of p-value \$\\leqq\$ $alpha") title("Fisher's exact test") xlabel("marginal probability", fontsize=8) ylabel("marginal probability", fontsize=8) suptitle("sampler = $(typeof(sampler)), n = $n, \$\\alpha\$ = $alpha") end @time plotcomparisontest(randPoisson, 25, N=10000, alpha=0.05) @time plotcomparisontest(randPoisson, 50, N=10000, alpha=0.05) @time plotcomparisontest(randPoisson, 100, N=10000, alpha=0.05) @time plotcomparisontest(randPoisson, 200, N=10000, alpha=0.05) @time plotcomparisontest(randMultinomial, 25, N=10000, alpha=0.05) @time plotcomparisontest(randMultinomial, 50, N=10000, alpha=0.05) @time plotcomparisontest(randMultinomial, 100, N=10000, alpha=0.05) @time plotcomparisontest(randMultinomial, 200, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 25, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 50, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 100, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 200, N=10000, alpha=0.05) @time plotcomparisontest(randHypergeometric, 25, N=10000, alpha=0.05) @time plotcomparisontest(randHypergeometric, 50, N=10000, alpha=0.05) @time plotcomparisontest(randHypergeometric, 100, N=10000, alpha=0.05) @time plotcomparisontest(randHypergeometric, 200, N=10000, alpha=0.05) function randprob(n, p,q) P = prodprob(p,q) R = similar(P) for i in 1:10000 R = P .* (1.0 .+ 0.5*rand(4)) R = R/sum(R) if 0.4 < chisqtest(Int64.(round.(n*R))) < 0.6 break end end return R end function randpvaluesby(sampler::T, n::Int64; N = 10^4, alpha = 0.05) where T<:Function px = collect(0.05:0.05:0.50) py = px np = length(px) prob_chisq = Array{Float64,2}(np,np) prob_g = Array{Float64,2}(np,np) prob_fisher = Array{Float64,2}(np,np) a = Array{Float64,2}(4,N) for i in 1:np for j in 1:np a = sampler(n, randprob(n, px[i],py[j]), N) prob_chisq[i,j] = ecdf([chisqtest(a[:,i]) for i in 1:size(a,2)], alpha) prob_g[i,j] = ecdf([gtest(a[:,i]) for i in 1:size(a,2)], alpha) prob_fisher[i,j] = ecdf([fishertest(a[:,i]) for i in 1:size(a,2)], alpha) end end return alpha, px, prob_chisq, prob_g, prob_fisher end function plotrandcomparisontest(sampler, n; N = 10^4, alpha = 0.05) alpha, px, prob_chisq, prob_g, prob_fisher = randpvaluesby(sampler, n; N = N, alpha = alpha) py = px np = length(px) ps = [0.0;px] vmin = 0.0 vmax = 2*alpha cmap = "RdBu_r" figure(figsize=(8,6.4)) ax1 = subplot2grid((16,20), (0,0), rowspan=7, colspan=8) pcolormesh(ps, ps, prob_chisq, cmap=cmap, vmin=vmin, vmax=vmax) colorbar(label="probability of p-value \$\\leqq\$ $alpha") title("\$\\chi^2\$-test") xlabel("marginal probability", fontsize=8) ylabel("marginal probability", fontsize=8) ax2 = subplot2grid((16,20), (0,12), rowspan=7, colspan=8) pcolormesh(ps, ps, prob_g, cmap=cmap, vmin=vmin, vmax=vmax) colorbar(label="probability of p-value \$\\leqq\$ $alpha") title("G-test") xlabel("marginal probability", fontsize=8) ylabel("marginal probability", fontsize=8) ax3 = subplot2grid((16,20), (9,0), rowspan=7, colspan=8) pcolormesh(ps, ps, prob_fisher, cmap=cmap, vmin=vmin, vmax=vmax) colorbar(label="probability of p-value \$\\leqq\$ $alpha") title("Fisher's exact test") xlabel("marginal probability", fontsize=8) ylabel("marginal probability", fontsize=8) suptitle("Dependent case: sampler = $(typeof(sampler)), n = $n, \$\\alpha\$ = $alpha") end n = 100 @show p = prodprob(0.2,0.3) @show r = randprob(n,0.2,0.3) chisqtest(Int.(round.(n*r))) n = 100 @time reshape([chisqtest(Int64.(round.(n*randprob(n,p,q)))) for p in 0.05:0.05:0.5 for q in 0.05:0.05:0.5],10,10) @time plotcomparisontest(randMultinomial, 25, N=10000, alpha=0.05) @time plotrandcomparisontest(randMultinomial, 25, N=10000, alpha=0.05) @time plotcomparisontest(randMultinomial, 50, N=10000, alpha=0.05) @time plotrandcomparisontest(randMultinomial, 50, N=10000, alpha=0.05) @time plotcomparisontest(randMultinomial, 100, N=10000, alpha=0.05) @time plotrandcomparisontest(randMultinomial, 100, N=10000, alpha=0.05) @time plotcomparisontest(randMultinomial, 200, N=10000, alpha=0.05) @time plotrandcomparisontest(randMultinomial, 200, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 25, N=10000, alpha=0.05) @time plotrandcomparisontest(randBinomial, 25, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 50, N=10000, alpha=0.05) @time plotrandcomparisontest(randBinomial, 50, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 100, N=10000, alpha=0.05) @time plotrandcomparisontest(randBinomial, 100, N=10000, alpha=0.05) @time plotcomparisontest(randBinomial, 200, N=10000, alpha=0.05) @time plotrandcomparisontest(randBinomial, 200, N=10000, alpha=0.05)