using Distributions
using Roots
using Memoize
using Plots
pyplot(fmt = :svg)
# ≈ \approx TAB
# ≉ \napprox TAB
# ⪅ \lessapprox TAB
# ⪆ \gtrapprox TAB
# ⪉ \lnapprox TAB
# ⪊ \gnapprox TAB
# α \alpha TAB
"""
x ⪅ y
Less-than-or-approximately-equal-to comparison operator.
"""
⪅(x, y) = x < y || x ≈ y
"""
x ⪉ y
Less-than-and-not-approximately-equal-to comparison operator.
"""
⪉(x, y) = x < y && x ≉ y
"""
x ⪆ y
Greater-than-or-approximately-equal-to comparison operator.
"""
⪆(x, y) = ⪅(y, x)
"""
x ⪊ y
Greater-than-and-not-approximately-equal-to comparison operator.
"""
⪊(x, y) = x > y && x ≉ y
⪊
どれも数行で書けてしまっている.
@doc raw"""
`pvalue_bin(n, k, p)` = n回中k回成功したときの「成功確率はpである」という仮説のp値.
```math
\mathrm{pvalue\_bin}(n, k, p) = \sum_{P_{n,p}(j)\le P_{n,p}(k)} P_{n,p}(j),
\quad\text{where}\ P_{n,p}(j) = \binom{n}{j}p^{j}(1-p)^{n-j}.
```
"""
@memoize function pvalue_bin(n, k, p)
bin = Binomial(n, p) # 成功確率pの独立試行をn回行うという設定の二項分布
q = pdf(bin, k) # 二項分布binにおいてk回成功する確率
sum(pdf(bin, j) for j in support(bin) if pdf(bin, j) ⪅ q)
end
"""
`ci_bin(n, k, α)` = n回中k回成功した場合の信頼係数1-αの信頼区間.
信頼区間はP値がα以上になるモデル内での成功確率p全体の集合として定義される.
"""
@memoize function ci_bin(n, k, α)
f(p) = pvalue_bin(n, k, p) - α
find_zeros(f, 0, 1) # p値がαに等しくなるpをすべて求める.
end
"""
`real_significance_level_bin(n, p, α)` = 「成功確率pでn回の独立試行」というモデルの中でp値がα未満になる確率
"""
@memoize function real_significance_level_bin(n, p, α)
null = Binomial(n, p)
sum(pdf(null, j) for j in support(null) if pvalue_bin(n, j, p) < α)
end
real_significance_level_bin
"""
plot_pvals_and_ci_bin(n, k, α;
μ = k/n, σ = √(k*(n-k)/n)/n,
pmin = max(0, μ - 5σ), pmax = min(1, μ + 5σ)
)
はp値と有意水準と信頼区間をプロットしてくれる.
"""
function plot_pvals_and_ci_bin(n, k, α;
μ = k/n, σ = √(k*(n-k)/n)/n,
pmin = max(0, μ - 5σ), pmax = min(1, μ + 5σ)
)
ps = range(pmin, pmax; length=400)
pvals = pvalue_bin.(n, k, ps)
confint = ci_bin(n, k, α)
plot(title="Sample: n = $n, k = $k")
plot!(xtick=0:0.05:1, ytick=0:0.05:1)
plot!(xlim=extrema(ps))
plot!(ps, pvals; label="p-value", xlabel="p")
hline!([α]; label="α = $α", ls=:auto)
plot!([minimum(confint), maximum(confint)], [0, 0];
label="confidence interval", lw=3, color=:red)
end
plot_pvals_and_ci_bin
n, k, p = 16, 11, 0.45
null = Binomial(n, p)
supp = support(null)
A = @. pdf(null, supp) ⪊ pdf(null, k)
B = @. pdf(null, supp) ⪅ pdf(null, k)
@show pvalue_bin(n, k, p)
println()
s = 0.0
for j in supp
if pdf(null, j) ⪅ pdf(null, k)
println("pdf(Binomial($n, $p), $j) = ", pdf(null, j))
s += pdf(null, j)
end
end
println("sum = ", s)
plot(title="How to calculate the p-value for n=$n, k=$k, p=$p")
plot!(xtick=supp)
bar!(supp[A], pdf.(null, supp[A]); color=:white, label="")
hline!([pdf(null, k)]; label="pdf(Binomial($n, $p), $k)", ls=:auto)
bar!(supp[B], pdf.(null, supp[B]); color=:red, label="")
pvalue_bin(n, k, p) = 0.07674943617736227 pdf(Binomial(16, 0.45), 0) = 7.01137235467105e-5 pdf(Binomial(16, 0.45), 1) = 0.0009178523809751176 pdf(Binomial(16, 0.45), 2) = 0.005632275974165504 pdf(Binomial(16, 0.45), 3) = 0.021505053719541024 pdf(Binomial(16, 0.45), 11) = 0.03368478104217074 pdf(Binomial(16, 0.45), 12) = 0.011483448082558188 pdf(Binomial(16, 0.45), 13) = 0.002890937978825841 pdf(Binomial(16, 0.45), 14) = 0.0005068527625214144 pdf(Binomial(16, 0.45), 15) = 5.529302863869957e-5 pdf(Binomial(16, 0.45), 16) = 2.827484419024415e-6 sum = 0.07674943617736227
n, k, p, α = 10, 7, 0.4, 0.05
@show n, k, p, α
@show pval = pvalue_bin(n, k, p)
@show confint = ci_bin(n, k, α)
plot_pvals_and_ci_bin(n, k, α)
(n, k, p, α) = (10, 7, 0.4, 0.05) pval = pvalue_bin(n, k, p) = 0.10111928319999997 confint = ci_bin(n, k, α) = [0.3805893410867135, 0.9127355660858496]
n, k, p, α = 100, 61, 0.5, 0.05
@show n, k, p, α
@show pval = pvalue_bin(n, k, p)
@show confint = ci_bin(n, k, α)
plot_pvals_and_ci_bin(n, k, α)
(n, k, p, α) = (100, 61, 0.5, 0.05) pval = pvalue_bin(n, k, p) = 0.035200200217704834 confint = ci_bin(n, k, α) = [0.5100332391347729, 0.7005721422210581]
n, k, p, α = 20, 2, 0.3, 0.05
@show n, k, p, α
@show pval = pvalue_bin(n, k, p)
@show confint = ci_bin(n, k, α)
plot_pvals_and_ci_bin(n, k, α)
(n, k, p, α) = (20, 2, 0.3, 0.05) pval = pvalue_bin(n, k, p) = 0.052627948729727085 confint = ci_bin(n, k, α) = [0.018065203085418622, 0.3199876494802321]
n, p, α = 100, 0.4, 0.05
@show real_significance_level_bin(n, p, α)
L = 10^4
null_sample = rand(Binomial(n, p), L)
pvals = pvalue_bin.(n, null_sample, p)
@show count(pvals .< 0.05)/L
ps = range(0, 1; length=100)
rsl = real_significance_level_bin.(n, ps, α)
plot(; title = "n = $n, α = $α")
plot!(ps, rsl; label="real significance level")
hline!([α]; label="α = $α", ls=:auto)
plot!(xlabel="p", xtick=0:0.1:1)
plot!(xlim=extrema(ps), ylim=(0, 1.4α))
real_significance_level_bin(n, p, α) = 0.04154450961965445 count(pvals .< 0.05) / L = 0.0362
?⪅
"⪅" can be typed by \lessapprox<tab> search: ⪅
x ⪅ y
Less-than-or-approximately-equal-to comparison operator.
?⪉
"⪉" can be typed by \lnapprox<tab> search: ⪉
x ⪉ y
Less-than-and-not-approximately-equal-to comparison operator.
?⪆
"⪆" can be typed by \gtrapprox<tab> search: ⪆
x ⪆ y
Greater-than-or-approximately-equal-to comparison operator.
?⪊
"⪊" can be typed by \gnapprox<tab> search: ⪊
x ⪊ y
Greater-than-and-not-approximately-equal-to comparison operator.
?pvalue_bin
search: pvalue_bin
pvalue_bin(n, k, p)
= n回中k回成功したときの「成功確率はpである」という仮説のp値.
?ci_bin
search: ci_bin plot_pvals_and_ci_bin
ci_bin(n, k, α)
= n回中k回成功した場合の信頼係数1-αの信頼区間.
信頼区間はP値がα以上になるモデル内での成功確率p全体の集合として定義される.
?real_significance_level_bin
search: real_significance_level_bin
real_significance_level_bin(n, p, α)
= 「成功確率pでn回の独立試行」というモデルの中でp値がα未満になる確率
?plot_pvals_and_ci_bin
search: plot_pvals_and_ci_bin
plot_pvals_and_ci_bin(n, k, α;
μ = k/n, σ = √(k*(n-k)/n)/n,
pmin = max(0, μ - 5σ), pmax = min(1, μ + 5σ)
)
はp値と有意水準と信頼区間をプロットしてくれる.