Berdasarkan isu #140: Uji Kolmogorov-Smirnov
Referensi Isu:
Deskripsi Isu:
Strategi:
scipy.stats.kstest
.try:
import hidrokit
except ModuleNotFoundError:
# saat dibuat menggunakan cabang @dev/dev0.3.7
!pip install git+https://github.com/taruma/hidrokit.git@latest -q
Building wheel for hidrokit (setup.py) ... done
import numpy as np
import pandas as pd
from scipy import stats
from hidrokit.contrib.taruma import hk172, hk124, hk127, hk126
frek_normal, frek_lognormal, frek_gumbel, frek_logpearson3 = hk172, hk124, hk127, hk126
# contoh data diambil dari buku
# limantara hal. 114
_HUJAN = np.array([85, 92, 115, 116, 122, 52, 69, 95, 96, 105])
_TAHUN = np.arange(1998, 2008) # 1998-2007
data = pd.DataFrame(
data=np.stack([_TAHUN, _HUJAN], axis=1),
columns=['tahun', 'hujan']
)
data.tahun = pd.to_datetime(data.tahun, format='%Y')
data.set_index('tahun', inplace=True)
data
hujan | |
---|---|
tahun | |
1998-01-01 | 85 |
1999-01-01 | 92 |
2000-01-01 | 115 |
2001-01-01 | 116 |
2002-01-01 | 122 |
2003-01-01 | 52 |
2004-01-01 | 69 |
2005-01-01 | 95 |
2006-01-01 | 96 |
2007-01-01 | 105 |
Terdapat 2 tabel untuk modul hk140
yaitu:
t_dcr_st
: Tabel nilai kritis (Dcr) Untuk Uji Kolmogorov-Smirnov dari buku Rekayasa Statistika untuk Teknik Pengairan oleh Soetopo.t_dcr_sw
: Tabel nilai kritis Do Untuk Uji Smirnov-Kolmogorov dari buku hidrologi: Aplikasi Metode Statistik untuk Analisa Data oleh Soewarno.Dalam modul hk126
nilai $\Delta_{kritis}$ akan dibangkitkan menggunakan fungsi scipy.stats.ksone.ppf
secara default
. Mohon diperhatikan jika ingin menggunakan nilai $\Delta_{kritis}$ yang berasal dari sumber lain.
# tabel dari soetopo hal. 139
# Tabel Nilai Kritis (Dcr) Untuk Uji Kolmogorov-Smirnov
# KODE: ST
_DATA_ST = [
[0.900, 0.925, 0.950, 0.975, 0.995],
[0.684, 0.726, 0.776, 0.842, 0.929],
[0.565, 0.597, 0.642, 0.708, 0.829],
[0.494, 0.525, 0.564, 0.624, 0.734],
[0.446, 0.474, 0.510, 0.563, 0.669],
[0.410, 0.436, 0.470, 0.521, 0.618],
[0.381, 0.405, 0.438, 0.486, 0.577],
[0.358, 0.381, 0.411, 0.457, 0.543],
[0.339, 0.360, 0.388, 0.432, 0.514],
[0.322, 0.342, 0.368, 0.409, 0.486],
[0.307, 0.326, 0.352, 0.391, 0.468],
[0.295, 0.313, 0.338, 0.375, 0.450],
[0.284, 0.302, 0.325, 0.361, 0.433],
[0.274, 0.292, 0.314, 0.349, 0.418],
[0.266, 0.283, 0.304, 0.338, 0.404],
[0.258, 0.274, 0.295, 0.328, 0.391],
[0.250, 0.266, 0.286, 0.318, 0.380],
[0.244, 0.259, 0.278, 0.309, 0.370],
[0.237, 0.252, 0.272, 0.301, 0.361],
[0.231, 0.246, 0.264, 0.294, 0.352],
]
_INDEX_ST = range(1, 21)
_COL_ST = [0.2, 0.15, 0.1, 0.05, 0.01]
t_dcr_st = pd.DataFrame(
data=_DATA_ST, index=_INDEX_ST, columns=_COL_ST
)
t_dcr_st
0.20 | 0.15 | 0.10 | 0.05 | 0.01 | |
---|---|---|---|---|---|
1 | 0.900 | 0.925 | 0.950 | 0.975 | 0.995 |
2 | 0.684 | 0.726 | 0.776 | 0.842 | 0.929 |
3 | 0.565 | 0.597 | 0.642 | 0.708 | 0.829 |
4 | 0.494 | 0.525 | 0.564 | 0.624 | 0.734 |
5 | 0.446 | 0.474 | 0.510 | 0.563 | 0.669 |
6 | 0.410 | 0.436 | 0.470 | 0.521 | 0.618 |
7 | 0.381 | 0.405 | 0.438 | 0.486 | 0.577 |
8 | 0.358 | 0.381 | 0.411 | 0.457 | 0.543 |
9 | 0.339 | 0.360 | 0.388 | 0.432 | 0.514 |
10 | 0.322 | 0.342 | 0.368 | 0.409 | 0.486 |
11 | 0.307 | 0.326 | 0.352 | 0.391 | 0.468 |
12 | 0.295 | 0.313 | 0.338 | 0.375 | 0.450 |
13 | 0.284 | 0.302 | 0.325 | 0.361 | 0.433 |
14 | 0.274 | 0.292 | 0.314 | 0.349 | 0.418 |
15 | 0.266 | 0.283 | 0.304 | 0.338 | 0.404 |
16 | 0.258 | 0.274 | 0.295 | 0.328 | 0.391 |
17 | 0.250 | 0.266 | 0.286 | 0.318 | 0.380 |
18 | 0.244 | 0.259 | 0.278 | 0.309 | 0.370 |
19 | 0.237 | 0.252 | 0.272 | 0.301 | 0.361 |
20 | 0.231 | 0.246 | 0.264 | 0.294 | 0.352 |
# tabel dari soewarno hal. 139
# Tabel Nilai Kritis (Dcr) Untuk Uji Kolmogorov-Smirnov
# KODE: SW
_DATA_SW = [
[0.45, 0.51, 0.56, 0.67],
[0.32, 0.37, 0.41, 0.49],
[0.27, 0.3 , 0.34, 0.4 ],
[0.23, 0.26, 0.29, 0.35],
[0.21, 0.24, 0.26, 0.32],
[0.19, 0.22, 0.24, 0.29],
[0.18, 0.2 , 0.22, 0.27],
[0.17, 0.19, 0.21, 0.25],
[0.16, 0.18, 0.2 , 0.24],
[0.15, 0.17, 0.19, 0.23]
]
_INDEX_SW = range(5, 51, 5)
_COL_SW = [0.2, 0.1, 0.05, 0.01]
t_dcr_sw = pd.DataFrame(
data=_DATA_SW, index=_INDEX_SW, columns=_COL_SW
)
t_dcr_sw
0.20 | 0.10 | 0.05 | 0.01 | |
---|---|---|---|---|
5 | 0.45 | 0.51 | 0.56 | 0.67 |
10 | 0.32 | 0.37 | 0.41 | 0.49 |
15 | 0.27 | 0.30 | 0.34 | 0.40 |
20 | 0.23 | 0.26 | 0.29 | 0.35 |
25 | 0.21 | 0.24 | 0.26 | 0.32 |
30 | 0.19 | 0.22 | 0.24 | 0.29 |
35 | 0.18 | 0.20 | 0.22 | 0.27 |
40 | 0.17 | 0.19 | 0.21 | 0.25 |
45 | 0.16 | 0.18 | 0.20 | 0.24 |
50 | 0.15 | 0.17 | 0.19 | 0.23 |
# KODE FUNGSI INTERPOLASI DARI TABEL
from scipy import interpolate
def _func_interp_bivariate(df):
"Membuat fungsi dari tabel untuk interpolasi bilinear"
table = df[df.columns.sort_values()].sort_index().copy()
x = table.index
y = table.columns
z = table.to_numpy()
# penggunaan kx=1, ky=1 untuk interpolasi linear antara 2 titik
# tidak menggunakan (cubic) spline interpolation
return interpolate.RectBivariateSpline(x, y, z, kx=1, ky=1)
def _as_value(x, dec=4):
x = np.around(x, dec)
return x.flatten() if x.size > 1 else x.item()
def _calc_k(x):
return (x - x.mean()) / x.std()
table_source = {
'soewarno': t_dcr_sw,
'soetopo': t_dcr_st
}
anfrek = {
'normal': frek_normal,
'lognormal': frek_lognormal,
'gumbel': frek_gumbel,
'logpearson3': frek_logpearson3
}
def calc_dcr(alpha, n, source='scipy'):
alpha = np.array(alpha)
if source.lower() == 'scipy':
# ref: https://stackoverflow.com/questions/53509986/
return stats.ksone.ppf(1-alpha/2, n)
elif source.lower() in table_source.keys():
func_table = _func_interp_bivariate(table_source[source.lower()])
# untuk soewarno 2 angka dibelakang koma, dan soetopo = 3
dec = (source.lower() == 'soetopo') + 2
return _as_value(func_table(n, alpha, grid=False), dec)
def kstest(
df, col=None, dist='normal', source_dist=None,
alpha=0.05, source_dcr='scipy', show_stat=True, report='result'
):
if source_dist is None:
source_dist = (
"scipy"
if dist.lower() in ["normal", "lognormal", "logpearson3"]
else "gumbel"
)
col = df.columns[0] if col is None else col
data = df[[col]].copy()
n = len(data)
data = data.rename({col: 'x'}, axis=1)
data = data.sort_values('x')
data['no'] = np.arange(n) + 1
# w = weibull
data['p_w'] = data.no / (n+1)
if dist.lower() in ['normal', 'gumbel']:
data['k'] = _calc_k(data.x)
if dist.lower() in ['lognormal', 'logpearson3']:
data['log_x'] = np.log10(data.x)
data['k'] = _calc_k(data.log_x)
func = anfrek[dist.lower()]
if dist.lower() in ['normal', 'lognormal']:
parameter = ()
elif dist.lower() == 'gumbel':
parameter = (n,)
elif dist.lower() == 'logpearson3':
parameter = (data.log_x.skew(),)
# d = distribusi
data['p_d'] = func.calc_prob(data.k, source=source_dist, *parameter)
data['d'] = (data.p_w - data.p_d).abs()
dmax = data.d.max()
dcr = calc_dcr(alpha, n, source=source_dcr)
result = int(dmax < dcr)
result_text = ['Distribusi Tidak Diterima', 'Distribusi Diterima']
if show_stat:
print(f'Periksa Kecocokan Distribusi {dist.title()}')
print(f'Delta Kritikal = {dcr:.5f}')
print(f'Delta Max = {dmax:.5f}')
print(f'Result (Dmax < Dcr) = {result_text[result]}')
if report.lower() == 'result':
return data['no x p_w p_d d'.split()]
elif report.lower() == 'full':
return data
calc_dcr(alpha, n, ...)
¶Function: calc_dcr(alpha, n, source='scipy')
Fungsi calc_dcr(...)
digunakan untuk mencari nilai Delta kritis (Dcr / $\Delta_{kritis}$) dari berbagai sumber berdasarkan nilai derajat kepercayaan (level of significance) $\alpha$ dan jumlah banyaknya data $n$.
alpha
: Nilai level of significance $\alpha$. Dalam satuan desimal ($\left(0,1\right) \in \mathbb{R}$).n
: Jumlah banyaknya data.source
: sumber nilai Dcr
. 'scipy'
(default). Sumber yang dapat digunakan antara lain: Soetopo ('soetopo'
), Soewarno ('soewarno'
).Perlu dicatat bahwa batas nilai $\alpha$ dan $n$ untuk masing-masing tabel berbeda-beda.
soetopo
batasan dimulai dari $\alpha = \left[0.2,0.01\right]$ dengan $n = \left[1,20\right]$soewarno
batasan dimulai dari $\alpha = \left[0.2,0.01\right]$ dengan $n = \left[5,50\right]$Untuk $n > 50$ disarankan menggunakan scipy
.
calc_dcr(0.2, 10)
0.32260155962627957
calc_dcr(0.15, 10, source='soetopo')
0.342
# perbandingan antara nilai tabel dan fungsi scipy
source_test = ['soewarno', 'soetopo', 'scipy']
_n = 10
_alpha = [0.2, 0.15, 0.1, 0.07, 0.05, 0.01]
for _source in source_test:
print(f'Dcr {_source:<12}=', calc_dcr(_alpha, _n, source=_source))
Dcr soewarno = [0.32 0.35 0.37 0.39 0.41 0.49] Dcr soetopo = [0.322 0.342 0.368 0.393 0.409 0.486] Dcr scipy = [0.32260156 0.34250845 0.36866333 0.3901533 0.40924614 0.48893166]
kstest(df, ...)
¶Function: kstest(df, col=None, dist='normal', source_dist='scipy', alpha=0.05, source_dcr='scipy', show_stat=True, report='result')
Fungsi kstest(...)
merupakan fungsi untuk melakukan uji kolmogorov-smirnov terhadap distribusi yang dibandingkan. Fungsi ini mengeluarkan objek pandas.DataFrame
.
df
: pandas.DataFrame
.col
: nama kolom, None
(default). Jika tidak diisi menggunakan kolom pertama dalam df
sebagai data masukan.dist
: distribusi yang dibandingkan, 'normal'
(distribusi normal) (default). Distribusi yang dapat digunakan antara lain: Log Normal ('lognormal'
), Gumbel ('gumbel'
), Log Pearson 3 ('logpearson3'
).source_dist
: sumber perhitungan distribusi, 'scipy'
(default). Lihat masing-masing modul analisis frekuensi untuk lebih jelasnya.alpha
: nilai $\alpha$, 0.05
(default).source_dcr
: sumber nilai Dcr, 'scipy'
(default). Sumber yang dapat digunakan antara lain: Soetopo ('soetopo'
), Soewarno ('soewarno'
).show_stat
: menampilkan hasil luaran uji, True
(default).report
: opsi kolom luaran dataframe, 'result'
(default). Untuk melihat kolom perhitungan yang lainnya gunakan 'full'
.kstest(data)
Periksa Kecocokan Distribusi Normal Delta Kritikal = 0.40925 Delta Max = 0.09609 Result (Dmax < Dcr) = Distribusi Diterima
no | x | p_w | p_d | d | |
---|---|---|---|---|---|
tahun | |||||
2003-01-01 | 1 | 52 | 0.090909 | 0.025435 | 0.065474 |
2004-01-01 | 2 | 69 | 0.181818 | 0.119957 | 0.061862 |
1998-01-01 | 3 | 85 | 0.272727 | 0.328681 | 0.055953 |
1999-01-01 | 4 | 92 | 0.363636 | 0.450869 | 0.087233 |
2005-01-01 | 5 | 95 | 0.454545 | 0.505473 | 0.050927 |
2006-01-01 | 6 | 96 | 0.545455 | 0.523702 | 0.021753 |
2007-01-01 | 7 | 105 | 0.636364 | 0.681178 | 0.044815 |
2000-01-01 | 8 | 115 | 0.727273 | 0.823367 | 0.096095 |
2001-01-01 | 9 | 116 | 0.818182 | 0.834972 | 0.016790 |
2002-01-01 | 10 | 122 | 0.909091 | 0.894052 | 0.015039 |
kstest(data, dist='gumbel', source_dist='soetopo');
Periksa Kecocokan Distribusi Gumbel Delta Kritikal = 0.40925 Delta Max = 0.14036 Result (Dmax < Dcr) = Distribusi Diterima
kstest(data, dist='logpearson3', alpha=0.2, source_dcr='soetopo', report='full')
Periksa Kecocokan Distribusi Logpearson3 Delta Kritikal = 0.32200 Delta Max = 0.87290 Result (Dmax < Dcr) = Distribusi Tidak Diterima
x | no | p_w | log_x | k | p_d | d | |
---|---|---|---|---|---|---|---|
tahun | |||||||
2003-01-01 | 52 | 1 | 0.090909 | 1.716003 | -2.179769 | 0.963805 | 0.872896 |
2004-01-01 | 69 | 2 | 0.181818 | 1.838849 | -1.100345 | 0.868202 | 0.686384 |
1998-01-01 | 85 | 3 | 0.272727 | 1.929419 | -0.304525 | 0.689708 | 0.416980 |
1999-01-01 | 92 | 4 | 0.363636 | 1.963788 | -0.002531 | 0.584537 | 0.220900 |
2005-01-01 | 95 | 5 | 0.454545 | 1.977724 | 0.119920 | 0.535544 | 0.080999 |
2006-01-01 | 96 | 6 | 0.545455 | 1.982271 | 0.159879 | 0.518805 | 0.026650 |
2007-01-01 | 105 | 7 | 0.636364 | 2.021189 | 0.501845 | 0.363052 | 0.273312 |
2000-01-01 | 115 | 8 | 0.727273 | 2.060698 | 0.849000 | 0.196202 | 0.531070 |
2001-01-01 | 116 | 9 | 0.818182 | 2.064458 | 0.882039 | 0.181020 | 0.637161 |
2002-01-01 | 122 | 10 | 0.909091 | 2.086360 | 1.074487 | 0.099729 | 0.809362 |
- 20220613 - 1.1.0 / v0.4.1 - perbaikan, ubah source_dist jadi None dengan default scipy kecuali gumbel.
- 20220316 - 1.0.0 - Initial
Source code in this notebook is licensed under a MIT License. Data in this notebook is licensed under a Creative Common Attribution 4.0 International.