Berdasarkan isu #141: Uji Chi-Square
Referensi Isu:
Deskripsi Isu:
Diskusi Isu:
Strategi:
scipy.stats.chisquare
.try:
import hidrokit
except ModuleNotFoundError:
# saat dibuat menggunakan cabang @dev/dev0.3.7
!pip install git+https://github.com/taruma/hidrokit.git@latest -q
Building wheel for hidrokit (setup.py) ... done
import numpy as np
import pandas as pd
from scipy import stats
from hidrokit.contrib.taruma import hk172, hk124, hk127, hk126
frek_normal, frek_lognormal, frek_gumbel, frek_logpearson3 = hk172, hk124, hk127, hk126
# contoh data diambil dari buku
# limantara hal. 118
_HUJAN = np.array([85, 92, 115, 116, 122, 52, 69, 95, 96, 105])
_TAHUN = np.arange(1998, 2008) # 1998-2007
data = pd.DataFrame(
data=np.stack([_TAHUN, _HUJAN], axis=1),
columns=['tahun', 'hujan']
)
data.tahun = pd.to_datetime(data.tahun, format='%Y')
data.set_index('tahun', inplace=True)
data
hujan | |
---|---|
tahun | |
1998-01-01 | 85 |
1999-01-01 | 92 |
2000-01-01 | 115 |
2001-01-01 | 116 |
2002-01-01 | 122 |
2003-01-01 | 52 |
2004-01-01 | 69 |
2005-01-01 | 95 |
2006-01-01 | 96 |
2007-01-01 | 105 |
Terdapat 1 tabel untuk modul hk141
yaitu:
t_chi_lm
: Tabel nilai kritis untuk Distribusi Chi Square ($X^2$) dari buku Rekayasa Hidrologi oleh Limantara.Dalam modul hk141
nilai kritis $X^2$ akan dibangkitkan menggunakan fungsi scipy.stats.chi2.isf
secara default
. Mohon diperhatikan jika ingin menggunakan nilai $X^2$ yang berasal dari sumber lain.
# tabel dari limantara hal. 117
# Tabel Nilai Kritis untuk Distribusi Chi Square (X^2)
# KODE: LM
_DATA_LM = [
[0.039, 0.016, 0.698, 0.393, 3.841, 5.024, 6.635, 7.879],
[0.100, 0.201, 0.506, 0.103, 5.991, 0.738, 9.210, 10.597],
[0.717, 0.115, 0.216, 0.352, 7.815, 9.348, 11.345, 12.838],
[0.207, 0.297, 0.484, 0.711, 9.488, 11.143, 13.277, 14.860],
[0.412, 0.554, 0.831, 1.145, 11.070, 12.832, 15.086, 16.750],
[0.676, 0.872, 1.237, 1.635, 12.592, 14.449, 16.812, 18.548],
[0.989, 1.239, 1.690, 2.167, 14.067, 16.013, 18.475, 20.278],
[1.344, 1.646, 2.180, 2.733, 15.507, 17.535, 20.090, 21.955],
[1.735, 2.088, 2.700, 3.325, 16.919, 19.023, 21.666, 23.589],
[2.156, 2.558, 3.247, 3.940, 18.307, 20.483, 23.209, 25.188],
[2.603, 3.053, 3.816, 4.575, 19.675, 21.920, 24.725, 26.757],
[3.074, 3.571, 4.404, 5.226, 21.026, 23.337, 26.217, 28.300],
[3.565, 4.107, 5.009, 5.892, 22.362, 24.736, 27.688, 29.819],
[4.075, 4.660, 5.629, 6.571, 23.685, 26.119, 29.141, 31.319],
[4.601, 5.229, 6.262, 7.261, 24.996, 27.488, 30.578, 32.801],
[5.142, 5.812, 6.908, 7.962, 26.296, 28.845, 32.000, 34.267],
[5.697, 6.408, 7.564, 8.672, 27.587, 30.191, 33.409, 35.718],
[6.265, 7.015, 8.231, 9.390, 28.869, 31.526, 34.805, 37.156],
[6.884, 7.633, 8.907, 10.117, 30.144, 32.852, 36.191, 38.582],
[7.434, 8.260, 9.591, 10.851, 31.410, 34.170, 37.566, 39.997],
[8.034, 8.897, 10.283, 11.591, 32.671, 35.479, 38.932, 41.401],
[8.643, 9.542, 10.982, 12.338, 33.924, 36.781, 40.289, 42.796],
[9.260, 10.196, 11.689, 13.091, 36.172, 38.076, 41.638, 44.181],
[9.886, 10.856, 12.401, 13.848, 36.415, 39.364, 42.980, 45.558],
[10.520, 11.524, 13.120, 14.611, 37.652, 40.646, 44.314, 46.928],
[11.160, 12.198, 13.844, 15.379, 38.885, 41.923, 45.642, 48.290],
[11.808, 12.879, 14.573, 16.151, 40.113, 43.194, 46.963, 49.645],
[12.461, 13.565, 15.308, 16.928, 41.337, 44.461, 48.278, 50.993],
[13.121, 14.256, 16.047, 17.708, 42.557, 45.722, 49.588, 52.336],
[13.787, 14.953, 16.791, 18.493, 43.773, 46.979, 50.892, 53.672],
]
_INDEX_LM = range(1, 31)
_COL_LM = [0.995, .99, .975, .95, .05, .025, 0.01, 0.005]
t_chi_lm = pd.DataFrame(
data=_DATA_LM, index=_INDEX_LM, columns=_COL_LM
)
t_chi_lm
0.995 | 0.990 | 0.975 | 0.950 | 0.050 | 0.025 | 0.010 | 0.005 | |
---|---|---|---|---|---|---|---|---|
1 | 0.039 | 0.016 | 0.698 | 0.393 | 3.841 | 5.024 | 6.635 | 7.879 |
2 | 0.100 | 0.201 | 0.506 | 0.103 | 5.991 | 0.738 | 9.210 | 10.597 |
3 | 0.717 | 0.115 | 0.216 | 0.352 | 7.815 | 9.348 | 11.345 | 12.838 |
4 | 0.207 | 0.297 | 0.484 | 0.711 | 9.488 | 11.143 | 13.277 | 14.860 |
5 | 0.412 | 0.554 | 0.831 | 1.145 | 11.070 | 12.832 | 15.086 | 16.750 |
6 | 0.676 | 0.872 | 1.237 | 1.635 | 12.592 | 14.449 | 16.812 | 18.548 |
7 | 0.989 | 1.239 | 1.690 | 2.167 | 14.067 | 16.013 | 18.475 | 20.278 |
8 | 1.344 | 1.646 | 2.180 | 2.733 | 15.507 | 17.535 | 20.090 | 21.955 |
9 | 1.735 | 2.088 | 2.700 | 3.325 | 16.919 | 19.023 | 21.666 | 23.589 |
10 | 2.156 | 2.558 | 3.247 | 3.940 | 18.307 | 20.483 | 23.209 | 25.188 |
11 | 2.603 | 3.053 | 3.816 | 4.575 | 19.675 | 21.920 | 24.725 | 26.757 |
12 | 3.074 | 3.571 | 4.404 | 5.226 | 21.026 | 23.337 | 26.217 | 28.300 |
13 | 3.565 | 4.107 | 5.009 | 5.892 | 22.362 | 24.736 | 27.688 | 29.819 |
14 | 4.075 | 4.660 | 5.629 | 6.571 | 23.685 | 26.119 | 29.141 | 31.319 |
15 | 4.601 | 5.229 | 6.262 | 7.261 | 24.996 | 27.488 | 30.578 | 32.801 |
16 | 5.142 | 5.812 | 6.908 | 7.962 | 26.296 | 28.845 | 32.000 | 34.267 |
17 | 5.697 | 6.408 | 7.564 | 8.672 | 27.587 | 30.191 | 33.409 | 35.718 |
18 | 6.265 | 7.015 | 8.231 | 9.390 | 28.869 | 31.526 | 34.805 | 37.156 |
19 | 6.884 | 7.633 | 8.907 | 10.117 | 30.144 | 32.852 | 36.191 | 38.582 |
20 | 7.434 | 8.260 | 9.591 | 10.851 | 31.410 | 34.170 | 37.566 | 39.997 |
21 | 8.034 | 8.897 | 10.283 | 11.591 | 32.671 | 35.479 | 38.932 | 41.401 |
22 | 8.643 | 9.542 | 10.982 | 12.338 | 33.924 | 36.781 | 40.289 | 42.796 |
23 | 9.260 | 10.196 | 11.689 | 13.091 | 36.172 | 38.076 | 41.638 | 44.181 |
24 | 9.886 | 10.856 | 12.401 | 13.848 | 36.415 | 39.364 | 42.980 | 45.558 |
25 | 10.520 | 11.524 | 13.120 | 14.611 | 37.652 | 40.646 | 44.314 | 46.928 |
26 | 11.160 | 12.198 | 13.844 | 15.379 | 38.885 | 41.923 | 45.642 | 48.290 |
27 | 11.808 | 12.879 | 14.573 | 16.151 | 40.113 | 43.194 | 46.963 | 49.645 |
28 | 12.461 | 13.565 | 15.308 | 16.928 | 41.337 | 44.461 | 48.278 | 50.993 |
29 | 13.121 | 14.256 | 16.047 | 17.708 | 42.557 | 45.722 | 49.588 | 52.336 |
30 | 13.787 | 14.953 | 16.791 | 18.493 | 43.773 | 46.979 | 50.892 | 53.672 |
from scipy import interpolate
def _func_interp_bivariate(df):
"Membuat fungsi dari tabel untuk interpolasi bilinear"
table = df[df.columns.sort_values()].sort_index().copy()
x = table.index
y = table.columns
z = table.to_numpy()
# penggunaan kx=1, ky=1 untuk interpolasi linear antara 2 titik
# tidak menggunakan (cubic) spline interpolation
return interpolate.RectBivariateSpline(x, y, z, kx=1, ky=1)
def _as_value(x, dec=4):
x = np.around(x, dec)
return x.flatten() if x.size > 1 else x.item()
table_source = {
'limantara': t_chi_lm
}
anfrek = {
'normal': frek_normal.calc_x_normal,
'lognormal': frek_lognormal.calc_x_lognormal,
'gumbel': frek_gumbel.calc_x_gumbel,
'logpearson3': frek_logpearson3.calc_x_lp3,
}
def _calc_k(n):
return np.floor(1 + 3.22 * np.log10(n)).astype(int)
def _calc_dk(k, m):
return k - 1 - m
def calc_xcr(alpha, dk, source='scipy'):
alpha = np.array(alpha)
if source.lower() in table_source.keys():
func_table = _func_interp_bivariate(table_source[source.lower()])
return _as_value(func_table(dk, alpha, grid=False), 3)
if source.lower() == 'scipy':
#ref: https://stackoverflow.com/questions/32301698
return stats.chi2.isf(alpha, dk)
def chisquare(
df, col=None, dist='normal', source_dist=None,
alpha=0.05, source_xcr='scipy', show_stat=True,
):
if source_dist is None:
source_dist = (
"scipy"
if dist.lower() in ["normal", "lognormal", "logpearson3"]
else "gumbel"
)
col = df.columns[0] if col is None else col
data = df[[col]].copy()
n = len(data)
data = data.rename({col: 'x'}, axis=1)
if dist.lower() in ['lognormal', 'logpearson3']:
data['log_x'] = np.log10(data.x)
k = _calc_k(n)
prob_class = 1 / k
prob_list = np.linspace(0, 1, k+1)[::-1]
prob_seq = prob_list[1:-1]
func = anfrek[dist.lower()]
T = 1 / prob_seq
val_x = func(data.x, return_period=T, source=source_dist)
# Chi Square Table
calc_df = pd.DataFrame()
min = data.x.min()
max = data.x.max()
seq_x = np.concatenate([[min], val_x, [max]])
calc_df['no'] = range(1, k+1)
class_text = []
for i in range(seq_x.size-1):
if i == 0:
class_text += [f'X <= {seq_x[i+1]:.4f}']
elif i == seq_x.size-2:
class_text += [f'X > {seq_x[i]:.4f}']
else:
class_text += [f'{seq_x[i]:.4f} < X <= {seq_x[i+1]:.4f}']
calc_df['batas_kelas'] = class_text
# calculate fe
fe = []
for i in range(seq_x.size-1):
if i == 0:
fe += [(data.x <= seq_x[i+1]).sum()]
elif i == seq_x.size-2:
fe += [(data.x > seq_x[i]).sum()]
else:
fe += [data.x.between(seq_x[i], seq_x[i+1], inclusive='right').sum()]
calc_df['fe'] = fe
ft = prob_class * n
calc_df['ft'] = [ft]*k
if dist.lower() in ['normal', 'gumbel', 'lognormal']:
dk = _calc_dk(k, 2)
elif dist.lower() in ['logpearson3']:
# di buku soetopo nilai m nya diberi angka 3
dk = _calc_dk(k, 2)
X_calc = np.sum(np.power(2, (calc_df.fe-calc_df.ft))/calc_df.ft)
X_critical = calc_xcr(alpha=alpha, dk=dk, source=source_xcr)
result = int(X_calc < X_critical)
result_text = ['Distribusi Tidak Diterima', 'Distribusi Diterima']
calc_df.set_index('no', inplace=True)
if show_stat:
print(f'Periksa Kecocokan Distribusi {dist.title()}')
print(f'Jumlah Kelas = {k}')
print(f'Dk = {dk}')
print(f'X^2_hitungan = {X_calc:.3f}')
print(f'X^2_kritis = {X_critical:.3f}')
print(f'Result (X2_calc < X2_cr) = {result_text[result]}')
return calc_df
calc_xcr(alpha, dk, ...)
¶Function: calc_xcr(alpha, dk, source='scipy')
Fungsi calc_xcr(...)
digunakan untuk mencari nilai $X^2_{kritis}$ dari berbagai sumber berdasarkan nilai derajat kepercayaan $\alpha$ dan nilai $DK$.
alpha
: Nilai level of significance $\alpha$. Dalam satuan desimal.dk
: Nilai $DK$ hasil perhitungan antara $K$ (jumlah kelas) dan parameter distribusi $m$.source
: sumber nilai $X^2_{kritis}$. 'scipy'
(default). Sumber yang dapat digunakan antara lain: Limantara ('limantara'
).calc_xcr(0.05, 3)
7.814727903251178
calc_xcr([0.05, 0.1, 0.2], 5, source='limantara')
array([11.07 , 10.519, 9.416])
# perbandingan antara nilai tabel dan fungsi scipy
source_test = ['limantara', 'scipy']
_dk = 5
_alpha = [0.2, 0.15, 0.1, 0.07, 0.05, 0.01]
for _source in source_test:
print(f'Xcr {_source:<12}=', calc_xcr(_alpha, _dk, source=_source))
Xcr limantara = [ 9.416 9.967 10.519 10.849 11.07 15.086] Xcr scipy = [ 7.28927613 8.11519941 9.2363569 10.19102791 11.07049769 15.08627247]
chisquare(df, ...)
¶Function: chisquare(df, col=None, dist='normal', source_dist='scipy', alpha=0.05, source_xcr='scipy', show_stat=True)
Fungsi chisquare(...)
merupakan fungsi untuk melakukan uji chi square terhadap distribusi yang dibandingkan. Fungsi ini mengeluarkan objek pandas.DataFrame
.
df
: pandas.DataFrame
.col
: nama kolom, None
(default). Jika tidak diisi menggunakan kolom pertama dalam df
sebagai data masukan.dist
: distribusi yang dibandingkan, 'normal'
(distribusi normal) (default). Distribusi yang dapat digunakan antara lain: Log Normal ('lognormal'
), Gumbel ('gumbel'
), Log Pearson 3 ('logpearson3'
).source_dist
: sumber perhitungan distribusi, 'scipy'
(default). Lihat masing-masing modul analisis frekuensi untuk lebih jelasnya.alpha
: nilai $\alpha$, 0.05
(default).source_xcr
: sumber nilai $X^2_{kritis}$, 'scipy'
(default). Sumber yang dapat digunakan antara lain: Limantara ('limantara'
).show_stat
: menampilkan hasil luaran uji, True
(default).chisquare(data)
Periksa Kecocokan Distribusi Normal Jumlah Kelas = 4 Dk = 1 X^2_hitungan = 1.697 X^2_kritis = 3.841 Result (X2_calc < X2_cr) = Distribusi Diterima
batas_kelas | fe | ft | |
---|---|---|---|
no | |||
1 | X <= 79.9499 | 2 | 2.5 |
2 | 79.9499 < X <= 94.7000 | 2 | 2.5 |
3 | 94.7000 < X <= 109.4501 | 3 | 2.5 |
4 | X > 109.4501 | 3 | 2.5 |
chisquare(data, dist='gumbel', source_dist='soetopo')
Periksa Kecocokan Distribusi Gumbel Jumlah Kelas = 4 Dk = 1 X^2_hitungan = 2.121 X^2_kritis = 3.841 Result (X2_calc < X2_cr) = Distribusi Diterima
batas_kelas | fe | ft | |
---|---|---|---|
no | |||
1 | X <= 75.7758 | 2 | 2.5 |
2 | 75.7758 < X <= 91.7367 | 1 | 2.5 |
3 | 91.7367 < X <= 111.9862 | 4 | 2.5 |
4 | X > 111.9862 | 3 | 2.5 |
chisquare(data, 'hujan', dist='logpearson3', alpha=0.2, source_xcr='limantara')
Periksa Kecocokan Distribusi Logpearson3 Jumlah Kelas = 4 Dk = 1 X^2_hitungan = 2.121 X^2_kritis = 3.266 Result (X2_calc < X2_cr) = Distribusi Diterima
batas_kelas | fe | ft | |
---|---|---|---|
no | |||
1 | X <= 80.4303 | 2 | 2.5 |
2 | 80.4303 < X <= 97.1113 | 4 | 2.5 |
3 | 97.1113 < X <= 111.6241 | 1 | 2.5 |
4 | X > 111.6241 | 3 | 2.5 |
- 20220613 - 1.1.0 / v0.4.1 - perbaikan, ubah source_dist jadi None dengan default scipy kecuali gumbel.
- 20220317 - 1.0.0 - Initial
Source code in this notebook is licensed under a MIT License. Data in this notebook is licensed under a Creative Common Attribution 4.0 International.