Berdasarkan isu #98: buat ringkasan/rekap data deret waktu
Deskripsi permasalahan:
Strategi Penyelesaian:
Catatan:
import numpy as np
import pandas as pd
try:
import hidrokit
except ModuleNotFoundError:
!pip install git+https://github.com/taruma/hidrokit.git@latest -q
import hidrokit
print(f'hidrokit version: {hidrokit.__version__}')
hidrokit version: 0.4.1
!wget -O sample.xlsx "https://taruma.github.io/assets/hidrokit_dataset/data_daily_sample.xlsx" -q
dataset_path = 'sample.xlsx'
from hidrokit.contrib.taruma import hk88
_data = hk88.read_workbook(dataset_path, ['STA_A', 'STA_B', 'STA_C'])
dataset = _data.infer_objects()
dataset.info()
dataset.head()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 5478 entries, 2001-01-01 to 2015-12-31 Freq: D Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 STA_A 5477 non-null float64 1 STA_B 5470 non-null float64 2 STA_C 5475 non-null float64 dtypes: float64(3) memory usage: 171.2 KB
STA_A | STA_B | STA_C | |
---|---|---|---|
2001-01-01 | 0.0 | 0.0 | 0.00 |
2001-01-02 | 0.0 | 0.0 | 0.65 |
2001-01-03 | 0.0 | 45.0 | 9.16 |
2001-01-04 | 0.0 | 0.0 | 0.00 |
2001-01-05 | 0.0 | 5.0 | 1.03 |
def summary_station(dataset, column, ufunc, ufunc_col, n_days='MS'):
grouped = [dataset.index.year, dataset.index.month]
ufunc = ufunc if isinstance(ufunc, (list, tuple)) else (ufunc,)
ufunc_col = (ufunc_col
if isinstance(ufunc_col, (list, tuple)) else (ufunc_col,))
if len(ufunc) != len(ufunc_col):
raise ValueError('length ufunc and ufunc_col are not matched.')
if n_days.endswith("D") or n_days.endswith("MS") or n_days.endswith("M"):
ix_month = []
val_month = []
for _, x in dataset[column].groupby(by=grouped):
each_month = x.groupby(pd.Grouper(freq=n_days)).agg(ufunc)
val_month.append(each_month.values)
ix_month.append(each_month.index.to_numpy())
return pd.DataFrame(
data=np.vstack(val_month),
index=np.hstack(ix_month),
columns=pd.MultiIndex.from_product([[column], ufunc_col]),
).rename_axis("DATE")
else:
summary = dataset[[column]].resample(n_days).agg(ufunc)
summary.columns = pd.MultiIndex.from_product([[column], ufunc_col])
return summary
def summary_all(dataset, ufunc, ufunc_col, columns=None, n_days='MS', verbose=False):
res = []
columns = columns if columns is not None else list(dataset.columns)
columns = columns if isinstance(columns, (list, tuple)) else [columns]
for column in columns:
if verbose:
print('PROCESSING:', column)
res.append(
summary_station(dataset, column, ufunc, ufunc_col, n_days=n_days)
)
return pd.concat(res, axis=1)
summary_station()
¶Fungsi ini membuat rekap untuk stasiun/kolom tunggal dalam bentuk keluaran pandas.DataFrame
. Argumen yang dibutuhkan antara lain:
dataset
: DataFrame dataset. Isian berupa pandas.DataFrame
.column
: kolom tunggal yang akan diproses. Isian berupa string.ufunc
: fungsi atau list fungsi yang akan digunakan. Isian berupa object
atau list of object
.ufunc_col
: nama atau list nama dari fungsi ufunc
. Isian berupa list of string.n_days='M'
: indikator jumlah hari/bulan yang diproses. Isian merupakan isian valid untuk parameter freq
pada objek pd.Grouper
(referensi). Isian 'M'
berarti setiap bulan (Month), isian '9D'
berarti setiap 9 hari (Days).ufunc
dan ufunc_col
¶Pengguna diberi kebebasan dalam melakukan perhitungan pada fungsi summary_station
. Dalam notebook ini akan diberikan contoh menggunakan fungsi yang tersedia pada python, numpy, dan membuatnya sendiri.
# Fungsi buatan sendiri
def n_rain(x):
"Jumlah hari hujan"
return (x > 0).sum()
myfunc = [np.sum, n_rain, len]
myfunc_col = ['sum', 'n_rain', 'n_days']
Jika tidak diberi argumen n_days
maka fungsi ini akan memproses data setiap bulan.
summary_station(
dataset=dataset, column='STA_B',
ufunc=myfunc, ufunc_col=myfunc_col)
STA_B | |||
---|---|---|---|
sum | n_rain | n_days | |
DATE | |||
2001-01-01 | 454.0 | 18.0 | 31.0 |
2001-02-01 | 298.0 | 12.0 | 28.0 |
2001-03-01 | 475.0 | 18.0 | 31.0 |
2001-04-01 | 272.0 | 12.0 | 30.0 |
2001-05-01 | 86.0 | 4.0 | 31.0 |
... | ... | ... | ... |
2015-08-01 | 0.0 | 0.0 | 31.0 |
2015-09-01 | 0.0 | 0.0 | 30.0 |
2015-10-01 | 14.0 | 1.0 | 31.0 |
2015-11-01 | 165.0 | 3.0 | 30.0 |
2015-12-01 | 216.0 | 11.0 | 31.0 |
180 rows × 3 columns
n_days
¶n_days
bisa diisi dengan jumlah hari yang ingin diproses setiap bulan-nya.
# Setiap 8 Hari
summary_station(
dataset=dataset, column='STA_B',
ufunc=myfunc, ufunc_col=myfunc_col,
n_days='8D')
STA_B | |||
---|---|---|---|
sum | n_rain | n_days | |
DATE | |||
2001-01-01 | 90.0 | 4.0 | 8.0 |
2001-01-09 | 123.0 | 5.0 | 8.0 |
2001-01-17 | 192.0 | 6.0 | 8.0 |
2001-01-25 | 49.0 | 3.0 | 7.0 |
2001-02-01 | 129.0 | 5.0 | 8.0 |
... | ... | ... | ... |
2015-11-25 | 48.0 | 1.0 | 6.0 |
2015-12-01 | 78.0 | 2.0 | 8.0 |
2015-12-09 | 48.0 | 4.0 | 8.0 |
2015-12-17 | 52.0 | 3.0 | 8.0 |
2015-12-25 | 38.0 | 2.0 | 7.0 |
720 rows × 3 columns
# Setiap 15 Hari
summary_station(
dataset=dataset, column='STA_C',
ufunc=myfunc, ufunc_col=myfunc_col,
n_days='15D')
STA_C | |||
---|---|---|---|
sum | n_rain | n_days | |
DATE | |||
2001-01-01 | 158.08 | 13.0 | 15.0 |
2001-01-16 | 146.94 | 14.0 | 15.0 |
2001-01-31 | 22.96 | 1.0 | 1.0 |
2001-02-01 | 157.80 | 12.0 | 15.0 |
2001-02-16 | 77.45 | 11.0 | 13.0 |
... | ... | ... | ... |
2015-11-01 | 152.00 | 7.0 | 15.0 |
2015-11-16 | 76.00 | 4.0 | 15.0 |
2015-12-01 | 23.00 | 1.0 | 15.0 |
2015-12-16 | 46.00 | 8.0 | 15.0 |
2015-12-31 | 0.00 | 0.0 | 1.0 |
465 rows × 3 columns
summary_all()
¶Fungsi ini hanya melakukan proses summary_station()
untuk seluruh kolom atau kolom tertentu yang diatur dengan argumen columns
. Argumen dataset
, ufunc
, ufunc_col
, n_days='M'
sama dengan summary_station()
, yang membedakan adalah argumen columns
.
Baru di versi 0.4.0
(isu #115): Penambahan argumen verbose
untuk menampilkan/tidak menampilkan tulisan PROCESSING: ...
. Nilai default False
(tidak ditampilkan).
# Menggunakan fungsi yang lebih banyak
def n_rain(x):
"Jumlah hari hujan"
return (x > 0).sum()
def n_dry(x):
"Jumlah hari kering"
return np.logical_or(x == 0, x.isna()).sum()
myfunc_all = [len, n_rain, n_dry, np.sum, np.mean, np.std]
myfunc_all_col = ['n_days', 'n_rain', 'n_dry', 'SUM', 'MEAN', 'STD']
summary_all(
dataset=dataset,
ufunc=myfunc_all, ufunc_col=myfunc_all_col,
n_days='7D')
STA_A | STA_B | STA_C | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
n_days | n_rain | n_dry | SUM | MEAN | STD | n_days | n_rain | n_dry | SUM | MEAN | STD | n_days | n_rain | n_dry | SUM | MEAN | STD | |
DATE | ||||||||||||||||||
2001-01-01 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 3.0 | 4.0 | 58.0 | 8.285714 | 16.499639 | 7.0 | 5.0 | 2.0 | 15.43 | 2.204286 | 3.384252 |
2001-01-08 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 4.0 | 3.0 | 68.0 | 9.714286 | 13.300555 | 7.0 | 7.0 | 0.0 | 125.30 | 17.900000 | 32.992996 |
2001-01-15 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 5.0 | 2.0 | 224.0 | 32.000000 | 37.434387 | 7.0 | 7.0 | 0.0 | 93.38 | 13.340000 | 12.806884 |
2001-01-22 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 4.0 | 3.0 | 60.0 | 8.571429 | 10.906529 | 7.0 | 6.0 | 1.0 | 55.88 | 7.982857 | 5.429001 |
2001-01-29 | 3.0 | 0.0 | 3.0 | 0.0 | 0.000000 | 0.000000 | 3.0 | 2.0 | 1.0 | 44.0 | 14.666667 | 13.650397 | 3.0 | 3.0 | 0.0 | 37.99 | 12.663333 | 11.376741 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-12-01 | 7.0 | 3.0 | 4.0 | 86.0 | 12.285714 | 25.408098 | 7.0 | 1.0 | 6.0 | 43.0 | 6.142857 | 16.252472 | 7.0 | 0.0 | 7.0 | 0.00 | 0.000000 | 0.000000 |
2015-12-08 | 7.0 | 5.0 | 2.0 | 55.0 | 7.857143 | 12.266874 | 7.0 | 3.0 | 4.0 | 67.0 | 9.571429 | 13.513662 | 7.0 | 0.0 | 7.0 | 0.00 | 0.000000 | 0.000000 |
2015-12-15 | 7.0 | 7.0 | 0.0 | 105.0 | 15.000000 | 11.503623 | 7.0 | 4.0 | 3.0 | 55.0 | 7.857143 | 9.118271 | 7.0 | 5.0 | 2.0 | 44.00 | 6.285714 | 8.220184 |
2015-12-22 | 7.0 | 7.0 | 0.0 | 136.0 | 19.428571 | 12.053452 | 7.0 | 3.0 | 4.0 | 51.0 | 7.285714 | 11.954278 | 7.0 | 2.0 | 5.0 | 18.00 | 2.571429 | 5.255383 |
2015-12-29 | 3.0 | 3.0 | 0.0 | 15.0 | 5.000000 | 2.000000 | 3.0 | 0.0 | 3.0 | 0.0 | 0.000000 | 0.000000 | 3.0 | 2.0 | 1.0 | 7.00 | 2.333333 | 3.214550 |
888 rows × 18 columns
summary_all(
dataset=dataset, columns=['STA_A', 'STA_C'],
ufunc=myfunc_all, ufunc_col=myfunc_all_col,
n_days='16D')
STA_A | STA_C | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
n_days | n_rain | n_dry | SUM | MEAN | STD | n_days | n_rain | n_dry | SUM | MEAN | STD | |
DATE | ||||||||||||
2001-01-01 | 16.0 | 0.0 | 16.0 | 0.0 | 0.000000 | 0.000000 | 16.0 | 14.0 | 2.0 | 185.80 | 11.612500 | 22.786906 |
2001-01-17 | 15.0 | 0.0 | 15.0 | 0.0 | 0.000000 | 0.000000 | 15.0 | 14.0 | 1.0 | 142.18 | 9.478667 | 9.163421 |
2001-02-01 | 16.0 | 0.0 | 16.0 | 0.0 | 0.000000 | 0.000000 | 16.0 | 13.0 | 3.0 | 157.81 | 9.863125 | 15.319503 |
2001-02-17 | 12.0 | 0.0 | 12.0 | 0.0 | 0.000000 | 0.000000 | 12.0 | 10.0 | 2.0 | 77.44 | 6.453333 | 7.835151 |
2001-03-01 | 16.0 | 0.0 | 16.0 | 0.0 | 0.000000 | 0.000000 | 16.0 | 14.0 | 2.0 | 9.01 | 0.563125 | 0.719228 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-10-17 | 15.0 | 9.0 | 6.0 | 97.0 | 6.466667 | 9.500877 | 15.0 | 1.0 | 14.0 | 6.00 | 0.400000 | 1.549193 |
2015-11-01 | 16.0 | 12.0 | 4.0 | 190.0 | 11.875000 | 14.655488 | 16.0 | 7.0 | 9.0 | 152.00 | 9.500000 | 19.721393 |
2015-11-17 | 14.0 | 13.0 | 1.0 | 384.0 | 27.428571 | 15.360861 | 14.0 | 4.0 | 10.0 | 76.00 | 5.428571 | 13.119066 |
2015-12-01 | 16.0 | 10.0 | 6.0 | 197.0 | 12.312500 | 19.269038 | 16.0 | 1.0 | 15.0 | 23.00 | 1.437500 | 5.750000 |
2015-12-17 | 15.0 | 15.0 | 0.0 | 200.0 | 13.333333 | 10.587504 | 15.0 | 8.0 | 7.0 | 46.00 | 3.066667 | 4.317186 |
360 rows × 12 columns
summary_all(
dataset=dataset,
ufunc=myfunc_all, ufunc_col=myfunc_all_col,
n_days='7D', verbose=True)
PROCESSING: STA_A PROCESSING: STA_B PROCESSING: STA_C
STA_A | STA_B | STA_C | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
n_days | n_rain | n_dry | SUM | MEAN | STD | n_days | n_rain | n_dry | SUM | MEAN | STD | n_days | n_rain | n_dry | SUM | MEAN | STD | |
DATE | ||||||||||||||||||
2001-01-01 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 3.0 | 4.0 | 58.0 | 8.285714 | 16.499639 | 7.0 | 5.0 | 2.0 | 15.43 | 2.204286 | 3.384252 |
2001-01-08 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 4.0 | 3.0 | 68.0 | 9.714286 | 13.300555 | 7.0 | 7.0 | 0.0 | 125.30 | 17.900000 | 32.992996 |
2001-01-15 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 5.0 | 2.0 | 224.0 | 32.000000 | 37.434387 | 7.0 | 7.0 | 0.0 | 93.38 | 13.340000 | 12.806884 |
2001-01-22 | 7.0 | 0.0 | 7.0 | 0.0 | 0.000000 | 0.000000 | 7.0 | 4.0 | 3.0 | 60.0 | 8.571429 | 10.906529 | 7.0 | 6.0 | 1.0 | 55.88 | 7.982857 | 5.429001 |
2001-01-29 | 3.0 | 0.0 | 3.0 | 0.0 | 0.000000 | 0.000000 | 3.0 | 2.0 | 1.0 | 44.0 | 14.666667 | 13.650397 | 3.0 | 3.0 | 0.0 | 37.99 | 12.663333 | 11.376741 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2015-12-01 | 7.0 | 3.0 | 4.0 | 86.0 | 12.285714 | 25.408098 | 7.0 | 1.0 | 6.0 | 43.0 | 6.142857 | 16.252472 | 7.0 | 0.0 | 7.0 | 0.00 | 0.000000 | 0.000000 |
2015-12-08 | 7.0 | 5.0 | 2.0 | 55.0 | 7.857143 | 12.266874 | 7.0 | 3.0 | 4.0 | 67.0 | 9.571429 | 13.513662 | 7.0 | 0.0 | 7.0 | 0.00 | 0.000000 | 0.000000 |
2015-12-15 | 7.0 | 7.0 | 0.0 | 105.0 | 15.000000 | 11.503623 | 7.0 | 4.0 | 3.0 | 55.0 | 7.857143 | 9.118271 | 7.0 | 5.0 | 2.0 | 44.00 | 6.285714 | 8.220184 |
2015-12-22 | 7.0 | 7.0 | 0.0 | 136.0 | 19.428571 | 12.053452 | 7.0 | 3.0 | 4.0 | 51.0 | 7.285714 | 11.954278 | 7.0 | 2.0 | 5.0 | 18.00 | 2.571429 | 5.255383 |
2015-12-29 | 3.0 | 3.0 | 0.0 | 15.0 | 5.000000 | 2.000000 | 3.0 | 0.0 | 3.0 | 0.0 | 0.000000 | 0.000000 | 3.0 | 2.0 | 1.0 | 7.00 | 2.333333 | 3.214550 |
888 rows × 18 columns
- 20220613 - 1.1.0 - Fix error concat index (live at v0.4.1) & refactor based on new output from hk88.
- 20220311 - 1.0.1 - Add verbose (#115)
- 20191217 - 1.0.0 - Initial
Source code in this notebook is licensed under a MIT License. Data in this notebook is licensed under a Creative Common Attribution 4.0 International.