Berdasarkan isu #88: request: ambil dataset hujan harian
Referensi isu:
hidrokit.contrib.taruma.hk79
#79. (lihat notebook/manual). request: ambil dataset hujan jam-jaman dari excelDeskripsi permasalahan:
pandas.DataFrame
, dengan baris menunjukkan observasi/kejadian dan kolom menunjukkan stasiun.Change Log:
read_workbook(...)
untuk membaca berkas tanpa perlu mengetahui nama sheet dan membaca seluruh sheet kecuali yang sheet berawalan ignore_str='_'
.as_df=False
akan memperoleh pesan error karena luaran sebelum versi 0.4.0 berupa list
, sedangkan untuk versi 0.4.0 luaran berupa dictionary
.try:
import hidrokit
except ModuleNotFoundError:
!pip install hidrokit -q
import hidrokit
print(f'hidrokit version: {hidrokit.__version__}')
hidrokit version: 0.3.6
# Unduh dataset
!wget -O sample.xlsx "https://taruma.github.io/assets/hidrokit_dataset/hidrokit_daily_template.xlsx" -q
FILE = 'sample.xlsx'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from calendar import isleap
import pandas as pd
def _melt_to_array(df, year):
"""Melt dataframe to 1D array one year"""
# ref: hidrokit.contrib.taruma.hk43
_drop = [59, 60, 61, 123, 185, 278, 340]
_drop_leap = [60, 61, 123, 185, 278, 340]
data = df.melt().drop('variable', axis=1)
if isleap(year):
return data['value'].drop(_drop_leap).values
else:
return data['value'].drop(_drop).values
def _index_daily(year):
"""Return DateTimeIndex object for one year"""
year_range = '{}0101 {}0101'.format(year, year + 1).split()
return pd.date_range(*year_range, closed='left')
def _yearly_df(df, year, station_name):
"""Create dataframe for one year"""
return pd.DataFrame(
data=_melt_to_array(df, year),
index=_index_daily(year),
columns=[station_name]
)
def _data_from_sheet(df, station_name, as_df=True):
"""Read dataset from single sheet as dataframe (or list of dataframe)"""
n_years = int(df.iloc[0, 1])
frames = []
for i in range(2, n_years * 33, 33):
year = int(df.iloc[i, 1])
pivot = df.iloc[i:i + 31, 4:16]
data = _yearly_df(pivot, year, station_name)
frames.append(data)
if as_df:
return pd.concat(frames, sort=True)
else:
return frames
def read_workbook(io, stations=None, ignore_str='_', as_df=True):
"""Read dataset from workbook"""
excel = pd.ExcelFile(io)
data = {}
sheet_names = excel.sheet_names
if stations is None:
stations = []
for sheet in sheet_names:
if not sheet.startswith(ignore_str):
stations.append(sheet)
else:
stations = [stations] if isinstance(stations, str) else stations
for station in stations:
df = excel.parse(sheet_name=station, header=None)
data[station] = _data_from_sheet(df, station)
if as_df:
return pd.concat(data.values(), sort=True, axis=1)
else:
return data
from hidrokit.contrib.taruma import hk79
# Ambil informasi excel menggunakan modul .hk79
data_info = hk79._get_info(FILE, config_sheet='_INFO')
print(':: INFORMASI PADA BERKAS')
print(data_info)
:: INFORMASI PADA BERKAS {'key': 'VALUE', 'n_stations': 2, 'stations': 'AURENE, TYBALT', 'source': 'RATA SUM', 'station_1_years': '2002, 2003, 2004', 'station_2_years': '2007, 2008'}
stations = data_info['stations'].replace(' ', '').split(',')
print('nama stasiun dalam berkas:', stations)
nama stasiun dalam berkas: ['AURENE', 'TYBALT']
aurene = read_workbook(FILE, ['AURENE'])
aurene.info()
aurene.head()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1096 entries, 2002-01-01 to 2004-12-31 Freq: D Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AURENE 1096 non-null object dtypes: object(1) memory usage: 17.1+ KB
AURENE | |
---|---|
2002-01-01 | 17.08 |
2002-01-02 | 16.28 |
2002-01-03 | 20.32 |
2002-01-04 | 18.34 |
2002-01-05 | 13.16 |
dataset = read_workbook(FILE, ['TYBALT', 'AURENE'])
dataset.sort_index()
TYBALT | AURENE | |
---|---|---|
2002-01-01 | NaN | 17.08 |
2002-01-02 | NaN | 16.28 |
2002-01-03 | NaN | 20.32 |
2002-01-04 | NaN | 18.34 |
2002-01-05 | NaN | 13.16 |
... | ... | ... |
2008-12-27 | 134.83 | NaN |
2008-12-28 | 81.88 | NaN |
2008-12-29 | 20.14 | NaN |
2008-12-30 | 208.54 | NaN |
2008-12-31 | 208.14 | NaN |
1827 rows × 2 columns
read_workbook(io, ...)
¶Function: read_workbook(io, stations=None, ignore_str='_', as_df=True)
io
: Path untuk berkas excel. Bisa berupa 'string', objek pandas.ExcelFile
, atau objek pathlib.Path
.stations
: nama stasiun/sheet. None
(default). Akan membaca seluruh sheet kecuali yang diawali ignore_str
jika None
. Dapat berupa string
, atau list
/tuple
. *Change in version 0.4.0*ignore_str
: teks yang diawali ignore_str
akan diabaikan saat proses membaca buku. '_'
(default). *New in version 0.4.0*as_df
: luaran dapet berupa pandas.DataFrame
jika True
. True
(default). Jika False
, luaran berupa dictionary
. *Change in version 0.4.0*read_workbook(FILE)
AURENE | TYBALT | |
---|---|---|
2002-01-01 | 17.08 | NaN |
2002-01-02 | 16.28 | NaN |
2002-01-03 | 20.32 | NaN |
2002-01-04 | 18.34 | NaN |
2002-01-05 | 13.16 | NaN |
... | ... | ... |
2008-12-27 | NaN | 134.83 |
2008-12-28 | NaN | 81.88 |
2008-12-29 | NaN | 20.14 |
2008-12-30 | NaN | 208.54 |
2008-12-31 | NaN | 208.14 |
1827 rows × 2 columns
read_workbook(FILE, as_df=False)
{'AURENE': AURENE 2002-01-01 17.08 2002-01-02 16.28 2002-01-03 20.32 2002-01-04 18.34 2002-01-05 13.16 ... ... 2004-12-27 454.75 2004-12-28 69.75 2004-12-29 189.39 2004-12-30 166.39 2004-12-31 133.99 [1096 rows x 1 columns], 'TYBALT': TYBALT 2007-01-01 201.92 2007-01-02 255.38 2007-01-03 130.84 2007-01-04 17.96 2007-01-05 19.12 ... ... 2008-12-27 134.83 2008-12-28 81.88 2008-12-29 20.14 2008-12-30 208.54 2008-12-31 208.14 [731 rows x 1 columns]}
- 20220402 - 1.1.1 - Change 0.3.7 to 0.4.0
- 20220318 - 1.1.0 - Improve read_workbook(...) function Issue #162
- 20191217 - 1.0.1 - Fix read_workbook() issue#95
- 20191213 - 1.0.0 - Initial
Source code in this notebook is licensed under a MIT License. Data in this notebook is licensed under a Creative Common Attribution 4.0 International.