Berdasarkan isu #79: request: ambil dataset hujan jam-jaman dari excel
Referensi isu:
hidrokit.contrib.taruma.hk43
#43. (view notebook / manual). (Menggunakan fungsi _get_years()
untuk memperoleh list
tahun pada berkas excel.Deskripsi permasalahan:
pandas.DataFrame
Strategi penyelesaian masalah:
hk43._get_years()
)pandas.DataFrame
Catatan:
Untuk prapemrosesan tabel seperti (cek data yang hilang, dlsbnya) akan dikembangkan dengan modul yang terpisah karena beberapa fungsi sudah tersedia di modul hidrokit.contrib.taruma.hk73
(untuk mengolah berkas dari bmkg).
# Download sample excel
!wget -O sample.xlsx "https://taruma.github.io/assets/hidrokit_dataset/hidrokit_hourly_template.xlsx" -q
# Import library
import pandas as pd
import numpy as np
def extract_years_from_excel(file_path):
"""
Get a list of years from an Excel file.
Parameters:
file_path (str): The path to the Excel file.
Returns:
List[int]: A sorted list of years found in the Excel file.
"""
excel = pd.ExcelFile(file_path)
years = []
for sheet in excel.sheet_names:
if sheet.isdigit():
years.append(int(sheet))
return sorted(years)
import warnings
import functools
def deprecated(new_func_name):
"""
Decorator to mark a function as deprecated.
Parameters:
- new_func_name (str): The name of the new function that should be used instead.
Returns:
- wrapper (function): The decorated function.
Example:
@deprecated("new_function")
def old_function():
pass
The above example will generate a warning when `old_function` is called,
suggesting to use `new_function` instead.
"""
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
warnings.warn(
f"{func.__name__} is deprecated, use {new_func_name} instead",
DeprecationWarning,
)
return func(*args, **kwargs)
return wrapper
return decorator
from calendar import monthrange
# ref: https://www.reddit.com/r/learnpython/comments/485h1p/
from collections.abc import Sequence
import pandas as pd
import numpy as np
def _index_hourly(year, freq="60min"):
"""
Create a DatetimeIndex with specific year or [year_start, year_end].
Parameters:
year (int or tuple): The year or range of years to create the index for.
If an int is provided, the index will be created for that specific year.
If a tuple is provided, the index will be created for the range of years
specified by the tuple (inclusive).
freq (str, optional): The frequency of the index. Defaults to "60min".
Returns:
pd.DatetimeIndex: The generated DatetimeIndex.
"""
if isinstance(year, Sequence):
year_start, year_end = year
else:
year_start, year_end = year, year
period = f"{year_start}0101 00:00,{year_end}1231 23:00".split(",")
return pd.date_range(*period, freq=freq)
def _melt_to_array(dataframe):
"""
Convert a DataFrame to a 1-dimensional array by melting it and extracting the 'value' column.
Parameters:
df (pandas.DataFrame): The DataFrame to be converted.
Returns:
numpy.ndarray: A 1-dimensional array containing the values
from the 'value' column of the melted DataFrame.
"""
return dataframe.melt().drop("variable", axis=1)["value"].values
def _get_array_in_month(dataframe, year, month):
"""
Get an array of data for a specific year and month from a dataframe.
Parameters:
dataframe (pandas.DataFrame): The input dataframe.
year (int): The year.
month (int): The month.
Returns:
numpy.ndarray: The array of data for the specified year and month.
"""
n_days = monthrange(year, month)[1]
mask_month = slice(None, n_days)
df_month = dataframe.iloc[mask_month, :].T
return _melt_to_array(df_month)
def _get_array_in_year(df, year):
"""
Get an array of data for a specific year from a DataFrame.
Parameters:
- df (pandas.DataFrame): The DataFrame containing the data.
- year (int): The year for which the data is to be extracted.
Returns:
- numpy.ndarray: The array of data for the specified year.
"""
n_rows, _ = df.shape
# configuration (view the excel)
n_month = 1 # number of row to monthID
n_gap = 2 # number of row between month pivot table
n_lines = 31 + n_gap # number of row each month
data = []
for row in range(1, n_rows, n_lines):
mask_start = row + n_month
mask_end = row + n_lines
month = df.iloc[mask_start, 1]
mask_row = slice(mask_start, mask_end)
df_month = df.iloc[mask_row, 4:]
array_month = _get_array_in_month(df_month, year, month)
data.append(array_month)
return np.hstack(data)
def get_info(file, config_sheet=None):
"""
Retrieves information from an Excel file.
Args:
file (str): The path to the Excel file.
config_sheet (str, optional): The name of the sheet to read from.
If not provided, the first sheet will be used.
Returns:
dict: A dictionary containing the information retrieved from the Excel file.
The keys are the lowercase values from the first column,
and the values are the corresponding values from the second column.
"""
excel = pd.ExcelFile(file)
first_sheet = excel.sheet_names[0]
config_sheet = first_sheet if config_sheet is None else config_sheet
df = pd.read_excel(excel, sheet_name=config_sheet, header=None, usecols="A:B")
info = {}
for index, _ in df.iterrows():
key = df.iloc[index, 0].lower()
value = df.iloc[index, 1]
info[str(key)] = value
return info
@deprecated("get_info")
def _get_info(file, config_sheet=None):
return get_info(file, config_sheet=config_sheet)
def read_excel_hourly(file, station=None):
"""
Read hourly data from an Excel file.
Parameters:
file (str): The path to the Excel file.
station (str, optional): The name of the station. Defaults to None.
Returns:
pandas.DataFrame: A DataFrame containing the hourly data.
"""
excel = pd.ExcelFile(file)
# CONFIG
years = extract_years_from_excel(excel)
station = "NA" if station is None else station
# READ DATA
data = []
for year in years:
sheet = pd.read_excel(
excel, sheet_name=str(year), header=None, nrows=396, usecols="A:AB"
)
array = _get_array_in_year(sheet, year)
df_year = pd.DataFrame(data=array, columns=[station], index=_index_hourly(year))
data.append(df_year)
return pd.concat(data, axis=0)
_index_hourly(year, freq='60min')
¶Tujuan: membuat index menggunakan perintah pd.date_range()
dengan input year
yang berupa bilangan ataupun sequence.
_index_hourly(2000) # jika bilangan harus dalam bentuk integer
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00', '2000-01-01 02:00:00', '2000-01-01 03:00:00', '2000-01-01 04:00:00', '2000-01-01 05:00:00', '2000-01-01 06:00:00', '2000-01-01 07:00:00', '2000-01-01 08:00:00', '2000-01-01 09:00:00', ... '2000-12-31 14:00:00', '2000-12-31 15:00:00', '2000-12-31 16:00:00', '2000-12-31 17:00:00', '2000-12-31 18:00:00', '2000-12-31 19:00:00', '2000-12-31 20:00:00', '2000-12-31 21:00:00', '2000-12-31 22:00:00', '2000-12-31 23:00:00'], dtype='datetime64[ns]', length=8784, freq='60T')
_index_hourly(['2000', 2001]) # jika dalam seq-object bisa berupa integer atau string
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00', '2000-01-01 02:00:00', '2000-01-01 03:00:00', '2000-01-01 04:00:00', '2000-01-01 05:00:00', '2000-01-01 06:00:00', '2000-01-01 07:00:00', '2000-01-01 08:00:00', '2000-01-01 09:00:00', ... '2001-12-31 14:00:00', '2001-12-31 15:00:00', '2001-12-31 16:00:00', '2001-12-31 17:00:00', '2001-12-31 18:00:00', '2001-12-31 19:00:00', '2001-12-31 20:00:00', '2001-12-31 21:00:00', '2001-12-31 22:00:00', '2001-12-31 23:00:00'], dtype='datetime64[ns]', length=17544, freq='60T')
_get_array_in_month(df, year, month)
¶Tujuan: mengambil pivot tabel satu bulan dan mengubahnya (melt) ke bentuk tabel biasa.
_get_array_in_year(df, year)
¶Tujuan: serupa dengan _get_array_in_month()
, fungsi ini mengambil seluruh informasi pada sheet tunggal.
_get_info(file, config_sheet=None)
¶Tujuan: mengambil nilai pada sheet pengaturan (sheet pertama pada file) dan mengubahnya ke dalam bentuk dictionary
.
info_file = _get_info('sample.xlsx', config_sheet='_INFO')
info_file
<ipython-input-11-23e254b88fe0>:27: DeprecationWarning: _get_info is deprecated, use get_info instead warnings.warn(
{'key': 'VALUE', 'station_name': 'Aurene'}
info_file['station_name']
'Aurene'
read_excel_hourly(file, station=None)
¶Tujuan: membaca data jam-jaman yang terdapat pada file lalu mengubahnya ke dalam bentuk pandas.DataFrame
dengan index yang sesuai dengan tahun kejadiannya.
data = read_excel_hourly('sample.xlsx', station=info_file['station_name'])
data
Aurene | |
---|---|
2000-01-01 00:00:00 | - |
2000-01-01 01:00:00 | NaN |
2000-01-01 02:00:00 | NaN |
2000-01-01 03:00:00 | NaN |
2000-01-01 04:00:00 | NaN |
... | ... |
2002-12-31 19:00:00 | - |
2002-12-31 20:00:00 | - |
2002-12-31 21:00:00 | - |
2002-12-31 22:00:00 | - |
2002-12-31 23:00:00 | - |
26304 rows × 1 columns
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 26304 entries, 2000-01-01 00:00:00 to 2002-12-31 23:00:00 Freq: 60T Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Aurene 10918 non-null object dtypes: object(1) memory usage: 411.0+ KB
Catatan: data masih berupa object
, dan belum diubah ke bentuk angka. Prapemrosesan ini serupa pada modul hk73 (untuk membaca berkas bmkg).
data.sample(n=20) # menampilkan sampel 20 baris dalam data secara acak
Aurene | |
---|---|
2001-06-20 13:00:00 | NaN |
2001-10-19 03:00:00 | 2.5 |
2001-04-04 07:00:00 | 0.6 |
2001-10-05 07:00:00 | NaN |
2001-08-01 10:00:00 | NaN |
2002-09-14 07:00:00 | - |
2002-05-16 11:00:00 | - |
2000-02-15 18:00:00 | NaN |
2001-10-30 16:00:00 | NaN |
2002-02-09 19:00:00 | - |
2002-08-14 22:00:00 | - |
2002-07-25 21:00:00 | - |
2001-10-27 01:00:00 | NaN |
2001-06-20 05:00:00 | NaN |
2000-01-08 13:00:00 | 0.6 |
2002-03-08 16:00:00 | - |
2001-05-16 07:00:00 | NaN |
2001-09-27 02:00:00 | NaN |
2001-06-11 03:00:00 | NaN |
2000-03-19 06:00:00 | 2.6 |
data.loc['2001'] # menampilkan data pada tahun tertentu
Aurene | |
---|---|
2001-01-01 00:00:00 | NaN |
2001-01-01 01:00:00 | NaN |
2001-01-01 02:00:00 | NaN |
2001-01-01 03:00:00 | NaN |
2001-01-01 04:00:00 | NaN |
... | ... |
2001-12-31 19:00:00 | NaN |
2001-12-31 20:00:00 | NaN |
2001-12-31 21:00:00 | NaN |
2001-12-31 22:00:00 | NaN |
2001-12-31 23:00:00 | NaN |
8760 rows × 1 columns
data['20020101 07:00': '20020101 16:00'] # menampilkan data diantara jam 7.00 sampai 16.00 pada tanggal 1 januari 2002
Aurene | |
---|---|
2002-01-01 07:00:00 | 0.3 |
2002-01-01 08:00:00 | - |
2002-01-01 09:00:00 | - |
2002-01-01 10:00:00 | - |
2002-01-01 11:00:00 | - |
2002-01-01 12:00:00 | - |
2002-01-01 13:00:00 | 0.5 |
2002-01-01 14:00:00 | - |
2002-01-01 15:00:00 | 1.1 |
2002-01-01 16:00:00 | - |
- 20240414 - 1.1.0 / 0.5.0 - Documentation
- 20191129 - 1.0.0 - Initial
Source code in this notebook is licensed under a MIT License. Data in this notebook is licensed under a Creative Common Attribution 4.0 International.