Berdasarkan isu #53: request: buat tabel/tensor untuk pemodelan deep learning LSTM
Deskripsi permasalahan:
# Location of the hydrology CSV: the directory prefix and the file name are
# joined by plain string concatenation into the full dataset path.
DRIVE_DROP_PATH = '/content/'
FILE_PATH = 'dataset_hidrologi_pamarayan_1998_2008.csv'
DATASET = f'{DRIVE_DROP_PATH}{FILE_PATH}'
import pandas as pd
import numpy as np
# Read the CSV with its first column as a parsed date index, then keep rows
# from 1998-03-01 onward: the first 2 months have no data in the discharge
# (debit) column.
dataset = pd.read_csv(
    DATASET,
    index_col=0,
    parse_dates=True,
)['19980301':]
dataset.head()
| | hujan_bojong_manik | hujan_gunung_tunggal | hujan_pasir_ona | hujan_sampang_peundeuy | hujan_cimarga | hujan_bd_pamarayan | hujan_ciminyak_cilaki | hujan_gardu_tanjak | debit_bd_pamarayan |
|---|---|---|---|---|---|---|---|---|---|
1998-03-01 | 0.0 | 0.0 | 3.0 | 7.0 | 0.0 | 12.0 | 0.0 | 0.0 | 90.12 |
1998-03-02 | 0.0 | 4.0 | 36.0 | 9.0 | 26.0 | 0.0 | 5.0 | 32.0 | 97.90 |
1998-03-03 | 4.5 | 0.0 | 0.0 | 11.0 | 10.0 | 2.0 | 3.0 | 21.0 | 88.90 |
1998-03-04 | 0.0 | 0.0 | 46.0 | 5.0 | 24.0 | 6.0 | 11.0 | 13.0 | 90.30 |
1998-03-05 | 32.0 | 0.0 | 0.0 | 22.0 | 8.0 | 14.0 | 0.0 | 21.0 | 210.06 |
dataset.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3959 entries, 1998-03-01 to 2008-12-31
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   hujan_bojong_manik      3959 non-null   float64
 1   hujan_gunung_tunggal    3959 non-null   float64
 2   hujan_pasir_ona         3959 non-null   float64
 3   hujan_sampang_peundeuy  3959 non-null   float64
 4   hujan_cimarga           3959 non-null   float64
 5   hujan_bd_pamarayan      3959 non-null   float64
 6   hujan_ciminyak_cilaki   3959 non-null   float64
 7   hujan_gardu_tanjak      3959 non-null   float64
 8   debit_bd_pamarayan      3959 non-null   float64
dtypes: float64(9)
memory usage: 309.3 KB
import numpy as np
def _columns_index(dataframe, columns):
    """
    Translate column names into their positions within a dataframe.

    Args:
        dataframe (pandas.DataFrame): The dataframe whose ``columns`` index
            is searched.
        columns (list): Column names to look up, in the desired order.

    Returns:
        list: One ``dataframe.columns.get_loc(name)`` result per requested
            name, in the same order as ``columns``.
    """
    locate = dataframe.columns.get_loc
    return [locate(name) for name in columns]
def _get_y(array, timesteps, columns_index):
    """
    Slice the target column(s) out of a 2-D array, dropping the leading rows.

    Parameters:
        array (ndarray): The input array, indexed as ``array[row, column]``.
        timesteps (int): Number of rows skipped at the start of the array.
        columns_index (list): Positions of the target columns.

    Returns:
        ndarray: For exactly one column, the 1-D slice
            ``array[timesteps:, col]``; otherwise the per-column slices
            stacked along axis 1 (one column per requested index).
    """
    targets = [array[timesteps:, position] for position in columns_index]
    if len(columns_index) != 1:
        return np.stack(targets, axis=1)
    # A single target is returned flat rather than as an (n, 1) matrix.
    return targets[0]
def _get_x_tensor(array, timesteps, columns_index):
    """
    Build a 3-D tensor of sliding windows over the selected columns.

    Sample ``i`` holds rows ``i .. i + timesteps - 1`` of ``array``, so
    ``X[i, t, c] == array[i + t, columns_index[c]]``. The window ending just
    before row ``timesteps + i`` lines up with the target returned by
    ``array[timesteps:, col]``.

    Parameters:
        array (numpy.ndarray): The 2-D input array (rows x columns).
        timesteps (int): The number of consecutive rows in each window.
        columns_index (list): The indices of the columns to include in the
            tensor (any iterable of integers, e.g. a ``range``).

    Returns:
        numpy.ndarray: Tensor of shape
            ``(max(rows - timesteps, 0), timesteps, len(columns_index))``
            with the dtype of ``array``. When the array has no more than
            ``timesteps`` rows the tensor has zero samples.

    Raises:
        ValueError: If ``timesteps`` is negative or no column is selected.
    """
    if timesteps < 0:
        raise ValueError("timesteps must be non-negative")
    columns = np.asarray(list(columns_index), dtype=np.intp)
    if columns.size == 0:
        raise ValueError("need at least one column to build the tensor")

    rows, _ = array.shape
    # Too-short input yields zero samples instead of an opaque stacking error.
    n_samples = max(rows - timesteps, 0)

    # window_rows[i, t] is the source row for timestep t of sample i.
    window_rows = np.arange(n_samples)[:, np.newaxis] + np.arange(timesteps)

    # Broadcasting (n_samples, timesteps, 1) against (n_columns,) gathers the
    # whole tensor in one indexing operation.
    return array[window_rows[:, :, np.newaxis], columns]
def tensor_array(
    dataframe,
    timesteps,
    X_columns=None,  # pylint: disable=invalid-name
    y_out=False,
    y_columns=None,
):
    """
    Turn a pandas DataFrame into windowed arrays for a sequence model.

    Args:
        dataframe (pandas.DataFrame): The source data; its ``values`` array
            is windowed.
        timesteps (int): The number of timesteps in each sample.
        X_columns (list, optional): Names of the columns used as input
            features. When None, every column is used. Defaults to None.
        y_out (bool, optional): Pass ``True`` to also get the target array.
            The check is ``y_out is True``, so other truthy values do not
            enable it. Defaults to False.
        y_columns (list, optional): Names of the columns used as targets.
            Only read when ``y_out`` is True; when None, the last column is
            used. Defaults to None.

    Returns:
        numpy.ndarray: The input tensor built by ``_get_x_tensor``.

        If y_out is True, a tuple ``(X, y)`` is returned instead, where
        ``y`` is the target array built by ``_get_y``.
    """
    # pylint: disable=invalid-name
    n_cols = dataframe.shape[1]
    values = dataframe.values

    # Input features: all column positions unless names were given.
    feature_positions = (
        range(n_cols)
        if X_columns is None
        else _columns_index(dataframe, X_columns)
    )
    X = _get_x_tensor(
        values, timesteps=timesteps, columns_index=feature_positions
    )

    if y_out is not True:
        return X

    # Targets: fall back to the last column when no names were given.
    target_positions = (
        [n_cols - 1]
        if y_columns is None
        else _columns_index(dataframe, y_columns)
    )
    y = _get_y(values, timesteps=timesteps, columns_index=target_positions)
    return X, y
Dataset memiliki 3959 baris dengan 8 variabel independen, dan 1 variabel dependen.
8 variabel bebas:
- hujan_bojong_manik
- hujan_gunung_tunggal
- hujan_pasir_ona
- hujan_sampang_peundeuy
- hujan_cimarga
- hujan_bd_pamarayan
- hujan_ciminyak_cilaki
- hujan_gardu_tanjak

1 variabel terikat:
- debit_bd_pamarayan
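Dengan menggunakan timesteps sebesar 5 hari, dimensi tensor input 3D adalah X=(3954, 5, 9): 3954 sampel (3959 baris − 5 timesteps), 5 timesteps, dan 9 fitur. Jumlah fitur 9 (bukan 8) karena X_columns=None memakai semua kolom, yaitu 8 kolom hujan ditambah kolom debit pada hari-hari sebelumnya. Outputnya adalah y=(3954,).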
$y_t = f(X^{i}_{ts}, y_{ts})$
TIMESTEPS = 5
# Build the input tensor from every column (X_columns=None) and take the
# discharge column as the target.
X, y = tensor_array(
    dataset,
    timesteps=TIMESTEPS,
    X_columns=None,
    y_out=True,
    y_columns=['debit_bd_pamarayan'],
)
X.shape
(3954, 5, 9)
y.shape
(3954,)
print(X)
[[[ 0. 0. 3. ... 0. 0. 90.12] [ 0. 4. 36. ... 5. 32. 97.9 ] [ 4.5 0. 0. ... 3. 21. 88.9 ] [ 0. 0. 46. ... 11. 13. 90.3 ] [ 32. 0. 0. ... 0. 21. 210.06]] [[ 0. 4. 36. ... 5. 32. 97.9 ] [ 4.5 0. 0. ... 3. 21. 88.9 ] [ 0. 0. 46. ... 11. 13. 90.3 ] [ 32. 0. 0. ... 0. 21. 210.06] [ 12. 0. 7. ... 16. 0. 82.9 ]] [[ 4.5 0. 0. ... 3. 21. 88.9 ] [ 0. 0. 46. ... 11. 13. 90.3 ] [ 32. 0. 0. ... 0. 21. 210.06] [ 12. 0. 7. ... 16. 0. 82.9 ] [ 14. 0. 11. ... 25. 22. 274.42]] ... [[ 0. 13. 0. ... 0. 3. 21.27] [ 17. 27. 0. ... 0. 2. 83.27] [ 14. 23.5 4. ... 1. 35. 209.27] [ 12. 15.7 7. ... 0. 14. 134.83] [ 10. 19. 0. ... 0. 0. 81.88]] [[ 17. 27. 0. ... 0. 2. 83.27] [ 14. 23.5 4. ... 1. 35. 209.27] [ 12. 15.7 7. ... 0. 14. 134.83] [ 10. 19. 0. ... 0. 0. 81.88] [ 7. 21.7 11. ... 0. 12. 20.14]] [[ 14. 23.5 4. ... 1. 35. 209.27] [ 12. 15.7 7. ... 0. 14. 134.83] [ 10. 19. 0. ... 0. 0. 81.88] [ 7. 21.7 11. ... 0. 12. 20.14] [ 6.05 17.5 21. ... 0. 10. 208.54]]]
print(y)
[ 82.9 274.42 216.36 ... 20.14 208.54 208.14]
- 20240414 - 1.1.0 / 0.5.0 - Refactor/Documentation
- 20190926 - 1.0.0 - Initial
Source code in this notebook is licensed under the MIT License. Data in this notebook is licensed under the Creative Commons Attribution 4.0 International license.