"""
Read and inspect BMKG (Meteorology, Climatology, and Geophysics Agency) data.

For more information, refer to the manual:
https://gist.github.com/taruma/b00880905f297013f046dad95dc2e284

Functions:
- read_bmkg_excel(io): Read BMKG data from an Excel file.
- _has_nan_values(dataframe): Check whether the dataset contains any NaN.
- _get_missing_data_indices(nan_indicator_vector): Positions of the True
  entries of a boolean vector.
- _get_nan_indices_by_column(dataframe): Positions of NaN values for each
  column of a DataFrame.
- _get_unrecorded_indices(dataframe): Positions of the values 8888 / 9999
  for each column of a DataFrame.
- _get_nan_indices_if_exists(dataframe): Same as
  _get_nan_indices_by_column, or None when there is no NaN at all.
- _get_columns_with_nan_values(dataframe): Columns that contain NaN.
- _group_consecutive_elements(input_list): Group runs of consecutive
  elements of a list.
- _format_group_indices(group_list, indices=None, format_date="%Y%m%d",
  date_range_format="{}-{}"): Turn grouped positions into labels.

Deprecated Functions:
- _read_bmkg(*args, **kwargs): Deprecated version of read_bmkg_excel.
- _have_nan(*args, **kwargs): Deprecated version of _has_nan_values.
- _get_index1D(*args, **kwargs): Deprecated version of
  _get_missing_data_indices.
- _get_nan(*args, **kwargs): Deprecated version of
  _get_nan_indices_by_column.
- _get_missing(*args, **kwargs): Deprecated version of
  _get_unrecorded_indices.
- _check_nan(*args, **kwargs): Deprecated version of
  _get_nan_indices_if_exists.
- _get_nan_columns(*args, **kwargs): Deprecated version of
  _get_columns_with_nan_values.
- _group_as_list(*args, **kwargs): Deprecated version of
  _group_consecutive_elements.
- _group_as_index(group_list, index=None, date_format="%Y%m%d",
  format_date="{}-{}"): Deprecated version of _format_group_indices.
"""

import functools
import warnings
from itertools import groupby
from operator import itemgetter
from typing import List

import numpy as np
import pandas as pd

# Upload the BMKG Excel file (the demo at the bottom of this file reads it).
FILE_NAME = "data_bmkg_jawa_barat_96783_1975_2000_stasiun_geofisika_bandung.xlsx"
DATASET_PATH = '/content/' + FILE_NAME


def deprecated(new_func_name):
    """
    Decorator to mark a function as deprecated.

    Parameters:
    - new_func_name (str): The name of the function to use instead.

    Returns:
    - decorator (function): A decorator that wraps the target function so
      that every call emits a ``DeprecationWarning`` before delegating.

    Example:
        @deprecated("new_function")
        def old_function():
            pass

    Calling ``old_function`` then warns
    "old_function is deprecated, use new_function instead".
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"{func.__name__} is deprecated, use {new_func_name} instead",
                DeprecationWarning,
                # Attribute the warning to the code that called the
                # deprecated function, not to this wrapper.
                stacklevel=2,
            )
            return func(*args, **kwargs)

        return wrapper

    return decorator


def read_bmkg_excel(io) -> pd.DataFrame:
    """
    Read BMKG data from an Excel file.

    The first 8 rows and the last 16 rows of the sheet are skipped, the
    first remaining row is used as the header and the first column becomes
    the index, parsed as dates in ``%d-%m-%Y`` format.

    Parameters:
    - io: str or file-like object
        The file path or file-like object to read the Excel data from.

    Returns:
    - pandas.DataFrame
        The data read from the Excel file.
    """
    return pd.read_excel(
        io,
        skiprows=8,
        skipfooter=16,
        header=0,
        index_col=0,
        parse_dates=True,
        date_format="%d-%m-%Y",
    )


def _has_nan_values(dataframe) -> bool:
    """
    Check if the given dataset contains any NaN values.

    Parameters:
        dataframe (pandas.DataFrame): The dataset to check for NaN values.

    Returns:
        bool: True if the dataset contains NaN values, False otherwise.
    """
    return bool(dataframe.isna().any().any())


def _get_missing_data_indices(nan_indicator_vector) -> np.ndarray:
    """
    Get the positions of the truthy entries of an indicator vector.

    Parameters:
        nan_indicator_vector (numpy.ndarray): A boolean array marking the
            missing entries.

    Returns:
        numpy.ndarray: A flat array with the positions where the indicator
        is true.
    """
    return np.argwhere(nan_indicator_vector).reshape(-1)


def _get_nan_indices_by_column(dataframe) -> dict:
    """
    Get the positions of missing values (NaN) for each column.

    Parameters:
        dataframe (pandas.DataFrame): The input DataFrame.

    Returns:
        dict: Maps every column name to the list of integer positions at
        which that column is NaN (an empty list if it has none).
    """
    return {
        col: _get_missing_data_indices(dataframe[col].isna().values).tolist()
        for col in dataframe.columns
    }


def _get_unrecorded_indices(dataframe) -> dict:
    """
    Get the positions of unrecorded data for each column.

    A value counts as unrecorded when it equals 8888 or 9999.

    Parameters:
        dataframe (pandas.DataFrame): The input dataframe.

    Returns:
        dict: Maps every column name to a numpy array holding the integer
        positions of the unrecorded values in that column.
    """
    unrecorded_indices = {}
    for col in dataframe.columns:
        # NOTE(review): 8888 / 9999 are presumably BMKG's "not measured" /
        # "no data" codes - confirm against the data source's legend.
        masking = (dataframe[col] == 8888) | (dataframe[col] == 9999)
        unrecorded_indices[col] = _get_missing_data_indices(masking.values)
    return unrecorded_indices


def _get_nan_indices_if_exists(dataframe):
    """
    Return the per-column NaN positions, or None when there is no NaN.

    Parameters:
        dataframe (pandas.DataFrame): The input dataframe.

    Returns:
        dict or None: The result of ``_get_nan_indices_by_column`` when the
        dataframe contains at least one NaN, otherwise None.
    """
    if not _has_nan_values(dataframe):
        return None
    return _get_nan_indices_by_column(dataframe)


def _get_columns_with_nan_values(dataframe) -> list:
    """
    Get the columns that contain at least one NaN value.

    Parameters:
        dataframe (pandas.DataFrame): The input dataframe.

    Returns:
        list: The labels of the columns with NaN values, in column order.
    """
    return dataframe.columns[dataframe.isna().any()].tolist()


def _group_consecutive_elements(input_list: List) -> List[List]:
    """
    Groups consecutive elements in the input list.

    Args:
        input_list (List): The list of elements to be grouped. Elements
            must support subtraction from an integer.

    Returns:
        List[List]: A list of lists, where each inner list contains a run
        of consecutive elements from the input list.

    Example:
        >>> input_list = [1, 2, 3, 5, 6, 8, 9]
        >>> _group_consecutive_elements(input_list)
        [[1, 2, 3], [5, 6], [8, 9]]
    """
    # based on https://stackoverflow.com/a/15276206
    # Within a run of consecutive values, (position - value) is constant,
    # so grouping on that difference splits the list at every gap.
    return [
        sorted(map(itemgetter(1), members))
        for _, members in groupby(
            enumerate(input_list), lambda pair: pair[0] - pair[1]
        )
    ]


def _format_group_indices(
    group_list, indices=None, format_date="%Y%m%d", date_range_format="{}-{}"
):
    """
    Formats the group indices based on the given parameters.

    Args:
        group_list (list): The list of groups; each group is a non-empty
            sequence of integer positions.
        indices (pd.Index or pd.DatetimeIndex, optional): The index whose
            labels replace the positions. When None, the positions
            themselves are used as labels. Defaults to None.
        format_date (str, optional): The ``strftime`` format applied to the
            labels when ``indices`` is a ``pd.DatetimeIndex``.
            Defaults to "%Y%m%d".
        date_range_format (str, optional): The format string combining the
            first and last label of a multi-element group.
            Defaults to "{}-{}".

    Returns:
        list: One entry per group: the single label for a one-element
        group, otherwise ``date_range_format`` filled with the first and
        last label.
    """
    is_date_index = isinstance(indices, pd.DatetimeIndex)

    def label_of(position):
        # Without an index there is nothing to look up; the original
        # code crashed here on the declared default ``indices=None``.
        if indices is None:
            return position
        label = indices[position]
        return label.strftime(format_date) if is_date_index else label

    formatted_indices = []
    for item in group_list:
        if len(item) == 1:
            formatted_indices.append(label_of(item[0]))
        else:
            formatted_indices.append(
                date_range_format.format(label_of(item[0]), label_of(item[-1]))
            )
    return formatted_indices


# for backward compatibility
@deprecated("read_bmkg_excel")
def _read_bmkg(*args, **kwargs):
    return read_bmkg_excel(*args, **kwargs)


@deprecated("_has_nan_values")
def _have_nan(*args, **kwargs):
    return _has_nan_values(*args, **kwargs)


@deprecated("_get_missing_data_indices")
def _get_index1D(*args, **kwargs):
    return _get_missing_data_indices(*args, **kwargs)


@deprecated("_get_nan_indices_by_column")
def _get_nan(*args, **kwargs):
    return _get_nan_indices_by_column(*args, **kwargs)


@deprecated("_get_unrecorded_indices")
def _get_missing(*args, **kwargs):
    return _get_unrecorded_indices(*args, **kwargs)


@deprecated("_get_nan_indices_if_exists")
def _check_nan(*args, **kwargs):
    return _get_nan_indices_if_exists(*args, **kwargs)


@deprecated("_get_columns_with_nan_values")
def _get_nan_columns(*args, **kwargs):
    return _get_columns_with_nan_values(*args, **kwargs)


@deprecated("_group_consecutive_elements")
def _group_as_list(*args, **kwargs):
    return _group_consecutive_elements(*args, **kwargs)


@deprecated("_format_group_indices")
def _group_as_index(
    group_list, index=None, date_format="%Y%m%d", format_date="{}-{}"
):
    # The old keyword names differ from the new ones: the old
    # ``date_format`` is the new ``format_date`` and the old
    # ``format_date`` is the new ``date_range_format``.
    return _format_group_indices(
        group_list,
        indices=index,
        format_date=date_format,
        date_range_format=format_date,
    )


# Demo / walkthrough using the deprecated aliases. It is guarded so that
# importing this module does not try to read DATASET_PATH. Bare expressions
# only display their value in an interactive session; in a plain script
# their results are discarded.
if __name__ == "__main__":
    dataset = _read_bmkg(DATASET_PATH)
    dataset.head()
    dataset.tail()

    _have_nan(dataset)
    _get_index1D(dataset['RH_avg'].isna().values)
    _get_nan(dataset).keys()
    print(_get_nan(dataset)['RH_avg'])
    _get_nan_columns(dataset)
    _check_nan(dataset).items()

    # When the data has no NaN values
    print(_check_nan(dataset.drop(_get_nan_columns(dataset), axis=1)))

    missing_dict = _get_nan(dataset)
    missing_RH_avg = missing_dict['RH_avg']
    print(missing_RH_avg)
    print(_group_as_list(missing_RH_avg))
    _group_as_index(
        _group_as_list(missing_RH_avg),
        index=dataset.index,
        date_format='%d %b %Y',
    )

    _get_missing(dataset)
    dataset.iloc[_get_missing(dataset)['RR']]

    # The original relied on the notebook-only ``_`` (last displayed
    # output), which is undefined in a script; name the value explicitly.
    rr_groups = _group_as_list(_get_missing(dataset)['RR'])
    _group_as_index(
        rr_groups,
        index=dataset.index,
        date_format='%d %b %Y',
        format_date='{} sampai {}',
    )