Notebook

Data cleaning and preparation¶

Imports¶

In [1]:

import pandas as pd
import numpy as np
import datetime

Configure data path variables¶

In [2]:

# Path where the input csv-files are stored
fpath_src_data_dir = 'src_data/'

# Path where the cleaned data is stored
fpath_clean_data_dir = 'clean_data/'

# Path where the data ready for the ML analysis is stored
fpath_prepared_data_dir = 'ready_data/'

In [3]:

!ls {fpath_src_data_dir}*.csv

src_data/hm_animal.csv	   src_data/hm_mastitis_bact_tfpcr.csv
src_data/hm_BCS.csv	   src_data/hm_milkrecording.csv
src_data/hm_ebv.csv	   src_data/hm_NSAIET.csv
src_data/hm_health.csv	   src_data/hm_pregnancy.csv
src_data/hm_lactation.csv

In [4]:

!mkdir -p {fpath_clean_data_dir}
!mkdir -p {fpath_prepared_data_dir}

Definitions to read in data and what columns to ignore¶

According to Thomas Denninger, some columns can be ignored (see below). In a second step, the analysis can and should however be refined, i.e. the information be used nonetheless.

Remarks:

We are not taking into consideration the dataset 'hm_mastitis_bact_tfpcr.csv' in this analysis.
The columns 'bloodBHB', 'bloodNEFA', 'milkAcetone', 'CH4' of the dataset 'Hm_milkrecording.csv' are removed in this analysis. However, they are stored to a separate file in a different Jupyter notebook.

In [5]:

data_files = {'hm_BCS.csv': {'delimiter': ';',
                             'columns_to_ignore': []
                            },
            
              'hm_lactation.csv': {'delimiter': ';',
                                   'columns_to_ignore': ['dry_date_prev_lact']
                                  },

              'hm_NSAIET.csv': {'delimiter': ';',
                                'columns_to_ignore': ['ejaculate',
                                                      'INSEM_SEXINGBEHANDLUNG',
                                                      'AI_organization',
                                                      'INSEM_SEXINGGESCHLECHT',
                                                      'idhrd_anon'
                                                     ]
                               },

#               'hm_mastitis_bact_tfpcr.csv': {'delimiter': ';',
#                                              'columns_to_ignore': ['Ct_value']
#                                             },

              'hm_animal.csv': {'delimiter': ';',
                                'columns_to_ignore': ['idbrd', 'idbrd_sire', 'idbrd_dam']
                               },

              'hm_milkrecording.csv': {'delimiter': ';',
                                       'columns_to_ignore': ['idspectrum_num',
                                                             'AR_PESEE_PESCODEMALADEANALYSE',
                                                             'AR_PESEE_PESCODEMALADEPESEE',
                                                             'bloodBHB',
                                                             'bloodNEFA',
                                                             'milkAcetone',
                                                             'CH4',
                                                             'milk_yield_morning_measured',
                                                             'milk_yield_evening_measured'
                                                            ]
                                      },

              'hm_ebv.csv': {'delimiter': ' ',
                             'columns_to_ignore': ['type', 'nDau', 'pubCode', 'rel', 'edc', 'nHerd', 'IVF', 'FIT', 'GZW', 'MIW']
                            },

              'hm_pregnancy.csv': {'delimiter': ';',
                                   'columns_to_ignore': ['pregnancy_detection_method', 'P_N_value']
                                  },

              'hm_health.csv': {'delimiter': ';',
                                'columns_to_ignore': ['intervening_person', 'infected_udder_quarter', 'leg']
                               }
             }

fnames = list(data_files.keys())

Useful functions to clean data¶

In [6]:

def read_in_csv(file_path, sep):
    """
    Reads in the specified csv-file.
    
    :param file_path: Path to the csv-file
    :param sep: Separator used in the csv-file
    :return: Pandas dataframe
    """
    return pd.read_csv(file_path, sep=sep)


def save_csv(df, file_path, index=False):
    """
    Save the specified csv-file.
    
    :param df: Pandas dataframe
    :param file_path: Path to the output csv-file
    :param index: Boolean value whether or not the first column (bool) is written to file as well (default: False)
    :return: None
    """
    return df.to_csv(file_path, index=index)


def remove_rows_with_missing_values(df):
    """
    Compute the number of missing values per column and remove all rows with missing values.
    
    :param df: Pandas dataframe
    :return: Pandas dataframe
    """
    
    for _, val in enumerate(df.columns):
        mask = df[val].isnull().values
        if np.sum(mask) > 0:
            print('Column: {:}; Removed missing values: {:.2f}%'.format(val, np.sum(mask)/mask.size*100))
            df = df.drop(np.arange(mask.size)[mask], axis=0)
            df = df.reset_index(drop=True)
    return df


def remove_columns_to_be_ignored(df, columns_to_ignore):
    """
    Remove all columns to be ignored.
    
    :param df: Pandas dataframe
    :param columns_to_ignore: List of the column names of the columns to ignore
    :return: Pandas dataframe
    """
    return df.drop(labels=columns_to_ignore, axis=1)


def convert_column_to_datetime(df, column, format='%Y%m%d', convert_to_int=False):
    """
    Convert a date-like column to a pandas datetime-column.
    
    :param df: Pandas dataframe
    :param column: Column name of the datetime column
    :param format: Format of the date-entry (default: YYYYMMDD)
    :param convert_to_int: Bool value whether the column needs to be converted to an
                           integer column first (default: False)
    :return: Pandas dataframe
    """
    if convert_to_int:
        df[column] = df[column].values.astype(np.int64)
    df[column] = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce').values
    return df

Read in src-data & clean it (with modifications as suggested by Thomas Denninger)¶

Mögliche Verbesserungen an den Daten: Jahreszeit (wie auch immer man diese definiert :) ) hinzufügen, wann eine Milchleistungsprüfung / Besamung vorgenommen wurde Information der Art der Milchleistungsprüfmethode hineinnehmen milking_time_morning und milking_time_evening ignorieren Die Information AR_PESEE_PESCODEALPAGE (ob auf Alp oder nicht) weiter verfeinern, da sehr viele Einträge den Wert 0 haben Information der Herde (idhrd_anon) “retten”, z.B. indem man berechnet wie die Milchleistung im Vergleich der mittleren Milchleistung der Herde ist Zeilen entfernen, bei welchen die Spalte label nicht ein “CH” drin hat milk_yield_msrmt_type (Typ wie der Milk Yield gemessen wurde) möglicherweise ignorieren Werte mit nsaiet_type == Belegung oder == ET entfernen Herausfinden, was bei calving_ease schief lief

In [7]:

for _, fname in enumerate(fnames):
    print('--- Reading in {:} ---'.format(fname))
    df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])

    # Convert date columns to datetime objects & sort dataframe
    if fname == 'hm_BCS.csv':
        df = convert_column_to_datetime(df, column='BCS_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'BCS_date'])
    if fname == 'hm_lactation.csv':
        df = convert_column_to_datetime(df, column='calving_date', convert_to_int=False)
        df = convert_column_to_datetime(df, column='dry_date_prev_lact', convert_to_int=True)
        df = df.sort_values(by=['idani_anon', 'calving_date'])
    if fname == 'hm_NSAIET.csv':
        df = convert_column_to_datetime(df, column='nsaiet_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'nsaiet_date'])
    if fname == 'hm_animal.csv':
        df = convert_column_to_datetime(df, column='birth_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'birth_date'])
    if fname == 'hm_milkrecording.csv':
        df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
        df = convert_column_to_datetime(df, column='lab_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])
#     if fname == 'hm_ebv.csv':  # Sort afterwards
#         df = df.sort_values(by=['idani_anon'])
    if fname == 'hm_pregnancy.csv':
        df = convert_column_to_datetime(df, column='pregnancy_detection_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'pregnancy_detection_date'])
    if fname == 'hm_health.csv':
        df = convert_column_to_datetime(df, column='healthevent_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'healthevent_date'])

    # Modifications to individual datasets, as suggested by Thomas Denninger
    if fname == 'hm_milkrecording.csv':
        # Add new column depending on how the milk_yield was measured
        # "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
        new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
        new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
        new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
        df['milk_yield_msrmt_type'] = new_col.copy()

        # Add new column of the fat_24h vs. protein_24h ratio
        new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
        new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
        df['fat_protein_24h_ratio'] = df['fat_24h'] / df['protein_24h']

        # Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
        df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)

    if fname == 'hm_lactation.csv':
#         # In a first step, fill empty 'dry_date_prev_lact' values with 'calving_date' + 305 days
#         values = df['dry_date_prev_lact'].values.copy()
#         mask = df['dry_date_prev_lact'].isnull().values & (df['parity'].values > 1)
#         values[mask] = (df['calving_date'][mask] + pd.DateOffset(days=305)).values
#         df['dry_date_prev_lact'] = values.copy()

        # Fill empty 'calving_ease' with constant value 2.5
        df['calving_ease'] = df['calving_ease'].fillna(2.5)

    if fname == 'hm_NSAIET.csv':
        # Overwrite the AI_technician (= ID of person who performed the artificial insemination) column with
        # "nsaiet_type=Belegung": 5; "nsaiet_type=Besamung": 10; "nsaiet_type=ET": 15
        new_col = np.zeros(df['AI_technician'].size, dtype=np.int)
        new_col[df['nsaiet_type'].values == 'Belegung'] = 5
        new_col[df['nsaiet_type'].values == 'Besamung'] = 10
        new_col[df['nsaiet_type'].values == 'ET'] = 15
        df['AI_technician'] = new_col.copy()

    if fname == 'hm_ebv.csv':
        # Replace the values 'XXXX' in the column 'label' with a 'U' (Unbekannt)
        values = df['label'].values.copy()
        mask = df['label'].values == 'XXXX'
        values[mask] = 'U'
        df['label'] = values.copy()
        
        # Convert 'trait' (=genetic trait) and 'estimate' (=value) to individual columns unsing a pivot table 
        column_names = [col for col in list(df.columns.values) if col not in data_files[fname]['columns_to_ignore']]
        column_names.remove('trait')
        column_names.remove('estimate')
        df = pd.DataFrame(df.pivot_table('estimate', column_names, 'trait').to_records())
        
        # Only consider the traits 'mkg', 'fkg', 'fpr', 'ekg', and 'epr' here and remove the rest
        considered_traits = ['ekg', 'epr', 'fkg', 'fpr', 'mkg', 'per', 'scs']
        
        df = df[column_names+considered_traits]
        
        # Sort dataframe
        df = df.sort_values(by=['idani_anon'])
    
    # Remove columns to be ignored and rows with emtpy values
    if fname != 'hm_ebv.csv':
        df = remove_columns_to_be_ignored(df=df, columns_to_ignore=data_files[fname]['columns_to_ignore'])
    
    # Remove rows with missing values
    df = df.reset_index(drop=True)
    df = remove_rows_with_missing_values(df)
    
    assert(0 == df.isnull().sum().sum())
    print(df)
    print()

    # Save file to clean directory
    save_csv(df=df, file_path=fpath_clean_data_dir+fname)

--- Reading in hm_BCS.csv ---
         BCS_date   BCS       idani_anon
0      2018-10-23  3.00  CHE000000000561
1      2017-02-16  2.50  CHE000000000781
2      2013-12-19  3.00  CHE000000002000
3      2015-02-05  3.00  CHE000000002000
4      2010-11-05  3.25  CHE000000002251
...           ...   ...              ...
171212 2016-10-18  2.75  CHE000099998376
171213 2018-03-09  2.75  CHE000099999361
171214 2019-05-28  2.75  CHE000099999361
171215 2020-03-05  2.50  CHE000099999361
171216 2013-06-04  3.25  CHE000099999926

[171217 rows x 3 columns]

--- Reading in hm_lactation.csv ---
        parity calving_date calving_ease       idani_anon
0            1   2018-09-06            2  CHE000000000561
1            2   2019-09-15            2  CHE000000000561
2            1   2016-09-07            2  CHE000000000781
3            2   2017-08-05            1  CHE000000000781
4            3   2018-10-18          2.5  CHE000000000781
...        ...          ...          ...              ...
637269       1   2013-02-26            2  CHE000099999926
637270       2   2014-03-24            1  CHE000099999926
637271       3   2015-03-05            2  CHE000099999926
637272       4   2016-06-18            3  CHE000099999926
637273       5   2017-11-14            2  CHE000099999926

[637274 rows x 4 columns]

--- Reading in hm_NSAIET.csv ---
         parity nsaiet_date nsaiet_type  AI_technician       idani_anon  \
0             0  2017-11-05    Besamung             10  CHE000000000561   
1             0  2017-11-28    Besamung             10  CHE000000000561   
2             1  2018-11-16    Besamung             10  CHE000000000561   
3             1  2018-12-05    Besamung             10  CHE000000000561   
4             2  2019-12-09    Besamung             10  CHE000000000561   
...         ...         ...         ...            ...              ...   
1366133       1  2013-06-13    Besamung             10  CHE000099999926   
1366134       2  2014-05-19    Besamung             10  CHE000099999926   
1366135       3  2015-09-02    Belegung              5  CHE000099999926   
1366136       4  2016-10-14    Besamung             10  CHE000099999926   
1366137       4  2017-01-24    Besamung             10  CHE000099999926   

        idani_anon_aisire  
0         CHE000020282537  
1         CHE000020282537  
2         CHE000002123235  
3         CHE000002123235  
4         CHE000001110094  
...                   ...  
1366133   CHE000025255973  
1366134   CHE000038915903  
1366135   CHE000099239508  
1366136   CHE000045797183  
1366137   CHE000079801145  

[1366138 rows x 6 columns]

--- Reading in hm_animal.csv ---
       birth_date brd_abbr_icar       idani_anon
0      2016-03-08           HOL  CHE000000000559
1      2016-02-27           HOL  CHE000000000561
2      2011-05-09           HOL  CHE000000000620
3      2014-06-23           HOL  CHE000000000781
4      2015-11-25           HOL  CHE000000001494
...           ...           ...              ...
275766 2014-01-06           HOL  CHE000099998134
275767 2016-11-28           HOL  CHE000099998152
275768 2013-11-12           HOL  CHE000099998376
275769 2015-09-07           HOL  CHE000099999361
275770 2011-01-26           HOL  CHE000099999926

[275771 rows x 3 columns]

--- Reading in hm_milkrecording.csv ---
Column: milking_time_morning; Removed missing values: 0.04%
Column: milking_time_evening; Removed missing values: 0.01%
Column: lab_date; Removed missing values: 0.00%
Column: DIM; Removed missing values: 0.02%
Column: lactose_24h; Removed missing values: 0.00%
Column: scc_24h; Removed missing values: 0.00%
Column: urea_24h; Removed missing values: 0.05%
        mlksmpl_date  milking_time_morning  milking_time_evening   lab_date  \
0         2018-10-10               50000.0              163000.0 2018-10-11   
1         2018-11-13               50000.0              163000.0 2018-11-15   
2         2018-12-18               50000.0              163000.0 2018-12-19   
3         2019-01-21               50000.0              163000.0 2019-01-23   
4         2019-02-23               50000.0              163000.0 2019-02-26   
...              ...                   ...                   ...        ...   
5116724   2018-08-06               55000.0              172000.0 2018-08-08   
5116725   2018-09-07               55000.0              171000.0 2018-09-10   
5116726   2018-10-10               53000.0              171000.0 2018-10-11   
5116727   2018-11-12               53000.0              165000.0 2018-11-14   
5116728   2018-12-14               53500.0              165000.0 2018-12-17   

           DIM pruefmethode melkmethode  milk_yield_24h  fat_24h  protein_24h  \
0         34.0          AT4      Normal            34.2     36.5         24.9   
1         68.0          AT4      Normal            34.1     34.9         29.2   
2        103.0          AT4      Normal            30.9     37.2         26.7   
3        137.0          AT4      Normal            36.3     36.8         31.1   
4        170.0          AT4      Normal            35.3     34.2         29.6   
...        ...          ...         ...             ...      ...          ...   
5116724  265.0          AT4      Normal            27.9     43.5         32.7   
5116725  297.0          AT4      Normal            30.7     58.8         38.4   
5116726  330.0          AT4      Normal            25.3     52.2         40.6   
5116727  363.0          AT4      Normal            19.8     52.8         43.3   
5116728  395.0          AT4      Normal            14.7     53.6         43.6   

         lactose_24h  scc_24h  urea_24h  AR_PESEE_PESCODEALPAGE  \
0               49.8     21.0       2.4                     0.0   
1               50.3     80.0       2.2                     0.0   
2               48.3    132.0       1.6                     0.0   
3               50.0    106.0       2.1                     0.0   
4               48.6    133.0       1.5                     0.0   
...              ...      ...       ...                     ...   
5116724         44.1   1004.0       3.3                     0.0   
5116725         44.6    517.0       3.2                     0.0   
5116726         43.5    235.0       2.9                     0.0   
5116727         43.6    166.0       3.2                     0.0   
5116728         41.7    163.0       2.9                     0.0   

              idani_anon       idhrd_anon  milk_yield_msrmt_type  \
0        CHE000000000561  CHE000000095710                      2   
1        CHE000000000561  CHE000000095710                      3   
2        CHE000000000561  CHE000000095710                      2   
3        CHE000000000561  CHE000000095710                      3   
4        CHE000000000561  CHE000000095710                      2   
...                  ...              ...                    ...   
5116724  CHE000099999926  CHE000000031281                      2   
5116725  CHE000099999926  CHE000000031281                      3   
5116726  CHE000099999926  CHE000000031281                      2   
5116727  CHE000099999926  CHE000000031281                      3   
5116728  CHE000099999926  CHE000000031281                      2   

         fat_protein_24h_ratio  
0                     1.465863  
1                     1.195205  
2                     1.393258  
3                     1.183280  
4                     1.155405  
...                        ...  
5116724               1.330275  
5116725               1.531250  
5116726               1.285714  
5116727               1.219400  
5116728               1.229358  

[5116729 rows x 18 columns]

--- Reading in hm_ebv.csv ---
Column: ekg; Removed missing values: 63.15%
Column: scs; Removed missing values: 0.00%
        base label       idani_anon   ekg   epr   fkg   fpr     mkg    per  \
0       HO20     A  CHE000000000559 -23.0 -0.02 -31.0 -0.07  -636.0   93.0   
1       HO20    CH  CHE000000000561  18.0 -0.23  26.0 -0.24  1173.0   93.0   
2       HO20     A  CHE000000000620   5.0  0.11  -1.0  0.05  -147.0   99.0   
3       HO20    CH  CHE000000000781   0.0 -0.07  -2.0 -0.11   179.0   95.0   
4       HO20    CH  CHE000000001494   3.0  0.11  -4.0  0.04  -179.0   93.0   
...      ...   ...              ...   ...   ...   ...   ...     ...    ...   
271769  HO20    CH  CHE000099998134   6.0  0.11  18.0  0.26   -82.0   89.0   
271770  HO20    CH  CHE000099998152   8.0 -0.03   3.0 -0.11   311.0   89.0   
271771  HO20    CH  CHE000099998376   8.0  0.16  46.0  0.64  -167.0  104.0   
271772  HO20    CH  CHE000099999361  10.0 -0.02  12.0 -0.03   365.0  103.0   
271773  HO20    CH  CHE000099999926  -1.0 -0.01  11.0  0.13    -3.0   94.0   

          scs  
0       102.0  
1        89.0  
2        94.0  
3       106.0  
4       111.0  
...       ...  
271769   90.0  
271770  110.0  
271771   99.0  
271772  111.0  
271773   98.0  

[271774 rows x 10 columns]

--- Reading in hm_pregnancy.csv ---
       pregnancy_detection_date   finding       idani_anon       idhrd_anon
0                    2019-01-08  pregnant  CHE000000000561  CHE000000095710
1                    2020-03-11      open  CHE000000000561  CHE000000095710
2                    2018-01-09      open  CHE000000000781  CHE000000072353
3                    2018-03-13  pregnant  CHE000000000781  CHE000000072353
4                    2018-03-20  pregnant  CHE000000001494  CHE000000052966
...                         ...       ...              ...              ...
143945               2018-11-27  pregnant  CHE000099992995  CHE000000014814
143946               2020-01-17      open  CHE000099993267  CHE000000035579
143947               2019-11-21  pregnant  CHE000099998152  CHE000000064265
143948               2020-01-27  pregnant  CHE000099998152  CHE000000064265
143949               2018-05-01  pregnant  CHE000099999926  CHE000000031281

[143950 rows x 4 columns]

--- Reading in hm_health.csv ---
Column: idhrd_anon; Removed missing values: 0.07%
      hecode_ASR healthevent_date       idani_anon       idhrd_anon
0        10.7.1.       2018-11-19  CHE000000005877  CHE000000079291
1         2.1.1.       2018-12-20  CHE000000005877  CHE000000079291
2          10.4.       2019-09-06  CHE000000005877  CHE000000079291
3           3.5.       2020-03-14  CHE000000005877  CHE000000079291
4           6.1.       2014-06-02  CHE000000006772  CHE000000055108
...          ...              ...              ...              ...
53730     2.1.2.       2019-06-15  CHE000099990201  CHE000000088759
53731       2.5.       2018-01-30  CHE000099990433  CHE000000059284
53732     2.1.1.       2020-03-18  CHE000099992995  CHE000000014814
53733    9.3.99.       2013-11-25  CHE000099998376  CHE000000086408
53734      10.2.       2013-12-19  CHE000099998376  CHE000000086408

[53735 rows x 4 columns]

Make an extra file of the hm_milkrecording.csv blood, methane, and acetone values (for other challenge)¶

In [8]:

fname = 'hm_milkrecording.csv'
columns_to_ignore = ['idspectrum_num',
                     'AR_PESEE_PESCODEMALADEANALYSE',
                     'AR_PESEE_PESCODEMALADEPESEE',
                     'milk_yield_morning_measured',
                     'milk_yield_evening_measured'
                    ]

df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])

# Convert date columns to datetime objects & sort dataframe
df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])

# Add new column depending on how the milk_yield was measured
# "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['milk_yield_msrmt_type'] = new_col.copy()

# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)

# Remove columns to be ignored
df = remove_columns_to_be_ignored(df=df, columns_to_ignore=columns_to_ignore)

# Remove rows with emtpy values
df = df.reset_index(drop=True)
df = remove_rows_with_missing_values(df)

assert(0 == df.isnull().sum().sum())

# Save file to clean directory
save_csv(df=df, file_path=fpath_clean_data_dir+'hm_milkrecording_bloodvalues.csv')

Column: milking_time_morning; Removed missing values: 0.04%
Column: milking_time_evening; Removed missing values: 0.01%
Column: DIM; Removed missing values: 0.02%
Column: lactose_24h; Removed missing values: 0.00%
Column: scc_24h; Removed missing values: 0.00%
Column: urea_24h; Removed missing values: 0.05%
Column: bloodBHB; Removed missing values: 88.89%
Column: CH4; Removed missing values: 5.46%

In [ ]: