import pandas as pd
import numpy as np
import datetime
# Path where the input csv-files are stored
fpath_src_data_dir = 'src_data/'
# Path where the cleaned data is stored
fpath_clean_data_dir = 'clean_data/'
# Path where the data ready for the ML analysis is stored
fpath_prepared_data_dir = 'ready_data/'
!ls {fpath_src_data_dir}*.csv
src_data/hm_animal.csv src_data/hm_mastitis_bact_tfpcr.csv src_data/hm_BCS.csv src_data/hm_milkrecording.csv src_data/hm_ebv.csv src_data/hm_NSAIET.csv src_data/hm_health.csv src_data/hm_pregnancy.csv src_data/hm_lactation.csv
!mkdir -p {fpath_clean_data_dir}
!mkdir -p {fpath_prepared_data_dir}
According to Thomas Denninger, some columns can be ignored (see below). In a second step, the analysis can and should however be refined, i.e. the information be used nonetheless.
Remarks:
data_files = {'hm_BCS.csv': {'delimiter': ';',
'columns_to_ignore': []
},
'hm_lactation.csv': {'delimiter': ';',
'columns_to_ignore': ['dry_date_prev_lact']
},
'hm_NSAIET.csv': {'delimiter': ';',
'columns_to_ignore': ['ejaculate',
'INSEM_SEXINGBEHANDLUNG',
'AI_organization',
'INSEM_SEXINGGESCHLECHT',
'idhrd_anon'
]
},
# 'hm_mastitis_bact_tfpcr.csv': {'delimiter': ';',
# 'columns_to_ignore': ['Ct_value']
# },
'hm_animal.csv': {'delimiter': ';',
'columns_to_ignore': ['idbrd', 'idbrd_sire', 'idbrd_dam']
},
'hm_milkrecording.csv': {'delimiter': ';',
'columns_to_ignore': ['idspectrum_num',
'AR_PESEE_PESCODEMALADEANALYSE',
'AR_PESEE_PESCODEMALADEPESEE',
'bloodBHB',
'bloodNEFA',
'milkAcetone',
'CH4',
'milk_yield_morning_measured',
'milk_yield_evening_measured'
]
},
'hm_ebv.csv': {'delimiter': ' ',
'columns_to_ignore': ['type', 'nDau', 'pubCode', 'rel', 'edc', 'nHerd', 'IVF', 'FIT', 'GZW', 'MIW']
},
'hm_pregnancy.csv': {'delimiter': ';',
'columns_to_ignore': ['pregnancy_detection_method', 'P_N_value']
},
'hm_health.csv': {'delimiter': ';',
'columns_to_ignore': ['intervening_person', 'infected_udder_quarter', 'leg']
}
}
fnames = list(data_files.keys())
def read_in_csv(file_path, sep):
"""
Reads in the specified csv-file.
:param file_path: Path to the csv-file
:param sep: Separator used in the csv-file
:return: Pandas dataframe
"""
return pd.read_csv(file_path, sep=sep)
def save_csv(df, file_path, index=False):
"""
Save the specified csv-file.
:param df: Pandas dataframe
:param file_path: Path to the output csv-file
:param index: Boolean value whether or not the first column (bool) is written to file as well (default: False)
:return: None
"""
return df.to_csv(file_path, index=index)
def remove_rows_with_missing_values(df):
"""
Compute the number of missing values per column and remove all rows with missing values.
:param df: Pandas dataframe
:return: Pandas dataframe
"""
for _, val in enumerate(df.columns):
mask = df[val].isnull().values
if np.sum(mask) > 0:
print('Column: {:}; Removed missing values: {:.2f}%'.format(val, np.sum(mask)/mask.size*100))
df = df.drop(np.arange(mask.size)[mask], axis=0)
df = df.reset_index(drop=True)
return df
def remove_columns_to_be_ignored(df, columns_to_ignore):
"""
Remove all columns to be ignored.
:param df: Pandas dataframe
:param columns_to_ignore: List of the column names of the columns to ignore
:return: Pandas dataframe
"""
return df.drop(labels=columns_to_ignore, axis=1)
def convert_column_to_datetime(df, column, format='%Y%m%d', convert_to_int=False):
"""
Convert a date-like column to a pandas datetime-column.
:param df: Pandas dataframe
:param column: Column name of the datetime column
:param format: Format of the date-entry (default: YYYYMMDD)
:param convert_to_int: Bool value whether the column needs to be converted to an
integer column first (default: False)
:return: Pandas dataframe
"""
if convert_to_int:
df[column] = df[column].values.astype(np.int64)
df[column] = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce').values
return df
Mögliche Verbesserungen an den Daten: Jahreszeit (wie auch immer man diese definiert :) ) hinzufügen, wann eine Milchleistungsprüfung / Besamung vorgenommen wurde Information der Art der Milchleistungsprüfmethode hineinnehmen milking_time_morning und milking_time_evening ignorieren Die Information AR_PESEE_PESCODEALPAGE (ob auf Alp oder nicht) weiter verfeinern, da sehr viele Einträge den Wert 0 haben Information der Herde (idhrd_anon) “retten”, z.B. indem man berechnet wie die Milchleistung im Vergleich der mittleren Milchleistung der Herde ist Zeilen entfernen, bei welchen die Spalte label nicht ein “CH” drin hat milk_yield_msrmt_type (Typ wie der Milk Yield gemessen wurde) möglicherweise ignorieren Werte mit nsaiet_type == Belegung oder == ET entfernen Herausfinden, was bei calving_ease schief lief
for _, fname in enumerate(fnames):
print('--- Reading in {:} ---'.format(fname))
df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])
# Convert date columns to datetime objects & sort dataframe
if fname == 'hm_BCS.csv':
df = convert_column_to_datetime(df, column='BCS_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'BCS_date'])
if fname == 'hm_lactation.csv':
df = convert_column_to_datetime(df, column='calving_date', convert_to_int=False)
df = convert_column_to_datetime(df, column='dry_date_prev_lact', convert_to_int=True)
df = df.sort_values(by=['idani_anon', 'calving_date'])
if fname == 'hm_NSAIET.csv':
df = convert_column_to_datetime(df, column='nsaiet_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'nsaiet_date'])
if fname == 'hm_animal.csv':
df = convert_column_to_datetime(df, column='birth_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'birth_date'])
if fname == 'hm_milkrecording.csv':
df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
df = convert_column_to_datetime(df, column='lab_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])
# if fname == 'hm_ebv.csv': # Sort afterwards
# df = df.sort_values(by=['idani_anon'])
if fname == 'hm_pregnancy.csv':
df = convert_column_to_datetime(df, column='pregnancy_detection_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'pregnancy_detection_date'])
if fname == 'hm_health.csv':
df = convert_column_to_datetime(df, column='healthevent_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'healthevent_date'])
# Modifications to individual datasets, as suggested by Thomas Denninger
if fname == 'hm_milkrecording.csv':
# Add new column depending on how the milk_yield was measured
# "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['milk_yield_msrmt_type'] = new_col.copy()
# Add new column of the fat_24h vs. protein_24h ratio
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['fat_protein_24h_ratio'] = df['fat_24h'] / df['protein_24h']
# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)
if fname == 'hm_lactation.csv':
# # In a first step, fill empty 'dry_date_prev_lact' values with 'calving_date' + 305 days
# values = df['dry_date_prev_lact'].values.copy()
# mask = df['dry_date_prev_lact'].isnull().values & (df['parity'].values > 1)
# values[mask] = (df['calving_date'][mask] + pd.DateOffset(days=305)).values
# df['dry_date_prev_lact'] = values.copy()
# Fill empty 'calving_ease' with constant value 2.5
df['calving_ease'] = df['calving_ease'].fillna(2.5)
if fname == 'hm_NSAIET.csv':
# Overwrite the AI_technician (= ID of person who performed the artificial insemination) column with
# "nsaiet_type=Belegung": 5; "nsaiet_type=Besamung": 10; "nsaiet_type=ET": 15
new_col = np.zeros(df['AI_technician'].size, dtype=np.int)
new_col[df['nsaiet_type'].values == 'Belegung'] = 5
new_col[df['nsaiet_type'].values == 'Besamung'] = 10
new_col[df['nsaiet_type'].values == 'ET'] = 15
df['AI_technician'] = new_col.copy()
if fname == 'hm_ebv.csv':
# Replace the values 'XXXX' in the column 'label' with a 'U' (Unbekannt)
values = df['label'].values.copy()
mask = df['label'].values == 'XXXX'
values[mask] = 'U'
df['label'] = values.copy()
# Convert 'trait' (=genetic trait) and 'estimate' (=value) to individual columns unsing a pivot table
column_names = [col for col in list(df.columns.values) if col not in data_files[fname]['columns_to_ignore']]
column_names.remove('trait')
column_names.remove('estimate')
df = pd.DataFrame(df.pivot_table('estimate', column_names, 'trait').to_records())
# Only consider the traits 'mkg', 'fkg', 'fpr', 'ekg', and 'epr' here and remove the rest
considered_traits = ['ekg', 'epr', 'fkg', 'fpr', 'mkg', 'per', 'scs']
df = df[column_names+considered_traits]
# Sort dataframe
df = df.sort_values(by=['idani_anon'])
# Remove columns to be ignored and rows with emtpy values
if fname != 'hm_ebv.csv':
df = remove_columns_to_be_ignored(df=df, columns_to_ignore=data_files[fname]['columns_to_ignore'])
# Remove rows with missing values
df = df.reset_index(drop=True)
df = remove_rows_with_missing_values(df)
assert(0 == df.isnull().sum().sum())
print(df)
print()
# Save file to clean directory
save_csv(df=df, file_path=fpath_clean_data_dir+fname)
--- Reading in hm_BCS.csv --- BCS_date BCS idani_anon 0 2018-10-23 3.00 CHE000000000561 1 2017-02-16 2.50 CHE000000000781 2 2013-12-19 3.00 CHE000000002000 3 2015-02-05 3.00 CHE000000002000 4 2010-11-05 3.25 CHE000000002251 ... ... ... ... 171212 2016-10-18 2.75 CHE000099998376 171213 2018-03-09 2.75 CHE000099999361 171214 2019-05-28 2.75 CHE000099999361 171215 2020-03-05 2.50 CHE000099999361 171216 2013-06-04 3.25 CHE000099999926 [171217 rows x 3 columns] --- Reading in hm_lactation.csv --- parity calving_date calving_ease idani_anon 0 1 2018-09-06 2 CHE000000000561 1 2 2019-09-15 2 CHE000000000561 2 1 2016-09-07 2 CHE000000000781 3 2 2017-08-05 1 CHE000000000781 4 3 2018-10-18 2.5 CHE000000000781 ... ... ... ... ... 637269 1 2013-02-26 2 CHE000099999926 637270 2 2014-03-24 1 CHE000099999926 637271 3 2015-03-05 2 CHE000099999926 637272 4 2016-06-18 3 CHE000099999926 637273 5 2017-11-14 2 CHE000099999926 [637274 rows x 4 columns] --- Reading in hm_NSAIET.csv --- parity nsaiet_date nsaiet_type AI_technician idani_anon \ 0 0 2017-11-05 Besamung 10 CHE000000000561 1 0 2017-11-28 Besamung 10 CHE000000000561 2 1 2018-11-16 Besamung 10 CHE000000000561 3 1 2018-12-05 Besamung 10 CHE000000000561 4 2 2019-12-09 Besamung 10 CHE000000000561 ... ... ... ... ... ... 1366133 1 2013-06-13 Besamung 10 CHE000099999926 1366134 2 2014-05-19 Besamung 10 CHE000099999926 1366135 3 2015-09-02 Belegung 5 CHE000099999926 1366136 4 2016-10-14 Besamung 10 CHE000099999926 1366137 4 2017-01-24 Besamung 10 CHE000099999926 idani_anon_aisire 0 CHE000020282537 1 CHE000020282537 2 CHE000002123235 3 CHE000002123235 4 CHE000001110094 ... ... 1366133 CHE000025255973 1366134 CHE000038915903 1366135 CHE000099239508 1366136 CHE000045797183 1366137 CHE000079801145 [1366138 rows x 6 columns] --- Reading in hm_animal.csv --- birth_date brd_abbr_icar idani_anon 0 2016-03-08 HOL CHE000000000559 1 2016-02-27 HOL CHE000000000561 2 2011-05-09 HOL CHE000000000620 3 2014-06-23 HOL CHE000000000781 4 2015-11-25 HOL CHE000000001494 ... ... ... ... 275766 2014-01-06 HOL CHE000099998134 275767 2016-11-28 HOL CHE000099998152 275768 2013-11-12 HOL CHE000099998376 275769 2015-09-07 HOL CHE000099999361 275770 2011-01-26 HOL CHE000099999926 [275771 rows x 3 columns] --- Reading in hm_milkrecording.csv --- Column: milking_time_morning; Removed missing values: 0.04% Column: milking_time_evening; Removed missing values: 0.01% Column: lab_date; Removed missing values: 0.00% Column: DIM; Removed missing values: 0.02% Column: lactose_24h; Removed missing values: 0.00% Column: scc_24h; Removed missing values: 0.00% Column: urea_24h; Removed missing values: 0.05% mlksmpl_date milking_time_morning milking_time_evening lab_date \ 0 2018-10-10 50000.0 163000.0 2018-10-11 1 2018-11-13 50000.0 163000.0 2018-11-15 2 2018-12-18 50000.0 163000.0 2018-12-19 3 2019-01-21 50000.0 163000.0 2019-01-23 4 2019-02-23 50000.0 163000.0 2019-02-26 ... ... ... ... ... 5116724 2018-08-06 55000.0 172000.0 2018-08-08 5116725 2018-09-07 55000.0 171000.0 2018-09-10 5116726 2018-10-10 53000.0 171000.0 2018-10-11 5116727 2018-11-12 53000.0 165000.0 2018-11-14 5116728 2018-12-14 53500.0 165000.0 2018-12-17 DIM pruefmethode melkmethode milk_yield_24h fat_24h protein_24h \ 0 34.0 AT4 Normal 34.2 36.5 24.9 1 68.0 AT4 Normal 34.1 34.9 29.2 2 103.0 AT4 Normal 30.9 37.2 26.7 3 137.0 AT4 Normal 36.3 36.8 31.1 4 170.0 AT4 Normal 35.3 34.2 29.6 ... ... ... ... ... ... ... 5116724 265.0 AT4 Normal 27.9 43.5 32.7 5116725 297.0 AT4 Normal 30.7 58.8 38.4 5116726 330.0 AT4 Normal 25.3 52.2 40.6 5116727 363.0 AT4 Normal 19.8 52.8 43.3 5116728 395.0 AT4 Normal 14.7 53.6 43.6 lactose_24h scc_24h urea_24h AR_PESEE_PESCODEALPAGE \ 0 49.8 21.0 2.4 0.0 1 50.3 80.0 2.2 0.0 2 48.3 132.0 1.6 0.0 3 50.0 106.0 2.1 0.0 4 48.6 133.0 1.5 0.0 ... ... ... ... ... 5116724 44.1 1004.0 3.3 0.0 5116725 44.6 517.0 3.2 0.0 5116726 43.5 235.0 2.9 0.0 5116727 43.6 166.0 3.2 0.0 5116728 41.7 163.0 2.9 0.0 idani_anon idhrd_anon milk_yield_msrmt_type \ 0 CHE000000000561 CHE000000095710 2 1 CHE000000000561 CHE000000095710 3 2 CHE000000000561 CHE000000095710 2 3 CHE000000000561 CHE000000095710 3 4 CHE000000000561 CHE000000095710 2 ... ... ... ... 5116724 CHE000099999926 CHE000000031281 2 5116725 CHE000099999926 CHE000000031281 3 5116726 CHE000099999926 CHE000000031281 2 5116727 CHE000099999926 CHE000000031281 3 5116728 CHE000099999926 CHE000000031281 2 fat_protein_24h_ratio 0 1.465863 1 1.195205 2 1.393258 3 1.183280 4 1.155405 ... ... 5116724 1.330275 5116725 1.531250 5116726 1.285714 5116727 1.219400 5116728 1.229358 [5116729 rows x 18 columns] --- Reading in hm_ebv.csv --- Column: ekg; Removed missing values: 63.15% Column: scs; Removed missing values: 0.00% base label idani_anon ekg epr fkg fpr mkg per \ 0 HO20 A CHE000000000559 -23.0 -0.02 -31.0 -0.07 -636.0 93.0 1 HO20 CH CHE000000000561 18.0 -0.23 26.0 -0.24 1173.0 93.0 2 HO20 A CHE000000000620 5.0 0.11 -1.0 0.05 -147.0 99.0 3 HO20 CH CHE000000000781 0.0 -0.07 -2.0 -0.11 179.0 95.0 4 HO20 CH CHE000000001494 3.0 0.11 -4.0 0.04 -179.0 93.0 ... ... ... ... ... ... ... ... ... ... 271769 HO20 CH CHE000099998134 6.0 0.11 18.0 0.26 -82.0 89.0 271770 HO20 CH CHE000099998152 8.0 -0.03 3.0 -0.11 311.0 89.0 271771 HO20 CH CHE000099998376 8.0 0.16 46.0 0.64 -167.0 104.0 271772 HO20 CH CHE000099999361 10.0 -0.02 12.0 -0.03 365.0 103.0 271773 HO20 CH CHE000099999926 -1.0 -0.01 11.0 0.13 -3.0 94.0 scs 0 102.0 1 89.0 2 94.0 3 106.0 4 111.0 ... ... 271769 90.0 271770 110.0 271771 99.0 271772 111.0 271773 98.0 [271774 rows x 10 columns] --- Reading in hm_pregnancy.csv --- pregnancy_detection_date finding idani_anon idhrd_anon 0 2019-01-08 pregnant CHE000000000561 CHE000000095710 1 2020-03-11 open CHE000000000561 CHE000000095710 2 2018-01-09 open CHE000000000781 CHE000000072353 3 2018-03-13 pregnant CHE000000000781 CHE000000072353 4 2018-03-20 pregnant CHE000000001494 CHE000000052966 ... ... ... ... ... 143945 2018-11-27 pregnant CHE000099992995 CHE000000014814 143946 2020-01-17 open CHE000099993267 CHE000000035579 143947 2019-11-21 pregnant CHE000099998152 CHE000000064265 143948 2020-01-27 pregnant CHE000099998152 CHE000000064265 143949 2018-05-01 pregnant CHE000099999926 CHE000000031281 [143950 rows x 4 columns] --- Reading in hm_health.csv --- Column: idhrd_anon; Removed missing values: 0.07% hecode_ASR healthevent_date idani_anon idhrd_anon 0 10.7.1. 2018-11-19 CHE000000005877 CHE000000079291 1 2.1.1. 2018-12-20 CHE000000005877 CHE000000079291 2 10.4. 2019-09-06 CHE000000005877 CHE000000079291 3 3.5. 2020-03-14 CHE000000005877 CHE000000079291 4 6.1. 2014-06-02 CHE000000006772 CHE000000055108 ... ... ... ... ... 53730 2.1.2. 2019-06-15 CHE000099990201 CHE000000088759 53731 2.5. 2018-01-30 CHE000099990433 CHE000000059284 53732 2.1.1. 2020-03-18 CHE000099992995 CHE000000014814 53733 9.3.99. 2013-11-25 CHE000099998376 CHE000000086408 53734 10.2. 2013-12-19 CHE000099998376 CHE000000086408 [53735 rows x 4 columns]
fname = 'hm_milkrecording.csv'
columns_to_ignore = ['idspectrum_num',
'AR_PESEE_PESCODEMALADEANALYSE',
'AR_PESEE_PESCODEMALADEPESEE',
'milk_yield_morning_measured',
'milk_yield_evening_measured'
]
df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])
# Convert date columns to datetime objects & sort dataframe
df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])
# Add new column depending on how the milk_yield was measured
# "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['milk_yield_msrmt_type'] = new_col.copy()
# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)
# Remove columns to be ignored
df = remove_columns_to_be_ignored(df=df, columns_to_ignore=columns_to_ignore)
# Remove rows with emtpy values
df = df.reset_index(drop=True)
df = remove_rows_with_missing_values(df)
assert(0 == df.isnull().sum().sum())
# Save file to clean directory
save_csv(df=df, file_path=fpath_clean_data_dir+'hm_milkrecording_bloodvalues.csv')
Column: milking_time_morning; Removed missing values: 0.04% Column: milking_time_evening; Removed missing values: 0.01% Column: DIM; Removed missing values: 0.02% Column: lactose_24h; Removed missing values: 0.00% Column: scc_24h; Removed missing values: 0.00% Column: urea_24h; Removed missing values: 0.05% Column: bloodBHB; Removed missing values: 88.89% Column: CH4; Removed missing values: 5.46%