# Standard library
import os

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
# `display` and `HTML` come from the public IPython.display module; importing
# them via IPython.core.display is deprecated.
from IPython.display import HTML, display
%matplotlib inline
# Notebook Styling
# Show every DataFrame column and render floats with five decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda value: '%.5f' % value)
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()
# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Relative location of the raw data set: data/hr/Absenteeism_at_work.csv
CSV_PATH = os.path.join(os.path.join('data', 'hr'), 'Absenteeism_at_work.csv')
# The file is semicolon-delimited and read with Latin-1 decoding.
absenteeism = pd.read_csv(CSV_PATH, sep=';', encoding='latin1')
# Preview the first five rows.
absenteeism.head(5)
| | ID | Reason for absence | Month of absence | Day of the week | Seasons | Transportation expense | Distance from Residence to Work | Service time | Age | Work load Average/day | Hit target | Disciplinary failure | Education | Son | Social drinker | Social smoker | Pet | Weight | Height | Body mass index | Absenteeism time in hours |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11 | 26 | 7 | 3 | 1 | 289 | 36 | 13 | 33 | 239.55400 | 97 | 0 | 1 | 2 | 1 | 0 | 1 | 90 | 172 | 30 | 4 |
1 | 36 | 0 | 7 | 3 | 1 | 118 | 13 | 18 | 50 | 239.55400 | 97 | 1 | 1 | 1 | 1 | 0 | 0 | 98 | 178 | 31 | 0 |
2 | 3 | 23 | 7 | 4 | 1 | 179 | 51 | 18 | 38 | 239.55400 | 97 | 0 | 1 | 0 | 1 | 0 | 0 | 89 | 170 | 31 | 2 |
3 | 7 | 7 | 7 | 5 | 1 | 279 | 5 | 14 | 39 | 239.55400 | 97 | 0 | 1 | 2 | 1 | 1 | 0 | 68 | 168 | 24 | 4 |
4 | 11 | 23 | 7 | 5 | 1 | 289 | 36 | 13 | 33 | 239.55400 | 97 | 0 | 1 | 2 | 1 | 0 | 1 | 90 | 172 | 30 | 2 |
# Dimensions of the loaded frame as a (rows, columns) tuple.
absenteeism.shape
(740, 21)
# Print each column's dtype and non-null count, plus the frame's memory usage.
absenteeism.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 740 entries, 0 to 739 Data columns (total 21 columns): ID 740 non-null int64 Reason for absence 740 non-null int64 Month of absence 740 non-null int64 Day of the week 740 non-null int64 Seasons 740 non-null int64 Transportation expense 740 non-null int64 Distance from Residence to Work 740 non-null int64 Service time 740 non-null int64 Age 740 non-null int64 Work load Average/day 740 non-null float64 Hit target 740 non-null int64 Disciplinary failure 740 non-null int64 Education 740 non-null int64 Son 740 non-null int64 Social drinker 740 non-null int64 Social smoker 740 non-null int64 Pet 740 non-null int64 Weight 740 non-null int64 Height 740 non-null int64 Body mass index 740 non-null int64 Absenteeism time in hours 740 non-null int64 dtypes: float64(1), int64(20) memory usage: 121.5 KB
The `Reason for absence` feature is an integer code with 28 possible answers, so it is converted to a categorical type:
# Store the absence-reason codes as a pandas categorical instead of raw integers.
absenteeism['Reason for absence'] = pd.Categorical(absenteeism['Reason for absence'])