import pandas as pd
import numpy as np
import missingno as msno
import datetime as dt
from stringcase import snakecase
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib.style as style
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#with np.printoptions(threshold=np.inf):
# create a new class which makes it possible to format text in colour or bold etc.
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
In this guided project, we'll work with exit surveys from employees of the Department of Education, Training and Employment (DETE) and the Technical and Further Education (TAFE) institute in Queensland, Australia. You can find the DETE exit survey data here. The original TAFE exit survey data is no longer available. We've made some slight modifications to the original datasets to make them easier to work with, including changing the encoding to UTF-8 (the original ones are encoded using cp1252.)
In this project, we'll play the role of data analyst and pretend our stakeholders want to know the following:
First, we'll read in the datasets and do some initial exporation
#read in both surveys to seperate datasets.
dete_survey = pd.read_csv('dete_survey.csv', na_values="Not Stated")
tafe_survey = pd.read_csv('tafe_survey.csv')
Lets starts xploring the DETE dataset first.
col_index_number = [*range(dete_survey.shape[1])]
with pd.option_context('display.max_columns', None,'display.max_rows', None):
display(pd.DataFrame({'Column #': col_index_number,
'Data ype': dete_survey.dtypes,
'NotNull': dete_survey.notnull().sum()}
))
display(dete_survey.head(5))
Column # | Data ype | NotNull | |
---|---|---|---|
ID | 0 | int64 | 822 |
SeparationType | 1 | object | 822 |
Cease Date | 2 | object | 788 |
DETE Start Date | 3 | float64 | 749 |
Role Start Date | 4 | float64 | 724 |
Position | 5 | object | 817 |
Classification | 6 | object | 455 |
Region | 7 | object | 717 |
Business Unit | 8 | object | 126 |
Employment Status | 9 | object | 817 |
Career move to public sector | 10 | bool | 822 |
Career move to private sector | 11 | bool | 822 |
Interpersonal conflicts | 12 | bool | 822 |
Job dissatisfaction | 13 | bool | 822 |
Dissatisfaction with the department | 14 | bool | 822 |
Physical work environment | 15 | bool | 822 |
Lack of recognition | 16 | bool | 822 |
Lack of job security | 17 | bool | 822 |
Work location | 18 | bool | 822 |
Employment conditions | 19 | bool | 822 |
Maternity/family | 20 | bool | 822 |
Relocation | 21 | bool | 822 |
Study/Travel | 22 | bool | 822 |
Ill Health | 23 | bool | 822 |
Traumatic incident | 24 | bool | 822 |
Work life balance | 25 | bool | 822 |
Workload | 26 | bool | 822 |
None of the above | 27 | bool | 822 |
Professional Development | 28 | object | 808 |
Opportunities for promotion | 29 | object | 735 |
Staff morale | 30 | object | 816 |
Workplace issue | 31 | object | 788 |
Physical environment | 32 | object | 817 |
Worklife balance | 33 | object | 815 |
Stress and pressure support | 34 | object | 810 |
Performance of supervisor | 35 | object | 813 |
Peer support | 36 | object | 812 |
Initiative | 37 | object | 813 |
Skills | 38 | object | 811 |
Coach | 39 | object | 767 |
Career Aspirations | 40 | object | 746 |
Feedback | 41 | object | 792 |
Further PD | 42 | object | 768 |
Communication | 43 | object | 814 |
My say | 44 | object | 812 |
Information | 45 | object | 816 |
Kept informed | 46 | object | 813 |
Wellness programs | 47 | object | 766 |
Health & Safety | 48 | object | 793 |
Gender | 49 | object | 798 |
Age | 50 | object | 811 |
Aboriginal | 51 | object | 16 |
Torres Strait | 52 | object | 3 |
South Sea | 53 | object | 7 |
Disability | 54 | object | 23 |
NESB | 55 | object | 32 |
ID | SeparationType | Cease Date | DETE Start Date | Role Start Date | Position | Classification | Region | Business Unit | Employment Status | Career move to public sector | Career move to private sector | Interpersonal conflicts | Job dissatisfaction | Dissatisfaction with the department | Physical work environment | Lack of recognition | Lack of job security | Work location | Employment conditions | Maternity/family | Relocation | Study/Travel | Ill Health | Traumatic incident | Work life balance | Workload | None of the above | Professional Development | Opportunities for promotion | Staff morale | Workplace issue | Physical environment | Worklife balance | Stress and pressure support | Performance of supervisor | Peer support | Initiative | Skills | Coach | Career Aspirations | Feedback | Further PD | Communication | My say | Information | Kept informed | Wellness programs | Health & Safety | Gender | Age | Aboriginal | Torres Strait | South Sea | Disability | NESB | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Ill Health Retirement | 08/2012 | 1984.0 | 2004.0 | Public Servant | A01-A04 | Central Office | Corporate Strategy and Peformance | Permanent Full-time | True | False | False | True | False | False | True | False | False | False | False | False | False | False | False | False | False | True | A | A | N | N | N | A | A | A | A | N | N | N | A | A | A | N | A | A | N | N | N | Male | 56-60 | NaN | NaN | NaN | NaN | Yes |
1 | 2 | Voluntary Early Retirement (VER) | 08/2012 | NaN | NaN | Public Servant | AO5-AO7 | Central Office | Corporate Strategy and Peformance | Permanent Full-time | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | A | A | N | N | N | N | A | A | A | N | N | N | A | A | A | N | A | A | N | N | N | Male | 56-60 | NaN | NaN | NaN | NaN | NaN |
2 | 3 | Voluntary Early Retirement (VER) | 05/2012 | 2011.0 | 2011.0 | Schools Officer | NaN | Central Office | Education Queensland | Permanent Full-time | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | N | N | N | N | N | N | N | N | N | N | N | N | N | N | N | A | A | N | N | N | N | Male | 61 or older | NaN | NaN | NaN | NaN | NaN |
3 | 4 | Resignation-Other reasons | 05/2012 | 2005.0 | 2006.0 | Teacher | Primary | Central Queensland | NaN | Permanent Full-time | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | A | N | N | N | A | A | N | N | A | A | A | A | A | A | A | A | A | A | A | N | A | Female | 36-40 | NaN | NaN | NaN | NaN | NaN |
4 | 5 | Age Retirement | 05/2012 | 1970.0 | 1989.0 | Head of Curriculum/Head of Special Education | NaN | South East | NaN | Permanent Full-time | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | A | A | N | N | D | D | N | A | A | A | A | A | A | SA | SA | D | D | A | N | A | M | Female | 61 or older | NaN | NaN | NaN | NaN | NaN |
The dataset contains a lot of columns (56). Most of which are not relevent for answering the two research questions. In the next step the dataset will be sliced in order to cut out these irrelevant columns.
#drop columns that are not relevant to the analysis
print(color.BOLD + "Columns to remove are:" + color.END)
print(dete_survey.iloc[:,28:49].columns)
print('\n')
dete_survey_updated = dete_survey.drop(dete_survey.iloc[:,28:49].columns,axis=1)
with pd.option_context('display.max_columns', None):
print(color.BOLD + "Slice of remaining dataset:" + color.END)
display(dete_survey_updated.head(5))
Columns to remove are: Index(['Professional Development', 'Opportunities for promotion', 'Staff morale', 'Workplace issue', 'Physical environment', 'Worklife balance', 'Stress and pressure support', 'Performance of supervisor', 'Peer support', 'Initiative', 'Skills', 'Coach', 'Career Aspirations', 'Feedback', 'Further PD', 'Communication', 'My say', 'Information', 'Kept informed', 'Wellness programs', 'Health & Safety'], dtype='object') Slice of remaining dataset:
ID | SeparationType | Cease Date | DETE Start Date | Role Start Date | Position | Classification | Region | Business Unit | Employment Status | Career move to public sector | Career move to private sector | Interpersonal conflicts | Job dissatisfaction | Dissatisfaction with the department | Physical work environment | Lack of recognition | Lack of job security | Work location | Employment conditions | Maternity/family | Relocation | Study/Travel | Ill Health | Traumatic incident | Work life balance | Workload | None of the above | Gender | Age | Aboriginal | Torres Strait | South Sea | Disability | NESB | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Ill Health Retirement | 08/2012 | 1984.0 | 2004.0 | Public Servant | A01-A04 | Central Office | Corporate Strategy and Peformance | Permanent Full-time | True | False | False | True | False | False | True | False | False | False | False | False | False | False | False | False | False | True | Male | 56-60 | NaN | NaN | NaN | NaN | Yes |
1 | 2 | Voluntary Early Retirement (VER) | 08/2012 | NaN | NaN | Public Servant | AO5-AO7 | Central Office | Corporate Strategy and Peformance | Permanent Full-time | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | Male | 56-60 | NaN | NaN | NaN | NaN | NaN |
2 | 3 | Voluntary Early Retirement (VER) | 05/2012 | 2011.0 | 2011.0 | Schools Officer | NaN | Central Office | Education Queensland | Permanent Full-time | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | Male | 61 or older | NaN | NaN | NaN | NaN | NaN |
3 | 4 | Resignation-Other reasons | 05/2012 | 2005.0 | 2006.0 | Teacher | Primary | Central Queensland | NaN | Permanent Full-time | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | Female | 36-40 | NaN | NaN | NaN | NaN | NaN |
4 | 5 | Age Retirement | 05/2012 | 1970.0 | 1989.0 | Head of Curriculum/Head of Special Education | NaN | South East | NaN | Permanent Full-time | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | Female | 61 or older | NaN | NaN | NaN | NaN | NaN |
Lets check for missing data in the remaining columns. In order to do that I've build a function that sums all NaN values and states the % of missing values for each column. The outcome will be in a dataframe and only columns with a missing percentage higher than 50 will be showed.
def missing_col(data):
#Check missing values for all columns and sort them descending in a dataframe
#perc missing per column
perc_missing = round(data.isnull().sum()
/ len(data) * 100, 1)
#check how many values are not null in each column
total_notnull = data.notnull().sum()
#create a df of both series, filter to show only columns missing more than 50% and sort values
df = pd.DataFrame({'Total NotNull':total_notnull,'% Missing':perc_missing})
return display(df[df['% Missing']>50].sort_values(by='% Missing', ascending=False))
missing_col(dete_survey_updated)
Total NotNull | % Missing | |
---|---|---|
Torres Strait | 3 | 99.6 |
South Sea | 7 | 99.1 |
Aboriginal | 16 | 98.1 |
Disability | 23 | 97.2 |
NESB | 32 | 96.1 |
Business Unit | 126 | 84.7 |
As can be seen there are six columns with a very high percentage of missing values. At the same time the importance of these columns is unclear for the analysis. Lets remove these columns in the next step.
#drop all columns with more than 80% missing, thus all columns that have 126 values or less.
dete_survey_updated = dete_survey_updated.dropna(thresh=127,axis=1).copy()
Now that the first exploration of the DETE dataset has finished I'll start the exploration of the TAFE dataset.
col_index_number = [*range(tafe_survey.shape[1])]
with pd.option_context('display.max_columns', None,'display.max_rows', None):
display(pd.DataFrame({'Column #': col_index_number,
'Data Type': tafe_survey.dtypes,
'NotNull': tafe_survey.notnull().sum()}))
display(tafe_survey.head(5))
Column # | Data Type | NotNull | |
---|---|---|---|
Record ID | 0 | float64 | 702 |
Institute | 1 | object | 702 |
WorkArea | 2 | object | 702 |
CESSATION YEAR | 3 | float64 | 695 |
Reason for ceasing employment | 4 | object | 701 |
Contributing Factors. Career Move - Public Sector | 5 | object | 437 |
Contributing Factors. Career Move - Private Sector | 6 | object | 437 |
Contributing Factors. Career Move - Self-employment | 7 | object | 437 |
Contributing Factors. Ill Health | 8 | object | 437 |
Contributing Factors. Maternity/Family | 9 | object | 437 |
Contributing Factors. Dissatisfaction | 10 | object | 437 |
Contributing Factors. Job Dissatisfaction | 11 | object | 437 |
Contributing Factors. Interpersonal Conflict | 12 | object | 437 |
Contributing Factors. Study | 13 | object | 437 |
Contributing Factors. Travel | 14 | object | 437 |
Contributing Factors. Other | 15 | object | 437 |
Contributing Factors. NONE | 16 | object | 437 |
Main Factor. Which of these was the main factor for leaving? | 17 | object | 113 |
InstituteViews. Topic:1. I feel the senior leadership had a clear vision and direction | 18 | object | 608 |
InstituteViews. Topic:2. I was given access to skills training to help me do my job better | 19 | object | 613 |
InstituteViews. Topic:3. I was given adequate opportunities for personal development | 20 | object | 610 |
InstituteViews. Topic:4. I was given adequate opportunities for promotion within %Institute]Q25LBL% | 21 | object | 608 |
InstituteViews. Topic:5. I felt the salary for the job was right for the responsibilities I had | 22 | object | 615 |
InstituteViews. Topic:6. The organisation recognised when staff did good work | 23 | object | 607 |
InstituteViews. Topic:7. Management was generally supportive of me | 24 | object | 614 |
InstituteViews. Topic:8. Management was generally supportive of my team | 25 | object | 608 |
InstituteViews. Topic:9. I was kept informed of the changes in the organisation which would affect me | 26 | object | 610 |
InstituteViews. Topic:10. Staff morale was positive within the Institute | 27 | object | 602 |
InstituteViews. Topic:11. If I had a workplace issue it was dealt with quickly | 28 | object | 601 |
InstituteViews. Topic:12. If I had a workplace issue it was dealt with efficiently | 29 | object | 597 |
InstituteViews. Topic:13. If I had a workplace issue it was dealt with discreetly | 30 | object | 601 |
WorkUnitViews. Topic:14. I was satisfied with the quality of the management and supervision within my work unit | 31 | object | 609 |
WorkUnitViews. Topic:15. I worked well with my colleagues | 32 | object | 605 |
WorkUnitViews. Topic:16. My job was challenging and interesting | 33 | object | 607 |
WorkUnitViews. Topic:17. I was encouraged to use my initiative in the course of my work | 34 | object | 610 |
WorkUnitViews. Topic:18. I had sufficient contact with other people in my job | 35 | object | 613 |
WorkUnitViews. Topic:19. I was given adequate support and co-operation by my peers to enable me to do my job | 36 | object | 609 |
WorkUnitViews. Topic:20. I was able to use the full range of my skills in my job | 37 | object | 609 |
WorkUnitViews. Topic:21. I was able to use the full range of my abilities in my job. ; Category:Level of Agreement; Question:YOUR VIEWS ABOUT YOUR WORK UNIT] | 38 | object | 608 |
WorkUnitViews. Topic:22. I was able to use the full range of my knowledge in my job | 39 | object | 608 |
WorkUnitViews. Topic:23. My job provided sufficient variety | 40 | object | 611 |
WorkUnitViews. Topic:24. I was able to cope with the level of stress and pressure in my job | 41 | object | 610 |
WorkUnitViews. Topic:25. My job allowed me to balance the demands of work and family to my satisfaction | 42 | object | 611 |
WorkUnitViews. Topic:26. My supervisor gave me adequate personal recognition and feedback on my performance | 43 | object | 606 |
WorkUnitViews. Topic:27. My working environment was satisfactory e.g. sufficient space, good lighting, suitable seating and working area | 44 | object | 610 |
WorkUnitViews. Topic:28. I was given the opportunity to mentor and coach others in order for me to pass on my skills and knowledge prior to my cessation date | 45 | object | 609 |
WorkUnitViews. Topic:29. There was adequate communication between staff in my unit | 46 | object | 603 |
WorkUnitViews. Topic:30. Staff morale was positive within my work unit | 47 | object | 606 |
Induction. Did you undertake Workplace Induction? | 48 | object | 619 |
InductionInfo. Topic:Did you undertake a Corporate Induction? | 49 | object | 432 |
InductionInfo. Topic:Did you undertake a Institute Induction? | 50 | object | 483 |
InductionInfo. Topic: Did you undertake Team Induction? | 51 | object | 440 |
InductionInfo. Face to Face Topic:Did you undertake a Corporate Induction; Category:How it was conducted? | 52 | object | 555 |
InductionInfo. On-line Topic:Did you undertake a Corporate Induction; Category:How it was conducted? | 53 | object | 555 |
InductionInfo. Induction Manual Topic:Did you undertake a Corporate Induction? | 54 | object | 555 |
InductionInfo. Face to Face Topic:Did you undertake a Institute Induction? | 55 | object | 530 |
InductionInfo. On-line Topic:Did you undertake a Institute Induction? | 56 | object | 555 |
InductionInfo. Induction Manual Topic:Did you undertake a Institute Induction? | 57 | object | 553 |
InductionInfo. Face to Face Topic: Did you undertake Team Induction; Category? | 58 | object | 555 |
InductionInfo. On-line Topic: Did you undertake Team Induction?process you undertook and how it was conducted.] | 59 | object | 555 |
InductionInfo. Induction Manual Topic: Did you undertake Team Induction? | 60 | object | 555 |
Workplace. Topic:Did you and your Manager develop a Performance and Professional Development Plan (PPDP)? | 61 | object | 608 |
Workplace. Topic:Does your workplace promote a work culture free from all forms of unlawful discrimination? | 62 | object | 594 |
Workplace. Topic:Does your workplace promote and practice the principles of employment equity? | 63 | object | 587 |
Workplace. Topic:Does your workplace value the diversity of its employees? | 64 | object | 586 |
Workplace. Topic:Would you recommend the Institute as an employer to others? | 65 | object | 581 |
Gender. What is your Gender? | 66 | object | 596 |
CurrentAge. Current Age | 67 | object | 596 |
Employment Type. Employment Type | 68 | object | 596 |
Classification. Classification | 69 | object | 596 |
LengthofServiceOverall. Overall Length of Service at Institute (in years) | 70 | object | 596 |
LengthofServiceCurrent. Length of Service at current workplace (in years) | 71 | object | 596 |
Record ID | Institute | WorkArea | CESSATION YEAR | Reason for ceasing employment | Contributing Factors. Career Move - Public Sector | Contributing Factors. Career Move - Private Sector | Contributing Factors. Career Move - Self-employment | Contributing Factors. Ill Health | Contributing Factors. Maternity/Family | Contributing Factors. Dissatisfaction | Contributing Factors. Job Dissatisfaction | Contributing Factors. Interpersonal Conflict | Contributing Factors. Study | Contributing Factors. Travel | Contributing Factors. Other | Contributing Factors. NONE | Main Factor. Which of these was the main factor for leaving? | InstituteViews. Topic:1. I feel the senior leadership had a clear vision and direction | InstituteViews. Topic:2. I was given access to skills training to help me do my job better | InstituteViews. Topic:3. I was given adequate opportunities for personal development | InstituteViews. Topic:4. I was given adequate opportunities for promotion within %Institute]Q25LBL% | InstituteViews. Topic:5. I felt the salary for the job was right for the responsibilities I had | InstituteViews. Topic:6. The organisation recognised when staff did good work | InstituteViews. Topic:7. Management was generally supportive of me | InstituteViews. Topic:8. Management was generally supportive of my team | InstituteViews. Topic:9. I was kept informed of the changes in the organisation which would affect me | InstituteViews. Topic:10. Staff morale was positive within the Institute | InstituteViews. Topic:11. If I had a workplace issue it was dealt with quickly | InstituteViews. Topic:12. If I had a workplace issue it was dealt with efficiently | InstituteViews. Topic:13. If I had a workplace issue it was dealt with discreetly | WorkUnitViews. Topic:14. I was satisfied with the quality of the management and supervision within my work unit | WorkUnitViews. Topic:15. I worked well with my colleagues | WorkUnitViews. Topic:16. My job was challenging and interesting | WorkUnitViews. Topic:17. I was encouraged to use my initiative in the course of my work | WorkUnitViews. Topic:18. I had sufficient contact with other people in my job | WorkUnitViews. Topic:19. I was given adequate support and co-operation by my peers to enable me to do my job | WorkUnitViews. Topic:20. I was able to use the full range of my skills in my job | WorkUnitViews. Topic:21. I was able to use the full range of my abilities in my job. ; Category:Level of Agreement; Question:YOUR VIEWS ABOUT YOUR WORK UNIT] | WorkUnitViews. Topic:22. I was able to use the full range of my knowledge in my job | WorkUnitViews. Topic:23. My job provided sufficient variety | WorkUnitViews. Topic:24. I was able to cope with the level of stress and pressure in my job | WorkUnitViews. Topic:25. My job allowed me to balance the demands of work and family to my satisfaction | WorkUnitViews. Topic:26. My supervisor gave me adequate personal recognition and feedback on my performance | WorkUnitViews. Topic:27. My working environment was satisfactory e.g. sufficient space, good lighting, suitable seating and working area | WorkUnitViews. Topic:28. I was given the opportunity to mentor and coach others in order for me to pass on my skills and knowledge prior to my cessation date | WorkUnitViews. Topic:29. There was adequate communication between staff in my unit | WorkUnitViews. Topic:30. Staff morale was positive within my work unit | Induction. Did you undertake Workplace Induction? | InductionInfo. Topic:Did you undertake a Corporate Induction? | InductionInfo. Topic:Did you undertake a Institute Induction? | InductionInfo. Topic: Did you undertake Team Induction? | InductionInfo. Face to Face Topic:Did you undertake a Corporate Induction; Category:How it was conducted? | InductionInfo. On-line Topic:Did you undertake a Corporate Induction; Category:How it was conducted? | InductionInfo. Induction Manual Topic:Did you undertake a Corporate Induction? | InductionInfo. Face to Face Topic:Did you undertake a Institute Induction? | InductionInfo. On-line Topic:Did you undertake a Institute Induction? | InductionInfo. Induction Manual Topic:Did you undertake a Institute Induction? | InductionInfo. Face to Face Topic: Did you undertake Team Induction; Category? | InductionInfo. On-line Topic: Did you undertake Team Induction?process you undertook and how it was conducted.] | InductionInfo. Induction Manual Topic: Did you undertake Team Induction? | Workplace. Topic:Did you and your Manager develop a Performance and Professional Development Plan (PPDP)? | Workplace. Topic:Does your workplace promote a work culture free from all forms of unlawful discrimination? | Workplace. Topic:Does your workplace promote and practice the principles of employment equity? | Workplace. Topic:Does your workplace value the diversity of its employees? | Workplace. Topic:Would you recommend the Institute as an employer to others? | Gender. What is your Gender? | CurrentAge. Current Age | Employment Type. Employment Type | Classification. Classification | LengthofServiceOverall. Overall Length of Service at Institute (in years) | LengthofServiceCurrent. Length of Service at current workplace (in years) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6.341330e+17 | Southern Queensland Institute of TAFE | Non-Delivery (corporate) | 2010.0 | Contract Expired | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Agree | Agree | Agree | Neutral | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Strongly Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Neutral | Agree | Agree | Yes | Yes | Yes | Yes | Face to Face | - | - | Face to Face | - | - | Face to Face | - | - | Yes | Yes | Yes | Yes | Yes | Female | 26 30 | Temporary Full-time | Administration (AO) | 1-2 | 1-2 |
1 | 6.341337e+17 | Mount Isa Institute of TAFE | Non-Delivery (corporate) | 2010.0 | Retirement | - | - | - | - | - | - | - | - | - | Travel | - | - | NaN | Agree | Agree | Agree | Agree | Agree | Strongly Agree | Strongly Agree | Agree | Strongly Agree | Agree | Agree | Agree | Disagree | Strongly Agree | Strongly Agree | Strongly Agree | Agree | Agree | Agree | Strongly Agree | Agree | Agree | Agree | Strongly Agree | Agree | Strongly Agree | Strongly Agree | Agree | Agree | Strongly Agree | No | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Yes | Yes | Yes | Yes | Yes | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 6.341388e+17 | Mount Isa Institute of TAFE | Delivery (teaching) | 2010.0 | Retirement | - | - | - | - | - | - | - | - | - | - | - | NONE | NaN | Agree | Agree | Agree | Agree | Agree | Agree | Strongly Agree | Agree | Agree | Agree | Agree | Neutral | Neutral | Strongly Agree | Strongly Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | No | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Yes | Yes | Yes | Yes | Yes | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 6.341399e+17 | Mount Isa Institute of TAFE | Non-Delivery (corporate) | 2010.0 | Resignation | - | - | - | - | - | - | - | - | - | Travel | - | - | NaN | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Yes | No | Yes | Yes | - | - | - | NaN | - | - | - | - | - | Yes | Yes | Yes | Yes | Yes | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 6.341466e+17 | Southern Queensland Institute of TAFE | Delivery (teaching) | 2010.0 | Resignation | - | Career Move - Private Sector | - | - | - | - | - | - | - | - | - | - | NaN | Agree | Agree | Strongly Agree | Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Agree | Strongly Agree | Strongly Agree | Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Yes | Yes | Yes | Yes | - | - | Induction Manual | Face to Face | - | - | Face to Face | - | - | Yes | Yes | Yes | Yes | Yes | Male | 41 45 | Permanent Full-time | Teacher (including LVT) | 3-4 | 3-4 |
The dataset contains a lot of columns (72). Most of which are not relevent for answering the two research questions. In the next step the dataset will be sliced in order to cut out these irrelevant columns.
#drop columns that are not relevant to the analysis
tafe_survey_updated = tafe_survey.drop(tafe_survey.iloc[:,17:66].columns, axis=1)
display(tafe_survey_updated.head(5))
Record ID | Institute | WorkArea | CESSATION YEAR | Reason for ceasing employment | Contributing Factors. Career Move - Public Sector | Contributing Factors. Career Move - Private Sector | Contributing Factors. Career Move - Self-employment | Contributing Factors. Ill Health | Contributing Factors. Maternity/Family | ... | Contributing Factors. Study | Contributing Factors. Travel | Contributing Factors. Other | Contributing Factors. NONE | Gender. What is your Gender? | CurrentAge. Current Age | Employment Type. Employment Type | Classification. Classification | LengthofServiceOverall. Overall Length of Service at Institute (in years) | LengthofServiceCurrent. Length of Service at current workplace (in years) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6.341330e+17 | Southern Queensland Institute of TAFE | Non-Delivery (corporate) | 2010.0 | Contract Expired | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | Female | 26 30 | Temporary Full-time | Administration (AO) | 1-2 | 1-2 |
1 | 6.341337e+17 | Mount Isa Institute of TAFE | Non-Delivery (corporate) | 2010.0 | Retirement | - | - | - | - | - | ... | - | Travel | - | - | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 6.341388e+17 | Mount Isa Institute of TAFE | Delivery (teaching) | 2010.0 | Retirement | - | - | - | - | - | ... | - | - | - | NONE | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 6.341399e+17 | Mount Isa Institute of TAFE | Non-Delivery (corporate) | 2010.0 | Resignation | - | - | - | - | - | ... | - | Travel | - | - | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 6.341466e+17 | Southern Queensland Institute of TAFE | Delivery (teaching) | 2010.0 | Resignation | - | Career Move - Private Sector | - | - | - | ... | - | - | - | - | Male | 41 45 | Permanent Full-time | Teacher (including LVT) | 3-4 | 3-4 |
5 rows × 23 columns
In order to check for missing values the earlier created function will be used.
missing_col(tafe_survey_updated)
Total NotNull | % Missing |
---|
There are no columns that have more than 50% missing values in the remaining TAFE dataset. This concludes the first exploration of the TAFE dataset.
After exploring the data it is now time to do some cleaning. As both datasets need to be combined the column names will be adjusted. This makes it possible to merg te dataset. Furthermore, the amount of years in service for the DETE dataset needs to be calculated, as this is not yet present in the dataset. In order to do this the year in which the resignee left has to be known. This information can be retrieved from the cease_date column. Afterwards the datasets will be sliced so that only the data consists only of people who resigned (so no contract expirations / retirements etc). Lastly columns with missing values will be handled.
I'll once more start with the DETE dataset.
dete_survey_updated.columns
Index(['ID', 'SeparationType', 'Cease Date', 'DETE Start Date', 'Role Start Date', 'Position', 'Classification', 'Region', 'Employment Status', 'Career move to public sector', 'Career move to private sector', 'Interpersonal conflicts', 'Job dissatisfaction', 'Dissatisfaction with the department', 'Physical work environment', 'Lack of recognition', 'Lack of job security', 'Work location', 'Employment conditions', 'Maternity/family', 'Relocation', 'Study/Travel', 'Ill Health', 'Traumatic incident', 'Work life balance', 'Workload', 'None of the above', 'Gender', 'Age'], dtype='object')
#what are the columns names currently?
before_col = dete_survey_updated.columns
#adjusting the column names so there is more synergy between them style-wise.
dete_survey_updated.columns = (dete_survey_updated.columns
.str.lower()
.str.replace(' ','_')
.str.replace('\s+','',regex=True)
)
dete_survey_updated.rename(columns={'dete_start_date':'start_year'}, inplace=True)
#what are the column names after adjustment
after_col = dete_survey_updated.columns
print('\n')
print(color.BOLD + "DETE column names:")
display(pd.DataFrame({'Before':before_col,'After':after_col}))
DETE column names:
Before | After | |
---|---|---|
0 | ID | id |
1 | SeparationType | separationtype |
2 | Cease Date | cease_date |
3 | DETE Start Date | start_year |
4 | Role Start Date | role_start_date |
5 | Position | position |
6 | Classification | classification |
7 | Region | region |
8 | Employment Status | employment_status |
9 | Career move to public sector | career_move_to_public_sector |
10 | Career move to private sector | career_move_to_private_sector |
11 | Interpersonal conflicts | interpersonal_conflicts |
12 | Job dissatisfaction | job_dissatisfaction |
13 | Dissatisfaction with the department | dissatisfaction_with_the_department |
14 | Physical work environment | physical_work_environment |
15 | Lack of recognition | lack_of_recognition |
16 | Lack of job security | lack_of_job_security |
17 | Work location | work_location |
18 | Employment conditions | employment_conditions |
19 | Maternity/family | maternity/family |
20 | Relocation | relocation |
21 | Study/Travel | study/travel |
22 | Ill Health | ill_health |
23 | Traumatic incident | traumatic_incident |
24 | Work life balance | work_life_balance |
25 | Workload | workload |
26 | None of the above | none_of_the_above |
27 | Gender | gender |
28 | Age | age |
#retrieving the year from cease_date and storing it in a new column
pattern = r"(\d*)$" #take the last 4 digits from every cell
dete_survey_updated['cease_year'] = (dete_survey_updated['cease_date']
.str.extract(pattern)
.astype(float)
)
def explore(data,col):
vc_cease_year = data[col].value_counts(dropna=False).sort_index()
num_nan = data[col].isnull().sum()
print("number of NaN is: {}".format(num_nan))
print("earliest {} is: {}".format(col, vc_cease_year.index.min()))
print("latest {} is {}".format(col, vc_cease_year.index.max()))
explore(dete_survey_updated,'cease_year')
number of NaN is: 34 earliest cease_year is: 2006.0 latest cease_year is 2014.0
Both the earliest and last cease_year show no signs of errors, as 2006 and 2014 are both plausible.
explore(dete_survey_updated,'start_year')
number of NaN is: 73 earliest start_year is: 1963.0 latest start_year is 2013.0
Both the earliest and last start_date show no signs of errors as 1963 and 2013 are both plausible. Now that cease_year is known that years in service can be calculated.
dete_survey_updated['years_in_service'] = dete_survey_updated['cease_year'] - dete_survey_updated['start_year']
Now that DETE columns are renamed and years in service is calculated its time to do the same for the TAFE dataset.
#what are the current columns names
before_col = tafe_survey_updated.columns
#adjustment of column names
col_dict = {'Record ID': 'id',
'CESSATION YEAR': 'cease_year',
'CurrentAge. Current Age': 'age',
'Gender. What is your Gender?': 'gender',
'LengthofServiceOverall. Overall Length of Service at Institute (in years)':'years_in_service',
'LengthofServiceCurrent. Length of Service at current workplace (in years)':'years_in_role',
'Reason for ceasing employment':'separationtype',
'Classification. Classification':'position',
'Contributing Factors. Dissatisfaction':'dissatisfaction',
'Contributing Factors. Job Dissatisfaction':'job_dissatisfaction'
}
tafe_survey_updated.rename(columns = col_dict,inplace=True)
#what are the column names after adjustment
after_col = tafe_survey_updated.columns
print('\n')
print(color.BOLD + "TAFE column names:")
display(pd.DataFrame({'Before':before_col,'After':after_col}))
TAFE column names:
Before | After | |
---|---|---|
0 | Record ID | id |
1 | Institute | Institute |
2 | WorkArea | WorkArea |
3 | CESSATION YEAR | cease_year |
4 | Reason for ceasing employment | separationtype |
5 | Contributing Factors. Career Move - Public Sec... | Contributing Factors. Career Move - Public Sec... |
6 | Contributing Factors. Career Move - Private Se... | Contributing Factors. Career Move - Private Se... |
7 | Contributing Factors. Career Move - Self-emplo... | Contributing Factors. Career Move - Self-emplo... |
8 | Contributing Factors. Ill Health | Contributing Factors. Ill Health |
9 | Contributing Factors. Maternity/Family | Contributing Factors. Maternity/Family |
10 | Contributing Factors. Dissatisfaction | dissatisfaction |
11 | Contributing Factors. Job Dissatisfaction | job_dissatisfaction |
12 | Contributing Factors. Interpersonal Conflict | Contributing Factors. Interpersonal Conflict |
13 | Contributing Factors. Study | Contributing Factors. Study |
14 | Contributing Factors. Travel | Contributing Factors. Travel |
15 | Contributing Factors. Other | Contributing Factors. Other |
16 | Contributing Factors. NONE | Contributing Factors. NONE |
17 | Gender. What is your Gender? | gender |
18 | CurrentAge. Current Age | age |
19 | Employment Type. Employment Type | Employment Type. Employment Type |
20 | Classification. Classification | position |
21 | LengthofServiceOverall. Overall Length of Serv... | years_in_service |
22 | LengthofServiceCurrent. Length of Service at c... | years_in_role |
explore(tafe_survey_updated,'cease_year')
number of NaN is: 7 earliest cease_year is: 2009.0 latest cease_year is 2013.0
In the TAFE dataset the cease_year column has no strange values. Both 2009 and 2013 are plausible. The cease years are distributed a little more narrowly than those of the DETE dataset, but that has no impact on the analysis I'm performing.
Before merging the datasets the datasets are sliced once more. The research question is about resignees, so the column separationtype will be filtered in both datasets on all types of resignations.
#slice df further on separation type, as we're only interested in 'resignations'
pattern = r"Resignation-\w*"
template = "Shape of " + color.BOLD + "{}" + color.END + " dataframe is " + color.BOLD + "{}" + color.END
dete_resignations = (dete_survey_updated.loc[dete_survey_updated['separationtype'].str.contains(pattern),].copy())
dete_resignations.reset_index(drop=True,inplace=True)
print(template.format("DETE", dete_resignations.shape))
tafe_resignations = tafe_survey_updated.loc[tafe_survey_updated['separationtype']=="Resignation",].copy()
tafe_resignations.reset_index(drop=True,inplace=True)
print(template.format("TAFE", tafe_resignations.shape))
Shape of DETE dataframe is (311, 31) Shape of TAFE dataframe is (340, 23)
Both research questions are about the variable 'dissatisfaction'. Currently there are multiple columns in the dataset DETE that describe some kind of dissatisfaction. In order to determine whether a resignee was indeed dissatisfied a new column will be created. This column 'dissatisfied' will be true if the respondent has answered true in one of the following columns:
This column will now be created for the DETE dataset.
dete_resignations['dissatisfied'] = dete_resignations[
['job_dissatisfaction',
'dissatisfaction_with_the_department',
'physical_work_environment',
'lack_of_recognition',
'lack_of_job_security',
'work_location',
'employment_conditions',
'work_life_balance',
'workload']
].any(axis=1, skipna=False).copy()
dete_resignations['dissatisfied'].value_counts(dropna=False)
False 162 True 149 Name: dissatisfied, dtype: int64
Out of 311 respondents that resigned, 149 have indicated that they were dissatisfied about something.
In the TAFE dataset there are two columns indicating some kind of dissatisfaction:
print(tafe_resignations['dissatisfaction'].value_counts(dropna=False))
print('\n')
print(tafe_resignations['job_dissatisfaction'].value_counts(dropna=False))
- 277 Contributing Factors. Dissatisfaction 55 NaN 8 Name: dissatisfaction, dtype: int64 - 270 Job Dissatisfaction 62 NaN 8 Name: job_dissatisfaction, dtype: int64
As can be seen above the contents of both columns are not booleans yet. In order to determine whether the respondent was dissatisfied the columns need to be cleaned. The rules will be as follows:
def boolean_cleanup(value):
if pd.isnull(value):
value == np.nan
elif value == '-':
value = False
else:
value = True
return value
columns = ['dissatisfaction','job_dissatisfaction']
tafe_resignations[columns] = tafe_resignations[columns].applymap(boolean_cleanup).copy()
print(tafe_resignations[columns].apply(lambda x: x.value_counts(dropna=False)))
dissatisfaction job_dissatisfaction False 277 270 True 55 62 NaN 8 8
tafe_resignations['dissatisfied'] = tafe_resignations[columns].any(axis=1, skipna=False).copy()
tafe_resignations['dissatisfied']= tafe_resignations['dissatisfied'].fillna(False)
tafe_resignations['dissatisfied'].value_counts(dropna=False)
False 249 True 91 Name: dissatisfied, dtype: int64
Out of 340 respondents that resigned, 91 have indicated that they were dissatisfied about something. Eight values were missing and are replaced with False as this is the value that occurs the most frequently.
The datasets can almost be combined. Before combining a new column will be made in both datasets so it remains clear to which odataset the data originally belongs after the merge.
dete_resignations['institute'] = 'DETE'
tafe_resignations['institute'] = 'TAFE'
combined = pd.concat([dete_resignations,tafe_resignations],ignore_index=True)
The data has now been merged. Lets check how many values are missing for each column.
print('\n')
print("Shape of combined is {}".format(combined.shape))
isnull = (combined.isnull().sum().sort_values())
notnull = (combined.notnull().sum().sort_values(ascending=False))
perc_missing = isnull/(isnull+notnull)
display(pd.DataFrame({
"NotNull":notnull,
'IsNull':isnull,
"% missing":perc_missing
}
).sort_values(by='% missing', ascending=False))
Shape of combined is (651, 48)
NotNull | IsNull | % missing | |
---|---|---|---|
classification | 161 | 490 | 0.752688 |
region | 265 | 386 | 0.592934 |
role_start_date | 271 | 380 | 0.583717 |
start_year | 283 | 368 | 0.565284 |
Employment Type. Employment Type | 290 | 361 | 0.554531 |
years_in_role | 290 | 361 | 0.554531 |
cease_date | 300 | 351 | 0.539171 |
employment_status | 307 | 344 | 0.528418 |
relocation | 311 | 340 | 0.522273 |
career_move_to_public_sector | 311 | 340 | 0.522273 |
employment_conditions | 311 | 340 | 0.522273 |
work_location | 311 | 340 | 0.522273 |
dissatisfaction_with_the_department | 311 | 340 | 0.522273 |
workload | 311 | 340 | 0.522273 |
traumatic_incident | 311 | 340 | 0.522273 |
career_move_to_private_sector | 311 | 340 | 0.522273 |
work_life_balance | 311 | 340 | 0.522273 |
interpersonal_conflicts | 311 | 340 | 0.522273 |
study/travel | 311 | 340 | 0.522273 |
lack_of_job_security | 311 | 340 | 0.522273 |
lack_of_recognition | 311 | 340 | 0.522273 |
maternity/family | 311 | 340 | 0.522273 |
none_of_the_above | 311 | 340 | 0.522273 |
physical_work_environment | 311 | 340 | 0.522273 |
ill_health | 311 | 340 | 0.522273 |
Contributing Factors. Career Move - Private Sector | 332 | 319 | 0.490015 |
Contributing Factors. Career Move - Public Sector | 332 | 319 | 0.490015 |
Contributing Factors. Travel | 332 | 319 | 0.490015 |
Contributing Factors. Study | 332 | 319 | 0.490015 |
Contributing Factors. Other | 332 | 319 | 0.490015 |
Contributing Factors. NONE | 332 | 319 | 0.490015 |
Contributing Factors. Maternity/Family | 332 | 319 | 0.490015 |
Contributing Factors. Interpersonal Conflict | 332 | 319 | 0.490015 |
Contributing Factors. Ill Health | 332 | 319 | 0.490015 |
Contributing Factors. Career Move - Self-employment | 332 | 319 | 0.490015 |
dissatisfaction | 332 | 319 | 0.490015 |
WorkArea | 340 | 311 | 0.477727 |
Institute | 340 | 311 | 0.477727 |
years_in_service | 563 | 88 | 0.135177 |
gender | 592 | 59 | 0.090630 |
age | 596 | 55 | 0.084485 |
position | 598 | 53 | 0.081413 |
cease_year | 635 | 16 | 0.024578 |
job_dissatisfaction | 643 | 8 | 0.012289 |
separationtype | 651 | 0 | 0.000000 |
institute | 651 | 0 | 0.000000 |
id | 651 | 0 | 0.000000 |
dissatisfied | 651 | 0 | 0.000000 |
There are a lot of columns that consist of a high percentage of missing values. For the analysis I'll continue with the columns that have at least 500 values.
combined_up = combined.dropna(thresh=500,axis=1).copy()
The analysis is now in the stage that the first research question can be answered:
Let see how many unique values of years_in_service there are.
combined_up['years_in_service'].value_counts(dropna=False)
NaN 88 Less than 1 year 73 1-2 64 3-4 63 5-6 33 11-20 26 5.0 23 1.0 22 7-10 21 0.0 20 3.0 20 6.0 17 4.0 16 9.0 14 2.0 14 7.0 13 More than 20 years 10 13.0 8 8.0 8 15.0 7 20.0 7 22.0 6 17.0 6 12.0 6 10.0 6 14.0 6 18.0 5 16.0 5 24.0 4 11.0 4 23.0 4 39.0 3 19.0 3 32.0 3 21.0 3 36.0 2 26.0 2 28.0 2 30.0 2 25.0 2 38.0 1 29.0 1 41.0 1 42.0 1 27.0 1 49.0 1 35.0 1 34.0 1 33.0 1 31.0 1 Name: years_in_service, dtype: int64
Above can be seen that there are a lot of unique categories, both string and numeric. Let's clean that up. I'll make 6 categories that will then be used to analyse whether years of service has an effect on the amount of resignations due to dissatisfaction.
def year_cat(element):
cat1 = "<1"
cat2 = '1-2'
cat3 = '3-4'
cat4 = '5-6'
cat5 = '7-10'
cat6 = '>10'
if pd.isna(element):
return np.nan
elif isinstance(element, str):
if element == 'Less than 1 year': return cat1
elif element == '1-2': return cat2
elif element == '3-4': return cat3
elif element == '5-6': return cat4
elif element == '7-10': return cat5
elif element in ['11-20','More than 20 years']: return cat6
elif isinstance(element,float):
if element == 0: return cat1
if element in [1,2]: return cat2
if element in [3,4]: return cat3
if element in [5,6]: return cat4
if element in [7,8,9,10]: return cat5
if element >10: return cat6
combined_up['years_in_service'] = combined_up['years_in_service'].map(year_cat)
combined_up['years_in_service'].value_counts(dropna=False)[['<1', '1-2', '3-4', '5-6', '7-10', '>10', np.nan]]
<1 93 1-2 100 3-4 99 5-6 73 7-10 62 >10 136 NaN 88 Name: years_in_service, dtype: int64
Now that the years in service are neatly categorized it is time to see whether years in service has an influence on likelyhood of an resignee to be dissatisfied. Below the amount of dissatisfied resignees will be summed for each category.
combined_up_dis = combined_up.loc[combined_up['dissatisfied']==True,]
df = (combined_up['years_in_service']
.value_counts(dropna=False)[['<1', '1-2', '3-4', '5-6', '7-10', '>10']]
.to_frame(name='Total')
)
df['Dissatisfied'] = (combined_up_dis['years_in_service']
.value_counts(dropna=False)[['<1', '1-2', '3-4', '5-6', '7-10', '>10']]
)
df['Other reasons'] = df['Total'] - df['Dissatisfied']
df['Dissatisfied %'] = round(df['Dissatisfied'] / df['Total'],2)
df['Other reasons %'] = round(df['Other reasons'] / df['Total'],2)
df.index.name = 'Years in service'
df
Total | Dissatisfied | Other reasons | Dissatisfied % | Other reasons % | |
---|---|---|---|---|---|
Years in service | |||||
<1 | 93 | 30 | 63 | 0.32 | 0.68 |
1-2 | 100 | 27 | 73 | 0.27 | 0.73 |
3-4 | 99 | 28 | 71 | 0.28 | 0.72 |
5-6 | 73 | 31 | 42 | 0.42 | 0.58 |
7-10 | 62 | 32 | 30 | 0.52 | 0.48 |
>10 | 136 | 66 | 70 | 0.49 | 0.51 |
Lets plot those numbers in two horizontal bar graphs.
plt.figure(figsize=(14,8)) #width then height
grid = plt.GridSpec(1,2, wspace = .87, hspace = .25) #rows then columns, space between graphs in width and height
ax1 = plt.subplot(grid[0,0])
ax2 = plt.subplot(grid[0,1])
ax1.barh(df.index, df['Total'],
color='#F97306', alpha=0.3, label='Total resignees'
)
ax1.barh(df.index, df['Dissatisfied'],
color='#F97306', alpha=0.8, label='Dissatisfied resignees'
)
ax1.set_title('Absolute', y=0.955, weight='bold')
ax1.legend(bbox_to_anchor=(1.03,0.97),fontsize=12)
#---------------------------------------------------------------------------------------------------------
df_rel = df[['Dissatisfied %', 'Other reasons %']]
bottom_list = []
for i in df_rel:
b = list(df_rel.loc[:, df_rel.columns[0]: i].sum(axis=1))
bottom_list.append(b)
x_ax = df_rel.index
for i, v in enumerate(df_rel.columns):
if i == 0:
# First column is at the bottom of the chart so bottom is 0 and doesn't need to be specified.
ax2.barh(x_ax, df_rel[v], color='#F97306', alpha=0.8)
else:
ax2.barh(x_ax, df_rel[v], color='#F97306', alpha=0.2, left=bottom_list[i - 1])
ax2.set_title('Relative', y=0.955, weight='bold')
ax2.set_xticks([0,0.2,0.4,0.6,0.8,1])
ax2.set_xticklabels([0,'20%','40%','60%','80%','100%'])
ax2.axvline(x=0.5, color='black', linestyle='dashed', ymin=0.046, ymax=0.953)
for ax in [ax1,ax2]:
for loc in ['bottom','top','left','right']:
ax.spines[loc].set_visible(False)
ax1.text(x=0,y=6.2, s="Resignees with 7+ service years are the most likely to be dissatisfied", size=20, weight='bold')
ax1.text(x=0,y=5.9, s="Resignee dissatisfaction per years of service category in both absolute as relative numbers", size=16)
plt.show()
As can be seen, people with higher years of service are relatively more dissatisfied. However, only one of the categories (7-10) shows more than 50% of dissatisfied people. This shows that more often there are probably other reasons for a person to resign.
That brings the analysis to the second research question to be answered:
Lets begin with checking how many different values there are in the age column.
combined_up['age'].value_counts().sort_index()
20 or younger 10 21 25 33 21-25 29 26 30 32 26-30 35 31 35 32 31-35 29 36 40 32 36-40 41 41 45 45 41-45 48 46 50 39 46-50 42 51-55 71 56 or older 29 56-60 26 61 or older 23 Name: age, dtype: int64
Once again a lot of unique categories, both string and numeric. Let's clean that up. I'll make 9 categories that will then be used to analyse whether age has an effect on the amount of resignations due to dissatisfaction.
combined_up['age'] = combined_up['age'].str.replace(" ","-")
def age_cleanup(element):
if element == "61 or older": return "56 or older"
elif element == "56-60": return "56 or older"
else: return element
combined_up['age'] = combined_up['age'].map(age_cleanup)
combined_up['age'].value_counts(dropna=False).sort_index()
20 or younger 10 21-25 62 26-30 67 31-35 61 36-40 73 41-45 93 46-50 81 51-55 71 56 or older 78 NaN 55 Name: age, dtype: int64
Now that the ages are neatly categorized it is time to see whether age has an influence on likelyhood of an resignee to be dissatisfied. Below the amount of dissatisfied resignees will be summed for each category.
combined_up_dis = combined_up.loc[combined_up['dissatisfied']==True,]
df = combined_up['age'].value_counts().sort_index().to_frame(name='Total')
df['Dissatisfied'] = combined_up_dis['age'].value_counts().sort_index()
df['Other reasons'] = df['Total'] - df['Dissatisfied']
df['Dissatisfied %'] = round(df['Dissatisfied'] / df['Total'],2)
df['Other reasons %'] = round(df['Other reasons'] / df['Total'],2)
df.index.name = 'Age'
display(df)
Total | Dissatisfied | Other reasons | Dissatisfied % | Other reasons % | |
---|---|---|---|---|---|
Age | |||||
20 or younger | 10 | 2 | 8 | 0.20 | 0.80 |
21-25 | 62 | 19 | 43 | 0.31 | 0.69 |
26-30 | 67 | 28 | 39 | 0.42 | 0.58 |
31-35 | 61 | 23 | 38 | 0.38 | 0.62 |
36-40 | 73 | 25 | 48 | 0.34 | 0.66 |
41-45 | 93 | 35 | 58 | 0.38 | 0.62 |
46-50 | 81 | 31 | 50 | 0.38 | 0.62 |
51-55 | 71 | 30 | 41 | 0.42 | 0.58 |
56 or older | 78 | 33 | 45 | 0.42 | 0.58 |
Lets plot those numbers in two horizontal bar graphs.
plt.figure(figsize=(14,8)) #width then height
grid = plt.GridSpec(1,2, wspace = 1, hspace = .25) #rows then columns, space between graphs in width and height
ax1 = plt.subplot(grid[0,0])
ax2 = plt.subplot(grid[0,1])
ax1.barh(df.index, df['Total'], color='#15B01A', alpha=0.3, label='Total resignees')
ax1.barh(df.index, df['Dissatisfied'], color= '#15B01A', alpha=0.8, label='Dissatisfied resignees')
ax1.set_title('Absolute', y=0.955, weight='bold')
ax1.legend(bbox_to_anchor=(1,0.97),fontsize=12)
df_rel = df[['Dissatisfied %','Other reasons %']]
left_list = []
for i in df_rel:
l = list(df_rel.loc[:, df_rel.columns[0]: i].sum(axis=1))
left_list.append(l)
left_list
x_ax = df_rel.index
for i, v in enumerate(df_rel.columns):
if i == 0:
# First column is at the left of the chart so left is 0 and doesn't need to be specified.
ax2.barh(x_ax, df_rel[v], color='#15B01A', alpha=0.8)
else:
ax2.barh(x_ax, df_rel[v], left=left_list[i - 1],color='#15B01A',alpha=0.3)
ax2.set_title('Relative', y=0.955, weight='bold')
ax2.set_xticks([0,0.2,0.4,0.6,0.8,1])
ax2.set_xticklabels([0,'20%','40%','60%','80%','100%'])
ax2.axvline(x=0.5, color='black', linestyle='dashed', ymin=0.046, ymax=0.953)
for ax in [ax1,ax2]:
for loc in ['bottom','top','left','right']:
ax.spines[loc].set_visible(False)
ax1.text(x=0,y=9.45, s="Resignees that are 51+ are the most likely to be dissatisfied", size=20, weight='bold')
ax1.text(x=0,y=9, s="Resignee dissatisfaction per age category in both absolute as relative numbers", size=16)
plt.show()
It was shown that the higher the number of service years the higher the degree of respondents that were dissatisfied. For the variable age however, the relative amount of dissatisfied resignees was more uniformly distributed.