#!/usr/bin/env python # coding: utf-8 # ### Analysis Of Employee Exit Surveys # In this project, we are going to concentrate on the two Employee Exit Survey Datasets,to understand the resignation of employees from the company over the period of the time.Datasets are contributed by Department of Education, Training and Employment (DETE) and the Technical and Further Education (TAFE) institute in Queensland, Australia. # # Our goal is aligned with: # # 1)Are employees who only worked for the institutes for a short period of time resigning due to some kind of dissatisfaction? What about employees who have been there longer? # # 2)Are younger employees resigning due to some kind of dissatisfaction? What about older employees? # # TO get viable answers to the questions,we assume our clients(stakeholders) want combine the results for both surveys. # In[64]: import numpy as np import pandas as pd dete_survey=pd.read_csv("dete_survey.csv") tafe_survey=pd.read_csv("tafe_survey.csv") # In[65]: dete_survey.head(4) # In[66]: dete_survey.info() # In[67]: tafe_survey.head() # In[68]: tafe_survey.info() # 1)THere are many columns in both dasets which are not needed for analysis.we need to delete them. # # 2)we need to convert not stated values to nan.Not stated values are missing values. # # 3)THere are multiple columns in both datasets which has same values or same meaning with different column names. # In[69]: #Identify Missing Values and Drop Unneccessary Columns dete_survey=pd.read_csv("dete_survey.csv",na_values="Not Stated") # Following columns are deleted which are found unnecessary to put in our analysis.When we inspect these columns, the values are not substanial benefit to our analysis. # In[70]: dete_survey.columns[28:49] # In[71]: tafe_survey.columns[17:66] # In[72]: dete_survey_updated=dete_survey.drop(dete_survey.columns[28:49],axis=1) tafe_survey_updated=tafe_survey.drop(tafe_survey.columns[17:66],axis=1) # Each dataframes contains columns which have same meaning but different column names.We are looking at the column names similar to both datasets: # # # # These are column names we are looking for our analysis.So we combine them.Before combining, we rename the columns in both datasets in standaradised form. # # In[73]: dete_survey_updated.columns=dete_survey_updated.columns.str.strip().str.replace(" ","_").str.upper() dete_survey_updated.columns # In[74]: ##Updating column names in tafe survey to match with dete_survey_updated name_change= {'Record ID': 'id', 'CESSATION YEAR': 'cease_date', 'Reason for ceasing employment': 'separationtype', 'Gender. What is your Gender?': 'gender', 'CurrentAge. Current Age': 'age', 'Employment Type. Employment Type': 'employment_status', 'Classification. Classification': 'position', 'LengthofServiceOverall. Overall Length of Service at Institute (in years)': 'institute_service', 'LengthofServiceCurrent. Length of Service at current workplace (in years)': 'role_service'} # In[75]: ## check the columns tafe_survey_updated=tafe_survey_updated.rename(name_change,axis=1) tafe_survey_updated.columns # ###### Filtering the data # In[76]: #There are various reasons listed for seperationtype in both datasets. #for our analysis we are exclusively focus on sepration type-Resignation.Filtering out the Resignation alone from both datasets resignation_pattern="Resignation" dete_resignations=dete_survey_updated[dete_survey_updated['SEPARATIONTYPE'].str.contains(resignation_pattern)] #dete_resignations["SEPARATIONTYPE"]= "Resignation".copy() dete_resignations.loc[:,"SEPARATIONTYPE"] = "Resignation" dete_resignations.head() # In[77]: resignation_pattern="Resignation" tafe_resignations=tafe_survey_updated[tafe_survey_updated['separationtype']=="Resignation"].copy() tafe_resignations.head(2) # ###### Verify the data # # The two important columns in both dataset are cease date and start date,which shows the employee duration in the company.But both columns in the dataset are not clean.Both columns have to be filtered such that they contain only the years.Given that most people in this field start working in their 20s, it's also unlikely that the dete_start_date was before the year 1940. # In[78]: dete_resignations["CEASE_DATE"].value_counts() # In[79]: dete_resignations["CEASE_DATE"]=dete_resignations["CEASE_DATE"].str.split("/").str[-1] dete_resignations["CEASE_DATE"]=dete_resignations["CEASE_DATE"].astype(float).copy() # In[80]: dete_resignations["DETE_START_DATE"].value_counts() # In[81]: tafe_resignations["cease_date"].value_counts().sort_index(ascending=True) # In[82]: dete_resignations["CEASE_DATE"].value_counts().sort_index(ascending=True) # In[83]: import matplotlib.pyplot as plt fig = plt.figure(figsize=(12, 12)) fig.add_subplot(2,1,1) dete_resignations.boxplot(column=["CEASE_DATE"]) plt.title("dete_survey_employee_exit") fig.add_subplot(2,1,2) tafe_resignations.boxplot(column=["cease_date"]) plt.title("tafe_survey_employee_exit") plt.show() # Below are our Findings: # # IN both surveys,there is a siginificant difference in number of employees in exited in a given period of time.While tafe survey suggest that maximum number of employees resignation happend in 2011,the dete survey emphasis the resignation happened in 2013. # Since we aren't concerned with analyzing the results by year, we'll leave them as is. # ##### Creating a new columnn # # Our goal is to find how long the employees remain in the company before resigning his position due to dissatifaction. # # The tafe_resignations dataframe already contains a "service" column, which we renamed to institute_service. # # Below, we calculate the years of service in the dete_survey_updated dataframe by subtracting the dete_start_date from the cease_date and create a new column named institute_service. # In[84]: dete_resignations['INSTITUTE_SERVICE'] = dete_resignations['CEASE_DATE'] - dete_resignations['DETE_START_DATE'] dete_resignations['INSTITUTE_SERVICE'].head() # Our goal is identify whether the long term or short trem employees exited from organisation due to dissatisfaction.We have many column which states resaoning of employees resignation.We focus on the columns which categorize employees as "dissatisfied" with the company from each dataframe. # # tafe_survey_updated: # # 1)Contributing Factors. Dissatisfaction # # 2)Contributing Factors. Job Dissatisfaction # # detesurveyupdated: # # 1 job_dissatisfaction # # 2 dissatisfaction_with_the_department # # 3 physical_work_environment # # 4 lack_of_recognition # # 5 lack_of_job_security # # 6 work_location # # 7 employment_conditions # # 8 work_life_balance # # 9 workload # # The idea is when the employee in both survey mentioned any of the above reasons,we mark true in the new column named "Dissatisfied".New column in both dataframes have following values: # # True: indicates a person resigned because they were dissatisfied in some way # # False: indicates a person resigned because of a reason other than dissatisfaction with the job # # NaN: indicates the value is missing # In[85]: tafe_resignations["Contributing Factors. Dissatisfaction"].value_counts() # In[86]: tafe_resignations["Contributing Factors. Job Dissatisfaction"].value_counts() # In[87]: ##Clearly from the above value counts, we can say some of people resigned due to dissatisfaction.Now,in new column we #have to assign the value as true if any of above reasons mentioned or assign the values as False if above reasons listed ##as"-" are mentioned in both columns or nan def update_vals(val): if pd.isnull(val): return np.nan elif val=="-": return False else: return True # In[88]: tafe_resignations["Dissatisfied"]=tafe_resignations[['Contributing Factors. Dissatisfaction', 'Contributing Factors. Job Dissatisfaction']].applymap(update_vals).any(axis=1, skipna=False) # In[89]: tafe_resignations["Dissatisfied"].value_counts(dropna=False) # In[90]: ##Let us follw the same steps for dete_resignations dataframe to create a new colum dissatisfaction # In[91]: k=["JOB_DISSATISFACTION","DISSATISFACTION_WITH_THE_DEPARTMENT","PHYSICAL_WORK_ENVIRONMENT","LACK_OF_RECOGNITION" ,"LACK_OF_JOB_SECURITY","WORK_LOCATION","EMPLOYMENT_CONDITIONS","WORK_LIFE_BALANCE","WORKLOAD"] # In[92]: for i in k: print(i,"Value count") print(dete_resignations[i].value_counts(dropna=False)) print() # In[93]: dete_resignations["Dissatisfied"]=dete_resignations[["JOB_DISSATISFACTION","DISSATISFACTION_WITH_THE_DEPARTMENT","PHYSICAL_WORK_ENVIRONMENT","LACK_OF_RECOGNITION" ,"LACK_OF_JOB_SECURITY","WORK_LOCATION","EMPLOYMENT_CONDITIONS","WORK_LIFE_BALANCE","WORKLOAD"]].any(axis=1,skipna=False) # In[94]: dete_resignations["Dissatisfied"].value_counts() # In[95]: tafe_resignations_up = tafe_resignations.copy() dete_resignations_up=dete_resignations.copy() # In[96]: ##converting all the columns to lower case tafe_resignations_up.columns=tafe_resignations_up.columns.str.lower() dete_resignations_up.columns=dete_resignations_up.columns.str.lower() # ### Combing the dataframes # Noe we have cleaned,filtered and verified the data in both surveys dataset.To achieve our goal we must combine the datasets.Before COmbing the datasets, we need to deicated column in each dataframe to indentify which easily distinguish between the two surveys # In[97]: ##column named institute to dete_resignations_up.##column named institute to tafe_resignations_up tafe_resignations_up["institute"]="DETE" dete_resignations_up["institute"]="TAFE" # In[98]: combined = pd.concat([dete_resignations_up, tafe_resignations_up],ignore_index=True) # In[99]: ##Checking how many values are present in each column combined.notnull().sum().sort_values() # In[100]: # Drop columns with less than 500 non null values combined_updated = combined.dropna(thresh = 500, axis =1).copy() # In[101]: combined_updated # ##### Tranforming data in institute service # # Our journey to reaching the goal has reached final stage - analysis of data.So far what we did was cleaning and filtering the datasets according to our needs.One category we need to transform is,institute serivice which has values like this: # # NaN 88 # # Less than 1 year 73 # # 1-2 64 # # # # To analyze the data, we'll convert these numbers into categories. We'll base our anlaysis on this article(https://www.businesswire.com/news/home/20171108006002/en/Age-Number-Engage-Employees-Career-Stage), which makes the argument that understanding employee's needs according to career stage instead of age is more effective. # # we use the following modified definitions : # # New: Less than 3 years at a company # # Experienced: 3-6 years at a company # # Established: 7-10 years at a company # # Veteran: 11 or more years at a company # In[102]: #Unique values in institute_service combined_updated['institute_service'].value_counts(dropna=False) # In[103]: combined_updated['institute_service_new']=combined_updated['institute_service'].astype(str).str.extract(r'(\d+)') #Refer: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html #Refer: https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html combined_updated['institute_service_new'] = combined_updated['institute_service_new'].astype('float') # Check the years extracted are correct combined_updated['institute_service_new'].value_counts().head(5) # In[104]: def upval(value): if value < 3: return "New: Less than 3 years at a company" elif 3 <= value < 7: return "Experienced: 3-6 years at a company" elif 7 <= value < 11: return "Established: 7-10 years at a company" elif value >= 11: return "Veteran: 11 or more years at a company" else: #pd.isnull(value): return np.nan # In[105]: combined_updated['service_cat']=combined_updated['institute_service_new'].apply(upval) # In[106]: combined_updated['service_cat'].value_counts(dropna=False) # In[107]: combined_updated # ###### Analysis of data # Now we have finalised a categories of the employee resignation based number of years they worked in organisation.We can do analysis to find out how many employees exited due to dissatifaction.Whether the dissatisfaction is found among new employees. # # To deal with the missing values in disatisfied columm, we'll replace the missing values in the dissatisfied column with the most frequent value, False.Based on the updated values in disatisfied column we will plot the results. # # we still have additional missing values left to deal with. This is meant to be an initial introduction to the analysis, not the final analysis. # In[108]: combined_updated["dissatisfied"].value_counts(dropna = False) # In[109]: #replace the missing values in the dissatisfied column with the most frequent value, False combined_updated['dissatisfied']= combined_updated["dissatisfied"].fillna(False) # In[110]: #Percentage break of employees who are dissatified based on the duraion of the service service_cat_dis_pivot=pd.pivot_table(combined_updated,values='dissatisfied',index='service_cat') service_cat_dis_pivot # In[111]: service_cat_dis_pivot.plot(kind='barh', legend=False, figsize=(10,5), fontsize=12,color=(0.8, 0.3, 0.7, 0.5),edgecolor='blue') # From above bar plot,from our intial analysis we can conclude that veteran and establised employees are leaving the organisation due to dissatisfaction with the company.Employees who have service more than 7 years in the organisation are likely to resign due to dissatisfaction. # ##### Additional Goals # 1)We have included missing value in dissatisfied column.we will explore and fill the missing values in other columns and try to aggregate the data according to the service_cat column again. How many people in each career stage resigned due to some kind of dissatisfaction? # # 2)How many people in each age group resgined due to some kind of dissatisfaction? # # 3)Perform analysis on individual suveys(DETE and TAFE) # # # In[112]: combined_updated.isnull().sum() # In[113]: #institute_service,institute_service_new,service_cat are all same columns which are mention the lengthn of service #in different ways combined_updated=combined_updated.drop(["institute_service"],axis=1) # In[114]: #How many people in each career stage resigned due to some kind of dissatisfaction? pivot_tab=pd.pivot_table(combined_updated,values='dissatisfied',index='service_cat',aggfunc='sum') pivot_tab # Above values show that Established profeesional leaved the company due to dissatisfaction.Even though the many new recruiters left the organisation,percentage of new employees left from total recruitment in last three years is 30%. # In[115]: ###### Cleaning up and transforming the age column combined_updated["age"].value_counts(dropna=False) # From noticed above,cleaning up age column is trickier because we cannot apply same logic of "institute_service" column to age. # We do analysis on the present age values and do not fill any missing age values. # In[116]: #df[df.Last_Name.notnull()] # get the index of rows with nan values on column "institute_service" combined_updated_age = combined_updated[combined_updated.age.notnull()] #Other way of doing it #index_rows = combined_updated[combined_updated["age"].isnull()].index #combined_age= combined_updated.copy().drop(labels = index_rows) # In[117]: combined_updated_age["age"]=combined_updated_age["age"].str.replace(" ","-").copy() # In[118]: combined_updated_age["age"]=combined_updated_age["age"].astype(str).str.strip() combined_updated_age["age"].value_counts() # In[119]: #Now we will change the the column in age suitable to our analysis with the help of function and apply method. def age (val): if val[0]=="2": return "20 -29" elif val[0]=="3": return "30 -40" elif val[0]=="4": return "40 -50" elif val[0]=="5": return "50 -60" else: return "60 and above" # In[120]: combined_updated_age["age"]=combined_updated_age["age"].apply(age) # In[121]: #Let us check for the values combined_updated_age["age"].value_counts() # In[122]: pivot_tab=pd.pivot_table(combined_updated_age,values=['dissatisfied'],index='age') # In[123]: pivot_tab.plot(kind='barh', legend=False, figsize=(10,5), fontsize=12,color=(0.8, 0.3, 0.7, 0.5),edgecolor='blue') # People who are above age 50 and 60 resigned their job mainly due to job disatisfaction among other reasons # ##### Analyze each survey separately to get to know which age group resigned due to what dis-satisfaction reason. # In[138]: dete_resignations.columns # In[139]: dete_resignations.AGE.value_counts() #JOB_DISSATISFACTION,DISSATISFACTION_WITH_THE_DEPARTMENT,LACK_OF_JOB_SECURITY,WORK_LOCATION,EMPLOYMENT_CONDITIONS,RELOCATION,WORK_LIFE_BALANCE,WORKLOAD # In[140]: dete_resignations.notnull().sum() # In[141]: dete_resignations["Dissatisfied"].value_counts(dropna=False) # In[142]: #Removing the columns which have not null values less than 275 dete_resignations_up = dete_resignations.dropna(thresh = 275, axis =1).copy() # In[143]: dete_resignations_up.columns # Our goal is to find which of mentioned disatisfaction is the main reason for driving the employee out of the organisation.For our analysis, we further customize the age suitable to our needs. # # In[144]: def age (val): if val[0]=="2": return "20-29" elif val[0]=="3": return "30-40" elif val[0]=="4": return "40-50" elif val[0]=="5": return "50-60" else: return "60 and above" # In[145]: dete_resignations_up["AGE"]=dete_resignations_up.AGE.astype(str) # In[146]: dete_resignations_up["AGE"]=dete_resignations_up["AGE"].apply(age) dete_resignations_up["AGE"].value_counts() # In[149]: selected_cols=['JOB_DISSATISFACTION', 'DISSATISFACTION_WITH_THE_DEPARTMENT','PHYSICAL_WORK_ENVIRONMENT', 'LACK_OF_RECOGNITION','LACK_OF_JOB_SECURITY', 'WORK_LOCATION', 'EMPLOYMENT_CONDITIONS','RELOCATION','WORK_LIFE_BALANCE','WORKLOAD'] # In[156]: #Check if all the selected columns are in boolean type dete_resignations_up[selected_cols] = dete_resignations_up[selected_cols].astype(bool) # In[161]: #Percentage break of employees who are dissatified based on the duraion of the service pivot_dete=pd.pivot_table(dete_resignations_up,values=selected_cols,index= "AGE") pivot_dete # In[253]: selected_cols=list(pivot_dete.columns) len(selected_cols) # In[ ]: # In[390]: pivot_dete.plot(kind='bar', legend=True, figsize=(24,12), fontsize=15,edgecolor='blue',ylim=(0,0.35),colormap="Paired",title="Age breakup of people who are leaving job due to dissatisfaction") # ##### From Dete Survey we can confirm,that work relocation has been the significant factor among all age groups especially are in late 20s,30s and 60s for resignation from the job. # ##### The work life balance also had an impact of employee leaving the organisation. # ###### Tafe survey analysis # In[336]: tafe_resignations.columns # IN tafe survey we will focus on the dissatifaction columns -'Contributing Factors. Dissatisfaction', 'Contributing Factors. Job Dissatisfaction'.THere is no specific dissatifaction reason is mentioned in columns of tafe survey.So we look at dissatisfied column # In[361]: tafe_resignations["Dissatisfied"].value_counts() # In[362]: def age (val): if val[0]=="2": return "20-29" elif val[0]=="3": return "30-40" elif val[0]=="4": return "40-50" elif val[0]=="5": return "50-60" else: return "60 and above" # In[364]: tafe_resignations_up["AGE"]=dete_resignations_up["AGE"].apply(age) tafe_resignations_up["AGE"].value_counts(dropna=False) # In[388]: tafe_resignations_up["Dissatisfied"]=tafe_resignations_up["Dissatisfied"].astype(bool) pivot_tafe=pd.pivot_table(tafe_resignations_up,values="Dissatisfied",index= "AGE") pivot_tafe # In[387]: pivot_tafe.plot(kind='barh', legend=True, fontsize=15,edgecolor='blue',colormap="Paired") # From tafe survey we can conclude that senior age group employees(45%) form a significant portion in leaving the organisation due to dissatifaction. # In[419]: pivot_combined_gender=pd.pivot_table(combined_updated,values="dissatisfied",index= ["gender","service_cat"]) pivot_combined_gender.plot(kind='barh', legend=True, fontsize=15,edgecolor='blue',color="green",figsize=(24,12)) # ## Conclusion # # In this project we analysed the exit surveys from employees of the Department of Education, Training and Employment (DETE) and the Technical and Further Education (TAFE) institute in Queensland, Australia. # O # ur goal is aligned with: # # 1)Are employees who only worked for the institutes for a short period of time resigning due to some kind of dissatisfaction? What about employees who have been there longer? # Employees who have been longer left the organisation due to dissatisfaction.Eventhogh significant percentage of short term employees are leaving the organisation,they are much lower compared to senior employees. # # 2)Are younger employees resigning due to some kind of dissatisfaction? What about older employees? # # Around 35% and 45 % of the employees belonging to the age group 20-29 and 30-39 are leaving the organisation due to dissatisfaction.Old Employees are leaving the organisation even more around 55% and 50%. # # We also found relocation is one of the main reasons for leaving the company.This can be seen across all age group of the people particularly among late 20s ,30s and 60s. # # #