#!/usr/bin/env python
# coding: utf-8

# # Job Dissatisfaction and Exit Surveys: 
# ## Analyzing age and length of job retention and their  association with certain factors 
# 
# ![Image](https://www.aihr.com/wp-content/uploads/Job-Dissatisfaction-cover-1000x523-1.png)
# 
# 
# This project aims to first clean exit survey data from the Department of Education, Training, and Employement (DETE) as well as the Technical and Further Education (TAFE) institute. From there, the data will be combined and factors affecting job dissatisfaction will be assessed in order to draw conclusions on what the most important factors were according to the exit surveys. 

# In[1]:


import pandas as pd
import numpy as np


# In[2]:


dete_survey = pd.read_csv('dete_survey.csv', na_values = 'Not Stated') ## this reads 'Not Stated'
                                                                       ## values in as NaN


# In[3]:


tafe_survey = pd.read_csv('tafe_survey.csv')


# In[4]:


pd.options.display.max_columns = 150
dete_survey.info()


# In[5]:


dete_survey.head()


# In[6]:


print(dete_survey['SeparationType'].value_counts())


## ↓↓↓↓ Other columns of interest:
# print(dete_survey['DETE Start Date'].value_counts())
# print('\n')
# print(dete_survey['Role Start Date'].value_counts())
# print('\n')
# print(dete_survey['Age'].value_counts())
# print('\n')
# print(dete_survey['Cease Date'].value_counts())
# print('\n')


# In[7]:


print(tafe_survey.info())

## ↓↓↓↓ Other columns of interest:
# print(tafe_survey['CurrentAge. Current Age'].value_counts())
# print('\n')
# print(tafe_survey['CESSATION YEAR'].value_counts())
# print('\n')
# print(tafe_survey['LengthofServiceCurrent. Length of Service at current workplace (in years)'].value_counts())
# print('\n')
# print(tafe_survey['LengthofServiceOverall. Overall Length of Service at Institute (in years)'].value_counts())
# print('\n')
# print(tafe_survey['Reason for ceasing employment'].value_counts())
# print('\n')

# tafe_survey.isnull()


# In[8]:


tafe_survey.head()


# ## Basic information gathered:
# 
# * Information from both exit surveys include and create an idea for:
#     * The age groups of the individuals 
#     * Amount of time employed
#     * When employees started
#     * Reasons for leaving
# 
# ### Some important takeaways from above: 
# 
# #### Within the DETE Survey:
#  * In reason for leaving, some interesting categories to evaluate further are: 
#     * Resignation-Other reasons
#     * Resignation-Other employer 
#     * Resignation-Move overseas/interstate
#  * More individuals left in 2012 than 2013
#  * Individuals older than the age of 50 comprise well over half of the individuals leaving
#  * Length of time is not calculated explicitly
# 
# #### Within the TAFE Survey:
#  * In reason for leaving, some interesting categories to evaluate further are:
#     * The categories for TAFE give a more general term 'Resignation' in 'Reason for ceasing employment' 
#     * The number of Resignations in TAFE relative to all other reasons in Resignations as proportion is much higher when looking at this same metric in DETE. 
#  * Length of time is explicitly calculated with its own column
#  * Like with the DETE survey, Individuals older than the age of 50 comprise well over half of the individuals leaving.

# In[9]:


dete_drop = dete_survey.iloc[:,28:49]
dete_survey_updated = dete_survey.drop(dete_drop,axis=1)
dete_survey_updated.info()


# In[10]:


tafe_drop = tafe_survey.iloc[:,17:66]
tafe_survey_updated = tafe_survey.drop(tafe_drop,axis=1)
tafe_drop.info()


# ### ↑↑↑↑ The above code drops unnecessary columns for both TAFE and DETE exit surveys. Only columns with commonalities across both surveys will be further analyzed. 

# In[11]:


dete_survey_updated.columns


# In[12]:


tafe_survey_updated.head()


# In[13]:


dete_survey_updated.columns = dete_survey_updated.columns.str.replace('/','_').str.replace(' ','_').str.lower()
dete_survey_updated.head()


# In[14]:


mapping_info = {'Record ID': 'id',
               'CESSATION YEAR': 'cease_date',
               'Reason for ceasing employment': 'separation_type',
               'Gender. What is your Gender?': 'gender',
               'CurrentAge. Current Age': 'age',
               'Employment Type. Employment Type': 'employment_status',
               'Classification. Classification': 'position',
               'LengthofServiceOverall. Overall Length of Service at Institute (in years)': 'institute_service',
               'LengthofServiceCurrent. Length of Service at current workplace (in years)': 'role_service'}

tafe_survey_updated = tafe_survey_updated.rename(mapping_info,axis=1)


# In[15]:


tafe_survey_updated.head()


# In[16]:


dete_survey_updated['separationtype'].value_counts()


# In[17]:


tafe_survey_updated['separation_type'].value_counts()


# In[18]:


dete_survey_updated['separationtype'] = dete_survey_updated['separationtype'].str.split('-').str[0]
dete_survey_updated['separationtype'].value_counts()


# In[19]:


dete_resignation = dete_survey_updated[dete_survey_updated['separationtype'] == 'Resignation'].copy()

tafe_resignation = tafe_survey_updated[tafe_survey_updated['separation_type']=='Resignation'].copy


# In[20]:


tafe_survey_updated = tafe_survey_updated.copy()[tafe_survey_updated['separation_type'] == 'Resignation']
tafe_resignations = tafe_survey_updated


# ##  Creating New Dataframes for TAFE and DETE: 
# For the separation type category (or column) in both surveys, only Resignation can be analyzed as a direct reason for dissatisfaction. Therefore, new data frames are created using only Resignation as the separation type. This should provide a clearer view of job dissatisfaction moving forward. 

# In[21]:


dete_resignation['cease_date'].value_counts()


# In[22]:


dete_resignation['cease_date'] = dete_resignation['cease_date'].str.split('/').str[-1]
dete_resignation['cease_date'].value_counts()


# In[23]:


value_counts_dete = dete_resignation["cease_date"].astype(float).value_counts()
dete_resignation['cease_date'] = dete_resignation['cease_date'].astype(float)
print(value_counts_dete)


# In[24]:


value_counts_tafe = tafe_resignations["cease_date"].value_counts()
print(value_counts_tafe)


# In[25]:


value_counts_dete.sort_index(ascending = False)


# In[26]:


value_counts_tafe.sort_index(ascending = False)


# In[27]:


dete_resignation.info()


# In[28]:


get_ipython().run_line_magic('matplotlib', 'inline')
dete_resignation.boxplot(column = ['dete_start_date','role_start_date','cease_date']) 


# #### The boxplot is unusually small because of the outlier at around 250. Clearly no one started in the first millenium so this data point needs to be removed. This type of visualization also helps understand where there may be obvious outliers—instead of a function like value counts where several indices may make it more difficult to pick out an index which is out of place. Below, this single outlier is removed and a more accurate range provides a more interpretable graph. 

# In[29]:


dete_resignation = dete_resignation[dete_resignation['role_start_date'] != 200.0]

dete_resignation['role_start_date'].value_counts()

dete_resignation.boxplot(column = ['dete_start_date','role_start_date','cease_date']) 
                                                       

# In[30]:


dete_resignation = dete_resignation[(dete_resignation['cease_date'] != 2006.0) & (dete_resignation['cease_date'] != 2010.0)]

tafe_resignations = tafe_resignations[tafe_resignations['cease_date'] != 2009.0]


# In[31]:


dete_resignation['cease_date'].value_counts()


# In[32]:


tafe_resignations['cease_date'].value_counts()


# In[33]:


dete_resignation['institute_service'] = dete_resignation['cease_date'] - dete_resignation['dete_start_date'] 


# #### In the boxplots above, the interquartile range is in the late 90's to 2010 for DETE start date and mid 2000's to just after 2010 for the role start date. The interquartile range for the role start date occurs on a slightly more delayed time frame. Cease date also has a very tight interquartile range comparatively and so 2006 as well 2009 years were removed. 

# In[34]:


tafe_resignations['Contributing Factors. Dissatisfaction'].value_counts()


# In[35]:


tafe_resignations['Contributing Factors. Job Dissatisfaction'].value_counts()


# In[36]:


tafe_resignations.head()


# In[37]:


dete_resignation.head()


# In[40]:


def update_vals(val):
    if pd.isnull(val):
        return np.nan
    elif val == '-':
        return False
    else:
        return True

tafe_resignations.copy()['dissatisfied'] = tafe_resignations[['Contributing Factors. Dissatisfaction','Contributing Factors. Job Dissatisfaction']].applymap(update_vals).any(axis=1, skipna=False)    
tafe_resignations_up = tafe_resignations

tafe_resignations['dissatisfied'].value_counts(dropna = False)


# 

# In[41]:


dete_resignation['dissatisfied'] = dete_resignation[['job_dissatisfaction', 'dissatisfaction_with_the_department','lack_of_recognition','lack_of_job_security','work_location','employment_conditions','work_life_balance','workload']].any(axis=1,skipna=False)

dete_resignation['dissatisfied'] = dete_resignation[['job_dissatisfaction', 'dissatisfaction_with_the_department','lack_of_recognition','lack_of_job_security','work_location','employment_conditions','work_life_balance','workload']]
dete_resignation['dissatisfied'].isnull().sum() ###applymap is not necessary in this circumstance


# In[42]:


dete_resignations_up = dete_resignation.copy()

dete_resignations_up['dissatisfied'].value_counts(dropna=False)


# ### Using relevant columns to understand dissatisfaction:
# 
# * created 'dissatisfied' column which determined whether or not a client left because some of the original data set's determined columns.
# * Those columns in DETE are: `job_dissatisfaction`,`dissatisfaction_with_the_department`,
# `lack_of_recognition`,`lack_of_job_security`,`work_location`,
# `employment_conditions`,`work_life_balance`, and `workload`. For the TAFE survey, those columns are: `Contributing Factors. Dissatisfaction` and `Contributing Factors. Job Dissatisfaction`
# * defined a function to transform elements in columns to either False, True or NaN
# * dete_resignation df did not have NaN values so code looks different!
# * These columns are chosen to focus and understand on reasons for job dissatisfaction in terms of True and False. 

# In[43]:


dete_resignations_up.copy()['institute'] = 'DETE'
tafe_resignations_up.copy()['institute'] = 'TAFE'


# In[44]:


combined = pd.concat([tafe_resignations_up,dete_resignations_up], ignore_index=True)


# In[45]:


combined_updated = combined.dropna(axis=1,thresh=500)


# In[46]:


combined_updated.head()


# ### Dropping null values:
# 
# * Both dataframes are combined keeping all columns from each data frame. 
# * For non-null values in a column which are less than 500, this columns is dropped

# In[47]:


combined_updated['institute_service'].value_counts(dropna=False)


# In[50]:


combined_updated.copy()['institute_service_up'] = combined_updated['institute_service'].astype('str').str.extract(r"(\d+)")
combined_updated.copy()['institute_service_up'] = combined_updated['institute_service_up'].astype('float')                                                                                     

combined_updated.copy()['institute_service_up'].value_counts()


# ## Extracting singular year values:  
# After looking at the frequency for the amount of years using  value_counts, the numbers needed to be extracted—either from strings or from ranges. Once those values were extracted, value_counts was used again to evaluate frequencies. While the ranges aren't averaged and the phrases before extraction don't give exact values, it doesn't necessarily mean the data will be misrepresented. For instance, "5-6" years changes to 5 years, however since 5-6 years fits into the 3-6 years category which will be created below, it makes no difference if the code written counts more values towards 5 years than 6 years. Likewise, with an expression like 'More than 20 years', if the threshold for "Veteran" is anyone who's served at the company longer than 11 years, than the code extracting "More than 20 years" as just 20 years, will ultimately be categorized correctly as more than 11 years.  

# In[53]:


def exp_range(val):
    ## if pd.isnull(val):
        ## return 'NaN'
    
    if val < 3:
        return "New: Less than 3 years"
    elif 3 < val < 6: 
        return "Experienced: 3-6 years"
    elif 7 < val < 10: 
        return "Established: 7-10 years"
    else: 
        return "Veteran 11 or more years"
        
combined_updated.copy()['service_cat'] = combined_updated['institute_service_up'].apply(exp_range)
combined_updated.dropna(axis=0,subset=['service_cat'])


# In[54]:


combined_updated.copy()['dissatisfied'].value_counts(dropna = False)


# In[55]:


combined_updated.copy()['dissatisfied'] = combined_updated['dissatisfied'].fillna(False)


# In[60]:


combined_updated.copy()['dissatisfied'].value_counts()
combined_updated.copy()['dissatisfied'] = combined_updated['dissatisfied'].astype(float)


# In[61]:


service_cat_pv = combined_updated.pivot_table(values = 'dissatisfied',index = 'service_cat',margins=False)
print(service_cat_pv)


# In[62]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
service_cat_pv.plot(kind = 'bar', title = 'Average Dissatisfaction Based on time with company', xlim=(0,1), legend = False)

plt.ylabel('percentage dissatisfied')


# # Final Observations:
# 
# The following bar plot indicates all experience levels are dissatisfied within the range of about 18-23%. The most dissatisfied group—based on service time is the Established group (about 23%). The Veteran group (followed very closely by the New group) are second most dissatisfied (~21%). The least dissatisfied was the Experienced group (about 18%). This is perhaps because those who are established are looking to expand their skillset and didn't think they could do that at their current company. 

# #  <span style="font-family:Papyrus; font-size:2em;">End of Project!</span>
# 
# [curious about the format?](https://support.squarespace.com/hc/en-us/articles/206543587-Markdown-cheat-sheet) 
#