#!/usr/bin/env python
# coding: utf-8

# In this project, we'll clean and analyze exit surveys from employees of the Department of Education, Training and Employment (DETE)}) and the Technical and Further Education (TAFE) body of the Queensland government in Australia. The TAFE exit survey can be found here and the survey for the DETE can be found here.
# 
# We'll pretend our stakeholders want us to combine the results for both surveys to answer the following question:
# 
# Are employees who only worked for the institutes for a short period of time resigning due to some kind of dissatisfaction? What about employees who have been there longer?

# In[35]:


import pandas as pd
import numpy as np


# In[36]:


dete_survey=pd.read_csv("dete_survey.csv")
dete_survey.info()


# In[37]:


tafe_survey=pd.read_csv("tafe_survey.csv")
tafe_survey.info()
tafe_survey.head()


# 
# We can make the following observations based on the work above:
# 
# The dete_survey dataframe contains 'Not Stated' values that indicate values are missing, but they aren't represented as NaN.
# Both the dete_survey and tafe_survey contain many columns that we don't need to complete our analysis.
# Each dataframe contains many of the same columns, but the column names are different.
# There are multiple columns/answers that indicate an employee resigned because they were dissatisfied.

# In[38]:


dete_survey=pd.read_csv("dete_survey.csv",na_values="Not Stated")


# In[39]:


dete_survey_updated=dete_survey.drop(dete_survey.columns[28:49],axis=1)
tafe_survey_updated=tafe_survey.drop(tafe_survey.columns[17:66],axis=1)


# In[40]:


print(dete_survey_updated.columns)
print(tafe_survey_updated.columns)


# In[41]:


dete_survey_updated.columns=dete_survey_updated.columns.str.replace(" ","_").str.lower().str.strip()


# In[42]:


dete_survey_updated.columns


# In[43]:


tafe_survey_updated=tafe_survey_updated.rename({'Record ID': 'id','CESSATION YEAR': 'cease_date'
,'Reason for ceasing employment': 'separationtype'
,'Gender. What is your Gender?': 'gender'
,'CurrentAge. Current Age': 'age'
,'Employment Type. Employment Type': 'employment_status'
,'Classification. Classification': 'position'
,'LengthofServiceOverall. Overall Length of Service at Institute (in years)': 'institute_service'
,'LengthofServiceCurrent. Length of Service at current workplace (in years)': 'role_service'},axis=1)

tafe_survey_updated.columns


# In[44]:


dete_survey_updated.head()


# In[45]:


tafe_survey_updated.head()


# In[46]:


dete_survey_updated["separationtype"].value_counts()


# In[47]:


tafe_survey_updated["separationtype"].value_counts()


# In[48]:


tafe_resignations=tafe_survey_updated[tafe_survey_updated["separationtype"] == "Resignation"].copy()
tafe_resignations.head()


# In[49]:


dete_resignations = dete_survey_updated[dete_survey_updated['separationtype'] == "Resignation-Other reasons"].copy()


# In[50]:


dete_resignations["cease_date"].value_counts()


# In[51]:


dete_resignations['cease_date'] = dete_resignations['cease_date'].str.split('/').str[-1]

dete_resignations['cease_date'] = dete_resignations['cease_date'].astype("float")


# In[52]:


dete_resignations["cease_date"].value_counts()


# In[53]:


dete_resignations["dete_start_date"].value_counts().sort_index(ascending=True)


# From the work we did in the last screen, we can verify:
# 
# There aren't any major issues with the years.
# The years in each dataframe don't span quite the same number of years. We'll leave it up to your discretion to drop any years you don't think are needed for the analysis.

# In[54]:


dete_resignations.head()


# In[55]:


dete_resignations["institute_service"]= dete_resignations["cease_date"] - dete_resignations["dete_start_date"]
dete_resignations["institute_service"].head()


# In the last screen, we created a new institute_service column that we'll use to analyze survey respondents according to their length of employment.Next, we'll identify any employees who resigned because they were dissatisfied.

# In[56]:


tafe_resignations['Contributing Factors. Dissatisfaction'].value_counts()


# In[57]:


tafe_resignations['Contributing Factors. Job Dissatisfaction'].value_counts()


# In[58]:


def update_vals(x):
    if x == '-':
        return False
    elif pd.isnull(x):
        return np.nan
    else:
        return True


# In[59]:


tafe_resignations['dissatisfied'] = tafe_resignations[['Contributing Factors. Dissatisfaction', 'Contributing Factors. Job Dissatisfaction']].applymap(update_vals).any(1, skipna=False)


# In[60]:


tafe_resignations_up=tafe_resignations['dissatisfied'].copy()


# In[61]:


dete_resignations['dissatisfied'] =dete_resignations.any(1, skipna=False)


# In[62]:


dete_resignations_up=dete_resignations['dissatisfied'].copy()


# In[63]:


dete_resignations_up['institute'] = 'DETE'
tafe_resignations_up['institute'] = 'TAFE'


# In[66]:


combined = pd.concat([dete_resignations_up, tafe_resignations_up], ignore_index=True)


# In[65]:


combined_updated = combined.dropna(thresh = 500, axis =1).copy()


# In[ ]:


combined_updated['institute_service_up'] = combined_updated['institute_service'].astype('str').str.extract(r'(\d+)')


# In[ ]:


combined_updated['institute_service_up'] = combined_updated['institute_service_up'].astype('float')


# In[ ]:


combined_updated['institute_service_up'].value_counts()


# In[ ]:


def transform_service(val):
    if val >= 11:
        return "Veteran"
    elif 7 <= val < 11:
        return "Established"
    elif 3 <= val < 7:
        return "Experienced"
    elif pd.isnull(val):
        return np.nan
    else:
        return "New"
combined_updated['service_cat'] = combined_updated['institute_service_up'].apply(transform_service)


# In[ ]:


combined_updated['service_cat'].value_counts()


# In[ ]: