#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import numpy as np import matplotlib import matplotlib.pyplot as plt import seaborn as sns import random sns.set() from jupyterthemes import jtplot # jtplot.style() jtplot.style('grade3', context='paper', fscale=1.5, ticks=True, grid=False) # jtplot.figsize(x=15., y=9.,aspect=1.2) get_ipython().run_line_magic('matplotlib', 'inline') # In[ ]: # In[ ]: #
#
# # Explore the data #
#
# The first step of any data analysis is to get a feel for the data.
# Questions along the following lines should be asked:
# + Do the columns make sense?
# + Do the values make sense?
# + Are there any missing values?
# + Are there any outliers? If yes, what's their story?
# + Which variables are strongly correlated?

# In[ ]:


# In[2]:

# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv("HR_comma_sep.csv")


# In[3]:

df.head()


# In[4]:

df.describe()


# In[5]:

df.info()


# In[6]:

# Clean-up: the 'sales' column is renamed to 'department', then the two
# columns listed below are one-hot encoded into a separate frame (df_n).
df = df.rename(columns={'sales': 'department'})
columns_to_encode = ['department', 'salary']
df_n = pd.get_dummies(df, columns=columns_to_encode)


# In[7]:

df_n.head()
#
# So, no missing values in this dataset. Let's move on to understanding the correlation between various features. #
#
# **Correlation** #
#
# In[8]:

# Pairwise correlation between all columns of the one-hot encoded frame.
corr_mat = df_n.corr()


# In[9]:

# Build the figure and a single axes explicitly, then draw the heatmap on it.
f = plt.figure(figsize=(15, 10))
ax = f.add_subplot(1, 1, 1)

# square=True keeps every cell of the correlation matrix square.
sns.heatmap(corr_mat, ax=ax, square=True)
#
# A couple of interesting observations. # + The taget variable "left" is negatively correlated with "satisfaction level" which makes sense because employees are more likely # to leave if they are not satisfied with their work. # + Management department is positively correlated with high salary and promotions in last 5 years # In[ ]: # In[10]: df.head() # In[ ]: #
#
# **How does satisfaction level vary among employees who left?** #
#
# From the plot, it seems that employees who left are less satisfied on average. # #
#
# In[11]:

# Box plot of satisfaction_level for each value of `left`, drawn on a new
# 10x6 figure whose axes is passed to seaborn explicitly.
satisfaction_fig = plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='left', y='satisfaction_level', ax=satisfaction_fig.gca())


# In[ ]:
#
# **Does having a larger number of projects have any impact on employee churn?** #
#
# # From the plots, it seems that people who churn (1) generally tend to have more projects than people who stay (0). From the Boxplot/CDF we can see that, ~50% of churners have 4 or more projects. #
#
# In[12]:

# Left panel: box plot of number_project for each value of `left`.
# Right panel: cumulative KDE of number_project, one curve per value of `left`.
fig, axarr = plt.subplots(ncols=2, figsize=(12, 6))
sns.boxplot(x='left', y='number_project', data=df, ax=axarr[0])

for left_value in (0, 1):
    sns.kdeplot(
        df[df.left == left_value].number_project,
        label=left_value,
        cumulative=True,
        ax=axarr[1],
    )

axarr[0].set_ylabel("Number of projects")
axarr[1].set_ylabel("CDF")
axarr[1].set_xlabel("Number of projects")

# Draw the legend explicitly: newer seaborn releases do not add one for a
# plain `label=` argument, and without it the two curves cannot be told apart.
axarr[1].legend(title="left")
#
# **Does staying in the company for a long time make the employees more vulnerable to churn?** #
#
# In[69]:

# Share of each `left` value within every time_spend_company group.
# normalize='index' already makes each row sum to 1, so there is no need to
# request a margins ('All') row only to filter it out again.
left_over_years = pd.crosstab(df.time_spend_company, df.left, normalize='index')
left_over_years = left_over_years.round(2)
left_over_years.head()


# In[68]:

# The table holds fractions in [0, 1]; scale by 100 for the chart so the
# plotted values actually match the "% of employees" axis label.
fig, ax = plt.subplots(figsize=(10, 6))
left_over_years.mul(100).plot(kind='bar', ax=ax)
ax.set_xlabel("number of years at the company")
ax.set_ylabel("% of employees")
ax.set_title("Distribution of employees after n-years")
#
# # We see that employees start churning after completing 3 years at the company. There's an upward trend from year 3 through year 5. Almost 57% of people who completed 5 years at the company churn. Then we see a downward trend after year 6. People who have spent 7 years or more are unlikely to churn. # #
# # Why do people churn the most after 5 years? Is it because of promotions, or are some other factors at play? # #
#
# In[85]:

# Keep only employees with exactly 5 years at the company, then compute the
# share of each `left` value within every promotion_last_5years group.
# normalize='index' already yields per-row shares, so no margins row is
# requested (the original added an 'All' row and immediately dropped it).
employee_spent_5_years = df[df.time_spend_company == 5]
emp_5_yrs_df = pd.crosstab(
    employee_spent_5_years.promotion_last_5years,
    employee_spent_5_years.left,
    normalize='index',
).round(2)
emp_5_yrs_df


# In[86]:

# Label the chart so it can be read without the surrounding cells.
ax_5_yrs = emp_5_yrs_df.plot(kind='bar')
ax_5_yrs.set_xlabel("promotion in last 5 years")
ax_5_yrs.set_ylabel("fraction of employees")
ax_5_yrs.set_title("Employees with 5 years at the company")
#
# # This is interesting. Among employees with 5 years at the company, 57% of those who were not promoted in the last 5 years left, # while 94% of those who were promoted in the last 5 years stayed! # #
#
# In[ ]: