#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import random
# Apply seaborn's default theme first; the jupyterthemes call below runs
# afterwards, so the order of these statements is kept as-is.
sns.set()
from jupyterthemes import jtplot
# jupyterthemes plot style: 'grade3' theme with the 'paper' context,
# font scale 1.5, ticks enabled and grid disabled.
jtplot.style(
    'grade3',
    grid=False,
    ticks=True,
    fscale=1.5,
    context='paper',
)
# Render matplotlib figures inline in the notebook output.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[ ]:
# In[ ]:
#
#
# # Explore the data
#
#
# The first thing we should do at the very beginning of any data analysis is to get a feel for the data.
# Questions along the following lines should be asked:
# + Do the columns make sense?
# + Do the values make sense?
# + Are there any missing values?
# + Are there any outliers? If yes, what's their story?
# + Which variables are strongly correlated?
# In[ ]:
# In[2]:
# Read the HR dataset from a CSV file given by a relative path.
DATA_PATH = "HR_comma_sep.csv"
df = pd.read_csv(DATA_PATH)
# In[3]:
# Preview the first rows to check that the columns look sensible.
df.head()
# In[4]:
# Summary statistics of the frame.
df.describe()
# In[5]:
# Column dtypes and non-null counts (used to check for missing values).
df.info()
# In[6]:
# Tidy up the raw frame: the 'sales' column is renamed to 'department',
# then 'department' and 'salary' are one-hot encoded into a separate frame
# (df_n), so df itself keeps the original, un-encoded columns.
df = df.rename(columns={'sales': 'department'})
columns_to_encode = ['department', 'salary']
df_n = pd.get_dummies(df, columns=columns_to_encode)
# In[7]:
df_n.head()
#
#
# So, no missing values in this dataset. Let's move on to understanding the correlation between various features.
#
#
# **Correlation**
#
#
# In[8]:
# Pairwise correlation between the columns of the one-hot encoded frame.
corr_mat = df_n.corr()
# In[9]:
# Create the figure and axes that will hold the heatmap
f, ax = plt.subplots(figsize=(15, 10))
# Render the correlation matrix as a heatmap with square cells
sns.heatmap(data=corr_mat, ax=ax, square=True);
#
#
# A couple of interesting observations.
# + The target variable "left" is negatively correlated with "satisfaction level", which makes sense because employees are more likely
# to leave if they are not satisfied with their work.
# + The management department is positively correlated with high salary and promotions in the last 5 years.
# In[ ]:
# In[10]:
# Display the first rows of df again (the 'sales' column was renamed to 'department' earlier).
df.head()
# In[ ]:
#
#
# **How does satisfaction level vary among employees who left?**
#
#
# From the plot, it seems that employees who left are less satisfied on average.
#
#
#
# In[11]:
# Box plot of satisfaction level for each value of 'left'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='left', y='satisfaction_level', ax=ax);
# In[ ]:
#
#
# **Does having a larger number of projects have any impact on employee churn?**
#
#
#
# From the plots, it seems that people who churn (1) generally tend to have more projects than people who stay (0). From the boxplot/CDF we can see that ~50% of churners have 4 or more projects.
#
#
# In[12]:
# Compare the number of projects between the two values of 'left' (0 and 1):
# a box plot on the first panel and, on the second panel, one cumulative
# KDE curve (a smoothed CDF) per value of 'left'.
fig, axarr = plt.subplots(ncols=2, figsize=(12, 6))
box_ax, cdf_ax = axarr
sns.boxplot(x='left', y='number_project', data=df, ax=box_ax)
for left_flag in (0, 1):
    projects = df.loc[df.left == left_flag, 'number_project']
    sns.kdeplot(projects, label=left_flag, cumulative=True, ax=cdf_ax)
box_ax.set_ylabel("Number of projects")
cdf_ax.set_ylabel("CDF")
cdf_ax.set_xlabel("Number of projects")
# Draw the legend explicitly: a plain `label=` is not guaranteed to produce
# a legend on its own (newer seaborn releases only forward it to matplotlib),
# and without one the two curves cannot be told apart.
cdf_ax.legend(title="left");
#
#
# **Does staying in the company for a long time make the employees more vulnerable to churn?**
#
#
# In[69]:
# Share of employees with left == 0 / left == 1 for each number of years at
# the company: normalize='index' divides every row by its own total.
# Margins are deliberately not requested: with row normalization they would
# only append an 'All' row, which this analysis does not use, and would turn
# the integer index into a mixed int/str index.
left_over_years = pd.crosstab(
    df.time_spend_company,
    df.left,
    normalize='index',
).round(2)
left_over_years.head()
# In[68]:
# Bar chart of the stayed/left shares for each number of years at the company.
fig, ax = plt.subplots(figsize=(10, 6))
left_over_years.plot(kind='bar', ax=ax)
ax.set_xlabel("number of years at the company")
# The plotted values are row-normalized shares in [0, 1], not percentages,
# so the axis is labelled as a fraction.
ax.set_ylabel("Fraction of employees")
ax.set_title("Distribution of employees after n-years");
#
#
#
# It can be seen that employees start churning after completing 3 years at the company. There's an upward trend from year 3 onwards until year 5. Almost 57% of people who completed 5 years at the company churn. Then we see a downward trend after year 6. People who have spent 7 years or more are unlikely to churn.
#
#
#
# Why do people churn the most after 5 years? Is it because of promotions, or are some other factors at play?
#
#
#
# In[85]:
# Restrict to employees with exactly 5 years at the company and compute, for
# each value of promotion_last_5years, the share with left == 0 / left == 1.
employee_spent_5_years = df[df.time_spend_company == 5]
# normalize='index' divides every row by its own total. Margins are
# deliberately not requested: with row normalization they would only append
# an 'All' row, which this analysis does not use.
emp_5_yrs_df = pd.crosstab(
    employee_spent_5_years.promotion_last_5years,
    employee_spent_5_years.left,
    normalize='index',
).round(2)
emp_5_yrs_df
# In[86]:
# Bar chart of the stayed/left shares by promotion status for employees with
# 5 years at the company. Axis labels and a title are set so the figure can
# be read on its own; the trailing semicolon suppresses the Axes repr.
fig, ax = plt.subplots(figsize=(10, 6))
emp_5_yrs_df.plot(kind='bar', ax=ax)
ax.set_xlabel("promotion in last 5 years")
# Values are row-normalized shares in [0, 1].
ax.set_ylabel("Fraction of employees")
ax.set_title("Employees with 5 years at the company, by promotion status");
#
#
#
# This is interesting. Among employees with 5 years at the company, 57% of those who haven't been given any promotion in the last 5 years churn,
# while 94% of those who have been given a promotion in the last 5 years stay!
#
#
#
# In[ ]: