In [83]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Import data

In [84]:
# Read the raw HR dataset from the repository-relative data directory.
data = pd.read_csv('../data/raw_data.csv')

# Preview the first ten rows to check that the columns were parsed as expected.
data.head(n=10)
Out[84]:
name satisfaction_level last_evaluation number_projects average_monthly_hours time_spent_company work_accident left promotion_last_5_years department salary salary_level
0 SMITH 0.38 0.53 2 157 3 0 1 0 sales low 1
1 JOHNSON 0.80 0.86 5 262 6 0 1 0 sales medium 2
2 WILLIAMS 0.11 0.88 7 272 4 0 1 0 sales medium 2
3 BROWN 0.72 0.87 5 223 5 0 1 0 sales low 1
4 JONES 0.37 0.52 2 159 3 0 1 0 sales low 1
5 MILLER 0.41 0.50 2 153 3 0 1 0 sales low 1
6 DAVIS 0.10 0.77 6 247 4 0 1 0 sales low 1
7 GARCIA 0.92 0.85 5 259 5 0 1 0 sales low 1
8 RODRIGUEZ 0.89 1.00 5 224 5 0 1 0 sales low 1
9 WILSON 0.42 0.53 2 142 3 0 1 0 sales low 1

Analyze correlations

In [85]:
sns.set(style="white")

# Compute the correlation matrix over the numeric columns only.
# The frame also holds string columns (name, department, salary); selecting
# numeric columns explicitly keeps this working on pandas versions where
# DataFrame.corr() no longer drops non-numeric columns silently.
corr = data.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (diagonal included) so that each
# pair of features is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# vmax=.5 caps the colour scale, so correlations above 0.5 share one colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca55c9290>

Analyze features

In [86]:
sns.set(style="white")

# One small figure with a single axes for the bar chart.
f, ax = plt.subplots(figsize=(5, 4))

# Horizontal bars: one bar per value of `left`, bar length is the
# aggregated satisfaction_level for that group.
sns.barplot(
    x="satisfaction_level",
    y="left",
    data=data,
    orient="h",
    ax=ax,
)
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca4f47650>
In [87]:
sns.set(style="darkgrid")

# 13 edges over [0, 1] give 12 equal-width histogram bins.
bins = np.linspace(0, 1, num=13)

# One row of panels per department, one column per value of `left`.
g = sns.FacetGrid(
    data,
    row="department",
    col="left",
    margin_titles=True,
)

# Draw a satisfaction_level histogram in every panel with the shared bins.
g.map(
    plt.hist,
    "satisfaction_level",
    color="steelblue",
    bins=bins,
    lw=0,
)
Out[87]:
<seaborn.axisgrid.FacetGrid at 0x7f3ca5915b90>

Leavers analysis

In [119]:
sns.set(style="white", palette="muted", color_codes=True)

# 3x3 grid of panels; only the first eight are used, the last stays empty.
f, axes = plt.subplots(3, 3, figsize=(9,7))
sns.despine(left=True)

# Employees who left the company
leavers = data.loc[data['left'] == 1]

# (column, bins) for each panel in row-major order.
# bins=None lets distplot choose the bin size automatically.
panels = [
    ('satisfaction_level', None),
    ('salary_level', 3),
    ('average_monthly_hours', None),
    ('number_projects', None),
    ('last_evaluation', None),
    ('time_spent_company', 5),
    ('promotion_last_5_years', 10),
    ('work_accident', 10),
]

# Plain histograms (no KDE curve), one feature per panel.
for (column, n_bins), panel_ax in zip(panels, axes.flat):
    sns.distplot(leavers[column], bins=n_bins, kde=False, color="b", ax=panel_ax)


plt.tight_layout()

Count key employees

In [134]:
# All key employees: last evaluation above 0.7 AND at least 3 years at the
# company. Both conditions are combined into a single boolean mask on `data`
# instead of chaining .loc calls, so the filter never depends on aligning a
# mask from the full frame against an already-filtered frame.
key_employees = data.loc[
    (data['last_evaluation'] > 0.7) & (data['time_spent_company'] >= 3)
]

# Summary statistics of the numeric columns for the selected employees.
key_employees.describe()
Out[134]:
satisfaction_level last_evaluation number_projects average_monthly_hours time_spent_company work_accident left promotion_last_5_years salary_level
count 6123.000000 6123.000000 6123.000000 6123.000000 6123.000000 6123.000000 6123.000000 6123.000000 6123.000000
mean 0.603059 0.864467 4.301813 219.332027 4.127225 0.138984 0.304426 0.022865 1.583374
std 0.287024 0.083265 1.215323 48.552356 1.383378 0.345958 0.460201 0.149484 0.629788
min 0.090000 0.710000 2.000000 96.000000 3.000000 0.000000 0.000000 0.000000 1.000000
25% 0.430000 0.800000 3.000000 180.000000 3.000000 0.000000 0.000000 0.000000 1.000000
50% 0.690000 0.870000 4.000000 229.000000 4.000000 0.000000 0.000000 0.000000 2.000000
75% 0.830000 0.930000 5.000000 258.000000 5.000000 0.000000 1.000000 0.000000 2.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000 3.000000
In [135]:
# Lost key employees: the key employees whose `left` flag is 1.
# The mask is built from key_employees itself (not from the full `data`
# frame), so the indexer and the frame being indexed always share one index.
lost_key_employees = key_employees.loc[key_employees['left'] == 1]

# Summary statistics of the numeric columns for the key employees who left.
lost_key_employees.describe()
Out[135]:
satisfaction_level last_evaluation number_projects average_monthly_hours time_spent_company work_accident left promotion_last_5_years salary_level
count 1864.000000 1864.000000 1864.000000 1864.000000 1864.000000 1864.000000 1864.0 1864.000000 1864.000000
mean 0.462328 0.896357 5.325107 257.935622 4.622318 0.047747 1.0 0.002146 1.409871
std 0.354372 0.067570 1.061447 30.686214 0.695091 0.213287 0.0 0.046287 0.521599
min 0.090000 0.710000 2.000000 130.000000 3.000000 0.000000 1.0 0.000000 1.000000
25% 0.100000 0.840000 5.000000 243.000000 4.000000 0.000000 1.0 0.000000 1.000000
50% 0.505000 0.900000 5.000000 258.000000 5.000000 0.000000 1.0 0.000000 1.000000
75% 0.820000 0.950000 6.000000 278.000000 5.000000 0.000000 1.0 0.000000 2.000000
max 0.920000 1.000000 7.000000 310.000000 6.000000 1.000000 1.0 1.000000 3.000000
In [151]:
# Report how many key employees there are and what share of them left.
# Each print takes one pre-formatted string, which is valid in both Python 2
# (print statement with a parenthesised expression) and Python 3 (print
# function) and produces the same text as the comma-separated print statements.
print("Number of key employees:  %d" % len(key_employees))
print("Number of lost key employees:  %d" % len(lost_key_employees))
# float() forces true division on Python 2; the share is rounded to 2 decimals.
print("Percentage of lost key employees:  %s %%" % round((float(len(lost_key_employees))/float(len(key_employees))*100),2))
Number of key employees:  6123
Number of lost key employees:  1864
Percentage of lost key employees:  30.44 %
In [152]:
# Save the key-employee subset as CSV next to the raw data.
# to_csv is called with its defaults, so the DataFrame index is written as
# the first column of the file.
# NOTE(review): key_employees is a row subset of `data`, so it still carries
# the `name` column shown in data.head() — confirm exporting it is intended.
key_employees.to_csv('../data/key_employees.csv')

Why do performing employees leave?

In [90]:
# Keep only the leavers whose last evaluation was good (above 0.7).
leaving_performers = leavers[leavers['last_evaluation'] > 0.7]
In [91]:
sns.set(style="white")

# Compute the correlation matrix over the numeric columns only.
# The frame also holds string columns (name, department, salary); selecting
# numeric columns explicitly keeps this working on pandas versions where
# DataFrame.corr() no longer drops non-numeric columns silently.
# Note: every row here is a leaver, so `left` is constant and its
# correlations come out as NaN (blank cells in the heatmap).
corr = leaving_performers.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (diagonal included) so that each
# pair of features is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# vmax=.5 caps the colour scale, so correlations above 0.5 share one colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
Out[91]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca65bcc90>

Why do satisfied employees leave?

In [97]:
# Keep only employees with a high satisfaction level (above 0.7).
# This selects from the full `data` frame, so it contains both employees
# who left and employees who stayed.
satisfied_employees = data[data['satisfaction_level'] > 0.7]
In [99]:
sns.set(style="white")

# Compute the correlation matrix over the numeric columns only.
# The frame also holds string columns (name, department, salary); selecting
# numeric columns explicitly keeps this working on pandas versions where
# DataFrame.corr() no longer drops non-numeric columns silently.
corr = satisfied_employees.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (diagonal included) so that each
# pair of features is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# vmax=.5 caps the colour scale, so correlations above 0.5 share one colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca6795b90>