In [83]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Import data¶

In [84]:

# Read the raw records from the CSV file (relative path) into a DataFrame
# and preview its first ten rows.
data = pd.read_csv(filepath_or_buffer='../data/raw_data.csv')
data.head(n=10)

Out[84]:

	name	satisfaction_level	last_evaluation	number_projects	average_monthly_hours	time_spent_company	left	department	salary	salary_level
0	SMITH	0.38	0.53	2	157	3	1	sales	low	1
1	JOHNSON	0.80	0.86	5	262	6	1	sales	medium	2
2	WILLIAMS	0.11	0.88	7	272	4	1	sales	medium	2
3	BROWN	0.72	0.87	5	223	5	1	sales	low	1
4	JONES	0.37	0.52	2	159	3	1	sales	low	1
5	MILLER	0.41	0.50	2	153	3	1	sales	low	1
6	DAVIS	0.10	0.77	6	247	4	1	sales	low	1
7	GARCIA	0.92	0.85	5	259	5	1	sales	low	1
8	RODRIGUEZ	0.89	1.00	5	224	5	1	sales	low	1
9	WILSON	0.42	0.53	2	142	3	1	sales	low	1

Analyze correlations¶

In [85]:

sns.set(style="white")

# Compute the correlation matrix from the numeric columns only. Text columns
# (e.g. `name`, `department`, `salary`) cannot be correlated, and selecting the
# numeric columns explicitly keeps this working on pandas versions where
# DataFrame.corr() no longer drops non-numeric columns silently.
corr = data.select_dtypes(include=[np.number]).corr()

# Generate a mask for the upper triangle (diagonal included) so that each pair
# of features is drawn only once. The builtin `bool` is used instead of the
# `np.bool` alias, which newer NumPy releases no longer provide.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

Out[85]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca55c9290>

Analyze features¶

In [86]:

# Horizontal bar chart: one bar per value of `left`, whose length is the
# aggregated `satisfaction_level` for that group (seaborn's default estimator).
sns.set(style="white")
f, ax = plt.subplots(figsize=(5, 4))
sns.barplot(data=data, x="satisfaction_level", y="left", orient="h", ax=ax)

Out[86]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca4f47650>

In [87]:

sns.set(style="darkgrid")

# Thirteen evenly spaced edges over [0, 1], i.e. twelve equal-width bins.
bins = np.linspace(0, 1, num=13)

# One histogram of satisfaction_level per (department, left) combination:
# departments run down the rows, the two `left` values across the columns.
g = sns.FacetGrid(data, row="department", col="left", margin_titles=True)
g.map(plt.hist, "satisfaction_level", color="steelblue", bins=bins, lw=0)

Out[87]:

<seaborn.axisgrid.FacetGrid at 0x7f3ca5915b90>

Leavers analysis¶

In [119]:

sns.set(style="white", palette="muted", color_codes=True)

# Set up the matplotlib figure: a 3x3 grid of panels, eight of which are used.
f, axes = plt.subplots(3, 3, figsize=(9, 7))
sns.despine(left=True)

# People that left
leavers = data.loc[data['left'] == 1]

# (column, bin count) for each panel, in row-major order.
# A bin count of None lets seaborn pick the bin size automatically.
panels = [
    ('satisfaction_level', None),
    ('salary_level', 3),
    ('average_monthly_hours', None),
    ('number_projects', None),
    ('last_evaluation', None),
    ('time_spent_company', 5),
    ('promotion_last_5_years', 10),
    ('work_accident', 10),
]

# Plot one plain histogram (no KDE curve) per feature.
# NOTE(review): sns.distplot appears to be deprecated in recent seaborn
# releases in favour of sns.histplot - confirm against the installed version.
for (column, n_bins), panel_ax in zip(panels, axes.flat):
    sns.distplot(leavers[column], bins=n_bins, kde=False, color="b", ax=panel_ax)

# Hide grid cells that received no plot so the figure shows no empty axes.
for unused_ax in axes.flat[len(panels):]:
    unused_ax.set_visible(False)

plt.tight_layout()

Count key employees¶

In [134]:

#all key employees
key_employees = data.loc[data['last_evaluation'] > 0.7].loc[data['time_spent_company'] >= 3]
key_employees.describe()

Out[134]:

	satisfaction_level	last_evaluation	number_projects	average_monthly_hours	time_spent_company	work_accident	left	promotion_last_5_years	salary_level
count	6123.000000	6123.000000	6123.000000	6123.000000	6123.000000	6123.000000	6123.000000	6123.000000	6123.000000
mean	0.603059	0.864467	4.301813	219.332027	4.127225	0.138984	0.304426	0.022865	1.583374
std	0.287024	0.083265	1.215323	48.552356	1.383378	0.345958	0.460201	0.149484	0.629788
min	0.090000	0.710000	2.000000	96.000000	3.000000	0.000000	0.000000	0.000000	1.000000
25%	0.430000	0.800000	3.000000	180.000000	3.000000	0.000000	0.000000	0.000000	1.000000
50%	0.690000	0.870000	4.000000	229.000000	4.000000	0.000000	0.000000	0.000000	2.000000
75%	0.830000	0.930000	5.000000	258.000000	5.000000	0.000000	1.000000	0.000000	2.000000
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000	3.000000

In [135]:

#lost key employees
lost_key_employees = key_employees.loc[data['left']==1]
lost_key_employees.describe()

Out[135]:

	satisfaction_level	last_evaluation	number_projects	average_monthly_hours	time_spent_company	work_accident	left	promotion_last_5_years	salary_level
count	1864.000000	1864.000000	1864.000000	1864.000000	1864.000000	1864.000000	1864.0	1864.000000	1864.000000
mean	0.462328	0.896357	5.325107	257.935622	4.622318	0.047747	1.0	0.002146	1.409871
std	0.354372	0.067570	1.061447	30.686214	0.695091	0.213287	0.0	0.046287	0.521599
min	0.090000	0.710000	2.000000	130.000000	3.000000	0.000000	1.0	0.000000	1.000000
25%	0.100000	0.840000	5.000000	243.000000	4.000000	0.000000	1.0	0.000000	1.000000
50%	0.505000	0.900000	5.000000	258.000000	5.000000	0.000000	1.0	0.000000	1.000000
75%	0.820000	0.950000	6.000000	278.000000	5.000000	0.000000	1.0	0.000000	2.000000
max	0.920000	1.000000	7.000000	310.000000	6.000000	1.000000	1.0	1.000000	3.000000

In [151]:

# Report the head counts and the share of key employees who left.
# Each print() call receives one pre-formatted string, which is valid syntax
# and produces the same text on both Python 2 and Python 3 (the original print
# statements are a SyntaxError on Python 3).
n_key = len(key_employees)
n_lost = len(lost_key_employees)

# Guard the ratio so an empty selection reports 0.0 instead of raising
# ZeroDivisionError.
pct_lost = round(float(n_lost) / float(n_key) * 100, 2) if n_key else 0.0

print("Number of key employees:  %d" % n_key)
print("Number of lost key employees:  %d" % n_lost)
print("Percentage of lost key employees:  %s %%" % pct_lost)

Number of key employees:  6123
Number of lost key employees:  1864
Percentage of lost key employees:  30.44 %

In [152]:

#save key employees data as csv
# Persist the key-employee subset as a CSV file (relative path). With the
# default to_csv settings the DataFrame index is written as the first column.
key_employees.to_csv(path_or_buf='../data/key_employees.csv')

Why do high-performing employees leave?¶

In [90]:

#filter out people with a good last evaluation
# Keep only the leavers whose last evaluation is above 0.7
leaving_performers = leavers[leavers['last_evaluation'].gt(0.7)]

In [91]:

sns.set(style="white")

# Compute the correlation matrix from the numeric columns only; selecting them
# explicitly keeps this working on pandas versions where DataFrame.corr() no
# longer drops non-numeric columns silently.
# NOTE(review): `leaving_performers` is a subset of the rows with left == 1,
# so `left` is constant here and its correlations are expected to be NaN.
corr = leaving_performers.select_dtypes(include=[np.number]).corr()

# Generate a mask for the upper triangle (diagonal included) so that each pair
# of features is drawn only once. The builtin `bool` is used instead of the
# `np.bool` alias, which newer NumPy releases no longer provide.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

Out[91]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca65bcc90>

Why do satisfied employees leave?¶

In [97]:

#filter out people with a good last evaluation
# Keep only the employees whose satisfaction level is above 0.7
# (both those who stayed and those who left).
satisfied_employees = data[data['satisfaction_level'].gt(0.7)]

In [99]:

sns.set(style="white")

# Compute the correlation matrix from the numeric columns only; selecting them
# explicitly keeps this working on pandas versions where DataFrame.corr() no
# longer drops non-numeric columns silently.
corr = satisfied_employees.select_dtypes(include=[np.number]).corr()

# Generate a mask for the upper triangle (diagonal included) so that each pair
# of features is drawn only once. The builtin `bool` is used instead of the
# `np.bool` alias, which newer NumPy releases no longer provide.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

Out[99]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca6795b90>