import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw dataset from disk and preview its first ten rows.
raw_data_path = '../data/raw_data.csv'
data = pd.read_csv(raw_data_path)
data.head(n=10)
name | satisfaction_level | last_evaluation | number_projects | average_monthly_hours | time_spent_company | work_accident | left | promotion_last_5_years | department | salary | salary_level | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | SMITH | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low | 1 |
1 | JOHNSON | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium | 2 |
2 | WILLIAMS | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium | 2 |
3 | BROWN | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low | 1 |
4 | JONES | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low | 1 |
5 | MILLER | 0.41 | 0.50 | 2 | 153 | 3 | 0 | 1 | 0 | sales | low | 1 |
6 | DAVIS | 0.10 | 0.77 | 6 | 247 | 4 | 0 | 1 | 0 | sales | low | 1 |
7 | GARCIA | 0.92 | 0.85 | 5 | 259 | 5 | 0 | 1 | 0 | sales | low | 1 |
8 | RODRIGUEZ | 0.89 | 1.00 | 5 | 224 | 5 | 0 | 1 | 0 | sales | low | 1 |
9 | WILSON | 0.42 | 0.53 | 2 | 142 | 3 | 0 | 1 | 0 | sales | low | 1 |
# Correlation heatmap (lower triangle only) for the full dataset.
sns.set(style="white")

# Correlate numeric/boolean columns only. The frame also holds text columns
# (name, department, salary); recent pandas raises on those instead of
# silently dropping them, so select the usable columns explicitly.
corr = data.select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Diverging colormap so negative and positive correlations get opposite hues
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and square cells
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca55c9290>
# Horizontal bar chart of satisfaction level, one bar per value of `left`.
sns.set(style="white")
f, ax = plt.subplots(figsize=(5, 4))
sns.barplot(x="satisfaction_level", y="left", data=data, orient="h", ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca4f47650>
# Grid of satisfaction-level histograms: one row per department and one
# column per value of `left`.
sns.set(style="darkgrid")

# Twelve equal-width bins spanning [0, 1], shared by every facet
bins = np.linspace(0, 1, 13)

g = sns.FacetGrid(data, row="department", col="left", margin_titles=True)
g.map(plt.hist, "satisfaction_level", color="steelblue", bins=bins, lw=0)
<seaborn.axisgrid.FacetGrid at 0x7f3ca5915b90>
# Histograms of each feature for the employees who left.
sns.set(style="white", palette="muted", color_codes=True)

# Set up a 3x3 grid of panels; only the first eight are used
f, axes = plt.subplots(3, 3, figsize=(9, 7))
sns.despine(left=True)

# People that left
leavers = data.loc[data['left'] == 1]

# (column, bins) for each panel in row-major order. bins=None is distplot's
# default and leaves the bin count to seaborn.
panels = [
    ('satisfaction_level', None),
    ('salary_level', 3),
    ('average_monthly_hours', None),
    ('number_projects', None),
    ('last_evaluation', None),
    ('time_spent_company', 5),
    ('promotion_last_5_years', 10),
    ('work_accident', 10),
]
for panel_ax, (column, n_bins) in zip(axes.flat, panels):
    sns.distplot(leavers[column], bins=n_bins, kde=False, color="b",
                 ax=panel_ax)

# The ninth slot has no plot; hide its empty frame and ticks
axes[2, 2].set_axis_off()

plt.tight_layout()
# All key employees: last_evaluation above 0.7 and time_spent_company of at
# least 3. Both conditions are combined into one mask over `data`, rather than
# applying a full-length mask to an already-filtered frame and relying on
# implicit index alignment.
is_key_employee = (data['last_evaluation'] > 0.7) & (data['time_spent_company'] >= 3)
key_employees = data.loc[is_key_employee]
key_employees.describe()
satisfaction_level | last_evaluation | number_projects | average_monthly_hours | time_spent_company | work_accident | left | promotion_last_5_years | salary_level | |
---|---|---|---|---|---|---|---|---|---|
count | 6123.000000 | 6123.000000 | 6123.000000 | 6123.000000 | 6123.000000 | 6123.000000 | 6123.000000 | 6123.000000 | 6123.000000 |
mean | 0.603059 | 0.864467 | 4.301813 | 219.332027 | 4.127225 | 0.138984 | 0.304426 | 0.022865 | 1.583374 |
std | 0.287024 | 0.083265 | 1.215323 | 48.552356 | 1.383378 | 0.345958 | 0.460201 | 0.149484 | 0.629788 |
min | 0.090000 | 0.710000 | 2.000000 | 96.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 0.430000 | 0.800000 | 3.000000 | 180.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
50% | 0.690000 | 0.870000 | 4.000000 | 229.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
75% | 0.830000 | 0.930000 | 5.000000 | 258.000000 | 5.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 |
max | 1.000000 | 1.000000 | 7.000000 | 310.000000 | 10.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 |
# Lost key employees: the key employees whose `left` flag is 1. The mask is
# built from key_employees itself so it matches the frame being indexed
# instead of depending on index alignment with the unfiltered `data`.
lost_key_employees = key_employees.loc[key_employees['left'] == 1]
lost_key_employees.describe()
satisfaction_level | last_evaluation | number_projects | average_monthly_hours | time_spent_company | work_accident | left | promotion_last_5_years | salary_level | |
---|---|---|---|---|---|---|---|---|---|
count | 1864.000000 | 1864.000000 | 1864.000000 | 1864.000000 | 1864.000000 | 1864.000000 | 1864.0 | 1864.000000 | 1864.000000 |
mean | 0.462328 | 0.896357 | 5.325107 | 257.935622 | 4.622318 | 0.047747 | 1.0 | 0.002146 | 1.409871 |
std | 0.354372 | 0.067570 | 1.061447 | 30.686214 | 0.695091 | 0.213287 | 0.0 | 0.046287 | 0.521599 |
min | 0.090000 | 0.710000 | 2.000000 | 130.000000 | 3.000000 | 0.000000 | 1.0 | 0.000000 | 1.000000 |
25% | 0.100000 | 0.840000 | 5.000000 | 243.000000 | 4.000000 | 0.000000 | 1.0 | 0.000000 | 1.000000 |
50% | 0.505000 | 0.900000 | 5.000000 | 258.000000 | 5.000000 | 0.000000 | 1.0 | 0.000000 | 1.000000 |
75% | 0.820000 | 0.950000 | 6.000000 | 278.000000 | 5.000000 | 0.000000 | 1.0 | 0.000000 | 2.000000 |
max | 0.920000 | 1.000000 | 7.000000 | 310.000000 | 6.000000 | 1.000000 | 1.0 | 1.000000 | 3.000000 |
# Report how many key employees there are and what share of them left.
# Each print receives one pre-formatted string, so the statements run the same
# way under Python 2 and Python 3. The two spaces after each colon reproduce
# the spacing of the original comma-separated print statements.
n_key_employees = len(key_employees)
n_lost_key_employees = len(lost_key_employees)
if n_key_employees:
    lost_percentage = round(float(n_lost_key_employees) / float(n_key_employees) * 100, 2)
else:
    # No key employees at all: report 0.0 instead of dividing by zero
    lost_percentage = 0.0
print("Number of key employees:  {}".format(n_key_employees))
print("Number of lost key employees:  {}".format(n_lost_key_employees))
print("Percentage of lost key employees:  {} %".format(lost_percentage))
Number of key employees: 6123 Number of lost key employees: 1864 Percentage of lost key employees: 30.44 %
# Persist the key-employee subset as a CSV file next to the raw data.
key_employees_path = '../data/key_employees.csv'
key_employees.to_csv(key_employees_path)
# Correlation heatmap (lower triangle only) for leavers with a good evaluation.
# Keep only the leavers whose last evaluation is above 0.7
leaving_performers = leavers.loc[leavers['last_evaluation'] > 0.7]
sns.set(style="white")

# Correlate numeric/boolean columns only; recent pandas raises on text columns
# instead of silently dropping them.
corr = leaving_performers.select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Diverging colormap so negative and positive correlations get opposite hues
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and square cells
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca65bcc90>
# Correlation heatmap (lower triangle only) for highly satisfied employees.
# Keep only the employees whose satisfaction level is above 0.7
satisfied_employees = data.loc[data['satisfaction_level'] > 0.7]
sns.set(style="white")

# Correlate numeric/boolean columns only; recent pandas raises on text columns
# instead of silently dropping them.
corr = satisfied_employees.select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 4))

# Diverging colormap so negative and positive correlations get opposite hues
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and square cells
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f3ca6795b90>