Employee Retention



EMPLOYEE_RETENTION

LEFT OR EMPLOYEED

IMPORT

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split # Scikit-Learn 0.18+
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

EXAMINE DATA

df = pd.read_csv('../Employee_retention/employee_data.csv')
df.head()
avg_monthly_hrs department filed_complaint last_evaluation n_projects recently_promoted salary satisfaction status tenure
0 221 engineering NaN 0.932868 4 NaN low 0.829896 Left 5.0
1 232 support NaN NaN 3 NaN low 0.834544 Employed 2.0
2 184 sales NaN 0.788830 3 NaN medium 0.834988 Employed 3.0
3 206 sales NaN 0.575688 4 NaN low 0.424764 Employed 2.0
4 249 sales NaN 0.845217 3 NaN low 0.779043 Employed 3.0
df.tail()
avg_monthly_hrs department filed_complaint last_evaluation n_projects recently_promoted salary satisfaction status tenure
14244 178 IT NaN 0.735865 5 NaN low 0.263282 Employed 5.0
14245 257 sales NaN 0.638604 3 NaN low 0.868209 Employed 2.0
14246 232 finance 1.0 0.847623 5 NaN medium 0.898917 Left 5.0
14247 130 IT NaN 0.757184 4 NaN medium 0.641304 Employed 3.0
14248 159 NaN NaN 0.578742 3 NaN medium 0.808850 Employed 3.0
df.shape
(14249, 10)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14249 entries, 0 to 14248
Data columns (total 10 columns):
avg_monthly_hrs      14249 non-null int64
department           13540 non-null object
filed_complaint      2058 non-null float64
last_evaluation      12717 non-null float64
n_projects           14249 non-null int64
recently_promoted    300 non-null float64
salary               14249 non-null object
satisfaction         14068 non-null float64
status               14249 non-null object
tenure               14068 non-null float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.1+ MB
df.describe()
avg_monthly_hrs filed_complaint last_evaluation n_projects recently_promoted satisfaction tenure
count 14249.000000 2058.0 12717.000000 14249.000000 300.0 14068.000000 14068.000000
mean 199.795775 1.0 0.718477 3.773809 1.0 0.621295 3.497228
std 50.998714 0.0 0.173062 1.253126 0.0 0.250469 1.460917
min 49.000000 1.0 0.316175 1.000000 1.0 0.040058 2.000000
25% 155.000000 1.0 0.563866 3.000000 1.0 0.450390 3.000000
50% 199.000000 1.0 0.724939 4.000000 1.0 0.652527 3.000000
75% 245.000000 1.0 0.871358 5.000000 1.0 0.824951 4.000000
max 310.000000 1.0 1.000000 7.000000 1.0 1.000000 10.000000
df.isnull().sum()
avg_monthly_hrs          0
department             709
filed_complaint      12191
last_evaluation       1532
n_projects               0
recently_promoted    13949
salary                   0
satisfaction           181
status                   0
tenure                 181
dtype: int64
nullvalues = df.isnull().sum()
nullvalues.plot.barh()
<matplotlib.axes._subplots.AxesSubplot at 0x10f227fd0>

png

# Plotting histogram for numeric distributions 
df.hist(figsize=(10,10), xrot = -45)
plt.show()

png

# # Plotting bar graphs for categorical distributions 
for feature in df.dtypes[df.dtypes == 'object'].index:
    sns.countplot(y=feature, data=df)
    plt.show()

png

png

png

# Just getting comfortable with matplotlib
df.hist('last_evaluation',bins=50);

png

# Just getting comfortable with matplotlib
df.plot(x='last_evaluation', y='avg_monthly_hrs',
        title='status', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x1a11014898>

png

# jointplot showing the kde distributions of satisfication vs. tenure
sns.jointplot(x='tenure',y='satisfaction',data=df,color='red',kind='kde');

png

# Just getting comfortable with Seaborn
sns.countplot(y='n_projects', data = df)
<matplotlib.axes._subplots.AxesSubplot at 0x1a111e9b70>

png

CORRELATIONS

correlations = df.corr()
plt.subplots(figsize=(7,5))
sns.heatmap(correlations)
<matplotlib.axes._subplots.AxesSubplot at 0x1a114eec88>

png

mask = np.zeros_like(correlations)
mask[np.triu_indices_from(mask)] = True
plt.subplots(figsize=(10,5))
sns.axes_style("white")
sns.heatmap(correlations * 100, annot= True, mask=mask,
                 vmax=.3, square=True, cbar=False)
<matplotlib.axes._subplots.AxesSubplot at 0x1a1711fe80>

png

Segmentation

Cutting data to observe the relationship between categorical and numeric features

# Segment satisfaction by status
sns.violinplot(y = 'status', x = 'satisfaction', data = df)
<matplotlib.axes._subplots.AxesSubplot at 0x1a11523358>

png

# Segment last_evaluation by status
sns.violinplot(y = 'status', x = 'last_evaluation', data = df)
<matplotlib.axes._subplots.AxesSubplot at 0x1a1a5265c0>

png

# Segment by status and display the means within each class
df.groupby('status').mean()
avg_monthly_hrs filed_complaint last_evaluation n_projects recently_promoted satisfaction tenure
status
Employed 197.700286 1.0 0.714479 3.755273 1.0 0.675979 3.380245
Left 206.502948 1.0 0.730706 3.833137 1.0 0.447500 3.869023
# Since target is status (categorical) will do extra segmentation
# Scatterplot of satisfaction vs. last_evaluation
sns.lmplot(x='satisfaction', y='last_evaluation', hue='status',
           data=df, fit_reg=False)
<seaborn.axisgrid.FacetGrid at 0x1a172350f0>

png

# # Scatterplot of satisfaction vs. last_evaluation, only those who have left
sns.lmplot(x='satisfaction', y='last_evaluation',
           data=df[df.status == 'Left'], fit_reg=False)
<seaborn.axisgrid.FacetGrid at 0x1a1a6b8470>

png

DATA CLEANING

# Dropping duplicates
# Were only a few
df = df.drop_duplicates()
df.shape
(14221, 10)
# Looking at classes of department
list(df.department.unique())
['engineering',
 'support',
 'sales',
 'IT',
 'product',
 'marketing',
 'temp',
 'procurement',
 'finance',
 nan,
 'management',
 'information_technology',
 'admin']
# I will drop temporary workers 
df = df[df.department != 'temp']
df.shape
(14068, 10)
# unique values for filed_complaint
df.filed_complaint.unique()
array([nan,  1.])
df['filed_complaint'] = df.filed_complaint.fillna(0)
# check results 
df.filed_complaint.unique()
array([0., 1.])
# unique values for recently promoted
df.recently_promoted.unique()
array([nan,  1.])
df['recently_promoted'] = df.recently_promoted.fillna(0)
# check results 
df.recently_promoted.unique()
array([0., 1.])
# I will replace information technology with IT
df.department.replace('information_technology', 'IT',
                      inplace =True)
# plot results
sns.countplot(y = 'department', data = df)
<matplotlib.axes._subplots.AxesSubplot at 0x1a111ff5c0>

png

df.isnull().sum()
avg_monthly_hrs         0
department            709
filed_complaint         0
last_evaluation      1351
n_projects              0
recently_promoted       0
salary                  0
satisfaction            0
status                  0
tenure                  0
dtype: int64
# I will just replace nan values with missing for department
df['department'].fillna('Missing', inplace = True)
# create new  variable for missing last_evaluation
# astype converts 
df['last_evaluation_missing'] = df.last_evaluation.isnull().astype(int)
# Fill missing values with 0 for last evaluation
df.last_evaluation.fillna(0 , inplace = True)
#checking work
df.isnull().sum()
avg_monthly_hrs            0
department                 0
filed_complaint            0
last_evaluation            0
n_projects                 0
recently_promoted          0
salary                     0
satisfaction               0
status                     0
tenure                     0
last_evaluation_missing    0
dtype: int64
df.head()
avg_monthly_hrs department filed_complaint last_evaluation n_projects recently_promoted salary satisfaction status tenure last_evaluation_missing
0 221 engineering 0.0 0.932868 4 0.0 low 0.829896 Left 5.0 0
1 232 support 0.0 0.000000 3 0.0 low 0.834544 Employed 2.0 1
2 184 sales 0.0 0.788830 3 0.0 medium 0.834988 Employed 3.0 0
3 206 sales 0.0 0.575688 4 0.0 low 0.424764 Employed 2.0 0
4 249 sales 0.0 0.845217 3 0.0 low 0.779043 Employed 3.0 0

FEATURE ENGINEERING

# Looking Scatterplot of satisfaction vs. last_evaluation again
# for only those who have left
# To try and engineer
sns.lmplot(x='satisfaction', y='last_evaluation',
           data=df[df.status == 'Left'], fit_reg=False)
<seaborn.axisgrid.FacetGrid at 0x1a111abc50>

png

# I can engineer these findings 
df['underperformer'] = ((df.last_evaluation < 0.6) & 
                        (df.last_evaluation_missing == 0)).astype(int)
df['unhappy'] = (df.satisfaction < 0.2).astype(int)
df['overachiever'] = ((df.last_evaluation > 0.8) 
                     & (df.satisfaction > 0.7)).astype(int)
# The proportion of observations belonging to each group
df[['underperformer', 'unhappy', 'overachiever']].mean()
underperformer    0.285257
unhappy           0.092195
overachiever      0.177069
dtype: float64
# Converting status into an indicator variable
# Left = 1
# Right = 0
df['status']= pd.get_dummies(df.status).Left
df['status'].unique()
array([1, 0], dtype=uint64)
df.status.head()
0    1
1    0
2    0
3    0
4    0
Name: status, dtype: uint8
# Checking the proportion for who 'Left
df.status.mean()
0.23933750355416547
# Create new dataframe with dummy features
df = pd.get_dummies(df, columns=['department', 'salary'])

# Display first 10 rows
df.head(5)
avg_monthly_hrs filed_complaint last_evaluation n_projects recently_promoted satisfaction status tenure last_evaluation_missing underperformer ... department_finance department_management department_marketing department_procurement department_product department_sales department_support salary_high salary_low salary_medium
0 221 0.0 0.932868 4 0.0 0.829896 1 5.0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 232 0.0 0.000000 3 0.0 0.834544 0 2.0 1 0 ... 0 0 0 0 0 0 1 0 1 0
2 184 0.0 0.788830 3 0.0 0.834988 0 3.0 0 0 ... 0 0 0 0 0 1 0 0 0 1
3 206 0.0 0.575688 4 0.0 0.424764 0 2.0 0 1 ... 0 0 0 0 0 1 0 0 1 0
4 249 0.0 0.845217 3 0.0 0.779043 0 3.0 0 0 ... 0 0 0 0 0 1 0 0 1 0

5 rows × 26 columns

list(df.columns)
['avg_monthly_hrs',
 'filed_complaint',
 'last_evaluation',
 'n_projects',
 'recently_promoted',
 'satisfaction',
 'status',
 'tenure',
 'last_evaluation_missing',
 'underperformer',
 'unhappy',
 'overachiever',
 'department_IT',
 'department_Missing',
 'department_admin',
 'department_engineering',
 'department_finance',
 'department_management',
 'department_marketing',
 'department_procurement',
 'department_product',
 'department_sales',
 'department_support',
 'salary_high',
 'salary_low',
 'salary_medium']

SAVE PROGRESS

df.to_csv('engineered_cleaned', index = None)
df = pd.read_csv('../Employee_retention/engineered_cleaned')
unseen_data = pd.read_csv('../Employee_retention/unseen_raw_data.csv')
df.head()
avg_monthly_hrs filed_complaint last_evaluation n_projects recently_promoted satisfaction status tenure last_evaluation_missing underperformer ... department_finance department_management department_marketing department_procurement department_product department_sales department_support salary_high salary_low salary_medium
0 221 0.0 0.932868 4 0.0 0.829896 1 5.0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 232 0.0 0.000000 3 0.0 0.834544 0 2.0 1 0 ... 0 0 0 0 0 0 1 0 1 0
2 184 0.0 0.788830 3 0.0 0.834988 0 3.0 0 0 ... 0 0 0 0 0 1 0 0 0 1
3 206 0.0 0.575688 4 0.0 0.424764 0 2.0 0 1 ... 0 0 0 0 0 1 0 0 1 0
4 249 0.0 0.845217 3 0.0 0.779043 0 3.0 0 0 ... 0 0 0 0 0 1 0 0 1 0

5 rows × 26 columns

unseen_data.head()
avg_monthly_hrs department filed_complaint last_evaluation n_projects recently_promoted salary satisfaction tenure
0 228 management NaN 0.735618 2 NaN high 0.805661 3.0
1 229 product NaN 1.000000 4 NaN low 0.719961 4.0
2 196 sales 1.0 0.557426 4 NaN low 0.749835 2.0
3 207 IT NaN 0.715171 3 NaN high 0.987447 3.0
4 129 management NaN 0.484818 2 NaN low 0.441219 3.0

TRAIN TEST DATA

y = df.status
X = df.drop('status', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size =0.2,
                                                    random_state = 1234 )

len(X_train), len(X_test), len(y_train), len(y_test)
(11254, 2814, 11254, 2814)

SCALE DATA

ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# X_pred = ss.transform(X_pred)

LOGISTIC REGRESSION

lr = LogisticRegression()
lr.fit(X_train, y_train)

print(lr.score(X_test, y_test))
0.8464818763326226
# Set up the parameters. Looking at C regularization strengths on a log scale.
lr_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

lr_gridsearch = GridSearchCV(LogisticRegression(), lr_params, cv=5, verbose=1)
%%time
lr_gridsearch = lr_gridsearch.fit(X_train, y_train)
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
CPU times: user 44.4 s, sys: 1.1 s, total: 45.5 s
Wall time: 45.7 s


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   45.5s finished
# best score on the training data:
lr_gridsearch.best_score_
0.8516083170428292
# best parameters on the training data:
# Lasso was chosen: this indicates that maybe unimportant (noise) variables
# is more of an issue in our data than multicollinearity.
lr_gridsearch.best_params_
{'C': 0.7924828983539169, 'penalty': 'l1', 'solver': 'liblinear'}
# assign the best estimator to a variable:
best_lr = lr_gridsearch.best_estimator_
# Score it on the testing data:
best_lr.score(X_test, y_test)
0.8468372423596304
# slightly better than the default.
coef_df = pd.DataFrame({
        'coef':best_lr.coef_[0],
        'feature':X.columns
    })
coef_df['abs_coef'] = np.abs(coef_df.coef)
# sort by absolute value of coefficient (magnitude)
coef_df.sort_values('abs_coef', ascending=False, inplace=True)
# Show non-zero coefs and predictors
coef_df[coef_df.coef != 0]
coef feature abs_coef
5 -1.879827 satisfaction 1.879827
10 1.410796 overachiever 1.410796
7 1.373559 last_evaluation_missing 1.373559
8 1.323589 underperformer 1.323589
2 1.046259 last_evaluation 1.046259
1 -0.539858 filed_complaint 0.539858
3 -0.489306 n_projects 0.489306
22 -0.380817 salary_high 0.380817
6 0.373059 tenure 0.373059
0 0.298843 avg_monthly_hrs 0.298843
23 0.245474 salary_low 0.245474
4 -0.227202 recently_promoted 0.227202
9 0.139250 unhappy 0.139250
18 -0.128141 department_procurement 0.128141
16 -0.069641 department_management 0.069641
14 0.057416 department_engineering 0.057416
11 -0.035294 department_IT 0.035294
12 0.033931 department_Missing 0.033931
13 -0.031768 department_admin 0.031768
15 0.019949 department_finance 0.019949
20 0.013519 department_sales 0.013519
19 -0.007938 department_product 0.007938
17 -0.005222 department_marketing 0.005222

LOGISTIC REGRESSION SCORE

predictions = lr.predict(X_test)
accuracy_score(y_true = y_test, y_pred = predictions)
0.8464818763326226
accuracy_score(y_true = y_test, y_pred = predictions, normalize=False)
2382
print(classification_report(y_test,predictions))
             precision    recall  f1-score   support

          0       0.89      0.91      0.90      2145
          1       0.69      0.64      0.67       669

avg / total       0.84      0.85      0.84      2814

RANDOM FOREST

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

print(lr.score(X_test, y_test))
0.8464818763326226
RandomForestClassifier().get_params()
{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
rf_params = {
    'min_samples_split':[5],
    'max_depth':[3, 5, 7]
}

rf_gridsearch = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, verbose=1)
%%time
rf_gridsearch = rf_gridsearch.fit(X_train, y_train)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: user 1.01 s, sys: 19.6 ms, total: 1.03 s
Wall time: 1.03 s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    1.0s finished
rf_gridsearch.best_score_
0.9690776612759907
rf_gridsearch.best_params_
{'max_depth': 7, 'min_samples_split': 5}
best_rf = rf_gridsearch.best_estimator_
best_rf.score(X_test, y_test)
0.9669509594882729
list(best_rf.feature_importances_)
[0.18520717815866733,
 0.0017834239265606083,
 0.07324057836113385,
 0.2609816806103563,
 0.0009973877944141293,
 0.18507539209847942,
 0.18776074277567353,
 0.0016795377487483693,
 0.015534653021916997,
 0.04200076195183799,
 0.031615545546549315,
 0.00030820603243023536,
 0.00037398192565454147,
 0.00020238531570987854,
 0.0008124618240543731,
 0.0003715306187323974,
 0.000299604779572024,
 9.117414532271443e-06,
 0.000869561700950858,
 8.683718767008375e-05,
 0.0008954154836238806,
 0.0004514619508305953,
 0.0028828410344619495,
 0.005094881687359401,
 0.001464831050079755]
# The feature importances (the higher, the more important the feature).
feat_import = list(best_rf.feature_importances_)
plt.plot(feat_import)
plt.ylabel('Feat_Importances')
plt.show()

png

#The number of features when fit is performed.
best_rf.n_features_
25
predictions = best_rf.predict(X_test)
print(classification_report(y_test,predictions))
             precision    recall  f1-score   support

          0       0.97      0.99      0.98      2145
          1       0.96      0.90      0.93       669

avg / total       0.97      0.97      0.97      2814
confusion_matrix(y_true= y_test, y_pred = predictions)
array([[2121,   24],
       [  69,  600]])
best_rf.predict_proba(X_test)
array([[0.26765861, 0.73234139],
       [0.98382641, 0.01617359],
       [0.94003303, 0.05996697],
       ...,
       [0.98239891, 0.01760109],
       [0.97468786, 0.02531214],
       [0.83351674, 0.16648326]])
y_hat = best_rf.predict(X_test)

cm = confusion_matrix(y_test, y_hat)
cm = pd.DataFrame(cm)

sns.heatmap(cm, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1a1a72dc88>

png

# Much higher score with random forest 

GRADIENT BOOSTING

gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

print(gb.score(X_test, y_test))
0.9673063255152807
GradientBoostingClassifier().get_params()
{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}
gb_params = {
    'learning_rate':[0.05, 0.1,0.2],
    'n_estimators':[20,100,200],
    'max_depth':[1,3,5]
}

gb_gridsearch = GridSearchCV(RandomForestClassifier(), gb_params, cv=5, verbose=1)
%%time
gb_gridsearch = rf_gridsearch.fit(X_train, y_train)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: user 1.1 s, sys: 12.6 ms, total: 1.12 s
Wall time: 1.12 s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    1.0s finished
# only a little better 
gb_gridsearch.best_score_
0.9685445175048871
gb_gridsearch.best_params_
{'max_depth': 7, 'min_samples_split': 5}
predictions = gb.predict(X_test)
accuracy_score(y_true = y_test, y_pred = predictions)
0.9673063255152807
best_gb = lr_gridsearch.best_estimator_
best_gb.predict_proba(X_test)
array([[0.6020669 , 0.3979331 ],
       [0.97631714, 0.02368286],
       [0.82280054, 0.17719946],
       ...,
       [0.99830735, 0.00169265],
       [0.70519885, 0.29480115],
       [0.93748659, 0.06251341]])
confusion_matrix(y_true= y_test, y_pred = predictions)
array([[2111,   34],
       [  58,  611]])
print(classification_report(y_test,predictions))
             precision    recall  f1-score   support

          0       0.97      0.98      0.98      2145
          1       0.95      0.91      0.93       669

avg / total       0.97      0.97      0.97      2814
y_hat = best_gb.predict(X_test)

cm = confusion_matrix(y_test, y_hat)
cm = pd.DataFrame(cm)

sns.heatmap(cm, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x10f2252e8>

png

Written on September 30, 2018