What are common characteristics of employees lost in attrition compared to those who stay in IBM’s fictional dataset?

Estimated reading time: 30 minutes

We will use point plots, box plots, kernel density plots, means, standard deviations, and z-tests to explore this question.
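For reference, the two-sample z statistic used throughout compares the difference of the group means to its standard error. A minimal sketch of that computation, assuming two independent samples held as NumPy arrays (the notebook itself calls statsmodels' ztest, whose default uses a pooled variance estimate):

import numpy as np

def two_sample_z(s1, s2):
    # z = (mean1 - mean2) / sqrt(var1/n1 + var2/n2), with unpooled sample variances
    m1, m2 = np.mean(s1), np.mean(s2)
    se = np.sqrt(np.var(s1, ddof=1) / len(s1) + np.var(s2, ddof=1) / len(s2))
    return (m1 - m2) / se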


Set Up Dataset

from pandas import read_csv

data = read_csv("data/attrition.csv")
target = "Attrition"

# Group the remaining column names by their pandas dtype
feature_by_dtype = {}
for c in data.columns:
    if c == target:
        continue
    data_type = str(data[c].dtype)
    if data_type not in feature_by_dtype:
        feature_by_dtype[data_type] = [c]
    else:
        feature_by_dtype[data_type].append(c)

feature_by_dtype
feature_by_dtype.keys()
dict_keys(['int64', 'object'])
objects = feature_by_dtype["object"]
remove = ["Over18"]  # Over18 is "Y" for every row in this dataset, so it carries no signal
import pandas as pd

pd.options.display.max_columns = None
data.head()
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
categorical_features = [f for f in objects if f not in remove]
int64s = feature_by_dtype["int64"]

## Handling feature types: drop constant/id-like columns, then split the
## integer features into count-like (few unique values) and the rest
remove.append("StandardHours")   # constant (80) for every employee
remove.append("EmployeeCount")   # constant (1) for every employee

count_features = [i for i in int64s if len(data[i].unique()) < 20 and i not in remove]
#count_features += ["TotalWorkingYears", "YearsAtCompany", "HourlyRate"]

remove.append("EmployeeNumber")  # identifier, not a feature
numerical_features = [i for i in int64s if i not in remove]
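
As an aside, pandas' select_dtypes offers a more direct route to the same dtype split; a minimal equivalent sketch (the names objects_alt and int64s_alt are illustrative and not used below):

# Equivalent grouping with select_dtypes
objects_alt = data.select_dtypes(include="object").columns.drop(target).tolist()
int64s_alt = data.select_dtypes(include="int64").columns.tolist()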

Numerical Features

data[numerical_features].head()
Age DailyRate DistanceFromHome Education EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 1102 1 2 2 94 3 2 4 5993 19479 8 11 3 1 0 8 0 1 6 4 0 5
1 49 279 8 1 3 61 2 2 2 5130 24907 1 23 4 4 1 10 3 3 10 7 1 7
2 37 1373 2 2 4 92 2 1 3 2090 2396 6 15 3 2 0 7 3 3 0 0 0 0
3 33 1392 3 4 4 56 3 1 3 2909 23159 1 11 3 3 0 8 3 3 8 7 3 0
4 27 591 2 1 1 40 3 1 2 3468 16632 9 12 3 4 1 6 3 3 2 2 2 2

Python Source Code

def display_ttest(data, category, numeric):
    # Two-sample t-test comparing `numeric` across the two levels of `category`
    s1 = data[data[category] == data[category].unique()[0]][numeric]
    s2 = data[data[category] == data[category].unique()[1]][numeric]
    from scipy.stats import ttest_ind
    t, p = ttest_ind(s1, s2)
    from IPython.display import display
    from pandas import DataFrame
    display(DataFrame(data=[{"t-test statistic": t, "p-value": p}],
                      columns=["t-test statistic", "p-value"], index=[category]).round(2))

def display_ztest(data, category, numeric):
    # Two-sample z-test comparing `numeric` across the two levels of `category`
    s1 = data[data[category] == data[category].unique()[0]][numeric]
    s2 = data[data[category] == data[category].unique()[1]][numeric]
    from statsmodels.stats.weightstats import ztest
    z, p = ztest(s1, s2)
    from IPython.display import display
    from pandas import DataFrame
    display(DataFrame(data=[{"z-test statistic": z, "p-value": p}],
                      columns=["z-test statistic", "p-value"], index=[category]).round(2))
    
def display_cxn_analysis(data, category, numeric, target):
    
    from seaborn import boxplot, kdeplot, set_style, distplot, countplot
    from matplotlib.pyplot import show, figure, subplots, ylabel, xlabel, subplot, suptitle
    
    not_target = [a for a in data[category].unique() if a != target][0]
    
    pal = {target : "yellow",
          not_target : "darkgrey"}
    

    set_style("whitegrid")
    figure(figsize=(12,5))
    suptitle(numeric + " by " + category)

    # ==============================================
    
    p1 = subplot(2,2,2)
    boxplot(y=category, x=numeric, data=data, orient="h", palette = pal)
    p1.get_xaxis().set_visible(False)

    # ==============================================
    
    if(numeric in count_features):
        p2 = subplot(2,2,4)
        
        s2 = data[data[category] == not_target][numeric]
        s2 = s2.rename(not_target) 
        countplot(s2, color = pal[not_target])
        
        s1 = data[data[category] == target][numeric]
        s1 = s1.rename(target)
        ax = countplot(s1, color = pal[target])
        
        # Relabel the count ticks as percentages of all employees
        ax.set_yticklabels(["{:.0f}%".format((tick / len(data)) * 100) for tick in ax.get_yticks()])
        
        ax.set_ylabel("Percentage")
        ax.set_xlabel(numeric)
        
    else:
        p2 = subplot(2,2,4, sharex=p1)
        s1 = data[data[category] == target][numeric]
        s1 = s1.rename(target)
        kdeplot(s1, shade=True, color = pal[target])
        #distplot(s1,kde=False,color = pal[target])

        s2 = data[data[category] == not_target][numeric]
        s2 = s2.rename(not_target)  
        kdeplot(s2, shade=True, color = pal[not_target])
        #distplot(s2,kde=False,color = pal[not_target])

        ylabel("Density")
        xlabel(numeric)
    
    # ==============================================
    
    p3 = subplot(1,2,1)
    from seaborn import pointplot
    from matplotlib.pyplot import rc_context

    with rc_context({'lines.linewidth': 0.8}):
        pp = pointplot(x=category, y=numeric, data=data, capsize=.1, color="black", markers="s")
        
    
    # ==============================================
    
    show()
    
    # Display the test statistic and p-value: a z-test when both groups are
    # large (n > 30), otherwise a t-test.
    if data[category].value_counts()[0] > 30 and data[category].value_counts()[1] > 30:
        display_ztest(data, category, numeric)
    else:
        display_ttest(data, category, numeric)
    
    # Means, standard deviations, and absolute mean difference
    table = data[[category, numeric]]

    means = table.groupby(category).mean()
    stds = table.groupby(category).std()

    s1_mean = means.loc[data[category].unique()[0]]
    s1_std = stds.loc[data[category].unique()[0]]

    s2_mean = means.loc[data[category].unique()[1]]
    s2_std = stds.loc[data[category].unique()[1]]

    print("%s Mean: %.2f (+/- %.2f)" % (category + " == " + str(data[category].unique()[0]), s1_mean, s1_std))
    print("%s Mean: %.2f (+/- %.2f)" % (category + " == " + str(data[category].unique()[1]), s2_mean, s2_std))
    print("Absolute Mean Difference: %.2f" % abs(s1_mean - s2_mean))

def get_p_value(s1, s2):

    from statsmodels.stats.weightstats import ztest
    from scipy.stats import ttest_ind

    # z-test for large samples (n > 30 in both groups), t-test otherwise
    if len(s1) > 30 and len(s2) > 30:
        z, p = ztest(s1, s2)
        return p
    else:
        t, p = ttest_ind(s1, s2)
        return p
    
def get_p_values(data, category, numerics):
    
    output = {}
    
    for numeric in numerics:
        s1 = data[data[category] == data[category].unique()[0]][numeric]
        s2 = data[data[category] == data[category].unique()[1]][numeric]
        row = {"p-value" : get_p_value(s1,s2)}
        output[numeric] = row
    
    from pandas import DataFrame
    
    return DataFrame(data=output).T

def get_statistically_significant_numerics(data, category, numerics):
    df = get_p_values(data, category, numerics)
    return list(df[df["p-value"] < 0.05].index)

def get_statistically_non_significant_numerics(data, category, numerics):
    df = get_p_values(data, category, numerics)
    return list(df[df["p-value"] >= 0.05].index)
    
def display_p_values(data, category, numerics):
    from IPython.display import display
    display(get_p_values(data, category, numerics).round(2).sort_values("p-value", ascending=False))
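
The helpers above can be applied directly to any binary category. A usage sketch on a few example columns (the TESTING cell below performs the same computation step by step):

display_p_values(data, "Attrition", ["Age", "MonthlyIncome", "HourlyRate"])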
### TESTING

# Check whether each numeric feature's distribution differs significantly
# between the two attrition classes (Yes and No).
output = {}

for numeric in numerical_features:
    s1 = data[data[target] == data[target].unique()[0]][numeric]
    s2 = data[data[target] == data[target].unique()[1]][numeric]
    
    from statsmodels.stats.weightstats import ztest
    from scipy.stats import ttest_ind
    
    if len(s1) > 30 and len(s2) > 30:
        # For this dataset both groups are always large, so this branch is
        # always taken: a z-test for a difference in means, assuming the
        # samples are independent and approximately normally distributed.
        z, p = ztest(s1, s2)
    else:
        t, p = ttest_ind(s1,s2)
    
    row = {"p-value" : p}
    output[numeric] = row

df = pd.DataFrame(data=output).T
df_sig = df[df["p-value"] < 0.05]

### TEST PASSED
significant = get_statistically_significant_numerics(data,target,numerical_features) 
ns = get_statistically_non_significant_numerics(data,target,numerical_features)
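
As a quick sanity check (a sketch, not part of the original flow), the helper output can be compared against the manual pass above:

# The helper should flag exactly the same features as the manual computation
assert set(significant) == set(df_sig.index)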

Statistically Significant Numerical Features

i = iter(significant)

On average, the fictional company loses staff who are 3 to 4 years younger than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: Age by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -6.18 0.0
Attrition == Yes Mean: 33.61 (+/- 9.69)
Attrition == No Mean: 37.56 (+/- 37.56)
Absolute Mean Difference: 3.95

Employees lost in attrition tend to have lower daily rates than those who stay.

  • The two groups' kernel density curves look roughly mirrored (flipped) relative to each other.
display_cxn_analysis(data, target, next(i), "Yes")

[figure: DailyRate by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -2.17 0.03
Attrition == Yes Mean: 750.36 (+/- 401.90)
Attrition == No Mean: 812.50 (+/- 812.50)
Absolute Mean Difference: 62.14

Employees lost in attrition tend to have longer commute distances than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: DistanceFromHome by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition 2.99 0.0
Attrition == Yes Mean: 10.63 (+/- 8.45)
Attrition == No Mean: 8.92 (+/- 8.92)
Absolute Mean Difference: 1.72

Employees lost in attrition are less satisfied with their work environment on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: EnvironmentSatisfaction by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -3.98 0.0
Attrition == Yes Mean: 2.46 (+/- 1.17)
Attrition == No Mean: 2.77 (+/- 2.77)
Absolute Mean Difference: 0.31

Employees lost in attrition are less involved with their jobs on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: JobInvolvement by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -5.02 0.0
Attrition == Yes Mean: 2.52 (+/- 0.77)
Attrition == No Mean: 2.77 (+/- 2.77)
Absolute Mean Difference: 0.25

Employees lost in attrition tend to be lower in job level than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: JobLevel by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -6.57 0.0
Attrition == Yes Mean: 1.64 (+/- 0.94)
Attrition == No Mean: 2.15 (+/- 2.15)
Absolute Mean Difference: 0.51

Employees who stay report higher job satisfaction than employees lost in attrition.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: JobSatisfaction by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -3.99 0.0
Attrition == Yes Mean: 2.47 (+/- 1.12)
Attrition == No Mean: 2.78 (+/- 2.78)
Absolute Mean Difference: 0.31

Employees lost in attrition tend to have lower monthly income on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: MonthlyIncome by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -6.2 0.0
Attrition == Yes Mean: 4787.09 (+/- 3640.21)
Attrition == No Mean: 6832.74 (+/- 6832.74)
Absolute Mean Difference: 2045.65

Employees who stay tend to have more stock options than those lost in attrition.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: StockOptionLevel by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -5.3 0.0
Attrition == Yes Mean: 0.53 (+/- 0.86)
Attrition == No Mean: 0.85 (+/- 0.85)
Absolute Mean Difference: 0.32

Employees lost in attrition have fewer total working years than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: TotalWorkingYears by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -6.65 0.0
Attrition == Yes Mean: 8.24 (+/- 7.17)
Attrition == No Mean: 11.86 (+/- 11.86)
Absolute Mean Difference: 3.62

Employees lost in attrition received fewer training sessions last year than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: TrainingTimesLastYear by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -2.28 0.02
Attrition == Yes Mean: 2.62 (+/- 1.25)
Attrition == No Mean: 2.83 (+/- 2.83)
Absolute Mean Difference: 0.21

Employees lost in attrition report poorer work-life balance on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: WorkLifeBalance by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -2.45 0.01
Attrition == Yes Mean: 2.66 (+/- 0.82)
Attrition == No Mean: 2.78 (+/- 2.78)
Absolute Mean Difference: 0.12

Employees who stay have been at the company roughly 2 years longer, on average, than those lost in attrition.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: YearsAtCompany by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -5.2 0.0
Attrition == Yes Mean: 5.13 (+/- 5.95)
Attrition == No Mean: 7.37 (+/- 7.37)
Absolute Mean Difference: 2.24

Employees who stay have spent 1 to 2 more years in their current role than those lost in attrition.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: YearsInCurrentRole by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -6.23 0.0
Attrition == Yes Mean: 2.90 (+/- 3.17)
Attrition == No Mean: 4.48 (+/- 4.48)
Absolute Mean Difference: 1.58

Employees lost in attrition have 1 to 2 fewer years with their current manager, on average, than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

[figure: YearsWithCurrManager by Attrition (point plot, box plot, distribution)]

z-test statistic p-value
Attrition -6.06 0.0
Attrition == Yes Mean: 2.85 (+/- 3.14)
Attrition == No Mean: 4.37 (+/- 4.37)
Absolute Mean Difference: 1.52


Statistically Non-Significant Numerical Features

ns
['Education',
 'HourlyRate',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'YearsSinceLastPromotion']
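
To see how far these features fall from the 0.05 threshold, the p-values themselves can be displayed with the helper defined earlier (a usage sketch):

display_p_values(data, target, ns)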
### Some Additional Visualisations
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

dataset = data

# Define a grid of graphs, 5 rows by 3 columns, using the matplotlib library
f, axes = plt.subplots(5, 3, figsize=(24, 36), sharex=False, sharey=False)

# Define a few seaborn graphs, which for the most part only need the dataset, the "x" and "y" columns, and the grid position.
# You can also show a third variable and expand the analysis by setting the "hue" property.
sns.swarmplot(x="EducationField", y="MonthlyIncome", data=dataset, hue="Gender", ax=axes[0,0])
axes[0,0].set( title = 'Monthly income against Educational Field')

sns.pointplot(x="PerformanceRating", y="JobSatisfaction", data=dataset, hue="Gender", ax=axes[0,1])
axes[0,1].set( title = 'Job satisfaction against Performance Rating')

sns.barplot(x="NumCompaniesWorked", y="PerformanceRating", data=dataset, ax=axes[0,2])
axes[0,2].set( title = 'Number of companies worked against Performance rating')

sns.barplot(x="JobSatisfaction", y="EducationField", data=dataset, ax=axes[1,0])
axes[1,0].set( title = 'Educational Field against Job Satisfaction')

sns.barplot(x="YearsWithCurrManager", y="JobSatisfaction", data=dataset, ax=axes[1,1])
axes[1,1].set( title = 'Years with current Manager against Job Satisfaction')

sns.pointplot(x="JobSatisfaction", y="MonthlyRate", data=dataset, ax=axes[1,2])
axes[1,2].set( title = 'Job Satisfaction against Monthly rate')

sns.barplot(x="WorkLifeBalance", y="DistanceFromHome", data=dataset, ax=axes[2,0])
axes[2,0].set( title = 'Distance from home against Work life balance')

sns.pointplot(x="OverTime", y="WorkLifeBalance", hue="Gender", data=dataset, jitter=True, ax=axes[2,1])
axes[2,1].set( title = 'Work life balance against Overtime')

sns.pointplot(x="OverTime", y="RelationshipSatisfaction", hue="Gender", data=dataset, ax=axes[2,2])
axes[2,2].set( title = 'Overtime against Relationship satisfaction')

sns.pointplot(x="MaritalStatus", y="YearsInCurrentRole", hue="Gender", data=dataset, ax=axes[3,0])
axes[3,0].set( title = 'Marital Status against Years in current role')

sns.pointplot(x="Age", y="YearsSinceLastPromotion", hue="Gender", data=dataset, ax=axes[3,1])
axes[3,1].set( title = 'Age against Years since last promotion')

sns.pointplot(x="OverTime", y="PerformanceRating", hue="Gender", data=dataset, ax=axes[3,2])
axes[3,2].set( title = 'Performance Rating against Overtime')

sns.barplot(x="Gender", y="PerformanceRating", data=dataset, ax=axes[4,0])
axes[4,0].set( title = 'Performance Rating against Gender')

sns.barplot(x="Gender", y="JobSatisfaction", data=dataset, ax=axes[4,1])
axes[4,1].set( title = 'Job satisfaction against Gender')

sns.countplot(x="Attrition", data=dataset, ax=axes[4,2])
axes[4,2].set( title = 'Attrition distribution')
[<matplotlib.text.Text at 0x10b8ab9e8>]

[figure: grid of the additional visualisations above]