McCurr Healthcare Consultancy is an MNC that has thousands of employees spread out across the globe. The company believes in hiring the best talent available and retaining them for as long as possible. A huge amount of resources is spent on retaining existing employees through various initiatives. The Head of People Operations wants to bring down the cost of retaining employees. For this, he proposes limiting the incentives to only those employees who are at risk of attrition. As a recently hired Data Scientist in the People Operations Department, you have been asked to identify patterns in characteristics of employees who leave the organization. Also, you have to use this information to predict if an employee is at risk of attrition. This information will be used to target them with incentives.
The data contains information on employees' demographic details, work-related metrics, and attrition flag.
In the real world, you will not find definitions for some of your variables. It is part of the analysis to figure out what they might mean.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# To scale the data using z-score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Algorithms to use
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
# For tuning the model
from sklearn.model_selection import GridSearchCV
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
# Reading the dataset
df = pd.read_excel('HR_Employee_Attrition_Dataset.xlsx')
df.head()
|   | EmployeeNumber | Attrition | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Yes | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 2 | No | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 3 | Yes | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 4 | No | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 5 | No | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   EmployeeNumber            2940 non-null   int64
 1   Attrition                 2940 non-null   object
 2   Age                       2940 non-null   int64
 3   BusinessTravel            2940 non-null   object
 4   DailyRate                 2940 non-null   int64
 5   Department                2940 non-null   object
 6   DistanceFromHome          2940 non-null   int64
 7   Education                 2940 non-null   int64
 8   EducationField            2940 non-null   object
 9   EnvironmentSatisfaction   2940 non-null   int64
 10  Gender                    2940 non-null   object
 11  HourlyRate                2940 non-null   int64
 12  JobInvolvement            2940 non-null   int64
 13  JobLevel                  2940 non-null   int64
 14  JobRole                   2940 non-null   object
 15  JobSatisfaction           2940 non-null   int64
 16  MaritalStatus             2940 non-null   object
 17  MonthlyIncome             2940 non-null   int64
 18  MonthlyRate               2940 non-null   int64
 19  NumCompaniesWorked        2940 non-null   int64
 20  Over18                    2940 non-null   object
 21  OverTime                  2940 non-null   object
 22  PercentSalaryHike         2940 non-null   int64
 23  PerformanceRating         2940 non-null   int64
 24  RelationshipSatisfaction  2940 non-null   int64
 25  StandardHours             2940 non-null   int64
 26  StockOptionLevel          2940 non-null   int64
 27  TotalWorkingYears         2940 non-null   int64
 28  TrainingTimesLastYear     2940 non-null   int64
 29  WorkLifeBalance           2940 non-null   int64
 30  YearsAtCompany            2940 non-null   int64
 31  YearsInCurrentRole        2940 non-null   int64
 32  YearsSinceLastPromotion   2940 non-null   int64
 33  YearsWithCurrManager      2940 non-null   int64
dtypes: int64(25), object(9)
memory usage: 781.1+ KB
Observations:
Let's check the number of unique values in each column.
# Checking the number of unique values in each column
df.nunique()
EmployeeNumber              2940
Attrition                      2
Age                           43
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsInCurrentRole            19
YearsSinceLastPromotion       16
YearsWithCurrManager          18
dtype: int64
Observations:
- EmployeeNumber has 2940 unique values, one per row, so it is just an identifier and carries no predictive information.
- Over18 and StandardHours have only one unique value each, so they add no information to the model.
- We can drop these three columns.
Let's drop the columns mentioned above and define lists for numerical and categorical columns to explore them separately.
# Dropping the columns
df = df.drop(['EmployeeNumber', 'Over18', 'StandardHours'], axis = 1)
# Creating numerical columns
num_cols = ['DailyRate', 'Age', 'DistanceFromHome', 'MonthlyIncome', 'MonthlyRate', 'PercentSalaryHike', 'TotalWorkingYears',
'YearsAtCompany', 'NumCompaniesWorked', 'HourlyRate',
'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'TrainingTimesLastYear']
# Creating categorical variables
cat_cols = ['Attrition', 'OverTime', 'BusinessTravel', 'Department', 'Education', 'EducationField', 'JobSatisfaction',
'EnvironmentSatisfaction', 'WorkLifeBalance', 'StockOptionLevel', 'Gender', 'PerformanceRating', 'JobInvolvement',
'JobLevel', 'JobRole', 'MaritalStatus', 'RelationshipSatisfaction']
# Checking summary statistics
df[num_cols].describe().T
|   | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| DailyRate | 2940.0 | 802.485714 | 403.440447 | 102.0 | 465.0 | 802.0 | 1157.0 | 1499.0 |
| Age | 2940.0 | 36.923810 | 9.133819 | 18.0 | 30.0 | 36.0 | 43.0 | 60.0 |
| DistanceFromHome | 2940.0 | 9.192517 | 8.105485 | 1.0 | 2.0 | 7.0 | 14.0 | 29.0 |
| MonthlyIncome | 2940.0 | 6502.931293 | 4707.155770 | 1009.0 | 2911.0 | 4919.0 | 8380.0 | 19999.0 |
| MonthlyRate | 2940.0 | 14313.103401 | 7116.575021 | 2094.0 | 8045.0 | 14235.5 | 20462.0 | 26999.0 |
| PercentSalaryHike | 2940.0 | 15.209524 | 3.659315 | 11.0 | 12.0 | 14.0 | 18.0 | 25.0 |
| TotalWorkingYears | 2940.0 | 11.279592 | 7.779458 | 0.0 | 6.0 | 10.0 | 15.0 | 40.0 |
| YearsAtCompany | 2940.0 | 7.008163 | 6.125483 | 0.0 | 3.0 | 5.0 | 9.0 | 40.0 |
| NumCompaniesWorked | 2940.0 | 2.693197 | 2.497584 | 0.0 | 1.0 | 2.0 | 4.0 | 9.0 |
| HourlyRate | 2940.0 | 65.891156 | 20.325969 | 30.0 | 48.0 | 66.0 | 84.0 | 100.0 |
| YearsInCurrentRole | 2940.0 | 4.229252 | 3.622521 | 0.0 | 2.0 | 3.0 | 7.0 | 18.0 |
| YearsSinceLastPromotion | 2940.0 | 2.187755 | 3.221882 | 0.0 | 0.0 | 1.0 | 3.0 | 15.0 |
| YearsWithCurrManager | 2940.0 | 4.123129 | 3.567529 | 0.0 | 2.0 | 3.0 | 7.0 | 17.0 |
| TrainingTimesLastYear | 2940.0 | 2.799320 | 1.289051 | 0.0 | 2.0 | 3.0 | 3.0 | 6.0 |
Observations:
Let's explore these variables in some more depth by observing their distributions.
# Creating histograms
df[num_cols].hist(figsize = (14, 14))
plt.show()
Observations:
for i in cat_cols:
    print(df[i].value_counts(normalize = True))
    print('*' * 40)
No     0.838776
Yes    0.161224
Name: Attrition, dtype: float64
****************************************
No     0.717007
Yes    0.282993
Name: OverTime, dtype: float64
****************************************
Travel_Rarely        0.709524
Travel_Frequently    0.188435
Non-Travel           0.102041
Name: BusinessTravel, dtype: float64
****************************************
Research & Development    0.653741
Sales                     0.303401
Human Resources           0.042857
Name: Department, dtype: float64
****************************************
3    0.389116
4    0.270748
2    0.191837
1    0.115646
5    0.032653
Name: Education, dtype: float64
****************************************
Life Sciences       0.412245
Medical             0.315646
Marketing           0.108163
Technical Degree    0.089796
Other               0.055782
Human Resources     0.018367
Name: EducationField, dtype: float64
****************************************
4    0.312245
3    0.300680
1    0.196599
2    0.190476
Name: JobSatisfaction, dtype: float64
****************************************
3    0.308163
4    0.303401
2    0.195238
1    0.193197
Name: EnvironmentSatisfaction, dtype: float64
****************************************
3    0.607483
2    0.234014
4    0.104082
1    0.054422
Name: WorkLifeBalance, dtype: float64
****************************************
0    0.429252
1    0.405442
2    0.107483
3    0.057823
Name: StockOptionLevel, dtype: float64
****************************************
Male      0.6
Female    0.4
Name: Gender, dtype: float64
****************************************
3    0.846259
4    0.153741
Name: PerformanceRating, dtype: float64
****************************************
3    0.590476
2    0.255102
4    0.097959
1    0.056463
Name: JobInvolvement, dtype: float64
****************************************
1    0.369388
2    0.363265
3    0.148299
4    0.072109
5    0.046939
Name: JobLevel, dtype: float64
****************************************
Sales Executive              0.221769
Research Scientist           0.198639
Laboratory Technician        0.176190
Manufacturing Director       0.098639
Healthcare Representative    0.089116
Manager                      0.069388
Sales Representative         0.056463
Research Director            0.054422
Human Resources              0.035374
Name: JobRole, dtype: float64
****************************************
Married     0.457823
Single      0.319728
Divorced    0.222449
Name: MaritalStatus, dtype: float64
****************************************
3    0.312245
4    0.293878
2    0.206122
1    0.187755
Name: RelationshipSatisfaction, dtype: float64
****************************************
Observations:
We have analyzed different categorical and numerical variables. Let's now check how the attrition rate is related to the other categorical variables.
for i in cat_cols:
    if i != 'Attrition':
        (pd.crosstab(df[i], df['Attrition'], normalize = 'index')*100).plot(kind = 'bar', figsize = (8, 4), stacked = True)
        plt.ylabel('Percentage Attrition %')
Observations:
Let's check the relationship between attrition and the numerical variables.
# The mean of numerical variables grouped by attrition
df.groupby(['Attrition'])[num_cols].mean()
| Attrition | DailyRate | Age | DistanceFromHome | MonthlyIncome | MonthlyRate | PercentSalaryHike | TotalWorkingYears | YearsAtCompany | NumCompaniesWorked | HourlyRate | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | TrainingTimesLastYear |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| No | 812.504461 | 37.561233 | 8.915653 | 6832.739659 | 14265.779400 | 15.231144 | 11.862936 | 7.369019 | 2.645580 | 65.952149 | 4.484185 | 2.234388 | 4.367397 | 2.832928 |
| Yes | 750.362869 | 33.607595 | 10.632911 | 4787.092827 | 14559.308017 | 15.097046 | 8.244726 | 5.130802 | 2.940928 | 65.573840 | 2.902954 | 1.945148 | 2.852321 | 2.624473 |
Observations:
We have identified the kinds of employees who are more likely to leave the company.
# Plotting the correlation between numerical variables
plt.figure(figsize = (15, 8))
sns.heatmap(df[num_cols].corr(), annot = True, fmt = '0.2f', cmap = 'YlGnBu')
<AxesSubplot:>
Observations:
Data Description:
Data Cleaning:
Observations from EDA:
Now that we have explored our data, let's build the models.
Creating dummy variables for categorical variables
# Creating the list of columns for which we need to create the dummy variables
to_get_dummies_for = ['BusinessTravel', 'Department', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole', 'MaritalStatus']
# Creating dummy variables
df = pd.get_dummies(data = df, columns = to_get_dummies_for, drop_first = True)
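As a quick illustration (a toy example, not part of the pipeline) of what drop_first = True does: one level of each categorical variable becomes the baseline and does not get its own dummy column, which is why, for instance, MaritalStatus_Divorced does not appear among the dummy columns later.
# Toy illustration of drop_first = True on a hypothetical mini Series
toy = pd.Series(['Single', 'Married', 'Divorced'], name = 'MaritalStatus')
print(pd.get_dummies(toy, prefix = 'MaritalStatus', drop_first = True))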
# Mapping overtime and attrition
dict_OverTime = {'Yes': 1, 'No': 0}
dict_attrition = {'Yes': 1, 'No': 0}
df['OverTime'] = df.OverTime.map(dict_OverTime)
df['Attrition'] = df.Attrition.map(dict_attrition)
Separating the independent variables (X) and the dependent variable (Y)
# Separating the target variable and other variables
Y = df.Attrition
X = df.drop(columns = ['Attrition'])
The independent variables in this dataset are on different scales. When features have very different scales, the features with larger magnitudes tend to receive higher weightage and can dominate features whose magnitudes are smaller but whose variations are just as informative. This can bias the machine learning algorithm towards a subset of the features, which we want to avoid.
The solution to this issue is Feature Scaling, i.e., scaling the dataset so that every transformed variable is on a comparable scale.
In this problem, we will use the Standard Scaler method, which centers and scales the dataset using the z-score.
It standardizes features by subtracting the mean and scaling them to unit variance.
The standard score of sample x is calculated as:
z = (x - u) / s
where u is the mean of the training samples and s is the standard deviation of the training samples.
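As a small sanity check (a toy array, for illustration only), StandardScaler applies exactly this transformation:
# Toy check: StandardScaler matches the manual z = (x - u) / s computation
toy = np.array([[10.0], [20.0], [30.0]])
z_manual = (toy - toy.mean(axis = 0)) / toy.std(axis = 0)   # population standard deviation, as StandardScaler uses
z_scaler = StandardScaler().fit_transform(toy)
print(np.allclose(z_manual, z_scaler))   # True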
# Scaling the data
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance, there could be several times more negative samples than positive samples. In such cases, it is recommended to use the stratified sampling technique to ensure that relative class frequencies are approximately preserved in each train and validation fold.
# Splitting the data
x_train, x_test, y_train, y_test = train_test_split(X_scaled, Y, test_size = 0.3, random_state = 1, stratify = Y)
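We can quickly verify that stratification preserved the class proportions (roughly 84% 'No' and 16% 'Yes') in both splits:
# Comparing the attrition rate in the full data, the training set, and the test set
print(Y.value_counts(normalize = True))
print(y_train.value_counts(normalize = True))
print(y_test.value_counts(normalize = True))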
The model can make two types of wrong predictions:
1. Predicting that an employee will attrite when the employee will not (a False Positive).
2. Predicting that an employee will not attrite when the employee actually will (a False Negative).
Which case is more important? The False Negative: if we fail to flag an employee who is actually at risk of attrition, that employee will not be targeted with retention incentives, which defeats the purpose of the model.
How do we reduce this loss, i.e., the number of False Negatives? By maximizing Recall: the greater the recall, the higher the chance of correctly identifying the employees who are at risk of attrition.
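As a minimal sketch (toy labels, purely for illustration) of how recall is driven by False Negatives, recall = TP / (TP + FN):
from sklearn.metrics import recall_score
y_true_toy = [1, 1, 1, 1, 0, 0]   # 4 employees actually attrite
y_pred_toy = [1, 1, 0, 0, 0, 0]   # the model catches only 2 of them, so there are 2 False Negatives
print(recall_score(y_true_toy, y_pred_toy))   # 2 / (2 + 2) = 0.5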
Also, let's create a function to calculate and print the classification report and the confusion matrix so that we don't have to rewrite the same code repeatedly for each model.
def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))
    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize = (8, 5))
    sns.heatmap(cm, annot = True, fmt = '.2f', xticklabels = ['Not Attrite', 'Attrite'], yticklabels = ['Not Attrite', 'Attrite'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
We will be building 2 different models: Logistic Regression and K-Nearest Neighbors (K-NN).
Logistic Regression is a supervised learning algorithm, generally used for binary classification problems, i.e., where the dependent variable is categorical and has only two possible values. In logistic regression, we use the sigmoid function to calculate the probability of an event Y, given some features X as:
P(Y=1) = 1 / (1 + exp(-(b0 + b1*X1 + ... + bn*Xn)))
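As a toy illustration (with made-up coefficients, not taken from our model), the sigmoid squashes any linear combination of the features into a probability between 0 and 1:
# Toy sigmoid example with hypothetical coefficients b0 = 0.5, b1 = 0.9, b2 = -0.6
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

z = 0.5 + 0.9 * 1 - 0.6 * 2   # linear score for one hypothetical employee
print(sigmoid(z))             # ~0.55, interpreted as the probability of attrition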
# Fitting the logistic regression model
lg = LogisticRegression()
lg.fit(x_train,y_train)
LogisticRegression()
Checking the model performance
# Checking the performance on the training data
y_pred_train = lg.predict(x_train)
metrics_score(y_train, y_pred_train)
precision recall f1-score support
0 0.91 0.98 0.94 1726
1 0.81 0.50 0.62 332
accuracy 0.90 2058
macro avg 0.86 0.74 0.78 2058
weighted avg 0.89 0.90 0.89 2058
# Checking the performance on the test dataset
y_pred_test = lg.predict(x_test)
metrics_score(y_test, y_pred_test)
precision recall f1-score support
0 0.90 0.97 0.94 740
1 0.76 0.46 0.58 142
accuracy 0.89 882
macro avg 0.83 0.72 0.76 882
weighted avg 0.88 0.89 0.88 882
Observations:
Let's check the coefficients to find which variables are driving attrition and which can help reduce it.
# Printing the coefficients of logistic regression
cols = X.columns
coef_lg = lg.coef_
pd.DataFrame(coef_lg,columns = cols).T.sort_values(by = 0, ascending = False)
|   | 0 |
|---|---|
| OverTime | 0.958034 |
| BusinessTravel_Travel_Frequently | 0.716046 |
| MaritalStatus_Single | 0.618145 |
| YearsSinceLastPromotion | 0.552935 |
| YearsAtCompany | 0.523238 |
| NumCompaniesWorked | 0.501137 |
| Department_Sales | 0.483346 |
| Department_Research & Development | 0.482820 |
| BusinessTravel_Travel_Rarely | 0.441384 |
| DistanceFromHome | 0.384346 |
| JobRole_Sales Executive | 0.383153 |
| MaritalStatus_Married | 0.288340 |
| JobRole_Human Resources | 0.282114 |
| JobLevel_5 | 0.269649 |
| JobRole_Laboratory Technician | 0.177910 |
| JobRole_Sales Representative | 0.173305 |
| Gender_Male | 0.165803 |
| Education_3 | 0.158748 |
| Education_2 | 0.131061 |
| Education_4 | 0.113254 |
| JobRole_Manufacturing Director | 0.112275 |
| Education_5 | 0.092054 |
| EducationField_Technical Degree | 0.083282 |
| MonthlyRate | 0.059920 |
| HourlyRate | 0.048010 |
| JobLevel_3 | 0.007039 |
| EducationField_Marketing | -0.013725 |
| JobRole_Manager | -0.032051 |
| PerformanceRating | -0.032545 |
| PercentSalaryHike | -0.074595 |
| DailyRate | -0.095750 |
| StockOptionLevel | -0.107451 |
| EducationField_Other | -0.138263 |
| JobLevel_4 | -0.161223 |
| WorkLifeBalance | -0.212611 |
| JobRole_Research Scientist | -0.233311 |
| TrainingTimesLastYear | -0.240552 |
| Age | -0.275176 |
| RelationshipSatisfaction | -0.312201 |
| EducationField_Life Sciences | -0.319031 |
| EducationField_Medical | -0.354189 |
| JobRole_Research Director | -0.359223 |
| JobSatisfaction | -0.373627 |
| YearsWithCurrManager | -0.382881 |
| YearsInCurrentRole | -0.438966 |
| EnvironmentSatisfaction_2 | -0.444969 |
| JobInvolvement_2 | -0.485360 |
| TotalWorkingYears | -0.497228 |
| EnvironmentSatisfaction_3 | -0.501917 |
| MonthlyIncome | -0.599868 |
| JobInvolvement_4 | -0.641244 |
| EnvironmentSatisfaction_4 | -0.651612 |
| JobLevel_2 | -0.714008 |
| JobInvolvement_3 | -0.750455 |
Observations:
Features that positively affect the attrition rate (increase the likelihood of attrition) include OverTime, BusinessTravel_Travel_Frequently, MaritalStatus_Single, YearsSinceLastPromotion, and NumCompaniesWorked.
Features that negatively affect the attrition rate (reduce the likelihood of attrition) include JobInvolvement_3, JobLevel_2, EnvironmentSatisfaction_4, JobInvolvement_4, MonthlyIncome, and TotalWorkingYears.
The coefficients that positively and negatively affect the attrition rate seem to be quite similar for logistic regression and LDA. This means they are capturing the same pattern and giving nearly the same conclusions from the dataset.
The coefficients of the logistic regression model give us the log of odds, which is hard to interpret in the real world. We can convert the log of odds into odds by taking its exponential.
odds = np.exp(lg.coef_[0]) # Finding the odds
# Adding the odds to a DataFrame and sorting the values
pd.DataFrame(odds, x_train.columns, columns = ['odds']).sort_values(by = 'odds', ascending = False)
|   | odds |
|---|---|
| OverTime | 2.606567 |
| BusinessTravel_Travel_Frequently | 2.046326 |
| MaritalStatus_Single | 1.855483 |
| YearsSinceLastPromotion | 1.738348 |
| YearsAtCompany | 1.687483 |
| NumCompaniesWorked | 1.650597 |
| Department_Sales | 1.621491 |
| Department_Research & Development | 1.620638 |
| BusinessTravel_Travel_Rarely | 1.554858 |
| DistanceFromHome | 1.468653 |
| JobRole_Sales Executive | 1.466903 |
| MaritalStatus_Married | 1.334210 |
| JobRole_Human Resources | 1.325929 |
| JobLevel_5 | 1.309504 |
| JobRole_Laboratory Technician | 1.194718 |
| JobRole_Sales Representative | 1.189229 |
| Gender_Male | 1.180341 |
| Education_3 | 1.172042 |
| Education_2 | 1.140037 |
| Education_4 | 1.119916 |
| JobRole_Manufacturing Director | 1.118820 |
| Education_5 | 1.096424 |
| EducationField_Technical Degree | 1.086849 |
| MonthlyRate | 1.061752 |
| HourlyRate | 1.049181 |
| JobLevel_3 | 1.007064 |
| EducationField_Marketing | 0.986369 |
| JobRole_Manager | 0.968457 |
| PerformanceRating | 0.967979 |
| PercentSalaryHike | 0.928119 |
| DailyRate | 0.908691 |
| StockOptionLevel | 0.898120 |
| EducationField_Other | 0.870870 |
| JobLevel_4 | 0.851102 |
| WorkLifeBalance | 0.808470 |
| JobRole_Research Scientist | 0.791907 |
| TrainingTimesLastYear | 0.786194 |
| Age | 0.759438 |
| RelationshipSatisfaction | 0.731835 |
| EducationField_Life Sciences | 0.726853 |
| EducationField_Medical | 0.701742 |
| JobRole_Research Director | 0.698219 |
| JobSatisfaction | 0.688234 |
| YearsWithCurrManager | 0.681894 |
| YearsInCurrentRole | 0.644703 |
| EnvironmentSatisfaction_2 | 0.640844 |
| JobInvolvement_2 | 0.615475 |
| TotalWorkingYears | 0.608214 |
| EnvironmentSatisfaction_3 | 0.605369 |
| MonthlyIncome | 0.548884 |
| JobInvolvement_4 | 0.526637 |
| EnvironmentSatisfaction_4 | 0.521205 |
| JobLevel_2 | 0.489678 |
| JobInvolvement_3 | 0.472151 |
Observations:
The Precision-Recall Curve for Logistic Regression
y_scores_lg = lg.predict_proba(x_train) # predict_proba gives the probability of each observation belonging to each class
precisions_lg, recalls_lg, thresholds_lg = precision_recall_curve(y_train, y_scores_lg[:, 1])
# Plot values of precisions, recalls, and thresholds
plt.figure(figsize = (10, 7))
plt.plot(thresholds_lg, precisions_lg[:-1], 'b--', label = 'precision')
plt.plot(thresholds_lg, recalls_lg[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc = 'upper right')
plt.ylim([0, 1])
plt.show()
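We can also locate the crossover point programmatically; a quick sketch using the precisions_lg, recalls_lg, and thresholds_lg arrays computed above:
# Finding the threshold where precision and recall are closest to each other
crossover_idx = np.argmin(np.abs(precisions_lg[:-1] - recalls_lg[:-1]))
print(round(thresholds_lg[crossover_idx], 2))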
Observation: The precision and recall curves cross at a threshold of roughly 0.35. Using this value as the classification threshold should give a better balance between precision and recall than the default threshold of 0.5.
Let's find out the performance of the model at this threshold.
optimal_threshold1 = .35
y_pred_train = lg.predict_proba(x_train)
metrics_score(y_train, y_pred_train[:, 1] > optimal_threshold1)
precision recall f1-score support
0 0.93 0.93 0.93 1726
1 0.65 0.64 0.64 332
accuracy 0.89 2058
macro avg 0.79 0.79 0.79 2058
weighted avg 0.89 0.89 0.89 2058
Observations:
optimal_threshold1 = .35
y_pred_test = lg.predict_proba(x_test)
metrics_score(y_test, y_pred_test[:, 1] > optimal_threshold1)
precision recall f1-score support
0 0.93 0.93 0.93 740
1 0.62 0.63 0.63 142
accuracy 0.88 882
macro avg 0.78 0.78 0.78 882
weighted avg 0.88 0.88 0.88 882
Observations:
K-NN uses features from the training data to predict the values of new data points, which means the new data point will be assigned a value based on how similar it is to the data points in the training set.
The following steps are performed in K-NN:
1. Choose the number of neighbors, K.
2. Compute the distance between the new data point and every point in the training data.
3. Select the K training points nearest to the new data point.
4. Assign the new data point the class that receives the majority vote among those K neighbors.
The “K” in the K-NN algorithm is the number of nearest neighbors we wish to take the vote from. Generally, K is taken to be an odd number when the number of classes is even, so as to get a majority vote. Let's say K=3. In that case, we will make a circle with the new data point as the center just as big as enclosing only the three nearest data points on the plane.
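To make the voting idea concrete, here is a minimal from-scratch sketch on a hypothetical toy dataset (for illustration only; the actual model below uses scikit-learn's KNeighborsClassifier):
from collections import Counter

def knn_predict_one(x_new, X_arr, y_arr, k = 3):
    distances = np.linalg.norm(X_arr - x_new, axis = 1)        # Euclidean distance to every training point
    nearest_idx = np.argsort(distances)[:k]                    # indices of the k nearest neighbours
    return Counter(y_arr[nearest_idx]).most_common(1)[0][0]    # majority vote among their labels

X_toy = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0], [6.0, 9.0]])
y_toy = np.array([0, 0, 1, 1])
print(knn_predict_one(np.array([1.2, 1.9]), X_toy, y_toy, k = 3))   # -> 0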
But before actually building the model, we need to identify the value of K to be used in K-NN. To do this, we will loop over a range of K values, repeatedly split the training data into training and validation subsets, and track the average training and validation error for each K so that we can pick the K with the lowest validation error.
knn = KNeighborsClassifier()
# We select the optimal value of K for which the error rate is the least in the validation data
# Let us loop over a few values of K to determine the optimal value of K
train_error = []
test_error = []
knn_many_split = {}
error_df_knn = pd.DataFrame()
features = X.columns
for k in range(1, 15):
    train_error = []
    test_error = []
    lista = []
    knn = KNeighborsClassifier(n_neighbors = k)
    for i in range(30):
        x_train_new, x_val, y_train_new, y_val = train_test_split(x_train, y_train, test_size = 0.20)
        # Fitting K-NN on the training data
        knn.fit(x_train_new, y_train_new)
        # Calculating error on the training data and the validation data
        train_error.append(1 - knn.score(x_train_new, y_train_new))
        test_error.append(1 - knn.score(x_val, y_val))
    lista.append(sum(train_error)/len(train_error))
    lista.append(sum(test_error)/len(test_error))
    knn_many_split[k] = lista
knn_many_split
{1: [0.0, 0.08616504854368928],
2: [0.05445524503847712, 0.1444174757281554],
3: [0.06729445119481572, 0.16464401294498388],
4: [0.11692993114621306, 0.14522653721682852],
5: [0.11257594167679222, 0.14862459546925566],
6: [0.1289185905224787, 0.1524271844660194],
7: [0.12345078979343858, 0.15088996763754048],
8: [0.13620899149453222, 0.15024271844660203],
9: [0.12987039287160795, 0.15129449838187708],
10: [0.14127176994734708, 0.1490291262135922],
11: [0.13507492912110167, 0.1482200647249191],
12: [0.14212231672742, 0.1492718446601942],
13: [0.137910085054678, 0.15048543689320393],
14: [0.1454232482786553, 0.15161812297734636]}
kltest = []
vltest = []
for k, v in knn_many_split.items():
    kltest.append(k)
    vltest.append(knn_many_split[k][1])
kltrain = []
vltrain = []
for k, v in knn_many_split.items():
    kltrain.append(k)
    vltrain.append(knn_many_split[k][0])
# Plotting K vs Error
plt.figure(figsize = (10, 6))
plt.plot(kltest, vltest, label = 'test' )
plt.plot(kltrain, vltrain, label = 'train')
plt.legend()
plt.show()
Observations:
# Define K-NN model
knn = KNeighborsClassifier(n_neighbors = 5)
# Fitting data to the K-NN model
knn.fit(x_train,y_train)
KNeighborsClassifier()
# Checking the performance of K-NN model on the training data
y_pred_train_knn = knn.predict(x_train)
metrics_score(y_train, y_pred_train_knn)
precision recall f1-score support
0 0.90 0.98 0.94 1726
1 0.80 0.43 0.56 332
accuracy 0.89 2058
macro avg 0.85 0.70 0.75 2058
weighted avg 0.88 0.89 0.88 2058
# Checking the performance of K-NN model on the testing data
y_pred_test_knn = knn.predict(x_test)
metrics_score(y_test, y_pred_test_knn)
precision recall f1-score support
0 0.88 0.97 0.92 740
1 0.67 0.32 0.43 142
accuracy 0.87 882
macro avg 0.78 0.64 0.68 882
weighted avg 0.85 0.87 0.84 882
Observation:
Let's try to fine-tune this model and check if we can increase the recall.
We will tune the following hyperparameters of the K-NN model: n_neighbors (the number of neighbors), weights (uniform vs. distance-weighted voting), and p (the power parameter of the Minkowski metric, where p = 1 corresponds to the Manhattan distance and p = 2 to the Euclidean distance).
params_knn = {'n_neighbors': np.arange(3, 15), 'weights': ['uniform', 'distance'], 'p': [1, 2]}
grid_knn = GridSearchCV(estimator = knn, param_grid = params_knn, scoring = 'recall', cv = 10)
model_knn = grid_knn.fit(x_train,y_train)
knn_estimator = model_knn.best_estimator_
print(knn_estimator)
KNeighborsClassifier(n_neighbors=4, p=1, weights='distance')
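Before refitting, we can optionally inspect the best parameter combination and its mean cross-validated recall; a quick sketch using the fitted grid search object from above:
# Best hyperparameters and the corresponding mean cross-validated recall
print(model_knn.best_params_)
print(model_knn.best_score_)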
# Fit the best estimator on the training data
knn_estimator.fit(x_train, y_train)
KNeighborsClassifier(n_neighbors=4, p=1, weights='distance')
y_pred_train_knn_estimator = knn_estimator.predict(x_train)
metrics_score(y_train, y_pred_train_knn_estimator)
precision recall f1-score support
0 1.00 1.00 1.00 1726
1 1.00 1.00 1.00 332
accuracy 1.00 2058
macro avg 1.00 1.00 1.00 2058
weighted avg 1.00 1.00 1.00 2058
y_pred_test_knn_estimator = knn_estimator.predict(x_test)
metrics_score(y_test, y_pred_test_knn_estimator)
precision recall f1-score support
0 0.97 0.97 0.97 740
1 0.86 0.83 0.84 142
accuracy 0.95 882
macro avg 0.91 0.90 0.91 882
weighted avg 0.95 0.95 0.95 882
Observations:
We will be building 2 additional models: Linear Discriminant Analysis (LDA) and Quadratic Discriminant Analysis (QDA).
Linear discriminant analysis (LDA) is generally used to classify patterns between two classes, though it can be extended to multiple classes. LDA assumes that the classes are linearly separable and, accordingly, creates linear discriminant functions, representing hyperplanes in the feature space, to distinguish between the classes. If there are two classes, LDA draws one hyperplane and projects the data onto this hyperplane in such a way as to maximize the separation of the two categories. This hyperplane is created according to two criteria considered simultaneously:
- maximizing the distance between the means (centroids) of the two classes, and
- minimizing the variation within each individual class.
# Fitting the LDA model
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
LinearDiscriminantAnalysis()
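As a quick sketch of the projection idea described above (using the lda model and the scaled training data from this notebook), we can project the data onto LDA's single discriminant axis and compare the class means:
# Projecting the training data onto the (single) LDA discriminant axis
projection = lda.transform(x_train).ravel()
print('Mean projection (no attrition):', projection[y_train.values == 0].mean())
print('Mean projection (attrition):   ', projection[y_train.values == 1].mean())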
Checking Model Performance
# Checking model performance of LDA
y_pred_train_lda = lda.predict(x_train)
metrics_score(y_train, y_pred_train_lda)
precision recall f1-score support
0 0.91 0.98 0.94 1726
1 0.80 0.52 0.63 332
accuracy 0.90 2058
macro avg 0.86 0.75 0.78 2058
weighted avg 0.89 0.90 0.89 2058
Reading the confusion matrix (clockwise from top left):
True Negative (Actual = 0, Predicted = 0): Model predicts that an employee would not attrite and the employee does not attrite
False Positive (Actual = 0, Predicted = 1): Model predicts that an employee would attrite but the employee does not attrite
True Positive (Actual = 1, Predicted = 1): Model predicts that an employee would attrite and the employee actually attrites
False Negative (Actual = 1, Predicted = 0): Model predicts that an employee would not attrite but the employee attrites
Observations:
We have built the LDA model. Now, let's check the coefficients to find which variables are driving attrition and which can help reduce it.
# Creating the list of column names
cols = X.columns
# Saving coefficients of LDA model
coef_lda = lda.coef_
# Printing the coefficients of LDA
pd.DataFrame(coef_lda, columns = cols).T.sort_values(by = 0, ascending = False)
|   | 0 |
|---|---|
| OverTime | 1.041698 |
| Department_Research & Development | 0.795546 |
| Department_Sales | 0.649234 |
| BusinessTravel_Travel_Frequently | 0.583047 |
| MaritalStatus_Single | 0.565419 |
| NumCompaniesWorked | 0.456036 |
| JobRole_Sales Executive | 0.406011 |
| YearsSinceLastPromotion | 0.353791 |
| YearsAtCompany | 0.351596 |
| JobRole_Human Resources | 0.342187 |
| JobRole_Sales Representative | 0.335370 |
| DistanceFromHome | 0.331483 |
| BusinessTravel_Travel_Rarely | 0.283220 |
| JobLevel_5 | 0.274886 |
| Education_3 | 0.230501 |
| Education_2 | 0.198261 |
| MaritalStatus_Married | 0.182050 |
| Education_4 | 0.173300 |
| JobRole_Laboratory Technician | 0.148185 |
| JobRole_Manager | 0.144605 |
| Gender_Male | 0.128920 |
| JobLevel_4 | 0.127368 |
| Education_5 | 0.084847 |
| JobRole_Manufacturing Director | 0.059837 |
| MonthlyRate | 0.030803 |
| JobLevel_3 | 0.013900 |
| JobRole_Research Director | 0.010330 |
| PerformanceRating | -0.025318 |
| HourlyRate | -0.045710 |
| PercentSalaryHike | -0.059350 |
| EducationField_Technical Degree | -0.061963 |
| DailyRate | -0.063601 |
| StockOptionLevel | -0.102784 |
| WorkLifeBalance | -0.175842 |
| TrainingTimesLastYear | -0.202162 |
| EducationField_Marketing | -0.220459 |
| JobRole_Research Scientist | -0.252660 |
| YearsWithCurrManager | -0.270493 |
| RelationshipSatisfaction | -0.274757 |
| YearsInCurrentRole | -0.290026 |
| EducationField_Other | -0.323048 |
| TotalWorkingYears | -0.332565 |
| Age | -0.345372 |
| JobSatisfaction | -0.358574 |
| EnvironmentSatisfaction_2 | -0.508781 |
| EnvironmentSatisfaction_3 | -0.562208 |
| JobLevel_2 | -0.604475 |
| EducationField_Life Sciences | -0.614948 |
| EnvironmentSatisfaction_4 | -0.622288 |
| EducationField_Medical | -0.648146 |
| JobInvolvement_2 | -0.732200 |
| MonthlyIncome | -0.745046 |
| JobInvolvement_4 | -0.779225 |
| JobInvolvement_3 | -1.055171 |
Some features that positively affect the attrition rate (increase the likelihood of attrition) are OverTime, Department_Research & Development, Department_Sales, BusinessTravel_Travel_Frequently, and MaritalStatus_Single.
Some features that negatively affect the attrition rate (reduce the likelihood of attrition) are JobInvolvement_3, JobInvolvement_4, MonthlyIncome, JobInvolvement_2, and EducationField_Medical.
Observations:
The Precision-Recall curve summarizes the trade-off between the precision and the recall for a predictive model using different probability thresholds.
y_scores_lda = lda.predict_proba(x_train) # predict_proba gives the probability of each observation belonging to each class
precisions_lda, recalls_lda, thresholds_lda = precision_recall_curve(y_train, y_scores_lda[:, 1])
# Plot values of precisions, recalls, and thresholds
plt.figure(figsize = (10, 7))
plt.plot(thresholds_lda, precisions_lda[:-1], 'b--', label = 'precision')
plt.plot(thresholds_lda, recalls_lda[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc = 'upper right')
plt.ylim([0, 1])
plt.show()
Observation: As with logistic regression, the precision and recall curves cross at a threshold of roughly 0.35, so we will evaluate the LDA model at that threshold.
Let's check the model performance at this threshold
optimal_threshold1 = .35
y_pred_train_lda = lda.predict_proba(x_train)
metrics_score(y_train, y_pred_train_lda[:,1] > optimal_threshold1)
precision recall f1-score support
0 0.92 0.93 0.93 1726
1 0.63 0.60 0.62 332
accuracy 0.88 2058
macro avg 0.78 0.77 0.77 2058
weighted avg 0.88 0.88 0.88 2058
Observations:
Let's check the model performance on the test data
# Checking performance on the test data
optimal_threshold1 = .35
y_pred_test_lda = lda.predict_proba(x_test)
metrics_score(y_test, y_pred_test_lda[:,1] > optimal_threshold1)
precision recall f1-score support
0 0.93 0.93 0.93 740
1 0.63 0.61 0.62 142
accuracy 0.88 882
macro avg 0.78 0.77 0.78 882
weighted avg 0.88 0.88 0.88 882
Observations:
Quadratic discriminant analysis (QDA) is a probabilistic parametric classification technique that represents an evolution of LDA for nonlinear class separations. QDA, like LDA, is based on the hypothesis that the probability density distributions are multivariate normal but, in this case, the dispersion is not the same for all of the categories.
# Fitting QDA model
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)
QuadraticDiscriminantAnalysis()
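As a quick side check of the class-specific covariance assumption described above (assuming a scikit-learn version where QuadraticDiscriminantAnalysis supports the store_covariance option), we can confirm that QDA estimates a separate covariance matrix for each class:
# Refitting QDA with store_covariance = True to inspect the per-class covariance estimates
qda_check = QuadraticDiscriminantAnalysis(store_covariance = True)
qda_check.fit(x_train, y_train)
print(len(qda_check.covariance_))        # one covariance matrix per class -> 2
print(qda_check.covariance_[0].shape)    # (n_features, n_features)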
# Checking model performance on the training data
y_pred_train_qda = qda.predict(x_train)
metrics_score(y_train, y_pred_train_qda)
precision recall f1-score support
0 1.00 0.15 0.26 1726
1 0.18 1.00 0.31 332
accuracy 0.29 2058
macro avg 0.59 0.57 0.28 2058
weighted avg 0.87 0.29 0.27 2058
# Checking performance of the model on the test data
y_pred_test_qda = qda.predict(x_test)
metrics_score(y_test, y_pred_test_qda)
precision recall f1-score support
0 0.97 0.15 0.26 740
1 0.18 0.97 0.30 142
accuracy 0.28 882
macro avg 0.57 0.56 0.28 882
weighted avg 0.84 0.28 0.27 882
Observations: