In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from pathlib import Path
In [2]:
# Never truncate wide DataFrames: show every column when a frame is displayed
pd.options.display.max_columns = None
In [3]:
# Load the employee churn dataset (path is relative to the working directory)
# and preview the first rows
df = pd.read_csv(filepath_or_buffer=Path('employee_churn_data.csv'))
df.head()
Out[3]:
department promoted review projects salary tenure satisfaction bonus avg_hrs_month left
0 operations 0 0.577569 3 low 5.0 0.626759 0 180.866070 no
1 operations 0 0.751900 3 medium 6.0 0.443679 0 182.708149 no
2 support 0 0.722548 3 medium 6.0 0.446823 0 184.416084 no
3 logistics 0 0.675158 4 high 8.0 0.440139 0 188.707545 no
4 sales 0 0.676203 3 high 5.0 0.577607 1 179.821083 no
In [4]:
# Encode the categorical columns as integer codes.
# One LabelEncoder is fitted per column so every mapping stays available afterwards
# (e.g. encoders['salary'].inverse_transform(...)); refitting a single shared encoder
# discards the classes learned for the previously encoded column.
# NOTE(review): LabelEncoder assigns codes in sorted label order, so with the salary
# levels visible in the preview (low / medium / high) the codes are high=0, low=1,
# medium=2 rather than low < medium < high, and 'department' receives an arbitrary
# numeric order. Consider an explicit ordinal mapping / one-hot encoding for features.
encoders = {}
for column in ('salary', 'department', 'left'):
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Keep the original name bound to the encoder fitted on the target column, as before
label_encoder = encoders['left']
In [5]:
# Separate the target column ('left') from the predictor columns
y = df['left']
X = df.drop(columns='left')
y
Out[5]:
0       0
1       0
2       0
3       0
4       0
       ..
9535    1
9536    1
9537    1
9538    1
9539    1
Name: left, Length: 9540, dtype: int32
In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
In [7]:
# Standardize the features: the scaling statistics are learned from the training
# split only, then the same fitted transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [8]:
# Fit an RBF-kernel support vector classifier on the scaled training features
# (fit() returns the estimator itself, so construction and fitting can be chained)
svm_classifier = SVC(kernel='rbf', random_state=42).fit(X_train_scaled, y_train)
svm_classifier
Out[8]:
SVC(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(random_state=42)
In [9]:
# Predict on the training data (the same scaled features the model was fitted on)
y_train_pred = svm_classifier.predict(X_train_scaled)
# Display the predicted labels as the cell output
y_train_pred
Out[9]:
array([1, 0, 0, ..., 0, 0, 0])
In [10]:
# Evaluate the unweighted model on the training split
print("Performance on Training Data:")
accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1; label 0 is reported as 'not leaving', label 1 as 'leaving'
print(
    "\nClassification Report:",
    classification_report(y_train, y_train_pred, target_names=['not leaving', 'leaving']),
    sep="\n",
)

# Rows are the true classes, columns are the predicted classes
print("\nConfusion Matrix:", confusion_matrix(y_train, y_train_pred), sep="\n")
Performance on Training Data:
Accuracy: 0.8487945492662474

Classification Report:
              precision    recall  f1-score   support

 not leaving       0.85      0.95      0.90      5426
     leaving       0.84      0.59      0.69      2206

    accuracy                           0.85      7632
   macro avg       0.85      0.77      0.80      7632
weighted avg       0.85      0.85      0.84      7632


Confusion Matrix:
[[5177  249]
 [ 905 1301]]
In [11]:
# Predict on the held-out test data with the unweighted classifier
y_test_pred = svm_classifier.predict(X_test_scaled)
In [12]:
# Evaluate the unweighted model on the held-out test split
print("Performance on Testing Data:")
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names = ['not leaving', 'leaving']))

print("\nConfusion Matrix:")
test_cm = confusion_matrix(y_test, y_test_pred)
print(test_cm)
print("")

# Derive the recalls quoted in the summary from the confusion matrix (rows = true
# class, columns = predicted class) so the text cannot drift from the actual results.
recall_not_leaving = test_cm[0, 0] / test_cm[0, :].sum()
recall_leaving = test_cm[1, 1] / test_cm[1, :].sum()
print(
    "The precision is strong for both classes (leaving and not leaving), but there was a big difference in the recall ratio. "
    f"For employees not leaving, the recall is {recall_not_leaving:.2f}, indicating that the model correctly identified {recall_not_leaving:.0%} of the employees who didn't leave. "
    f"However, the model only correctly identified {recall_leaving:.0%} of the employees who actually left."
)
Performance on Testing Data:
Accuracy: 0.8307127882599581

Classification Report:
              precision    recall  f1-score   support

 not leaving       0.83      0.94      0.89      1330
     leaving       0.82      0.57      0.67       578

    accuracy                           0.83      1908
   macro avg       0.83      0.76      0.78      1908
weighted avg       0.83      0.83      0.82      1908


Confusion Matrix:
[[1256   74]
 [ 249  329]]

The precision is strong for both classes(leaving and not leaving, but there was a big difference in the recall ratio. For employees not leaving, the recall is 0.94, indicating that the model correctly identified 94% of the employees who didn't leave. However, the model only correctly identified 57% of the employees who actually left.
In [13]:
# Introduce class weights to compensate for the class imbalance (far fewer employees left than stayed)
In [14]:
# Fit a second RBF-kernel SVM on the same scaled training features, this time with
# class_weight='balanced' so errors on the rarer class are penalized more heavily
svm_classifier_weighted = SVC(
    kernel='rbf',
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)
svm_classifier_weighted
Out[14]:
SVC(class_weight='balanced', random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(class_weight='balanced', random_state=42)
In [15]:
# Predict on the training data with the class-weighted classifier
y_train_pred_weighted = svm_classifier_weighted.predict(X_train_scaled)
# Display the predicted labels as the cell output
y_train_pred_weighted
Out[15]:
array([1, 0, 0, ..., 0, 0, 0])
In [16]:
# Evaluate the class-weighted model on the training split
print("Performance on weighted Training Data:")
accuracy_weighted = accuracy_score(y_train, y_train_pred_weighted)
# end="\n\n" emits the blank separator line that follows the accuracy
print("Accuracy:", accuracy_weighted, end="\n\n")

# Per-class precision / recall / F1; label 0 is reported as 'not leaving', label 1 as 'leaving'
print(
    "Classification Report with Adjusted Class Weights:",
    classification_report(y_train, y_train_pred_weighted, target_names=['not leaving', 'leaving']),
    sep="\n",
)

# Rows are the true classes, columns are the predicted classes
print(
    "\nConfusion Matrix with Adjusted Class Weights:",
    confusion_matrix(y_train, y_train_pred_weighted),
    sep="\n",
)
Performance on weighted Training Data:
Accuracy: 0.8026729559748428

Classification Report with Adjusted Class Weights:
              precision    recall  f1-score   support

 not leaving       0.93      0.78      0.85      5426
     leaving       0.61      0.86      0.72      2206

    accuracy                           0.80      7632
   macro avg       0.77      0.82      0.78      7632
weighted avg       0.84      0.80      0.81      7632


Confusion Matrix with Adjusted Class Weights:
[[4221 1205]
 [ 301 1905]]
In [17]:
# Predict on the held-out test data with the class-weighted classifier
y_test_pred_weighted = svm_classifier_weighted.predict(X_test_scaled)
# Display the predicted labels as the cell output
y_test_pred_weighted
Out[17]:
array([1, 1, 1, ..., 0, 0, 1])
In [18]:
# Evaluate the model
print("Performance on weighted Testing Data:")
accuracy_weighted = accuracy_score(y_test, y_test_pred_weighted)
print("Accuracy:", accuracy_weighted)
print("")
print("Classification Report with Adjusted Class Weights:")
print(classification_report(y_test, y_test_pred_weighted, target_names = ['not leaving', 'leaving']))

print("\nConfusion Matrix with Adjusted Class Weights:")
print(confusion_matrix(y_test, y_test_pred_weighted))
print("")
print("Here we introduced class weights since there were many more employees who did not leave vs those who did leave. Doing this assigns higher weights to the minority class during training which will make the model pay more attention to correctly classify employees who are leaving. Adjusting class weights improved the recall ratio from 57% to 83% for the leaving class, which shows that the model has improved in being able to identify employees who are likely to leave. However, precision dropped from 82% to 59% for this same leaving class.")
Performance on weighted Testing Data:
Accuracy: 0.7730607966457023

Classification Report with Adjusted Class Weights:
              precision    recall  f1-score   support

 not leaving       0.91      0.75      0.82      1330
     leaving       0.59      0.83      0.69       578

    accuracy                           0.77      1908
   macro avg       0.75      0.79      0.76      1908
weighted avg       0.81      0.77      0.78      1908


Confusion Matrix with Adjusted Class Weights:
[[994 336]
 [ 97 481]]

Here we introduced class weights since there were many more employees who did not leave vs those who did leave. Doing this assigns higher weights to the minority class during training which will make the model pay more attention to correctly classify employees who are leaving. Adjusting class weights improved the recall ratio from 57% to 83% for the leaving class, which shows that the model has improved in being able to identify employees who are likely to leave. However, precision dropped from 82% to 59% for this same leaving class.
In [ ]: