In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from pathlib import Path
In [2]:
# Display all of the columns
pd.set_option('display.max_columns', None)
In [3]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(
Path('employee_churn_data.csv')
)
df.head()
Out[3]:
department | promoted | review | projects | salary | tenure | satisfaction | bonus | avg_hrs_month | left | |
---|---|---|---|---|---|---|---|---|---|---|
0 | operations | 0 | 0.577569 | 3 | low | 5.0 | 0.626759 | 0 | 180.866070 | no |
1 | operations | 0 | 0.751900 | 3 | medium | 6.0 | 0.443679 | 0 | 182.708149 | no |
2 | support | 0 | 0.722548 | 3 | medium | 6.0 | 0.446823 | 0 | 184.416084 | no |
3 | logistics | 0 | 0.675158 | 4 | high | 8.0 | 0.440139 | 0 | 188.707545 | no |
4 | sales | 0 | 0.676203 | 3 | high | 5.0 | 0.577607 | 1 | 179.821083 | no |
In [4]:
# Encode categorical variables
label_encoder = LabelEncoder()
df['salary'] = label_encoder.fit_transform(df['salary'])
df['department'] = label_encoder.fit_transform(df['department'])
df['left'] = label_encoder.fit_transform(df['left'])
In [5]:
# Split features and target variable
X = df.drop('left', axis=1)
y = df['left']
y
Out[5]:
0 0 1 0 2 0 3 0 4 0 .. 9535 1 9536 1 9537 1 9538 1 9539 1 Name: left, Length: 9540, dtype: int32
In [6]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [7]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [8]:
# Train SVM classifier
svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_train_scaled, y_train)
Out[8]:
SVC(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(random_state=42)
In [9]:
# Predict on testing data
y_train_pred = svm_classifier.predict(X_train_scaled)
y_train_pred
Out[9]:
array([1, 0, 0, ..., 0, 0, 0])
In [10]:
# Evaluate the model
print("Performance on Training Data:")
accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names = ['not leaving', 'leaving']))
print("\nConfusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
Performance on Training Data: Accuracy: 0.8487945492662474 Classification Report: precision recall f1-score support not leaving 0.85 0.95 0.90 5426 leaving 0.84 0.59 0.69 2206 accuracy 0.85 7632 macro avg 0.85 0.77 0.80 7632 weighted avg 0.85 0.85 0.84 7632 Confusion Matrix: [[5177 249] [ 905 1301]]
In [11]:
y_test_pred = svm_classifier.predict(X_test_scaled)
In [12]:
print("Performance on Testing Data:")
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names = ['not leaving', 'leaving']))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("")
print("The precision is strong for both classes(leaving and not leaving, but there was a big difference in the recall ratio. For employees not leaving, the recall is 0.94, indicating that the model correctly identified 94% of the employees who didn't leave. However, the model only correctly identified 57% of the employees who actually left.")
Performance on Testing Data: Accuracy: 0.8307127882599581 Classification Report: precision recall f1-score support not leaving 0.83 0.94 0.89 1330 leaving 0.82 0.57 0.67 578 accuracy 0.83 1908 macro avg 0.83 0.76 0.78 1908 weighted avg 0.83 0.83 0.82 1908 Confusion Matrix: [[1256 74] [ 249 329]] The precision is strong for both classes(leaving and not leaving, but there was a big difference in the recall ratio. For employees not leaving, the recall is 0.94, indicating that the model correctly identified 94% of the employees who didn't leave. However, the model only correctly identified 57% of the employees who actually left.
In [13]:
# Introduce class weights
In [14]:
# Train weighted SVM classifier with class weight
svm_classifier_weighted = SVC(kernel='rbf', class_weight='balanced', random_state=42)
svm_classifier_weighted.fit(X_train_scaled, y_train)
Out[14]:
SVC(class_weight='balanced', random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(class_weight='balanced', random_state=42)
In [15]:
# Predict on testing data
y_train_pred_weighted = svm_classifier_weighted.predict(X_train_scaled)
y_train_pred_weighted
Out[15]:
array([1, 0, 0, ..., 0, 0, 0])
In [16]:
# Evaluate the model
print("Performance on weighted Training Data:")
accuracy_weighted = accuracy_score(y_train, y_train_pred_weighted)
print("Accuracy:", accuracy_weighted)
print("")
print("Classification Report with Adjusted Class Weights:")
print(classification_report(y_train, y_train_pred_weighted, target_names = ['not leaving', 'leaving']))
print("\nConfusion Matrix with Adjusted Class Weights:")
print(confusion_matrix(y_train, y_train_pred_weighted))
Performance on weighted Training Data: Accuracy: 0.8026729559748428 Classification Report with Adjusted Class Weights: precision recall f1-score support not leaving 0.93 0.78 0.85 5426 leaving 0.61 0.86 0.72 2206 accuracy 0.80 7632 macro avg 0.77 0.82 0.78 7632 weighted avg 0.84 0.80 0.81 7632 Confusion Matrix with Adjusted Class Weights: [[4221 1205] [ 301 1905]]
In [17]:
y_test_pred_weighted = svm_classifier_weighted.predict(X_test_scaled)
y_test_pred_weighted
Out[17]:
array([1, 1, 1, ..., 0, 0, 1])
In [18]:
# Evaluate the model
print("Performance on weighted Testing Data:")
accuracy_weighted = accuracy_score(y_test, y_test_pred_weighted)
print("Accuracy:", accuracy_weighted)
print("")
print("Classification Report with Adjusted Class Weights:")
print(classification_report(y_test, y_test_pred_weighted, target_names = ['not leaving', 'leaving']))
print("\nConfusion Matrix with Adjusted Class Weights:")
print(confusion_matrix(y_test, y_test_pred_weighted))
print("")
print("Here we introduced class weights since there were many more employees who did not leave vs those who did leave. Doing this assigns higher weights to the minority class during training which will make the model pay more attention to correctly classify employees who are leaving. Adjusting class weights improved the recall ratio from 57% to 83% for the leaving class, which shows that the model has improved in being able to identify employees who are likely to leave. However, precision dropped from 82% to 59% for this same leaving class.")
Performance on weighted Testing Data: Accuracy: 0.7730607966457023 Classification Report with Adjusted Class Weights: precision recall f1-score support not leaving 0.91 0.75 0.82 1330 leaving 0.59 0.83 0.69 578 accuracy 0.77 1908 macro avg 0.75 0.79 0.76 1908 weighted avg 0.81 0.77 0.78 1908 Confusion Matrix with Adjusted Class Weights: [[994 336] [ 97 481]] Here we introduced class weights since there were many more employees who did not leave vs those who did leave. Doing this assigns higher weights to the minority class during training which will make the model pay more attention to correctly classify employees who are leaving. Adjusting class weights improved the recall ratio from 57% to 83% for the leaving class, which shows that the model has improved in being able to identify employees who are likely to leave. However, precision dropped from 82% to 59% for this same leaving class.
In [ ]: