In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
csv_path = 'employee_churn_data.csv'
turnover_df = pd.read_csv(csv_path)
turnover_df.head()
Out[2]:
department promoted review projects salary tenure satisfaction bonus avg_hrs_month left
0 operations 0 0.577569 3 low 5.0 0.626759 0 180.866070 no
1 operations 0 0.751900 3 medium 6.0 0.443679 0 182.708149 no
2 support 0 0.722548 3 medium 6.0 0.446823 0 184.416084 no
3 logistics 0 0.675158 4 high 8.0 0.440139 0 188.707545 no
4 sales 0 0.676203 3 high 5.0 0.577607 1 179.821083 no
In [3]:
turnover_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9540 entries, 0 to 9539
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   department     9540 non-null   object 
 1   promoted       9540 non-null   int64  
 2   review         9540 non-null   float64
 3   projects       9540 non-null   int64  
 4   salary         9540 non-null   object 
 5   tenure         9540 non-null   float64
 6   satisfaction   9540 non-null   float64
 7   bonus          9540 non-null   int64  
 8   avg_hrs_month  9540 non-null   float64
 9   left           9540 non-null   object 
dtypes: float64(4), int64(3), object(3)
memory usage: 745.4+ KB
In [4]:
#categorical columns turned into numerical
label_encoder = LabelEncoder()
turnover_df['salary'] = label_encoder.fit_transform(turnover_df['salary'])
turnover_df['department'] = label_encoder.fit_transform(turnover_df['department'])
turnover_df['left'] = label_encoder.fit_transform(turnover_df['left'])
In [5]:
# Split features and target variable
X = turnover_df.drop('left', axis=1)
y = turnover_df['left']
y
Out[5]:
0       0
1       0
2       0
3       0
4       0
       ..
9535    1
9536    1
9537    1
9538    1
9539    1
Name: left, Length: 9540, dtype: int32
In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [7]:
#scaling our data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
In [8]:
#creating model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                                random_state=1)
classifier
Out[8]:
LogisticRegression(max_iter=1000, random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=1000, random_state=1)
In [11]:
#fitting model on our unscaled data 
classifier.fit(X_train, y_train)
Out[11]:
LogisticRegression(max_iter=1000, random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=1000, random_state=1)
In [12]:
classifier.fit(X_train_scaled, y_train)
Out[12]:
LogisticRegression(max_iter=1000, random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=1000, random_state=1)
In [13]:
#getting predictions for our scaled test data
predictions = classifier.predict(X_test_scaled)
In [15]:
#test data set
print(f"Testing Accuracy Score: {accuracy_score(y_test, predictions)}")
print("\nClassification Report:")
print(classification_report(y_test, predictions, target_names = ['not leaving', 'leaving']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, predictions))
Testing Accuracy Score: 0.7311320754716981

Classification Report:
              precision    recall  f1-score   support

 not leaving       0.74      0.95      0.83      1330
     leaving       0.67      0.22      0.34       578

    accuracy                           0.73      1908
   macro avg       0.70      0.59      0.58      1908
weighted avg       0.72      0.73      0.68      1908


Confusion Matrix:
[[1265   65]
 [ 448  130]]
In [16]:
train_predictions = classifier.predict(X_train_scaled)
In [17]:
#training data reports
print(f' Training Accuracy Score: ', accuracy_score(y_train, train_predictions))
print("\nClassification Report:")
print(classification_report(y_train, train_predictions, target_names = ['not leaving', 'leaving']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_train, train_predictions))
 Training Accuracy Score:  0.7333595387840671

Classification Report:
              precision    recall  f1-score   support

 not leaving       0.75      0.94      0.83      5426
     leaving       0.61      0.22      0.32      2206

    accuracy                           0.73      7632
   macro avg       0.68      0.58      0.58      7632
weighted avg       0.71      0.73      0.69      7632


Confusion Matrix:
[[5108  318]
 [1717  489]]
In [18]:
#train importance df
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Get the feature importance (coefficients)
feature_importance = model.coef_[0]

# Create a DataFrame for easy viewing
feature_names = X.columns  # Assuming X is a DataFrame
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)
         Feature  Importance
2         review    0.933413
6   satisfaction    0.385084
8  avg_hrs_month    0.367450
4         salary    0.032709
0     department   -0.013433
3       projects   -0.014418
7          bonus   -0.024395
1       promoted   -0.097614
5         tenure   -0.108770
In [19]:
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
No description has been provided for this image
In [22]:
#test importance df
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

model = LogisticRegression()
model.fit(X_test_scaled, y_test)

# Get the feature importance (coefficients)
feature_importance = model.coef_[0]

# Create a DataFrame for easy viewing
feature_names = X.columns  # Assuming X is a DataFrame
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)
         Feature  Importance
2         review    1.023313
5         tenure    0.406741
6   satisfaction    0.404325
0     department    0.055922
4         salary    0.004134
7          bonus   -0.020784
1       promoted   -0.087659
3       projects   -0.144251
8  avg_hrs_month   -0.153577
In [23]:
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
No description has been provided for this image