Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
9 |
+
from sklearn.impute import SimpleImputer
|
10 |
+
from sklearn.ensemble import (
|
11 |
+
RandomForestClassifier, AdaBoostClassifier,
|
12 |
+
StackingClassifier, VotingClassifier
|
13 |
+
)
|
14 |
+
from sklearn.tree import DecisionTreeClassifier
|
15 |
+
from sklearn.linear_model import LogisticRegression
|
16 |
+
from sklearn.svm import SVC
|
17 |
+
from xgboost import XGBClassifier
|
18 |
+
|
19 |
+
from sklearn.metrics import (
|
20 |
+
accuracy_score, roc_auc_score,
|
21 |
+
confusion_matrix, classification_report
|
22 |
+
)
|
23 |
+
from imblearn.over_sampling import SMOTE
|
24 |
+
|
25 |
+
class HRTurnoverPredictor:
    """Train and compare several classifiers on an HR turnover dataset.

    Construction prepares the data end to end: categorical columns are
    label-encoded, missing values are median-imputed, the rows are split
    80/20 into stratified train/test sets, features are standardised, and
    the training portion is oversampled with SMOTE.  ``train_models`` then
    fits every model in ``self.models`` and stores its test-set metrics in
    ``self.results``.

    Args:
        data: HR records.  Must contain every column listed in
            ``NUMERIC_COLUMNS`` and ``CATEGORICAL_COLUMNS`` plus the
            ``TARGET_COLUMN`` label.

    Raises:
        ValueError: If ``data`` lacks any required column.
    """

    # Feature columns passed through unchanged (apart from imputation/scaling).
    NUMERIC_COLUMNS = (
        'Salary', 'PerfScoreID', 'EngagementSurvey', 'EmpSatisfaction',
        'SpecialProjectsCount', 'DaysLateLast30', 'Absences',
    )
    # Feature columns converted to integer codes with LabelEncoder.
    CATEGORICAL_COLUMNS = (
        'MaritalDesc', 'CitizenDesc', 'RaceDesc',
        'Department', 'Position', 'RecruitmentSource',
    )
    # Label column; cast to int.  The ROC AUC computation in train_models
    # uses the probability of class 1, so a binary label is expected.
    TARGET_COLUMN = 'Termd'

    def __init__(self, data: pd.DataFrame):
        self.df = data
        self.X, self.y = self.preprocess_data()

        # Stratify on the label so both splits keep the same class ratio.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        # The scaler is fitted on the training rows only and then reused,
        # unchanged, for the test rows.
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        # Only the training set is oversampled; the test set is evaluated
        # with its original class distribution.
        self.smote = SMOTE(random_state=42)
        self.X_train_resampled, self.y_train_resampled = self.smote.fit_resample(
            self.X_train_scaled, self.y_train
        )

        self.models = self.initialize_models()
        self.results = None  # populated by train_models()

    def preprocess_data(self) -> tuple:
        """Build the feature matrix and label vector from ``self.df``.

        Returns:
            A ``(X, y)`` tuple: ``X`` is a DataFrame holding the numeric
            columns followed by the encoded categorical columns, with
            missing values replaced by the column median; ``y`` is the
            ``TARGET_COLUMN`` cast to int.

        Raises:
            ValueError: If ``self.df`` lacks any required column.
        """
        features = [*self.NUMERIC_COLUMNS, *self.CATEGORICAL_COLUMNS]

        # Fail early with a readable message instead of a bare KeyError
        # from deep inside pandas.
        missing = [
            col for col in (*features, self.TARGET_COLUMN)
            if col not in self.df.columns
        ]
        if missing:
            raise ValueError(
                "Dataset is missing required column(s): " + ", ".join(missing)
            )

        df = self.df.copy()
        for col in self.CATEGORICAL_COLUMNS:
            # astype(str) turns missing values into the literal string
            # 'nan', so they are encoded as a category of their own.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        X = df[features]
        y = df[self.TARGET_COLUMN].astype(int)

        # NOTE: the imputer is fitted on all rows, i.e. before the
        # train/test split performed in __init__.
        imputer = SimpleImputer(strategy='median')
        # Keep the original index so X stays aligned with y by label.
        X = pd.DataFrame(
            imputer.fit_transform(X), columns=X.columns, index=X.index
        )

        return X, y

    def initialize_models(self) -> dict:
        """Create the unfitted models to be compared.

        Returns:
            A dict mapping a display name to an estimator: six individual
            classifiers, then a ``'Stacking'`` ensemble and a soft
            ``'Voting'`` ensemble built from the six individual models.
        """
        base_models = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
            'AdaBoost': AdaBoostClassifier(random_state=42),
            'SVM': SVC(probability=True, random_state=42),
        }

        # Capture the individual models now, before any ensemble is added,
        # so the voting ensemble never depends on dict ordering to leave
        # the stacking ensemble out.
        voting_members = list(base_models.items())

        base_estimators = [
            ('lr', LogisticRegression(random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
            ('xgb', XGBClassifier(random_state=42)),
        ]
        base_models['Stacking'] = StackingClassifier(
            estimators=base_estimators,
            final_estimator=LogisticRegression(),
            cv=5,
        )

        base_models['Voting'] = VotingClassifier(
            estimators=voting_members,
            voting='soft',
        )

        return base_models

    def train_models(self) -> dict:
        """Fit every model and evaluate it on the held-out test set.

        Each model is fitted on the SMOTE-resampled, scaled training data
        and scored on the scaled test data.

        Returns:
            A dict mapping model name to a dict with the keys
            ``'Accuracy'``, ``'ROC AUC'``, ``'Confusion Matrix'``,
            ``'Classification Report'`` (as a dict) and
            ``'Predicted Probabilities'`` (probability of class 1).
            The same dict is stored in ``self.results``.
        """
        results = {}

        for name, model in self.models.items():
            model.fit(self.X_train_resampled, self.y_train_resampled)
            y_pred = model.predict(self.X_test_scaled)
            # Column 1 holds the probability of the positive class.
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]

            results[name] = {
                'Accuracy': accuracy_score(self.y_test, y_pred),
                'ROC AUC': roc_auc_score(self.y_test, y_pred_proba),
                'Confusion Matrix': confusion_matrix(self.y_test, y_pred),
                'Classification Report': classification_report(
                    self.y_test, y_pred, output_dict=True
                ),
                'Predicted Probabilities': y_pred_proba,
            }

        self.results = results
        return results

    def get_feature_importance(self):
        """Return the Random Forest feature importances, highest first.

        Returns:
            A DataFrame with ``'Feature'`` and ``'Importance'`` columns
            sorted by descending importance, or ``None`` when the
            ``'Random Forest'`` model exposes no ``feature_importances_``
            attribute.
        """
        rf_model = self.models['Random Forest']
        importances = getattr(rf_model, "feature_importances_", None)
        if importances is None:
            return None

        feature_importance = pd.DataFrame({
            "Feature": self.X.columns,
            "Importance": importances,
        })
        return feature_importance.sort_values(by="Importance", ascending=False)
|
129 |
+
|
130 |
+
def _show_figure(fig):
    """Render ``fig`` with Streamlit, then close it.

    pyplot keeps a reference to every figure it creates until that figure
    is closed, so figures left open accumulate for the life of the process.
    """
    st.pyplot(fig)
    plt.close(fig)


def _render_performance(results):
    """Show the accuracy / ROC AUC table and bar chart for all models."""
    perf_df = pd.DataFrame([
        {
            'Model': model_name,
            'Accuracy': metrics['Accuracy'],
            'ROC AUC': metrics['ROC AUC'],
        }
        for model_name, metrics in results.items()
    ]).sort_values('ROC AUC', ascending=False)

    st.dataframe(perf_df)

    fig, ax = plt.subplots(figsize=(10, 6))
    perf_df.plot(x='Model', y=['Accuracy', 'ROC AUC'], kind='bar', ax=ax)
    ax.set_title("Model Performance Comparison")
    ax.set_xlabel("Model")
    ax.set_ylabel("Score")
    fig.tight_layout()
    _show_figure(fig)


def _render_confusion_matrices(results, n_cols=3):
    """Draw one confusion-matrix heatmap per model on a grid of axes."""
    # Ceiling division: just enough rows to hold every model.
    n_rows = max(1, -(-len(results) // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
    )
    axes = axes.ravel()

    for ax, (model_name, metrics) in zip(axes, results.items()):
        sns.heatmap(
            metrics['Confusion Matrix'],
            annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax,
        )
        ax.set_title(f"{model_name} Confusion Matrix")
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")

    # Grid cells without a model would otherwise show as empty axes.
    for ax in axes[len(results):]:
        ax.set_visible(False)

    fig.tight_layout()
    _show_figure(fig)


def _render_feature_importance(predictor):
    """Show the Random Forest feature-importance table and bar chart."""
    feature_importance = predictor.get_feature_importance()
    if feature_importance is None:
        st.info("Feature importance is not available.")
        return

    st.dataframe(feature_importance)

    fig, ax = plt.subplots(figsize=(10, 6))
    feature_importance.plot(x='Feature', y='Importance', kind='bar', ax=ax)
    ax.set_title("Random Forest Feature Importance")
    ax.set_xlabel("Features")
    ax.set_ylabel("Importance")
    fig.tight_layout()
    _show_figure(fig)


def _render_classification_reports(results):
    """Show each model's classification report as a table."""
    for model_name, metrics in results.items():
        st.subheader(f"{model_name} Classification Report")
        report_df = pd.DataFrame(metrics['Classification Report']).transpose()
        st.dataframe(report_df)


def main():
    """Run the Streamlit dashboard.

    Reads an HR dataset uploaded through the sidebar, trains every model
    of ``HRTurnoverPredictor`` on it and presents the results in four tabs:
    model performance, confusion matrices, feature importance and
    per-model classification reports.
    """
    st.set_page_config(page_title="HR Turnover Prediction", layout="wide")

    st.title("🏢 Employee Turnover Prediction Dashboard")

    # Sidebar for file upload
    st.sidebar.header("Upload HR Dataset")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")

    if uploaded_file is None:
        st.info("Upload an HR dataset (CSV) from the sidebar to get started.")
        return

    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.sidebar.error(f"Could not read the uploaded file: {exc}")
        return
    st.sidebar.success("File successfully uploaded!")

    # Missing columns or data that cannot be split/resampled surface here;
    # report them in the page instead of crashing the app.
    try:
        predictor = HRTurnoverPredictor(df)
    except (KeyError, ValueError) as exc:
        st.error(f"The dataset could not be prepared for modelling: {exc}")
        return

    # Every tab needs the results, so train once up front.
    with st.spinner("Training models..."):
        results = predictor.train_models()

    tab1, tab2, tab3, tab4 = st.tabs([
        "Model Performance",
        "Confusion Matrices",
        "Feature Importance",
        "Model Insights",
    ])

    with tab1:
        st.header("Model Performance Comparison")
        _render_performance(results)

    with tab2:
        st.header("Confusion Matrices")
        _render_confusion_matrices(results)

    with tab3:
        st.header("Feature Importance")
        _render_feature_importance(predictor)

    with tab4:
        st.header("Model Insights")
        _render_classification_reports(results)
|
225 |
+
|
226 |
+
# Start the dashboard only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
|