Rooobert committed on
Commit
d3a3afe
·
verified ·
1 Parent(s): d005659

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -0
app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
9
+ from sklearn.impute import SimpleImputer
10
+ from sklearn.ensemble import (
11
+ RandomForestClassifier, AdaBoostClassifier,
12
+ StackingClassifier, VotingClassifier
13
+ )
14
+ from sklearn.tree import DecisionTreeClassifier
15
+ from sklearn.linear_model import LogisticRegression
16
+ from sklearn.svm import SVC
17
+ from xgboost import XGBClassifier
18
+
19
+ from sklearn.metrics import (
20
+ accuracy_score, roc_auc_score,
21
+ confusion_matrix, classification_report
22
+ )
23
+ from imblearn.over_sampling import SMOTE
24
+
25
class HRTurnoverPredictor:
    """Train and compare several classifiers on an HR dataset.

    The constructor runs the whole data pipeline: label-encode the
    categorical columns, split into train/test (80/20, stratified on the
    target), median-impute and standardize using statistics learned from the
    training split only, then oversample the training split with SMOTE.
    Models are created in ``__init__`` but only fitted by ``train_models``.

    Attributes set by ``__init__``:
        df: the DataFrame passed in (never mutated).
        X, y: encoded feature frame (may still contain NaN) and int target.
        X_train, X_test: imputed feature frames for each split.
        X_train_scaled, X_test_scaled: standardized arrays for each split.
        X_train_resampled, y_train_resampled: SMOTE output for training.
        models: mapping of display name -> unfitted estimator.
        results: ``None`` until ``train_models`` has run.
    """

    # Text-valued columns that are label-encoded before modelling.
    CATEGORICAL_COLS = (
        'MaritalDesc', 'CitizenDesc', 'RaceDesc',
        'Department', 'Position', 'RecruitmentSource',
    )
    # Columns used as-is; they must be numeric for median imputation.
    NUMERIC_COLS = (
        'Salary', 'PerfScoreID', 'EngagementSurvey', 'EmpSatisfaction',
        'SpecialProjectsCount', 'DaysLateLast30', 'Absences',
    )
    # Target column; cast to int, presumably a 0/1 flag -- confirm with data.
    TARGET_COL = 'Termd'

    def __init__(self, data: pd.DataFrame):
        """Run preprocessing, splitting, scaling and resampling on *data*.

        Raises:
            KeyError: if *data* lacks any required column.
        """
        self.df = data
        self.X, self.y = self.preprocess_data()

        X_train_raw, X_test_raw, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        # Fit the imputer on the training split only, so the medians used to
        # fill the test split are not influenced by test rows (no leakage).
        self.imputer = SimpleImputer(strategy='median')
        self.X_train = pd.DataFrame(
            self.imputer.fit_transform(X_train_raw),
            columns=X_train_raw.columns,
            index=X_train_raw.index,
        )
        self.X_test = pd.DataFrame(
            self.imputer.transform(X_test_raw),
            columns=X_test_raw.columns,
            index=X_test_raw.index,
        )

        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        # Oversampling is applied to the training split only; the test split
        # keeps its original class balance.
        self.smote = SMOTE(random_state=42)
        self.X_train_resampled, self.y_train_resampled = self.smote.fit_resample(
            self.X_train_scaled, self.y_train
        )

        self.models = self.initialize_models()
        self.results = None

    def preprocess_data(self):
        """Return ``(X, y)`` built from ``self.df``.

        Categorical columns are cast to ``str`` and label-encoded. Missing
        values in X are NOT filled here; imputation happens after the
        train/test split in ``__init__``.

        Returns:
            tuple: feature DataFrame (numeric columns first, then encoded
            categorical columns) and the target Series cast to int. Both
            keep the row index of ``self.df``.

        Raises:
            KeyError: listing every required column absent from ``self.df``.
        """
        feature_cols = [*self.NUMERIC_COLS, *self.CATEGORICAL_COLS]
        missing = [
            col for col in [*feature_cols, self.TARGET_COL]
            if col not in self.df.columns
        ]
        if missing:
            raise KeyError(
                f"Dataset is missing required column(s): {', '.join(missing)}"
            )

        df = self.df.copy()
        for col in self.CATEGORICAL_COLS:
            # astype(str) turns NaN into the literal 'nan' category so the
            # encoder never sees mixed types.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        X = df[feature_cols]
        y = df[self.TARGET_COL].astype(int)
        return X, y

    def initialize_models(self):
        """Build the unfitted estimators keyed by display name.

        Returns:
            dict: six individual models followed by a 'Stacking' ensemble
            (LR + RF + XGBoost, logistic-regression meta-learner, 5-fold CV)
            and a soft-'Voting' ensemble over the six individual models.
        """
        base_models = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
            'AdaBoost': AdaBoostClassifier(random_state=42),
            'SVM': SVC(probability=True, random_state=42),
        }

        # Snapshot the individual models before any ensemble is added, so the
        # voting ensemble never depends on dict ordering or slicing.
        voting_estimators = list(base_models.items())

        base_models['Stacking'] = StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(random_state=42)),
                ('rf', RandomForestClassifier(random_state=42)),
                ('xgb', XGBClassifier(random_state=42)),
            ],
            final_estimator=LogisticRegression(),
            cv=5,
        )
        base_models['Voting'] = VotingClassifier(
            estimators=voting_estimators,
            voting='soft',
        )
        return base_models

    def train_models(self):
        """Fit every model on the resampled training data and score it.

        Returns:
            dict: display name -> metrics dict with keys 'Accuracy',
            'ROC AUC', 'Confusion Matrix', 'Classification Report' (as a
            dict) and 'Predicted Probabilities' (column 1 of
            ``predict_proba`` on the scaled test split). The same dict is
            stored on ``self.results``.
        """
        results = {}

        for name, model in self.models.items():
            model.fit(self.X_train_resampled, self.y_train_resampled)
            y_pred = model.predict(self.X_test_scaled)
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]

            results[name] = {
                'Accuracy': accuracy_score(self.y_test, y_pred),
                'ROC AUC': roc_auc_score(self.y_test, y_pred_proba),
                'Confusion Matrix': confusion_matrix(self.y_test, y_pred),
                'Classification Report': classification_report(
                    self.y_test, y_pred, output_dict=True
                ),
                'Predicted Probabilities': y_pred_proba,
            }

        self.results = results
        return results

    def get_feature_importance(self):
        """Return the 'Random Forest' model's feature importances.

        Returns:
            pandas.DataFrame with columns 'Feature' and 'Importance', sorted
            by importance descending, or ``None`` when the model exposes no
            ``feature_importances_`` attribute.
        """
        rf_model = self.models['Random Forest']
        importances = getattr(rf_model, "feature_importances_", None)
        if importances is None:
            return None

        return pd.DataFrame({
            "Feature": self.X.columns,
            "Importance": importances,
        }).sort_values(by="Importance", ascending=False)
129
+
130
def main():
    """Render the Streamlit dashboard for employee-turnover prediction.

    Waits for a CSV upload in the sidebar, builds an ``HRTurnoverPredictor``
    from it, trains every model once, and shows four tabs: a metric
    comparison, confusion matrices, Random Forest feature importance, and
    per-model classification reports. Nothing beyond the title and the
    uploader is rendered until a file is provided.
    """
    st.set_page_config(page_title="HR Turnover Prediction", layout="wide")

    st.title("🏢 Employee Turnover Prediction Dashboard")

    # Sidebar for file upload
    st.sidebar.header("Upload HR Dataset")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    df = pd.read_csv(uploaded_file)
    st.sidebar.success("File successfully uploaded!")

    # Build the pipeline and train up front so every tab can share `results`.
    # A CSV with missing columns or unusable values is reported to the user
    # instead of surfacing as a raw traceback.
    try:
        predictor = HRTurnoverPredictor(df)
        results = predictor.train_models()
    except (KeyError, ValueError) as exc:
        st.error(f"Could not build models from this file: {exc}")
        return

    tab1, tab2, tab3, tab4 = st.tabs([
        "Model Performance",
        "Confusion Matrices",
        "Feature Importance",
        "Model Insights"
    ])

    with tab1:
        st.header("Model Performance Comparison")

        perf_df = pd.DataFrame([
            {
                'Model': model_name,
                'Accuracy': metrics['Accuracy'],
                'ROC AUC': metrics['ROC AUC']
            }
            for model_name, metrics in results.items()
        ]).sort_values('ROC AUC', ascending=False)

        st.dataframe(perf_df)

        # Use the Axes API rather than pyplot's global "current figure"
        # state, and close each figure once rendered so figures do not pile
        # up in pyplot's registry across Streamlit reruns.
        fig, ax = plt.subplots(figsize=(10, 6))
        perf_df.plot(x='Model', y=['Accuracy', 'ROC AUC'], kind='bar', ax=ax)
        ax.set_title("Model Performance Comparison")
        ax.set_xlabel("Model")
        ax.set_ylabel("Score")
        fig.tight_layout()
        st.pyplot(fig)
        plt.close(fig)

    with tab2:
        st.header("Confusion Matrices")

        # Size the grid from the number of models (3 per row) instead of a
        # fixed 3x3, so adding or removing models cannot overflow it.
        n_models = len(results)
        n_cols = 3
        n_rows = max(1, (n_models + n_cols - 1) // n_cols)
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False
        )
        axes = axes.ravel()

        for ax, (model_name, metrics) in zip(axes, results.items()):
            cm = metrics['Confusion Matrix']
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
            ax.set_title(f"{model_name} Confusion Matrix")
            ax.set_xlabel("Predicted Label")
            ax.set_ylabel("True Label")

        # Hide grid cells that have no model to show.
        for ax in axes[n_models:]:
            ax.set_visible(False)

        fig.tight_layout()
        st.pyplot(fig)
        plt.close(fig)

    with tab3:
        st.header("Feature Importance")

        feature_importance = predictor.get_feature_importance()
        if feature_importance is not None:
            st.dataframe(feature_importance)

            fig, ax = plt.subplots(figsize=(10, 6))
            feature_importance.plot(x='Feature', y='Importance', kind='bar', ax=ax)
            ax.set_title("Random Forest Feature Importance")
            ax.set_xlabel("Features")
            ax.set_ylabel("Importance")
            fig.tight_layout()
            st.pyplot(fig)
            plt.close(fig)

    with tab4:
        st.header("Model Insights")

        # One classification-report table per model.
        for model_name, metrics in results.items():
            st.subheader(f"{model_name} Classification Report")
            report_df = pd.DataFrame(metrics['Classification Report']).transpose()
            st.dataframe(report_df)


if __name__ == '__main__':
    main()