# Spaces: Sleeping
# (Page-header residue captured with the source; not part of the program.)
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.model_selection import train_test_split | |
from sklearn.preprocessing import StandardScaler, LabelEncoder | |
from sklearn.impute import SimpleImputer | |
from sklearn.ensemble import ( | |
RandomForestClassifier, AdaBoostClassifier, | |
StackingClassifier, VotingClassifier | |
) | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.svm import SVC | |
from xgboost import XGBClassifier | |
from sklearn.metrics import ( | |
accuracy_score, roc_auc_score, | |
confusion_matrix, classification_report | |
) | |
from imblearn.over_sampling import SMOTE | |
class HRTurnoverPredictor:
    """Prepare an HR dataset and train several turnover classifiers on it.

    Construction runs the whole data pipeline: preprocessing, a stratified
    80/20 train/test split, standardization (scaler fitted on the training
    portion only) and SMOTE resampling of the scaled training portion. Models
    are created but not fitted until :meth:`train_models` is called.
    """

    # Numeric inputs, used as-is (missing values are median-imputed).
    NUMERIC_COLUMNS = (
        'Salary', 'PerfScoreID', 'EngagementSurvey', 'EmpSatisfaction',
        'SpecialProjectsCount', 'DaysLateLast30', 'Absences',
    )
    # Text inputs, converted to integer codes with a per-column LabelEncoder.
    CATEGORICAL_COLUMNS = (
        'MaritalDesc', 'CitizenDesc', 'RaceDesc',
        'Department', 'Position', 'RecruitmentSource',
    )
    # Column order of the feature matrix handed to every model.
    FEATURE_COLUMNS = NUMERIC_COLUMNS + CATEGORICAL_COLUMNS
    # Label column; cast to int before training.
    TARGET_COLUMN = 'Termd'

    def __init__(self, data):
        """Build the train/test sets and the (unfitted) model collection.

        Args:
            data: DataFrame holding every column in ``FEATURE_COLUMNS`` plus
                ``TARGET_COLUMN``.

        Raises:
            KeyError: If a required column is absent from ``data``.
            ValueError: If a feature column has no usable values, or if a
                downstream scikit-learn / imbalanced-learn step rejects the
                data.
        """
        self.df = data
        self.X, self.y = self.preprocess_data()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )
        # Fit the scaler on the training portion only, then reuse it for test.
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)
        # Resample only the training data; the test set stays untouched.
        self.smote = SMOTE(random_state=42)
        self.X_train_resampled, self.y_train_resampled = self.smote.fit_resample(
            self.X_train_scaled, self.y_train
        )
        self.models = self.initialize_models()
        self.results = None

    def preprocess_data(self):
        """Encode, select and impute the model inputs from ``self.df``.

        Returns:
            tuple: ``(X, y)`` where ``X`` is a DataFrame with the columns of
            ``FEATURE_COLUMNS`` (categoricals label-encoded, missing values
            replaced by the column median) and ``y`` is the target column
            cast to ``int``. Both keep the index of ``self.df``.

        Raises:
            KeyError: If any required column is missing; the message lists
                all of them.
            ValueError: If a feature column contains no non-missing values.
        """
        df = self.df.copy()

        required = [*self.FEATURE_COLUMNS, self.TARGET_COLUMN]
        missing = [col for col in required if col not in df.columns]
        if missing:
            raise KeyError(
                "Input data is missing required column(s): " + ", ".join(missing)
            )

        for col in self.CATEGORICAL_COLUMNS:
            # astype(str) turns NaN into the literal "nan", so missing
            # categories become their own encoded class.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        X = df[list(self.FEATURE_COLUMNS)]
        y = df[self.TARGET_COLUMN].astype(int)

        # SimpleImputer drops all-missing columns by default, which would make
        # the column labels below mismatch the imputed array. Fail clearly.
        unusable = [col for col in X.columns if X[col].isna().all()]
        if unusable:
            raise ValueError(
                "Feature column(s) contain no non-missing values: "
                + ", ".join(unusable)
            )

        imputer = SimpleImputer(strategy='median')
        # Keep the original index so X and y remain aligned by label as well
        # as by position.
        X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
        return X, y

    def initialize_models(self):
        """Create the unfitted estimators to be compared.

        Returns:
            dict: Display name -> estimator, in insertion order: six base
            classifiers, then ``'Stacking'`` and ``'Voting'``.
        """
        base_models = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
            'AdaBoost': AdaBoostClassifier(random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

        # Snapshot the six base models for the voting ensemble before any
        # ensemble is added, rather than slicing the dict afterwards.
        voting_estimators = list(base_models.items())

        stacking_estimators = [
            ('lr', LogisticRegression(random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
            ('xgb', XGBClassifier(random_state=42))
        ]
        base_models['Stacking'] = StackingClassifier(
            estimators=stacking_estimators,
            final_estimator=LogisticRegression(),
            cv=5
        )
        base_models['Voting'] = VotingClassifier(
            estimators=voting_estimators,
            voting='soft'
        )
        return base_models

    def train_models(self):
        """Fit every model on the resampled training set and score it.

        Each model is evaluated on the scaled (not resampled) test set.

        Returns:
            dict: Model name -> dict with keys ``'Accuracy'``, ``'ROC AUC'``,
            ``'Confusion Matrix'``, ``'Classification Report'`` (as a dict)
            and ``'Predicted Probabilities'``. The same mapping is stored in
            ``self.results``.
        """
        results = {}
        for name, model in self.models.items():
            model.fit(self.X_train_resampled, self.y_train_resampled)
            y_pred = model.predict(self.X_test_scaled)
            # Column 1 of predict_proba; presumably the Termd == 1 class.
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]
            results[name] = {
                'Accuracy': accuracy_score(self.y_test, y_pred),
                'ROC AUC': roc_auc_score(self.y_test, y_pred_proba),
                'Confusion Matrix': confusion_matrix(self.y_test, y_pred),
                'Classification Report': classification_report(
                    self.y_test, y_pred, output_dict=True
                ),
                'Predicted Probabilities': y_pred_proba
            }
        self.results = results
        return results

    def get_feature_importance(self):
        """Return the Random Forest feature importances, highest first.

        Returns:
            pandas.DataFrame | None: Columns ``'Feature'`` and
            ``'Importance'`` sorted by descending importance, or ``None``
            when the model exposes no ``feature_importances_`` attribute
            (presumably the case before ``train_models`` has fitted it).
        """
        rf_model = self.models['Random Forest']
        if not hasattr(rf_model, "feature_importances_"):
            return None
        importance_table = pd.DataFrame({
            "Feature": self.X.columns,
            "Importance": rf_model.feature_importances_
        })
        return importance_table.sort_values(by="Importance", ascending=False)
def _render_figure(fig):
    """Show a Matplotlib figure in Streamlit, then release it."""
    fig.tight_layout()
    st.pyplot(fig)
    # Close explicitly: figures created via pyplot stay registered until
    # closed, and Streamlit re-executes this script on every interaction.
    plt.close(fig)


def main():
    """Run the Streamlit dashboard for employee-turnover model comparison.

    Waits for a CSV upload in the sidebar, builds an ``HRTurnoverPredictor``
    from it, trains all models and renders four tabs: a performance table and
    bar chart, per-model confusion matrices, Random Forest feature
    importances, and per-model classification reports. Returns ``None``;
    renders nothing beyond the title and sidebar until a file is uploaded.
    """
    st.set_page_config(page_title="HR Turnover Prediction", layout="wide")
    st.title("🏢 Employee Turnover Prediction Dashboard")

    # Sidebar for file upload
    st.sidebar.header("Upload HR Dataset")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    # pandas' EmptyDataError and ParserError both derive from ValueError.
    try:
        df = pd.read_csv(uploaded_file)
    except ValueError as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return
    st.sidebar.success("File successfully uploaded!")

    # Report unusable data (e.g. missing columns) in the UI instead of
    # letting the script die with a traceback.
    try:
        predictor = HRTurnoverPredictor(df)
    except (KeyError, ValueError) as exc:
        # str(KeyError) adds quotes around the message, so use the raw arg.
        detail = exc.args[0] if exc.args else exc
        st.error(
            "Could not prepare the uploaded dataset "
            f"(missing or invalid column data): {detail}"
        )
        return

    tab1, tab2, tab3, tab4 = st.tabs([
        "Model Performance",
        "Confusion Matrices",
        "Feature Importance",
        "Model Insights"
    ])

    with tab1:
        st.header("Model Performance Comparison")
        # Training happens here; the later tabs reuse `results`.
        results = predictor.train_models()

        perf_df = pd.DataFrame([
            {
                'Model': model_name,
                'Accuracy': metrics['Accuracy'],
                'ROC AUC': metrics['ROC AUC']
            }
            for model_name, metrics in results.items()
        ]).sort_values('ROC AUC', ascending=False)
        st.dataframe(perf_df)

        fig, ax = plt.subplots(figsize=(10, 6))
        perf_df.plot(x='Model', y=['Accuracy', 'ROC AUC'], kind='bar', ax=ax)
        ax.set_title("Model Performance Comparison")
        ax.set_xlabel("Model")
        ax.set_ylabel("Score")
        _render_figure(fig)

    with tab2:
        st.header("Confusion Matrices")
        # Size the grid from the number of models instead of assuming 3x3.
        n_cols = 3
        n_rows = max(1, -(-len(results) // n_cols))  # ceiling division
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
        )
        axes = axes.ravel()
        for ax, (model_name, metrics) in zip(axes, results.items()):
            sns.heatmap(
                metrics['Confusion Matrix'],
                annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax
            )
            ax.set_title(f"{model_name} Confusion Matrix")
            ax.set_xlabel("Predicted Label")
            ax.set_ylabel("True Label")
        # Hide grid cells that have no model to show.
        for ax in axes[len(results):]:
            ax.set_visible(False)
        _render_figure(fig)

    with tab3:
        st.header("Feature Importance")
        feature_importance = predictor.get_feature_importance()
        if feature_importance is not None:
            st.dataframe(feature_importance)

            fig, ax = plt.subplots(figsize=(10, 6))
            feature_importance.plot(x='Feature', y='Importance', kind='bar', ax=ax)
            ax.set_title("Random Forest Feature Importance")
            ax.set_xlabel("Features")
            ax.set_ylabel("Importance")
            _render_figure(fig)

    with tab4:
        st.header("Model Insights")
        # One classification report table per model.
        for model_name, metrics in results.items():
            st.subheader(f"{model_name} Classification Report")
            report_df = pd.DataFrame(metrics['Classification Report']).transpose()
            st.dataframe(report_df)
# Entry point: start the dashboard only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()