"""Streamlit dashboard that trains several classifiers to predict employee turnover."""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)
from imblearn.over_sampling import SMOTE


class HRTurnoverPredictor:
    """Prepare an HR dataset and train/evaluate a set of turnover classifiers.

    Construction runs the whole data pipeline eagerly: preprocessing, an
    80/20 stratified split, standard scaling (fitted on the training split
    only), and SMOTE oversampling of the scaled training split.  Models are
    only instantiated here; call :meth:`train_models` to fit them.
    """

    # Columns that are label-encoded before modelling.
    CATEGORICAL_COLUMNS = (
        'MaritalDesc', 'CitizenDesc', 'RaceDesc',
        'Department', 'Position', 'RecruitmentSource',
    )
    # Columns used as model inputs, in the order they appear in ``self.X``.
    FEATURE_COLUMNS = (
        'Salary', 'PerfScoreID', 'EngagementSurvey', 'EmpSatisfaction',
        'SpecialProjectsCount', 'DaysLateLast30', 'Absences',
        'MaritalDesc', 'CitizenDesc', 'RaceDesc',
        'Department', 'Position', 'RecruitmentSource',
    )
    # Column holding the label; it is cast to int in preprocess_data.
    TARGET_COLUMN = 'Termd'

    def __init__(self, data):
        """Build the train/test splits and model zoo from *data*.

        Args:
            data: DataFrame containing every column returned by
                :meth:`required_columns`.

        Raises:
            KeyError: if a required column is missing from *data*.
        """
        self.df = data
        self.X, self.y = self.preprocess_data()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        # The scaler is fitted on the training split only and then applied
        # to the test split.
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        # Oversampling is applied to the training split only; the test split
        # is left untouched.
        self.smote = SMOTE(random_state=42)
        self.X_train_resampled, self.y_train_resampled = self.smote.fit_resample(
            self.X_train_scaled, self.y_train
        )

        self.models = self.initialize_models()
        self.results = None

    @classmethod
    def required_columns(cls):
        """Return the list of columns the input DataFrame must contain."""
        return [*cls.FEATURE_COLUMNS, cls.TARGET_COLUMN]

    def preprocess_data(self):
        """Encode, select, and impute the raw data.

        Returns:
            A ``(X, y)`` tuple: ``X`` is a DataFrame of the feature columns
            with missing values replaced by the column median, ``y`` is the
            target column cast to ``int``.
        """
        df = self.df.copy()

        # Values are stringified before encoding, so missing entries are
        # encoded as their own category rather than raising.
        for col in self.CATEGORICAL_COLUMNS:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        # A list (not the tuple constant) is required for column selection.
        X = df[list(self.FEATURE_COLUMNS)]
        y = df[self.TARGET_COLUMN].astype(int)

        # NOTE(review): the encoders and imputer are fitted on the full
        # dataset before the train/test split, which leaks test-set
        # statistics into training.  Left as-is to keep reported metrics
        # unchanged; consider moving into a per-split pipeline.
        imputer = SimpleImputer(strategy='median')
        X = pd.DataFrame(
            imputer.fit_transform(X),
            columns=X.columns,
            index=X.index,  # keep X label-aligned with y
        )
        return X, y

    def initialize_models(self):
        """Create the unfitted models, keyed by display name.

        Returns:
            Dict with six base classifiers plus a ``'Stacking'`` and a
            ``'Voting'`` ensemble, in that insertion order.
        """
        base_models = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
            'AdaBoost': AdaBoostClassifier(random_state=42),
            'SVM': SVC(probability=True, random_state=42),
        }

        # Snapshot the base learners now, before any ensemble is added, so
        # the voting ensemble never includes the stacking model.
        voting_estimators = list(base_models.items())

        base_estimators = [
            ('lr', LogisticRegression(random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
            ('xgb', XGBClassifier(random_state=42)),
        ]
        base_models['Stacking'] = StackingClassifier(
            estimators=base_estimators,
            final_estimator=LogisticRegression(),
            cv=5,
        )
        base_models['Voting'] = VotingClassifier(
            estimators=voting_estimators,
            voting='soft',
        )
        return base_models

    def train_models(self):
        """Fit every model and evaluate it on the held-out test split.

        Models are fitted on the SMOTE-resampled, scaled training data and
        scored on the scaled test data.

        Returns:
            Dict mapping model name to a dict with keys ``'Accuracy'``,
            ``'ROC AUC'``, ``'Confusion Matrix'``, ``'Classification Report'``
            and ``'Predicted Probabilities'``.  The same dict is stored on
            ``self.results``.
        """
        results = {}
        for name, model in self.models.items():
            model.fit(self.X_train_resampled, self.y_train_resampled)
            y_pred = model.predict(self.X_test_scaled)
            # Column 1 of predict_proba is used as the score for ROC AUC.
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]

            results[name] = {
                'Accuracy': accuracy_score(self.y_test, y_pred),
                'ROC AUC': roc_auc_score(self.y_test, y_pred_proba),
                'Confusion Matrix': confusion_matrix(self.y_test, y_pred),
                'Classification Report': classification_report(
                    self.y_test, y_pred, output_dict=True
                ),
                'Predicted Probabilities': y_pred_proba,
            }

        self.results = results
        return results

    def get_feature_importance(self):
        """Return Random Forest feature importances, most important first.

        Returns:
            DataFrame with ``Feature`` and ``Importance`` columns, or ``None``
            when the Random Forest model exposes no ``feature_importances_``
            attribute.
        """
        rf_model = self.models['Random Forest']
        if not hasattr(rf_model, "feature_importances_"):
            return None

        return pd.DataFrame({
            "Feature": self.X.columns,
            "Importance": rf_model.feature_importances_,
        }).sort_values(by="Importance", ascending=False)


def _show_figure(fig):
    """Render *fig* in Streamlit, then close it so pyplot does not retain it."""
    st.pyplot(fig)
    # Figures made via plt.subplots stay registered with pyplot until closed;
    # without this they pile up on every Streamlit rerun.
    plt.close(fig)


def _render_performance(results):
    """Show the accuracy / ROC AUC table and bar chart for all models."""
    perf_df = pd.DataFrame([
        {
            'Model': model_name,
            'Accuracy': metrics['Accuracy'],
            'ROC AUC': metrics['ROC AUC'],
        }
        for model_name, metrics in results.items()
    ]).sort_values('ROC AUC', ascending=False)

    st.dataframe(perf_df)

    fig, ax = plt.subplots(figsize=(10, 6))
    perf_df.plot(x='Model', y=['Accuracy', 'ROC AUC'], kind='bar', ax=ax)
    ax.set_title("Model Performance Comparison")
    ax.set_xlabel("Model")
    ax.set_ylabel("Score")
    fig.tight_layout()
    _show_figure(fig)


def _render_confusion_matrices(results, n_cols=3):
    """Draw one confusion-matrix heatmap per model on a grid *n_cols* wide."""
    n_models = len(results)
    if n_models == 0:
        return

    # Ceiling division: enough rows for every model, however many there are.
    n_rows = (n_models + n_cols - 1) // n_cols
    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(5 * n_cols, 5 * n_rows),
        squeeze=False,  # always 2-D, so ravel() works for any grid shape
    )
    axes = axes.ravel()

    for ax, (model_name, metrics) in zip(axes, results.items()):
        sns.heatmap(metrics['Confusion Matrix'], annot=True, fmt="d",
                    cmap="Blues", cbar=False, ax=ax)
        ax.set_title(f"{model_name} Confusion Matrix")
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")

    # Hide grid cells that have no model so no empty axes are displayed.
    for ax in axes[n_models:]:
        ax.set_visible(False)

    fig.tight_layout()
    _show_figure(fig)


def _render_feature_importance(predictor):
    """Show the Random Forest feature-importance table and bar chart."""
    feature_importance = predictor.get_feature_importance()
    if feature_importance is None:
        return

    st.dataframe(feature_importance)

    fig, ax = plt.subplots(figsize=(10, 6))
    feature_importance.plot(x='Feature', y='Importance', kind='bar', ax=ax)
    ax.set_title("Random Forest Feature Importance")
    ax.set_xlabel("Features")
    ax.set_ylabel("Importance")
    fig.tight_layout()
    _show_figure(fig)


def _render_classification_reports(results):
    """Show each model's classification report as a table."""
    for model_name, metrics in results.items():
        st.subheader(f"{model_name} Classification Report")
        report_df = pd.DataFrame(metrics['Classification Report']).transpose()
        st.dataframe(report_df)


def main():
    """Run the Streamlit app: upload a CSV, train the models, show results."""
    st.set_page_config(page_title="HR Turnover Prediction", layout="wide")
    st.title("🏢 Employee Turnover Prediction Dashboard")

    # Sidebar for file upload
    st.sidebar.header("Upload HR Dataset")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    df = pd.read_csv(uploaded_file)
    st.sidebar.success("File successfully uploaded!")

    # Fail with a readable message instead of a KeyError traceback when the
    # CSV lacks a column the pipeline needs.
    missing = [
        col for col in HRTurnoverPredictor.required_columns()
        if col not in df.columns
    ]
    if missing:
        st.error(
            "The uploaded file is missing required columns: "
            + ", ".join(missing)
        )
        return

    predictor = HRTurnoverPredictor(df)

    tab1, tab2, tab3, tab4 = st.tabs([
        "Model Performance",
        "Confusion Matrices",
        "Feature Importance",
        "Model Insights",
    ])

    # Train once, outside any tab body, so every tab reads the same results
    # without depending on another tab's block having run first.
    results = predictor.train_models()

    with tab1:
        st.header("Model Performance Comparison")
        _render_performance(results)

    with tab2:
        st.header("Confusion Matrices")
        _render_confusion_matrices(results)

    with tab3:
        st.header("Feature Importance")
        _render_feature_importance(predictor)

    with tab4:
        st.header("Model Insights")
        _render_classification_reports(results)


if __name__ == '__main__':
    main()