# HR_Supervised / app.py
# Rooobert's picture
# Update app.py
# 4805603 verified
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import (
RandomForestClassifier, AdaBoostClassifier,
StackingClassifier, VotingClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (
accuracy_score, roc_auc_score,
confusion_matrix, classification_report
)
from imblearn.over_sampling import SMOTE
class HRTurnoverPredictor:
    """Train and evaluate several classifiers that predict employee turnover.

    On construction the input frame is preprocessed (label encoding plus
    median imputation), split 80/20 into stratified train/test sets,
    standardised, and the training split is oversampled with SMOTE.  The
    models themselves are only fitted when :meth:`train_models` is called.

    Attributes set by ``__init__``:
        df: the raw input DataFrame as passed by the caller.
        X, y: preprocessed feature frame and integer target.
        X_train, X_test, y_train, y_test: stratified 80/20 split of X / y.
        scaler: ``StandardScaler`` fitted on ``X_train``.
        X_train_scaled, X_test_scaled: scaled feature arrays.
        smote: the ``SMOTE`` sampler (``random_state=42``).
        X_train_resampled, y_train_resampled: SMOTE output for the
            scaled training split (the test split is never resampled).
        models: mapping of display name -> unfitted estimator.
        results: ``None`` until :meth:`train_models` has run.
    """

    # Numeric feature columns, used as-is (after median imputation).
    NUMERIC_COLS = (
        'Salary', 'PerfScoreID', 'EngagementSurvey', 'EmpSatisfaction',
        'SpecialProjectsCount', 'DaysLateLast30', 'Absences',
    )
    # Categorical feature columns, label-encoded to integer codes.
    CATEGORICAL_COLS = (
        'MaritalDesc', 'CitizenDesc', 'RaceDesc',
        'Department', 'Position', 'RecruitmentSource',
    )
    # Target column; cast to int before modelling.
    TARGET_COL = 'Termd'

    def __init__(self, data):
        """Preprocess *data*, build the train/test splits and the model zoo.

        Args:
            data: DataFrame containing every column named in
                ``NUMERIC_COLS``, ``CATEGORICAL_COLS`` and ``TARGET_COL``.

        Raises:
            KeyError: if a required column is missing (see
                :meth:`preprocess_data`).
        """
        self.df = data
        self.X, self.y = self.preprocess_data()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )

        # The scaler is fitted on the training split only and then applied
        # to the test split.
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        # Oversample the training split only; the test split keeps its
        # original class balance.
        self.smote = SMOTE(random_state=42)
        self.X_train_resampled, self.y_train_resampled = self.smote.fit_resample(
            self.X_train_scaled, self.y_train
        )

        self.models = self.initialize_models()
        self.results = None

    def preprocess_data(self):
        """Return the encoded, imputed feature frame and the integer target.

        Categorical columns are converted to strings and label-encoded,
        the feature columns are selected (numeric first, then categorical),
        and missing values are replaced by the column median.

        Returns:
            tuple: ``(X, y)`` where ``X`` is a DataFrame of features that
            keeps the row index of the input and ``y`` is ``Termd`` cast
            to int.

        Raises:
            KeyError: if any required column is absent; the message lists
                every missing column rather than only the first one.
        """
        feature_cols = [*self.NUMERIC_COLS, *self.CATEGORICAL_COLS]

        # Fail early with one message naming all missing columns instead of
        # a bare KeyError for whichever column happens to be accessed first.
        missing = [
            col for col in [*feature_cols, self.TARGET_COL]
            if col not in self.df.columns
        ]
        if missing:
            raise KeyError(
                f"Dataset is missing required column(s): {', '.join(missing)}"
            )

        df = self.df.copy()
        for col in self.CATEGORICAL_COLS:
            # astype(str) turns missing values into their own string category.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        X = df[feature_cols]
        y = df[self.TARGET_COL].astype(int)

        # NOTE(review): the imputer is fitted on the full dataset, i.e.
        # before the train/test split performed in __init__, so test-set
        # medians leak into the training features.  Consider fitting it on
        # the training split only.
        imputer = SimpleImputer(strategy='median')
        X = pd.DataFrame(
            imputer.fit_transform(X),
            columns=X.columns,
            index=X.index,  # keep X's row labels aligned with y
        )
        return X, y

    def initialize_models(self):
        """Build the unfitted estimators, keyed by display name.

        The mapping holds six base learners followed by a stacking ensemble
        and a soft-voting ensemble, in that insertion order.

        Returns:
            dict: display name -> unfitted estimator.
        """
        models = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
            'AdaBoost': AdaBoostClassifier(random_state=42),
            'SVM': SVC(probability=True, random_state=42),
        }

        # Snapshot the base learners *before* Stacking is added so the
        # voting ensemble is built from exactly these six, independent of
        # later insertions into the dict.
        voting_estimators = list(models.items())

        models['Stacking'] = StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(random_state=42)),
                ('rf', RandomForestClassifier(random_state=42)),
                ('xgb', XGBClassifier(random_state=42)),
            ],
            final_estimator=LogisticRegression(),
            cv=5,
        )
        models['Voting'] = VotingClassifier(
            estimators=voting_estimators,
            voting='soft',
        )
        return models

    def train_models(self):
        """Fit every model and evaluate it on the scaled test split.

        Each model is fitted on the SMOTE-resampled training data and scored
        on ``X_test_scaled`` / ``y_test``.

        Returns:
            dict: model name -> dict with the keys ``'Accuracy'``,
            ``'ROC AUC'``, ``'Confusion Matrix'``,
            ``'Classification Report'`` (as a dict) and
            ``'Predicted Probabilities'`` (second column of
            ``predict_proba``).  The same mapping is stored on
            ``self.results``.
        """
        results = {}
        for name, model in self.models.items():
            model.fit(self.X_train_resampled, self.y_train_resampled)

            y_pred = model.predict(self.X_test_scaled)
            # Column 1 of predict_proba is used as the score for ROC AUC.
            y_pred_proba = model.predict_proba(self.X_test_scaled)[:, 1]

            results[name] = {
                'Accuracy': accuracy_score(self.y_test, y_pred),
                'ROC AUC': roc_auc_score(self.y_test, y_pred_proba),
                'Confusion Matrix': confusion_matrix(self.y_test, y_pred),
                'Classification Report': classification_report(
                    self.y_test, y_pred, output_dict=True
                ),
                'Predicted Probabilities': y_pred_proba,
            }

        self.results = results
        return results

    def get_feature_importance(self):
        """Return the Random Forest feature importances, highest first.

        Returns:
            pandas.DataFrame | None: frame with the columns ``'Feature'``
            and ``'Importance'`` sorted by descending importance, or
            ``None`` when the ``'Random Forest'`` model exposes no
            ``feature_importances_`` attribute (presumably because
            :meth:`train_models` has not been run yet).
        """
        rf_model = self.models['Random Forest']
        if not hasattr(rf_model, "feature_importances_"):
            return None

        return pd.DataFrame({
            "Feature": self.X.columns,
            "Importance": rf_model.feature_importances_,
        }).sort_values(by="Importance", ascending=False)
def main():
    """Render the Streamlit dashboard for employee-turnover prediction.

    The user uploads a CSV in the sidebar; the file is loaded, all models
    of ``HRTurnoverPredictor`` are trained, and four tabs show the
    performance table/plot, the confusion matrices, the Random Forest
    feature importances and the per-model classification reports.
    Nothing beyond the page title and sidebar is rendered until a file has
    been uploaded.
    """
    st.set_page_config(page_title="HR Turnover Prediction", layout="wide")
    st.title("🏢 Employee Turnover Prediction Dashboard")

    # Sidebar for file upload
    st.sidebar.header("Upload HR Dataset")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    try:
        df = pd.read_csv(uploaded_file)
    except ValueError as exc:
        # pandas' ParserError / EmptyDataError and UnicodeDecodeError all
        # derive from ValueError.
        st.sidebar.error(f"Could not read the CSV file: {exc}")
        return
    st.sidebar.success("File successfully uploaded!")

    # Train once, up front: every tab below depends on `results`, so the
    # training must not be tied to the first tab's context.
    try:
        predictor = HRTurnoverPredictor(df)
        results = predictor.train_models()
    except (KeyError, ValueError) as exc:
        # Missing columns or data the pipeline cannot handle: show a
        # readable message instead of a raw traceback.
        st.error(f"Could not train models on the uploaded dataset: {exc}")
        return

    # Tabs for different analyses
    tab1, tab2, tab3, tab4 = st.tabs([
        "Model Performance",
        "Confusion Matrices",
        "Feature Importance",
        "Model Insights",
    ])

    with tab1:
        st.header("Model Performance Comparison")

        perf_df = pd.DataFrame([
            {
                'Model': model_name,
                'Accuracy': metrics['Accuracy'],
                'ROC AUC': metrics['ROC AUC'],
            }
            for model_name, metrics in results.items()
        ]).sort_values('ROC AUC', ascending=False)
        st.dataframe(perf_df)

        # Use the figure/axes objects rather than pyplot's global state.
        fig, ax = plt.subplots(figsize=(10, 6))
        perf_df.plot(x='Model', y=['Accuracy', 'ROC AUC'], kind='bar', ax=ax)
        ax.set_title("Model Performance Comparison")
        ax.set_xlabel("Model")
        ax.set_ylabel("Score")
        fig.tight_layout()
        st.pyplot(fig)
        # Close the figure so figures do not pile up across script reruns.
        plt.close(fig)

    with tab2:
        st.header("Confusion Matrices")

        # Size the grid from the number of models instead of assuming 3x3.
        n_models = len(results)
        n_cols = 3
        n_rows = max(1, (n_models + n_cols - 1) // n_cols)
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False
        )
        axes = axes.ravel()

        for ax, (model_name, metrics) in zip(axes, results.items()):
            sns.heatmap(
                metrics['Confusion Matrix'],
                annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax,
            )
            ax.set_title(f"{model_name} Confusion Matrix")
            ax.set_xlabel("Predicted Label")
            ax.set_ylabel("True Label")

        # Hide grid cells that have no model to show.
        for unused_ax in axes[n_models:]:
            unused_ax.set_visible(False)

        fig.tight_layout()
        st.pyplot(fig)
        plt.close(fig)

    with tab3:
        st.header("Feature Importance")

        feature_importance = predictor.get_feature_importance()
        if feature_importance is None:
            st.info("Feature importance is not available.")
        else:
            st.dataframe(feature_importance)

            fig, ax = plt.subplots(figsize=(10, 6))
            feature_importance.plot(x='Feature', y='Importance', kind='bar', ax=ax)
            ax.set_title("Random Forest Feature Importance")
            ax.set_xlabel("Features")
            ax.set_ylabel("Importance")
            fig.tight_layout()
            st.pyplot(fig)
            plt.close(fig)

    with tab4:
        st.header("Model Insights")

        # One classification report per model, rows = classes/averages.
        for model_name, metrics in results.items():
            st.subheader(f"{model_name} Classification Report")
            report_df = pd.DataFrame(metrics['Classification Report']).transpose()
            st.dataframe(report_df)
# Script entry point: build the dashboard only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()