import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ucimlrepo import fetch_ucirepo
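
# Assumed environment (not pinned in the source): scikit-learn >= 1.2 for
# OneHotEncoder's sparse_output argument, and the ucimlrepo package
# (pip install ucimlrepo) for fetch_ucirepo.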

# Page configuration
st.set_page_config(
    page_title="Car Evaluation Analysis",
    page_icon="🚗",
    layout="wide"
)

# Title and introduction
st.title("🚗 Car Evaluation Analysis Dashboard")
st.markdown("""
This dashboard analyzes car evaluation data using different machine learning models.
The dataset includes various car attributes and their evaluation classifications.
""")

# Load and prepare data
@st.cache_data  # cache the UCI download so it runs once, not on every rerun
def load_data():
    car_evaluation = fetch_ucirepo(id=19)
    X, y = car_evaluation.data.features, car_evaluation.data.targets
    df = pd.concat([X, y], axis=1)
    return df, X, y

df, X, y = load_data()
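
# The UCI Car Evaluation dataset (id=19): 1,728 rows, six categorical
# features (buying, maint, doors, persons, lug_boot, safety) and a
# four-level target class (unacc, acc, good, vgood).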

# Sidebar
st.sidebar.header("Navigation")
page = st.sidebar.radio("Go to", ["Data Overview", "Exploratory Analysis", "Model Training", "Model Comparison"])

# Data Overview page
if page == "Data Overview":
    st.header("Dataset Overview")

    # Display metrics in cards
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric(
            label="Total Records",
            value=f"{len(df):,}"
        )
    with col2:
        st.metric(
            label="Features",
            value=len(df.columns) - 1
        )
    with col3:
        st.metric(
            label="Target Classes",
            value=df['class'].nunique()
        )
    with col4:
        st.metric(
            label="Missing Values",
            value=int(df.isnull().sum().sum())
        )

    st.write("")

    # Sample data
    st.subheader("Sample Data")
    st.dataframe(
        df.head(),
        use_container_width=True,
        height=230
    )

    # Target class distribution
    st.subheader("Target Class Distribution")
    col1, col2 = st.columns([2, 1])
    with col1:
        fig, ax = plt.subplots(figsize=(10, 6))
        # hue mirrors x (legend suppressed) to keep per-bar colors without
        # the palette-without-hue deprecation warning in seaborn >= 0.13
        sns.countplot(data=df, x='class', hue='class', palette='viridis', legend=False, ax=ax)
        ax.set_title('Distribution of Car Evaluations')
        st.pyplot(fig)
    with col2:
        st.write("")
        st.write("")
        class_distribution = df['class'].value_counts()
        for class_name, count in class_distribution.items():
            st.metric(
                label=class_name,
                value=count
            )

# Exploratory Analysis page
elif page == "Exploratory Analysis":
    st.header("Exploratory Data Analysis")

    # Feature distributions
    st.subheader("Feature Distributions")
    feature_to_plot = st.selectbox("Select Feature", df.columns[:-1])
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=feature_to_plot, hue=feature_to_plot, palette='coolwarm', legend=False, ax=ax)
    ax.set_title(f'Distribution of {feature_to_plot}')
    plt.xticks(rotation=45)
    st.pyplot(fig)

    # Feature vs target
    st.subheader("Feature vs Target Class")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, x=feature_to_plot, hue='class', palette='Set2', ax=ax)
    ax.set_title(f'{feature_to_plot} Distribution by Class')
    plt.xticks(rotation=45)
    st.pyplot(fig)

    # Correlation heatmap
    st.subheader("Correlation Heatmap")
    # dtype=int keeps the dummies numeric so .corr() behaves consistently
    # across pandas versions (newer pandas emits bool dummies by default)
    encoded_df = pd.get_dummies(df, drop_first=True, dtype=int)
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(encoded_df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Heatmap of Encoded Features')
    st.pyplot(fig)
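    # Note: dummy columns derived from the same original feature are mutually
    # exclusive, so the small negative correlations inside each feature's
    # block of the heatmap are expected by construction.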

# Model Training page
elif page == "Model Training":
    st.header("Model Training and Evaluation")

    # Data preprocessing
    encoder = OneHotEncoder(sparse_output=False)
    X_encoded = encoder.fit_transform(X)
    y_encoded = y.values.ravel()
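    # One-hot encoding expands the six categorical features into 21 binary
    # columns (4 + 4 + 4 + 3 + 3 + 3 category levels), the numeric input
    # that SVC and logistic regression require.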

    # Train-test split (stratified: the class distribution is heavily skewed
    # toward 'unacc', so preserve the class proportions in both splits)
    test_size = st.slider("Select Test Size", 0.1, 0.4, 0.2, 0.05)
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y_encoded, test_size=test_size, random_state=42,
        stratify=y_encoded
    )

    # Model selection
    model_choice = st.selectbox(
        "Select Model",
        ["Support Vector Machine", "Random Forest", "Logistic Regression"]
    )

    if st.button("Train Model"):
        with st.spinner("Training model..."):
            if model_choice == "Support Vector Machine":
                model = SVC(kernel='linear', random_state=42)
            elif model_choice == "Random Forest":
                model = RandomForestClassifier(n_estimators=100, random_state=42)
            else:
                model = LogisticRegression(max_iter=500, random_state=42)
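                # max_iter is raised above the default of 100 because logistic
                # regression can hit the iteration limit before converging on
                # one-hot features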

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        # Display results
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Model Performance")
            accuracy = accuracy_score(y_test, y_pred)
            st.metric(label="Accuracy", value=f"{accuracy:.4f}")
            st.text("Classification Report:")
            st.text(classification_report(y_test, y_pred))
        with col2:
            st.subheader("Confusion Matrix")
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(
                confusion_matrix(y_test, y_pred),
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=np.unique(y_test),
                yticklabels=np.unique(y_test),
                ax=ax
            )
            ax.set_title(f'{model_choice} Confusion Matrix')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            st.pyplot(fig)

        # Feature importance for Random Forest
        if model_choice == "Random Forest":
            st.subheader("Feature Importance")
            feature_importance = pd.DataFrame({
                'feature': encoder.get_feature_names_out(),
                'importance': model.feature_importances_
            })
            feature_importance = feature_importance.sort_values(
                'importance', ascending=False
            ).head(10)
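            # Caveat: importances are reported per one-hot column, so a single
            # original feature's contribution is split across its dummies.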
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(
                data=feature_importance,
                x='importance',
                y='feature',
                ax=ax
            )
            ax.set_title('Top 10 Most Important Features')
            st.pyplot(fig)

# Model Comparison page
else:
    st.header("Model Comparison")

    if st.button("Compare All Models"):
        with st.spinner("Training all models..."):
            # Data preprocessing
            encoder = OneHotEncoder(sparse_output=False)
            X_encoded = encoder.fit_transform(X)
            y_encoded = y.values.ravel()

            # Train-test split
            X_train, X_test, y_train, y_test = train_test_split(
                X_encoded, y_encoded, test_size=0.2, random_state=42
            )

            # Train all models
            models = {
                "SVM": SVC(kernel='linear', random_state=42),
                "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
                "Logistic Regression": LogisticRegression(max_iter=500, random_state=42)
            }
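            # Same hyperparameters as the Model Training page, so the
            # comparison reflects what a user would train there.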

            results = {}
            for name, model in models.items():
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                results[name] = {
                    'accuracy': accuracy_score(y_test, y_pred),
                    'predictions': y_pred
                }

        # Display comparison results
        st.subheader("Accuracy Comparison")
        accuracy_df = pd.DataFrame({
            'Model': list(results.keys()),
            'Accuracy': [results[name]['accuracy'] for name in results]
        })

        col1, col2 = st.columns(2)
        with col1:
            st.dataframe(accuracy_df)
        with col2:
            fig, ax = plt.subplots(figsize=(10, 6))
            # hue mirrors x for the same seaborn >= 0.13 palette reason as above
            sns.barplot(
                data=accuracy_df,
                x='Model',
                y='Accuracy',
                hue='Model',
                palette='viridis',
                legend=False,
                ax=ax
            )
            ax.set_title('Model Accuracy Comparison')
            ax.set_ylim(0, 1)
            st.pyplot(fig)

        # Detailed model comparison
        st.subheader("Detailed Model Performance")
        for name in results:
            st.write(f"\n{name}:")
            st.text(classification_report(y_test, results[name]['predictions']))
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(
                confusion_matrix(y_test, results[name]['predictions']),
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=np.unique(y_test),
                yticklabels=np.unique(y_test),
                ax=ax
            )
            ax.set_title(f'{name} Confusion Matrix')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            st.pyplot(fig)

# Footer
st.markdown("""
---
Created with ❤️ using Streamlit
""")