|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.neighbors import KNeighborsClassifier |
|
from sklearn.ensemble import RandomForestClassifier |
|
from lightgbm import LGBMClassifier |
|
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score |
|
from sklearn.datasets import load_wine |
|
|
|
|
|
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(
    page_title="Wine Quality Analysis",
    page_icon="🍷",  # original "π" was mojibake (first byte of a UTF-8 emoji)
    layout="wide",
)
|
|
|
|
|
# Page header and intro copy. The original title's "π" was mojibake of the
# wine-glass emoji (matching the garbled heart in the footer).
st.title("🍷 Wine Quality Analysis Dashboard")
st.markdown("""
This dashboard analyzes wine quality data using different machine learning models.
The dataset includes various wine attributes and their classifications.
""")
|
|
|
|
|
@st.cache_data
def load_data():
    """Load the sklearn wine dataset.

    Returns a tuple of (DataFrame with feature columns plus an integer
    'class' target column, the raw sklearn Bunch). Cached by Streamlit so
    the dataset is built only once per session.
    """
    raw = load_wine()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame['class'] = raw.target
    return frame, raw


df, wine_data = load_data()
|
|
|
|
|
# Sidebar navigation: the selected entry drives which page section renders below.
PAGES = ["Data Overview", "Exploratory Analysis", "Model Training", "Model Comparison"]
st.sidebar.header("Navigation")
page = st.sidebar.radio("Go to", PAGES)
|
|
|
|
|
if page == "Data Overview":
    st.header("Dataset Overview")

    # Headline metrics for the dataset.
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric(label="Total Records", value=f"{len(df):,}")
    with col2:
        st.metric(label="Features", value=len(df.columns) - 1)
    with col3:
        st.metric(label="Target Classes", value=df['class'].nunique())
    with col4:
        st.metric(label="Missing Values", value=int(df.isnull().sum().sum()))

    st.write("")

    st.subheader("Sample Data")
    st.dataframe(df.head(), use_container_width=True, height=230)

    st.subheader("Target Class Distribution")
    col1, col2 = st.columns([2, 1])
    with col1:
        fig, ax = plt.subplots(figsize=(10, 6))
        # hue + legend=False: seaborn 0.13 deprecated passing `palette`
        # without `hue`; draw on the created axes explicitly.
        sns.countplot(data=df, x='class', hue='class', palette='rocket',
                      legend=False, ax=ax)
        ax.set_title('Distribution of Wine Classes')
        st.pyplot(fig)
        plt.close(fig)  # figures otherwise accumulate across Streamlit reruns
    with col2:
        st.write("")
        st.write("")
        # One metric per class with its record count.
        class_distribution = df['class'].value_counts()
        for class_name, count in class_distribution.items():
            st.metric(label=f"Class {class_name}", value=int(count))
|
|
|
|
|
elif page == "Exploratory Analysis": |
|
st.header("Exploratory Data Analysis") |
|
|
|
|
|
st.subheader("Feature Distributions") |
|
feature_to_plot = st.selectbox("Select Feature", df.columns[:-1]) |
|
|
|
fig, ax = plt.subplots(figsize=(10, 6)) |
|
sns.histplot(data=df, x=feature_to_plot, kde=True, color='purple') |
|
plt.title(f'Distribution of {feature_to_plot}') |
|
plt.xticks(rotation=45) |
|
st.pyplot(fig) |
|
|
|
|
|
st.subheader("Correlation Heatmap") |
|
fig, ax = plt.subplots(figsize=(12, 8)) |
|
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm') |
|
plt.title('Feature Correlation Heatmap') |
|
st.pyplot(fig) |
|
|
|
|
|
elif page == "Model Training": |
|
st.header("Model Training and Evaluation") |
|
|
|
|
|
X = df.drop('class', axis=1) |
|
y = df['class'] |
|
|
|
|
|
test_size = st.slider("Select Test Size", 0.1, 0.4, 0.2, 0.05) |
|
X_train, X_test, y_train, y_test = train_test_split( |
|
X, y, test_size=test_size, random_state=42, stratify=y |
|
) |
|
|
|
|
|
scaler = StandardScaler() |
|
X_train_scaled = scaler.fit_transform(X_train) |
|
X_test_scaled = scaler.transform(X_test) |
|
|
|
|
|
model_choice = st.selectbox( |
|
"Select Model", |
|
["KNN", "Random Forest", "LightGBM"] |
|
) |
|
|
|
if st.button("Train Model"): |
|
with st.spinner("Training model..."): |
|
if model_choice == "KNN": |
|
model = KNeighborsClassifier(n_neighbors=5) |
|
elif model_choice == "Random Forest": |
|
model = RandomForestClassifier(n_estimators=100, random_state=42) |
|
else: |
|
model = LGBMClassifier(n_estimators=100, random_state=42) |
|
|
|
model.fit(X_train_scaled, y_train) |
|
y_pred = model.predict(X_test_scaled) |
|
|
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
st.subheader("Model Performance") |
|
accuracy = accuracy_score(y_test, y_pred) |
|
st.metric(label="Accuracy", value=f"{accuracy:.4f}") |
|
st.text("Classification Report:") |
|
st.text(classification_report(y_test, y_pred)) |
|
|
|
with col2: |
|
st.subheader("Confusion Matrix") |
|
fig, ax = plt.subplots(figsize=(8, 6)) |
|
sns.heatmap( |
|
confusion_matrix(y_test, y_pred), |
|
annot=True, |
|
fmt='d', |
|
cmap='Blues', |
|
xticklabels=wine_data.target_names, |
|
yticklabels=wine_data.target_names |
|
) |
|
plt.title(f'{model_choice} Confusion Matrix') |
|
plt.xlabel('Predicted') |
|
plt.ylabel('Actual') |
|
st.pyplot(fig) |
|
|
|
|
|
if model_choice in ["Random Forest", "LightGBM"]: |
|
st.subheader("Feature Importance") |
|
feature_importance = pd.Series( |
|
model.feature_importances_, index=wine_data.feature_names |
|
).sort_values(ascending=False) |
|
|
|
fig, ax = plt.subplots(figsize=(10, 6)) |
|
sns.barplot( |
|
data=feature_importance.reset_index(), |
|
x=0, |
|
y='index', |
|
palette='viridis' |
|
) |
|
plt.title('Top Features by Importance') |
|
plt.xlabel('Importance') |
|
plt.ylabel('Feature') |
|
st.pyplot(fig) |
|
|
|
|
|
else: |
|
st.header("Model Comparison") |
|
|
|
if st.button("Compare All Models"): |
|
with st.spinner("Training all models..."): |
|
|
|
X = df.drop('class', axis=1) |
|
y = df['class'] |
|
|
|
|
|
X_train, X_test, y_train, y_test = train_test_split( |
|
X, y, test_size=0.2, random_state=42, stratify=y |
|
) |
|
|
|
|
|
scaler = StandardScaler() |
|
X_train_scaled = scaler.fit_transform(X_train) |
|
X_test_scaled = scaler.transform(X_test) |
|
|
|
|
|
models = { |
|
"KNN": KNeighborsClassifier(n_neighbors=5), |
|
"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42), |
|
"LightGBM": LGBMClassifier(n_estimators=100, random_state=42) |
|
} |
|
|
|
results = {} |
|
for name, model in models.items(): |
|
model.fit(X_train_scaled, y_train) |
|
y_pred = model.predict(X_test_scaled) |
|
results[name] = { |
|
'accuracy': accuracy_score(y_test, y_pred), |
|
'predictions': y_pred |
|
} |
|
|
|
|
|
st.subheader("Accuracy Comparison") |
|
accuracy_df = pd.DataFrame({ |
|
'Model': list(results.keys()), |
|
'Accuracy': [results[model]['accuracy'] for model in results.keys()] |
|
}) |
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
st.dataframe(accuracy_df) |
|
|
|
with col2: |
|
fig, ax = plt.subplots(figsize=(10, 6)) |
|
sns.barplot( |
|
data=accuracy_df, |
|
x='Model', |
|
y='Accuracy', |
|
palette='rocket' |
|
) |
|
plt.title('Model Accuracy Comparison') |
|
plt.ylim(0, 1) |
|
st.pyplot(fig) |
|
|
|
|
|
st.subheader("Detailed Model Performance") |
|
for name in results.keys(): |
|
st.write(f"\n{name}:") |
|
st.text(classification_report(y_test, results[name]['predictions'])) |
|
|
|
fig, ax = plt.subplots(figsize=(8, 6)) |
|
sns.heatmap( |
|
confusion_matrix(y_test, results[name]['predictions']), |
|
annot=True, |
|
fmt='d', |
|
cmap='Blues', |
|
xticklabels=wine_data.target_names, |
|
yticklabels=wine_data.target_names |
|
) |
|
plt.title(f'{name} Confusion Matrix') |
|
plt.xlabel('Predicted') |
|
plt.ylabel('Actual') |
|
st.pyplot(fig) |
|
|
|
|
|
if name in ["Random Forest", "LightGBM"]: |
|
st.subheader(f"{name} Feature Importance") |
|
feature_importance = pd.Series( |
|
models[name].feature_importances_, index=wine_data.feature_names |
|
).sort_values(ascending=False) |
|
|
|
fig, ax = plt.subplots(figsize=(10, 6)) |
|
sns.barplot( |
|
data=feature_importance.reset_index(), |
|
x=0, |
|
y='index', |
|
palette='viridis' |
|
) |
|
plt.title(f'{name} Feature Importance') |
|
plt.xlabel('Importance') |
|
plt.ylabel('Feature') |
|
st.pyplot(fig) |
|
|
|
# Footer. The original "β€οΈ" was mojibake (UTF-8 bytes of the heart emoji
# decoded as ISO-8859-7); restore the intended character.
st.markdown("""
---
Created with ❤️ using Streamlit
""")
|
|