# Seattle Weather Machine Learning — Streamlit dashboard
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
from sklearn.model_selection import train_test_split, cross_val_score | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.tree import DecisionTreeClassifier, plot_tree | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score | |
from sklearn.preprocessing import StandardScaler | |
# ---- Streamlit page configuration ----
st.set_page_config(
    page_title="Seattle Weather Analysis",
    page_icon="🌦️",
    layout="wide",
)

# ---- Title and introduction shown at the top of every page ----
st.title("🌦️ Seattle Weather Machine Learning")
st.markdown("""
This dashboard analyzes Seattle weather data using different machine learning models.
The dataset includes weather attributes and their classification.
""")
def get_dataset_overview(df):
    """Summarize the dataset for the overview page.

    Args:
        df: raw weather DataFrame containing a 'weather' target column.

    Returns:
        dict with record count, feature count (target excluded), number of
        distinct target classes, and total missing-value count.
    """
    n_rows, n_cols = df.shape
    return {
        "Total Records": n_rows,
        # The 'weather' column is the target, not a feature.
        "Features": n_cols - 1,
        # dropna=False mirrors len(unique()), which counts NaN as a class.
        "Target Classes": df["weather"].nunique(dropna=False),
        "Missing Values": df.isna().sum().sum(),
    }
def load_data():
    """Read the Seattle weather CSV, encode the target, scale, and split.

    Returns:
        Tuple of (raw df, cleaned df, unscaled features, encoded target,
        X_train, X_test, y_train, y_test, weather label->code mapping).
    """
    df = pd.read_csv('seattle-weather.csv')
    # The date column is an identifier, not a predictive feature.
    df_cleaned = df.drop(columns=['date'])
    weather_mapping = {'drizzle': 0, 'rain': 1, 'sun': 2, 'snow': 3, 'fog': 4}
    df_cleaned['weather_encoded'] = df_cleaned['weather'].map(weather_mapping)

    features = df_cleaned.drop(columns=['weather', 'weather_encoded'])
    target = df_cleaned['weather_encoded']

    # Standardize features (zero mean, unit variance) before modelling.
    scaled = StandardScaler().fit_transform(features)
    features_scaled = pd.DataFrame(scaled, columns=features.columns)

    # 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        features_scaled, target, test_size=0.2, random_state=42
    )
    return df, df_cleaned, features, target, X_train, X_test, y_train, y_test, weather_mapping
def plot_weather_distribution(df):
    """Render a count plot of each weather class on the Streamlit page.

    Args:
        df: DataFrame with a categorical 'weather' column.
    """
    fig, ax = plt.subplots()
    # seaborn >= 0.13 deprecates `palette` without `hue`; pairing hue with the
    # x variable and hiding the redundant legend keeps the identical plot.
    sns.countplot(x='weather', hue='weather', data=df, palette='viridis',
                  legend=False, ax=ax)
    ax.set_title("Distribution of Weather Types")
    st.pyplot(fig)
def plot_temp_relationship(df):
    """Scatter max vs. min temperature, colored by weather type, on the page.

    Args:
        df: DataFrame with 'temp_max', 'temp_min', and 'weather' columns.
    """
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x='temp_max', y='temp_min', hue='weather', ax=ax)
    ax.set_title("Relationship Between Temp_max and Temp_min")
    st.pyplot(fig)
def train_models(X_train, X_test, y_train, y_test):
    """Fit three classifiers and collect their evaluation metrics.

    Args:
        X_train, X_test: scaled feature matrices.
        y_train, y_test: encoded target vectors.

    Returns:
        dict keyed by model name; each value holds the fitted model, test
        accuracy, 5-fold CV mean/std, and test-set predictions.
    """
    candidates = {
        'Naive Bayes': GaussianNB(),
        'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=5),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    }

    results = {}
    for name, clf in candidates.items():
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        # 5-fold CV on the training split only, so the test set stays unseen.
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        results[name] = {
            'model': clf,
            'accuracy': accuracy_score(y_test, predictions),
            'cv_mean': scores.mean(),
            'cv_std': scores.std(),
            'pred': predictions,
        }
    return results
def plot_confusion_matrix(y_test, y_pred, model_name, weather_mapping):
    """Draw an annotated confusion-matrix heatmap for one model.

    Args:
        y_test: true encoded labels.
        y_pred: predicted encoded labels.
        model_name: used in the plot title.
        weather_mapping: label->code dict; its keys label both axes.
    """
    class_names = list(weather_mapping.keys())
    matrix = confusion_matrix(y_test, y_pred)

    fig, ax = plt.subplots()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f"Confusion Matrix - {model_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
def plot_feature_importance(model, X, model_name):
    """Bar-plot a fitted tree model's feature importances, highest first.

    Args:
        model: fitted estimator exposing `feature_importances_`.
        X: feature DataFrame whose column names label the bars.
        model_name: used in the plot title.
    """
    importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_,
    }).sort_values('Importance', ascending=False)

    fig, ax = plt.subplots()
    # seaborn >= 0.13 deprecates `palette` without `hue`; hue=y plus
    # legend=False preserves the per-bar viridis coloring.
    sns.barplot(x='Importance', y='Feature', hue='Feature', data=importance,
                palette='viridis', legend=False, ax=ax)
    ax.set_title(f"{model_name} Feature Importance")
    st.pyplot(fig)
def _get_model_results(X_train, X_test, y_train, y_test):
    """Train the models once per session and cache them.

    Streamlit reruns the whole script on every widget interaction; without
    caching, all three models (plus 5-fold CV each) would be retrained on
    every click. The dataset is static, so session-level caching is safe.
    """
    if 'model_results' not in st.session_state:
        st.session_state['model_results'] = train_models(X_train, X_test, y_train, y_test)
    return st.session_state['model_results']


def _render_overview(df):
    """'Data Overview' page: headline metrics, sample rows, class balance, stats."""
    st.header("Dataset Overview")
    overview = get_dataset_overview(df)

    # Headline metrics side by side.
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric(label="Total Records", value=overview["Total Records"])
    with col2:
        st.metric(label="Features", value=overview["Features"])
    with col3:
        st.metric(label="Target Classes", value=overview["Target Classes"])
    with col4:
        st.metric(label="Missing Values", value=overview["Missing Values"])

    st.subheader("First Few Rows")
    st.dataframe(df.head())

    st.subheader("Weather Type Distribution")
    weather_dist = df['weather'].value_counts()
    col1, col2 = st.columns(2)
    with col1:
        st.dataframe(weather_dist)
    with col2:
        fig, ax = plt.subplots()
        weather_dist.plot(kind='pie', autopct='%1.1f%%', ax=ax)
        ax.set_title("Weather Type Percentage")
        st.pyplot(fig)

    st.subheader("Descriptive Statistics")
    st.dataframe(df.describe())


def _render_visualizations(df, X, y):
    """'Data Visualization' page: class distribution, temperature scatter, correlations."""
    st.header("Weather Data Visualizations")
    viz_option = st.selectbox("Choose Visualization", [
        "Weather Type Distribution",
        "Temperature Relationship",
        "Correlation Heatmap"
    ])
    if viz_option == "Weather Type Distribution":
        plot_weather_distribution(df)
    elif viz_option == "Temperature Relationship":
        plot_temp_relationship(df)
    elif viz_option == "Correlation Heatmap":
        fig, ax = plt.subplots(figsize=(10, 8))
        # Correlate features together with the encoded target.
        corr_matrix = pd.concat([X, y], axis=1).corr()
        sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
        ax.set_title("Correlation Heatmap")
        st.pyplot(fig)


def _render_training(results, X, y_test, weather_mapping):
    """'Model Training' page: per-model metrics, confusion matrix, importances."""
    st.header("Machine Learning Models")
    model_select = st.selectbox("Choose Model", list(results.keys()))
    model_result = results[model_select]

    st.write(f"{model_select} Results:")
    st.write(f"Test Accuracy: {model_result['accuracy']:.4f}")
    st.write(f"Cross-Validation Mean Accuracy: {model_result['cv_mean']:.4f}")
    st.write(f"Cross-Validation Std: {model_result['cv_std']:.4f}")

    plot_confusion_matrix(y_test, model_result['pred'], model_select, weather_mapping)

    # GaussianNB has no feature_importances_; only the tree-based models do.
    if model_select != 'Naive Bayes':
        plot_feature_importance(model_result['model'], X, model_select)


def _render_comparison(results):
    """'Model Comparison' page: tabulate and chart accuracy across all models."""
    st.header("Model Performance Comparison")
    comparison_df = pd.DataFrame({
        'Model': list(results.keys()),
        'Test Accuracy': [results[model]['accuracy'] for model in results],
        'CV Mean Accuracy': [results[model]['cv_mean'] for model in results],
        'CV Std': [results[model]['cv_std'] for model in results]
    })
    st.write("Model Performance Comparison:")
    st.dataframe(comparison_df)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    sns.barplot(x='Model', y='Test Accuracy', data=comparison_df, ax=ax1)
    ax1.set_title('Test Accuracy Comparison')
    ax1.tick_params(axis='x', rotation=45)

    sns.barplot(x='Model', y='CV Mean Accuracy', data=comparison_df, ax=ax2)
    # Error bars span +/- 2 standard deviations around the CV mean.
    ax2.errorbar(x=range(len(comparison_df)),
                 y=comparison_df['CV Mean Accuracy'],
                 yerr=comparison_df['CV Std'] * 2,
                 fmt='none', color='black', capsize=5)
    ax2.set_title('Cross-validation Accuracy')
    ax2.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    st.pyplot(fig)


def main():
    """Entry point: load the data once, then route the sidebar choice to a page."""
    df, df_cleaned, X, y, X_train, X_test, y_train, y_test, weather_mapping = load_data()

    menu = st.sidebar.selectbox("Choose Analysis", [
        "Data Overview",
        "Data Visualization",
        "Model Training",
        "Model Comparison"
    ])

    if menu == "Data Overview":
        _render_overview(df)
    elif menu == "Data Visualization":
        _render_visualizations(df, X, y)
    elif menu == "Model Training":
        results = _get_model_results(X_train, X_test, y_train, y_test)
        _render_training(results, X, y_test, weather_mapping)
    elif menu == "Model Comparison":
        results = _get_model_results(X_train, X_test, y_train, y_test)
        _render_comparison(results)


if __name__ == "__main__":
    main()