cisemh committed on
Commit 0921e01
1 Parent(s): 546a475
Files changed (1)
  1. app.py +307 -2
app.py CHANGED
@@ -1,4 +1,309 @@
 import streamlit as st
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier
+from lightgbm import LGBMClassifier
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+from sklearn.datasets import load_wine
 
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
+# Page configuration
+st.set_page_config(
+    page_title="Wine Quality Analysis",
+    page_icon="🍇",
+    layout="wide"
+)
+
+# Title and introduction
+st.title("🍇 Wine Quality Analysis Dashboard")
+st.markdown("""
+This dashboard analyzes wine quality data using different machine learning models.
+The dataset includes various wine attributes and their classifications.
+""")
+
+# Load and prepare data
+@st.cache_data
+def load_data():
+    wine_data = load_wine()
+    df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
+    df['class'] = wine_data.target
+    return df, wine_data
+
+df, wine_data = load_data()
+
+# Sidebar
+st.sidebar.header("Navigation")
+page = st.sidebar.radio("Go to", ["Data Overview", "Exploratory Analysis", "Model Training", "Model Comparison"])
+
+# Data Overview Page
+if page == "Data Overview":
+    st.header("Dataset Overview")
+
+    # Display metrics in cards
+    col1, col2, col3, col4 = st.columns(4)
+
+    with col1:
+        st.metric(
+            label="Total Records",
+            value=f"{len(df):,}"
+        )
+
+    with col2:
+        st.metric(
+            label="Features",
+            value=len(df.columns) - 1
+        )
+
+    with col3:
+        st.metric(
+            label="Target Classes",
+            value=len(df['class'].unique())
+        )
+
+    with col4:
+        st.metric(
+            label="Missing Values",
+            value=df.isnull().sum().sum()
+        )
+
+    st.write("")
+
+    # Sample Data
+    st.subheader("Sample Data")
+    st.dataframe(
+        df.head(),
+        use_container_width=True,
+        height=230
+    )
+
+    # Target Class Distribution
+    st.subheader("Target Class Distribution")
+
+    col1, col2 = st.columns([2, 1])
+
+    with col1:
+        fig, ax = plt.subplots(figsize=(10, 6))
+        sns.countplot(data=df, x='class', palette='rocket')
+        plt.title('Distribution of Wine Classes')
+        st.pyplot(fig)
+
+    with col2:
+        st.write("")
+        st.write("")
+        class_distribution = df['class'].value_counts()
+        for class_name, count in class_distribution.items():
+            st.metric(
+                label=f"Class {class_name}",
+                value=count
+            )
+
+# Exploratory Analysis Page
+elif page == "Exploratory Analysis":
+    st.header("Exploratory Data Analysis")
+
+    # Feature Distribution
+    st.subheader("Feature Distributions")
+    feature_to_plot = st.selectbox("Select Feature", df.columns[:-1])
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.histplot(data=df, x=feature_to_plot, kde=True, color='purple')
+    plt.title(f'Distribution of {feature_to_plot}')
+    plt.xticks(rotation=45)
+    st.pyplot(fig)
+
+    # Correlation Heatmap
+    st.subheader("Correlation Heatmap")
+    fig, ax = plt.subplots(figsize=(12, 8))
+    sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
+    plt.title('Feature Correlation Heatmap')
+    st.pyplot(fig)
+
+# Model Training Page
+elif page == "Model Training":
+    st.header("Model Training and Evaluation")
+
+    # Data preprocessing
+    X = df.drop('class', axis=1)
+    y = df['class']
+
+    # Train-test split
+    test_size = st.slider("Select Test Size", 0.1, 0.4, 0.2, 0.05)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=42, stratify=y
+    )
+
+    # Scaling
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    # Model selection
+    model_choice = st.selectbox(
+        "Select Model",
+        ["KNN", "Random Forest", "LightGBM"]
+    )
+
+    if st.button("Train Model"):
+        with st.spinner("Training model..."):
+            if model_choice == "KNN":
+                model = KNeighborsClassifier(n_neighbors=5)
+            elif model_choice == "Random Forest":
+                model = RandomForestClassifier(n_estimators=100, random_state=42)
+            else:
+                model = LGBMClassifier(n_estimators=100, random_state=42)
+
+            model.fit(X_train_scaled, y_train)
+            y_pred = model.predict(X_test_scaled)
+
+            # Display results
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.subheader("Model Performance")
+                accuracy = accuracy_score(y_test, y_pred)
+                st.metric(label="Accuracy", value=f"{accuracy:.4f}")
+                st.text("Classification Report:")
+                st.text(classification_report(y_test, y_pred))
+
+            with col2:
+                st.subheader("Confusion Matrix")
+                fig, ax = plt.subplots(figsize=(8, 6))
+                sns.heatmap(
+                    confusion_matrix(y_test, y_pred),
+                    annot=True,
+                    fmt='d',
+                    cmap='Blues',
+                    xticklabels=wine_data.target_names,
+                    yticklabels=wine_data.target_names
+                )
+                plt.title(f'{model_choice} Confusion Matrix')
+                plt.xlabel('Predicted')
+                plt.ylabel('Actual')
+                st.pyplot(fig)
+
+            # Feature importance for applicable models
+            if model_choice in ["Random Forest", "LightGBM"]:
+                st.subheader("Feature Importance")
+                feature_importance = pd.Series(
+                    model.feature_importances_, index=wine_data.feature_names
+                ).sort_values(ascending=False)
+
+                fig, ax = plt.subplots(figsize=(10, 6))
+                sns.barplot(
+                    data=feature_importance.reset_index(),
+                    x=0,
+                    y='index',
+                    palette='viridis'
+                )
+                plt.title('Top Features by Importance')
+                plt.xlabel('Importance')
+                plt.ylabel('Feature')
+                st.pyplot(fig)
+
+# Model Comparison Page
+else:
+    st.header("Model Comparison")
+
+    if st.button("Compare All Models"):
+        with st.spinner("Training all models..."):
+            # Data preprocessing
+            X = df.drop('class', axis=1)
+            y = df['class']
+
+            # Train-test split
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, test_size=0.2, random_state=42, stratify=y
+            )
+
+            # Scaling
+            scaler = StandardScaler()
+            X_train_scaled = scaler.fit_transform(X_train)
+            X_test_scaled = scaler.transform(X_test)
+
+            # Train all models
+            models = {
+                "KNN": KNeighborsClassifier(n_neighbors=5),
+                "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
+                "LightGBM": LGBMClassifier(n_estimators=100, random_state=42)
+            }
+
+            results = {}
+            for name, model in models.items():
+                model.fit(X_train_scaled, y_train)
+                y_pred = model.predict(X_test_scaled)
+                results[name] = {
+                    'accuracy': accuracy_score(y_test, y_pred),
+                    'predictions': y_pred
+                }
+
+            # Display comparison results
+            st.subheader("Accuracy Comparison")
+            accuracy_df = pd.DataFrame({
+                'Model': list(results.keys()),
+                'Accuracy': [results[model]['accuracy'] for model in results.keys()]
+            })
+
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.dataframe(accuracy_df)
+
+            with col2:
+                fig, ax = plt.subplots(figsize=(10, 6))
+                sns.barplot(
+                    data=accuracy_df,
+                    x='Model',
+                    y='Accuracy',
+                    palette='rocket'
+                )
+                plt.title('Model Accuracy Comparison')
+                plt.ylim(0, 1)
+                st.pyplot(fig)
+
+            # Detailed model comparison
+            st.subheader("Detailed Model Performance")
+            for name in results.keys():
+                st.write(f"\n{name}:")
+                st.text(classification_report(y_test, results[name]['predictions']))
+
+                fig, ax = plt.subplots(figsize=(8, 6))
+                sns.heatmap(
+                    confusion_matrix(y_test, results[name]['predictions']),
+                    annot=True,
+                    fmt='d',
+                    cmap='Blues',
+                    xticklabels=wine_data.target_names,
+                    yticklabels=wine_data.target_names
+                )
+                plt.title(f'{name} Confusion Matrix')
+                plt.xlabel('Predicted')
+                plt.ylabel('Actual')
+                st.pyplot(fig)
+
+                # Feature importance for applicable models
+                if name in ["Random Forest", "LightGBM"]:
+                    st.subheader(f"{name} Feature Importance")
+                    feature_importance = pd.Series(
+                        models[name].feature_importances_, index=wine_data.feature_names
+                    ).sort_values(ascending=False)
+
+                    fig, ax = plt.subplots(figsize=(10, 6))
+                    sns.barplot(
+                        data=feature_importance.reset_index(),
+                        x=0,
+                        y='index',
+                        palette='viridis'
+                    )
+                    plt.title(f'{name} Feature Importance')
+                    plt.xlabel('Importance')
+                    plt.ylabel('Feature')
+                    st.pyplot(fig)
+# Footer
+st.markdown("""
+---
+Created with ❤️ using Streamlit
+""")