Spaces:
Running
Running
raymondEDS
committed on
Commit
·
63732ac
1
Parent(s):
4a23d33
mx homework
Browse files- app/.DS_Store +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/main.py +4 -1
- app/pages/.DS_Store +0 -0
- app/pages/__pycache__/week_7.cpython-311.pyc +0 -0
- app/pages/week_5.py +4 -4
- app/pages/week_7.py +337 -0
app/.DS_Store
CHANGED
Binary files a/app/.DS_Store and b/app/.DS_Store differ
|
|
app/__pycache__/main.cpython-311.pyc
CHANGED
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ
|
|
app/main.py
CHANGED
@@ -23,6 +23,7 @@ from app.pages import week_3
|
|
23 |
from app.pages import week_4
|
24 |
from app.pages import week_5
|
25 |
from app.pages import week_6
|
|
|
26 |
# Page configuration
|
27 |
st.set_page_config(
|
28 |
page_title="Data Science Course App",
|
@@ -151,6 +152,8 @@ def show_week_content():
|
|
151 |
week_5.show()
|
152 |
elif st.session_state.current_week == 6:
|
153 |
week_6.show()
|
|
|
|
|
154 |
else:
|
155 |
st.warning("Content for this week is not yet available.")
|
156 |
|
@@ -163,7 +166,7 @@ def main():
|
|
163 |
return
|
164 |
|
165 |
# User is logged in, show course content
|
166 |
-
if st.session_state.current_week in [1, 2, 3, 4, 5, 6]:
|
167 |
show_week_content()
|
168 |
else:
|
169 |
st.title("Data Science Research Paper Course")
|
|
|
23 |
from app.pages import week_4
|
24 |
from app.pages import week_5
|
25 |
from app.pages import week_6
|
26 |
+
from app.pages import week_7
|
27 |
# Page configuration
|
28 |
st.set_page_config(
|
29 |
page_title="Data Science Course App",
|
|
|
152 |
week_5.show()
|
153 |
elif st.session_state.current_week == 6:
|
154 |
week_6.show()
|
155 |
+
elif st.session_state.current_week == 7:
|
156 |
+
week_7.show()
|
157 |
else:
|
158 |
st.warning("Content for this week is not yet available.")
|
159 |
|
|
|
166 |
return
|
167 |
|
168 |
# User is logged in, show course content
|
169 |
+
if st.session_state.current_week in [1, 2, 3, 4, 5, 6, 7]:
|
170 |
show_week_content()
|
171 |
else:
|
172 |
st.title("Data Science Research Paper Course")
|
app/pages/.DS_Store
CHANGED
Binary files a/app/pages/.DS_Store and b/app/pages/.DS_Store differ
|
|
app/pages/__pycache__/week_7.cpython-311.pyc
ADDED
Binary file (14 kB). View file
|
|
app/pages/week_5.py
CHANGED
@@ -1147,10 +1147,10 @@ for features in feature_sets:
|
|
1147 |
if username == "manxiii":
|
1148 |
st.markdown("""
|
1149 |
Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
|
1150 |
-
1.
|
1151 |
-
|
1152 |
-
|
1153 |
-
|
1154 |
|
1155 |
**Due Date:** End of Week 5
|
1156 |
""")
|
|
|
1147 |
if username == "manxiii":
|
1148 |
st.markdown("""
|
1149 |
Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
|
1150 |
+
1. Pick out some figures from the Colab Notebook and write a short summary of the results. Add them to your overleaf paper
|
1151 |
+
- Colab [Link](https://colab.research.google.com/drive/1ScwSa8WBcOMCloXsTV5TPFoVrcPHXlW2#scrollTo=VDMRGRbSR0gc)
|
1152 |
+
- Overleaf [Link](https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13)
|
1153 |
+
2. Update your literature review section in the overleaf paper, given the homework.
|
1154 |
|
1155 |
**Due Date:** End of Week 5
|
1156 |
""")
|
app/pages/week_7.py
ADDED
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import plotly.express as px
|
7 |
+
import plotly.graph_objects as go
|
8 |
+
from plotly.subplots import make_subplots
|
9 |
+
|
10 |
+
# Module-wide plotting defaults: reset matplotlib to its stock style first,
# then apply seaborn's "whitegrid" theme with the "husl" palette on top.
plt.style.use('default')
sns.set_theme(palette="husl", style="whitegrid")
|
13 |
+
|
14 |
+
def load_titanic_data():
    """Fetch the Titanic passenger CSV and return it as a DataFrame.

    The file is read over HTTP from the datasciencedojo/datasets GitHub
    repository each time this function is called; no caching happens here.
    """
    return pd.read_csv(
        "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    )
|
19 |
+
|
20 |
+
def create_categorical_plot(df, column, target='Survived'):
    """Return a Plotly bar chart of the mean of ``target`` per ``column`` value.

    Args:
        df: DataFrame containing both ``column`` and ``target``.
        column: Name of the column to group by (shown on the x axis).
        target: Name of the column that is averaged within each group
            (shown on the y axis and used for the bar colour).

    Returns:
        The figure produced by ``px.bar``, with a dark background and white
        font applied.
    """
    # Aggregate first so the chart call only deals with presentation.
    mean_by_group = df.groupby(column)[target].mean().reset_index()
    dark_background = 'rgb(30, 30, 30)'

    fig = px.bar(
        mean_by_group,
        x=column,
        y=target,
        title=f'Survival Rate by {column}',
        labels={target: 'Survival Rate', column: column},
        color=target,
        color_continuous_scale='RdBu',
    )
    fig.update_layout(
        plot_bgcolor=dark_background,
        paper_bgcolor=dark_background,
        font={'color': 'white'},
    )
    return fig
|
37 |
+
|
38 |
+
def create_numeric_plot(df, column, target='Survived'):
    """Return a Plotly box plot of ``column`` split by the values of ``target``.

    Args:
        df: DataFrame containing both ``column`` and ``target``.
        column: Name of the column whose distribution is drawn (y axis).
        target: Name of the column used for the x axis and the box colour.

    Returns:
        The figure produced by ``px.box``, with a dark background and white
        font applied.
    """
    dark_background = 'rgb(30, 30, 30)'
    axis_labels = {target: 'Survived', column: column}

    fig = px.box(
        df,
        x=target,
        y=column,
        title=f'{column} Distribution by Survival',
        labels=axis_labels,
        color=target,
        color_discrete_sequence=px.colors.qualitative.Set1,
    )
    fig.update_layout(
        plot_bgcolor=dark_background,
        paper_bgcolor=dark_background,
        font={'color': 'white'},
    )
    return fig
|
55 |
+
|
56 |
+
def show():
    """Render the Week 7 page: data cleaning and EDA on the Titanic dataset.

    Sections are rendered top to bottom: course overview, dataset summary,
    missing-value chart, cleaning, interactive categorical and numeric plots,
    reference code, a three-question quiz, engineered features, conclusion
    and resource links. Takes no arguments and returns nothing; all output
    goes through Streamlit calls, and quiz state is kept in
    ``st.session_state.quiz_submitted``.
    """
    st.title("Week 7: Data Cleaning and EDA with Categorical Variables")

    # Introduction Section
    st.header("Course Overview")
    st.write("""
    This week, we'll explore data cleaning and exploratory data analysis (EDA) with a focus on categorical variables.
    We'll use the Titanic dataset to demonstrate:
    - Data cleaning techniques
    - Handling missing values
    - Analyzing categorical variables
    - Creating meaningful visualizations
    - Feature engineering
    """)

    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding the Dataset: Titanic passenger data
    2. Data Cleaning: Handling missing values and outliers
    3. Categorical Variables: Analysis and visualization
    4. Feature Engineering: Creating new features
    5. Data Visualization: Interactive plots and insights
    6. Practical Applications: Real-world data analysis
    """)

    # Load Data
    st.header("The Dataset")
    st.write("""
    We'll be working with the Titanic dataset, which contains information about passengers aboard the Titanic.
    The dataset includes both categorical and numerical variables, making it perfect for learning data cleaning and EDA.
    """)

    df = load_titanic_data()

    # Display basic information
    st.subheader("Dataset Overview")
    st.write(f"Number of rows: {len(df)}")
    st.write(f"Number of columns: {len(df.columns)}")

    # Display missing values
    st.subheader("Missing Values Analysis")
    missing_values = df.isnull().sum()
    fig_missing = px.bar(
        x=missing_values.index,
        y=missing_values.values,
        title='Missing Values by Column',
        labels={'x': 'Columns', 'y': 'Number of Missing Values'}
    )
    fig_missing.update_layout(
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    st.plotly_chart(fig_missing)

    # Data Cleaning Section
    st.header("Data Cleaning")

    # Handle missing values
    st.subheader("Handling Missing Values")
    st.write("""
    Let's clean the data by:
    1. Filling missing Age values with median
    2. Filling missing Embarked values with mode
    3. Creating a new feature for Cabin availability
    """)

    # Work on a copy so the freshly loaded frame stays untouched.
    df_cleaned = df.copy()

    # Assign the filled Series back instead of calling
    # ``df_cleaned[col].fillna(..., inplace=True)``: that chained in-place form
    # acts on a temporary selection, is deprecated in pandas 2.x, and leaves
    # df_cleaned unchanged when Copy-on-Write is enabled.
    df_cleaned['Age'] = df_cleaned['Age'].fillna(df_cleaned['Age'].median())
    df_cleaned['Embarked'] = df_cleaned['Embarked'].fillna(
        df_cleaned['Embarked'].mode()[0]
    )
    # 1 when a Cabin value is present, 0 when it is missing.
    df_cleaned['HasCabin'] = df_cleaned['Cabin'].notna().astype(int)

    # Categorical Variables Analysis
    st.header("Categorical Variables Analysis")

    # Select categorical column to analyze
    categorical_cols = ['Pclass', 'Sex', 'Embarked', 'HasCabin']
    selected_col = st.selectbox(
        "Select Categorical Variable to Analyze",
        categorical_cols
    )

    # Create and display categorical plot
    fig_cat = create_categorical_plot(df_cleaned, selected_col)
    st.plotly_chart(fig_cat)

    # Numeric Variables Analysis
    st.header("Numeric Variables Analysis")

    # Select numeric column to analyze
    numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']
    selected_num_col = st.selectbox(
        "Select Numeric Variable to Analyze",
        numeric_cols
    )

    # Create and display numeric plot
    fig_num = create_numeric_plot(df_cleaned, selected_num_col)
    st.plotly_chart(fig_num)

    # Reference Code Section
    st.header("Reference Code")
    st.write("""
    Below is the reference code for the data cleaning and analysis we just performed.
    Study this code to understand how we implemented the analysis.
    """)

    with st.expander("View Reference Code"):
        # The snippet mirrors the cleaning code above, including the
        # assignment-based fillna, so students see what the page actually runs.
        st.code("""
        # Data Cleaning
        df_cleaned = df.copy()
        df_cleaned['Age'] = df_cleaned['Age'].fillna(df_cleaned['Age'].median())
        df_cleaned['Embarked'] = df_cleaned['Embarked'].fillna(df_cleaned['Embarked'].mode()[0])
        df_cleaned['HasCabin'] = df_cleaned['Cabin'].notna().astype(int)

        # Categorical Analysis
        def create_categorical_plot(df, column, target='Survived'):
            fig = px.bar(
                df.groupby(column)[target].mean().reset_index(),
                x=column,
                y=target,
                title=f'Survival Rate by {column}',
                labels={target: 'Survival Rate', column: column},
                color=target,
                color_continuous_scale='RdBu'
            )
            return fig

        # Numeric Analysis
        def create_numeric_plot(df, column, target='Survived'):
            fig = px.box(
                df,
                x=target,
                y=column,
                title=f'{column} Distribution by Survival',
                labels={target: 'Survived', column: column},
                color=target,
                color_discrete_sequence=px.colors.qualitative.Set1
            )
            return fig

        # Feature Engineering
        df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
        df_cleaned['AgeGroup'] = pd.cut(
            df_cleaned['Age'],
            bins=[0, 12, 18, 35, 60, 100],
            labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
        )
        df_cleaned['FarePerPerson'] = df_cleaned['Fare'] / df_cleaned['FamilySize']
        """, language="python")

    # Knowledge Check Quiz
    st.header("Knowledge Check")
    st.write("Test your understanding of the concepts covered in this section.")

    # Initialize session state for quiz if not exists
    if 'quiz_submitted' not in st.session_state:
        st.session_state.quiz_submitted = False

    # Quiz questions; "correct" is the index of the right entry in "options".
    questions = {
        "q1": {
            "question": "What is the best way to handle missing values in the 'Age' column?",
            "options": [
                "Fill with 0",
                "Fill with the median age",
                "Remove all rows with missing age",
                "Fill with the mean age"
            ],
            "correct": 1
        },
        "q2": {
            "question": "Why do we create the 'HasCabin' feature?",
            "options": [
                "To reduce the number of missing values",
                "To create a binary indicator for cabin availability",
                "To make the data more complex",
                "To remove the Cabin column"
            ],
            "correct": 1
        },
        "q3": {
            "question": "What does the FamilySize feature represent?",
            "options": [
                "Number of siblings only",
                "Number of parents only",
                "Total family members (including the passenger)",
                "Number of children only"
            ],
            "correct": 2
        }
    }

    # Display quiz if not submitted
    if not st.session_state.quiz_submitted:
        answers = {}
        for q_id, q_data in questions.items():
            st.write(f"**{q_data['question']}**")
            # The radios share a label, so the question id doubles as the
            # widget key to keep them distinct.
            answers[q_id] = st.radio(
                "Select your answer:",
                q_data["options"],
                key=q_id
            )

        # NOTE(review): score and answers are written only inside this button
        # branch, i.e. in the run triggered by the click — confirm that is the
        # intended behaviour for later reruns.
        if st.button("Submit Quiz"):
            # Calculate score
            score = sum(1 for q_id, q_data in questions.items()
                        if answers[q_id] == q_data["options"][q_data["correct"]])

            # Show results
            st.write(f"Your score: {score}/{len(questions)}")

            # Show correct answers
            st.write("Correct answers:")
            for q_id, q_data in questions.items():
                st.write(f"- {q_data['question']}")
                st.write(f" Correct answer: {q_data['options'][q_data['correct']]}")

            st.session_state.quiz_submitted = True

    # Reset quiz button
    if st.session_state.quiz_submitted:
        if st.button("Take Quiz Again"):
            st.session_state.quiz_submitted = False
            st.rerun()

    # Feature Engineering
    st.header("Feature Engineering")
    st.write("""
    Let's create some new features:
    1. Family Size = SibSp + Parch + 1
    2. Age Groups
    3. Fare per Person
    """)

    # Create new features
    df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
    df_cleaned['AgeGroup'] = pd.cut(
        df_cleaned['Age'],
        bins=[0, 12, 18, 35, 60, 100],
        labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
    )
    df_cleaned['FarePerPerson'] = df_cleaned['Fare'] / df_cleaned['FamilySize']

    # Display new features
    st.subheader("New Features Analysis")

    # Family Size Analysis
    fig_family = create_categorical_plot(df_cleaned, 'FamilySize')
    st.plotly_chart(fig_family)

    # Age Group Analysis
    fig_age = create_categorical_plot(df_cleaned, 'AgeGroup')
    st.plotly_chart(fig_age)

    # Conclusion
    st.header("Conclusion")
    st.write("""
    Through this analysis, we've learned:
    - How to handle missing values in real-world datasets
    - Techniques for analyzing categorical variables
    - Methods for creating meaningful visualizations
    - Feature engineering approaches
    - Best practices for data cleaning and EDA
    """)

    # Additional Resources
    st.header("Additional Resources")
    st.write("""
    - [Pandas Documentation](https://pandas.pydata.org/docs/)
    - [Seaborn Documentation](https://seaborn.pydata.org/)
    - [Plotly Documentation](https://plotly.com/python/)
    - [Data Cleaning Best Practices](https://towardsdatascience.com/data-cleaning-steps-and-process-8ae2d0f5147)
    - [Colab Notebook](https://colab.research.google.com/drive/1ScwSa8WBcOMCloXsTV5TPFoVrcPHXlW2#scrollTo=VDMRGRbSR0gc)
    - [Overleaf Project](https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13)
    """)
|