raymondEDS committed on
Commit
63732ac
·
1 Parent(s): 4a23d33

mx homework

Browse files
app/.DS_Store CHANGED
Binary files a/app/.DS_Store and b/app/.DS_Store differ
 
app/__pycache__/main.cpython-311.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ
 
app/main.py CHANGED
@@ -23,6 +23,7 @@ from app.pages import week_3
23
  from app.pages import week_4
24
  from app.pages import week_5
25
  from app.pages import week_6
 
26
  # Page configuration
27
  st.set_page_config(
28
  page_title="Data Science Course App",
@@ -151,6 +152,8 @@ def show_week_content():
151
  week_5.show()
152
  elif st.session_state.current_week == 6:
153
  week_6.show()
 
 
154
  else:
155
  st.warning("Content for this week is not yet available.")
156
 
@@ -163,7 +166,7 @@ def main():
163
  return
164
 
165
  # User is logged in, show course content
166
- if st.session_state.current_week in [1, 2, 3, 4, 5, 6]:
167
  show_week_content()
168
  else:
169
  st.title("Data Science Research Paper Course")
 
23
  from app.pages import week_4
24
  from app.pages import week_5
25
  from app.pages import week_6
26
+ from app.pages import week_7
27
  # Page configuration
28
  st.set_page_config(
29
  page_title="Data Science Course App",
 
152
  week_5.show()
153
  elif st.session_state.current_week == 6:
154
  week_6.show()
155
+ elif st.session_state.current_week == 7:
156
+ week_7.show()
157
  else:
158
  st.warning("Content for this week is not yet available.")
159
 
 
166
  return
167
 
168
  # User is logged in, show course content
169
+ if st.session_state.current_week in [1, 2, 3, 4, 5, 6, 7]:
170
  show_week_content()
171
  else:
172
  st.title("Data Science Research Paper Course")
app/pages/.DS_Store CHANGED
Binary files a/app/pages/.DS_Store and b/app/pages/.DS_Store differ
 
app/pages/__pycache__/week_7.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
app/pages/week_5.py CHANGED
@@ -1147,10 +1147,10 @@ for features in feature_sets:
1147
  if username == "manxiii":
1148
  st.markdown("""
1149
  Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
1150
- 1. Complete the feature engineering pipeline for the ICLR dataset
1151
- 2. Build both simple and multiple linear regression models
1152
- 3. Compare model performance and interpret results
1153
- 4. Submit your findings in a Jupyter notebook
1154
 
1155
  **Due Date:** End of Week 5
1156
  """)
 
1147
  if username == "manxiii":
1148
  st.markdown("""
1149
  Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
1150
+ 1. Pick out some figures from the Colab Notebook and write a short summary of the results. Add them to your overleaf paper
1151
+ - Colab [Link](https://colab.research.google.com/drive/1ScwSa8WBcOMCloXsTV5TPFoVrcPHXlW2#scrollTo=VDMRGRbSR0gc)
1152
+ - Overleaf [Link](https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13)
1153
+ 2. Update your literature review section in the overleaf paper, given the homework.
1154
 
1155
  **Due Date:** End of Week 5
1156
  """)
app/pages/week_7.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from plotly.subplots import make_subplots
9
+
10
# Apply global matplotlib / seaborn styling once, when this module is imported.
# NOTE(review): the page code visible in this file renders only Plotly figures;
# confirm these global style calls are still needed.
plt.style.use("default")
sns.set_theme(palette="husl", style="whitegrid")
13
+
14
@st.cache_data
def load_titanic_data():
    """Load and return the Titanic passenger dataset as a DataFrame.

    The CSV is fetched over HTTP from the datasciencedojo GitHub repository.
    Streamlit re-executes the page script on every widget interaction, so the
    download is wrapped in ``st.cache_data`` to avoid hitting the network on
    each rerun.

    Returns:
        pandas.DataFrame: the parsed contents of ``titanic.csv``.

    Raises:
        Whatever ``pandas.read_csv`` raises when the URL cannot be fetched or
        parsed; no error handling is done here.
    """
    url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    return pd.read_csv(url)
19
+
20
def create_categorical_plot(df, column, target='Survived'):
    """Build an interactive bar chart of the mean of ``target`` per category.

    Args:
        df: DataFrame containing both ``column`` and ``target``.
        column: Name of the column whose distinct values form the bars.
        target: Name of the column averaged within each group; it is labelled
            "Survival Rate" on the chart. Defaults to ``'Survived'``.

    Returns:
        The Plotly figure produced by ``px.bar``, restyled with a dark
        background and white font.
    """
    # ``observed=False`` spells out the grouping behaviour for categorical
    # columns (e.g. one produced by ``pd.cut``): every declared category is
    # kept. Without it pandas >= 2.1 emits a FutureWarning, and the default
    # is slated to change.
    rate_by_group = (
        df.groupby(column, observed=False)[target].mean().reset_index()
    )
    fig = px.bar(
        rate_by_group,
        x=column,
        y=target,
        title=f'Survival Rate by {column}',
        labels={target: 'Survival Rate', column: column},
        color=target,
        color_continuous_scale='RdBu'
    )
    fig.update_layout(
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    return fig
37
+
38
def create_numeric_plot(df, column, target='Survived'):
    """Build an interactive box plot of ``column`` split by ``target``.

    Args:
        df: DataFrame containing both ``column`` and ``target``.
        column: Name of the column plotted on the y axis.
        target: Name of the column used for the x axis and for colouring the
            boxes. Defaults to ``'Survived'``.

    Returns:
        The Plotly figure produced by ``px.box``, restyled with a dark
        background and white font.
    """
    dark_background = 'rgb(30, 30, 30)'
    axis_labels = {target: 'Survived', column: column}

    box_figure = px.box(
        df,
        x=target,
        y=column,
        color=target,
        labels=axis_labels,
        title=f'{column} Distribution by Survival',
        color_discrete_sequence=px.colors.qualitative.Set1,
    )
    box_figure.update_layout(
        font={'color': 'white'},
        paper_bgcolor=dark_background,
        plot_bgcolor=dark_background,
    )
    return box_figure
55
+
56
def show():
    """Render the Week 7 page: data cleaning and EDA on the Titanic dataset.

    The page is drawn top to bottom with Streamlit calls: course overview,
    dataset summary and missing-value chart, data cleaning, interactive
    categorical and numeric plots, a reference-code expander, a three-question
    knowledge check, engineered features, a conclusion and resource links.

    Session state keys used:
        quiz_submitted: bool, whether the quiz has been submitted.
        quiz_score: int, the score of the last submission.
        q1, q2, q3: keys of the quiz radio widgets.

    Returns:
        None. All output is rendered through Streamlit.
    """
    st.title("Week 7: Data Cleaning and EDA with Categorical Variables")

    # Introduction Section
    st.header("Course Overview")
    st.write("""
    This week, we'll explore data cleaning and exploratory data analysis (EDA) with a focus on categorical variables.
    We'll use the Titanic dataset to demonstrate:
    - Data cleaning techniques
    - Handling missing values
    - Analyzing categorical variables
    - Creating meaningful visualizations
    - Feature engineering
    """)

    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding the Dataset: Titanic passenger data
    2. Data Cleaning: Handling missing values and outliers
    3. Categorical Variables: Analysis and visualization
    4. Feature Engineering: Creating new features
    5. Data Visualization: Interactive plots and insights
    6. Practical Applications: Real-world data analysis
    """)

    # Load Data
    st.header("The Dataset")
    st.write("""
    We'll be working with the Titanic dataset, which contains information about passengers aboard the Titanic.
    The dataset includes both categorical and numerical variables, making it perfect for learning data cleaning and EDA.
    """)

    df = load_titanic_data()

    # Display basic information
    st.subheader("Dataset Overview")
    st.write(f"Number of rows: {len(df)}")
    st.write(f"Number of columns: {len(df.columns)}")

    # Display missing values
    st.subheader("Missing Values Analysis")
    missing_values = df.isnull().sum()
    fig_missing = px.bar(
        x=missing_values.index,
        y=missing_values.values,
        title='Missing Values by Column',
        labels={'x': 'Columns', 'y': 'Number of Missing Values'}
    )
    fig_missing.update_layout(
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    st.plotly_chart(fig_missing)

    # Data Cleaning Section
    st.header("Data Cleaning")

    # Handle missing values
    st.subheader("Handling Missing Values")
    st.write("""
    Let's clean the data by:
    1. Filling missing Age values with median
    2. Filling missing Embarked values with mode
    3. Creating a new feature for Cabin availability
    """)

    # Work on a copy so the loaded frame stays untouched.
    df_cleaned = df.copy()

    # Assign the filled column back instead of calling
    # ``df_cleaned[col].fillna(..., inplace=True)``: that chained in-place
    # form is deprecated in pandas 2.x and does not modify ``df_cleaned``
    # when Copy-on-Write is enabled.
    df_cleaned['Age'] = df_cleaned['Age'].fillna(df_cleaned['Age'].median())
    df_cleaned['Embarked'] = df_cleaned['Embarked'].fillna(
        df_cleaned['Embarked'].mode()[0]
    )
    df_cleaned['HasCabin'] = df_cleaned['Cabin'].notna().astype(int)

    # Categorical Variables Analysis
    st.header("Categorical Variables Analysis")

    # Select categorical column to analyze
    categorical_cols = ['Pclass', 'Sex', 'Embarked', 'HasCabin']
    selected_col = st.selectbox(
        "Select Categorical Variable to Analyze",
        categorical_cols
    )

    # Create and display categorical plot
    fig_cat = create_categorical_plot(df_cleaned, selected_col)
    st.plotly_chart(fig_cat)

    # Numeric Variables Analysis
    st.header("Numeric Variables Analysis")

    # Select numeric column to analyze
    numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']
    selected_num_col = st.selectbox(
        "Select Numeric Variable to Analyze",
        numeric_cols
    )

    # Create and display numeric plot
    fig_num = create_numeric_plot(df_cleaned, selected_num_col)
    st.plotly_chart(fig_num)

    # Reference Code Section
    st.header("Reference Code")
    st.write("""
    Below is the reference code for the data cleaning and analysis we just performed.
    Study this code to understand how we implemented the analysis.
    """)

    # The snippet mirrors the cleaning code executed above, including the
    # assignment form of ``fillna``.
    with st.expander("View Reference Code"):
        st.code("""
    # Data Cleaning
    df_cleaned = df.copy()
    df_cleaned['Age'] = df_cleaned['Age'].fillna(df_cleaned['Age'].median())
    df_cleaned['Embarked'] = df_cleaned['Embarked'].fillna(df_cleaned['Embarked'].mode()[0])
    df_cleaned['HasCabin'] = df_cleaned['Cabin'].notna().astype(int)

    # Categorical Analysis
    def create_categorical_plot(df, column, target='Survived'):
        fig = px.bar(
            df.groupby(column)[target].mean().reset_index(),
            x=column,
            y=target,
            title=f'Survival Rate by {column}',
            labels={target: 'Survival Rate', column: column},
            color=target,
            color_continuous_scale='RdBu'
        )
        return fig

    # Numeric Analysis
    def create_numeric_plot(df, column, target='Survived'):
        fig = px.box(
            df,
            x=target,
            y=column,
            title=f'{column} Distribution by Survival',
            labels={target: 'Survived', column: column},
            color=target,
            color_discrete_sequence=px.colors.qualitative.Set1
        )
        return fig

    # Feature Engineering
    df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
    df_cleaned['AgeGroup'] = pd.cut(
        df_cleaned['Age'],
        bins=[0, 12, 18, 35, 60, 100],
        labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
    )
    df_cleaned['FarePerPerson'] = df_cleaned['Fare'] / df_cleaned['FamilySize']
    """, language="python")

    # Knowledge Check Quiz
    st.header("Knowledge Check")
    st.write("Test your understanding of the concepts covered in this section.")

    # Initialize session state for quiz if not exists
    if 'quiz_submitted' not in st.session_state:
        st.session_state.quiz_submitted = False

    # Quiz questions; "correct" is the index into "options".
    questions = {
        "q1": {
            "question": "What is the best way to handle missing values in the 'Age' column?",
            "options": [
                "Fill with 0",
                "Fill with the median age",
                "Remove all rows with missing age",
                "Fill with the mean age"
            ],
            "correct": 1
        },
        "q2": {
            "question": "Why do we create the 'HasCabin' feature?",
            "options": [
                "To reduce the number of missing values",
                "To create a binary indicator for cabin availability",
                "To make the data more complex",
                "To remove the Cabin column"
            ],
            "correct": 1
        },
        "q3": {
            "question": "What does the FamilySize feature represent?",
            "options": [
                "Number of siblings only",
                "Number of parents only",
                "Total family members (including the passenger)",
                "Number of children only"
            ],
            "correct": 2
        }
    }

    if not st.session_state.quiz_submitted:
        # Quiz not yet submitted: show the questions.
        answers = {}
        for q_id, q_data in questions.items():
            st.write(f"**{q_data['question']}**")
            answers[q_id] = st.radio(
                "Select your answer:",
                q_data["options"],
                key=q_id
            )

        if st.button("Submit Quiz"):
            # ``st.button`` is True only for the run triggered by the click,
            # so the score is stored in session state and the results are
            # rendered from there; otherwise any later widget interaction
            # would make them disappear.
            st.session_state.quiz_score = sum(
                1 for q_id, q_data in questions.items()
                if answers[q_id] == q_data["options"][q_data["correct"]]
            )
            st.session_state.quiz_submitted = True
            st.rerun()
    else:
        # Quiz submitted: show the stored score and the correct answers.
        score = st.session_state.get("quiz_score", 0)
        st.write(f"Your score: {score}/{len(questions)}")

        st.write("Correct answers:")
        for q_id, q_data in questions.items():
            st.write(f"- {q_data['question']}")
            st.write(f" Correct answer: {q_data['options'][q_data['correct']]}")

        # Reset quiz button
        if st.button("Take Quiz Again"):
            st.session_state.quiz_submitted = False
            st.rerun()

    # Feature Engineering
    st.header("Feature Engineering")
    st.write("""
    Let's create some new features:
    1. Family Size = SibSp + Parch + 1
    2. Age Groups
    3. Fare per Person
    """)

    # Create new features
    df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
    df_cleaned['AgeGroup'] = pd.cut(
        df_cleaned['Age'],
        bins=[0, 12, 18, 35, 60, 100],
        labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
    )
    df_cleaned['FarePerPerson'] = df_cleaned['Fare'] / df_cleaned['FamilySize']

    # Display new features
    st.subheader("New Features Analysis")

    # Family Size Analysis
    fig_family = create_categorical_plot(df_cleaned, 'FamilySize')
    st.plotly_chart(fig_family)

    # Age Group Analysis
    fig_age = create_categorical_plot(df_cleaned, 'AgeGroup')
    st.plotly_chart(fig_age)

    # Conclusion
    st.header("Conclusion")
    st.write("""
    Through this analysis, we've learned:
    - How to handle missing values in real-world datasets
    - Techniques for analyzing categorical variables
    - Methods for creating meaningful visualizations
    - Feature engineering approaches
    - Best practices for data cleaning and EDA
    """)

    # Additional Resources
    st.header("Additional Resources")
    st.write("""
    - [Pandas Documentation](https://pandas.pydata.org/docs/)
    - [Seaborn Documentation](https://seaborn.pydata.org/)
    - [Plotly Documentation](https://plotly.com/python/)
    - [Data Cleaning Best Practices](https://towardsdatascience.com/data-cleaning-steps-and-process-8ae2d0f5147)
    - [Colab Notebook](https://colab.research.google.com/drive/1ScwSa8WBcOMCloXsTV5TPFoVrcPHXlW2#scrollTo=VDMRGRbSR0gc)
    - [Overleaf Project](https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13)
    """)