Spaces:

raymondEDS
/

DS_webclass

Running

App Files Files Community

raymondEDS committed on Jun 10

Commit

63732ac

1 Parent(s): 4a23d33

mx homework

Browse files

Files changed (7) hide show

app/.DS_Store +0 -0
app/__pycache__/main.cpython-311.pyc +0 -0
app/main.py +4 -1
app/pages/.DS_Store +0 -0
app/pages/__pycache__/week_7.cpython-311.pyc +0 -0
app/pages/week_5.py +4 -4
app/pages/week_7.py +337 -0

app/.DS_Store CHANGED Viewed

Binary files a/app/.DS_Store and b/app/.DS_Store differ

app/__pycache__/main.cpython-311.pyc CHANGED Viewed

Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ

app/main.py CHANGED Viewed

@@ -23,6 +23,7 @@ from app.pages import week_3
 from app.pages import week_4
 from app.pages import week_5
 from app.pages import week_6
 # Page configuration
 st.set_page_config(
     page_title="Data Science Course App",
@@ -151,6 +152,8 @@ def show_week_content():
         week_5.show()
     elif st.session_state.current_week == 6:
         week_6.show()
     else:
         st.warning("Content for this week is not yet available.")
@@ -163,7 +166,7 @@ def main():
         return
     # User is logged in, show course content
-    if st.session_state.current_week in [1, 2, 3, 4, 5, 6]:
         show_week_content()
     else:
         st.title("Data Science Research Paper Course")

 from app.pages import week_4
 from app.pages import week_5
 from app.pages import week_6
+from app.pages import week_7
 # Page configuration
 st.set_page_config(
     page_title="Data Science Course App",
         week_5.show()
     elif st.session_state.current_week == 6:
         week_6.show()
+    elif st.session_state.current_week == 7:
+        week_7.show()
     else:
         st.warning("Content for this week is not yet available.")
         return
     # User is logged in, show course content
+    if st.session_state.current_week in [1, 2, 3, 4, 5, 6, 7]:
         show_week_content()
     else:
         st.title("Data Science Research Paper Course")

app/pages/.DS_Store CHANGED Viewed

Binary files a/app/pages/.DS_Store and b/app/pages/.DS_Store differ

app/pages/__pycache__/week_7.cpython-311.pyc ADDED Viewed

Binary file (14 kB). View file

app/pages/week_5.py CHANGED Viewed

@@ -1147,10 +1147,10 @@ for features in feature_sets:
     if username == "manxiii":
         st.markdown("""
         Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
-        1. Complete the feature engineering pipeline for the ICLR dataset
-        2. Build both simple and multiple linear regression models
-        3. Compare model performance and interpret results
-        4. Submit your findings in a Jupyter notebook
         **Due Date:** End of Week 5
         """)

     if username == "manxiii":
         st.markdown("""
         Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
+        1. Pick out some figures from the Colab Notebook and write a short summary of the results. Add them to your overleaf paper
+                    - Colab [Link](https://colab.research.google.com/drive/1ScwSa8WBcOMCloXsTV5TPFoVrcPHXlW2#scrollTo=VDMRGRbSR0gc)
+                    - Overleaf [Link](https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13)
+        2. Update your literature review section in the overleaf paper, given the homework.
         **Due Date:** End of Week 5
         """)

app/pages/week_7.py ADDED Viewed

	@@ -0,0 +1,337 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+# Set up the style for all plots.
+# These two calls run once, at import time, and change process-wide
+# matplotlib / seaborn defaults.
+# NOTE(review): the chart helpers visible in this file build their figures
+# with Plotly Express (px.bar / px.box), which presumably ignores these
+# settings -- confirm whether this setup is still needed.
+plt.style.use('default')
+sns.set_theme(style="whitegrid", palette="husl")
+def load_titanic_data():
+    """Load and return the Titanic dataset"""
+    url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
+    df = pd.read_csv(url)
+    return df
+def create_categorical_plot(df, column, target='Survived'):
+    """Create an interactive plot for categorical variables"""
+    fig = px.bar(
+        df.groupby(column)[target].mean().reset_index(),
+        x=column,
+        y=target,
+        title=f'Survival Rate by {column}',
+        labels={target: 'Survival Rate', column: column},
+        color=target,
+        color_continuous_scale='RdBu'
+    )
+    fig.update_layout(
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white')
+    )
+    return fig
+def create_numeric_plot(df, column, target='Survived'):
+    """Create an interactive plot for numeric variables"""
+    fig = px.box(
+        df,
+        x=target,
+        y=column,
+        title=f'{column} Distribution by Survival',
+        labels={target: 'Survived', column: column},
+        color=target,
+        color_discrete_sequence=px.colors.qualitative.Set1
+    )
+    fig.update_layout(
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white')
+    )
+    return fig
+def show():
+    st.title("Week 7: Data Cleaning and EDA with Categorical Variables")
+    # Introduction Section
+    st.header("Course Overview")
+    st.write("""
+    This week, we'll explore data cleaning and exploratory data analysis (EDA) with a focus on categorical variables.
+    We'll use the Titanic dataset to demonstrate:
+    - Data cleaning techniques
+    - Handling missing values
+    - Analyzing categorical variables
+    - Creating meaningful visualizations
+    - Feature engineering
+    """)
+    # Learning Path
+    st.subheader("Learning Path")
+    st.write("""
+    1. Understanding the Dataset: Titanic passenger data
+    2. Data Cleaning: Handling missing values and outliers
+    3. Categorical Variables: Analysis and visualization
+    4. Feature Engineering: Creating new features
+    5. Data Visualization: Interactive plots and insights
+    6. Practical Applications: Real-world data analysis
+    """)
+    # Load Data
+    st.header("The Dataset")
+    st.write("""
+    We'll be working with the Titanic dataset, which contains information about passengers aboard the Titanic.
+    The dataset includes both categorical and numerical variables, making it perfect for learning data cleaning and EDA.
+    """)
+    df = load_titanic_data()
+    # Display basic information
+    st.subheader("Dataset Overview")
+    st.write(f"Number of rows: {len(df)}")
+    st.write(f"Number of columns: {len(df.columns)}")
+    # Display missing values
+    st.subheader("Missing Values Analysis")
+    missing_values = df.isnull().sum()
+    fig_missing = px.bar(
+        x=missing_values.index,
+        y=missing_values.values,
+        title='Missing Values by Column',
+        labels={'x': 'Columns', 'y': 'Number of Missing Values'}
+    )
+    fig_missing.update_layout(
+        title_x=0.5,
+        title_font_size=20,
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white')
+    )
+    st.plotly_chart(fig_missing)
+    # Data Cleaning Section
+    st.header("Data Cleaning")
+    # Handle missing values
+    st.subheader("Handling Missing Values")
+    st.write("""
+    Let's clean the data by:
+    1. Filling missing Age values with median
+    2. Filling missing Embarked values with mode
+    3. Creating a new feature for Cabin availability
+    """)
+    # Create a copy for cleaning
+    df_cleaned = df.copy()
+    # Fill missing values
+    df_cleaned['Age'].fillna(df_cleaned['Age'].median(), inplace=True)
+    df_cleaned['Embarked'].fillna(df_cleaned['Embarked'].mode()[0], inplace=True)
+    df_cleaned['HasCabin'] = df_cleaned['Cabin'].notna().astype(int)
+    # Categorical Variables Analysis
+    st.header("Categorical Variables Analysis")
+    # Select categorical column to analyze
+    categorical_cols = ['Pclass', 'Sex', 'Embarked', 'HasCabin']
+    selected_col = st.selectbox(
+        "Select Categorical Variable to Analyze",
+        categorical_cols
+    )
+    # Create and display categorical plot
+    fig_cat = create_categorical_plot(df_cleaned, selected_col)
+    st.plotly_chart(fig_cat)
+    # Numeric Variables Analysis
+    st.header("Numeric Variables Analysis")
+    # Select numeric column to analyze
+    numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']
+    selected_num_col = st.selectbox(
+        "Select Numeric Variable to Analyze",
+        numeric_cols
+    )
+    # Create and display numeric plot
+    fig_num = create_numeric_plot(df_cleaned, selected_num_col)
+    st.plotly_chart(fig_num)
+    # Reference Code Section
+    st.header("Reference Code")
+    st.write("""
+    Below is the reference code for the data cleaning and analysis we just performed.
+    Study this code to understand how we implemented the analysis.
+    """)
+    with st.expander("View Reference Code"):
+        st.code("""
+# Data Cleaning
+df_cleaned = df.copy()
+df_cleaned['Age'].fillna(df_cleaned['Age'].median(), inplace=True)
+df_cleaned['Embarked'].fillna(df_cleaned['Embarked'].mode()[0], inplace=True)
+df_cleaned['HasCabin'] = df_cleaned['Cabin'].notna().astype(int)
+# Categorical Analysis
+def create_categorical_plot(df, column, target='Survived'):
+    fig = px.bar(
+        df.groupby(column)[target].mean().reset_index(),
+        x=column,
+        y=target,
+        title=f'Survival Rate by {column}',
+        labels={target: 'Survival Rate', column: column},
+        color=target,
+        color_continuous_scale='RdBu'
+    )
+    return fig
+# Numeric Analysis
+def create_numeric_plot(df, column, target='Survived'):
+    fig = px.box(
+        df,
+        x=target,
+        y=column,
+        title=f'{column} Distribution by Survival',
+        labels={target: 'Survived', column: column},
+        color=target,
+        color_discrete_sequence=px.colors.qualitative.Set1
+    )
+    return fig
+# Feature Engineering
+df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
+df_cleaned['AgeGroup'] = pd.cut(
+    df_cleaned['Age'],
+    bins=[0, 12, 18, 35, 60, 100],
+    labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
+)
+df_cleaned['FarePerPerson'] = df_cleaned['Fare'] / df_cleaned['FamilySize']
+        """, language="python")
+    # Knowledge Check Quiz
+    st.header("Knowledge Check")
+    st.write("Test your understanding of the concepts covered in this section.")
+    # Initialize session state for quiz if not exists
+    if 'quiz_submitted' not in st.session_state:
+        st.session_state.quiz_submitted = False
+    # Quiz questions
+    questions = {
+        "q1": {
+            "question": "What is the best way to handle missing values in the 'Age' column?",
+            "options": [
+                "Fill with 0",
+                "Fill with the median age",
+                "Remove all rows with missing age",
+                "Fill with the mean age"
+            ],
+            "correct": 1
+        },
+        "q2": {
+            "question": "Why do we create the 'HasCabin' feature?",
+            "options": [
+                "To reduce the number of missing values",
+                "To create a binary indicator for cabin availability",
+                "To make the data more complex",
+                "To remove the Cabin column"
+            ],
+            "correct": 1
+        },
+        "q3": {
+            "question": "What does the FamilySize feature represent?",
+            "options": [
+                "Number of siblings only",
+                "Number of parents only",
+                "Total family members (including the passenger)",
+                "Number of children only"
+            ],
+            "correct": 2
+        }
+    }
+    # Display quiz if not submitted
+    if not st.session_state.quiz_submitted:
+        answers = {}
+        for q_id, q_data in questions.items():
+            st.write(f"**{q_data['question']}**")
+            answers[q_id] = st.radio(
+                "Select your answer:",
+                q_data["options"],
+                key=q_id
+            )
+        if st.button("Submit Quiz"):
+            # Calculate score
+            score = sum(1 for q_id, q_data in questions.items()
+                       if answers[q_id] == q_data["options"][q_data["correct"]])
+            # Show results
+            st.write(f"Your score: {score}/{len(questions)}")
+            # Show correct answers
+            st.write("Correct answers:")
+            for q_id, q_data in questions.items():
+                st.write(f"- {q_data['question']}")
+                st.write(f"  Correct answer: {q_data['options'][q_data['correct']]}")
+            st.session_state.quiz_submitted = True
+    # Reset quiz button
+    if st.session_state.quiz_submitted:
+        if st.button("Take Quiz Again"):
+            st.session_state.quiz_submitted = False
+            st.rerun()
+    # Feature Engineering
+    st.header("Feature Engineering")
+    st.write("""
+    Let's create some new features:
+    1. Family Size = SibSp + Parch + 1
+    2. Age Groups
+    3. Fare per Person
+    """)
+    # Create new features
+    df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
+    df_cleaned['AgeGroup'] = pd.cut(
+        df_cleaned['Age'],
+        bins=[0, 12, 18, 35, 60, 100],
+        labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
+    )
+    df_cleaned['FarePerPerson'] = df_cleaned['Fare'] / df_cleaned['FamilySize']
+    # Display new features
+    st.subheader("New Features Analysis")
+    # Family Size Analysis
+    fig_family = create_categorical_plot(df_cleaned, 'FamilySize')
+    st.plotly_chart(fig_family)
+    # Age Group Analysis
+    fig_age = create_categorical_plot(df_cleaned, 'AgeGroup')
+    st.plotly_chart(fig_age)
+    # Conclusion
+    st.header("Conclusion")
+    st.write("""
+    Through this analysis, we've learned:
+    - How to handle missing values in real-world datasets
+    - Techniques for analyzing categorical variables
+    - Methods for creating meaningful visualizations
+    - Feature engineering approaches
+    - Best practices for data cleaning and EDA
+    """)
+    # Additional Resources
+    st.header("Additional Resources")
+    st.write("""
+    - [Pandas Documentation](https://pandas.pydata.org/docs/)
+    - [Seaborn Documentation](https://seaborn.pydata.org/)
+    - [Plotly Documentation](https://plotly.com/python/)
+    - [Data Cleaning Best Practices](https://towardsdatascience.com/data-cleaning-steps-and-process-8ae2d0f5147)
+    - [Colab Notebook](https://colab.research.google.com/drive/1ScwSa8WBcOMCloXsTV5TPFoVrcPHXlW2#scrollTo=VDMRGRbSR0gc)
+    - [Overleaf Project](https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13)
+    """)