Spaces:

raymondEDS
/

DS_webclass

Running

App Files Files Community

raymondEDS committed on May 27

Commit

223d6e3

1 Parent(s): ae38d1c

week 5 final

Browse files

Files changed (2) hide show

app/pages/__pycache__/week_5.cpython-311.pyc +0 -0
app/pages/week_5.py +1060 -214

app/pages/__pycache__/week_5.cpython-311.pyc CHANGED Viewed

Binary files a/app/pages/__pycache__/week_5.cpython-311.pyc and b/app/pages/__pycache__/week_5.cpython-311.pyc differ

app/pages/week_5.py CHANGED Viewed

@@ -4,18 +4,60 @@ import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.linear_model import LinearRegression
-from sklearn.metrics import r2_score
 import scipy.stats as stats
-from nltk.tokenize import word_tokenize
 import plotly.express as px
 import plotly.graph_objects as go
 from pathlib import Path
 import os
 # Set up the style for all plots
 plt.style.use('default')
 sns.set_theme(style="whitegrid", palette="husl")
 def load_data():
     """Load and prepare the data"""
     # Get the current file's directory
@@ -31,45 +73,166 @@ def load_data():
         df_dec = pd.read_csv(data_dir / "decision.csv")
         df_keyword = pd.read_csv(data_dir / "submission_keyword.csv")
         return df_reviews, df_submissions, df_dec, df_keyword
     except FileNotFoundError as e:
         st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
         st.error(f"Error details: {str(e)}")
         return None, None, None, None
 def create_feature_plot(df, x_col, y_col, title):
     """Create an interactive scatter plot using plotly"""
-    fig = px.scatter(df, x=x_col, y=y_col,
                     title=title,
                     labels={x_col: x_col.replace('_', ' ').title(),
                            y_col: y_col.replace('_', ' ').title()},
-                    template="plotly_white")
     fig.update_layout(
         title_x=0.5,
         title_font_size=20,
         showlegend=True,
-        plot_bgcolor='white',
-        paper_bgcolor='white'
     )
     return fig
 def create_correlation_heatmap(df, columns):
     """Create a correlation heatmap using plotly"""
-    corr = df[columns].corr()
     fig = go.Figure(data=go.Heatmap(
         z=corr,
         x=corr.columns,
         y=corr.columns,
         colorscale='RdBu',
-        zmin=-1, zmax=1
     ))
     fig.update_layout(
         title='Feature Correlation Heatmap',
         title_x=0.5,
         title_font_size=20,
-        plot_bgcolor='white',
-        paper_bgcolor='white'
     )
     return fig
 def show():
@@ -101,7 +264,49 @@ def show():
     2. **Correlation Analysis (相关性分析):**
        - Definition: Statistical measure that shows how strongly two variables are related
        - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
     3. **Reading Linear Regression Output (解读线性回归结果):**
        - R-squared (R²): Proportion of variance explained by the model (0-1)
        - p-value: Probability that the observed relationship occurred by chance
@@ -111,230 +316,871 @@ def show():
     """)
     # Load the data
-    try:
-        df_reviews, df_submissions, df_dec, df_keyword = load_data()
-        # Module 1: Data Exploration
-        st.header("Module 1: Data Exploration")
-        st.write("Let's explore our dataset to understand the review patterns:")
-        # Create features from review text
-        df_reviews['word_count'] = df_reviews['review'].apply(lambda x: len(str(x).split()))
-        df_reviews['sentence_count'] = df_reviews['review'].apply(lambda x: len(str(x).split('.')))
-        # Show basic statistics
-        col1, col2 = st.columns(2)
-        with col1:
-            st.metric("Total Reviews", len(df_reviews))
-            st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
-        with col2:
-            st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
-            st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
-        # Create interactive visualizations
-        st.subheader("Review Length vs Rating")
-        fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
-                                'Relationship between Review Length and Rating')
-        st.plotly_chart(fig, use_container_width=True)
-        # Correlation analysis
-        st.subheader("Feature Correlations")
-        corr_fig = create_correlation_heatmap(df_reviews,
-                                            ['word_count', 'rating_int', 'confidence_int'])
-        st.plotly_chart(corr_fig, use_container_width=True)
-        # Module 2: Feature Engineering
-        st.header("Module 2: Feature Engineering")
-        st.write("""
-        Let's create more sophisticated features from our review data:
-        - Review length (word count)
-        - Review rating
-        - Reviewer confidence
-        - Number of keywords in the paper
-        """)
-        # Interactive Feature Engineering
-        st.subheader("Try Feature Engineering")
-        review_text = st.text_area(
-            "Enter a review to analyze:",
-            "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
-            key="review_text"
-        )
-        if st.button("Extract Features"):
-            # Calculate features
-            word_count = len(word_tokenize(review_text))
-            sentence_count = len(review_text.split('.'))
-            # Create a nice display of features
-            col1, col2, col3 = st.columns(3)
             with col1:
-                st.metric("Word Count", word_count)
             with col2:
-                st.metric("Sentence Count", sentence_count)
-            with col3:
-                st.metric("Average Words per Sentence", f"{word_count/sentence_count:.1f}")
-        # Module 3: Linear Regression Analysis
-        st.header("Module 3: Linear Regression Analysis")
-        st.write("""
-        Let's build a linear regression model to predict paper ratings based on review features.
-        """)
-        # Prepare data for modeling
-        X = df_reviews[['word_count', 'confidence_int']]
-        y = df_reviews['rating_int']
-        # Fit regression model
-        model = LinearRegression()
-        model.fit(X, y)
-        # Create 3D visualization of the regression
-        st.subheader("3D Visualization of Review Features")
-        fig = px.scatter_3d(df_reviews.sample(1000),
-                           x='word_count',
-                           y='confidence_int',
-                           z='rating_int',
-                           title='Review Features in 3D Space',
-                           labels={
-                               'word_count': 'Word Count',
-                               'confidence_int': 'Confidence',
-                               'rating_int': 'Rating'
-                           })
-        fig.update_layout(
-            title_x=0.5,
-            title_font_size=20,
-            scene = dict(
-                xaxis_title='Word Count',
-                yaxis_title='Confidence',
-                zaxis_title='Rating'
-            )
-        )
-        st.plotly_chart(fig, use_container_width=True)
-        # Show model metrics
-        st.subheader("Model Performance")
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.metric("R-squared", f"{model.score(X, y):.3f}")
-        with col2:
-            st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
-        with col3:
-            st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")
-        # Practice Exercises
-        st.header("Practice Exercises")
-        with st.expander("Exercise 1: Feature Engineering"):
-            st.write("""
-            1. Load the reviews dataset
-            2. Create features from review text
-            3. Calculate correlation between features
-            4. Visualize relationships
-            """)
-            st.code("""
-            # Solution
-            import pandas as pd
-            import numpy as np
-            from nltk.tokenize import word_tokenize
-            # Load data
-            df_reviews = pd.read_csv('reviews.csv')
-            # Create features
-            df_reviews['word_count'] = df_reviews['review'].apply(
-                lambda x: len(word_tokenize(x)))
-            df_reviews['sentence_count'] = df_reviews['review'].apply(
-                lambda x: len(x.split('.')))
-            # Calculate correlation
-            correlation = df_reviews[['word_count', 'rating_int',
-                                    'confidence_int']].corr()
-            # Visualize
-            sns.heatmap(correlation, annot=True)
-            plt.show()
-            """)
-        with st.expander("Exercise 2: Building a Predictive Model"):
             st.write("""
-            1. Prepare features for modeling
-            2. Split data into training and test sets
-            3. Train a linear regression model
-            4. Evaluate model performance
             """)
-            st.code("""
-            # Solution
-            from sklearn.model_selection import train_test_split
-            from sklearn.linear_model import LinearRegression
-            # Prepare features
-            X = df_reviews[['word_count', 'confidence_int']]
-            y = df_reviews['rating_int']
-            # Split data
-            X_train, X_test, y_train, y_test = train_test_split(
-                X, y, test_size=0.2, random_state=42)
-            # Train model
-            model = LinearRegression()
-            model.fit(X_train, y_train)
-            # Evaluate
-            train_score = model.score(X_train, y_train)
-            test_score = model.score(X_test, y_test)
-            print(f"Training R²: {train_score:.3f}")
-            print(f"Testing R²: {test_score:.3f}")
             """)
-        # Weekly Assignment
-        username = st.session_state.get("username", "Student")
-        st.header(f"{username}'s Weekly Assignment")
-        if username == "manxiii":
-            st.markdown("""
-            Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
-            1. Complete the feature engineering pipeline for the ICLR dataset
-            2. Build a linear regression model to predict paper ratings
-            3. Analyze the relationship between review features and acceptance
-            4. Submit your findings in a Jupyter notebook
-            **Due Date:** End of Week 5
-            """)
-        elif username == "zhu":
-            st.markdown("""
-            Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
-            1. Implement the complete machine learning workflow
-            2. Create insightful visualizations of model results
-            3. Draw conclusions from your analysis
-            4. Submit your work in a Jupyter notebook
-            **Due Date:** End of Week 5
-            """)
-        elif username == "WK":
-            st.markdown("""
-            Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
-            1. Complete the feature engineering pipeline
-            2. Build and evaluate a linear regression model
-            3. Analyze patterns in the data
-            4. Submit your findings
-            **Due Date:** End of Week 5
-            """)
-        else:
-            st.markdown(f"""
-            Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
-            1. Complete the feature engineering pipeline
-            2. Build and evaluate a linear regression model
-            3. Analyze patterns in the data
-            4. Submit your findings
-            **Due Date:** End of Week 5
-            """)
     except Exception as e:
-        st.error(f"Error loading data: {str(e)}")
-        st.write("Please make sure the data files are in the correct location.")

 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.linear_model import LinearRegression
+from sklearn.metrics import r2_score, mean_squared_error
+from sklearn.model_selection import train_test_split
 import scipy.stats as stats
 import plotly.express as px
 import plotly.graph_objects as go
 from pathlib import Path
 import os
+import re
+from plotly.subplots import make_subplots
 # Set up the style for all plots
 plt.style.use('default')
 sns.set_theme(style="whitegrid", palette="husl")
+def simple_word_tokenize(text):
+    """Lowercase ``text`` and return its words as a list of strings.
+
+    The input is coerced with ``str()``. Every character that is neither a
+    word character nor whitespace is replaced by a space, and the result is
+    split on whitespace.
+    """
+    lowered = str(text).lower()
+    # Punctuation becomes a separator, so "well-known" yields two tokens.
+    without_punct = re.sub(r'[^\w\s]', ' ', lowered)
+    # str.split() with no argument never produces empty strings, so no
+    # extra filtering is required.
+    return without_punct.split()
+def simple_sentence_split(text):
+    """Split ``text`` into sentences on runs of ``.``, ``!`` or ``?``.
+
+    The input is coerced with ``str()``. Each piece is stripped of
+    surrounding whitespace, and pieces that end up empty are discarded.
+    """
+    pieces = re.split(r'[.!?]+', str(text))
+    stripped = (piece.strip() for piece in pieces)
+    return [piece for piece in stripped if piece]
+def extract_text_features(text):
+    """Compute simple length statistics for one piece of text.
+
+    Returns a dict with the keys ``word_count``, ``sentence_count``,
+    ``avg_word_length`` and ``avg_sentence_length``. Each average is
+    ``None`` when there is nothing to average over (no words / no
+    sentences).
+
+    Returns ``None`` instead of a dict when ``text`` is missing
+    (``None`` or NaN) or when any exception is raised while processing it.
+    """
+    try:
+        # pd.isna() is already True for None, so one check covers both
+        # None and NaN.
+        if pd.isna(text):
+            return None
+        words = simple_word_tokenize(text)
+        sentences = simple_sentence_split(text)
+        if words:
+            avg_word_length = np.mean([len(word) for word in words])
+        else:
+            avg_word_length = None
+        if sentences:
+            avg_sentence_length = len(words) / len(sentences)
+        else:
+            avg_sentence_length = None
+        return {
+            'word_count': len(words),
+            'sentence_count': len(sentences),
+            'avg_word_length': avg_word_length,
+            'avg_sentence_length': avg_sentence_length,
+        }
+    except Exception:
+        # Best effort: a value that cannot be processed is reported as
+        # "no features" rather than propagating the error.
+        return None
 def load_data():
     """Load and prepare the data"""
     # Get the current file's directory
         df_dec = pd.read_csv(data_dir / "decision.csv")
         df_keyword = pd.read_csv(data_dir / "submission_keyword.csv")
+        # Clean the data by dropping rows with NaN values in critical columns
+        df_reviews = df_reviews.dropna(subset=['review', 'rating_int', 'confidence_int'])
+        # Extract features
+        features = df_reviews['review'].apply(extract_text_features)
+        df_features = pd.DataFrame(features.tolist())
+        df_reviews = pd.concat([df_reviews, df_features], axis=1)
+        # Drop any remaining rows with NaN values
+        df_reviews = df_reviews.dropna()
+        # Verify no NaN values remain
+        if df_reviews.isna().any().any():
+            st.warning("Some NaN values were found and those rows were dropped")
+            df_reviews = df_reviews.dropna()
         return df_reviews, df_submissions, df_dec, df_keyword
     except FileNotFoundError as e:
         st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
         st.error(f"Error details: {str(e)}")
         return None, None, None, None
+    except Exception as e:
+        st.error(f"Error processing data: {str(e)}")
+        return None, None, None, None
 def create_feature_plot(df, x_col, y_col, title):
     """Create an interactive scatter plot using plotly"""
+    # Plots df[y_col] against df[x_col] with the given title and returns
+    # the plotly figure.
+    # Rows with a missing value in either plotted column are dropped first.
+    df_plot = df.dropna(subset=[x_col, y_col])
+    # Axis labels are the column names with underscores turned into spaces
+    # and title-cased, e.g. 'word_count' -> 'Word Count'.
+    fig = px.scatter(df_plot, x=x_col, y=y_col,
                     title=title,
                     labels={x_col: x_col.replace('_', ' ').title(),
                            y_col: y_col.replace('_', ' ').title()},
+                    template="plotly_dark")
+    # Dark theme: the same dark grey for plot and paper background, with
+    # white text.
     fig.update_layout(
         title_x=0.5,
         title_font_size=20,
         showlegend=True,
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white')
     )
     return fig
 def create_correlation_heatmap(df, columns):
     """Create a correlation heatmap using plotly"""
+    # Builds the pairwise correlation matrix of df[columns] and returns it
+    # as an annotated plotly heatmap figure.
+    # Rows with a missing value in any selected column are dropped before
+    # the correlations are computed.
+    df_corr = df[columns].dropna()
+    corr = df_corr.corr()
     fig = go.Figure(data=go.Heatmap(
         z=corr,
         x=corr.columns,
         y=corr.columns,
         colorscale='RdBu',
+        # Colour range is pinned to [-1, 1].
+        zmin=-1, zmax=1,
+        # Each cell is labelled with its coefficient, two decimals.
+        text=[[f'{val:.2f}' for val in row] for row in corr.values],
+        texttemplate='%{text}',
+        textfont={"size": 12}
     ))
+    # Dark theme: the same dark grey for plot and paper background, with
+    # white text.
     fig.update_layout(
         title='Feature Correlation Heatmap',
         title_x=0.5,
         title_font_size=20,
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white')
+    )
+    return fig
+def create_regression_plot(df, x_col, y_col, title):
+    """Scatter ``y_col`` against ``x_col`` and overlay a fitted line.
+
+    A ``LinearRegression`` is fitted on the rows of ``df`` that have values
+    in both columns; its predictions for those same rows are drawn as a red
+    line on top of the scatter.
+
+    Returns a ``(figure, fitted_model)`` tuple.
+    """
+    # Rows missing either column can be neither plotted nor fitted.
+    clean = df.dropna(subset=[x_col, y_col])
+    axis_labels = {
+        x_col: x_col.replace('_', ' ').title(),
+        y_col: y_col.replace('_', ' ').title(),
+    }
+    fig = px.scatter(clean, x=x_col, y=y_col, title=title,
+                    labels=axis_labels, template="plotly_dark")
+    # The single feature column is reshaped into an (n, 1) matrix for
+    # fitting and prediction.
+    features = clean[x_col].values.reshape(-1, 1)
+    target = clean[y_col].values
+    model = LinearRegression()
+    model.fit(features, target)
+    line_trace = go.Scatter(
+        x=clean[x_col],
+        y=model.predict(features),
+        mode='lines',
+        name='Regression Line',
+        line=dict(color='red', width=2),
+    )
+    fig.add_trace(line_trace)
+    # Dark theme: the same dark grey for plot and paper background, with
+    # white text.
+    fig.update_layout(
+        title_x=0.5,
+        title_font_size=20,
+        showlegend=True,
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white'),
+    )
+    return fig, model
+def create_correlation_examples():
+    """Build a 1x3 figure illustrating three kinds of correlation.
+
+    The panels show synthetic data with a strong positive relationship
+    (y = x plus small noise), a strong negative relationship (y = -x plus
+    small noise) and two independent normal samples. Each panel has 100
+    points and the data is the same on every call.
+
+    Returns the plotly figure.
+    """
+    # A private RandomState seeded with 42 yields the same numbers as
+    # np.random.seed(42) followed by np.random.normal(...), but leaves
+    # NumPy's global random state untouched, so other code that relies on
+    # the global generator is not silently reseeded by this function.
+    rng = np.random.RandomState(42)
+    n_points = 100
+    # Perfect positive correlation
+    x1 = np.linspace(0, 10, n_points)
+    y1 = x1 + rng.normal(0, 0.1, n_points)
+    # Perfect negative correlation
+    x2 = np.linspace(0, 10, n_points)
+    y2 = -x2 + rng.normal(0, 0.1, n_points)
+    # Low correlation
+    x3 = rng.normal(5, 2, n_points)
+    y3 = rng.normal(5, 2, n_points)
+    # Create subplots
+    fig = make_subplots(rows=1, cols=3,
+                       subplot_titles=('Perfect Positive Correlation (r ≈ 1)',
+                                     'Perfect Negative Correlation (r ≈ -1)',
+                                     'Low Correlation (r ≈ 0)'))
+    # One marker trace per panel, left to right.
+    panels = (
+        (x1, y1, 'r ≈ 1'),
+        (x2, y2, 'r ≈ -1'),
+        (x3, y3, 'r ≈ 0'),
+    )
+    for col, (xs, ys, label) in enumerate(panels, start=1):
+        fig.add_trace(go.Scatter(x=xs, y=ys, mode='markers', name=label),
+                     row=1, col=col)
+    # Update layout
+    fig.update_layout(
+        height=400,
+        showlegend=False,
+        template="plotly_dark",
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white', size=14),
+        title=dict(
+            text='Examples of Different Correlation Types',
+            x=0.5,
+            y=0.95,
+            font=dict(size=20)
+        )
     )
+    # Update axes
+    for i in range(1, 4):
+        fig.update_xaxes(title_text='X', row=1, col=i)
+        fig.update_yaxes(title_text='Y', row=1, col=i)
     return fig
 def show():
     2. **Correlation Analysis (相关性分析):**
        - Definition: Statistical measure that shows how strongly two variables are related
        - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
+    """)
+    # Add correlation examples
+    st.write("Here are examples of different correlation types:")
+    corr_examples = create_correlation_examples()
+    st.plotly_chart(corr_examples, use_container_width=True)
+    # Show example code for correlation analysis
+    with st.expander("Example Code: Correlation Analysis"):
+        st.code("""
+# Example: Calculating and visualizing correlations
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+# Generate example data
+np.random.seed(42)
+n_points = 100
+# Perfect positive correlation
+x1 = np.linspace(0, 10, n_points)
+y1 = x1 + np.random.normal(0, 0.1, n_points)
+# Perfect negative correlation
+x2 = np.linspace(0, 10, n_points)
+y2 = -x2 + np.random.normal(0, 0.1, n_points)
+# Low correlation
+x3 = np.random.normal(5, 2, n_points)
+y3 = np.random.normal(5, 2, n_points)
+# Calculate correlations
+corr1 = np.corrcoef(x1, y1)[0,1]  # Should be close to 1
+corr2 = np.corrcoef(x2, y2)[0,1]  # Should be close to -1
+corr3 = np.corrcoef(x3, y3)[0,1]  # Should be close to 0
+print(f"Correlation 1: {corr1:.3f}")
+print(f"Correlation 2: {corr2:.3f}")
+print(f"Correlation 3: {corr3:.3f}")
+        """)
+    st.write("""
     3. **Reading Linear Regression Output (解读线性回归结果):**
        - R-squared (R²): Proportion of variance explained by the model (0-1)
        - p-value: Probability that the observed relationship occurred by chance
     """)
     # Load the data
+    df_reviews, df_submissions, df_dec, df_keyword = load_data()
+    if df_reviews is not None:
+        try:
+            # Module 1: Data Exploration
+            st.header("Module 1: Data Exploration")
+            st.write("Let's explore our dataset to understand the review patterns:")
+            # Show example code for data loading and cleaning
+            with st.expander("Example Code: Data Loading and Cleaning"):
+                st.code("""
+# Load and clean the data
+import pandas as pd
+import numpy as np
+def load_and_clean_data():
+    # Load datasets
+    df_reviews = pd.read_csv('reviews.csv')
+    df_submissions = pd.read_csv('Submissions.csv')
+    df_dec = pd.read_csv('decision.csv')
+    df_keyword = pd.read_csv('submission_keyword.csv')
+    # Clean reviews data
+    df_reviews = df_reviews.dropna(subset=['review', 'rating_int', 'confidence_int'])
+    # Extract text features
+    def extract_text_features(text):
+        if pd.isna(text) or text is None:
+            return {
+                'word_count': 0,
+                'sentence_count': 0,
+                'avg_word_length': 0,
+                'avg_sentence_length': 0
+            }
+        # Convert to string and clean
+        text = str(text).lower()
+        text = re.sub(r'[^\\w\\s]', ' ', text)
+        # Split into words and sentences
+        words = [word for word in text.split() if word]
+        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
+        return {
+            'word_count': len(words),
+            'sentence_count': len(sentences),
+            'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
+            'avg_sentence_length': len(words) / len(sentences) if sentences else 0
+        }
+    # Apply feature extraction
+    features = df_reviews['review'].apply(extract_text_features)
+    df_features = pd.DataFrame(features.tolist())
+    df_reviews = pd.concat([df_reviews, df_features], axis=1)
+    # Fill any remaining NaN values
+    df_reviews = df_reviews.fillna(0)
+    return df_reviews, df_submissions, df_dec, df_keyword
+                """)
+            # Verify data quality
+            st.subheader("Data Quality Check")
+            missing_data = df_reviews.isna().sum()
+            if missing_data.any():
+                st.warning("Missing values found in the dataset:")
+                st.write(missing_data[missing_data > 0])
+            # Show basic statistics
+            col1, col2 = st.columns(2)
             with col1:
+                st.metric("Total Reviews", len(df_reviews))
+                st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
             with col2:
+                st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
+                st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
+            # Interactive feature selection
+            st.subheader("Interactive Feature Analysis")
+            feature_cols = ['word_count', 'sentence_count', 'avg_word_length',
+                          'avg_sentence_length', 'rating_int', 'confidence_int']
+            col1, col2 = st.columns(2)
+            with col1:
+                x_feature = st.selectbox("Select X-axis feature:", feature_cols)
+            with col2:
+                y_feature = st.selectbox("Select Y-axis feature:", feature_cols)
+            # Create interactive plot
+            fig = create_feature_plot(df_reviews, x_feature, y_feature,
+                                   f'{x_feature.replace("_", " ").title()} vs {y_feature.replace("_", " ").title()}')
+            st.plotly_chart(fig, use_container_width=True)
+            # Show correlation between selected features
+            corr = df_reviews[[x_feature, y_feature]].corr().iloc[0,1]
+            st.write(f"Correlation between {x_feature} and {y_feature}: {corr:.3f}")
+            # Distribution plots
+            st.subheader("Distribution of Ratings and Confidence")
+            col1, col2 = st.columns(2)
+            with col1:
+                fig = px.histogram(df_reviews.dropna(subset=['rating_int']),
+                                 x='rating_int',
+                                 title='Distribution of Ratings',
+                                 template="plotly_dark")
+                st.plotly_chart(fig, use_container_width=True)
+            with col2:
+                fig = px.histogram(df_reviews.dropna(subset=['confidence_int']),
+                                 x='confidence_int',
+                                 title='Distribution of Confidence',
+                                 template="plotly_dark")
+                st.plotly_chart(fig, use_container_width=True)
+            # Show example code for distribution analysis
+            with st.expander("Example Code: Distribution Analysis"):
+                st.code("""
+# Analyze distributions of numerical features
+import plotly.express as px
+def analyze_distributions(df):
+    # Create histograms for key features
+    fig1 = px.histogram(df, x='rating_int',
+                       title='Distribution of Ratings',
+                       template="plotly_dark")
+    fig2 = px.histogram(df, x='confidence_int',
+                       title='Distribution of Confidence',
+                       template="plotly_dark")
+    # Calculate summary statistics
+    stats = df[['rating_int', 'confidence_int']].describe()
+    return fig1, fig2, stats
+# Usage
+fig1, fig2, stats = analyze_distributions(df_reviews)
+print(stats)
+                """)
+            # Text feature distributions
+            st.subheader("Text Feature Distributions")
+            col1, col2 = st.columns(2)
+            with col1:
+                fig = px.histogram(df_reviews.dropna(subset=['avg_word_length']),
+                                 x='avg_word_length',
+                                 title='Average Word Length Distribution',
+                                 template="plotly_dark")
+                st.plotly_chart(fig, use_container_width=True)
+            with col2:
+                fig = px.histogram(df_reviews.dropna(subset=['avg_sentence_length']),
+                                 x='avg_sentence_length',
+                                 title='Average Sentence Length Distribution',
+                                 template="plotly_dark")
+                st.plotly_chart(fig, use_container_width=True)
+            # Correlation analysis
+            st.subheader("Feature Correlations")
+            corr_fig = create_correlation_heatmap(df_reviews, feature_cols)
+            st.plotly_chart(corr_fig, use_container_width=True)
+            # Show example code for correlation analysis
+            with st.expander("Example Code: Correlation Analysis"):
+                st.code("""
+# Analyze correlations between features
+import plotly.graph_objects as go
+def analyze_correlations(df, columns):
+    # Calculate correlation matrix
+    corr = df[columns].corr()
+    # Create heatmap
+    fig = go.Figure(data=go.Heatmap(
+        z=corr,
+        x=corr.columns,
+        y=corr.columns,
+        colorscale='RdBu',
+        zmin=-1, zmax=1,
+        text=[[f'{val:.2f}' for val in row] for row in corr.values],
+        texttemplate='%{text}',
+        textfont={"size": 12}
+    ))
+    fig.update_layout(
+        title='Feature Correlation Heatmap',
+        template="plotly_dark"
+    )
+    return fig, corr
+# Usage
+fig, corr_matrix = analyze_correlations(df_reviews, feature_cols)
+print(corr_matrix)
+                """)
+            # Module 2: Simple Linear Regression
+            st.header("Module 2: Simple Linear Regression")
             st.write("""
+            Let's explore the relationship between review length and rating using simple linear regression.
             """)
+            # Interactive feature selection for regression
+            st.subheader("Interactive Regression Analysis")
+            col1, col2 = st.columns(2)
+            with col1:
+                x_reg = st.selectbox("Select feature for X-axis:", feature_cols)
+            with col2:
+                y_reg = st.selectbox("Select target variable:", feature_cols)
+            # Create regression plot
+            fig, model = create_regression_plot(df_reviews, x_reg, y_reg,
+                                             f'{x_reg.replace("_", " ").title()} vs {y_reg.replace("_", " ").title()}')
+            st.plotly_chart(fig, use_container_width=True)
+            # Show regression metrics
+            st.subheader("Regression Metrics")
+            col1, col2 = st.columns(2)
+            with col1:
+                r2_score = model.score(df_reviews[[x_reg]].dropna(),
+                                     df_reviews[y_reg].dropna())
+                st.metric("R-squared", f"{r2_score:.3f}")
+            with col2:
+                st.metric("Slope", f"{model.coef_[0]:.3f}")
+            # Show example code for simple linear regression
+            with st.expander("Example Code: Simple Linear Regression"):
+                st.code('''
+# Perform simple linear regression
+from sklearn.linear_model import LinearRegression
+import plotly.graph_objects as go
+def simple_linear_regression(df, x_col, y_col, title=None):
+    """
+    Perform simple linear regression on any DataFrame.
+    Parameters:
+    -----------
+    df : pandas.DataFrame
+        Input DataFrame containing the features
+    x_col : str
+        Name of the column to use as independent variable
+    y_col : str
+        Name of the column to use as dependent variable
+    title : str, optional
+        Title for the plot. If None, will use column names
+    Returns:
+    --------
+    tuple
+        (model, r2_score, fig) where:
+        - model is the fitted LinearRegression object
+        - r2_score is the R-squared value
+        - fig is the plotly figure object
+    """
+    # Handle missing values by dropping them
+    df_clean = df.dropna(subset=[x_col, y_col])
+    if len(df_clean) == 0:
+        raise ValueError("No valid data points after removing missing values")
+    # Prepare data
+    X = df_clean[[x_col]]
+    y = df_clean[y_col]
+    # Fit model
+    model = LinearRegression()
+    model.fit(X, y)
+    # Calculate R-squared
+    r2_score = model.score(X, y)
+    # Create visualization
+    fig = go.Figure()
+    # Add scatter plot
+    fig.add_trace(go.Scatter(
+        x=X[x_col],
+        y=y,
+        mode='markers',
+        name='Data Points',
+        marker=dict(size=8, opacity=0.6)
+    ))
+    # Add regression line
+    x_range = np.linspace(X[x_col].min(), X[x_col].max(), 100)
+    y_pred = model.predict(x_range.reshape(-1, 1))
+    fig.add_trace(go.Scatter(
+        x=x_range,
+        y=y_pred,
+        mode='lines',
+        name='Regression Line',
+        line=dict(color='red', width=2)
+    ))
+    # Update layout
+    title = title or f'{x_col} vs {y_col}'
+    fig.update_layout(
+        title=title,
+        xaxis_title=x_col,
+        yaxis_title=y_col,
+        template="plotly_dark",
+        showlegend=True
+    )
+    return model, r2_score, fig
+# Usage
+model, r2, fig = simple_linear_regression(df_reviews, 'word_count', 'rating_int')
+print(f"R-squared: {r2:.3f}")
+print(f"Slope: {model.coef_[0]:.3f}")
+''')
+            # Module 3: Multiple Linear Regression
+            st.header("Module 3: Multiple Linear Regression")
+            st.write("""
+            Now let's build a more complex model using multiple features to predict ratings.
             """)
+            try:
+                # Prepare data for modeling
+                feature_cols = ['word_count', 'sentence_count',
+                              'avg_word_length', 'avg_sentence_length',
+                              'confidence_int']
+                # Interactive feature selection for multiple regression
+                st.subheader("Select Features for Multiple Regression")
+                selected_features = st.multiselect(
+                    "Choose features to include in the model:",
+                    feature_cols,
+                    default=feature_cols
+                )
+                if selected_features:
+                    # Ensure no NaN values in features
+                    df_model = df_reviews.dropna(subset=selected_features + ['rating_int'])
+                    X = df_model[selected_features]
+                    y = df_model['rating_int']
+                    # Split data
+                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+                    # Fit regression model
+                    model = LinearRegression()
+                    model.fit(X_train, y_train)
+                    # Create 3D visualization if exactly 2 features are selected
+                    if len(selected_features) == 2:
+                        st.subheader("3D Visualization of Selected Features")
+                        fig = px.scatter_3d(df_model.sample(min(1000, len(df_model))),
+                                        x=selected_features[0],
+                                        y=selected_features[1],
+                                        z='rating_int',
+                                        title='Review Features in 3D Space',
+                                        template="plotly_dark")
+                        fig.update_layout(
+                            title_x=0.5,
+                            title_font_size=20,
+                            scene = dict(
+                                xaxis_title=selected_features[0].replace('_', ' ').title(),
+                                yaxis_title=selected_features[1].replace('_', ' ').title(),
+                                zaxis_title='Rating'
+                            )
+                        )
+                        st.plotly_chart(fig, use_container_width=True)
+                    # Show model metrics
+                    st.subheader("Model Performance")
+                    col1, col2, col3 = st.columns(3)
+                    with col1:
+                        st.metric("Training R²", f"{model.score(X_train, y_train):.3f}")
+                    with col2:
+                        st.metric("Testing R²", f"{model.score(X_test, y_test):.3f}")
+                    with col3:
+                        st.metric("RMSE", f"{np.sqrt(mean_squared_error(y_test, model.predict(X_test))):.3f}")
+                    # Show coefficients
+                    st.subheader("Model Coefficients")
+                    coef_df = pd.DataFrame({
+                        'Feature': X.columns,
+                        'Coefficient': model.coef_
+                    })
+                    st.dataframe(coef_df)
+                    # Show example code for multiple linear regression
+                    with st.expander("Example Code: Multiple Linear Regression"):
+                        st.code('''
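+# Perform multiple linear regression
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.model_selection import train_test_split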
+def multiple_linear_regression(df, feature_cols, target_col, test_size=0.2, random_state=42):
+    """
+    Perform multiple linear regression on any DataFrame.
+    Parameters:
+    -----------
+    df : pandas.DataFrame
+        Input DataFrame containing the features
+    feature_cols : list of str
+        Names of the columns to use as independent variables
+    target_col : str
+        Name of the column to use as dependent variable
+    test_size : float, optional
+        Proportion of data to use for testing
+    random_state : int, optional
+        Random seed for reproducibility
+    Returns:
+    --------
+    tuple
+        (model, metrics, coef_df, fig) where:
+        - model is the fitted LinearRegression object
+        - metrics is a dictionary of performance metrics
+        - coef_df is a DataFrame of feature coefficients
+        - fig is the plotly figure object (if 2 features selected)
+    """
+    # Handle missing values by dropping them
+    df_clean = df.dropna(subset=feature_cols + [target_col])
+    if len(df_clean) == 0:
+        raise ValueError("No valid data points after removing missing values")
+    # Prepare data
+    X = df_clean[feature_cols]
+    y = df_clean[target_col]
+    # Split data
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state)
+    # Fit model
+    model = LinearRegression()
+    model.fit(X_train, y_train)
+    # Make predictions
+    y_train_pred = model.predict(X_train)
+    y_test_pred = model.predict(X_test)
+    # Calculate metrics
+    metrics = {
+        'train_r2': r2_score(y_train, y_train_pred),
+        'test_r2': r2_score(y_test, y_test_pred),
+        'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
+        'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred))
+    }
+    # Create coefficient DataFrame
+    coef_df = pd.DataFrame({
+        'Feature': feature_cols,
+        'Coefficient': model.coef_,
+        'Absolute_Impact': np.abs(model.coef_)
+    }).sort_values('Absolute_Impact', ascending=False)
+    # Create visualization if exactly 2 features are selected
+    fig = None
+    if len(feature_cols) == 2:
+        fig = px.scatter_3d(
+            df_clean.sample(min(1000, len(df_clean))),
+            x=feature_cols[0],
+            y=feature_cols[1],
+            z=target_col,
+            title=f'Relationship between {feature_cols[0]}, {feature_cols[1]}, and {target_col}',
+            template="plotly_dark"
+        )
+        # Add regression plane
+        x_range = np.linspace(df_clean[feature_cols[0]].min(), df_clean[feature_cols[0]].max(), 20)
+        y_range = np.linspace(df_clean[feature_cols[1]].min(), df_clean[feature_cols[1]].max(), 20)
+        x_grid, y_grid = np.meshgrid(x_range, y_range)
+        z_grid = (model.intercept_ +
+                 model.coef_[0] * x_grid +
+                 model.coef_[1] * y_grid)
+        fig.add_trace(go.Surface(
+            x=x_grid,
+            y=y_grid,
+            z=z_grid,
+            opacity=0.5,
+            showscale=False
+        ))
+    return model, metrics, coef_df, fig
+# Usage
+model, metrics, coef_df, fig = multiple_linear_regression(
+    df_reviews,
+    ['word_count', 'sentence_count', 'confidence_int'],
+    'rating_int'
+)
+print(f"Training R²: {metrics['train_r2']:.3f}")
+print(f"Testing R²: {metrics['test_r2']:.3f}")
+print(f"RMSE: {metrics['test_rmse']:.3f}")
+print(coef_df)
+''')
+            except Exception as e:
+                st.error(f"Error in model training: {str(e)}")
+                st.write("Please check the data quality and try again.")
+        except Exception as e:
+            st.error(f"Error in data processing: {str(e)}")
+            st.write("Please check the data format and try again.")
+    # Practice Exercises
+    st.header("Practice Exercises")
+    # Add new section for writing prompts
+    st.subheader("Writing Prompts for Analyzing Linear Regression Results")
+    st.write("""
+    Use these prompts to help you interpret and write about your linear regression results:
+    1. **Model Fit and R-squared:**
+       - "The model explains [R² value]% of the variance in [dependent variable], suggesting [strong/moderate/weak] predictive power."
+       - "With an R-squared of [value], we can conclude that [interpretation of model fit]."
+       - "The relatively [high/low] R-squared value indicates that [interpretation of model's explanatory power]."
+    2. **Statistical Significance and p-values:**
+       - "The p-value of [value] for [feature] suggests that this relationship is [statistically significant/not significant]."
+       - "Given the p-value of [value], we [can/cannot] reject the null hypothesis that [interpretation]."
+       - "The statistical significance (p = [value]) indicates that [interpretation of relationship]."
+    3. **Coefficients and Their Meaning:**
+       - "For each unit increase in [independent variable], [dependent variable] [increases/decreases] by [coefficient value] units."
+       - "The coefficient of [value] for [feature] suggests that [interpretation of relationship]."
+       - "The [positive/negative] coefficient indicates that [interpretation of direction of relationship]."
+    4. **Uncertainty and Standard Errors:**
+       - "The standard error of [value] for [feature] indicates [interpretation of precision]."
+       - "The relatively [small/large] standard error suggests that [interpretation of estimate reliability]."
+       - "The uncertainty in our coefficient estimates, as shown by the standard errors, [interpretation of confidence in results]."
+    5. **Confidence Intervals:**
+       - "We are 95% confident that the true coefficient for [feature] lies between [lower bound] and [upper bound]."
+       - "The confidence interval [includes/does not include] zero, suggesting that [interpretation of significance]."
+       - "The [narrow/wide] confidence interval indicates [interpretation of precision]."
+    6. **Practical Significance:**
+       - "While the relationship is statistically significant, the effect size of [value] suggests [interpretation of practical importance]."
+       - "The coefficient of [value] indicates that [interpretation of real-world impact]."
+       - "In practical terms, this means that [interpretation of practical implications]."
+    7. **Model Limitations:**
+       - "The model's assumptions of [assumptions] may not hold in this case because [explanation]."
+       - "Potential limitations of our analysis include [list limitations]."
+       - "We should be cautious in interpreting these results because [explanation of limitations]."
+    8. **Recommendations:**
+       - "Based on our analysis, we recommend [specific action] because [explanation]."
+       - "The results suggest that [interpretation] and therefore [recommendation]."
+       - "To improve the model, we could [suggestions for improvement]."
+    """)
+    with st.expander("Exercise 1: Simple Linear Regression"):
+        st.write("""
+        1. Create a function that performs simple linear regression on any DataFrame
+        2. The function should:
+           - Take a DataFrame and column names as input
+           - Handle missing values appropriately
+           - Calculate and return R-squared value
+           - Create a visualization of the relationship
+        3. Test your function with different features from the dataset
+        """)
+        st.code('''
+# Solution: Generic Simple Linear Regression Function
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LinearRegression
+import plotly.express as px
+import plotly.graph_objects as go
+def simple_linear_regression(df, x_col, y_col, title=None):
+    """
+    Perform simple linear regression on any DataFrame.
+    Parameters:
+    -----------
+    df : pandas.DataFrame
+        Input DataFrame containing the features
+    x_col : str
+        Name of the column to use as independent variable
+    y_col : str
+        Name of the column to use as dependent variable
+    title : str, optional
+        Title for the plot. If None, will use column names
+    Returns:
+    --------
+    tuple
+        (model, r2_score, fig) where:
+        - model is the fitted LinearRegression object
+        - r2_score is the R-squared value
+        - fig is the plotly figure object
+    """
+    # Handle missing values by dropping them
+    df_clean = df.dropna(subset=[x_col, y_col])
+    if len(df_clean) == 0:
+        raise ValueError("No valid data points after removing missing values")
+    # Prepare data
+    X = df_clean[[x_col]]
+    y = df_clean[y_col]
+    # Fit model
+    model = LinearRegression()
+    model.fit(X, y)
+    # Calculate R-squared
+    r2_score = model.score(X, y)
+    # Create visualization
+    fig = go.Figure()
+    # Add scatter plot
+    fig.add_trace(go.Scatter(
+        x=X[x_col],
+        y=y,
+        mode='markers',
+        name='Data Points',
+        marker=dict(size=8, opacity=0.6)
+    ))
+    # Add regression line
+    x_range = np.linspace(X[x_col].min(), X[x_col].max(), 100)
+    y_pred = model.predict(x_range.reshape(-1, 1))
+    fig.add_trace(go.Scatter(
+        x=x_range,
+        y=y_pred,
+        mode='lines',
+        name='Regression Line',
+        line=dict(color='red', width=2)
+    ))
+    # Update layout
+    title = title or f'{x_col} vs {y_col}'
+    fig.update_layout(
+        title=title,
+        xaxis_title=x_col,
+        yaxis_title=y_col,
+        template="plotly_dark",
+        showlegend=True
+    )
+    return model, r2_score, fig
+# Example usage:
+# Load your data
+df = pd.read_csv('your_data.csv')
+# Try different feature pairs
+feature_pairs = [
+    ('word_count', 'rating_int'),
+    ('confidence_int', 'rating_int'),
+    ('avg_word_length', 'rating_int')
+]
+# Analyze each pair
+for x_col, y_col in feature_pairs:
+    try:
+        model, r2, fig = simple_linear_regression(df, x_col, y_col)
+        print(f"\nAnalysis of {x_col} vs {y_col}:")
+        print(f"R-squared: {r2:.3f}")
+        print(f"Slope: {model.coef_[0]:.3f}")
+        print(f"Intercept: {model.intercept_:.3f}")
+        fig.show()
+    except Exception as e:
+        print(f"Error analyzing {x_col} vs {y_col}: {str(e)}")
+''')
+    with st.expander("Exercise 2: Multiple Linear Regression"):
+        st.write("""
+        1. Create a function that performs multiple linear regression on any DataFrame
+        2. The function should:
+           - Take a DataFrame and lists of feature columns as input
+           - Handle missing values appropriately
+           - Split data into training and test sets
+           - Calculate and return performance metrics
+           - Create visualizations of the results
+        3. Test your function with different combinations of features
+        """)
+        st.code('''
+# Solution: Generic Multiple Linear Regression Function
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+import plotly.express as px
+import plotly.graph_objects as go
+def multiple_linear_regression(df, feature_cols, target_col, test_size=0.2, random_state=42):
+    """
+    Perform multiple linear regression on any DataFrame.
+    Parameters:
+    -----------
+    df : pandas.DataFrame
+        Input DataFrame containing the features
+    feature_cols : list of str
+        Names of the columns to use as independent variables
+    target_col : str
+        Name of the column to use as dependent variable
+    test_size : float, optional
+        Proportion of data to use for testing
+    random_state : int, optional
+        Random seed for reproducibility
+    Returns:
+    --------
+    tuple
+        (model, metrics, coef_df, fig) where:
+        - model is the fitted LinearRegression object
+        - metrics is a dictionary of performance metrics
+        - coef_df is a DataFrame of feature coefficients
+        - fig is the plotly figure object (if 2 features selected)
+    """
+    # Handle missing values by dropping them
+    df_clean = df.dropna(subset=feature_cols + [target_col])
+    if len(df_clean) == 0:
+        raise ValueError("No valid data points after removing missing values")
+    # Prepare data
+    X = df_clean[feature_cols]
+    y = df_clean[target_col]
+    # Split data
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state)
+    # Fit model
+    model = LinearRegression()
+    model.fit(X_train, y_train)
+    # Make predictions
+    y_train_pred = model.predict(X_train)
+    y_test_pred = model.predict(X_test)
+    # Calculate metrics
+    metrics = {
+        'train_r2': r2_score(y_train, y_train_pred),
+        'test_r2': r2_score(y_test, y_test_pred),
+        'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
+        'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred))
+    }
+    # Create coefficient DataFrame
+    coef_df = pd.DataFrame({
+        'Feature': feature_cols,
+        'Coefficient': model.coef_,
+        'Absolute_Impact': np.abs(model.coef_)
+    }).sort_values('Absolute_Impact', ascending=False)
+    # Create visualization if exactly 2 features are selected
+    fig = None
+    if len(feature_cols) == 2:
+        fig = px.scatter_3d(
+            df_clean.sample(min(1000, len(df_clean))),
+            x=feature_cols[0],
+            y=feature_cols[1],
+            z=target_col,
+            title=f'Relationship between {feature_cols[0]}, {feature_cols[1]}, and {target_col}',
+            template="plotly_dark"
+        )
+        # Add regression plane
+        x_range = np.linspace(df_clean[feature_cols[0]].min(), df_clean[feature_cols[0]].max(), 20)
+        y_range = np.linspace(df_clean[feature_cols[1]].min(), df_clean[feature_cols[1]].max(), 20)
+        x_grid, y_grid = np.meshgrid(x_range, y_range)
+        z_grid = (model.intercept_ +
+                 model.coef_[0] * x_grid +
+                 model.coef_[1] * y_grid)
+        fig.add_trace(go.Surface(
+            x=x_grid,
+            y=y_grid,
+            z=z_grid,
+            opacity=0.5,
+            showscale=False
+        ))
+    return model, metrics, coef_df, fig
+# Example usage:
+# Load your data
+df = pd.read_csv('your_data.csv')
+# Define feature sets to try
+feature_sets = [
+    ['word_count', 'confidence_int'],
+    ['word_count', 'sentence_count', 'confidence_int'],
+    ['word_count', 'sentence_count', 'avg_word_length', 'avg_sentence_length', 'confidence_int']
+]
+# Analyze each feature set
+for features in feature_sets:
+    try:
+        print(f"\nAnalyzing features: {features}")
+        model, metrics, coef_df, fig = multiple_linear_regression(
+            df, features, 'rating_int')
+        # Print metrics
+        print("\nPerformance Metrics:")
+        for metric, value in metrics.items():
+            print(f"{metric}: {value:.3f}")
+        # Print coefficients
+        print("\nFeature Coefficients:")
+        print(coef_df)
+        # Show visualization if available
+        if fig is not None:
+            fig.show()
     except Exception as e:
+        print(f"Error analyzing features {features}: {str(e)}")
+''')
+    # Weekly Assignment
+    username = st.session_state.get("username", "Student")
+    st.header(f"{username}'s Weekly Assignment")
+    if username == "manxiii":
+        st.markdown("""
+        Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
+        1. Complete the feature engineering pipeline for the ICLR dataset
+        2. Build both simple and multiple linear regression models
+        3. Compare model performance and interpret results
+        4. Submit your findings in a Jupyter notebook
+        **Due Date:** End of Week 5
+        """)
+    elif username == "zhu":
+        st.markdown("""
+        Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
+        1. Implement the complete machine learning workflow
+        2. Create insightful visualizations of model results
+        3. Draw conclusions from your analysis
+        4. Submit your work in a Jupyter notebook
+        **Due Date:** End of Week 5
+        """)
+    elif username == "WK":
+        st.markdown("""
+        Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
+        1. Complete the feature engineering pipeline
+        2. Build and evaluate linear regression models
+        3. Analyze patterns in the data
+        4. Submit your findings
+        **Due Date:** End of Week 5
+        """)
+    else:
+        st.markdown(f"""
+        Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
+        1. Complete the feature engineering pipeline
+        2. Build and evaluate linear regression models
+        3. Analyze patterns in the data
+        4. Submit your findings
+        **Due Date:** End of Week 5
+        """)