Spaces:

raymondEDS
/

DS_webclass

Sleeping

File size: 13,454 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from nltk.tokenize import word_tokenize
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import os

# Set up the style for all plots.
# These calls run at module level, so they take effect when this file is
# executed or imported. The matplotlib style is selected first and the seaborn
# theme second; keep this order so the seaborn theme is the last one applied.
plt.style.use('default')
sns.set_theme(style="whitegrid", palette="husl")

def load_data():
    """Read the four course CSV files into pandas DataFrames.

    The files are looked up in a ``Data`` directory located two levels above
    the directory that contains this file.

    Returns:
        A 4-tuple ``(df_reviews, df_submissions, df_dec, df_keyword)``. When a
        file cannot be found, two Streamlit error messages are displayed and a
        tuple of four ``None`` values is returned instead.
    """
    # <this file's dir>/../../Data
    data_dir = Path(__file__).parent.parent.parent / "Data"

    # Read order matches the order of the returned tuple.
    file_names = (
        "reviews.csv",
        "Submissions.csv",
        "decision.csv",
        "submission_keyword.csv",
    )

    try:
        frames = tuple(pd.read_csv(data_dir / name) for name in file_names)
    except FileNotFoundError as e:
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {str(e)}")
        return None, None, None, None
    else:
        return frames

def create_feature_plot(df, x_col, y_col, title):
    """Build a Plotly scatter plot of ``y_col`` against ``x_col``.

    Args:
        df: Data passed to ``px.scatter``.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        title: Figure title (centred horizontally).

    Returns:
        The figure returned by ``px.scatter`` after the layout is updated.
    """
    def _humanize(column_name):
        # "word_count" -> "Word Count"
        return column_name.replace('_', ' ').title()

    axis_labels = {x_col: _humanize(x_col), y_col: _humanize(y_col)}

    scatter = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels=axis_labels,
        template="plotly_white",
    )

    layout_options = {
        'title_x': 0.5,
        'title_font_size': 20,
        'showlegend': True,
        'plot_bgcolor': 'white',
        'paper_bgcolor': 'white',
    }
    scatter.update_layout(**layout_options)
    return scatter

def create_correlation_heatmap(df, columns):
    """Build a Plotly heatmap of the pairwise correlations of ``columns``.

    Args:
        df: DataFrame holding the data.
        columns: Column names to select from ``df`` before calling ``corr()``.

    Returns:
        A ``go.Figure`` containing one heatmap trace whose colour scale is
        pinned to the range [-1, 1].
    """
    corr_matrix = df[columns].corr()
    axis_names = corr_matrix.columns

    heatmap_trace = go.Heatmap(
        z=corr_matrix,
        x=axis_names,
        y=axis_names,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
    )

    heatmap = go.Figure(data=heatmap_trace)
    heatmap.update_layout(
        title='Feature Correlation Heatmap',
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    return heatmap

def _count_sentences(text):
    """Return the number of non-empty, period-delimited segments in *text*.

    Empty segments are ignored, so the piece after a trailing period (or
    between consecutive periods) is not counted as a sentence.
    """
    return sum(1 for segment in str(text).split('.') if segment.strip())


def show():
    """Render the "Week 5" Streamlit page.

    The page consists of static course text, a data-exploration module, an
    interactive feature-extraction widget, a linear-regression module fitted
    on the reviews data, two practice exercises and a per-user assignment.

    If ``load_data()`` reports that the data files are missing (it returns
    ``None`` values after displaying its own error messages), only the
    introductory text is rendered. Any other exception raised while building
    the data-dependent sections is shown to the user via ``st.error``.
    """
    st.title("Week 5: Introduction to Machine Learning and Linear Regression")
    
    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this week, we'll explore machine learning through a fascinating real-world challenge: The Academic Publishing Crisis.
    
    Imagine you're the program chair for a prestigious AI conference. You've just received 5,000 paper submissions, and you need to:
    - Decide which papers to accept (only 20% can be accepted)
    - Ensure fair and consistent reviews
    - Understand what makes reviewers confident in their assessments
    
    The Problem: Human reviewers are inconsistent. Some are harsh, others lenient. Some write detailed reviews, others just a few sentences. 
    How can we use data to understand and improve this process?
    
    **Your Mission: Build a machine learning system to analyze review patterns and predict paper acceptance!**
    """)
    
    # Learning Path
    st.subheader("Key Concepts You'll Learn")
    st.write("""
    1. **Linear Regression (线性回归):**
       - Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
       - Real-world example: Predicting house prices based on size and location
    
    2. **Correlation Analysis (相关性分析):**
       - Definition: Statistical measure that shows how strongly two variables are related
       - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
    
    3. **Reading Linear Regression Output (解读线性回归结果):**
       - R-squared (R²): Proportion of variance explained by the model (0-1)
       - p-value: Probability that the observed relationship occurred by chance
       - Coefficients (系数): How much the dependent variable changes with a one-unit change in the independent variable
       - Standard errors: Uncertainty in coefficient estimates
       - Confidence intervals: Range where true coefficient likely lies
    """)

    # Load the data
    try:
        df_reviews, df_submissions, df_dec, df_keyword = load_data()

        # load_data() returns None values (after showing its own error
        # messages) when the CSV files are missing; stop here instead of
        # failing later with an unrelated "'NoneType' is not subscriptable".
        if df_reviews is None:
            return
        
        # Module 1: Data Exploration
        st.header("Module 1: Data Exploration")
        st.write("Let's explore our dataset to understand the review patterns:")
        
        # Create features from review text
        df_reviews['word_count'] = df_reviews['review'].apply(lambda x: len(str(x).split()))
        df_reviews['sentence_count'] = df_reviews['review'].apply(_count_sentences)
        
        # Show basic statistics
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Total Reviews", len(df_reviews))
            st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
        with col2:
            st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
            st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
        
        # Create interactive visualizations
        st.subheader("Review Length vs Rating")
        fig = create_feature_plot(df_reviews, 'word_count', 'rating_int', 
                                'Relationship between Review Length and Rating')
        st.plotly_chart(fig, use_container_width=True)
        
        # Correlation analysis
        st.subheader("Feature Correlations")
        corr_fig = create_correlation_heatmap(df_reviews, 
                                            ['word_count', 'rating_int', 'confidence_int'])
        st.plotly_chart(corr_fig, use_container_width=True)
        
        # Module 2: Feature Engineering
        st.header("Module 2: Feature Engineering")
        st.write("""
        Let's create more sophisticated features from our review data:
        - Review length (word count)
        - Review rating
        - Reviewer confidence
        - Number of keywords in the paper
        """)
        
        # Interactive Feature Engineering
        st.subheader("Try Feature Engineering")
        review_text = st.text_area(
            "Enter a review to analyze:",
            "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
            key="review_text"
        )
        
        if st.button("Extract Features"):
            # Calculate features. word_tokenize needs NLTK's tokenizer data;
            # when that resource is not installed NLTK raises LookupError, so
            # fall back to the whitespace split used for the dataset above.
            try:
                word_count = len(word_tokenize(review_text))
            except LookupError:
                word_count = len(review_text.split())
            sentence_count = _count_sentences(review_text)

            # Empty input (or input made only of periods) has no sentences;
            # avoid dividing by zero in that case.
            if sentence_count:
                words_per_sentence = word_count / sentence_count
            else:
                words_per_sentence = 0.0
            
            # Create a nice display of features
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Word Count", word_count)
            with col2:
                st.metric("Sentence Count", sentence_count)
            with col3:
                st.metric("Average Words per Sentence", f"{words_per_sentence:.1f}")
        
        # Module 3: Linear Regression Analysis
        st.header("Module 3: Linear Regression Analysis")
        st.write("""
        Let's build a linear regression model to predict paper ratings based on review features.
        """)
        
        # Prepare data for modeling. Rows with a missing feature or target are
        # dropped because LinearRegression.fit rejects NaN values.
        model_df = df_reviews[['word_count', 'confidence_int', 'rating_int']].dropna()
        X = model_df[['word_count', 'confidence_int']]
        y = model_df['rating_int']
        
        # Fit regression model
        model = LinearRegression()
        model.fit(X, y)
        
        # Create 3D visualization of the regression.
        # Plot at most 1000 randomly chosen reviews; sampling more rows than
        # the frame holds (without replacement) would raise ValueError.
        plot_sample = df_reviews.sample(min(1000, len(df_reviews)))
        st.subheader("3D Visualization of Review Features")
        fig = px.scatter_3d(plot_sample, 
                           x='word_count', 
                           y='confidence_int', 
                           z='rating_int',
                           title='Review Features in 3D Space',
                           labels={
                               'word_count': 'Word Count',
                               'confidence_int': 'Confidence',
                               'rating_int': 'Rating'
                           })
        fig.update_layout(
            title_x=0.5,
            title_font_size=20,
            scene = dict(
                xaxis_title='Word Count',
                yaxis_title='Confidence',
                zaxis_title='Rating'
            )
        )
        st.plotly_chart(fig, use_container_width=True)
        
        # Show model metrics (R-squared is computed on the training data)
        st.subheader("Model Performance")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("R-squared", f"{model.score(X, y):.3f}")
        with col2:
            st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
        with col3:
            st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")
        
        # Practice Exercises
        st.header("Practice Exercises")
        
        with st.expander("Exercise 1: Feature Engineering"):
            st.write("""
            1. Load the reviews dataset
            2. Create features from review text
            3. Calculate correlation between features
            4. Visualize relationships
            """)
            
            st.code("""
            # Solution
            import pandas as pd
            import numpy as np
            from nltk.tokenize import word_tokenize
            
            # Load data
            df_reviews = pd.read_csv('reviews.csv')
            
            # Create features
            df_reviews['word_count'] = df_reviews['review'].apply(
                lambda x: len(word_tokenize(x)))
            df_reviews['sentence_count'] = df_reviews['review'].apply(
                lambda x: len(x.split('.')))
            
            # Calculate correlation
            correlation = df_reviews[['word_count', 'rating_int', 
                                    'confidence_int']].corr()
            
            # Visualize
            sns.heatmap(correlation, annot=True)
            plt.show()
            """)
        
        with st.expander("Exercise 2: Building a Predictive Model"):
            st.write("""
            1. Prepare features for modeling
            2. Split data into training and test sets
            3. Train a linear regression model
            4. Evaluate model performance
            """)
            
            st.code("""
            # Solution
            from sklearn.model_selection import train_test_split
            from sklearn.linear_model import LinearRegression
            
            # Prepare features
            X = df_reviews[['word_count', 'confidence_int']]
            y = df_reviews['rating_int']
            
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42)
            
            # Train model
            model = LinearRegression()
            model.fit(X_train, y_train)
            
            # Evaluate
            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
            
            print(f"Training R²: {train_score:.3f}")
            print(f"Testing R²: {test_score:.3f}")
            """)

        # Weekly Assignment: the text depends on the "username" stored in the
        # Streamlit session state ("Student" when none is set).
        username = st.session_state.get("username", "Student")
        st.header(f"{username}'s Weekly Assignment")
        
        if username == "manxiii":
            st.markdown("""
            Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
            1. Complete the feature engineering pipeline for the ICLR dataset
            2. Build a linear regression model to predict paper ratings
            3. Analyze the relationship between review features and acceptance
            4. Submit your findings in a Jupyter notebook

            **Due Date:** End of Week 5
            """)
        elif username == "zhu":
            st.markdown("""
            Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
            1. Implement the complete machine learning workflow
            2. Create insightful visualizations of model results
            3. Draw conclusions from your analysis
            4. Submit your work in a Jupyter notebook

            **Due Date:** End of Week 5
            """)
        elif username == "WK":
            st.markdown("""
            Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
            1. Complete the feature engineering pipeline
            2. Build and evaluate a linear regression model
            3. Analyze patterns in the data
            4. Submit your findings

            **Due Date:** End of Week 5
            """)
        else:
            st.markdown(f"""
            Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
            1. Complete the feature engineering pipeline
            2. Build and evaluate a linear regression model
            3. Analyze patterns in the data
            4. Submit your findings

            **Due Date:** End of Week 5
            """)
            
    except Exception as e:
        # Page-level boundary: surface any failure in the data-dependent
        # sections to the user instead of crashing the Streamlit app.
        st.error(f"Error loading data: {str(e)}")
        st.write("Please make sure the data files are in the correct location.")