# Streamlit page for "Week 5: Introduction to Machine Learning and Linear
# Regression". The public entry point is `show()`.

import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
import seaborn as sns
import streamlit as st
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Set up the style for all plots
plt.style.use('default')
sns.set_theme(style="whitegrid", palette="husl")

# Upper bound on the number of points drawn in the 3D scatter plot.
_SCATTER_SAMPLE_SIZE = 1000

# Columns used by the regression model.
_FEATURE_COLUMNS = ['word_count', 'confidence_int']
_TARGET_COLUMN = 'rating_int'

# Assignment greeting and per-user task lists. Users without an entry get
# _DEFAULT_ASSIGNMENT_TASKS.
_ASSIGNMENT_INTRO = "here is your Assignment 5: Machine Learning Analysis."
_DEFAULT_ASSIGNMENT_TASKS = (
    "Complete the feature engineering pipeline",
    "Build and evaluate a linear regression model",
    "Analyze patterns in the data",
    "Submit your findings",
)
_ASSIGNMENT_TASKS = {
    "manxiii": (
        "Complete the feature engineering pipeline for the ICLR dataset",
        "Build a linear regression model to predict paper ratings",
        "Analyze the relationship between review features and acceptance",
        "Submit your findings in a Jupyter notebook",
    ),
    "zhu": (
        "Implement the complete machine learning workflow",
        "Create insightful visualizations of model results",
        "Draw conclusions from your analysis",
        "Submit your work in a Jupyter notebook",
    ),
}


def load_data():
    """Load the four course datasets from the ``Data`` directory.

    The directory is resolved relative to this file: two levels above the
    directory that contains it, then ``Data``.

    Returns:
        A tuple ``(df_reviews, df_submissions, df_dec, df_keyword)`` of
        DataFrames. If any file is missing, the problem is reported through
        ``st.error`` and ``(None, None, None, None)`` is returned instead.
    """
    # Get the current file's directory
    current_dir = Path(__file__).parent

    # Navigate to the Data directory (two levels up from the pages directory)
    data_dir = current_dir.parent.parent / "Data"

    # Load the datasets
    try:
        df_reviews = pd.read_csv(data_dir / "reviews.csv")
        df_submissions = pd.read_csv(data_dir / "Submissions.csv")
        df_dec = pd.read_csv(data_dir / "decision.csv")
        df_keyword = pd.read_csv(data_dir / "submission_keyword.csv")
    except FileNotFoundError as e:
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {str(e)}")
        return None, None, None, None
    return df_reviews, df_submissions, df_dec, df_keyword


def create_feature_plot(df, x_col, y_col, title):
    """Create an interactive scatter plot using plotly.

    Args:
        df: DataFrame holding the data to plot.
        x_col: Name of the column drawn on the x axis.
        y_col: Name of the column drawn on the y axis.
        title: Figure title.

    Returns:
        The plotly figure. Axis labels are the column names with underscores
        replaced by spaces and title-cased.
    """
    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels={
            x_col: x_col.replace('_', ' ').title(),
            y_col: y_col.replace('_', ' ').title(),
        },
        template="plotly_white",
    )
    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        showlegend=True,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    return fig


def create_correlation_heatmap(df, columns):
    """Create a correlation heatmap using plotly.

    Args:
        df: DataFrame holding the data.
        columns: Column names to include in the correlation matrix.

    Returns:
        The plotly figure showing ``df[columns].corr()`` on a fixed
        -1..1 colour scale.
    """
    corr = df[columns].corr()
    fig = go.Figure(data=go.Heatmap(
        z=corr,
        x=corr.columns,
        y=corr.columns,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
    ))
    fig.update_layout(
        title='Feature Correlation Heatmap',
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    return fig


def _count_sentences(text):
    """Return the number of non-empty '.'-separated segments in *text*."""
    # Skipping blank segments keeps a trailing period from adding a phantom
    # sentence ("A. B." is two sentences, not three).
    return sum(1 for segment in text.split('.') if segment.strip())


def _count_tokens(text):
    """Return the NLTK token count of *text*, or a whitespace count as fallback."""
    try:
        return len(word_tokenize(text))
    except LookupError:
        # Raised by NLTK when the tokenizer data has not been downloaded.
        return len(text.split())


def _load_review_features():
    """Load the reviews and add the text-derived feature columns.

    Returns:
        The reviews DataFrame with ``word_count`` and ``sentence_count``
        columns added, or ``None`` (after reporting through ``st.error``)
        when the data could not be loaded or prepared.
    """
    try:
        df_reviews, _df_submissions, _df_dec, _df_keyword = load_data()
        if df_reviews is None:
            # load_data() has already reported the missing files.
            return None

        # Create features from review text. Missing reviews become empty
        # strings so they count as zero words rather than the literal "nan".
        review_text = df_reviews['review'].fillna('').astype(str)
        df_reviews['word_count'] = review_text.apply(lambda x: len(x.split()))
        df_reviews['sentence_count'] = review_text.apply(_count_sentences)
        return df_reviews
    except Exception as e:  # top-level UI boundary: report instead of crashing
        st.error(f"Error loading data: {str(e)}")
        return None


def _render_section(section, *args):
    """Run one page section, reporting a failure without aborting the page."""
    try:
        section(*args)
    except Exception as e:  # top-level UI boundary: report instead of crashing
        st.error(f"Error while rendering this section: {str(e)}")


def _render_overview():
    """Render the course overview and the list of key concepts."""
    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this week, we'll explore machine learning through a fascinating real-world challenge: The Academic Publishing Crisis.

    Imagine you're the program chair for a prestigious AI conference. You've just received 5,000 paper submissions, and you need to:
    - Decide which papers to accept (only 20% can be accepted)
    - Ensure fair and consistent reviews
    - Understand what makes reviewers confident in their assessments

    The Problem: Human reviewers are inconsistent. Some are harsh, others lenient. Some write detailed reviews, others just a few sentences. How can we use data to understand and improve this process?

    **Your Mission: Build a machine learning system to analyze review patterns and predict paper acceptance!**
    """)

    # Learning Path
    st.subheader("Key Concepts You'll Learn")
    st.write("""
    1. **Linear Regression (线性回归):**
       - Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
       - Real-world example: Predicting house prices based on size and location

    2. **Correlation Analysis (相关性分析):**
       - Definition: Statistical measure that shows how strongly two variables are related
       - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)

    3. **Reading Linear Regression Output (解读线性回归结果):**
       - R-squared (R²): Proportion of variance explained by the model (0-1)
       - p-value: Probability that the observed relationship occurred by chance
       - Coefficients (系数): How much the dependent variable changes with a one-unit change in the independent variable
       - Standard errors: Uncertainty in coefficient estimates
       - Confidence intervals: Range where true coefficient likely lies
    """)


def _render_data_exploration(df_reviews):
    """Render Module 1: summary metrics, scatter plot and correlation heatmap."""
    st.header("Module 1: Data Exploration")
    st.write("Let's explore our dataset to understand the review patterns:")

    # Show basic statistics
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Reviews", len(df_reviews))
        st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
    with col2:
        st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
        st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")

    # Create interactive visualizations
    st.subheader("Review Length vs Rating")
    fig = create_feature_plot(
        df_reviews,
        'word_count',
        'rating_int',
        'Relationship between Review Length and Rating',
    )
    st.plotly_chart(fig, use_container_width=True)

    # Correlation analysis
    st.subheader("Feature Correlations")
    corr_fig = create_correlation_heatmap(
        df_reviews, ['word_count', 'rating_int', 'confidence_int'])
    st.plotly_chart(corr_fig, use_container_width=True)


def _render_feature_engineering():
    """Render Module 2: the interactive review feature extractor."""
    st.header("Module 2: Feature Engineering")
    st.write("""
    Let's create more sophisticated features from our review data:
    - Review length (word count)
    - Review rating
    - Reviewer confidence
    - Number of keywords in the paper
    """)

    # Interactive Feature Engineering
    st.subheader("Try Feature Engineering")
    review_text = st.text_area(
        "Enter a review to analyze:",
        "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
        key="review_text",
    )

    if st.button("Extract Features"):
        # Calculate features
        word_count = _count_tokens(review_text)
        sentence_count = _count_sentences(review_text)
        # Blank input has no sentences; avoid dividing by zero.
        words_per_sentence = word_count / sentence_count if sentence_count else 0.0

        # Create a nice display of features
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Word Count", word_count)
        with col2:
            st.metric("Sentence Count", sentence_count)
        with col3:
            st.metric("Average Words per Sentence", f"{words_per_sentence:.1f}")


def _render_regression(df_reviews):
    """Render Module 3: fit the linear regression and show its metrics."""
    st.header("Module 3: Linear Regression Analysis")
    st.write("""
    Let's build a linear regression model to predict paper ratings based on review features.
    """)

    # Prepare data for modeling. LinearRegression rejects NaN values, so only
    # rows with every modelling column present are used.
    model_df = df_reviews[_FEATURE_COLUMNS + [_TARGET_COLUMN]].dropna()
    if model_df.empty:
        st.warning("No complete rows are available to fit the regression model.")
        return
    X = model_df[_FEATURE_COLUMNS]
    y = model_df[_TARGET_COLUMN]

    # Fit regression model
    model = LinearRegression()
    model.fit(X, y)

    # Create 3D visualization of the regression. The sample size is capped at
    # the number of available rows because DataFrame.sample raises when asked
    # for more rows than exist.
    st.subheader("3D Visualization of Review Features")
    plot_df = model_df.sample(min(_SCATTER_SAMPLE_SIZE, len(model_df)))
    fig = px.scatter_3d(
        plot_df,
        x='word_count',
        y='confidence_int',
        z='rating_int',
        title='Review Features in 3D Space',
        labels={
            'word_count': 'Word Count',
            'confidence_int': 'Confidence',
            'rating_int': 'Rating',
        },
    )
    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        scene=dict(
            xaxis_title='Word Count',
            yaxis_title='Confidence',
            zaxis_title='Rating',
        ),
    )
    st.plotly_chart(fig, use_container_width=True)

    # Show model metrics
    st.subheader("Model Performance")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("R-squared", f"{model.score(X, y):.3f}")
    with col2:
        st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
    with col3:
        st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")


def _render_exercises():
    """Render the two practice exercises with their reference solutions."""
    st.header("Practice Exercises")

    with st.expander("Exercise 1: Feature Engineering"):
        st.write("""
        1. Load the reviews dataset
        2. Create features from review text
        3. Calculate correlation between features
        4. Visualize relationships
        """)

        # The solution imports seaborn and matplotlib because it calls
        # sns.heatmap and plt.show.
        st.code("""
        # Solution
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt
        import seaborn as sns
        from nltk.tokenize import word_tokenize

        # Load data
        df_reviews = pd.read_csv('reviews.csv')

        # Create features
        df_reviews['word_count'] = df_reviews['review'].apply(
            lambda x: len(word_tokenize(x)))
        df_reviews['sentence_count'] = df_reviews['review'].apply(
            lambda x: len(x.split('.')))

        # Calculate correlation
        correlation = df_reviews[['word_count', 'rating_int', 'confidence_int']].corr()

        # Visualize
        sns.heatmap(correlation, annot=True)
        plt.show()
        """)

    with st.expander("Exercise 2: Building a Predictive Model"):
        st.write("""
        1. Prepare features for modeling
        2. Split data into training and test sets
        3. Train a linear regression model
        4. Evaluate model performance
        """)

        st.code("""
        # Solution
        from sklearn.model_selection import train_test_split
        from sklearn.linear_model import LinearRegression

        # Prepare features
        X = df_reviews[['word_count', 'confidence_int']]
        y = df_reviews['rating_int']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Train model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Evaluate
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        print(f"Training R²: {train_score:.3f}")
        print(f"Testing R²: {test_score:.3f}")
        """)


def _render_assignment():
    """Render the weekly assignment for the user stored in the session state."""
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")

    tasks = _ASSIGNMENT_TASKS.get(username, _DEFAULT_ASSIGNMENT_TASKS)
    task_list = "\n".join(
        f"{number}. {task}" for number, task in enumerate(tasks, start=1))
    st.markdown(
        f"Hello **{username}**, {_ASSIGNMENT_INTRO}\n\n"
        f"{task_list}\n\n"
        "**Due Date:** End of Week 5"
    )


def show():
    """Render the complete Week 5 page.

    The overview, the interactive feature extractor, the practice exercises
    and the weekly assignment do not depend on the datasets and are always
    rendered. The data exploration and regression modules are rendered only
    when the reviews could be loaded; otherwise a hint about the data
    location is shown in their place.
    """
    st.title("Week 5: Introduction to Machine Learning and Linear Regression")
    _render_overview()

    # Load the data
    df_reviews = _load_review_features()
    if df_reviews is None:
        st.write("Please make sure the data files are in the correct location.")
    else:
        _render_section(_render_data_exploration, df_reviews)

    _render_section(_render_feature_engineering)

    if df_reviews is not None:
        _render_section(_render_regression, df_reviews)

    _render_exercises()
    _render_assignment()