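# File: DS_webclass/app/pages/week_5.py
# Last commit: ae38d1c "Updating lesson 5" (raymondEDS)
# NOTE: the lines above came from the repository web viewer (together with
# its "raw" / "history blame" / "13.5 kB" chrome); they are kept as comments
# so that this module remains valid Python.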
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from nltk.tokenize import word_tokenize
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import os
# Set up the style for all plots
# These two calls run at import time and change global plotting state:
# matplotlib is first put on its 'default' style, then seaborn's theme
# (white grid background, "husl" colour palette) is applied.
plt.style.use('default')
sns.set_theme(style="whitegrid", palette="husl")
def load_data():
    """Read the four course CSV files into DataFrames.

    The files are looked up in a ``Data`` directory located two levels above
    the directory that contains this module.

    Returns:
        A 4-tuple ``(reviews, submissions, decisions, keywords)`` of
        DataFrames, read in that order. If any file is missing, two
        ``st.error`` messages are shown and ``(None, None, None, None)`` is
        returned instead.
    """
    # <this file's directory>/../../Data
    data_dir = Path(__file__).parent.parent.parent / "Data"
    csv_names = (
        "reviews.csv",
        "Submissions.csv",
        "decision.csv",
        "submission_keyword.csv",
    )
    try:
        frames = tuple(pd.read_csv(data_dir / csv_name) for csv_name in csv_names)
    except FileNotFoundError as e:
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {e!s}")
        return None, None, None, None
    return frames
def create_feature_plot(df, x_col, y_col, title):
    """Build a centred-title plotly scatter plot of ``y_col`` against ``x_col``.

    Args:
        df: Data passed straight to ``px.scatter``.
        x_col: Column name plotted on the x axis.
        y_col: Column name plotted on the y axis.
        title: Figure title.

    Returns:
        The plotly figure. Axis labels are the column names with underscores
        replaced by spaces and title-cased.
    """
    # e.g. "word_count" -> "Word Count"
    axis_labels = {
        column: column.replace('_', ' ').title()
        for column in (x_col, y_col)
    }
    scatter = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels=axis_labels,
        template="plotly_white",
    )
    layout_options = {
        "title_x": 0.5,
        "title_font_size": 20,
        "showlegend": True,
        "plot_bgcolor": 'white',
        "paper_bgcolor": 'white',
    }
    scatter.update_layout(**layout_options)
    return scatter
def create_correlation_heatmap(df, columns):
    """Build a plotly heatmap of the pairwise correlations of ``columns``.

    Args:
        df: DataFrame holding the data.
        columns: Column labels to select from ``df`` before calling ``.corr()``.

    Returns:
        A plotly figure whose colour scale is fixed to the range [-1, 1].
    """
    corr_matrix = df[columns].corr()
    feature_names = corr_matrix.columns

    heatmap_trace = go.Heatmap(
        z=corr_matrix,
        x=feature_names,
        y=feature_names,
        colorscale='RdBu',
        zmin=-1, zmax=1,
    )
    heatmap_fig = go.Figure(data=heatmap_trace)

    layout_options = {
        "title": 'Feature Correlation Heatmap',
        "title_x": 0.5,
        "title_font_size": 20,
        "plot_bgcolor": 'white',
        "paper_bgcolor": 'white',
    }
    heatmap_fig.update_layout(**layout_options)
    return heatmap_fig
def _count_sentences(text):
    """Return the number of non-empty, period-delimited segments in ``text``.

    A plain ``len(text.split('.'))`` also counts the empty piece that follows
    a trailing period, so "One. Two." would be reported as three sentences.
    """
    return sum(1 for segment in str(text).split('.') if segment.strip())


def _count_words(text):
    """Return the NLTK token count of ``text``, or a whitespace word count.

    ``word_tokenize`` raises ``LookupError`` when the tokenizer data it needs
    is not installed; in that case fall back to ``str.split`` so the demo
    keeps working instead of aborting the page.
    """
    try:
        return len(word_tokenize(text))
    except LookupError:
        return len(text.split())


def _render_overview():
    """Render the course overview and the list of key concepts."""
    # Introduction Section
    st.header("Course Overview")
    st.write("""
In this week, we'll explore machine learning through a fascinating real-world challenge: The Academic Publishing Crisis.
Imagine you're the program chair for a prestigious AI conference. You've just received 5,000 paper submissions, and you need to:
- Decide which papers to accept (only 20% can be accepted)
- Ensure fair and consistent reviews
- Understand what makes reviewers confident in their assessments
The Problem: Human reviewers are inconsistent. Some are harsh, others lenient. Some write detailed reviews, others just a few sentences.
How can we use data to understand and improve this process?
**Your Mission: Build a machine learning system to analyze review patterns and predict paper acceptance!**
""")
    # Learning Path
    st.subheader("Key Concepts You'll Learn")
    st.write("""
1. **Linear Regression (线性回归):**
- Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
- Real-world example: Predicting house prices based on size and location
2. **Correlation Analysis (相关性分析):**
- Definition: Statistical measure that shows how strongly two variables are related
- Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
3. **Reading Linear Regression Output (解读线性回归结果):**
- R-squared (R²): Proportion of variance explained by the model (0-1)
- p-value: Probability that the observed relationship occurred by chance
- Coefficients (系数): How much the dependent variable changes with a one-unit change in the independent variable
- Standard errors: Uncertainty in coefficient estimates
- Confidence intervals: Range where true coefficient likely lies
""")


def _render_data_exploration(df_reviews):
    """Render Module 1 and add the text-derived columns to ``df_reviews``.

    Adds ``word_count`` and ``sentence_count`` columns in place. Reads the
    ``review``, ``rating_int`` and ``confidence_int`` columns.
    """
    st.header("Module 1: Data Exploration")
    st.write("Let's explore our dataset to understand the review patterns:")
    # Create features from review text (str() so non-string cells do not raise)
    df_reviews['word_count'] = df_reviews['review'].apply(lambda x: len(str(x).split()))
    df_reviews['sentence_count'] = df_reviews['review'].apply(_count_sentences)
    # Show basic statistics
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Reviews", len(df_reviews))
        st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
    with col2:
        st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
        st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
    # Create interactive visualizations
    st.subheader("Review Length vs Rating")
    fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
                              'Relationship between Review Length and Rating')
    st.plotly_chart(fig, use_container_width=True)
    # Correlation analysis
    st.subheader("Feature Correlations")
    corr_fig = create_correlation_heatmap(df_reviews,
                                          ['word_count', 'rating_int', 'confidence_int'])
    st.plotly_chart(corr_fig, use_container_width=True)


def _render_feature_engineering():
    """Render Module 2: the interactive review-text feature extractor."""
    st.header("Module 2: Feature Engineering")
    st.write("""
Let's create more sophisticated features from our review data:
- Review length (word count)
- Review rating
- Reviewer confidence
- Number of keywords in the paper
""")
    # Interactive Feature Engineering
    st.subheader("Try Feature Engineering")
    review_text = st.text_area(
        "Enter a review to analyze:",
        "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
        key="review_text"
    )
    if st.button("Extract Features"):
        # Calculate features
        word_count = _count_words(review_text)
        sentence_count = _count_sentences(review_text)
        # Empty input has zero sentences; avoid dividing by zero.
        words_per_sentence = word_count / sentence_count if sentence_count else 0.0
        # Create a nice display of features
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Word Count", word_count)
        with col2:
            st.metric("Sentence Count", sentence_count)
        with col3:
            st.metric("Average Words per Sentence", f"{words_per_sentence:.1f}")


def _render_regression(df_reviews):
    """Render Module 3: fit and display a linear regression on review features.

    Expects ``df_reviews`` to already contain the ``word_count`` column in
    addition to ``confidence_int`` and ``rating_int``.
    """
    st.header("Module 3: Linear Regression Analysis")
    st.write("""
Let's build a linear regression model to predict paper ratings based on review features.
""")
    # Prepare data for modeling; LinearRegression rejects NaN, so keep only
    # rows where every modelling column is present.
    model_data = df_reviews[['word_count', 'confidence_int', 'rating_int']].dropna()
    if model_data.empty:
        st.warning("No complete rows are available to fit the regression model.")
        return
    X = model_data[['word_count', 'confidence_int']]
    y = model_data['rating_int']
    # Fit regression model
    model = LinearRegression()
    model.fit(X, y)
    # Create 3D visualization of the regression
    st.subheader("3D Visualization of Review Features")
    # Cap the sample at the number of rows (sampling more rows than exist
    # raises) and seed it so the plot does not reshuffle every time
    # Streamlit re-runs the script.
    plot_sample = df_reviews.sample(n=min(1000, len(df_reviews)), random_state=42)
    fig = px.scatter_3d(plot_sample,
                        x='word_count',
                        y='confidence_int',
                        z='rating_int',
                        title='Review Features in 3D Space',
                        labels={
                            'word_count': 'Word Count',
                            'confidence_int': 'Confidence',
                            'rating_int': 'Rating'
                        })
    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        scene=dict(
            xaxis_title='Word Count',
            yaxis_title='Confidence',
            zaxis_title='Rating'
        )
    )
    st.plotly_chart(fig, use_container_width=True)
    # Show model metrics
    st.subheader("Model Performance")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("R-squared", f"{model.score(X, y):.3f}")
    with col2:
        st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
    with col3:
        st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")


def _render_exercises():
    """Render the two practice exercises with their reference solutions."""
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Feature Engineering"):
        st.write("""
1. Load the reviews dataset
2. Create features from review text
3. Calculate correlation between features
4. Visualize relationships
""")
        st.code("""
# Solution
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
# Load data
df_reviews = pd.read_csv('reviews.csv')
# Create features
df_reviews['word_count'] = df_reviews['review'].apply(
lambda x: len(word_tokenize(x)))
df_reviews['sentence_count'] = df_reviews['review'].apply(
lambda x: len(x.split('.')))
# Calculate correlation
correlation = df_reviews[['word_count', 'rating_int',
'confidence_int']].corr()
# Visualize
sns.heatmap(correlation, annot=True)
plt.show()
""")
    with st.expander("Exercise 2: Building a Predictive Model"):
        st.write("""
1. Prepare features for modeling
2. Split data into training and test sets
3. Train a linear regression model
4. Evaluate model performance
""")
        st.code("""
# Solution
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Prepare features
X = df_reviews[['word_count', 'confidence_int']]
y = df_reviews['rating_int']
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Evaluate
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training R²: {train_score:.3f}")
print(f"Testing R²: {test_score:.3f}")
""")


def _render_assignment():
    """Render the weekly assignment for the user stored in session state.

    The user name is read from ``st.session_state["username"]`` and defaults
    to ``"Student"`` when it is absent.
    """
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    if username == "manxiii":
        st.markdown("""
Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
1. Complete the feature engineering pipeline for the ICLR dataset
2. Build a linear regression model to predict paper ratings
3. Analyze the relationship between review features and acceptance
4. Submit your findings in a Jupyter notebook
**Due Date:** End of Week 5
""")
    elif username == "zhu":
        st.markdown("""
Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
1. Implement the complete machine learning workflow
2. Create insightful visualizations of model results
3. Draw conclusions from your analysis
4. Submit your work in a Jupyter notebook
**Due Date:** End of Week 5
""")
    elif username == "WK":
        st.markdown("""
Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
1. Complete the feature engineering pipeline
2. Build and evaluate a linear regression model
3. Analyze patterns in the data
4. Submit your findings
**Due Date:** End of Week 5
""")
    else:
        st.markdown(f"""
Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
1. Complete the feature engineering pipeline
2. Build and evaluate a linear regression model
3. Analyze patterns in the data
4. Submit your findings
**Due Date:** End of Week 5
""")


def show():
    """Render the Week 5 page (machine learning and linear regression).

    Sections appear in this order: overview, the three data-driven modules
    (exploration, feature engineering, regression), practice exercises and
    the weekly assignment.

    The data-driven modules are skipped when ``load_data()`` reports missing
    files (it has already shown its own error messages), and any exception
    they raise is shown with ``st.error`` rather than propagated. The
    exercises and the assignment are static text, so they are rendered even
    when the dataset is unavailable.
    """
    st.title("Week 5: Introduction to Machine Learning and Linear Regression")
    _render_overview()
    try:
        # Load the data; only the reviews table is used on this page.
        df_reviews, *_ = load_data()
        if df_reviews is not None:
            _render_data_exploration(df_reviews)
            _render_feature_engineering()
            _render_regression(df_reviews)
    except Exception as e:
        # Page-level boundary: report the failure instead of crashing the app.
        st.error(f"Error loading data: {str(e)}")
        st.write("Please make sure the data files are in the correct location.")
    _render_exercises()
    _render_assignment()