# Spaces:
# Sleeping
# Sleeping
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.linear_model import LinearRegression | |
from sklearn.metrics import r2_score | |
import scipy.stats as stats | |
from nltk.tokenize import word_tokenize | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from pathlib import Path | |
import os | |
# Give every matplotlib/seaborn figure created by this module one shared look.
# Order matters: the seaborn theme is applied on top of the matplotlib style.
plt.style.use('default')
sns.set_theme(palette="husl", style="whitegrid")
def load_data():
    """Read the four course CSV files from the project's ``Data`` directory.

    The directory is resolved relative to this file: two levels above the
    folder that contains it.

    Returns:
        A 4-tuple ``(reviews, submissions, decisions, keywords)`` of
        DataFrames. If any file is missing, the problem is reported through
        ``st.error`` and ``(None, None, None, None)`` is returned instead.
    """
    data_dir = Path(__file__).parent.parent.parent / "Data"
    # Order matters: it is the order of the returned tuple.
    csv_names = (
        "reviews.csv",
        "Submissions.csv",
        "decision.csv",
        "submission_keyword.csv",
    )

    try:
        frames = tuple(pd.read_csv(data_dir / csv_name) for csv_name in csv_names)
    except FileNotFoundError as e:
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {str(e)}")
        return None, None, None, None

    return frames
def create_feature_plot(df, x_col, y_col, title):
    """Build an interactive Plotly scatter plot of ``y_col`` against ``x_col``.

    Args:
        df: DataFrame holding both columns.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        title: Figure title (centred).

    Returns:
        The configured Plotly figure.
    """
    # Axis labels are the column names with underscores turned into spaces
    # and each word capitalised, e.g. "word_count" -> "Word Count".
    axis_labels = {
        column: column.replace('_', ' ').title() for column in (x_col, y_col)
    }

    scatter = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels=axis_labels,
        template="plotly_white",
    )

    layout_options = dict(
        title_x=0.5,
        title_font_size=20,
        showlegend=True,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    scatter.update_layout(**layout_options)
    return scatter
def create_correlation_heatmap(df, columns):
    """Build a Plotly heatmap of the pairwise correlations of ``columns``.

    Args:
        df: DataFrame containing the columns to correlate.
        columns: Column names to include in the correlation matrix.

    Returns:
        A Plotly figure whose colour scale is fixed to the range [-1, 1].
    """
    corr_matrix = df[columns].corr()
    axis_names = corr_matrix.columns

    heatmap = go.Heatmap(
        z=corr_matrix,
        x=axis_names,
        y=axis_names,
        colorscale='RdBu',
        zmin=-1, zmax=1,
    )
    figure = go.Figure(data=heatmap)

    layout_options = dict(
        title='Feature Correlation Heatmap',
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    figure.update_layout(**layout_options)
    return figure
def _count_sentences(text):
    """Return the number of non-empty '.'-separated segments in *text*.

    A plain ``split('.')`` also counts the empty piece that follows a
    trailing full stop, so "One. Two." would be reported as three sentences.
    """
    return sum(1 for segment in str(text).split('.') if segment.strip())


def _count_words(text):
    """Count the tokens in *text* with NLTK, or by whitespace if unavailable."""
    try:
        return len(word_tokenize(text))
    except LookupError:
        # NLTK raises LookupError when its tokenizer data is not installed;
        # degrade to a whitespace split instead of breaking the page.
        return len(text.split())


def _add_text_features(df_reviews):
    """Add ``word_count`` and ``sentence_count`` columns to *df_reviews* in place."""
    # Missing reviews become empty text so they count as 0 words/sentences
    # instead of being measured as the literal string "nan".
    review_text = df_reviews['review'].fillna('').astype(str)
    df_reviews['word_count'] = review_text.apply(lambda text: len(text.split()))
    df_reviews['sentence_count'] = review_text.apply(_count_sentences)


def _render_introduction():
    """Render the course overview and the list of key concepts."""
    st.header("Course Overview")
    st.write("""
    In this week, we'll explore machine learning through a fascinating real-world challenge: The Academic Publishing Crisis.
    Imagine you're the program chair for a prestigious AI conference. You've just received 5,000 paper submissions, and you need to:
    - Decide which papers to accept (only 20% can be accepted)
    - Ensure fair and consistent reviews
    - Understand what makes reviewers confident in their assessments
    The Problem: Human reviewers are inconsistent. Some are harsh, others lenient. Some write detailed reviews, others just a few sentences.
    How can we use data to understand and improve this process?
    **Your Mission: Build a machine learning system to analyze review patterns and predict paper acceptance!**
    """)

    st.subheader("Key Concepts You'll Learn")
    st.write("""
    1. **Linear Regression (线性回归):**
    - Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
    - Real-world example: Predicting house prices based on size and location
    2. **Correlation Analysis (相关性分析):**
    - Definition: Statistical measure that shows how strongly two variables are related
    - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
    3. **Reading Linear Regression Output (解读线性回归结果):**
    - R-squared (R²): Proportion of variance explained by the model (0-1)
    - p-value: Probability that the observed relationship occurred by chance
    - Coefficients (系数): How much the dependent variable changes with a one-unit change in the independent variable
    - Standard errors: Uncertainty in coefficient estimates
    - Confidence intervals: Range where true coefficient likely lies
    """)


def _render_data_exploration(df_reviews):
    """Render Module 1: summary metrics, a scatter plot and a correlation heatmap.

    Expects *df_reviews* to already carry the ``word_count`` column.
    """
    st.header("Module 1: Data Exploration")
    st.write("Let's explore our dataset to understand the review patterns:")

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Reviews", len(df_reviews))
        st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
    with col2:
        st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
        st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")

    st.subheader("Review Length vs Rating")
    fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
                              'Relationship between Review Length and Rating')
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("Feature Correlations")
    corr_fig = create_correlation_heatmap(
        df_reviews, ['word_count', 'rating_int', 'confidence_int'])
    st.plotly_chart(corr_fig, use_container_width=True)


def _render_feature_engineering():
    """Render Module 2: an interactive feature extractor for a typed review."""
    st.header("Module 2: Feature Engineering")
    st.write("""
    Let's create more sophisticated features from our review data:
    - Review length (word count)
    - Review rating
    - Reviewer confidence
    - Number of keywords in the paper
    """)

    st.subheader("Try Feature Engineering")
    review_text = st.text_area(
        "Enter a review to analyze:",
        "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
        key="review_text"
    )

    if st.button("Extract Features"):
        word_count = _count_words(review_text)
        sentence_count = _count_sentences(review_text)
        # Empty input has no sentences; avoid dividing by zero.
        words_per_sentence = word_count / sentence_count if sentence_count else 0.0

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Word Count", word_count)
        with col2:
            st.metric("Sentence Count", sentence_count)
        with col3:
            st.metric("Average Words per Sentence", f"{words_per_sentence:.1f}")


def _render_regression(df_reviews):
    """Render Module 3: fit a linear regression and show its metrics.

    Expects *df_reviews* to already carry the ``word_count`` column.
    """
    st.header("Module 3: Linear Regression Analysis")
    st.write("""
    Let's build a linear regression model to predict paper ratings based on review features.
    """)

    feature_cols = ['word_count', 'confidence_int']
    # LinearRegression rejects NaN, so fit only on rows where both the
    # features and the target are present.
    model_data = df_reviews[feature_cols + ['rating_int']].dropna()
    if model_data.empty:
        st.warning("Not enough complete review data to fit the regression model.")
        return
    X = model_data[feature_cols]
    y = model_data['rating_int']

    model = LinearRegression()
    model.fit(X, y)

    st.subheader("3D Visualization of Review Features")
    # Cap the sample at the dataset size (sampling more rows than exist
    # raises ValueError) and fix the seed so the plot does not reshuffle on
    # every Streamlit rerun.
    sample_size = min(1000, len(df_reviews))
    fig = px.scatter_3d(df_reviews.sample(n=sample_size, random_state=42),
                        x='word_count',
                        y='confidence_int',
                        z='rating_int',
                        title='Review Features in 3D Space',
                        labels={
                            'word_count': 'Word Count',
                            'confidence_int': 'Confidence',
                            'rating_int': 'Rating'
                        })
    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        scene=dict(
            xaxis_title='Word Count',
            yaxis_title='Confidence',
            zaxis_title='Rating'
        )
    )
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("Model Performance")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("R-squared", f"{model.score(X, y):.3f}")
    with col2:
        st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
    with col3:
        st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")


def _render_practice_exercises():
    """Render the two practice exercises with their reference solutions."""
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Feature Engineering"):
        st.write("""
        1. Load the reviews dataset
        2. Create features from review text
        3. Calculate correlation between features
        4. Visualize relationships
        """)
        # The snippet imports seaborn/matplotlib itself so that it runs as
        # shown; it calls sns.heatmap and plt.show.
        st.code("""
        # Solution
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt
        import seaborn as sns
        from nltk.tokenize import word_tokenize
        # Load data
        df_reviews = pd.read_csv('reviews.csv')
        # Create features
        df_reviews['word_count'] = df_reviews['review'].apply(
        lambda x: len(word_tokenize(x)))
        df_reviews['sentence_count'] = df_reviews['review'].apply(
        lambda x: len(x.split('.')))
        # Calculate correlation
        correlation = df_reviews[['word_count', 'rating_int',
        'confidence_int']].corr()
        # Visualize
        sns.heatmap(correlation, annot=True)
        plt.show()
        """)
    with st.expander("Exercise 2: Building a Predictive Model"):
        st.write("""
        1. Prepare features for modeling
        2. Split data into training and test sets
        3. Train a linear regression model
        4. Evaluate model performance
        """)
        st.code("""
        # Solution
        from sklearn.model_selection import train_test_split
        from sklearn.linear_model import LinearRegression
        # Prepare features
        X = df_reviews[['word_count', 'confidence_int']]
        y = df_reviews['rating_int']
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
        # Train model
        model = LinearRegression()
        model.fit(X_train, y_train)
        # Evaluate
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        print(f"Training R²: {train_score:.3f}")
        print(f"Testing R²: {test_score:.3f}")
        """)


def _render_assignment(username):
    """Render the weekly assignment, personalised for known usernames."""
    default_tasks = (
        "Complete the feature engineering pipeline",
        "Build and evaluate a linear regression model",
        "Analyze patterns in the data",
        "Submit your findings",
    )
    # Users without an entry here (including "WK") get the default tasks.
    personalised_tasks = {
        "manxiii": (
            "Complete the feature engineering pipeline for the ICLR dataset",
            "Build a linear regression model to predict paper ratings",
            "Analyze the relationship between review features and acceptance",
            "Submit your findings in a Jupyter notebook",
        ),
        "zhu": (
            "Implement the complete machine learning workflow",
            "Create insightful visualizations of model results",
            "Draw conclusions from your analysis",
            "Submit your work in a Jupyter notebook",
        ),
    }
    tasks = personalised_tasks.get(username, default_tasks)

    st.header(f"{username}'s Weekly Assignment")
    lines = [f"Hello **{username}**, here is your Assignment 5: Machine Learning Analysis."]
    lines.extend(f"{number}. {task}" for number, task in enumerate(tasks, start=1))
    lines.append("**Due Date:** End of Week 5")
    st.markdown("\n".join(lines))


def show():
    """Render the "Week 5" Streamlit page.

    The page consists of an introduction, three data-driven modules
    (exploration, feature engineering, linear regression), practice
    exercises and a per-user weekly assignment. Failures in the data-driven
    modules are reported on the page; the static exercises and assignment
    are rendered regardless.
    """
    st.title("Week 5: Introduction to Machine Learning and Linear Regression")
    _render_introduction()

    # Top-level UI boundary: report any failure of the data-driven modules on
    # the page instead of letting the whole page crash.
    try:
        df_reviews, _df_submissions, _df_dec, _df_keyword = load_data()
        if df_reviews is None:
            # load_data() has already shown the detailed error messages.
            st.write("Please make sure the data files are in the correct location.")
        else:
            _add_text_features(df_reviews)
            _render_data_exploration(df_reviews)
            _render_feature_engineering()
            _render_regression(df_reviews)
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        st.write("Please make sure the data files are in the correct location.")

    _render_practice_exercises()
    _render_assignment(st.session_state.get("username", "Student"))