# Page-capture header (not part of the program): Spaces status "Sleeping";
# file size 13,454 bytes; revisions faeb953 / ae38d1c.
# The viewer's line-number gutter (1-340) that followed has been dropped.
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from nltk.tokenize import word_tokenize
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import os
# Set up the style for all plots.
# These are module-level statements, so they run when this module is loaded.
# First select Matplotlib's 'default' style, then apply seaborn's theme with
# the "whitegrid" style and the "husl" colour palette.
# NOTE(review): both calls change global plotting settings, so their relative
# order presumably matters - keep them in this sequence.
plt.style.use('default')
sns.set_theme(style="whitegrid", palette="husl")
def load_data():
    """Read the four CSV datasets from the project's ``Data`` directory.

    The directory is resolved relative to this file: the file's own folder,
    then two levels up, then ``Data``.

    Returns:
        A 4-tuple ``(reviews, submissions, decisions, keywords)`` of
        DataFrames. If any file is missing, the problem is reported with
        ``st.error`` and ``(None, None, None, None)`` is returned instead.
    """
    # Folder containing this file -> two levels up -> "Data".
    data_dir = Path(__file__).parent.parent.parent / "Data"

    # File names in the order the callers unpack them.
    csv_names = (
        "reviews.csv",
        "Submissions.csv",
        "decision.csv",
        "submission_keyword.csv",
    )

    try:
        frames = tuple(pd.read_csv(data_dir / csv_name) for csv_name in csv_names)
    except FileNotFoundError as e:
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {str(e)}")
        return None, None, None, None

    return frames
def create_feature_plot(df, x_col, y_col, title):
    """Build an interactive Plotly scatter plot of ``y_col`` against ``x_col``.

    Args:
        df: Data passed straight through to ``px.scatter``.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        title: Figure title (centred by the layout settings below).

    Returns:
        The figure returned by ``px.scatter`` after the layout update.
    """
    # Axis labels are the column names with underscores turned into spaces
    # and each word capitalised.
    axis_labels = {
        column: column.replace('_', ' ').title()
        for column in (x_col, y_col)
    }

    scatter = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels=axis_labels,
        template="plotly_white",
    )

    layout_options = {
        'title_x': 0.5,
        'title_font_size': 20,
        'showlegend': True,
        'plot_bgcolor': 'white',
        'paper_bgcolor': 'white',
    }
    scatter.update_layout(**layout_options)
    return scatter
def create_correlation_heatmap(df, columns):
    """Build a Plotly heatmap of the pairwise correlations of ``columns``.

    Args:
        df: DataFrame holding the data.
        columns: Column names whose correlation matrix is drawn.

    Returns:
        A ``go.Figure`` containing one heatmap trace with a fixed colour
        range of -1 to 1.
    """
    correlation_matrix = df[columns].corr()
    axis_names = correlation_matrix.columns

    heatmap_trace = go.Heatmap(
        z=correlation_matrix,
        x=axis_names,
        y=axis_names,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
    )
    heatmap_fig = go.Figure(data=heatmap_trace)

    layout_options = {
        'title': 'Feature Correlation Heatmap',
        'title_x': 0.5,
        'title_font_size': 20,
        'plot_bgcolor': 'white',
        'paper_bgcolor': 'white',
    }
    heatmap_fig.update_layout(**layout_options)
    return heatmap_fig
# Columns of the reviews table that this page reads directly.
_REQUIRED_REVIEW_COLUMNS = ('review', 'rating_int', 'confidence_int')

# Upper bound on the number of points drawn in the 3D scatter plot.
_MAX_3D_POINTS = 1000

# Per-user assignment task lists; any other username gets the default list.
_ASSIGNMENT_TASKS = {
    "manxiii": (
        "Complete the feature engineering pipeline for the ICLR dataset",
        "Build a linear regression model to predict paper ratings",
        "Analyze the relationship between review features and acceptance",
        "Submit your findings in a Jupyter notebook",
    ),
    "zhu": (
        "Implement the complete machine learning workflow",
        "Create insightful visualizations of model results",
        "Draw conclusions from your analysis",
        "Submit your work in a Jupyter notebook",
    ),
}
_DEFAULT_ASSIGNMENT_TASKS = (
    "Complete the feature engineering pipeline",
    "Build and evaluate a linear regression model",
    "Analyze patterns in the data",
    "Submit your findings",
)

# Solution snippets displayed (not executed) in the practice exercises.
# Exercise 1 imports matplotlib and seaborn because the snippet calls
# ``sns.heatmap`` and ``plt.show``.
_EXERCISE_1_SOLUTION = """
# Solution
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.tokenize import word_tokenize

# Load data
df_reviews = pd.read_csv('reviews.csv')

# Create features
df_reviews['word_count'] = df_reviews['review'].apply(
    lambda x: len(word_tokenize(x)))
df_reviews['sentence_count'] = df_reviews['review'].apply(
    lambda x: len(x.split('.')))

# Calculate correlation
correlation = df_reviews[['word_count', 'rating_int',
                          'confidence_int']].corr()

# Visualize
sns.heatmap(correlation, annot=True)
plt.show()
"""

_EXERCISE_2_SOLUTION = """
# Solution
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Prepare features
X = df_reviews[['word_count', 'confidence_int']]
y = df_reviews['rating_int']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training R²: {train_score:.3f}")
print(f"Testing R²: {test_score:.3f}")
"""


def _as_text(value):
    """Return ``value`` as a string, mapping missing values to ``""``."""
    # Without this, a missing review would become the literal text "nan"
    # and be counted as one word / one sentence.
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ""
    return str(value)


def _count_sentences(text):
    """Count the non-empty segments of ``text`` separated by periods."""
    # Skipping blank segments keeps a trailing "." from adding a phantom
    # sentence ("A. B." has two sentences, not three).
    return sum(1 for segment in _as_text(text).split('.') if segment.strip())


def _count_tokens(text):
    """Count NLTK tokens in ``text``, or whitespace-separated words as a fallback."""
    try:
        return len(word_tokenize(text))
    except LookupError:
        # NLTK raises LookupError when its tokenizer data is not installed;
        # degrade to a plain whitespace split instead of failing the page.
        return len(text.split())


def _prepare_reviews(df_reviews):
    """Validate the reviews table and return a copy with text features added.

    Adds ``word_count`` and ``sentence_count`` columns derived from the
    ``review`` column. Returns ``None`` (after reporting the reason on the
    page) when required columns are missing or the table has no rows.
    """
    missing = [col for col in _REQUIRED_REVIEW_COLUMNS if col not in df_reviews.columns]
    if missing:
        st.error(f"The reviews data is missing required column(s): {', '.join(missing)}")
        return None
    if df_reviews.empty:
        st.warning("The reviews data contains no rows, so there is nothing to analyze.")
        return None

    prepared = df_reviews.copy()
    prepared['word_count'] = prepared['review'].apply(
        lambda value: len(_as_text(value).split()))
    prepared['sentence_count'] = prepared['review'].apply(_count_sentences)
    return prepared


def _render_introduction():
    """Render the static course overview and key-concepts sections."""
    st.header("Course Overview")
    st.write("""
    In this week, we'll explore machine learning through a fascinating real-world challenge: The Academic Publishing Crisis.

    Imagine you're the program chair for a prestigious AI conference. You've just received 5,000 paper submissions, and you need to:

    - Decide which papers to accept (only 20% can be accepted)
    - Ensure fair and consistent reviews
    - Understand what makes reviewers confident in their assessments

    The Problem: Human reviewers are inconsistent. Some are harsh, others lenient. Some write detailed reviews, others just a few sentences.
    How can we use data to understand and improve this process?

    **Your Mission: Build a machine learning system to analyze review patterns and predict paper acceptance!**
    """)

    st.subheader("Key Concepts You'll Learn")
    st.write("""
    1. **Linear Regression (线性回归):**
       - Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
       - Real-world example: Predicting house prices based on size and location
    2. **Correlation Analysis (相关性分析):**
       - Definition: Statistical measure that shows how strongly two variables are related
       - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
    3. **Reading Linear Regression Output (解读线性回归结果):**
       - R-squared (R²): Proportion of variance explained by the model (0-1)
       - p-value: Probability that the observed relationship occurred by chance
       - Coefficients (系数): How much the dependent variable changes with a one-unit change in the independent variable
       - Standard errors: Uncertainty in coefficient estimates
       - Confidence intervals: Range where true coefficient likely lies
    """)


def _render_data_exploration(df_reviews):
    """Render Module 1: summary metrics, scatter plot and correlation heatmap."""
    st.header("Module 1: Data Exploration")
    st.write("Let's explore our dataset to understand the review patterns:")

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Total Reviews", len(df_reviews))
        st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
    with col2:
        st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
        st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")

    st.subheader("Review Length vs Rating")
    fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
                              'Relationship between Review Length and Rating')
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("Feature Correlations")
    corr_fig = create_correlation_heatmap(
        df_reviews, ['word_count', 'rating_int', 'confidence_int'])
    st.plotly_chart(corr_fig, use_container_width=True)


def _render_feature_engineering():
    """Render Module 2: the interactive text-feature extraction demo."""
    st.header("Module 2: Feature Engineering")
    st.write("""
    Let's create more sophisticated features from our review data:

    - Review length (word count)
    - Review rating
    - Reviewer confidence
    - Number of keywords in the paper
    """)

    st.subheader("Try Feature Engineering")
    review_text = st.text_area(
        "Enter a review to analyze:",
        "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
        key="review_text"
    )

    if st.button("Extract Features"):
        word_count = _count_tokens(review_text)
        sentence_count = _count_sentences(review_text)
        # Empty input has zero sentences; avoid dividing by zero.
        words_per_sentence = word_count / sentence_count if sentence_count else 0.0

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Word Count", word_count)
        with col2:
            st.metric("Sentence Count", sentence_count)
        with col3:
            st.metric("Average Words per Sentence", f"{words_per_sentence:.1f}")


def _render_regression(df_reviews):
    """Render Module 3: fit a linear regression and show its plot and metrics."""
    st.header("Module 3: Linear Regression Analysis")
    st.write("""
    Let's build a linear regression model to predict paper ratings based on review features.
    """)

    # The regression cannot be fitted on rows with missing values, so only
    # complete rows are used for the model and the plot.
    model_data = df_reviews[['word_count', 'confidence_int', 'rating_int']].dropna()
    if model_data.empty:
        st.warning("No complete reviews are available to fit a regression model.")
        return

    X = model_data[['word_count', 'confidence_int']]
    y = model_data['rating_int']

    model = LinearRegression()
    model.fit(X, y)

    st.subheader("3D Visualization of Review Features")
    # Never ask for more rows than exist (sample() would raise), and fix the
    # seed so the plot does not reshuffle on every widget interaction.
    plot_sample = model_data.sample(min(_MAX_3D_POINTS, len(model_data)),
                                    random_state=42)
    fig = px.scatter_3d(plot_sample,
                        x='word_count',
                        y='confidence_int',
                        z='rating_int',
                        title='Review Features in 3D Space',
                        labels={
                            'word_count': 'Word Count',
                            'confidence_int': 'Confidence',
                            'rating_int': 'Rating'
                        })
    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        scene=dict(
            xaxis_title='Word Count',
            yaxis_title='Confidence',
            zaxis_title='Rating'
        )
    )
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("Model Performance")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("R-squared", f"{model.score(X, y):.3f}")
    with col2:
        st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
    with col3:
        st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")


def _render_exercises():
    """Render the two practice exercises with their solution snippets."""
    st.header("Practice Exercises")

    with st.expander("Exercise 1: Feature Engineering"):
        st.write("""
        1. Load the reviews dataset
        2. Create features from review text
        3. Calculate correlation between features
        4. Visualize relationships
        """)
        st.code(_EXERCISE_1_SOLUTION.strip())

    with st.expander("Exercise 2: Building a Predictive Model"):
        st.write("""
        1. Prepare features for modeling
        2. Split data into training and test sets
        3. Train a linear regression model
        4. Evaluate model performance
        """)
        st.code(_EXERCISE_2_SOLUTION.strip())


def _render_assignment():
    """Render the weekly assignment for the user stored in the session state."""
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")

    tasks = _ASSIGNMENT_TASKS.get(username, _DEFAULT_ASSIGNMENT_TASKS)
    task_list = "\n".join(
        f"{number}. {task}" for number, task in enumerate(tasks, start=1))
    # Blank lines separate the greeting, the numbered list and the due date
    # so that Markdown renders them as distinct blocks.
    st.markdown(
        f"Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.\n\n"
        f"{task_list}\n\n"
        "**Due Date:** End of Week 5"
    )


def show():
    """Render the Week 5 page: introduction, three modules, exercises, assignment.

    The data-driven modules (1-3) are rendered only when the reviews data
    loads and validates; any failure there is reported on the page. The
    practice exercises and the weekly assignment do not use the data and
    are always rendered.
    """
    st.title("Week 5: Introduction to Machine Learning and Linear Regression")
    _render_introduction()

    # Page-level boundary: report unexpected data problems on the page rather
    # than aborting the render of the remaining sections.
    try:
        df_reviews, *_ = load_data()
        # load_data() returns None placeholders after it has already shown
        # its own "files not found" error, so nothing more is reported here.
        if df_reviews is not None:
            df_reviews = _prepare_reviews(df_reviews)
        if df_reviews is not None:
            _render_data_exploration(df_reviews)
            _render_feature_engineering()
            _render_regression(df_reviews)
    except Exception as e:
        st.error(f"Error loading or analyzing data: {e}")
        st.write("Please make sure the data files are in the correct location.")

    _render_exercises()
    _render_assignment()