Spaces:

raymondEDS
/

DS_webclass

Sleeping

raymondEDS

Updating lesson 5

ae38d1c 3 months ago

13.5 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import r2_score
	import scipy.stats as stats
	from nltk.tokenize import word_tokenize
	import plotly.express as px
	import plotly.graph_objects as go
	from pathlib import Path
	import os

	# Set up the style for all plots.
	# First select matplotlib's 'default' style sheet ...
	plt.style.use('default')
	# ... then apply seaborn's theme with the "whitegrid" style and the "husl"
	# palette. This call runs second, so it is the last styling step executed
	# at import time.
	sns.set_theme(style="whitegrid", palette="husl")

	def load_data():
	    """Read the four lesson CSV files into DataFrames.

	    The files are expected in a ``Data`` directory located two levels above
	    the directory that contains this module.

	    Returns:
	        A 4-tuple of DataFrames in the order (reviews, submissions,
	        decisions, submission keywords). If any file is missing, the problem
	        is reported through ``st.error`` and ``(None, None, None, None)`` is
	        returned instead.
	    """
	    # Directory of this file -> two levels up -> "Data".
	    data_dir = Path(__file__).parent.parent.parent / "Data"
	    file_names = (
	        "reviews.csv",
	        "Submissions.csv",
	        "decision.csv",
	        "submission_keyword.csv",
	    )

	    try:
	        # Files are read in order; the first missing one aborts the load.
	        frames = tuple(pd.read_csv(data_dir / file_name) for file_name in file_names)
	    except FileNotFoundError as e:
	        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
	        st.error(f"Error details: {str(e)}")
	        return None, None, None, None

	    return frames

	def create_feature_plot(df, x_col, y_col, title):
	    """Build an interactive Plotly scatter plot of ``y_col`` against ``x_col``.

	    Args:
	        df: Data passed straight to ``px.scatter``.
	        x_col: Name of the column plotted on the x axis.
	        y_col: Name of the column plotted on the y axis.
	        title: Figure title (centred horizontally).

	    Returns:
	        The configured Plotly figure.
	    """
	    def humanize(column_name):
	        # 'word_count' -> 'Word Count'
	        return column_name.replace('_', ' ').title()

	    axis_labels = {x_col: humanize(x_col), y_col: humanize(y_col)}
	    scatter = px.scatter(
	        df,
	        x=x_col,
	        y=y_col,
	        title=title,
	        labels=axis_labels,
	        template="plotly_white",
	    )

	    layout_options = dict(
	        title_x=0.5,
	        title_font_size=20,
	        showlegend=True,
	        plot_bgcolor='white',
	        paper_bgcolor='white',
	    )
	    scatter.update_layout(**layout_options)
	    return scatter

	def create_correlation_heatmap(df, columns):
	    """Build a Plotly heatmap of the pairwise correlations of ``columns``.

	    Args:
	        df: DataFrame holding the data.
	        columns: Column names whose correlation matrix (``DataFrame.corr``)
	            is displayed.

	    Returns:
	        The configured Plotly figure; the colour scale is fixed to [-1, 1].
	    """
	    corr_matrix = df[columns].corr()
	    axis_names = corr_matrix.columns

	    heatmap = go.Heatmap(
	        z=corr_matrix,
	        x=axis_names,
	        y=axis_names,
	        colorscale='RdBu',
	        zmin=-1,
	        zmax=1,
	    )
	    heatmap_fig = go.Figure(data=heatmap)

	    layout_options = dict(
	        title='Feature Correlation Heatmap',
	        title_x=0.5,
	        title_font_size=20,
	        plot_bgcolor='white',
	        paper_bgcolor='white',
	    )
	    heatmap_fig.update_layout(**layout_options)
	    return heatmap_fig

	def _count_words(text):
	    """Return the number of word tokens in ``text``.

	    Uses NLTK's ``word_tokenize``. If the tokenizer's data resource is not
	    installed (NLTK raises ``LookupError``), falls back to whitespace
	    splitting so the page keeps working.
	    """
	    try:
	        return len(word_tokenize(text))
	    except LookupError:
	        return len(text.split())


	def _count_sentences(text):
	    """Return the number of non-empty '.'-separated segments in ``text``.

	    Empty segments are skipped so that a trailing period (or "...") does not
	    inflate the count, and empty text yields 0.
	    """
	    return sum(1 for segment in str(text).split('.') if segment.strip())


	def _load_review_features():
	    """Load the reviews table and add the text-derived feature columns.

	    Returns:
	        The reviews DataFrame with ``word_count`` and ``sentence_count``
	        columns added, or ``None`` when the data could not be loaded (the
	        problem has then already been reported on the page).
	    """
	    try:
	        # Only the reviews table is used on this page.
	        df_reviews, _submissions, _decisions, _keywords = load_data()
	        if df_reviews is None:
	            # load_data() already displayed the "files not found" error.
	            return None

	        # Create features from review text
	        df_reviews['word_count'] = df_reviews['review'].apply(lambda x: len(str(x).split()))
	        df_reviews['sentence_count'] = df_reviews['review'].apply(_count_sentences)
	        return df_reviews
	    except Exception as e:  # page-level boundary: report instead of crashing the page
	        st.error(f"Error loading data: {str(e)}")
	        st.write("Please make sure the data files are in the correct location.")
	        return None


	def _run_section(render, *args):
	    """Render one data-dependent section, reporting errors without aborting the page."""
	    try:
	        render(*args)
	    except Exception as e:  # keep the remaining sections of the lesson usable
	        st.error(f"Error analyzing review data: {e}")


	def _render_intro():
	    """Render the title, course overview and key-concepts sections."""
	    st.title("Week 5: Introduction to Machine Learning and Linear Regression")

	    # Introduction Section
	    st.header("Course Overview")
	    st.write("""
	    In this week, we'll explore machine learning through a fascinating real-world challenge: The Academic Publishing Crisis.

	    Imagine you're the program chair for a prestigious AI conference. You've just received 5,000 paper submissions, and you need to:
	    - Decide which papers to accept (only 20% can be accepted)
	    - Ensure fair and consistent reviews
	    - Understand what makes reviewers confident in their assessments

	    The Problem: Human reviewers are inconsistent. Some are harsh, others lenient. Some write detailed reviews, others just a few sentences.
	    How can we use data to understand and improve this process?

	    Your Mission: Build a machine learning system to analyze review patterns and predict paper acceptance!
	    """)

	    # Learning Path
	    st.subheader("Key Concepts You'll Learn")
	    st.write("""
	    1. Linear Regression (线性回归):
	    - Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
	    - Real-world example: Predicting house prices based on size and location

	    2. Correlation Analysis (相关性分析):
	    - Definition: Statistical measure that shows how strongly two variables are related
	    - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)

	    3. Reading Linear Regression Output (解读线性回归结果):
	    - R-squared (R²): Proportion of variance explained by the model (0-1)
	    - p-value: Probability that the observed relationship occurred by chance
	    - Coefficients (系数): How much the dependent variable changes with a one-unit change in the independent variable
	    - Standard errors: Uncertainty in coefficient estimates
	    - Confidence intervals: Range where true coefficient likely lies
	    """)


	def _render_data_exploration(df_reviews):
	    """Render Module 1: summary metrics, scatter plot and correlation heatmap."""
	    st.header("Module 1: Data Exploration")
	    st.write("Let's explore our dataset to understand the review patterns:")

	    # Show basic statistics
	    col1, col2 = st.columns(2)
	    with col1:
	        st.metric("Total Reviews", len(df_reviews))
	        st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
	    with col2:
	        st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
	        st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")

	    # Create interactive visualizations
	    st.subheader("Review Length vs Rating")
	    fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
	                              'Relationship between Review Length and Rating')
	    st.plotly_chart(fig, use_container_width=True)

	    # Correlation analysis
	    st.subheader("Feature Correlations")
	    corr_fig = create_correlation_heatmap(df_reviews,
	                                          ['word_count', 'rating_int', 'confidence_int'])
	    st.plotly_chart(corr_fig, use_container_width=True)


	def _render_feature_engineering():
	    """Render Module 2: the interactive "extract features from a review" demo."""
	    st.header("Module 2: Feature Engineering")
	    st.write("""
	    Let's create more sophisticated features from our review data:
	    - Review length (word count)
	    - Review rating
	    - Reviewer confidence
	    - Number of keywords in the paper
	    """)

	    # Interactive Feature Engineering
	    st.subheader("Try Feature Engineering")
	    review_text = st.text_area(
	        "Enter a review to analyze:",
	        "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
	        key="review_text"
	    )

	    if st.button("Extract Features"):
	        # Calculate features
	        word_count = _count_words(review_text)
	        sentence_count = _count_sentences(review_text)
	        # Empty input has no sentences; avoid dividing by zero.
	        words_per_sentence = word_count / sentence_count if sentence_count else 0.0

	        # Create a nice display of features
	        col1, col2, col3 = st.columns(3)
	        with col1:
	            st.metric("Word Count", word_count)
	        with col2:
	            st.metric("Sentence Count", sentence_count)
	        with col3:
	            st.metric("Average Words per Sentence", f"{words_per_sentence:.1f}")


	def _render_regression(df_reviews):
	    """Render Module 3: fit and visualize a linear regression on the reviews."""
	    st.header("Module 3: Linear Regression Analysis")
	    st.write("Let's build a linear regression model to predict paper ratings based on review features.")

	    # Prepare data for modeling. Rows with missing values are dropped because
	    # LinearRegression.fit rejects NaN inputs.
	    feature_cols = ['word_count', 'confidence_int']
	    target_col = 'rating_int'
	    model_df = df_reviews[feature_cols + [target_col]].dropna()
	    if model_df.empty:
	        st.warning("No complete reviews available to fit a regression model.")
	        return

	    X = model_df[feature_cols]
	    y = model_df[target_col]

	    # Fit regression model
	    model = LinearRegression()
	    model.fit(X, y)

	    # Create 3D visualization of the regression.
	    # Plot at most 1000 points: sample() raises ValueError when asked for more
	    # rows than exist. A fixed seed keeps the plot stable across Streamlit reruns.
	    st.subheader("3D Visualization of Review Features")
	    plot_df = model_df.sample(n=min(1000, len(model_df)), random_state=42)
	    fig = px.scatter_3d(plot_df,
	                        x='word_count',
	                        y='confidence_int',
	                        z='rating_int',
	                        title='Review Features in 3D Space',
	                        labels={
	                            'word_count': 'Word Count',
	                            'confidence_int': 'Confidence',
	                            'rating_int': 'Rating'
	                        })
	    fig.update_layout(
	        title_x=0.5,
	        title_font_size=20,
	        scene=dict(
	            xaxis_title='Word Count',
	            yaxis_title='Confidence',
	            zaxis_title='Rating'
	        )
	    )
	    st.plotly_chart(fig, use_container_width=True)

	    # Show model metrics (R-squared is computed on the training data itself)
	    st.subheader("Model Performance")
	    col1, col2, col3 = st.columns(3)
	    with col1:
	        st.metric("R-squared", f"{model.score(X, y):.3f}")
	    with col2:
	        st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
	    with col3:
	        st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")


	def _render_exercises():
	    """Render the practice exercises with their reference solutions."""
	    # Local import: dedent the displayed snippets explicitly so they are shown
	    # flush-left regardless of how this source file is indented.
	    from textwrap import dedent

	    st.header("Practice Exercises")

	    with st.expander("Exercise 1: Feature Engineering"):
	        st.write("""
	        1. Load the reviews dataset
	        2. Create features from review text
	        3. Calculate correlation between features
	        4. Visualize relationships
	        """)

	        # The snippet imports seaborn/matplotlib because it uses sns and plt.
	        st.code(dedent("""
	        # Solution
	        import pandas as pd
	        import numpy as np
	        import matplotlib.pyplot as plt
	        import seaborn as sns
	        from nltk.tokenize import word_tokenize

	        # Load data
	        df_reviews = pd.read_csv('reviews.csv')

	        # Create features
	        df_reviews['word_count'] = df_reviews['review'].apply(
	            lambda x: len(word_tokenize(x)))
	        df_reviews['sentence_count'] = df_reviews['review'].apply(
	            lambda x: len(x.split('.')))

	        # Calculate correlation
	        correlation = df_reviews[['word_count', 'rating_int',
	                                  'confidence_int']].corr()

	        # Visualize
	        sns.heatmap(correlation, annot=True)
	        plt.show()
	        """).strip())

	    with st.expander("Exercise 2: Building a Predictive Model"):
	        st.write("""
	        1. Prepare features for modeling
	        2. Split data into training and test sets
	        3. Train a linear regression model
	        4. Evaluate model performance
	        """)

	        st.code(dedent("""
	        # Solution
	        from sklearn.model_selection import train_test_split
	        from sklearn.linear_model import LinearRegression

	        # Prepare features
	        X = df_reviews[['word_count', 'confidence_int']]
	        y = df_reviews['rating_int']

	        # Split data
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y, test_size=0.2, random_state=42)

	        # Train model
	        model = LinearRegression()
	        model.fit(X_train, y_train)

	        # Evaluate
	        train_score = model.score(X_train, y_train)
	        test_score = model.score(X_test, y_test)

	        print(f"Training R²: {train_score:.3f}")
	        print(f"Testing R²: {test_score:.3f}")
	        """).strip())


	def _render_assignment():
	    """Render the weekly assignment for the user stored in the session state."""
	    username = st.session_state.get("username", "Student")
	    st.header(f"{username}'s Weekly Assignment")

	    # Users with a personalised task list; everyone else (including "WK",
	    # whose text was identical to the generic one) gets the default tasks.
	    tasks_by_user = {
	        "manxiii": (
	            "Complete the feature engineering pipeline for the ICLR dataset",
	            "Build a linear regression model to predict paper ratings",
	            "Analyze the relationship between review features and acceptance",
	            "Submit your findings in a Jupyter notebook",
	        ),
	        "zhu": (
	            "Implement the complete machine learning workflow",
	            "Create insightful visualizations of model results",
	            "Draw conclusions from your analysis",
	            "Submit your work in a Jupyter notebook",
	        ),
	    }
	    default_tasks = (
	        "Complete the feature engineering pipeline",
	        "Build and evaluate a linear regression model",
	        "Analyze patterns in the data",
	        "Submit your findings",
	    )

	    tasks = tasks_by_user.get(username, default_tasks)
	    numbered_tasks = "\n".join(
	        f"{number}. {task}" for number, task in enumerate(tasks, start=1)
	    )
	    st.markdown(
	        f"Hello {username}, here is your Assignment 5: Machine Learning Analysis.\n"
	        f"{numbered_tasks}\n"
	        "\n"
	        "Due Date: End of Week 5"
	    )


	def show():
	    """Render the Week 5 lesson page (intro to ML and linear regression).

	    The static parts of the lesson (introduction, feature-engineering demo,
	    practice exercises, weekly assignment) are always rendered. The sections
	    that need the reviews dataset (Modules 1 and 3) are rendered only when the
	    data loaded successfully, and a failure inside one of them is reported on
	    the page without hiding the sections that follow.
	    """
	    _render_intro()

	    # None when the data could not be loaded; the error was already displayed.
	    df_reviews = _load_review_features()

	    if df_reviews is not None:
	        _run_section(_render_data_exploration, df_reviews)

	    # Module 2 works on user-entered text only, so it needs no dataset.
	    _run_section(_render_feature_engineering)

	    if df_reviews is not None:
	        _run_section(_render_regression, df_reviews)

	    _render_exercises()
	    _render_assignment()