Spaces:
Sleeping
Sleeping
raymondEDS
committed on
Commit
·
ae38d1c
1
Parent(s):
faeb953
Updating lesson 5
Browse files
- Data/Submissions.csv +0 -0
- Data/decision.csv +0 -0
- Data/reviews.csv +0 -0
- Data/submission_keyword.csv +0 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/components/__pycache__/__init__.cpython-311.pyc +0 -0
- app/components/__pycache__/login.cpython-311.pyc +0 -0
- app/main.py +4 -1
- app/pages/__pycache__/week_1.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_2.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_3.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_4.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_5.cpython-311.pyc +0 -0
- app/pages/week_5.py +269 -200
Data/Submissions.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/decision.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/reviews.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/submission_keyword.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/__pycache__/__init__.cpython-311.pyc
CHANGED
|
Binary files a/app/__pycache__/__init__.cpython-311.pyc and b/app/__pycache__/__init__.cpython-311.pyc differ
|
|
|
app/__pycache__/main.cpython-311.pyc
CHANGED
|
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ
|
|
|
app/components/__pycache__/__init__.cpython-311.pyc
CHANGED
|
Binary files a/app/components/__pycache__/__init__.cpython-311.pyc and b/app/components/__pycache__/__init__.cpython-311.pyc differ
|
|
|
app/components/__pycache__/login.cpython-311.pyc
CHANGED
|
Binary files a/app/components/__pycache__/login.cpython-311.pyc and b/app/components/__pycache__/login.cpython-311.pyc differ
|
|
|
app/main.py
CHANGED
|
@@ -22,6 +22,7 @@ from app.pages import week_1
|
|
| 22 |
from app.pages import week_2
|
| 23 |
from app.pages import week_3
|
| 24 |
from app.pages import week_4
|
|
|
|
| 25 |
# Page configuration
|
| 26 |
st.set_page_config(
|
| 27 |
page_title="Data Science Course App",
|
|
@@ -146,6 +147,8 @@ def show_week_content():
|
|
| 146 |
week_3.show()
|
| 147 |
elif st.session_state.current_week == 4:
|
| 148 |
week_4.show()
|
|
|
|
|
|
|
| 149 |
else:
|
| 150 |
st.warning("Content for this week is not yet available.")
|
| 151 |
|
|
@@ -158,7 +161,7 @@ def main():
|
|
| 158 |
return
|
| 159 |
|
| 160 |
# User is logged in, show course content
|
| 161 |
-
if st.session_state.current_week in [1, 2, 3, 4]:
|
| 162 |
show_week_content()
|
| 163 |
else:
|
| 164 |
st.title("Data Science Research Paper Course")
|
|
|
|
| 22 |
from app.pages import week_2
|
| 23 |
from app.pages import week_3
|
| 24 |
from app.pages import week_4
|
| 25 |
+
from app.pages import week_5
|
| 26 |
# Page configuration
|
| 27 |
st.set_page_config(
|
| 28 |
page_title="Data Science Course App",
|
|
|
|
| 147 |
week_3.show()
|
| 148 |
elif st.session_state.current_week == 4:
|
| 149 |
week_4.show()
|
| 150 |
+
elif st.session_state.current_week == 5:
|
| 151 |
+
week_5.show()
|
| 152 |
else:
|
| 153 |
st.warning("Content for this week is not yet available.")
|
| 154 |
|
|
|
|
| 161 |
return
|
| 162 |
|
| 163 |
# User is logged in, show course content
|
| 164 |
+
if st.session_state.current_week in [1, 2, 3, 4, 5]:
|
| 165 |
show_week_content()
|
| 166 |
else:
|
| 167 |
st.title("Data Science Research Paper Course")
|
app/pages/__pycache__/week_1.cpython-311.pyc
CHANGED
|
Binary files a/app/pages/__pycache__/week_1.cpython-311.pyc and b/app/pages/__pycache__/week_1.cpython-311.pyc differ
|
|
|
app/pages/__pycache__/week_2.cpython-311.pyc
CHANGED
|
Binary files a/app/pages/__pycache__/week_2.cpython-311.pyc and b/app/pages/__pycache__/week_2.cpython-311.pyc differ
|
|
|
app/pages/__pycache__/week_3.cpython-311.pyc
CHANGED
|
Binary files a/app/pages/__pycache__/week_3.cpython-311.pyc and b/app/pages/__pycache__/week_3.cpython-311.pyc differ
|
|
|
app/pages/__pycache__/week_4.cpython-311.pyc
CHANGED
|
Binary files a/app/pages/__pycache__/week_4.cpython-311.pyc and b/app/pages/__pycache__/week_4.cpython-311.pyc differ
|
|
|
app/pages/__pycache__/week_5.cpython-311.pyc
ADDED
|
Binary file (18.4 kB). View file
|
|
|
app/pages/week_5.py
CHANGED
|
@@ -7,6 +7,70 @@ from sklearn.linear_model import LinearRegression
|
|
| 7 |
from sklearn.metrics import r2_score
|
| 8 |
import scipy.stats as stats
|
| 9 |
from nltk.tokenize import word_tokenize
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def show():
|
| 12 |
st.title("Week 5: Introduction to Machine Learning and Linear Regression")
|
|
@@ -28,7 +92,7 @@ def show():
|
|
| 28 |
""")
|
| 29 |
|
| 30 |
# Learning Path
|
| 31 |
-
st.subheader("Key Concepts You'll
|
| 32 |
st.write("""
|
| 33 |
1. **Linear Regression (线性回归):**
|
| 34 |
- Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
|
|
@@ -46,226 +110,231 @@ def show():
|
|
| 46 |
- Confidence intervals: Range where true coefficient likely lies
|
| 47 |
""")
|
| 48 |
|
| 49 |
-
#
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
Let's start by importing the necessary libraries for our analysis:
|
| 53 |
-
""")
|
| 54 |
-
|
| 55 |
-
st.code("""
|
| 56 |
-
import numpy as np
|
| 57 |
-
import pandas as pd
|
| 58 |
-
import scipy.stats as stats
|
| 59 |
-
import matplotlib.pyplot as plt
|
| 60 |
-
import sklearn
|
| 61 |
-
from nltk.tokenize import word_tokenize
|
| 62 |
-
import seaborn as sns
|
| 63 |
-
|
| 64 |
-
# Set up visualization style
|
| 65 |
-
sns.set_style("whitegrid")
|
| 66 |
-
sns.set_context("poster")
|
| 67 |
-
""")
|
| 68 |
-
|
| 69 |
-
# Module 2: Loading and Understanding Data
|
| 70 |
-
st.header("Module 2: Loading and Understanding Data")
|
| 71 |
-
st.write("""
|
| 72 |
-
Before diving into analysis, we need to understand our data structure. What information do we have about each review? Each submission?
|
| 73 |
-
""")
|
| 74 |
-
|
| 75 |
-
if st.button("Load Sample Data"):
|
| 76 |
-
# Create sample data for demonstration
|
| 77 |
-
sample_reviews = pd.DataFrame({
|
| 78 |
-
'rating_int': [6, 6, 5, 6, 8],
|
| 79 |
-
'confidence_int': [4.0, 4.0, 4.0, 3.0, 3.0],
|
| 80 |
-
'review': [
|
| 81 |
-
'There is a lot of recent work on link-prediction...',
|
| 82 |
-
'Pros: The different attention techniques...',
|
| 83 |
-
'Overview of the paper: This paper studies...',
|
| 84 |
-
'Summary: The authors propose a near minimax...',
|
| 85 |
-
'This paper introduces a GPU-friendly variant...'
|
| 86 |
-
],
|
| 87 |
-
'forum': ['tGZu6DlbreV', 'uKhGRvM8QNH', 'IrM64DGB21', 'ww-7bdU6GA9', 'r1VGvBcxl']
|
| 88 |
-
})
|
| 89 |
|
| 90 |
-
|
| 91 |
-
st.
|
| 92 |
-
|
| 93 |
-
# Module 3: Feature Engineering
|
| 94 |
-
st.header("Module 3: Feature Engineering")
|
| 95 |
-
st.write("""
|
| 96 |
-
We'll create features from our text data that can help predict paper acceptance:
|
| 97 |
-
- Review length (word count)
|
| 98 |
-
- Review rating
|
| 99 |
-
- Reviewer confidence
|
| 100 |
-
- Number of keywords in the paper
|
| 101 |
-
""")
|
| 102 |
-
|
| 103 |
-
# Interactive Feature Engineering
|
| 104 |
-
st.subheader("Try Feature Engineering")
|
| 105 |
-
st.write("""
|
| 106 |
-
Let's create some features from a review:
|
| 107 |
-
""")
|
| 108 |
-
|
| 109 |
-
review_text = st.text_area(
|
| 110 |
-
"Enter a review to analyze:",
|
| 111 |
-
"This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
|
| 112 |
-
key="review_text"
|
| 113 |
-
)
|
| 114 |
-
|
| 115 |
-
if st.button("Extract Features"):
|
| 116 |
-
# Calculate features
|
| 117 |
-
word_count = len(word_tokenize(review_text))
|
| 118 |
-
sentence_count = len(review_text.split('.'))
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
# Module 4: Linear Regression Analysis
|
| 125 |
-
st.header("Module 4: Linear Regression Analysis")
|
| 126 |
-
st.write("""
|
| 127 |
-
Let's build a simple linear regression model to predict paper ratings based on review features.
|
| 128 |
-
""")
|
| 129 |
-
|
| 130 |
-
# Interactive Regression
|
| 131 |
-
st.subheader("Try Linear Regression")
|
| 132 |
-
st.write("""
|
| 133 |
-
Let's create a simple regression model:
|
| 134 |
-
""")
|
| 135 |
-
|
| 136 |
-
if st.button("Run Sample Regression"):
|
| 137 |
-
# Create sample data
|
| 138 |
-
np.random.seed(42)
|
| 139 |
-
X = np.random.rand(100, 1) * 10 # Review length
|
| 140 |
-
y = 2 * X + np.random.randn(100, 1) * 2 # Rating with some noise
|
| 141 |
|
| 142 |
-
#
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
-
# Create
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
plt.ylabel('Rating')
|
| 152 |
-
plt.title('Linear Regression: Review Length vs Rating')
|
| 153 |
-
st.pyplot(plt)
|
| 154 |
|
| 155 |
-
#
|
| 156 |
-
st.
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
with st.expander("Exercise 1: Feature Engineering"):
|
| 164 |
st.write("""
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
|
|
|
| 169 |
""")
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
df_reviews = pd.read_csv('reviews.csv')
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
-
#
|
| 187 |
-
|
| 188 |
-
'confidence_int']].corr()
|
| 189 |
-
|
| 190 |
-
# Visualize
|
| 191 |
-
sns.heatmap(correlation, annot=True)
|
| 192 |
-
plt.show()
|
| 193 |
-
""")
|
| 194 |
-
|
| 195 |
-
with st.expander("Exercise 2: Building a Predictive Model"):
|
| 196 |
st.write("""
|
| 197 |
-
|
| 198 |
-
2. Split data into training and test sets
|
| 199 |
-
3. Train a linear regression model
|
| 200 |
-
4. Evaluate model performance
|
| 201 |
""")
|
| 202 |
|
| 203 |
-
|
| 204 |
-
# Solution
|
| 205 |
-
from sklearn.model_selection import train_test_split
|
| 206 |
-
from sklearn.linear_model import LinearRegression
|
| 207 |
-
|
| 208 |
-
# Prepare features
|
| 209 |
X = df_reviews[['word_count', 'confidence_int']]
|
| 210 |
y = df_reviews['rating_int']
|
| 211 |
|
| 212 |
-
#
|
| 213 |
-
X_train, X_test, y_train, y_test = train_test_split(
|
| 214 |
-
X, y, test_size=0.2, random_state=42)
|
| 215 |
-
|
| 216 |
-
# Train model
|
| 217 |
model = LinearRegression()
|
| 218 |
-
model.fit(
|
| 219 |
|
| 220 |
-
#
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
|
| 270 |
-
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from sklearn.metrics import r2_score
|
| 8 |
import scipy.stats as stats
|
| 9 |
from nltk.tokenize import word_tokenize
|
| 10 |
+
import plotly.express as px
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import os
|
| 14 |
+
|
| 15 |
+
# Set up the style for all plots
|
| 16 |
+
plt.style.use('default')
|
| 17 |
+
sns.set_theme(style="whitegrid", palette="husl")
|
| 18 |
+
|
| 19 |
+
def load_data():
    """Read the four course CSV files from the repository's ``Data`` folder.

    Returns a 4-tuple ``(reviews, submissions, decisions, keywords)`` of
    DataFrames, in that order. If any file is missing, the problem is
    reported through ``st.error`` and a tuple of four ``None`` values is
    returned instead, so callers must check for ``None`` before use.
    """
    # The Data folder lives two directories above the folder that contains
    # this file (pages/ -> app/ -> repository root).
    data_dir = Path(__file__).parent.parent.parent / "Data"
    csv_names = (
        "reviews.csv",
        "Submissions.csv",
        "decision.csv",
        "submission_keyword.csv",
    )

    try:
        # Files are read in the order listed above; the first missing file
        # aborts the whole load.
        frames = tuple(pd.read_csv(data_dir / name) for name in csv_names)
    except FileNotFoundError as missing:
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {str(missing)}")
        return None, None, None, None
    return frames
|
| 39 |
+
|
| 40 |
+
def create_feature_plot(df, x_col, y_col, title):
    """Build a Plotly Express scatter figure of ``y_col`` against ``x_col``.

    Axis labels are derived from the column names by replacing underscores
    with spaces and title-casing the result. The figure is returned to the
    caller; nothing is rendered here.
    """
    def _pretty(column):
        # 'word_count' -> 'Word Count'
        return column.replace('_', ' ').title()

    axis_labels = {x_col: _pretty(x_col), y_col: _pretty(y_col)}
    scatter = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels=axis_labels,
        template="plotly_white",
    )

    # Centre the title and force a plain white canvas.
    layout_options = {
        "title_x": 0.5,
        "title_font_size": 20,
        "showlegend": True,
        "plot_bgcolor": 'white',
        "paper_bgcolor": 'white',
    }
    scatter.update_layout(**layout_options)
    return scatter
|
| 55 |
+
|
| 56 |
+
def create_correlation_heatmap(df, columns):
    """Build a Plotly heatmap of the pairwise correlations of ``columns``.

    The correlation matrix is computed with ``df[columns].corr()`` and drawn
    on a fixed -1..1 colour range using the 'RdBu' colourscale. The figure
    is returned to the caller; nothing is rendered here.
    """
    corr_matrix = df[columns].corr()
    axis_names = corr_matrix.columns

    heatmap_trace = go.Heatmap(
        z=corr_matrix,
        x=axis_names,
        y=axis_names,
        colorscale='RdBu',
        # Pin the colour range to the full span a correlation can take.
        zmin=-1,
        zmax=1,
    )
    figure = go.Figure(data=heatmap_trace)

    layout_options = {
        "title": 'Feature Correlation Heatmap',
        "title_x": 0.5,
        "title_font_size": 20,
        "plot_bgcolor": 'white',
        "paper_bgcolor": 'white',
    }
    figure.update_layout(**layout_options)
    return figure
|
| 74 |
|
| 75 |
def show():
|
| 76 |
st.title("Week 5: Introduction to Machine Learning and Linear Regression")
|
|
|
|
| 92 |
""")
|
| 93 |
|
| 94 |
# Learning Path
|
| 95 |
+
st.subheader("Key Concepts You'll Learn")
|
| 96 |
st.write("""
|
| 97 |
1. **Linear Regression (线性回归):**
|
| 98 |
- Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
|
|
|
|
| 110 |
- Confidence intervals: Range where true coefficient likely lies
|
| 111 |
""")
|
| 112 |
|
| 113 |
+
# Load the data
|
| 114 |
+
try:
|
| 115 |
+
df_reviews, df_submissions, df_dec, df_keyword = load_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
# Module 1: Data Exploration
|
| 118 |
+
st.header("Module 1: Data Exploration")
|
| 119 |
+
st.write("Let's explore our dataset to understand the review patterns:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
+
# Create features from review text
|
| 122 |
+
df_reviews['word_count'] = df_reviews['review'].apply(lambda x: len(str(x).split()))
|
| 123 |
+
df_reviews['sentence_count'] = df_reviews['review'].apply(lambda x: len(str(x).split('.')))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
+
# Show basic statistics
|
| 126 |
+
col1, col2 = st.columns(2)
|
| 127 |
+
with col1:
|
| 128 |
+
st.metric("Total Reviews", len(df_reviews))
|
| 129 |
+
st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
|
| 130 |
+
with col2:
|
| 131 |
+
st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
|
| 132 |
+
st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
|
| 133 |
|
| 134 |
+
# Create interactive visualizations
|
| 135 |
+
st.subheader("Review Length vs Rating")
|
| 136 |
+
fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
|
| 137 |
+
'Relationship between Review Length and Rating')
|
| 138 |
+
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
+
# Correlation analysis
|
| 141 |
+
st.subheader("Feature Correlations")
|
| 142 |
+
corr_fig = create_correlation_heatmap(df_reviews,
|
| 143 |
+
['word_count', 'rating_int', 'confidence_int'])
|
| 144 |
+
st.plotly_chart(corr_fig, use_container_width=True)
|
| 145 |
+
|
| 146 |
+
# Module 2: Feature Engineering
|
| 147 |
+
st.header("Module 2: Feature Engineering")
|
|
|
|
| 148 |
st.write("""
|
| 149 |
+
Let's create more sophisticated features from our review data:
|
| 150 |
+
- Review length (word count)
|
| 151 |
+
- Review rating
|
| 152 |
+
- Reviewer confidence
|
| 153 |
+
- Number of keywords in the paper
|
| 154 |
""")
|
| 155 |
|
| 156 |
+
# Interactive Feature Engineering
|
| 157 |
+
st.subheader("Try Feature Engineering")
|
| 158 |
+
review_text = st.text_area(
|
| 159 |
+
"Enter a review to analyze:",
|
| 160 |
+
"This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
|
| 161 |
+
key="review_text"
|
| 162 |
+
)
|
|
|
|
| 163 |
|
| 164 |
+
if st.button("Extract Features"):
|
| 165 |
+
# Calculate features
|
| 166 |
+
word_count = len(word_tokenize(review_text))
|
| 167 |
+
sentence_count = len(review_text.split('.'))
|
| 168 |
+
|
| 169 |
+
# Create a nice display of features
|
| 170 |
+
col1, col2, col3 = st.columns(3)
|
| 171 |
+
with col1:
|
| 172 |
+
st.metric("Word Count", word_count)
|
| 173 |
+
with col2:
|
| 174 |
+
st.metric("Sentence Count", sentence_count)
|
| 175 |
+
with col3:
|
| 176 |
+
st.metric("Average Words per Sentence", f"{word_count/sentence_count:.1f}")
|
| 177 |
|
| 178 |
+
# Module 3: Linear Regression Analysis
|
| 179 |
+
st.header("Module 3: Linear Regression Analysis")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
st.write("""
|
| 181 |
+
Let's build a linear regression model to predict paper ratings based on review features.
|
|
|
|
|
|
|
|
|
|
| 182 |
""")
|
| 183 |
|
| 184 |
+
# Prepare data for modeling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
X = df_reviews[['word_count', 'confidence_int']]
|
| 186 |
y = df_reviews['rating_int']
|
| 187 |
|
| 188 |
+
# Fit regression model
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
model = LinearRegression()
|
| 190 |
+
model.fit(X, y)
|
| 191 |
|
| 192 |
+
# Create 3D visualization of the regression
|
| 193 |
+
st.subheader("3D Visualization of Review Features")
|
| 194 |
+
fig = px.scatter_3d(df_reviews.sample(1000),
|
| 195 |
+
x='word_count',
|
| 196 |
+
y='confidence_int',
|
| 197 |
+
z='rating_int',
|
| 198 |
+
title='Review Features in 3D Space',
|
| 199 |
+
labels={
|
| 200 |
+
'word_count': 'Word Count',
|
| 201 |
+
'confidence_int': 'Confidence',
|
| 202 |
+
'rating_int': 'Rating'
|
| 203 |
+
})
|
| 204 |
+
fig.update_layout(
|
| 205 |
+
title_x=0.5,
|
| 206 |
+
title_font_size=20,
|
| 207 |
+
scene = dict(
|
| 208 |
+
xaxis_title='Word Count',
|
| 209 |
+
yaxis_title='Confidence',
|
| 210 |
+
zaxis_title='Rating'
|
| 211 |
+
)
|
| 212 |
+
)
|
| 213 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 214 |
|
| 215 |
+
# Show model metrics
|
| 216 |
+
st.subheader("Model Performance")
|
| 217 |
+
col1, col2, col3 = st.columns(3)
|
| 218 |
+
with col1:
|
| 219 |
+
st.metric("R-squared", f"{model.score(X, y):.3f}")
|
| 220 |
+
with col2:
|
| 221 |
+
st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
|
| 222 |
+
with col3:
|
| 223 |
+
st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")
|
| 224 |
+
|
| 225 |
+
# Practice Exercises
|
| 226 |
+
st.header("Practice Exercises")
|
| 227 |
+
|
| 228 |
+
with st.expander("Exercise 1: Feature Engineering"):
|
| 229 |
+
st.write("""
|
| 230 |
+
1. Load the reviews dataset
|
| 231 |
+
2. Create features from review text
|
| 232 |
+
3. Calculate correlation between features
|
| 233 |
+
4. Visualize relationships
|
| 234 |
+
""")
|
| 235 |
+
|
| 236 |
+
st.code("""
|
| 237 |
+
# Solution
|
| 238 |
+
import pandas as pd
|
| 239 |
+
import numpy as np
|
| 240 |
+
from nltk.tokenize import word_tokenize
|
| 241 |
+
|
| 242 |
+
# Load data
|
| 243 |
+
df_reviews = pd.read_csv('reviews.csv')
|
| 244 |
+
|
| 245 |
+
# Create features
|
| 246 |
+
df_reviews['word_count'] = df_reviews['review'].apply(
|
| 247 |
+
lambda x: len(word_tokenize(x)))
|
| 248 |
+
df_reviews['sentence_count'] = df_reviews['review'].apply(
|
| 249 |
+
lambda x: len(x.split('.')))
|
| 250 |
+
|
| 251 |
+
# Calculate correlation
|
| 252 |
+
correlation = df_reviews[['word_count', 'rating_int',
|
| 253 |
+
'confidence_int']].corr()
|
| 254 |
+
|
| 255 |
+
# Visualize
|
| 256 |
+
sns.heatmap(correlation, annot=True)
|
| 257 |
+
plt.show()
|
| 258 |
+
""")
|
| 259 |
+
|
| 260 |
+
with st.expander("Exercise 2: Building a Predictive Model"):
|
| 261 |
+
st.write("""
|
| 262 |
+
1. Prepare features for modeling
|
| 263 |
+
2. Split data into training and test sets
|
| 264 |
+
3. Train a linear regression model
|
| 265 |
+
4. Evaluate model performance
|
| 266 |
+
""")
|
| 267 |
+
|
| 268 |
+
st.code("""
|
| 269 |
+
# Solution
|
| 270 |
+
from sklearn.model_selection import train_test_split
|
| 271 |
+
from sklearn.linear_model import LinearRegression
|
| 272 |
+
|
| 273 |
+
# Prepare features
|
| 274 |
+
X = df_reviews[['word_count', 'confidence_int']]
|
| 275 |
+
y = df_reviews['rating_int']
|
| 276 |
+
|
| 277 |
+
# Split data
|
| 278 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
| 279 |
+
X, y, test_size=0.2, random_state=42)
|
| 280 |
+
|
| 281 |
+
# Train model
|
| 282 |
+
model = LinearRegression()
|
| 283 |
+
model.fit(X_train, y_train)
|
| 284 |
+
|
| 285 |
+
# Evaluate
|
| 286 |
+
train_score = model.score(X_train, y_train)
|
| 287 |
+
test_score = model.score(X_test, y_test)
|
| 288 |
+
|
| 289 |
+
print(f"Training R²: {train_score:.3f}")
|
| 290 |
+
print(f"Testing R²: {test_score:.3f}")
|
| 291 |
+
""")
|
| 292 |
|
| 293 |
+
# Weekly Assignment
|
| 294 |
+
username = st.session_state.get("username", "Student")
|
| 295 |
+
st.header(f"{username}'s Weekly Assignment")
|
| 296 |
+
|
| 297 |
+
if username == "manxiii":
|
| 298 |
+
st.markdown("""
|
| 299 |
+
Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
|
| 300 |
+
1. Complete the feature engineering pipeline for the ICLR dataset
|
| 301 |
+
2. Build a linear regression model to predict paper ratings
|
| 302 |
+
3. Analyze the relationship between review features and acceptance
|
| 303 |
+
4. Submit your findings in a Jupyter notebook
|
| 304 |
|
| 305 |
+
**Due Date:** End of Week 5
|
| 306 |
+
""")
|
| 307 |
+
elif username == "zhu":
|
| 308 |
+
st.markdown("""
|
| 309 |
+
Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
|
| 310 |
+
1. Implement the complete machine learning workflow
|
| 311 |
+
2. Create insightful visualizations of model results
|
| 312 |
+
3. Draw conclusions from your analysis
|
| 313 |
+
4. Submit your work in a Jupyter notebook
|
| 314 |
|
| 315 |
+
**Due Date:** End of Week 5
|
| 316 |
+
""")
|
| 317 |
+
elif username == "WK":
|
| 318 |
+
st.markdown("""
|
| 319 |
+
Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
|
| 320 |
+
1. Complete the feature engineering pipeline
|
| 321 |
+
2. Build and evaluate a linear regression model
|
| 322 |
+
3. Analyze patterns in the data
|
| 323 |
+
4. Submit your findings
|
| 324 |
|
| 325 |
+
**Due Date:** End of Week 5
|
| 326 |
+
""")
|
| 327 |
+
else:
|
| 328 |
+
st.markdown(f"""
|
| 329 |
+
Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
|
| 330 |
+
1. Complete the feature engineering pipeline
|
| 331 |
+
2. Build and evaluate a linear regression model
|
| 332 |
+
3. Analyze patterns in the data
|
| 333 |
+
4. Submit your findings
|
| 334 |
|
| 335 |
+
**Due Date:** End of Week 5
|
| 336 |
+
""")
|
| 337 |
+
|
| 338 |
+
except Exception as e:
|
| 339 |
+
st.error(f"Error loading data: {str(e)}")
|
| 340 |
+
st.write("Please make sure the data files are in the correct location.")
|