Spaces:
Running
Running
raymondEDS
committed on
Commit
·
ae38d1c
1
Parent(s):
faeb953
Updating lesson 5
Browse files
- Data/Submissions.csv +0 -0
- Data/decision.csv +0 -0
- Data/reviews.csv +0 -0
- Data/submission_keyword.csv +0 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/components/__pycache__/__init__.cpython-311.pyc +0 -0
- app/components/__pycache__/login.cpython-311.pyc +0 -0
- app/main.py +4 -1
- app/pages/__pycache__/week_1.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_2.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_3.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_4.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_5.cpython-311.pyc +0 -0
- app/pages/week_5.py +269 -200
Data/Submissions.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Data/decision.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Data/reviews.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Data/submission_keyword.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app/__pycache__/__init__.cpython-311.pyc
CHANGED
Binary files a/app/__pycache__/__init__.cpython-311.pyc and b/app/__pycache__/__init__.cpython-311.pyc differ
|
|
app/__pycache__/main.cpython-311.pyc
CHANGED
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ
|
|
app/components/__pycache__/__init__.cpython-311.pyc
CHANGED
Binary files a/app/components/__pycache__/__init__.cpython-311.pyc and b/app/components/__pycache__/__init__.cpython-311.pyc differ
|
|
app/components/__pycache__/login.cpython-311.pyc
CHANGED
Binary files a/app/components/__pycache__/login.cpython-311.pyc and b/app/components/__pycache__/login.cpython-311.pyc differ
|
|
app/main.py
CHANGED
@@ -22,6 +22,7 @@ from app.pages import week_1
|
|
22 |
from app.pages import week_2
|
23 |
from app.pages import week_3
|
24 |
from app.pages import week_4
|
|
|
25 |
# Page configuration
|
26 |
st.set_page_config(
|
27 |
page_title="Data Science Course App",
|
@@ -146,6 +147,8 @@ def show_week_content():
|
|
146 |
week_3.show()
|
147 |
elif st.session_state.current_week == 4:
|
148 |
week_4.show()
|
|
|
|
|
149 |
else:
|
150 |
st.warning("Content for this week is not yet available.")
|
151 |
|
@@ -158,7 +161,7 @@ def main():
|
|
158 |
return
|
159 |
|
160 |
# User is logged in, show course content
|
161 |
-
if st.session_state.current_week in [1, 2, 3, 4]:
|
162 |
show_week_content()
|
163 |
else:
|
164 |
st.title("Data Science Research Paper Course")
|
|
|
22 |
from app.pages import week_2
|
23 |
from app.pages import week_3
|
24 |
from app.pages import week_4
|
25 |
+
from app.pages import week_5
|
26 |
# Page configuration
|
27 |
st.set_page_config(
|
28 |
page_title="Data Science Course App",
|
|
|
147 |
week_3.show()
|
148 |
elif st.session_state.current_week == 4:
|
149 |
week_4.show()
|
150 |
+
elif st.session_state.current_week == 5:
|
151 |
+
week_5.show()
|
152 |
else:
|
153 |
st.warning("Content for this week is not yet available.")
|
154 |
|
|
|
161 |
return
|
162 |
|
163 |
# User is logged in, show course content
|
164 |
+
if st.session_state.current_week in [1, 2, 3, 4, 5]:
|
165 |
show_week_content()
|
166 |
else:
|
167 |
st.title("Data Science Research Paper Course")
|
app/pages/__pycache__/week_1.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_1.cpython-311.pyc and b/app/pages/__pycache__/week_1.cpython-311.pyc differ
|
|
app/pages/__pycache__/week_2.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_2.cpython-311.pyc and b/app/pages/__pycache__/week_2.cpython-311.pyc differ
|
|
app/pages/__pycache__/week_3.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_3.cpython-311.pyc and b/app/pages/__pycache__/week_3.cpython-311.pyc differ
|
|
app/pages/__pycache__/week_4.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_4.cpython-311.pyc and b/app/pages/__pycache__/week_4.cpython-311.pyc differ
|
|
app/pages/__pycache__/week_5.cpython-311.pyc
ADDED
Binary file (18.4 kB). View file
|
|
app/pages/week_5.py
CHANGED
@@ -7,6 +7,70 @@ from sklearn.linear_model import LinearRegression
|
|
7 |
from sklearn.metrics import r2_score
|
8 |
import scipy.stats as stats
|
9 |
from nltk.tokenize import word_tokenize
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def show():
|
12 |
st.title("Week 5: Introduction to Machine Learning and Linear Regression")
|
@@ -28,7 +92,7 @@ def show():
|
|
28 |
""")
|
29 |
|
30 |
# Learning Path
|
31 |
-
st.subheader("Key Concepts You'll
|
32 |
st.write("""
|
33 |
1. **Linear Regression (线性回归):**
|
34 |
- Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
|
@@ -46,226 +110,231 @@ def show():
|
|
46 |
- Confidence intervals: Range where true coefficient likely lies
|
47 |
""")
|
48 |
|
49 |
-
#
|
50 |
-
|
51 |
-
|
52 |
-
Let's start by importing the necessary libraries for our analysis:
|
53 |
-
""")
|
54 |
-
|
55 |
-
st.code("""
|
56 |
-
import numpy as np
|
57 |
-
import pandas as pd
|
58 |
-
import scipy.stats as stats
|
59 |
-
import matplotlib.pyplot as plt
|
60 |
-
import sklearn
|
61 |
-
from nltk.tokenize import word_tokenize
|
62 |
-
import seaborn as sns
|
63 |
-
|
64 |
-
# Set up visualization style
|
65 |
-
sns.set_style("whitegrid")
|
66 |
-
sns.set_context("poster")
|
67 |
-
""")
|
68 |
-
|
69 |
-
# Module 2: Loading and Understanding Data
|
70 |
-
st.header("Module 2: Loading and Understanding Data")
|
71 |
-
st.write("""
|
72 |
-
Before diving into analysis, we need to understand our data structure. What information do we have about each review? Each submission?
|
73 |
-
""")
|
74 |
-
|
75 |
-
if st.button("Load Sample Data"):
|
76 |
-
# Create sample data for demonstration
|
77 |
-
sample_reviews = pd.DataFrame({
|
78 |
-
'rating_int': [6, 6, 5, 6, 8],
|
79 |
-
'confidence_int': [4.0, 4.0, 4.0, 3.0, 3.0],
|
80 |
-
'review': [
|
81 |
-
'There is a lot of recent work on link-prediction...',
|
82 |
-
'Pros: The different attention techniques...',
|
83 |
-
'Overview of the paper: This paper studies...',
|
84 |
-
'Summary: The authors propose a near minimax...',
|
85 |
-
'This paper introduces a GPU-friendly variant...'
|
86 |
-
],
|
87 |
-
'forum': ['tGZu6DlbreV', 'uKhGRvM8QNH', 'IrM64DGB21', 'ww-7bdU6GA9', 'r1VGvBcxl']
|
88 |
-
})
|
89 |
|
90 |
-
|
91 |
-
st.
|
92 |
-
|
93 |
-
# Module 3: Feature Engineering
|
94 |
-
st.header("Module 3: Feature Engineering")
|
95 |
-
st.write("""
|
96 |
-
We'll create features from our text data that can help predict paper acceptance:
|
97 |
-
- Review length (word count)
|
98 |
-
- Review rating
|
99 |
-
- Reviewer confidence
|
100 |
-
- Number of keywords in the paper
|
101 |
-
""")
|
102 |
-
|
103 |
-
# Interactive Feature Engineering
|
104 |
-
st.subheader("Try Feature Engineering")
|
105 |
-
st.write("""
|
106 |
-
Let's create some features from a review:
|
107 |
-
""")
|
108 |
-
|
109 |
-
review_text = st.text_area(
|
110 |
-
"Enter a review to analyze:",
|
111 |
-
"This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
|
112 |
-
key="review_text"
|
113 |
-
)
|
114 |
-
|
115 |
-
if st.button("Extract Features"):
|
116 |
-
# Calculate features
|
117 |
-
word_count = len(word_tokenize(review_text))
|
118 |
-
sentence_count = len(review_text.split('.'))
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
# Module 4: Linear Regression Analysis
|
125 |
-
st.header("Module 4: Linear Regression Analysis")
|
126 |
-
st.write("""
|
127 |
-
Let's build a simple linear regression model to predict paper ratings based on review features.
|
128 |
-
""")
|
129 |
-
|
130 |
-
# Interactive Regression
|
131 |
-
st.subheader("Try Linear Regression")
|
132 |
-
st.write("""
|
133 |
-
Let's create a simple regression model:
|
134 |
-
""")
|
135 |
-
|
136 |
-
if st.button("Run Sample Regression"):
|
137 |
-
# Create sample data
|
138 |
-
np.random.seed(42)
|
139 |
-
X = np.random.rand(100, 1) * 10 # Review length
|
140 |
-
y = 2 * X + np.random.randn(100, 1) * 2 # Rating with some noise
|
141 |
|
142 |
-
#
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
-
# Create
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
plt.ylabel('Rating')
|
152 |
-
plt.title('Linear Regression: Review Length vs Rating')
|
153 |
-
st.pyplot(plt)
|
154 |
|
155 |
-
#
|
156 |
-
st.
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
with st.expander("Exercise 1: Feature Engineering"):
|
164 |
st.write("""
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
|
|
169 |
""")
|
170 |
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
df_reviews = pd.read_csv('reviews.csv')
|
179 |
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
-
#
|
187 |
-
|
188 |
-
'confidence_int']].corr()
|
189 |
-
|
190 |
-
# Visualize
|
191 |
-
sns.heatmap(correlation, annot=True)
|
192 |
-
plt.show()
|
193 |
-
""")
|
194 |
-
|
195 |
-
with st.expander("Exercise 2: Building a Predictive Model"):
|
196 |
st.write("""
|
197 |
-
|
198 |
-
2. Split data into training and test sets
|
199 |
-
3. Train a linear regression model
|
200 |
-
4. Evaluate model performance
|
201 |
""")
|
202 |
|
203 |
-
|
204 |
-
# Solution
|
205 |
-
from sklearn.model_selection import train_test_split
|
206 |
-
from sklearn.linear_model import LinearRegression
|
207 |
-
|
208 |
-
# Prepare features
|
209 |
X = df_reviews[['word_count', 'confidence_int']]
|
210 |
y = df_reviews['rating_int']
|
211 |
|
212 |
-
#
|
213 |
-
X_train, X_test, y_train, y_test = train_test_split(
|
214 |
-
X, y, test_size=0.2, random_state=42)
|
215 |
-
|
216 |
-
# Train model
|
217 |
model = LinearRegression()
|
218 |
-
model.fit(
|
219 |
|
220 |
-
#
|
221 |
-
|
222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
|
224 |
-
|
225 |
-
|
226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
|
270 |
-
|
271 |
-
|
|
|
|
|
|
|
|
|
|
7 |
from sklearn.metrics import r2_score
|
8 |
import scipy.stats as stats
|
9 |
from nltk.tokenize import word_tokenize
|
10 |
+
import plotly.express as px
|
11 |
+
import plotly.graph_objects as go
|
12 |
+
from pathlib import Path
|
13 |
+
import os
|
14 |
+
|
15 |
+
# Set up the style for all plots
|
16 |
+
plt.style.use('default')
|
17 |
+
sns.set_theme(style="whitegrid", palette="husl")
|
18 |
+
|
19 |
+
def load_data():
    """Read the four course CSV datasets from the repository's ``Data`` folder.

    The folder is resolved relative to this module: it is the ``Data``
    directory two levels above the directory that contains this file.

    Returns:
        A 4-tuple ``(df_reviews, df_submissions, df_dec, df_keyword)`` of
        DataFrames, in that order. If a file cannot be found, two Streamlit
        error messages are displayed and ``(None, None, None, None)`` is
        returned instead.
    """
    # <directory of this file>/../../Data
    data_dir = Path(__file__).parent.parent.parent / "Data"

    # Read order matches the order of the returned tuple.
    csv_names = (
        "reviews.csv",
        "Submissions.csv",
        "decision.csv",
        "submission_keyword.csv",
    )

    try:
        frames = tuple(pd.read_csv(data_dir / name) for name in csv_names)
    except FileNotFoundError as e:
        # Only a missing file is handled here; any other read error propagates.
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {str(e)}")
        return None, None, None, None

    return frames
|
39 |
+
|
40 |
+
def create_feature_plot(df, x_col, y_col, title):
    """Build an interactive Plotly scatter plot of one column against another.

    Args:
        df: Data handed to ``px.scatter``.
        x_col: Name of the column plotted on the x-axis.
        y_col: Name of the column plotted on the y-axis.
        title: Figure title.

    Returns:
        The figure created by ``px.scatter`` after its layout has been updated.
    """
    # Axis labels are the column names with underscores replaced by spaces,
    # then title-cased (e.g. "word_count" -> "Word Count").
    axis_labels = {
        name: name.replace('_', ' ').title()
        for name in (x_col, y_col)
    }

    scatter = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels=axis_labels,
        template="plotly_white",
    )

    # Centered title on a plain white canvas.
    layout_options = {
        "title_x": 0.5,
        "title_font_size": 20,
        "showlegend": True,
        "plot_bgcolor": 'white',
        "paper_bgcolor": 'white',
    }
    scatter.update_layout(**layout_options)
    return scatter
|
55 |
+
|
56 |
+
def create_correlation_heatmap(df, columns):
    """Build a Plotly heatmap of the pairwise correlations between columns.

    Args:
        df: Object supporting ``df[columns].corr()``.
        columns: Column names to select from ``df`` before computing
            the correlation matrix.

    Returns:
        A ``go.Figure`` holding a single heatmap trace titled
        'Feature Correlation Heatmap'.
    """
    corr = df[columns].corr()
    feature_names = corr.columns

    # Color range is pinned to [-1, 1] on the 'RdBu' colorscale.
    heatmap_trace = go.Heatmap(
        z=corr,
        x=feature_names,
        y=feature_names,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
    )

    heatmap_fig = go.Figure(data=heatmap_trace)
    heatmap_fig.update_layout(
        title='Feature Correlation Heatmap',
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    return heatmap_fig
|
74 |
|
75 |
def show():
|
76 |
st.title("Week 5: Introduction to Machine Learning and Linear Regression")
|
|
|
92 |
""")
|
93 |
|
94 |
# Learning Path
|
95 |
+
st.subheader("Key Concepts You'll Learn")
|
96 |
st.write("""
|
97 |
1. **Linear Regression (线性回归):**
|
98 |
- Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
|
|
|
110 |
- Confidence intervals: Range where true coefficient likely lies
|
111 |
""")
|
112 |
|
113 |
+
# Load the data
|
114 |
+
try:
|
115 |
+
df_reviews, df_submissions, df_dec, df_keyword = load_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
+
# Module 1: Data Exploration
|
118 |
+
st.header("Module 1: Data Exploration")
|
119 |
+
st.write("Let's explore our dataset to understand the review patterns:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
+
# Create features from review text
|
122 |
+
df_reviews['word_count'] = df_reviews['review'].apply(lambda x: len(str(x).split()))
|
123 |
+
df_reviews['sentence_count'] = df_reviews['review'].apply(lambda x: len(str(x).split('.')))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
+
# Show basic statistics
|
126 |
+
col1, col2 = st.columns(2)
|
127 |
+
with col1:
|
128 |
+
st.metric("Total Reviews", len(df_reviews))
|
129 |
+
st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
|
130 |
+
with col2:
|
131 |
+
st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
|
132 |
+
st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
|
133 |
|
134 |
+
# Create interactive visualizations
|
135 |
+
st.subheader("Review Length vs Rating")
|
136 |
+
fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
|
137 |
+
'Relationship between Review Length and Rating')
|
138 |
+
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
139 |
|
140 |
+
# Correlation analysis
|
141 |
+
st.subheader("Feature Correlations")
|
142 |
+
corr_fig = create_correlation_heatmap(df_reviews,
|
143 |
+
['word_count', 'rating_int', 'confidence_int'])
|
144 |
+
st.plotly_chart(corr_fig, use_container_width=True)
|
145 |
+
|
146 |
+
# Module 2: Feature Engineering
|
147 |
+
st.header("Module 2: Feature Engineering")
|
|
|
148 |
st.write("""
|
149 |
+
Let's create more sophisticated features from our review data:
|
150 |
+
- Review length (word count)
|
151 |
+
- Review rating
|
152 |
+
- Reviewer confidence
|
153 |
+
- Number of keywords in the paper
|
154 |
""")
|
155 |
|
156 |
+
# Interactive Feature Engineering
|
157 |
+
st.subheader("Try Feature Engineering")
|
158 |
+
review_text = st.text_area(
|
159 |
+
"Enter a review to analyze:",
|
160 |
+
"This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
|
161 |
+
key="review_text"
|
162 |
+
)
|
|
|
163 |
|
164 |
+
if st.button("Extract Features"):
|
165 |
+
# Calculate features
|
166 |
+
word_count = len(word_tokenize(review_text))
|
167 |
+
sentence_count = len(review_text.split('.'))
|
168 |
+
|
169 |
+
# Create a nice display of features
|
170 |
+
col1, col2, col3 = st.columns(3)
|
171 |
+
with col1:
|
172 |
+
st.metric("Word Count", word_count)
|
173 |
+
with col2:
|
174 |
+
st.metric("Sentence Count", sentence_count)
|
175 |
+
with col3:
|
176 |
+
st.metric("Average Words per Sentence", f"{word_count/sentence_count:.1f}")
|
177 |
|
178 |
+
# Module 3: Linear Regression Analysis
|
179 |
+
st.header("Module 3: Linear Regression Analysis")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
st.write("""
|
181 |
+
Let's build a linear regression model to predict paper ratings based on review features.
|
|
|
|
|
|
|
182 |
""")
|
183 |
|
184 |
+
# Prepare data for modeling
|
|
|
|
|
|
|
|
|
|
|
185 |
X = df_reviews[['word_count', 'confidence_int']]
|
186 |
y = df_reviews['rating_int']
|
187 |
|
188 |
+
# Fit regression model
|
|
|
|
|
|
|
|
|
189 |
model = LinearRegression()
|
190 |
+
model.fit(X, y)
|
191 |
|
192 |
+
# Create 3D visualization of the regression
|
193 |
+
st.subheader("3D Visualization of Review Features")
|
194 |
+
fig = px.scatter_3d(df_reviews.sample(1000),
|
195 |
+
x='word_count',
|
196 |
+
y='confidence_int',
|
197 |
+
z='rating_int',
|
198 |
+
title='Review Features in 3D Space',
|
199 |
+
labels={
|
200 |
+
'word_count': 'Word Count',
|
201 |
+
'confidence_int': 'Confidence',
|
202 |
+
'rating_int': 'Rating'
|
203 |
+
})
|
204 |
+
fig.update_layout(
|
205 |
+
title_x=0.5,
|
206 |
+
title_font_size=20,
|
207 |
+
scene = dict(
|
208 |
+
xaxis_title='Word Count',
|
209 |
+
yaxis_title='Confidence',
|
210 |
+
zaxis_title='Rating'
|
211 |
+
)
|
212 |
+
)
|
213 |
+
st.plotly_chart(fig, use_container_width=True)
|
214 |
|
215 |
+
# Show model metrics
|
216 |
+
st.subheader("Model Performance")
|
217 |
+
col1, col2, col3 = st.columns(3)
|
218 |
+
with col1:
|
219 |
+
st.metric("R-squared", f"{model.score(X, y):.3f}")
|
220 |
+
with col2:
|
221 |
+
st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
|
222 |
+
with col3:
|
223 |
+
st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")
|
224 |
+
|
225 |
+
# Practice Exercises
|
226 |
+
st.header("Practice Exercises")
|
227 |
+
|
228 |
+
with st.expander("Exercise 1: Feature Engineering"):
|
229 |
+
st.write("""
|
230 |
+
1. Load the reviews dataset
|
231 |
+
2. Create features from review text
|
232 |
+
3. Calculate correlation between features
|
233 |
+
4. Visualize relationships
|
234 |
+
""")
|
235 |
+
|
236 |
+
st.code("""
|
237 |
+
# Solution
|
238 |
+
import pandas as pd
|
239 |
+
import numpy as np
|
240 |
+
from nltk.tokenize import word_tokenize
|
241 |
+
|
242 |
+
# Load data
|
243 |
+
df_reviews = pd.read_csv('reviews.csv')
|
244 |
+
|
245 |
+
# Create features
|
246 |
+
df_reviews['word_count'] = df_reviews['review'].apply(
|
247 |
+
lambda x: len(word_tokenize(x)))
|
248 |
+
df_reviews['sentence_count'] = df_reviews['review'].apply(
|
249 |
+
lambda x: len(x.split('.')))
|
250 |
+
|
251 |
+
# Calculate correlation
|
252 |
+
correlation = df_reviews[['word_count', 'rating_int',
|
253 |
+
'confidence_int']].corr()
|
254 |
+
|
255 |
+
# Visualize
|
256 |
+
sns.heatmap(correlation, annot=True)
|
257 |
+
plt.show()
|
258 |
+
""")
|
259 |
+
|
260 |
+
with st.expander("Exercise 2: Building a Predictive Model"):
|
261 |
+
st.write("""
|
262 |
+
1. Prepare features for modeling
|
263 |
+
2. Split data into training and test sets
|
264 |
+
3. Train a linear regression model
|
265 |
+
4. Evaluate model performance
|
266 |
+
""")
|
267 |
+
|
268 |
+
st.code("""
|
269 |
+
# Solution
|
270 |
+
from sklearn.model_selection import train_test_split
|
271 |
+
from sklearn.linear_model import LinearRegression
|
272 |
+
|
273 |
+
# Prepare features
|
274 |
+
X = df_reviews[['word_count', 'confidence_int']]
|
275 |
+
y = df_reviews['rating_int']
|
276 |
+
|
277 |
+
# Split data
|
278 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
279 |
+
X, y, test_size=0.2, random_state=42)
|
280 |
+
|
281 |
+
# Train model
|
282 |
+
model = LinearRegression()
|
283 |
+
model.fit(X_train, y_train)
|
284 |
+
|
285 |
+
# Evaluate
|
286 |
+
train_score = model.score(X_train, y_train)
|
287 |
+
test_score = model.score(X_test, y_test)
|
288 |
+
|
289 |
+
print(f"Training R²: {train_score:.3f}")
|
290 |
+
print(f"Testing R²: {test_score:.3f}")
|
291 |
+
""")
|
292 |
|
293 |
+
# Weekly Assignment
|
294 |
+
username = st.session_state.get("username", "Student")
|
295 |
+
st.header(f"{username}'s Weekly Assignment")
|
296 |
+
|
297 |
+
if username == "manxiii":
|
298 |
+
st.markdown("""
|
299 |
+
Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
|
300 |
+
1. Complete the feature engineering pipeline for the ICLR dataset
|
301 |
+
2. Build a linear regression model to predict paper ratings
|
302 |
+
3. Analyze the relationship between review features and acceptance
|
303 |
+
4. Submit your findings in a Jupyter notebook
|
304 |
|
305 |
+
**Due Date:** End of Week 5
|
306 |
+
""")
|
307 |
+
elif username == "zhu":
|
308 |
+
st.markdown("""
|
309 |
+
Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
|
310 |
+
1. Implement the complete machine learning workflow
|
311 |
+
2. Create insightful visualizations of model results
|
312 |
+
3. Draw conclusions from your analysis
|
313 |
+
4. Submit your work in a Jupyter notebook
|
314 |
|
315 |
+
**Due Date:** End of Week 5
|
316 |
+
""")
|
317 |
+
elif username == "WK":
|
318 |
+
st.markdown("""
|
319 |
+
Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
|
320 |
+
1. Complete the feature engineering pipeline
|
321 |
+
2. Build and evaluate a linear regression model
|
322 |
+
3. Analyze patterns in the data
|
323 |
+
4. Submit your findings
|
324 |
|
325 |
+
**Due Date:** End of Week 5
|
326 |
+
""")
|
327 |
+
else:
|
328 |
+
st.markdown(f"""
|
329 |
+
Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
|
330 |
+
1. Complete the feature engineering pipeline
|
331 |
+
2. Build and evaluate a linear regression model
|
332 |
+
3. Analyze patterns in the data
|
333 |
+
4. Submit your findings
|
334 |
|
335 |
+
**Due Date:** End of Week 5
|
336 |
+
""")
|
337 |
+
|
338 |
+
except Exception as e:
|
339 |
+
st.error(f"Error loading data: {str(e)}")
|
340 |
+
st.write("Please make sure the data files are in the correct location.")
|