raymondEDS committed on
Commit
223d6e3
·
1 Parent(s): ae38d1c

week 5 final

Browse files
app/pages/__pycache__/week_5.cpython-311.pyc CHANGED
Binary files a/app/pages/__pycache__/week_5.cpython-311.pyc and b/app/pages/__pycache__/week_5.cpython-311.pyc differ
 
app/pages/week_5.py CHANGED
@@ -4,18 +4,60 @@ import numpy as np
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
  from sklearn.linear_model import LinearRegression
7
- from sklearn.metrics import r2_score
 
8
  import scipy.stats as stats
9
- from nltk.tokenize import word_tokenize
10
  import plotly.express as px
11
  import plotly.graph_objects as go
12
  from pathlib import Path
13
  import os
 
 
14
 
15
  # Set up the style for all plots
16
  plt.style.use('default')
17
  sns.set_theme(style="whitegrid", palette="husl")
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def load_data():
20
  """Load and prepare the data"""
21
  # Get the current file's directory
@@ -31,45 +73,166 @@ def load_data():
31
  df_dec = pd.read_csv(data_dir / "decision.csv")
32
  df_keyword = pd.read_csv(data_dir / "submission_keyword.csv")
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  return df_reviews, df_submissions, df_dec, df_keyword
35
  except FileNotFoundError as e:
36
  st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
37
  st.error(f"Error details: {str(e)}")
38
  return None, None, None, None
 
 
 
39
 
40
  def create_feature_plot(df, x_col, y_col, title):
41
  """Create an interactive scatter plot using plotly"""
42
- fig = px.scatter(df, x=x_col, y=y_col,
 
 
 
43
  title=title,
44
  labels={x_col: x_col.replace('_', ' ').title(),
45
  y_col: y_col.replace('_', ' ').title()},
46
- template="plotly_white")
47
  fig.update_layout(
48
  title_x=0.5,
49
  title_font_size=20,
50
  showlegend=True,
51
- plot_bgcolor='white',
52
- paper_bgcolor='white'
 
53
  )
54
  return fig
55
 
56
  def create_correlation_heatmap(df, columns):
57
  """Create a correlation heatmap using plotly"""
58
- corr = df[columns].corr()
 
 
 
59
  fig = go.Figure(data=go.Heatmap(
60
  z=corr,
61
  x=corr.columns,
62
  y=corr.columns,
63
  colorscale='RdBu',
64
- zmin=-1, zmax=1
 
 
 
65
  ))
66
  fig.update_layout(
67
  title='Feature Correlation Heatmap',
68
  title_x=0.5,
69
  title_font_size=20,
70
- plot_bgcolor='white',
71
- paper_bgcolor='white'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  )
 
 
 
 
 
 
73
  return fig
74
 
75
  def show():
@@ -101,7 +264,49 @@ def show():
101
  2. **Correlation Analysis (相关性分析):**
102
  - Definition: Statistical measure that shows how strongly two variables are related
103
  - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
 
105
  3. **Reading Linear Regression Output (解读线性回归结果):**
106
  - R-squared (R²): Proportion of variance explained by the model (0-1)
107
  - p-value: Probability that the observed relationship occurred by chance
@@ -111,230 +316,871 @@ def show():
111
  """)
112
 
113
  # Load the data
114
- try:
115
- df_reviews, df_submissions, df_dec, df_keyword = load_data()
116
-
117
- # Module 1: Data Exploration
118
- st.header("Module 1: Data Exploration")
119
- st.write("Let's explore our dataset to understand the review patterns:")
120
-
121
- # Create features from review text
122
- df_reviews['word_count'] = df_reviews['review'].apply(lambda x: len(str(x).split()))
123
- df_reviews['sentence_count'] = df_reviews['review'].apply(lambda x: len(str(x).split('.')))
124
-
125
- # Show basic statistics
126
- col1, col2 = st.columns(2)
127
- with col1:
128
- st.metric("Total Reviews", len(df_reviews))
129
- st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
130
- with col2:
131
- st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
132
- st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
133
-
134
- # Create interactive visualizations
135
- st.subheader("Review Length vs Rating")
136
- fig = create_feature_plot(df_reviews, 'word_count', 'rating_int',
137
- 'Relationship between Review Length and Rating')
138
- st.plotly_chart(fig, use_container_width=True)
139
-
140
- # Correlation analysis
141
- st.subheader("Feature Correlations")
142
- corr_fig = create_correlation_heatmap(df_reviews,
143
- ['word_count', 'rating_int', 'confidence_int'])
144
- st.plotly_chart(corr_fig, use_container_width=True)
 
 
 
145
 
146
- # Module 2: Feature Engineering
147
- st.header("Module 2: Feature Engineering")
148
- st.write("""
149
- Let's create more sophisticated features from our review data:
150
- - Review length (word count)
151
- - Review rating
152
- - Reviewer confidence
153
- - Number of keywords in the paper
154
- """)
155
 
156
- # Interactive Feature Engineering
157
- st.subheader("Try Feature Engineering")
158
- review_text = st.text_area(
159
- "Enter a review to analyze:",
160
- "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
161
- key="review_text"
162
- )
163
 
164
- if st.button("Extract Features"):
165
- # Calculate features
166
- word_count = len(word_tokenize(review_text))
167
- sentence_count = len(review_text.split('.'))
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
- # Create a nice display of features
170
- col1, col2, col3 = st.columns(3)
 
 
 
 
 
 
 
171
  with col1:
172
- st.metric("Word Count", word_count)
 
173
  with col2:
174
- st.metric("Sentence Count", sentence_count)
175
- with col3:
176
- st.metric("Average Words per Sentence", f"{word_count/sentence_count:.1f}")
177
-
178
- # Module 3: Linear Regression Analysis
179
- st.header("Module 3: Linear Regression Analysis")
180
- st.write("""
181
- Let's build a linear regression model to predict paper ratings based on review features.
182
- """)
183
-
184
- # Prepare data for modeling
185
- X = df_reviews[['word_count', 'confidence_int']]
186
- y = df_reviews['rating_int']
187
-
188
- # Fit regression model
189
- model = LinearRegression()
190
- model.fit(X, y)
191
-
192
- # Create 3D visualization of the regression
193
- st.subheader("3D Visualization of Review Features")
194
- fig = px.scatter_3d(df_reviews.sample(1000),
195
- x='word_count',
196
- y='confidence_int',
197
- z='rating_int',
198
- title='Review Features in 3D Space',
199
- labels={
200
- 'word_count': 'Word Count',
201
- 'confidence_int': 'Confidence',
202
- 'rating_int': 'Rating'
203
- })
204
- fig.update_layout(
205
- title_x=0.5,
206
- title_font_size=20,
207
- scene = dict(
208
- xaxis_title='Word Count',
209
- yaxis_title='Confidence',
210
- zaxis_title='Rating'
211
- )
212
- )
213
- st.plotly_chart(fig, use_container_width=True)
214
-
215
- # Show model metrics
216
- st.subheader("Model Performance")
217
- col1, col2, col3 = st.columns(3)
218
- with col1:
219
- st.metric("R-squared", f"{model.score(X, y):.3f}")
220
- with col2:
221
- st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
222
- with col3:
223
- st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")
224
-
225
- # Practice Exercises
226
- st.header("Practice Exercises")
227
-
228
- with st.expander("Exercise 1: Feature Engineering"):
229
- st.write("""
230
- 1. Load the reviews dataset
231
- 2. Create features from review text
232
- 3. Calculate correlation between features
233
- 4. Visualize relationships
234
- """)
235
 
236
- st.code("""
237
- # Solution
238
- import pandas as pd
239
- import numpy as np
240
- from nltk.tokenize import word_tokenize
241
 
242
- # Load data
243
- df_reviews = pd.read_csv('reviews.csv')
 
 
 
244
 
245
- # Create features
246
- df_reviews['word_count'] = df_reviews['review'].apply(
247
- lambda x: len(word_tokenize(x)))
248
- df_reviews['sentence_count'] = df_reviews['review'].apply(
249
- lambda x: len(x.split('.')))
250
 
251
- # Calculate correlation
252
- correlation = df_reviews[['word_count', 'rating_int',
253
- 'confidence_int']].corr()
254
 
255
- # Visualize
256
- sns.heatmap(correlation, annot=True)
257
- plt.show()
258
- """)
259
-
260
- with st.expander("Exercise 2: Building a Predictive Model"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  st.write("""
262
- 1. Prepare features for modeling
263
- 2. Split data into training and test sets
264
- 3. Train a linear regression model
265
- 4. Evaluate model performance
266
  """)
267
 
268
- st.code("""
269
- # Solution
270
- from sklearn.model_selection import train_test_split
271
- from sklearn.linear_model import LinearRegression
272
-
273
- # Prepare features
274
- X = df_reviews[['word_count', 'confidence_int']]
275
- y = df_reviews['rating_int']
276
 
277
- # Split data
278
- X_train, X_test, y_train, y_test = train_test_split(
279
- X, y, test_size=0.2, random_state=42)
 
280
 
281
- # Train model
282
- model = LinearRegression()
283
- model.fit(X_train, y_train)
 
 
 
 
 
 
284
 
285
- # Evaluate
286
- train_score = model.score(X_train, y_train)
287
- test_score = model.score(X_test, y_test)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- print(f"Training R²: {train_score:.3f}")
290
- print(f"Testing R²: {test_score:.3f}")
 
 
291
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
- # Weekly Assignment
294
- username = st.session_state.get("username", "Student")
295
- st.header(f"{username}'s Weekly Assignment")
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
- if username == "manxiii":
298
- st.markdown("""
299
- Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
300
- 1. Complete the feature engineering pipeline for the ICLR dataset
301
- 2. Build a linear regression model to predict paper ratings
302
- 3. Analyze the relationship between review features and acceptance
303
- 4. Submit your findings in a Jupyter notebook
304
-
305
- **Due Date:** End of Week 5
306
- """)
307
- elif username == "zhu":
308
- st.markdown("""
309
- Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
310
- 1. Implement the complete machine learning workflow
311
- 2. Create insightful visualizations of model results
312
- 3. Draw conclusions from your analysis
313
- 4. Submit your work in a Jupyter notebook
314
-
315
- **Due Date:** End of Week 5
316
- """)
317
- elif username == "WK":
318
- st.markdown("""
319
- Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
320
- 1. Complete the feature engineering pipeline
321
- 2. Build and evaluate a linear regression model
322
- 3. Analyze patterns in the data
323
- 4. Submit your findings
324
-
325
- **Due Date:** End of Week 5
326
- """)
327
- else:
328
- st.markdown(f"""
329
- Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
330
- 1. Complete the feature engineering pipeline
331
- 2. Build and evaluate a linear regression model
332
- 3. Analyze patterns in the data
333
- 4. Submit your findings
334
-
335
- **Due Date:** End of Week 5
336
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  except Exception as e:
339
- st.error(f"Error loading data: {str(e)}")
340
- st.write("Please make sure the data files are in the correct location.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
  from sklearn.linear_model import LinearRegression
7
+ from sklearn.metrics import r2_score, mean_squared_error
8
+ from sklearn.model_selection import train_test_split
9
  import scipy.stats as stats
 
10
  import plotly.express as px
11
  import plotly.graph_objects as go
12
  from pathlib import Path
13
  import os
14
+ import re
15
+ from plotly.subplots import make_subplots
16
 
17
  # Set up the style for all plots
18
  plt.style.use('default')
19
  sns.set_theme(style="whitegrid", palette="husl")
20
 
21
def simple_word_tokenize(text):
    """Split *text* into lowercase word tokens.

    The input is coerced with ``str()``, lowercased, and every character
    that is neither a word character (``\\w``: letters, digits, underscore)
    nor whitespace is replaced by a space before splitting. Punctuation
    therefore acts as a separator: ``"don't"`` yields ``["don", "t"]``.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        list[str]: The tokens in order of appearance (empty list when the
        text contains no word characters).
    """
    normalized = str(text).lower()
    # Turn punctuation into spaces so it separates tokens instead of
    # sticking to them.
    normalized = re.sub(r'[^\w\s]', ' ', normalized)
    # str.split() with no separator already collapses runs of whitespace
    # and never yields empty strings, so no extra filtering is needed.
    return normalized.split()
30
+
31
def simple_sentence_split(text):
    """Split *text* into sentences using a lightweight punctuation rule.

    A sentence boundary is a run of ``.``, ``!`` or ``?`` (optionally
    followed by closing quotes or brackets) that is followed by whitespace
    or the end of the text. Requiring that trailing whitespace keeps
    decimals and section numbers such as ``0.95`` or ``3.2`` inside their
    sentence instead of counting each dot as a boundary.

    This is a heuristic, not a full sentence tokenizer: abbreviations that
    are followed by a space (``"e.g. "``) still split, and sentences that
    are run together without a space after the terminator do not.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        list[str]: Non-empty sentences, stripped of surrounding whitespace
        and without their terminating punctuation.
    """
    raw_text = str(text)
    pieces = re.split(r'[.!?]+["\')\]]*(?:\s+|$)', raw_text)
    # re.split leaves an empty trailing piece when the text ends with a
    # terminator; drop it along with any whitespace-only fragments.
    return [piece.strip() for piece in pieces if piece.strip()]
40
+
41
def extract_text_features(text):
    """Compute word- and sentence-level statistics for one text.

    Returns a dict with the keys ``word_count``, ``sentence_count``,
    ``avg_word_length`` and ``avg_sentence_length``. The average word
    length is ``None`` when no words were found, and the average sentence
    length is ``None`` when no sentences were found.

    ``None`` is returned instead of a dict when *text* is missing
    (``None`` or NaN) or when any exception is raised while processing,
    so callers can drop such rows afterwards.
    """
    try:
        # Missing values produce no features at all.
        if text is None or pd.isna(text):
            return None

        tokens = simple_word_tokenize(text)
        parts = simple_sentence_split(text)
        n_words = len(tokens)
        n_sentences = len(parts)

        if tokens:
            mean_word_len = np.mean([len(tok) for tok in tokens])
        else:
            mean_word_len = None

        if parts:
            words_per_sentence = n_words / n_sentences
        else:
            words_per_sentence = None

        return {
            'word_count': n_words,
            'sentence_count': n_sentences,
            'avg_word_length': mean_word_len,
            'avg_sentence_length': words_per_sentence,
        }
    except Exception:
        # Best-effort: any failure is reported as "no features".
        return None
60
+
61
  def load_data():
62
  """Load and prepare the data"""
63
  # Get the current file's directory
 
73
  df_dec = pd.read_csv(data_dir / "decision.csv")
74
  df_keyword = pd.read_csv(data_dir / "submission_keyword.csv")
75
 
76
+ # Clean the data by dropping rows with NaN values in critical columns
77
+ df_reviews = df_reviews.dropna(subset=['review', 'rating_int', 'confidence_int'])
78
+
79
+ # Extract features
80
+ features = df_reviews['review'].apply(extract_text_features)
81
+ df_features = pd.DataFrame(features.tolist())
82
+ df_reviews = pd.concat([df_reviews, df_features], axis=1)
83
+
84
+ # Drop any remaining rows with NaN values
85
+ df_reviews = df_reviews.dropna()
86
+
87
+ # Verify no NaN values remain
88
+ if df_reviews.isna().any().any():
89
+ st.warning("Some NaN values were found and those rows were dropped")
90
+ df_reviews = df_reviews.dropna()
91
+
92
  return df_reviews, df_submissions, df_dec, df_keyword
93
  except FileNotFoundError as e:
94
  st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
95
  st.error(f"Error details: {str(e)}")
96
  return None, None, None, None
97
+ except Exception as e:
98
+ st.error(f"Error processing data: {str(e)}")
99
+ return None, None, None, None
100
 
101
  def create_feature_plot(df, x_col, y_col, title):
102
  """Create an interactive scatter plot using plotly"""
103
+ # Ensure no NaN values
104
+ df_plot = df.dropna(subset=[x_col, y_col])
105
+
106
+ fig = px.scatter(df_plot, x=x_col, y=y_col,
107
  title=title,
108
  labels={x_col: x_col.replace('_', ' ').title(),
109
  y_col: y_col.replace('_', ' ').title()},
110
+ template="plotly_dark")
111
  fig.update_layout(
112
  title_x=0.5,
113
  title_font_size=20,
114
  showlegend=True,
115
+ plot_bgcolor='rgb(30, 30, 30)',
116
+ paper_bgcolor='rgb(30, 30, 30)',
117
+ font=dict(color='white')
118
  )
119
  return fig
120
 
121
  def create_correlation_heatmap(df, columns):
122
  """Create a correlation heatmap using plotly"""
123
+ # Ensure no NaN values
124
+ df_corr = df[columns].dropna()
125
+ corr = df_corr.corr()
126
+
127
  fig = go.Figure(data=go.Heatmap(
128
  z=corr,
129
  x=corr.columns,
130
  y=corr.columns,
131
  colorscale='RdBu',
132
+ zmin=-1, zmax=1,
133
+ text=[[f'{val:.2f}' for val in row] for row in corr.values],
134
+ texttemplate='%{text}',
135
+ textfont={"size": 12}
136
  ))
137
  fig.update_layout(
138
  title='Feature Correlation Heatmap',
139
  title_x=0.5,
140
  title_font_size=20,
141
+ plot_bgcolor='rgb(30, 30, 30)',
142
+ paper_bgcolor='rgb(30, 30, 30)',
143
+ font=dict(color='white')
144
+ )
145
+ return fig
146
+
147
def create_regression_plot(df, x_col, y_col, title):
    """Build a scatter plot of ``y_col`` against ``x_col`` with a fitted line.

    Rows with a missing value in either column are dropped, a
    ``LinearRegression`` model is fitted on the remaining rows, and the
    fitted line is overlaid in red on a dark-themed scatter plot.

    Args:
        df: DataFrame containing both columns.
        x_col: Name of the column used as the single predictor.
        y_col: Name of the column used as the target.
        title: Figure title.

    Returns:
        tuple: ``(fig, model)`` -- the plotly figure and the fitted
        ``LinearRegression`` instance.
    """
    df_plot = df.dropna(subset=[x_col, y_col])

    fig = px.scatter(df_plot, x=x_col, y=y_col,
                     title=title,
                     labels={x_col: x_col.replace('_', ' ').title(),
                             y_col: y_col.replace('_', ' ').title()},
                     template="plotly_dark")

    # Fit on a single-feature matrix (scikit-learn expects 2-D X).
    x_values = df_plot[x_col].values
    model = LinearRegression()
    model.fit(x_values.reshape(-1, 1), df_plot[y_col].values)

    # A straight line is fully described by two points. Drawing it through
    # the x-extremes avoids sending one unsorted vertex per data row, which
    # made the line trace as large as the scatter trace itself.
    x_line = np.array([x_values.min(), x_values.max()])
    y_line = model.predict(x_line.reshape(-1, 1))

    fig.add_trace(go.Scatter(
        x=x_line,
        y=y_line,
        mode='lines',
        name='Regression Line',
        line=dict(color='red', width=2)
    ))

    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        showlegend=True,
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    return fig, model
182
+
183
def create_correlation_examples():
    """Build a three-panel figure illustrating different correlation types.

    Synthetic data is generated with a fixed random seed (42): a strongly
    positive relationship, a strongly negative relationship, and two
    independent normal samples. Each is shown as a marker-only scatter in
    its own subplot. Returns the plotly figure.
    """
    np.random.seed(42)
    n_points = 100

    # The random draws below must stay in this order so the seeded output
    # is reproducible.
    x_pos = np.linspace(0, 10, n_points)
    y_pos = x_pos + np.random.normal(0, 0.1, n_points)

    x_neg = np.linspace(0, 10, n_points)
    y_neg = -x_neg + np.random.normal(0, 0.1, n_points)

    x_low = np.random.normal(5, 2, n_points)
    y_low = np.random.normal(5, 2, n_points)

    panel_titles = (
        'Perfect Positive Correlation (r ≈ 1)',
        'Perfect Negative Correlation (r ≈ -1)',
        'Low Correlation (r ≈ 0)',
    )
    fig = make_subplots(rows=1, cols=3, subplot_titles=panel_titles)

    # One scatter trace per subplot column.
    panels = (
        (x_pos, y_pos, 'r ≈ 1'),
        (x_neg, y_neg, 'r ≈ -1'),
        (x_low, y_low, 'r ≈ 0'),
    )
    for column, (xs, ys, label) in enumerate(panels, start=1):
        fig.add_trace(
            go.Scatter(x=xs, y=ys, mode='markers', name=label),
            row=1, col=column,
        )

    figure_title = {
        'text': 'Examples of Different Correlation Types',
        'x': 0.5,
        'y': 0.95,
        'font': {'size': 20},
    }
    fig.update_layout(
        height=400,
        showlegend=False,
        template="plotly_dark",
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font={'color': 'white', 'size': 14},
        title=figure_title,
    )

    # Label the axes of every panel.
    for column in (1, 2, 3):
        fig.update_xaxes(title_text='X', row=1, col=column)
        fig.update_yaxes(title_text='Y', row=1, col=column)

    return fig
237
 
238
  def show():
 
264
  2. **Correlation Analysis (相关性分析):**
265
  - Definition: Statistical measure that shows how strongly two variables are related
266
  - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
267
+ """)
268
+
269
+ # Add correlation examples
270
+ st.write("Here are examples of different correlation types:")
271
+ corr_examples = create_correlation_examples()
272
+ st.plotly_chart(corr_examples, use_container_width=True)
273
+
274
+ # Show example code for correlation analysis
275
+ with st.expander("Example Code: Correlation Analysis"):
276
+ st.code("""
277
+ # Example: Calculating and visualizing correlations
278
+ import numpy as np
279
+ import pandas as pd
280
+ import plotly.graph_objects as go
281
+ from plotly.subplots import make_subplots
282
+
283
+ # Generate example data
284
+ np.random.seed(42)
285
+ n_points = 100
286
+
287
+ # Perfect positive correlation
288
+ x1 = np.linspace(0, 10, n_points)
289
+ y1 = x1 + np.random.normal(0, 0.1, n_points)
290
+
291
+ # Perfect negative correlation
292
+ x2 = np.linspace(0, 10, n_points)
293
+ y2 = -x2 + np.random.normal(0, 0.1, n_points)
294
+
295
+ # Low correlation
296
+ x3 = np.random.normal(5, 2, n_points)
297
+ y3 = np.random.normal(5, 2, n_points)
298
+
299
+ # Calculate correlations
300
+ corr1 = np.corrcoef(x1, y1)[0,1] # Should be close to 1
301
+ corr2 = np.corrcoef(x2, y2)[0,1] # Should be close to -1
302
+ corr3 = np.corrcoef(x3, y3)[0,1] # Should be close to 0
303
+
304
+ print(f"Correlation 1: {corr1:.3f}")
305
+ print(f"Correlation 2: {corr2:.3f}")
306
+ print(f"Correlation 3: {corr3:.3f}")
307
+ """)
308
 
309
+ st.write("""
310
  3. **Reading Linear Regression Output (解读线性回归结果):**
311
  - R-squared (R²): Proportion of variance explained by the model (0-1)
312
  - p-value: Probability that the observed relationship occurred by chance
 
316
  """)
317
 
318
  # Load the data
319
+ df_reviews, df_submissions, df_dec, df_keyword = load_data()
320
+
321
+ if df_reviews is not None:
322
+ try:
323
+ # Module 1: Data Exploration
324
+ st.header("Module 1: Data Exploration")
325
+ st.write("Let's explore our dataset to understand the review patterns:")
326
+
327
+ # Show example code for data loading and cleaning
328
+ with st.expander("Example Code: Data Loading and Cleaning"):
329
+ st.code("""
330
+ # Load and clean the data
331
+ import pandas as pd
332
+ import numpy as np
333
+
334
+ def load_and_clean_data():
335
+ # Load datasets
336
+ df_reviews = pd.read_csv('reviews.csv')
337
+ df_submissions = pd.read_csv('Submissions.csv')
338
+ df_dec = pd.read_csv('decision.csv')
339
+ df_keyword = pd.read_csv('submission_keyword.csv')
340
+
341
+ # Clean reviews data
342
+ df_reviews = df_reviews.dropna(subset=['review', 'rating_int', 'confidence_int'])
343
+
344
+ # Extract text features
345
+ def extract_text_features(text):
346
+ if pd.isna(text) or text is None:
347
+ return {
348
+ 'word_count': 0,
349
+ 'sentence_count': 0,
350
+ 'avg_word_length': 0,
351
+ 'avg_sentence_length': 0
352
+ }
353
 
354
+ # Convert to string and clean
355
+ text = str(text).lower()
356
+ text = re.sub(r'[^\\w\\s]', ' ', text)
 
 
 
 
 
 
357
 
358
+ # Split into words and sentences
359
+ words = [word for word in text.split() if word]
360
+ sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
 
 
 
 
361
 
362
+ return {
363
+ 'word_count': len(words),
364
+ 'sentence_count': len(sentences),
365
+ 'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
366
+ 'avg_sentence_length': len(words) / len(sentences) if sentences else 0
367
+ }
368
+
369
+ # Apply feature extraction
370
+ features = df_reviews['review'].apply(extract_text_features)
371
+ df_features = pd.DataFrame(features.tolist())
372
+ df_reviews = pd.concat([df_reviews, df_features], axis=1)
373
+
374
+ # Fill any remaining NaN values
375
+ df_reviews = df_reviews.fillna(0)
376
+
377
+ return df_reviews, df_submissions, df_dec, df_keyword
378
+ """)
379
 
380
+ # Verify data quality
381
+ st.subheader("Data Quality Check")
382
+ missing_data = df_reviews.isna().sum()
383
+ if missing_data.any():
384
+ st.warning("Missing values found in the dataset:")
385
+ st.write(missing_data[missing_data > 0])
386
+
387
+ # Show basic statistics
388
+ col1, col2 = st.columns(2)
389
  with col1:
390
+ st.metric("Total Reviews", len(df_reviews))
391
+ st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
392
  with col2:
393
+ st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
394
+ st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
+ # Interactive feature selection
397
+ st.subheader("Interactive Feature Analysis")
398
+ feature_cols = ['word_count', 'sentence_count', 'avg_word_length',
399
+ 'avg_sentence_length', 'rating_int', 'confidence_int']
 
400
 
401
+ col1, col2 = st.columns(2)
402
+ with col1:
403
+ x_feature = st.selectbox("Select X-axis feature:", feature_cols)
404
+ with col2:
405
+ y_feature = st.selectbox("Select Y-axis feature:", feature_cols)
406
 
407
+ # Create interactive plot
408
+ fig = create_feature_plot(df_reviews, x_feature, y_feature,
409
+ f'{x_feature.replace("_", " ").title()} vs {y_feature.replace("_", " ").title()}')
410
+ st.plotly_chart(fig, use_container_width=True)
 
411
 
412
+ # Show correlation between selected features
413
+ corr = df_reviews[[x_feature, y_feature]].corr().iloc[0,1]
414
+ st.write(f"Correlation between {x_feature} and {y_feature}: {corr:.3f}")
415
 
416
+ # Distribution plots
417
+ st.subheader("Distribution of Ratings and Confidence")
418
+ col1, col2 = st.columns(2)
419
+ with col1:
420
+ fig = px.histogram(df_reviews.dropna(subset=['rating_int']),
421
+ x='rating_int',
422
+ title='Distribution of Ratings',
423
+ template="plotly_dark")
424
+ st.plotly_chart(fig, use_container_width=True)
425
+ with col2:
426
+ fig = px.histogram(df_reviews.dropna(subset=['confidence_int']),
427
+ x='confidence_int',
428
+ title='Distribution of Confidence',
429
+ template="plotly_dark")
430
+ st.plotly_chart(fig, use_container_width=True)
431
+
432
+ # Show example code for distribution analysis
433
+ with st.expander("Example Code: Distribution Analysis"):
434
+ st.code("""
435
+ # Analyze distributions of numerical features
436
+ import plotly.express as px
437
+
438
+ def analyze_distributions(df):
439
+ # Create histograms for key features
440
+ fig1 = px.histogram(df, x='rating_int',
441
+ title='Distribution of Ratings',
442
+ template="plotly_dark")
443
+
444
+ fig2 = px.histogram(df, x='confidence_int',
445
+ title='Distribution of Confidence',
446
+ template="plotly_dark")
447
+
448
+ # Calculate summary statistics
449
+ stats = df[['rating_int', 'confidence_int']].describe()
450
+
451
+ return fig1, fig2, stats
452
+
453
+ # Usage
454
+ fig1, fig2, stats = analyze_distributions(df_reviews)
455
+ print(stats)
456
+ """)
457
+
458
+ # Text feature distributions
459
+ st.subheader("Text Feature Distributions")
460
+ col1, col2 = st.columns(2)
461
+ with col1:
462
+ fig = px.histogram(df_reviews.dropna(subset=['avg_word_length']),
463
+ x='avg_word_length',
464
+ title='Average Word Length Distribution',
465
+ template="plotly_dark")
466
+ st.plotly_chart(fig, use_container_width=True)
467
+ with col2:
468
+ fig = px.histogram(df_reviews.dropna(subset=['avg_sentence_length']),
469
+ x='avg_sentence_length',
470
+ title='Average Sentence Length Distribution',
471
+ template="plotly_dark")
472
+ st.plotly_chart(fig, use_container_width=True)
473
+
474
+ # Correlation analysis
475
+ st.subheader("Feature Correlations")
476
+ corr_fig = create_correlation_heatmap(df_reviews, feature_cols)
477
+ st.plotly_chart(corr_fig, use_container_width=True)
478
+
479
+ # Show example code for correlation analysis
480
+ with st.expander("Example Code: Correlation Analysis"):
481
+ st.code("""
482
+ # Analyze correlations between features
483
+ import plotly.graph_objects as go
484
+
485
+ def analyze_correlations(df, columns):
486
+ # Calculate correlation matrix
487
+ corr = df[columns].corr()
488
+
489
+ # Create heatmap
490
+ fig = go.Figure(data=go.Heatmap(
491
+ z=corr,
492
+ x=corr.columns,
493
+ y=corr.columns,
494
+ colorscale='RdBu',
495
+ zmin=-1, zmax=1,
496
+ text=[[f'{val:.2f}' for val in row] for row in corr.values],
497
+ texttemplate='%{text}',
498
+ textfont={"size": 12}
499
+ ))
500
+
501
+ fig.update_layout(
502
+ title='Feature Correlation Heatmap',
503
+ template="plotly_dark"
504
+ )
505
+
506
+ return fig, corr
507
+
508
+ # Usage
509
+ fig, corr_matrix = analyze_correlations(df_reviews, feature_cols)
510
+ print(corr_matrix)
511
+ """)
512
+
513
+ # Module 2: Simple Linear Regression
514
+ st.header("Module 2: Simple Linear Regression")
515
  st.write("""
516
+ Let's explore the relationship between review length and rating using simple linear regression.
 
 
 
517
  """)
518
 
519
+ # Interactive feature selection for regression
520
+ st.subheader("Interactive Regression Analysis")
521
+ col1, col2 = st.columns(2)
522
+ with col1:
523
+ x_reg = st.selectbox("Select feature for X-axis:", feature_cols)
524
+ with col2:
525
+ y_reg = st.selectbox("Select target variable:", feature_cols)
 
526
 
527
+ # Create regression plot
528
+ fig, model = create_regression_plot(df_reviews, x_reg, y_reg,
529
+ f'{x_reg.replace("_", " ").title()} vs {y_reg.replace("_", " ").title()}')
530
+ st.plotly_chart(fig, use_container_width=True)
531
 
532
+ # Show regression metrics
533
+ st.subheader("Regression Metrics")
534
+ col1, col2 = st.columns(2)
535
+ with col1:
536
+ r2_score = model.score(df_reviews[[x_reg]].dropna(),
537
+ df_reviews[y_reg].dropna())
538
+ st.metric("R-squared", f"{r2_score:.3f}")
539
+ with col2:
540
+ st.metric("Slope", f"{model.coef_[0]:.3f}")
541
 
542
+ # Show example code for simple linear regression
543
+ with st.expander("Example Code: Simple Linear Regression"):
544
+ st.code('''
545
+ # Perform simple linear regression
546
+ from sklearn.linear_model import LinearRegression
547
+ import plotly.graph_objects as go
548
+
549
+ def simple_linear_regression(df, x_col, y_col, title=None):
550
+ """
551
+ Perform simple linear regression on any DataFrame.
552
+
553
+ Parameters:
554
+ -----------
555
+ df : pandas.DataFrame
556
+ Input DataFrame containing the features
557
+ x_col : str
558
+ Name of the column to use as independent variable
559
+ y_col : str
560
+ Name of the column to use as dependent variable
561
+ title : str, optional
562
+ Title for the plot. If None, will use column names
563
+
564
+ Returns:
565
+ --------
566
+ tuple
567
+ (model, r2_score, fig) where:
568
+ - model is the fitted LinearRegression object
569
+ - r2_score is the R-squared value
570
+ - fig is the plotly figure object
571
+ """
572
+ # Handle missing values by dropping them
573
+ df_clean = df.dropna(subset=[x_col, y_col])
574
+
575
+ if len(df_clean) == 0:
576
+ raise ValueError("No valid data points after removing missing values")
577
+
578
+ # Prepare data
579
+ X = df_clean[[x_col]]
580
+ y = df_clean[y_col]
581
+
582
+ # Fit model
583
+ model = LinearRegression()
584
+ model.fit(X, y)
585
+
586
+ # Calculate R-squared
587
+ r2_score = model.score(X, y)
588
+
589
+ # Create visualization
590
+ fig = go.Figure()
591
+
592
+ # Add scatter plot
593
+ fig.add_trace(go.Scatter(
594
+ x=X[x_col],
595
+ y=y,
596
+ mode='markers',
597
+ name='Data Points',
598
+ marker=dict(size=8, opacity=0.6)
599
+ ))
600
+
601
+ # Add regression line
602
+ x_range = np.linspace(X[x_col].min(), X[x_col].max(), 100)
603
+ y_pred = model.predict(x_range.reshape(-1, 1))
604
+
605
+ fig.add_trace(go.Scatter(
606
+ x=x_range,
607
+ y=y_pred,
608
+ mode='lines',
609
+ name='Regression Line',
610
+ line=dict(color='red', width=2)
611
+ ))
612
+
613
+ # Update layout
614
+ title = title or f'{x_col} vs {y_col}'
615
+ fig.update_layout(
616
+ title=title,
617
+ xaxis_title=x_col,
618
+ yaxis_title=y_col,
619
+ template="plotly_dark",
620
+ showlegend=True
621
+ )
622
+
623
+ return model, r2_score, fig
624
+
625
+ # Usage
626
+ model, r2, fig = simple_linear_regression(df_reviews, 'word_count', 'rating_int')
627
+ print(f"R-squared: {r2:.3f}")
628
+ print(f"Slope: {model.coef_[0]:.3f}")
629
+ ''')
630
 
631
+ # Module 3: Multiple Linear Regression
632
+ st.header("Module 3: Multiple Linear Regression")
633
+ st.write("""
634
+ Now let's build a more complex model using multiple features to predict ratings.
635
  """)
636
+
637
+ try:
638
+ # Prepare data for modeling
639
+ feature_cols = ['word_count', 'sentence_count',
640
+ 'avg_word_length', 'avg_sentence_length',
641
+ 'confidence_int']
642
+
643
+ # Interactive feature selection for multiple regression
644
+ st.subheader("Select Features for Multiple Regression")
645
+ selected_features = st.multiselect(
646
+ "Choose features to include in the model:",
647
+ feature_cols,
648
+ default=feature_cols
649
+ )
650
+
651
+ if selected_features:
652
+ # Ensure no NaN values in features
653
+ df_model = df_reviews.dropna(subset=selected_features + ['rating_int'])
654
+
655
+ X = df_model[selected_features]
656
+ y = df_model['rating_int']
657
+
658
+ # Split data
659
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
660
+
661
+ # Fit regression model
662
+ model = LinearRegression()
663
+ model.fit(X_train, y_train)
664
+
665
+ # Create 3D visualization if exactly 2 features are selected
666
+ if len(selected_features) == 2:
667
+ st.subheader("3D Visualization of Selected Features")
668
+ fig = px.scatter_3d(df_model.sample(min(1000, len(df_model))),
669
+ x=selected_features[0],
670
+ y=selected_features[1],
671
+ z='rating_int',
672
+ title='Review Features in 3D Space',
673
+ template="plotly_dark")
674
+ fig.update_layout(
675
+ title_x=0.5,
676
+ title_font_size=20,
677
+ scene = dict(
678
+ xaxis_title=selected_features[0].replace('_', ' ').title(),
679
+ yaxis_title=selected_features[1].replace('_', ' ').title(),
680
+ zaxis_title='Rating'
681
+ )
682
+ )
683
+ st.plotly_chart(fig, use_container_width=True)
684
+
685
+ # Show model metrics
686
+ st.subheader("Model Performance")
687
+ col1, col2, col3 = st.columns(3)
688
+ with col1:
689
+ st.metric("Training R²", f"{model.score(X_train, y_train):.3f}")
690
+ with col2:
691
+ st.metric("Testing R²", f"{model.score(X_test, y_test):.3f}")
692
+ with col3:
693
+ st.metric("RMSE", f"{np.sqrt(mean_squared_error(y_test, model.predict(X_test))):.3f}")
694
+
695
+ # Show coefficients
696
+ st.subheader("Model Coefficients")
697
+ coef_df = pd.DataFrame({
698
+ 'Feature': X.columns,
699
+ 'Coefficient': model.coef_
700
+ })
701
+ st.dataframe(coef_df)
702
+
703
+ # Show example code for multiple linear regression
704
+ with st.expander("Example Code: Multiple Linear Regression"):
705
+ st.code('''
706
+ # Perform multiple linear regression
707
+ from sklearn.model_selection import train_test_split
708
+ from sklearn.metrics import mean_squared_error, r2_score
709
+
710
+ def multiple_linear_regression(df, feature_cols, target_col, test_size=0.2, random_state=42):
711
+ """
712
+ Perform multiple linear regression on any DataFrame.
713
+
714
+ Parameters:
715
+ -----------
716
+ df : pandas.DataFrame
717
+ Input DataFrame containing the features
718
+ feature_cols : list of str
719
+ Names of the columns to use as independent variables
720
+ target_col : str
721
+ Name of the column to use as dependent variable
722
+ test_size : float, optional
723
+ Proportion of data to use for testing
724
+ random_state : int, optional
725
+ Random seed for reproducibility
726
+
727
+ Returns:
728
+ --------
729
+ tuple
730
+ (model, metrics, coef_df, fig) where:
731
+ - model is the fitted LinearRegression object
732
+ - metrics is a dictionary of performance metrics
733
+ - coef_df is a DataFrame of feature coefficients
734
+ - fig is the plotly figure object (if 2 features selected)
735
+ """
736
+ # Handle missing values by dropping them
737
+ df_clean = df.dropna(subset=feature_cols + [target_col])
738
+
739
+ if len(df_clean) == 0:
740
+ raise ValueError("No valid data points after removing missing values")
741
+
742
+ # Prepare data
743
+ X = df_clean[feature_cols]
744
+ y = df_clean[target_col]
745
+
746
+ # Split data
747
+ X_train, X_test, y_train, y_test = train_test_split(
748
+ X, y, test_size=test_size, random_state=random_state)
749
+
750
+ # Fit model
751
+ model = LinearRegression()
752
+ model.fit(X_train, y_train)
753
+
754
+ # Make predictions
755
+ y_train_pred = model.predict(X_train)
756
+ y_test_pred = model.predict(X_test)
757
+
758
+ # Calculate metrics
759
+ metrics = {
760
+ 'train_r2': r2_score(y_train, y_train_pred),
761
+ 'test_r2': r2_score(y_test, y_test_pred),
762
+ 'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
763
+ 'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred))
764
+ }
765
+
766
+ # Create coefficient DataFrame
767
+ coef_df = pd.DataFrame({
768
+ 'Feature': feature_cols,
769
+ 'Coefficient': model.coef_,
770
+ 'Absolute_Impact': np.abs(model.coef_)
771
+ }).sort_values('Absolute_Impact', ascending=False)
772
+
773
+ # Create visualization if exactly 2 features are selected
774
+ fig = None
775
+ if len(feature_cols) == 2:
776
+ fig = px.scatter_3d(
777
+ df_clean.sample(min(1000, len(df_clean))),
778
+ x=feature_cols[0],
779
+ y=feature_cols[1],
780
+ z=target_col,
781
+ title=f'Relationship between {feature_cols[0]}, {feature_cols[1]}, and {target_col}',
782
+ template="plotly_dark"
783
+ )
784
+
785
+ # Add regression plane
786
+ x_range = np.linspace(df_clean[feature_cols[0]].min(), df_clean[feature_cols[0]].max(), 20)
787
+ y_range = np.linspace(df_clean[feature_cols[1]].min(), df_clean[feature_cols[1]].max(), 20)
788
+ x_grid, y_grid = np.meshgrid(x_range, y_range)
789
+
790
+ z_grid = (model.intercept_ +
791
+ model.coef_[0] * x_grid +
792
+ model.coef_[1] * y_grid)
793
+
794
+ fig.add_trace(go.Surface(
795
+ x=x_grid,
796
+ y=y_grid,
797
+ z=z_grid,
798
+ opacity=0.5,
799
+ showscale=False
800
+ ))
801
+
802
+ return model, metrics, coef_df, fig
803
 
804
+ # Usage
805
+ model, metrics, coef_df, fig = multiple_linear_regression(
805
+ df_reviews,
806
+ ['word_count', 'sentence_count', 'confidence_int'],
807
+ 'rating_int'
808
+ )
809
+ print(f"Training R²: {metrics['train_r2']:.3f}")
810
+ print(f"Testing R²: {metrics['test_r2']:.3f}")
811
+ print(f"RMSE: {metrics['test_rmse']:.3f}")
813
+ print(coef_df)
814
+ ''')
815
+
816
+ except Exception as e:
817
+ st.error(f"Error in model training: {str(e)}")
818
+ st.write("Please check the data quality and try again.")
819
 
820
+ except Exception as e:
821
+ st.error(f"Error in data processing: {str(e)}")
822
+ st.write("Please check the data format and try again.")
823
+
824
+ # Practice Exercises
825
+ st.header("Practice Exercises")
826
+
827
+ # Add new section for writing prompts
828
+ st.subheader("Writing Prompts for Analyzing Linear Regression Results")
829
+ st.write("""
830
+ Use these prompts to help you interpret and write about your linear regression results:
831
+
832
+ 1. **Model Fit and R-squared:**
833
+ - "The model explains [R² × 100]% of the variance in [dependent variable], suggesting [strong/moderate/weak] predictive power."
834
+ - "With an R-squared of [value], we can conclude that [interpretation of model fit]."
835
+ - "The relatively [high/low] R-squared value indicates that [interpretation of model's explanatory power]."
836
+
837
+ 2. **Statistical Significance and p-values:**
838
+ - "The p-value of [value] for [feature] suggests that this relationship is [statistically significant/not significant]."
839
+ - "Given the p-value of [value], we [can/cannot] reject the null hypothesis that [interpretation]."
840
+ - "The statistical significance (p = [value]) indicates that [interpretation of relationship]."
841
+
842
+ 3. **Coefficients and Their Meaning:**
843
+ - "For each unit increase in [independent variable], [dependent variable] [increases/decreases] by [coefficient value] units."
844
+ - "The coefficient of [value] for [feature] suggests that [interpretation of relationship]."
845
+ - "The [positive/negative] coefficient indicates that [interpretation of direction of relationship]."
846
+
847
+ 4. **Uncertainty and Standard Errors:**
848
+ - "The standard error of [value] for [feature] indicates [interpretation of precision]."
849
+ - "The relatively [small/large] standard error suggests that [interpretation of estimate reliability]."
850
+ - "The uncertainty in our coefficient estimates, as shown by the standard errors, [interpretation of confidence in results]."
851
+
852
+ 5. **Confidence Intervals:**
853
+ - "We are 95% confident that the true coefficient for [feature] lies between [lower bound] and [upper bound]."
854
+ - "The confidence interval [includes/does not include] zero, suggesting that [interpretation of significance]."
855
+ - "The [narrow/wide] confidence interval indicates [interpretation of precision]."
856
+
857
+ 6. **Practical Significance:**
858
+ - "While the relationship is statistically significant, the effect size of [value] suggests [interpretation of practical importance]."
859
+ - "The coefficient of [value] indicates that [interpretation of real-world impact]."
860
+ - "In practical terms, this means that [interpretation of practical implications]."
861
+
862
+ 7. **Model Limitations:**
863
+ - "The model's assumptions of [assumptions] may not hold in this case because [explanation]."
864
+ - "Potential limitations of our analysis include [list limitations]."
865
+ - "We should be cautious in interpreting these results because [explanation of limitations]."
866
+
867
+ 8. **Recommendations:**
868
+ - "Based on our analysis, we recommend [specific action] because [explanation]."
869
+ - "The results suggest that [interpretation] and therefore [recommendation]."
870
+ - "To improve the model, we could [suggestions for improvement]."
871
+ """)
872
+
873
+ with st.expander("Exercise 1: Simple Linear Regression"):
874
+ st.write("""
875
+ 1. Create a function that performs simple linear regression on any DataFrame
876
+ 2. The function should:
877
+ - Take a DataFrame and column names as input
878
+ - Handle missing values appropriately
879
+ - Calculate and return R-squared value
880
+ - Create a visualization of the relationship
881
+ 3. Test your function with different features from the dataset
882
+ """)
883
+
884
+ st.code(r'''
885
+ # Solution: Generic Simple Linear Regression Function
886
+ import pandas as pd
887
+ import numpy as np
888
+ from sklearn.linear_model import LinearRegression
889
+ import plotly.express as px
890
+ import plotly.graph_objects as go
891
+
892
+ def simple_linear_regression(df, x_col, y_col, title=None):
893
+ """
894
+ Perform simple linear regression on any DataFrame.
895
+
896
+ Parameters:
897
+ -----------
898
+ df : pandas.DataFrame
899
+ Input DataFrame containing the features
900
+ x_col : str
901
+ Name of the column to use as independent variable
902
+ y_col : str
903
+ Name of the column to use as dependent variable
904
+ title : str, optional
905
+ Title for the plot. If None, will use column names
906
+
907
+ Returns:
908
+ --------
909
+ tuple
910
+ (model, r2_score, fig) where:
911
+ - model is the fitted LinearRegression object
912
+ - r2_score is the R-squared value
913
+ - fig is the plotly figure object
914
+ """
915
+ # Handle missing values by dropping them
916
+ df_clean = df.dropna(subset=[x_col, y_col])
917
+
918
+ if len(df_clean) == 0:
919
+ raise ValueError("No valid data points after removing missing values")
920
+
921
+ # Prepare data
922
+ X = df_clean[[x_col]]
923
+ y = df_clean[y_col]
924
+
925
+ # Fit model
926
+ model = LinearRegression()
927
+ model.fit(X, y)
928
+
929
+ # Calculate R-squared
930
+ r2_score = model.score(X, y)
931
+
932
+ # Create visualization
933
+ fig = go.Figure()
934
+
935
+ # Add scatter plot
936
+ fig.add_trace(go.Scatter(
937
+ x=X[x_col],
938
+ y=y,
939
+ mode='markers',
940
+ name='Data Points',
941
+ marker=dict(size=8, opacity=0.6)
942
+ ))
943
+
944
+ # Add regression line
945
+ x_range = np.linspace(X[x_col].min(), X[x_col].max(), 100)
946
+ y_pred = model.predict(x_range.reshape(-1, 1))
947
+
948
+ fig.add_trace(go.Scatter(
949
+ x=x_range,
950
+ y=y_pred,
951
+ mode='lines',
952
+ name='Regression Line',
953
+ line=dict(color='red', width=2)
954
+ ))
955
+
956
+ # Update layout
957
+ title = title or f'{x_col} vs {y_col}'
958
+ fig.update_layout(
959
+ title=title,
960
+ xaxis_title=x_col,
961
+ yaxis_title=y_col,
962
+ template="plotly_dark",
963
+ showlegend=True
964
+ )
965
+
966
+ return model, r2_score, fig
967
+
968
+ # Example usage:
969
+ # Load your data
970
+ df = pd.read_csv('your_data.csv')
971
+
972
+ # Try different feature pairs
973
+ feature_pairs = [
974
+ ('word_count', 'rating_int'),
975
+ ('confidence_int', 'rating_int'),
976
+ ('avg_word_length', 'rating_int')
977
+ ]
978
+
979
+ # Analyze each pair
980
+ for x_col, y_col in feature_pairs:
981
+ try:
982
+ model, r2, fig = simple_linear_regression(df, x_col, y_col)
983
+ print(f"\nAnalysis of {x_col} vs {y_col}:")
984
+ print(f"R-squared: {r2:.3f}")
985
+ print(f"Slope: {model.coef_[0]:.3f}")
986
+ print(f"Intercept: {model.intercept_:.3f}")
987
+ fig.show()
988
+ except Exception as e:
989
+ print(f"Error analyzing {x_col} vs {y_col}: {str(e)}")
990
+ ''')
991
+
992
+ with st.expander("Exercise 2: Multiple Linear Regression"):
993
+ st.write("""
994
+ 1. Create a function that performs multiple linear regression on any DataFrame
995
+ 2. The function should:
996
+ - Take a DataFrame, a list of feature columns, and a target column as input
997
+ - Handle missing values appropriately
998
+ - Split data into training and test sets
999
+ - Calculate and return performance metrics
1000
+ - Create visualizations of the results
1001
+ 3. Test your function with different combinations of features
1002
+ """)
1003
+
1004
+ st.code(r'''
1005
+ # Solution: Generic Multiple Linear Regression Function
1006
+ import pandas as pd
1007
+ import numpy as np
1008
+ from sklearn.linear_model import LinearRegression
1009
+ from sklearn.model_selection import train_test_split
1010
+ from sklearn.metrics import mean_squared_error, r2_score
1011
+ import plotly.express as px
1012
+ import plotly.graph_objects as go
1013
+
1014
+ def multiple_linear_regression(df, feature_cols, target_col, test_size=0.2, random_state=42):
1015
+ """
1016
+ Perform multiple linear regression on any DataFrame.
1017
+
1018
+ Parameters:
1019
+ -----------
1020
+ df : pandas.DataFrame
1021
+ Input DataFrame containing the features
1022
+ feature_cols : list of str
1023
+ Names of the columns to use as independent variables
1024
+ target_col : str
1025
+ Name of the column to use as dependent variable
1026
+ test_size : float, optional
1027
+ Proportion of data to use for testing
1028
+ random_state : int, optional
1029
+ Random seed for reproducibility
1030
+
1031
+ Returns:
1032
+ --------
1033
+ tuple
1034
+ (model, metrics, coef_df, fig) where:
1035
+ - model is the fitted LinearRegression object
1036
+ - metrics is a dictionary of performance metrics
1037
+ - coef_df is a DataFrame of feature coefficients
1038
+ - fig is the plotly figure object (if 2 features selected)
1039
+ """
1040
+ # Handle missing values by dropping them
1041
+ df_clean = df.dropna(subset=feature_cols + [target_col])
1042
+
1043
+ if len(df_clean) == 0:
1044
+ raise ValueError("No valid data points after removing missing values")
1045
+
1046
+ # Prepare data
1047
+ X = df_clean[feature_cols]
1048
+ y = df_clean[target_col]
1049
+
1050
+ # Split data
1051
+ X_train, X_test, y_train, y_test = train_test_split(
1052
+ X, y, test_size=test_size, random_state=random_state)
1053
+
1054
+ # Fit model
1055
+ model = LinearRegression()
1056
+ model.fit(X_train, y_train)
1057
+
1058
+ # Make predictions
1059
+ y_train_pred = model.predict(X_train)
1060
+ y_test_pred = model.predict(X_test)
1061
+
1062
+ # Calculate metrics
1063
+ metrics = {
1064
+ 'train_r2': r2_score(y_train, y_train_pred),
1065
+ 'test_r2': r2_score(y_test, y_test_pred),
1066
+ 'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
1067
+ 'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred))
1068
+ }
1069
+
1070
+ # Create coefficient DataFrame
1071
+ coef_df = pd.DataFrame({
1072
+ 'Feature': feature_cols,
1073
+ 'Coefficient': model.coef_,
1074
+ 'Absolute_Impact': np.abs(model.coef_)
1075
+ }).sort_values('Absolute_Impact', ascending=False)
1076
+
1077
+ # Create visualization if exactly 2 features are selected
1078
+ fig = None
1079
+ if len(feature_cols) == 2:
1080
+ fig = px.scatter_3d(
1081
+ df_clean.sample(min(1000, len(df_clean))),
1082
+ x=feature_cols[0],
1083
+ y=feature_cols[1],
1084
+ z=target_col,
1085
+ title=f'Relationship between {feature_cols[0]}, {feature_cols[1]}, and {target_col}',
1086
+ template="plotly_dark"
1087
+ )
1088
+
1089
+ # Add regression plane
1090
+ x_range = np.linspace(df_clean[feature_cols[0]].min(), df_clean[feature_cols[0]].max(), 20)
1091
+ y_range = np.linspace(df_clean[feature_cols[1]].min(), df_clean[feature_cols[1]].max(), 20)
1092
+ x_grid, y_grid = np.meshgrid(x_range, y_range)
1093
+
1094
+ z_grid = (model.intercept_ +
1095
+ model.coef_[0] * x_grid +
1096
+ model.coef_[1] * y_grid)
1097
+
1098
+ fig.add_trace(go.Surface(
1099
+ x=x_grid,
1100
+ y=y_grid,
1101
+ z=z_grid,
1102
+ opacity=0.5,
1103
+ showscale=False
1104
+ ))
1105
+
1106
+ return model, metrics, coef_df, fig
1107
+
1108
+ # Example usage:
1109
+ # Load your data
1110
+ df = pd.read_csv('your_data.csv')
1111
+
1112
+ # Define feature sets to try
1113
+ feature_sets = [
1114
+ ['word_count', 'confidence_int'],
1115
+ ['word_count', 'sentence_count', 'confidence_int'],
1116
+ ['word_count', 'sentence_count', 'avg_word_length', 'avg_sentence_length', 'confidence_int']
1117
+ ]
1118
+
1119
+ # Analyze each feature set
1120
+ for features in feature_sets:
1121
+ try:
1122
+ print(f"\nAnalyzing features: {features}")
1123
+ model, metrics, coef_df, fig = multiple_linear_regression(
1124
+ df, features, 'rating_int')
1125
+
1126
+ # Print metrics
1127
+ print("\nPerformance Metrics:")
1128
+ for metric, value in metrics.items():
1129
+ print(f"{metric}: {value:.3f}")
1130
+
1131
+ # Print coefficients
1132
+ print("\nFeature Coefficients:")
1133
+ print(coef_df)
1134
+
1135
+ # Show visualization if available
1136
+ if fig is not None:
1137
+ fig.show()
1138
 
1139
  except Exception as e:
1140
+ print(f"Error analyzing features {features}: {str(e)}")
1141
+ ''')
1142
+
1143
+ # Weekly Assignment
1144
+ username = st.session_state.get("username", "Student")
1145
+ st.header(f"{username}'s Weekly Assignment")
1146
+
1147
+ if username == "manxiii":
1148
+ st.markdown("""
1149
+ Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
1150
+ 1. Complete the feature engineering pipeline for the ICLR dataset
1151
+ 2. Build both simple and multiple linear regression models
1152
+ 3. Compare model performance and interpret results
1153
+ 4. Submit your findings in a Jupyter notebook
1154
+
1155
+ **Due Date:** End of Week 5
1156
+ """)
1157
+ elif username == "zhu":
1158
+ st.markdown("""
1159
+ Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
1160
+ 1. Implement the complete machine learning workflow
1161
+ 2. Create insightful visualizations of model results
1162
+ 3. Draw conclusions from your analysis
1163
+ 4. Submit your work in a Jupyter notebook
1164
+
1165
+ **Due Date:** End of Week 5
1166
+ """)
1167
+ elif username == "WK":
1168
+ st.markdown("""
1169
+ Hello **WK**, here is your Assignment 5: Machine Learning Analysis.
1170
+ 1. Complete the feature engineering pipeline
1171
+ 2. Build and evaluate linear regression models
1172
+ 3. Analyze patterns in the data
1173
+ 4. Submit your findings
1174
+
1175
+ **Due Date:** End of Week 5
1176
+ """)
1177
+ else:
1178
+ st.markdown(f"""
1179
+ Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
1180
+ 1. Complete the feature engineering pipeline
1181
+ 2. Build and evaluate linear regression models
1182
+ 3. Analyze patterns in the data
1183
+ 4. Submit your findings
1184
+
1185
+ **Due Date:** End of Week 5
1186
+ """)