File size: 13,454 Bytes
faeb953
 
 
 
 
 
 
 
 
ae38d1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
faeb953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae38d1c
faeb953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae38d1c
 
 
faeb953
ae38d1c
 
 
faeb953
ae38d1c
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
faeb953
 
ae38d1c
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
 
 
 
 
 
faeb953
ae38d1c
 
faeb953
ae38d1c
faeb953
 
ae38d1c
faeb953
 
 
ae38d1c
faeb953
ae38d1c
faeb953
ae38d1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
 
 
 
faeb953
ae38d1c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from nltk.tokenize import word_tokenize
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import os

# Global Matplotlib/Seaborn styling, applied once as a side effect of
# importing this module: Matplotlib is set to its 'default' style, then the
# Seaborn "whitegrid" theme with the "husl" palette is applied.
# NOTE(review): the plotting helpers visible in this file build Plotly
# figures, which presumably do not pick up these settings -- confirm whether
# this setup is still needed.
plt.style.use('default')
sns.set_theme(style="whitegrid", palette="husl")

def load_data():
    """Read the four project CSV files into DataFrames.

    The files are looked up in a ``Data`` directory located two levels above
    the directory that contains this file.

    Returns:
        A 4-tuple of DataFrames in the order (reviews, submissions, decision,
        submission_keyword). If any file is missing, the problem is reported
        with ``st.error`` and ``(None, None, None, None)`` is returned.
    """
    data_dir = Path(__file__).parent.parent.parent / "Data"
    csv_names = (
        "reviews.csv",
        "Submissions.csv",
        "decision.csv",
        "submission_keyword.csv",
    )

    try:
        # Files are read in the listed order; the first missing one aborts.
        frames = tuple(pd.read_csv(data_dir / csv_name) for csv_name in csv_names)
    except FileNotFoundError as e:
        st.error(f"Data files not found. Please make sure the data files are in the correct location: {data_dir}")
        st.error(f"Error details: {str(e)}")
        return None, None, None, None

    return frames

def create_feature_plot(df, x_col, y_col, title):
    """Build an interactive Plotly scatter plot of ``y_col`` against ``x_col``.

    Args:
        df: Data passed to ``px.scatter``.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        title: Figure title.

    Returns:
        The configured Plotly figure.
    """
    # Axis labels are the column names with underscores turned into spaces
    # and each word capitalised, e.g. "word_count" -> "Word Count".
    axis_labels = {
        column: column.replace('_', ' ').title()
        for column in (x_col, y_col)
    }

    scatter = px.scatter(
        df,
        x=x_col,
        y=y_col,
        title=title,
        labels=axis_labels,
        template="plotly_white",
    )

    layout_options = dict(
        title_x=0.5,
        title_font_size=20,
        showlegend=True,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    scatter.update_layout(**layout_options)
    return scatter

def create_correlation_heatmap(df, columns):
    """Build a Plotly heatmap of the pairwise correlations of ``columns``.

    Args:
        df: DataFrame containing the columns of interest.
        columns: Column names to include in the correlation matrix.

    Returns:
        A Plotly figure whose colour scale is fixed to the range [-1, 1].
    """
    corr_matrix = df[columns].corr()
    axis_names = corr_matrix.columns

    heatmap = go.Heatmap(
        z=corr_matrix,
        x=axis_names,
        y=axis_names,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
    )
    figure = go.Figure(data=heatmap)

    layout_options = dict(
        title='Feature Correlation Heatmap',
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    figure.update_layout(**layout_options)
    return figure

def _count_words(text):
    """Return the number of tokens in ``text``.

    Uses NLTK's ``word_tokenize``. When the tokenizer's data files are not
    installed NLTK raises ``LookupError``; in that case the count falls back
    to whitespace splitting so the page keeps rendering.
    """
    try:
        return len(word_tokenize(text))
    except LookupError:
        return len(text.split())


def _count_sentences(text):
    """Return the number of non-empty '.'-separated segments in ``text``.

    This is a simple heuristic (a decimal such as "3.14" counts as two
    segments), but unlike ``len(text.split('.'))`` it does not count the
    empty string that follows a trailing period.
    """
    return sum(1 for segment in text.split('.') if segment.strip())


def show():
    """Render the Week 5 page: introduction to ML and linear regression.

    The page consists of a static introduction, three data-driven modules
    (data exploration, feature engineering, linear regression), two practice
    exercises and a per-user weekly assignment.

    The review data comes from ``load_data()``. If it reports missing files
    (by returning ``None`` values) rendering stops after the introduction,
    because ``load_data()`` has already displayed the error. Any other
    exception raised while rendering the data-driven part is shown with
    ``st.error`` instead of crashing the app.

    Side effects:
        Adds ``word_count`` and ``sentence_count`` columns to the reviews
        DataFrame returned by ``load_data()``.
    """
    st.title("Week 5: Introduction to Machine Learning and Linear Regression")
    
    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this week, we'll explore machine learning through a fascinating real-world challenge: The Academic Publishing Crisis.
    
    Imagine you're the program chair for a prestigious AI conference. You've just received 5,000 paper submissions, and you need to:
    - Decide which papers to accept (only 20% can be accepted)
    - Ensure fair and consistent reviews
    - Understand what makes reviewers confident in their assessments
    
    The Problem: Human reviewers are inconsistent. Some are harsh, others lenient. Some write detailed reviews, others just a few sentences. 
    How can we use data to understand and improve this process?
    
    **Your Mission: Build a machine learning system to analyze review patterns and predict paper acceptance!**
    """)
    
    # Learning Path
    st.subheader("Key Concepts You'll Learn")
    st.write("""
    1. **Linear Regression (线性回归):**
       - Definition: A statistical method that models the relationship between a dependent variable and one or more independent variables
       - Real-world example: Predicting house prices based on size and location
    
    2. **Correlation Analysis (相关性分析):**
       - Definition: Statistical measure that shows how strongly two variables are related
       - Range: -1 (perfect negative correlation) to +1 (perfect positive correlation)
    
    3. **Reading Linear Regression Output (解读线性回归结果):**
       - R-squared (R²): Proportion of variance explained by the model (0-1)
       - p-value: Probability that the observed relationship occurred by chance
       - Coefficients (系数): How much the dependent variable changes with a one-unit change in the independent variable
       - Standard errors: Uncertainty in coefficient estimates
       - Confidence intervals: Range where true coefficient likely lies
    """)

    # Everything below depends on the review data.
    try:
        # Only the reviews table is used on this page.
        df_reviews, _df_submissions, _df_dec, _df_keyword = load_data()
        if df_reviews is None:
            # load_data() has already shown the "files not found" errors;
            # continuing would only raise a confusing TypeError on None.
            return
        
        # Module 1: Data Exploration
        st.header("Module 1: Data Exploration")
        st.write("Let's explore our dataset to understand the review patterns:")
        
        # Create features from review text. Missing reviews are treated as
        # empty text; str(NaN) would otherwise be counted as the word "nan".
        review_texts = df_reviews['review'].apply(
            lambda value: '' if pd.isna(value) else str(value))
        df_reviews['word_count'] = review_texts.apply(lambda text: len(text.split()))
        df_reviews['sentence_count'] = review_texts.apply(_count_sentences)
        
        # Show basic statistics
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Total Reviews", len(df_reviews))
            st.metric("Average Rating", f"{df_reviews['rating_int'].mean():.2f}")
        with col2:
            st.metric("Average Word Count", f"{df_reviews['word_count'].mean():.0f}")
            st.metric("Average Confidence", f"{df_reviews['confidence_int'].mean():.2f}")
        
        # Create interactive visualizations
        st.subheader("Review Length vs Rating")
        fig = create_feature_plot(df_reviews, 'word_count', 'rating_int', 
                                'Relationship between Review Length and Rating')
        st.plotly_chart(fig, use_container_width=True)
        
        # Correlation analysis
        st.subheader("Feature Correlations")
        corr_fig = create_correlation_heatmap(df_reviews, 
                                            ['word_count', 'rating_int', 'confidence_int'])
        st.plotly_chart(corr_fig, use_container_width=True)
        
        # Module 2: Feature Engineering
        st.header("Module 2: Feature Engineering")
        st.write("""
        Let's create more sophisticated features from our review data:
        - Review length (word count)
        - Review rating
        - Reviewer confidence
        - Number of keywords in the paper
        """)
        
        # Interactive Feature Engineering
        st.subheader("Try Feature Engineering")
        review_text = st.text_area(
            "Enter a review to analyze:",
            "This paper introduces a novel approach to machine learning. The methodology is sound and the results are promising.",
            key="review_text"
        )
        
        if st.button("Extract Features"):
            # Calculate features
            word_count = _count_words(review_text)
            sentence_count = _count_sentences(review_text)
            # Empty or punctuation-only input has no sentences; avoid
            # dividing by zero in that case.
            words_per_sentence = word_count / sentence_count if sentence_count else 0.0
            
            # Create a nice display of features
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Word Count", word_count)
            with col2:
                st.metric("Sentence Count", sentence_count)
            with col3:
                st.metric("Average Words per Sentence", f"{words_per_sentence:.1f}")
        
        # Module 3: Linear Regression Analysis
        st.header("Module 3: Linear Regression Analysis")
        st.write("""
        Let's build a linear regression model to predict paper ratings based on review features.
        """)
        
        # Prepare data for modeling. Rows with a missing feature or target
        # are dropped because LinearRegression rejects NaN values.
        feature_cols = ['word_count', 'confidence_int']
        model_df = df_reviews[feature_cols + ['rating_int']].dropna()
        X = model_df[feature_cols]
        y = model_df['rating_int']
        
        # Fit regression model
        model = LinearRegression()
        model.fit(X, y)
        
        # Create 3D visualization of the regression.
        # The sample size is capped at the number of rows (sampling more rows
        # than exist raises ValueError) and seeded so the plot stays the same
        # across Streamlit reruns.
        st.subheader("3D Visualization of Review Features")
        plot_sample = df_reviews.sample(n=min(1000, len(df_reviews)), random_state=42)
        fig = px.scatter_3d(plot_sample, 
                           x='word_count', 
                           y='confidence_int', 
                           z='rating_int',
                           title='Review Features in 3D Space',
                           labels={
                               'word_count': 'Word Count',
                               'confidence_int': 'Confidence',
                               'rating_int': 'Rating'
                           })
        fig.update_layout(
            title_x=0.5,
            title_font_size=20,
            scene = dict(
                xaxis_title='Word Count',
                yaxis_title='Confidence',
                zaxis_title='Rating'
            )
        )
        st.plotly_chart(fig, use_container_width=True)
        
        # Show model metrics
        st.subheader("Model Performance")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("R-squared", f"{model.score(X, y):.3f}")
        with col2:
            st.metric("Word Count Coefficient", f"{model.coef_[0]:.3f}")
        with col3:
            st.metric("Confidence Coefficient", f"{model.coef_[1]:.3f}")
        
        # Practice Exercises
        st.header("Practice Exercises")
        
        with st.expander("Exercise 1: Feature Engineering"):
            st.write("""
            1. Load the reviews dataset
            2. Create features from review text
            3. Calculate correlation between features
            4. Visualize relationships
            """)
            
            st.code("""
            # Solution
            import pandas as pd
            import numpy as np
            from nltk.tokenize import word_tokenize
            
            # Load data
            df_reviews = pd.read_csv('reviews.csv')
            
            # Create features
            df_reviews['word_count'] = df_reviews['review'].apply(
                lambda x: len(word_tokenize(x)))
            df_reviews['sentence_count'] = df_reviews['review'].apply(
                lambda x: len(x.split('.')))
            
            # Calculate correlation
            correlation = df_reviews[['word_count', 'rating_int', 
                                    'confidence_int']].corr()
            
            # Visualize
            sns.heatmap(correlation, annot=True)
            plt.show()
            """)
        
        with st.expander("Exercise 2: Building a Predictive Model"):
            st.write("""
            1. Prepare features for modeling
            2. Split data into training and test sets
            3. Train a linear regression model
            4. Evaluate model performance
            """)
            
            st.code("""
            # Solution
            from sklearn.model_selection import train_test_split
            from sklearn.linear_model import LinearRegression
            
            # Prepare features
            X = df_reviews[['word_count', 'confidence_int']]
            y = df_reviews['rating_int']
            
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42)
            
            # Train model
            model = LinearRegression()
            model.fit(X_train, y_train)
            
            # Evaluate
            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
            
            print(f"Training R²: {train_score:.3f}")
            print(f"Testing R²: {test_score:.3f}")
            """)

        # Weekly Assignment
        username = st.session_state.get("username", "Student")
        st.header(f"{username}'s Weekly Assignment")
        
        # "manxiii" and "zhu" get tailored task lists; every other user
        # (including "WK", whose text was identical) gets the default one.
        if username == "manxiii":
            st.markdown("""
            Hello **manxiii**, here is your Assignment 5: Machine Learning Analysis.
            1. Complete the feature engineering pipeline for the ICLR dataset
            2. Build a linear regression model to predict paper ratings
            3. Analyze the relationship between review features and acceptance
            4. Submit your findings in a Jupyter notebook

            **Due Date:** End of Week 5
            """)
        elif username == "zhu":
            st.markdown("""
            Hello **zhu**, here is your Assignment 5: Machine Learning Analysis.
            1. Implement the complete machine learning workflow
            2. Create insightful visualizations of model results
            3. Draw conclusions from your analysis
            4. Submit your work in a Jupyter notebook

            **Due Date:** End of Week 5
            """)
        else:
            st.markdown(f"""
            Hello **{username}**, here is your Assignment 5: Machine Learning Analysis.
            1. Complete the feature engineering pipeline
            2. Build and evaluate a linear regression model
            3. Analyze patterns in the data
            4. Submit your findings

            **Due Date:** End of Week 5
            """)
            
    except Exception as e:
        # Page-level boundary: report the problem instead of crashing the app.
        st.error(f"Error loading data: {str(e)}")
        st.write("Please make sure the data files are in the correct location.")