import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import string

# Define a simple set of common English stop words
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'this', 'to', 'was',
    'were', 'will', 'with', 'but', 'they', 'have', 'had', 'what', 'when',
    'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can',
    'just', 'should', 'now'
}
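
# Note: this deliberately small set keeps the lesson dependency-free; real
# projects usually pull a fuller stop-word list from a library such as NLTK
# or spaCy.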

def simple_tokenize(text):
    """Simple tokenization function that splits on whitespace and removes punctuation"""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split on whitespace
    return text.split()

def remove_stop_words(tokens):
    """Remove stop words from a list of tokens"""
    return [word for word in tokens if word.lower() not in STOP_WORDS]
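
# A minimal usage sketch of the two helpers above (illustrative values):
#   simple_tokenize("The Fox!")       -> ['the', 'fox']
#   remove_stop_words(['the', 'fox']) -> ['fox']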

def show():
    st.title("Week 4: Introduction to Natural Language Processing")
    
    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this course, you'll learn fundamental Natural Language Processing (NLP) concepts by exploring a fascinating real-world question: 
    What is the effect of releasing a preprint of a paper before it is submitted for peer review?
    
    Using the ICLR (International Conference on Learning Representations) database - which contains submissions, reviews, and author profiles 
    from 2017-2022 - you'll develop practical NLP skills while investigating potential biases and patterns in academic publishing.
    """)
    
    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding Text as Data: How computers represent and work with text
    2. Text Processing Fundamentals: Basic cleaning and normalization
    3. Quantitative Text Analysis: Measuring and comparing text features
    4. Tokenization Approaches: Breaking text into meaningful units
    5. Text Visualization Techniques: Creating insightful visual representations
    6. From Analysis to Insights: Drawing evidence-based conclusions
    """)

    # Module 1: Text as Data
    st.header("Module 1: Text as Data")
    st.write("""
    When we look at text like customer reviews or academic papers, we naturally understand the meaning. 
    But how can a computer understand this?
    
    Key Concept: Text can be treated as data that we can analyze quantitatively.
    Unlike numerical data (age, price, temperature) that has inherent mathematical properties, 
    text data needs to be transformed before we can analyze it.
    """)

    # Interactive Example
    st.subheader("Interactive Example: Text Tokenization")
    st.write("Let's try tokenizing some text:")
    
    example_text = st.text_area(
        "Enter some text to tokenize:",
        "The quick brown fox jumps over the lazy dog."
    )
    
    if st.button("Tokenize Text"):
        tokens = simple_tokenize(example_text)
        st.write("Tokens:", tokens)
        st.write("Number of tokens:", len(tokens))

    # Module 2: Text Processing
    st.header("Module 2: Text Processing")
    st.write("""
    Before we can analyze text, we need to clean and normalize it. This includes:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Basic text normalization
    """)

    # Interactive Text Processing
    st.subheader("Try Text Processing")
    st.write("""
    Let's process some text using different techniques:
    """)
    
    process_text = st.text_area(
        "Enter text to process:",
        "The quick brown fox jumps over the lazy dog.",
        key="process_text"
    )
    
    col1, col2 = st.columns(2)
    
    with col1:
        if st.button("Remove Stop Words"):
            tokens = simple_tokenize(process_text)
            filtered_words = remove_stop_words(tokens)
            st.write("After removing stop words:", filtered_words)
    
    with col2:
        if st.button("Remove Punctuation"):
            no_punct = process_text.translate(str.maketrans('', '', string.punctuation))
            st.write("After removing punctuation:", no_punct)

    # Module 3: Text Visualization
    st.header("Module 3: Text Visualization")
    st.write("""
    Visual representations help us identify patterns across text data.
    Common visualization methods include:
    - Word clouds
    - Frequency distributions
    - Sentiment over time
    - Topic clusters
    """)

    # Interactive Word Cloud
    st.subheader("Create a Word Cloud")
    st.write("""
    Let's create a word cloud from some text:
    """)
    
    wordcloud_text = st.text_area(
        "Enter text for word cloud:",
        "The quick brown fox jumps over the lazy dog. The fox is quick and brown. The dog is lazy.",
        key="wordcloud_text"
    )
    
    if st.button("Generate Word Cloud"):
        # Create and generate a word cloud image
        wordcloud = WordCloud().generate(wordcloud_text)
        
        # Display the word cloud
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        st.pyplot(plt)
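
    # "Frequency distributions" are listed above but not demonstrated; this is
    # a minimal sketch using the same word-cloud text: the ten most frequent
    # tokens after stop-word removal. Illustrative addition.
    if st.button("Show Frequency Distribution"):
        freq_tokens = remove_stop_words(simple_tokenize(wordcloud_text))
        freq = pd.Series(Counter(freq_tokens)).sort_values(ascending=False)
        st.bar_chart(freq.head(10))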

    # Practice Exercises
    st.header("Practice Exercises")
    
    with st.expander("Exercise 1: Text Processing"):
        st.write("""
        1. Load a sample text
        2. Remove stop words and punctuation
        3. Create a word cloud
        4. Analyze word frequencies
        """)
        
        st.code("""
        # Solution
        from wordcloud import WordCloud
        import string
        
        # Sample text
        text = "Your text here"
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Remove stop words
        tokens = text.split()
        filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]
        
        # Create word cloud
        wordcloud = WordCloud().generate(' '.join(filtered_words))
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()
        """)
    
    with st.expander("Exercise 2: Text Analysis"):
        st.write("""
        1. Calculate basic text metrics (word count, unique words)
        2. Perform basic text normalization
        3. Compare the results
        4. Visualize the differences
        """)
        
        st.code("""
        # Solution
        def normalize_text(text):
            # Convert to lowercase
            text = text.lower()
            # Remove punctuation
            text = text.translate(str.maketrans('', '', string.punctuation))
            return text
        
        # Sample text
        text = "Running, runs, ran, better, good"
        
        # Normalize text
        normalized = normalize_text(text)
        words = normalized.split()
        
        # Compare results
        print(f"Original: {text}")
        print(f"Normalized: {normalized}")
        print(f"Word count: {len(words)}")
        print(f"Unique words: {len(set(words))}")
        """)

    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 4: Python Basics.
        1. Finish looking for 3 more research papers and add them to your literate review
        2. Finish literate review for the 2 papers you have already summerized
        3. Add the plots from the previous week to the dataset section and add a description
        4. link to your paper here: https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13

        **Due Date:** End of Week 4
        """)
    elif username == "zhu":
        st.markdown("""
        Hello **zhu**, here is your Assignment 4: NLP Basics.
        """)
    elif username == "WK":
        st.markdown("""
        Hello **WK**, here is your Assignment 4: NLP Basics.


        **Due Date:** End of Week 4
        """)
    else:
        st.markdown(f"""
        Hello **{username}**, your Assignment 4 has not yet been released. Please message the instructor.
        """)