Spaces:
Running
Running
File size: 8,426 Bytes
49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 49e3aec b1b0b70 88ff3a0 cb4004b 88ff3a0 cb4004b 88ff3a0 cb4004b 88ff3a0 cb4004b 88ff3a0 cb4004b 88ff3a0 cb4004b 49e3aec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import string
import io
from contextlib import redirect_stdout
import re
# Common English stop words filtered out by remove_stop_words().
# Stored as a set for O(1) membership tests. (The original literal
# repeated 'the' and 'will'; duplicates are removed here — the set
# contents are unchanged.)
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'this', 'to', 'was',
    'were', 'will', 'with', 'but', 'they', 'have', 'had', 'what', 'when',
    'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can',
    'just', 'should', 'now'
}
def simple_tokenize(text):
    """Lowercase *text*, strip ASCII punctuation, and split on whitespace.

    Returns a (possibly empty) list of token strings. Punctuation removal
    uses str.translate with string.punctuation, so only ASCII punctuation
    characters are dropped.
    """
    no_punct_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(no_punct_table)
    return cleaned.split()
def remove_stop_words(tokens):
    """Return the tokens that are not in the module-level STOP_WORDS set.

    The comparison is case-insensitive (each token is lowercased before
    the lookup); the surviving tokens keep their original casing and order.
    """
    def _keep(word):
        return word.lower() not in STOP_WORDS

    return list(filter(_keep, tokens))
def show():
    """Render the Week 4 NLP lesson page (Streamlit).

    Lays out the whole page top to bottom: a course overview, three
    teaching modules with interactive widgets (tokenization, stop-word
    and punctuation removal, word-cloud generation), two practice
    exercises in expanders, and a per-student assignment section keyed
    on ``st.session_state["username"]``.

    Side effects only (Streamlit UI calls); returns None.
    """
    st.title("Week 4: Introduction to Natural Language Processing")

    # --- Introduction ---
    st.header("Course Overview")
    st.write("""
    In this course, you'll learn fundamental Natural Language Processing (NLP) concepts by exploring a fascinating real-world question:
    What is the effect of releasing a preprint of a paper before it is submitted for peer review?
    Using the ICLR (International Conference on Learning Representations) database - which contains submissions, reviews, and author profiles
    from 2017-2022 - you'll develop practical NLP skills while investigating potential biases and patterns in academic publishing.
    """)

    # --- Learning path ---
    st.subheader("Learning Path")
    st.write("""
    1. Understanding Text as Data: How computers represent and work with text
    2. Text Processing Fundamentals: Basic cleaning and normalization
    3. Quantitative Text Analysis: Measuring and comparing text features
    4. Tokenization Approaches: Breaking text into meaningful units
    5. Text Visualization Techniques: Creating insightful visual representations
    6. From Analysis to Insights: Drawing evidence-based conclusions
    """)

    # --- Module 1: Text as Data ---
    st.header("Module 1: Text as Data")
    st.write("""
    When we look at text like customer reviews or academic papers, we naturally understand the meaning.
    But how can a computer understand this?
    Key Concept: Text can be treated as data that we can analyze quantitatively.
    Unlike numerical data (age, price, temperature) that has inherent mathematical properties,
    text data needs to be transformed before we can analyze it.
    """)

    # Interactive tokenization demo.
    st.subheader("Interactive Example: Text Tokenization")
    st.write("Let's try tokenizing some text:")
    example_text = st.text_area(
        "Enter some text to tokenize:",
        "The quick brown fox jumps over the lazy dog."
    )
    if st.button("Tokenize Text"):
        tokens = simple_tokenize(example_text)
        st.write("Tokens:", tokens)
        st.write("Number of tokens:", len(tokens))

    # --- Module 2: Text Processing ---
    st.header("Module 2: Text Processing")
    st.write("""
    Before we can analyze text, we need to clean and normalize it. This includes:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Basic text normalization
    """)

    # Interactive processing demo: two independent buttons side by side.
    st.subheader("Try Text Processing")
    st.write("""
    Let's process some text using different techniques:
    """)
    process_text = st.text_area(
        "Enter text to process:",
        "The quick brown fox jumps over the lazy dog.",
        key="process_text"
    )
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Remove Stop Words"):
            tokens = simple_tokenize(process_text)
            filtered_words = remove_stop_words(tokens)
            st.write("After removing stop words:", filtered_words)
    with col2:
        if st.button("Remove Punctuation"):
            no_punct = process_text.translate(str.maketrans('', '', string.punctuation))
            st.write("After removing punctuation:", no_punct)

    # --- Module 3: Text Visualization ---
    st.header("Module 3: Text Visualization")
    st.write("""
    Visual representations help us identify patterns across text data.
    Common visualization methods include:
    - Word clouds
    - Frequency distributions
    - Sentiment over time
    - Topic clusters
    """)

    # Interactive word-cloud demo.
    st.subheader("Create a Word Cloud")
    st.write("""
    Let's create a word cloud from some text:
    """)
    wordcloud_text = st.text_area(
        "Enter text for word cloud:",
        "The quick brown fox jumps over the lazy dog. The fox is quick and brown. The dog is lazy.",
        key="wordcloud_text"
    )
    if st.button("Generate Word Cloud"):
        # Render the cloud into an explicit Figure and hand that to
        # Streamlit: st.pyplot(plt) (passing the module / implicit global
        # figure) is deprecated and unsafe across reruns.
        wordcloud = WordCloud().generate(wordcloud_text)
        fig = plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        st.pyplot(fig)

    # --- Practice exercises (collapsed by default) ---
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Text Processing"):
        st.write("""
        1. Load a sample text
        2. Remove stop words and punctuation
        3. Create a word cloud
        4. Analyze word frequencies
        """)
        st.code("""
# Solution
from wordcloud import WordCloud
import string
# Sample text
text = "Your text here"
# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))
# Remove stop words
tokens = text.split()
filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]
# Create word cloud
wordcloud = WordCloud().generate(' '.join(filtered_words))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
""")
    with st.expander("Exercise 2: Text Analysis"):
        st.write("""
        1. Calculate basic text metrics (word count, unique words)
        2. Perform basic text normalization
        3. Compare the results
        4. Visualize the differences
        """)
        st.code("""
# Solution
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text
# Sample text
text = "Running, runs, ran, better, good"
# Normalize text
normalized = normalize_text(text)
words = normalized.split()
# Compare results
print(f"Original: {text}")
print(f"Normalized: {normalized}")
print(f"Word count: {len(words)}")
print(f"Unique words: {len(set(words))}")
""")

    # --- Per-student assignment section ---
    # Falls back to "Student" when no login name is in session state.
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 4: Python Basics.
        1. Finish looking for 3 more research papers and add them to your literature review
        2. Finish literature review for the 2 papers you have already summarized
        3. Add the plots from the previous week to the dataset section and add a description
        4. link to your paper here: https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13
        **Due Date:** End of Week 4
        """)
    elif username == "zhu":
        st.markdown("""
        Hello **zhu**, here is your Assignment 4: NLP Basics.
        """)
    elif username == "WK":
        st.markdown("""
        Hello **WK**, here is your Assignment 4: NLP Basics.
        **Due Date:** End of Week 4
        """)
    else:
        st.markdown(f"""
        Hello **{username}**, your Assignment 4 is not yet released. Please message the instructor.
        """)