import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
# Define a simple set of common English stop words
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'this', 'to', 'was',
    'were', 'will', 'with', 'but', 'they', 'have', 'had', 'what', 'when',
    'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can',
    'just', 'should', 'now'
}
def simple_tokenize(text):
    """Tokenize text by lowercasing, stripping punctuation, and splitting on whitespace."""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split on whitespace
    return text.split()

def remove_stop_words(tokens):
    """Remove stop words from a list of tokens."""
    return [word for word in tokens if word.lower() not in STOP_WORDS]
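
# Example (illustrative) of the two helpers together:
#   simple_tokenize("The quick brown fox!")    -> ['the', 'quick', 'brown', 'fox']
#   remove_stop_words(['the', 'quick', 'fox']) -> ['quick', 'fox']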
def show():
    st.title("Week 4: Introduction to Natural Language Processing")

    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this course, you'll learn fundamental Natural Language Processing (NLP) concepts by exploring a fascinating real-world question:
    What is the effect of releasing a preprint of a paper before it is submitted for peer review?

    Using the ICLR (International Conference on Learning Representations) database - which contains submissions, reviews, and author profiles
    from 2017-2022 - you'll develop practical NLP skills while investigating potential biases and patterns in academic publishing.
    """)

    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding Text as Data: How computers represent and work with text
    2. Text Processing Fundamentals: Basic cleaning and normalization
    3. Quantitative Text Analysis: Measuring and comparing text features
    4. Tokenization Approaches: Breaking text into meaningful units
    5. Text Visualization Techniques: Creating insightful visual representations
    6. From Analysis to Insights: Drawing evidence-based conclusions
    """)
    # Module 1: Text as Data
    st.header("Module 1: Text as Data")
    st.write("""
    When we read text like customer reviews or academic papers, we naturally understand the meaning.
    But how can a computer understand it?

    Key Concept: Text can be treated as data that we can analyze quantitatively.
    Unlike numerical data (age, price, temperature), which has inherent mathematical properties,
    text data needs to be transformed before we can analyze it.
    """)
    # Interactive Example
    st.subheader("Interactive Example: Text Tokenization")
    st.write("Let's try tokenizing some text:")

    example_text = st.text_area(
        "Enter some text to tokenize:",
        "The quick brown fox jumps over the lazy dog."
    )

    if st.button("Tokenize Text"):
        tokens = simple_tokenize(example_text)
        st.write("Tokens:", tokens)
        st.write("Number of tokens:", len(tokens))
    # Module 2: Text Processing
    st.header("Module 2: Text Processing")
    st.write("""
    Before we can analyze text, we need to clean and normalize it. This includes:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Basic text normalization

    The sketch below shows these steps combined into a single pipeline.
    """)
    # Interactive Text Processing
    st.subheader("Try Text Processing")
    st.write("""
    Let's process some text using different techniques:
    """)

    process_text = st.text_area(
        "Enter text to process:",
        "The quick brown fox jumps over the lazy dog.",
        key="process_text"
    )

    col1, col2 = st.columns(2)
    with col1:
        if st.button("Remove Stop Words"):
            tokens = simple_tokenize(process_text)
            filtered_words = remove_stop_words(tokens)
            st.write("After removing stop words:", filtered_words)
    with col2:
        if st.button("Remove Punctuation"):
            no_punct = process_text.translate(str.maketrans('', '', string.punctuation))
            st.write("After removing punctuation:", no_punct)
    # Module 3: Text Visualization
    st.header("Module 3: Text Visualization")
    st.write("""
    Visual representations help us identify patterns across text data.
    Common visualization methods include:
    - Word clouds
    - Frequency distributions
    - Sentiment over time
    - Topic clusters

    A minimal frequency-distribution sketch follows; word clouds come next.
    """)
    # Interactive Word Cloud
    st.subheader("Create a Word Cloud")
    st.write("""
    Let's create a word cloud from some text:
    """)

    wordcloud_text = st.text_area(
        "Enter text for word cloud:",
        "The quick brown fox jumps over the lazy dog. The fox is quick and brown. The dog is lazy.",
        key="wordcloud_text"
    )

    if st.button("Generate Word Cloud"):
        # Create and generate a word cloud image
        wordcloud = WordCloud().generate(wordcloud_text)
        # Display the word cloud on an explicit figure (st.pyplot expects one)
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        st.pyplot(fig)
    # Practice Exercises
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Text Processing"):
        st.write("""
        1. Load a sample text
        2. Remove stop words and punctuation
        3. Create a word cloud
        4. Analyze word frequencies
        """)
        st.code("""
# Solution
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Sample text
text = "Your text here"

# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))

# Remove stop words (STOP_WORDS is the set defined earlier on this page)
tokens = text.split()
filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]

# Create word cloud
wordcloud = WordCloud().generate(' '.join(filtered_words))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
""")
with st.expander("Exercise 2: Text Analysis"): | |
st.write(""" | |
1. Calculate basic text metrics (word count, unique words) | |
2. Perform basic text normalization | |
3. Compare the results | |
4. Visualize the differences | |
""") | |
st.code(""" | |
# Solution | |
def normalize_text(text): | |
# Convert to lowercase | |
text = text.lower() | |
# Remove punctuation | |
text = text.translate(str.maketrans('', '', string.punctuation)) | |
return text | |
# Sample text | |
text = "Running, runs, ran, better, good" | |
# Normalize text | |
normalized = normalize_text(text) | |
words = normalized.split() | |
# Compare results | |
print(f"Original: {text}") | |
print(f"Normalized: {normalized}") | |
print(f"Word count: {len(words)}") | |
print(f"Unique words: {len(set(words))}") | |
""") | |
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 4: NLP Basics.
        1. Find 3 more research papers and add them to your literature review
        2. Finish the literature review for the 2 papers you have already summarized
        3. Add the plots from the previous week to the dataset section and add a description
        4. Link to your paper: https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13

        **Due Date:** End of Week 4
        """)
    elif username == "zhu":
        st.markdown("""
        Hello **zhu**, here is your Assignment 4: NLP Basics.
        """)
    elif username == "WK":
        st.markdown("""
        Hello **WK**, here is your Assignment 4: NLP Basics.

        **Due Date:** End of Week 4
        """)
    else:
        st.markdown(f"""
        Hello **{username}**, your Assignment 4 has not been released yet. Please message the instructor.
        """)