import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string

# Define a simple set of common English stop words
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has',
    'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
    'were', 'will', 'with', 'this', 'but', 'they', 'have', 'had', 'what',
    'when', 'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both',
    'each', 'few', 'more', 'most', 'other', 'some', 'such', 'than', 'too',
    'very', 'can', 'just', 'should', 'now'
}


def simple_tokenize(text):
    """Simple tokenizer: lowercase, strip punctuation, split on whitespace."""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split on whitespace
    return text.split()


def remove_stop_words(tokens):
    """Remove stop words from a list of tokens."""
    return [word for word in tokens if word.lower() not in STOP_WORDS]


def show():
    st.title("Week 4: Introduction to Natural Language Processing")

    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this course, you'll learn fundamental Natural Language Processing (NLP)
    concepts by exploring a fascinating real-world question: what is the effect
    of releasing a preprint of a paper before it is submitted for peer review?

    Using the ICLR (International Conference on Learning Representations)
    database - which contains submissions, reviews, and author profiles from
    2017-2022 - you'll develop practical NLP skills while investigating
    potential biases and patterns in academic publishing.
    """)

    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding Text as Data: How computers represent and work with text
    2. Text Processing Fundamentals: Basic cleaning and normalization
    3. Quantitative Text Analysis: Measuring and comparing text features
    4. Tokenization Approaches: Breaking text into meaningful units
    5. Text Visualization Techniques: Creating insightful visual representations
    6. From Analysis to Insights: Drawing evidence-based conclusions
    """)

    # Module 1: Text as Data
    st.header("Module 1: Text as Data")
    st.write("""
    When we read text like customer reviews or academic papers, we naturally
    understand the meaning. But how can a computer understand it?

    Key Concept: Text can be treated as data that we can analyze
    quantitatively. Unlike numerical data (age, price, temperature), which has
    inherent mathematical properties, text data needs to be transformed before
    we can analyze it.
    """)

    # Interactive Example
    st.subheader("Interactive Example: Text Tokenization")
    st.write("Let's try tokenizing some text:")

    example_text = st.text_area(
        "Enter some text to tokenize:",
        "The quick brown fox jumps over the lazy dog."
    )

    if st.button("Tokenize Text"):
        tokens = simple_tokenize(example_text)
        st.write("Tokens:", tokens)
        st.write("Number of tokens:", len(tokens))
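    # An optional extra, not part of the original lesson: once text is
    # tokenized, a frequency count is the simplest quantitative view of it
    # (see "Quantitative Text Analysis" in the learning path). A minimal
    # sketch using collections.Counter on the same input box:
    if st.button("Count Token Frequencies"):
        from collections import Counter
        counts = Counter(simple_tokenize(example_text))
        # Show the five most common tokens as (token, count) pairs
        st.write("Most common tokens:", counts.most_common(5))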
    # Module 2: Text Processing
    st.header("Module 2: Text Processing")
    st.write("""
    Before we can analyze text, we need to clean and normalize it. This
    includes:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Basic text normalization
    """)

    # Interactive Text Processing
    st.subheader("Try Text Processing")
    st.write("Let's process some text using different techniques:")

    process_text = st.text_area(
        "Enter text to process:",
        "The quick brown fox jumps over the lazy dog.",
        key="process_text"
    )

    col1, col2 = st.columns(2)
    with col1:
        if st.button("Remove Stop Words"):
            tokens = simple_tokenize(process_text)
            filtered_words = remove_stop_words(tokens)
            st.write("After removing stop words:", filtered_words)

    with col2:
        if st.button("Remove Punctuation"):
            no_punct = process_text.translate(
                str.maketrans('', '', string.punctuation))
            st.write("After removing punctuation:", no_punct)

    # Module 3: Text Visualization
    st.header("Module 3: Text Visualization")
    st.write("""
    Visual representations help us identify patterns across text data. Common
    visualization methods include:
    - Word clouds
    - Frequency distributions
    - Sentiment over time
    - Topic clusters
    """)

    # Interactive Word Cloud
    st.subheader("Create a Word Cloud")
    st.write("Let's create a word cloud from some text:")

    wordcloud_text = st.text_area(
        "Enter text for word cloud:",
        "The quick brown fox jumps over the lazy dog. "
        "The fox is quick and brown. The dog is lazy.",
        key="wordcloud_text"
    )

    if st.button("Generate Word Cloud"):
        # Create and generate a word cloud image
        wordcloud = WordCloud().generate(wordcloud_text)

        # Display the word cloud on an explicit figure; st.pyplot expects a
        # figure object rather than the global pyplot state
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        st.pyplot(fig)
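    # An optional extra, not part of the original lesson: Module 3 lists
    # frequency distributions alongside word clouds, so here is a minimal
    # sketch of one - a bar chart of the most common non-stop-word tokens in
    # the same word cloud text.
    if st.button("Show Frequency Distribution"):
        from collections import Counter
        counts = Counter(remove_stop_words(simple_tokenize(wordcloud_text)))
        top = counts.most_common(10)
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.bar([word for word, _ in top], [count for _, count in top])
        ax.set_xlabel("Token")
        ax.set_ylabel("Count")
        st.pyplot(fig)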
    # Practice Exercises
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Text Processing"):
        st.write("""
        1. Load a sample text
        2. Remove stop words and punctuation
        3. Create a word cloud
        4. Analyze word frequencies
        """)
        st.code("""
# Solution
import string

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Sample text
text = "Your text here"

# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))

# Remove stop words (STOP_WORDS is the set defined at the top of this page)
tokens = text.split()
filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]

# Create word cloud
wordcloud = WordCloud().generate(' '.join(filtered_words))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
""")

    with st.expander("Exercise 2: Text Analysis"):
        st.write("""
        1. Calculate basic text metrics (word count, unique words)
        2. Perform basic text normalization
        3. Compare the results
        4. Visualize the differences
        """)
        st.code("""
# Solution
import string

def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Sample text
text = "Running, runs, ran, better, good"

# Normalize text
normalized = normalize_text(text)
words = normalized.split()

# Compare results
print(f"Original: {text}")
print(f"Normalized: {normalized}")
print(f"Word count: {len(words)}")
print(f"Unique words: {len(set(words))}")
""")

    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 4: NLP Basics.
        1. Finish looking for 3 more research papers and add them to your
           literature review.
        2. Finish the literature review for the 2 papers you have already
           summarized.
        3. Add the plots from the previous week to the dataset section, with a
           description.
        4. Link to your paper here:
           https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13

        **Due Date:** End of Week 4
        """)
    elif username == "zhu":
        st.markdown("""
        Hello **zhu**, here is your Assignment 4: NLP Basics.
        """)
    elif username == "WK":
        st.markdown("""
        Hello **WK**, here is your Assignment 4: NLP Basics.

        **Due Date:** End of Week 4
        """)
    else:
        st.markdown(f"""
        Hello **{username}**, your Assignment 4 has not been released yet.
        Please message the instructor.
        """)
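
# A minimal entry point, assuming this page is meant to run standalone with
# `streamlit run`; in a multipage app, the main script would import and call
# show() instead.
if __name__ == "__main__":
    show()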