import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import string

# Define a simple set of common English stop words
STOP_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'this', 'to', 'was',
    'were', 'will', 'with', 'but', 'they', 'have', 'had', 'what', 'when',
    'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can',
    'just', 'should', 'now'
}
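
# Note: this deliberately small set keeps the lesson dependency-free; real
# projects usually pull a fuller stop-word list from a library such as NLTK
# or spaCy.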

def simple_tokenize(text):
    """Simple tokenization function that splits on whitespace and removes punctuation"""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split on whitespace
    return text.split()

def remove_stop_words(tokens):
    """Remove stop words from a list of tokens"""
    return [word for word in tokens if word.lower() not in STOP_WORDS]
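
# A minimal usage sketch of the two helpers above (illustrative values):
#   simple_tokenize("The Fox!")       -> ['the', 'fox']
#   remove_stop_words(['the', 'fox']) -> ['fox']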

def show():
    st.title("Week 4: Introduction to Natural Language Processing")
    
    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this course, you'll learn fundamental Natural Language Processing (NLP) concepts by exploring a fascinating real-world question: 
    What is the effect of releasing a preprint of a paper before it is submitted for peer review?
    
    Using the ICLR (International Conference on Learning Representations) database - which contains submissions, reviews, and author profiles 
    from 2017-2022 - you'll develop practical NLP skills while investigating potential biases and patterns in academic publishing.
    """)
    
    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding Text as Data: How computers represent and work with text
    2. Text Processing Fundamentals: Basic cleaning and normalization
    3. Quantitative Text Analysis: Measuring and comparing text features
    4. Tokenization Approaches: Breaking text into meaningful units
    5. Text Visualization Techniques: Creating insightful visual representations
    6. From Analysis to Insights: Drawing evidence-based conclusions
    """)

    # Module 1: Text as Data
    st.header("Module 1: Text as Data")
    st.write("""
    When we look at text like customer reviews or academic papers, we naturally understand the meaning. 
    But how can a computer understand this?
    
    Key Concept: Text can be treated as data that we can analyze quantitatively.
    Unlike numerical data (age, price, temperature) that has inherent mathematical properties, 
    text data needs to be transformed before we can analyze it.
    """)

    # Interactive Example
    st.subheader("Interactive Example: Text Tokenization")
    st.write("Let's try tokenizing some text:")
    
    example_text = st.text_area(
        "Enter some text to tokenize:",
        "The quick brown fox jumps over the lazy dog."
    )
    
    if st.button("Tokenize Text"):
        tokens = simple_tokenize(example_text)
        st.write("Tokens:", tokens)
        st.write("Number of tokens:", len(tokens))

    # Module 2: Text Processing
    st.header("Module 2: Text Processing")
    st.write("""
    Before we can analyze text, we need to clean and normalize it. This includes:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Basic text normalization
    """)

    # Interactive Text Processing
    st.subheader("Try Text Processing")
    st.write("""
    Let's process some text using different techniques:
    """)
    
    process_text = st.text_area(
        "Enter text to process:",
        "The quick brown fox jumps over the lazy dog.",
        key="process_text"
    )
    
    col1, col2 = st.columns(2)
    
    with col1:
        if st.button("Remove Stop Words"):
            tokens = simple_tokenize(process_text)
            filtered_words = remove_stop_words(tokens)
            st.write("After removing stop words:", filtered_words)
    
    with col2:
        if st.button("Remove Punctuation"):
            no_punct = process_text.translate(str.maketrans('', '', string.punctuation))
            st.write("After removing punctuation:", no_punct)

    # Module 3: Text Visualization
    st.header("Module 3: Text Visualization")
    st.write("""
    Visual representations help us identify patterns across text data.
    Common visualization methods include:
    - Word clouds
    - Frequency distributions
    - Sentiment over time
    - Topic clusters
    """)

    # Interactive Word Cloud
    st.subheader("Create a Word Cloud")
    st.write("""
    Let's create a word cloud from some text:
    """)
    
    wordcloud_text = st.text_area(
        "Enter text for word cloud:",
        "The quick brown fox jumps over the lazy dog. The fox is quick and brown. The dog is lazy.",
        key="wordcloud_text"
    )
    
    if st.button("Generate Word Cloud"):
        # Create and generate a word cloud image
        wordcloud = WordCloud().generate(wordcloud_text)
        
        # Display the word cloud
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        st.pyplot(plt)
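
    # "Frequency distributions" are listed above but not demonstrated; this is
    # a minimal sketch using the same word-cloud text: the ten most frequent
    # tokens after stop-word removal. Illustrative addition.
    if st.button("Show Frequency Distribution"):
        freq_tokens = remove_stop_words(simple_tokenize(wordcloud_text))
        freq = pd.Series(Counter(freq_tokens)).sort_values(ascending=False)
        st.bar_chart(freq.head(10))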

    # Practice Exercises
    st.header("Practice Exercises")
    
    with st.expander("Exercise 1: Text Processing"):
        st.write("""
        1. Load a sample text
        2. Remove stop words and punctuation
        3. Create a word cloud
        4. Analyze word frequencies
        """)
        
        st.code("""
        # Solution
        from wordcloud import WordCloud
        import string
        
        # Sample text
        text = "Your text here"
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Remove stop words
        tokens = text.split()
        filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]
        
        # Create word cloud
        wordcloud = WordCloud().generate(' '.join(filtered_words))
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()
        """)
    
    with st.expander("Exercise 2: Text Analysis"):
        st.write("""
        1. Calculate basic text metrics (word count, unique words)
        2. Perform basic text normalization
        3. Compare the results
        4. Visualize the differences
        """)
        
        st.code("""
        # Solution
        def normalize_text(text):
            # Convert to lowercase
            text = text.lower()
            # Remove punctuation
            text = text.translate(str.maketrans('', '', string.punctuation))
            return text
        
        # Sample text
        text = "Running, runs, ran, better, good"
        
        # Normalize text
        normalized = normalize_text(text)
        words = normalized.split()
        
        # Compare results
        print(f"Original: {text}")
        print(f"Normalized: {normalized}")
        print(f"Word count: {len(words)}")
        print(f"Unique words: {len(set(words))}")
        """)

    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 4: Python Basics.
        1. Finish looking for 3 more research papers and add them to your literate review
        2. Finish literate review for the 2 papers you have already summerized
        3. Add the plots from the previous week to the dataset section and add a description
        4. link to your paper here: https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13

        **Due Date:** End of Week 4
        """)
    elif username == "zhu":
        st.markdown("""
        Hello **zhu**, here is your Assignment 4: NLP Basics.
        """)
    elif username == "WK":
        st.markdown("""
        Hello **WK**, here is your Assignment 4: NLP Basics.


        **Due Date:** End of Week 4
        """)
    else:
        st.markdown(f"""
        Hello **{username}**, your Assignment 4 has not yet been released. Please message the instructor.
        """)