Commit b1b0b70 · 1 Parent(s): 1748447
raymondEDS committed

Removing NLTK package
app/__pycache__/main.cpython-311.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ

app/pages/__pycache__/week_3.cpython-311.pyc CHANGED
Binary files a/app/pages/__pycache__/week_3.cpython-311.pyc and b/app/pages/__pycache__/week_3.cpython-311.pyc differ

app/pages/__pycache__/week_4.cpython-311.pyc CHANGED
Binary files a/app/pages/__pycache__/week_4.cpython-311.pyc and b/app/pages/__pycache__/week_4.cpython-311.pyc differ
app/pages/week_4.py CHANGED

@@ -3,36 +3,34 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, sent_tokenize
-nltk.download('punkt_tab')
-nltk.download('stopwords')
-from nltk.stem import PorterStemmer, WordNetLemmatizer
 from wordcloud import WordCloud
 import string
 import io
 from contextlib import redirect_stdout
+import re

-# …
+# Define a simple list of common English stop words
+STOP_WORDS = {
+    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
+    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were',
+    'will', 'with', 'the', 'this', 'but', 'they', 'have', 'had', 'what', 'when',
+    'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
+    'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can', 'will',
+    'just', 'should', 'now'
+}

-def …
-    """…
-    …
-    return f.getvalue()
+def simple_tokenize(text):
+    """Simple tokenization function that splits on whitespace and removes punctuation"""
+    # Convert to lowercase
+    text = text.lower()
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    # Split on whitespace
+    return text.split()
+
+def remove_stop_words(tokens):
+    """Remove stop words from a list of tokens"""
+    return [word for word in tokens if word.lower() not in STOP_WORDS]

 def show():
     st.title("Week 4: Introduction to Natural Language Processing")

@@ -79,9 +77,7 @@ def show():
     )

     if st.button("Tokenize Text"):
-        …
-        nltk.download('stopwords')
-        tokens = word_tokenize(example_text)
+        tokens = simple_tokenize(example_text)
         st.write("Tokens:", tokens)
         st.write("Number of tokens:", len(tokens))

@@ -92,7 +88,7 @@ def show():
     - Converting to lowercase
     - Removing punctuation
     - Removing stop words
-    - …
+    - Basic text normalization
     """)

     # Interactive Text Processing

@@ -111,9 +107,8 @@ def show():

     with col1:
         if st.button("Remove Stop Words"):
-            …
-            …
-            filtered_words = [word for word in words if word not in stop_words]
+            tokens = simple_tokenize(process_text)
+            filtered_words = remove_stop_words(tokens)
             st.write("After removing stop words:", filtered_words)

     with col2:

@@ -167,8 +162,6 @@ def show():

     st.code("""
 # Solution
-import nltk
-from nltk.corpus import stopwords
 from wordcloud import WordCloud
 import string

@@ -179,9 +172,8 @@ def show():
 text = text.translate(str.maketrans('', '', string.punctuation))

 # Remove stop words
-…
-…
-filtered_words = [word for word in words if word.lower() not in stop_words]
+tokens = text.split()
+filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]

 # Create word cloud
 wordcloud = WordCloud().generate(' '.join(filtered_words))

@@ -193,29 +185,32 @@ def show():
     with st.expander("Exercise 2: Text Analysis"):
         st.write("""
     1. Calculate basic text metrics (word count, unique words)
-    2. Perform …
+    2. Perform basic text normalization
     3. Compare the results
     4. Visualize the differences
     """)

     st.code("""
 # Solution
-…
+def normalize_text(text):
+    # Convert to lowercase
+    text = text.lower()
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    return text

-# Sample …
+# Sample text
+text = "Running, runs, ran, better, good"

-# …
+# Normalize text
+normalized = normalize_text(text)
+words = normalized.split()

 # Compare results
-…
+print(f"Original: {text}")
+print(f"Normalized: {normalized}")
+print(f"Word count: {len(words)}")
+print(f"Unique words: {len(set(words))}")
 """)

     username = st.session_state.get("username", "Student")
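With this change, the week_4 page no longer calls nltk.download('punkt_tab') or nltk.download('stopwords') at import time or inside button handlers. For reference, a minimal standalone sketch of the replacement path; the helpers are adapted from the new app/pages/week_4.py, and the sample sentence is invented for illustration:

    import string

    # Stop-word set as committed (duplicate entries like 'the' are harmless in a set literal)
    STOP_WORDS = {
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
        'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were',
        'will', 'with', 'the', 'this', 'but', 'they', 'have', 'had', 'what', 'when',
        'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
        'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can', 'will',
        'just', 'should', 'now'
    }

    def simple_tokenize(text):
        """Lowercase the text, strip punctuation, and split on whitespace."""
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        return text.split()

    def remove_stop_words(tokens):
        """Drop tokens that appear in STOP_WORDS."""
        return [word for word in tokens if word.lower() not in STOP_WORDS]

    # Invented example sentence
    tokens = simple_tokenize("The quick brown fox jumps over the lazy dog.")
    print(tokens)
    # ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
    print(remove_stop_words(tokens))
    # ['quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']

Unlike nltk.word_tokenize, this tokenizer does not split contractions or emit punctuation tokens, a reasonable trade-off for a course demo that also removes the network-dependent NLTK corpus downloads at startup.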