raymondEDS committed
Commit b1b0b70 · 1 Parent(s): 1748447

Removing NLTK package

app/__pycache__/main.cpython-311.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ
 
app/pages/__pycache__/week_3.cpython-311.pyc CHANGED
Binary files a/app/pages/__pycache__/week_3.cpython-311.pyc and b/app/pages/__pycache__/week_3.cpython-311.pyc differ
 
app/pages/__pycache__/week_4.cpython-311.pyc CHANGED
Binary files a/app/pages/__pycache__/week_4.cpython-311.pyc and b/app/pages/__pycache__/week_4.cpython-311.pyc differ
 
app/pages/week_4.py CHANGED
@@ -3,36 +3,34 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, sent_tokenize
-nltk.download('punkt_tab')
-nltk.download('stopwords')
-from nltk.stem import PorterStemmer, WordNetLemmatizer
 from wordcloud import WordCloud
 import string
 import io
 from contextlib import redirect_stdout
+import re
 
-# Initialize session state for notebook-like cells
-if 'cells' not in st.session_state:
-    st.session_state.cells = []
-if 'df' not in st.session_state:
-    st.session_state.df = None
+# Define a simple list of common English stop words
+STOP_WORDS = {
+    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
+    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were',
+    'will', 'with', 'the', 'this', 'but', 'they', 'have', 'had', 'what', 'when',
+    'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
+    'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can', 'will',
+    'just', 'should', 'now'
+}
 
-def capture_output(code, df=None):
-    """Helper function to capture print output"""
-    f = io.StringIO()
-    with redirect_stdout(f):
-        try:
-            # Create a dictionary of variables to use in exec
-            variables = {'pd': pd, 'np': np, 'plt': plt, 'sns': sns, 'nltk': nltk}
-            if df is not None:
-                variables['df'] = df
-            exec(code, variables)
-        except Exception as e:
-            return f"Error: {str(e)}"
-    return f.getvalue()
+def simple_tokenize(text):
+    """Simple tokenization function that splits on whitespace and removes punctuation"""
+    # Convert to lowercase
+    text = text.lower()
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    # Split on whitespace
+    return text.split()
+
+def remove_stop_words(tokens):
+    """Remove stop words from a list of tokens"""
+    return [word for word in tokens if word.lower() not in STOP_WORDS]
 
 def show():
     st.title("Week 4: Introduction to Natural Language Processing")
@@ -79,9 +77,7 @@ def show():
     )
 
     if st.button("Tokenize Text"):
-        nltk.download('punkt_tab')
-        nltk.download('stopwords')
-        tokens = word_tokenize(example_text)
+        tokens = simple_tokenize(example_text)
         st.write("Tokens:", tokens)
         st.write("Number of tokens:", len(tokens))
 
@@ -92,7 +88,7 @@ def show():
 - Converting to lowercase
 - Removing punctuation
 - Removing stop words
-- Stemming or lemmatization
+- Basic text normalization
 """)
 
     # Interactive Text Processing
@@ -111,9 +107,8 @@ def show():
 
     with col1:
         if st.button("Remove Stop Words"):
-            stop_words = set(stopwords.words('english'))
-            words = word_tokenize(process_text.lower())
-            filtered_words = [word for word in words if word not in stop_words]
+            tokens = simple_tokenize(process_text)
+            filtered_words = remove_stop_words(tokens)
             st.write("After removing stop words:", filtered_words)
 
     with col2:
@@ -167,8 +162,6 @@ def show():
 
         st.code("""
 # Solution
-import nltk
-from nltk.corpus import stopwords
 from wordcloud import WordCloud
 import string
 
@@ -179,9 +172,8 @@ def show():
 text = text.translate(str.maketrans('', '', string.punctuation))
 
 # Remove stop words
-stop_words = set(stopwords.words('english'))
-words = text.split()
-filtered_words = [word for word in words if word.lower() not in stop_words]
+tokens = text.split()
+filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]
 
 # Create word cloud
 wordcloud = WordCloud().generate(' '.join(filtered_words))
@@ -193,29 +185,32 @@ def show():
     with st.expander("Exercise 2: Text Analysis"):
         st.write("""
 1. Calculate basic text metrics (word count, unique words)
-2. Perform stemming and lemmatization
+2. Perform basic text normalization
 3. Compare the results
 4. Visualize the differences
 """)
 
         st.code("""
 # Solution
-from nltk.stem import PorterStemmer, WordNetLemmatizer
-
-# Initialize stemmer and lemmatizer
-stemmer = PorterStemmer()
-lemmatizer = WordNetLemmatizer()
+def normalize_text(text):
+    # Convert to lowercase
+    text = text.lower()
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    return text
 
-# Sample words
-words = ["running", "runs", "ran", "better", "good"]
+# Sample text
+text = "Running, runs, ran, better, good"
 
-# Apply stemming and lemmatization
-stemmed = [stemmer.stem(word) for word in words]
-lemmatized = [lemmatizer.lemmatize(word) for word in words]
+# Normalize text
+normalized = normalize_text(text)
+words = normalized.split()
 
 # Compare results
-for word, stem, lemma in zip(words, stemmed, lemmatized):
-    print(f"Original: {word}, Stemmed: {stem}, Lemmatized: {lemma}")
+print(f"Original: {text}")
+print(f"Normalized: {normalized}")
+print(f"Word count: {len(words)}")
+print(f"Unique words: {len(set(words))}")
 """)
 
     username = st.session_state.get("username", "Student")
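For reference, the helpers this commit introduces are dependency-free and can be exercised outside the Streamlit app. Below is a minimal standalone sketch; the abbreviated STOP_WORDS set and the sample sentence are illustrative, not part of the commit:

import string

# Abbreviated subset of the STOP_WORDS set added in this commit
STOP_WORDS = {'a', 'an', 'and', 'is', 'it', 'of', 'on', 'the', 'to', 'this'}

def simple_tokenize(text):
    """Lowercase, strip punctuation, then split on whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.split()

def remove_stop_words(tokens):
    """Drop tokens that appear in the stop-word set."""
    return [word for word in tokens if word.lower() not in STOP_WORDS]

tokens = simple_tokenize("This is an example of the new, NLTK-free pipeline.")
print(tokens)
# ['this', 'is', 'an', 'example', 'of', 'the', 'new', 'nltkfree', 'pipeline']
print(remove_stop_words(tokens))
# ['example', 'new', 'nltkfree', 'pipeline']

Keeping the stop words in a set makes membership tests O(1), and the duplicate entries in the committed literal ('the' and 'will' each appear twice) are harmlessly collapsed by the set.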
 
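One behavioral difference worth noting: NLTK's word_tokenize keeps punctuation as separate tokens and splits contractions, while the new simple_tokenize deletes punctuation outright, so contractions fuse into single tokens. A before/after sketch, assuming nltk is still installed in a scratch environment for comparison (the app itself no longer needs it):

import string

def simple_tokenize(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.split()

sample = "Don't stop; tokenization matters."

print(simple_tokenize(sample))
# ['dont', 'stop', 'tokenization', 'matters']

try:
    import nltk
    nltk.download('punkt_tab', quiet=True)  # tokenizer model that word_tokenize relies on
    from nltk.tokenize import word_tokenize
    print(word_tokenize(sample))
    # e.g. ['Do', "n't", 'stop', ';', 'tokenization', 'matters', '.']
except ImportError:
    print("nltk not installed; only the simple tokenizer output is shown")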