Commit fe6b103 · Ryan committed · update
Parent(s): 42b16a4

Files changed:
- processors/topic_modeling.py +55 -10
- ui/analysis_screen.py +31 -8
processors/topic_modeling.py
CHANGED
@@ -10,6 +10,54 @@ from nltk.stem import WordNetLemmatizer
 import re
 from scipy.spatial import distance
 
+def load_all_datasets_for_topic_modeling():
+    """
+    Load all dataset files and prepare them for topic modeling.
+
+    Returns:
+        tuple: (all_model1_responses, all_model2_responses, all_model_names)
+    """
+    import os
+    from pathlib import Path
+    from utils.text_dataset_parser import parse_text_file
+
+    dataset_dir = "dataset"
+    if not os.path.exists(dataset_dir):
+        print(f"Dataset directory '{dataset_dir}' not found")
+        return [], [], []
+
+    # Get all text files
+    path = Path(dataset_dir)
+    text_files = list(path.glob('*.txt'))
+
+    all_model1_responses = []
+    all_model2_responses = []
+    all_model_names = set()
+
+    for file_path in text_files:
+        try:
+            dataset = parse_text_file(str(file_path))
+
+            if dataset.get("response1") and dataset.get("response2"):
+                all_model1_responses.append(dataset.get("response1"))
+                all_model2_responses.append(dataset.get("response2"))
+
+                # Collect model names
+                if dataset.get("model1"):
+                    all_model_names.add(dataset.get("model1"))
+                if dataset.get("model2"):
+                    all_model_names.add(dataset.get("model2"))
+        except Exception as e:
+            print(f"Error loading dataset file {file_path}: {e}")
+
+    # Convert set to list for model names
+    model_names_list = list(all_model_names)
+    if len(model_names_list) < 2:
+        # If we couldn't find enough model names, use defaults
+        model_names_list = ["Model 1", "Model 2"]
+
+    return all_model1_responses, all_model2_responses, model_names_list
+
 def download_nltk_resources():
     """Download required NLTK resources if not already downloaded"""
     try:
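For context, a minimal way to exercise the new loader from a Python shell might look like the sketch below. It is illustrative only and not part of the commit; it assumes the app runs from the repo root so the "dataset" directory is visible, and that utils.text_dataset_parser.parse_text_file returns a dict exposing response1/response2/model1/model2 keys, as the function above expects.

    # Illustrative usage sketch (not part of this commit).
    # Assumes a dataset/ directory of *.txt files readable by parse_text_file.
    from processors.topic_modeling import load_all_datasets_for_topic_modeling

    m1_texts, m2_texts, model_names = load_all_datasets_for_topic_modeling()
    print(f"Loaded {len(m1_texts)} model-1 responses and {len(m2_texts)} model-2 responses")
    print(f"Detected model names: {model_names}")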
@@ -35,7 +83,7 @@ def preprocess_text(text):
     # Convert to lowercase
     text = text.lower()
 
-    # Remove special characters and digits
+    # Remove special characters and digits but keep spaces (fixed regex)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
 
     # Tokenize
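As a quick sanity check of what the preprocessing regex keeps and drops (illustrative only, not part of the commit):

    import re

    text = "Tax policy in 2024: rates rose 3%!".lower()
    print(re.sub(r'[^a-zA-Z\s]', '', text))
    # -> "tax policy in  rates rose "  (digits and punctuation removed, letters and spaces kept)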
@@ -44,20 +92,17 @@ def preprocess_text(text):
     # Remove stopwords
     stop_words = set(stopwords.words('english'))
 
-    #
-    custom_stopwords = {'the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for',
-                        'with', 'as', 'by', 'at', 'an', 'this', 'these', 'those'
-                        'or', 'not', 'be', 'are', 'it', 'was', 'were', 'been', 'being',
-                        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
-                        'would', 'should', 'could', 'might', 'will', 'shall', 'can', 'may',
-                        'political', 'generally', 'policy', 'policies', 'also'}
+    # Reduced custom stopwords list - keep more meaningful political terms
+    custom_stopwords = {'the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for',
+                        'with', 'as', 'by', 'at', 'an', 'this', 'these', 'those'}
 
     stop_words.update(custom_stopwords)
 
-    # Lemmatize tokens
+    # Lemmatize tokens - CHANGED from len(token) > 3 to len(token) > 2
+    # This keeps more meaningful short words like "tax", "war", "law", etc.
     lemmatizer = WordNetLemmatizer()
     tokens = [lemmatizer.lemmatize(token) for token in tokens
-              if token not in stop_words and len(token) > 3]
+              if token not in stop_words and len(token) > 2]
 
     return ' '.join(tokens)
 
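A rough before/after check of the relaxed token-length filter is sketched below. It assumes the NLTK resources are already downloadable and that the module is importable from the repo root; the exact output depends on the installed NLTK corpora, so treat the comment as expected behavior rather than a verbatim result.

    from processors.topic_modeling import download_nltk_resources, preprocess_text

    download_nltk_resources()
    sample = "The new tax law and the war powers act were debated."
    print(preprocess_text(sample))
    # With len(token) > 2, short content words like "tax", "war", "law", "act"
    # should now survive; under the old len(token) > 3 filter they were dropped.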
ui/analysis_screen.py
CHANGED
@@ -286,22 +286,45 @@ def process_analysis_request(dataset, selected_analysis, parameters):
 
     try:
         # Import the enhanced topic modeling function
-        from processors.topic_modeling import compare_topics
+        from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
 
-        topic_results = compare_topics(
-            texts_set_1=[model1_response],
-            texts_set_2=[model2_response],
-            n_topics=topic_count,
-            model_names=[model1_name, model2_name])
+        # Get all responses from dataset directory
+        all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
 
-        results["analyses"][prompt_text]["topic_modeling"] = topic_results
+        # Add current responses to the collection if they're not empty
+        if model1_response.strip():
+            all_model1_responses.append(model1_response)
+        if model2_response.strip():
+            all_model2_responses.append(model2_response)
+
+        # If we have data, perform topic modeling with all available responses
+        if all_model1_responses and all_model2_responses:
+            topic_results = compare_topics(
+                texts_set_1=all_model1_responses,
+                texts_set_2=all_model2_responses,
+                n_topics=topic_count,
+                model_names=[model1_name, model2_name])  # Keep original model names for output
+
+            results["analyses"][prompt_text]["topic_modeling"] = topic_results
+
+            # Add helpful message about using all datasets
+            results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
+        else:
+            # Fallback to original implementation if no data found
+            topic_results = compare_topics(
+                texts_set_1=[model1_response],
+                texts_set_2=[model2_response],
+                n_topics=topic_count,
+                model_names=[model1_name, model2_name])
+
+            results["analyses"][prompt_text]["topic_modeling"] = topic_results
 
         # Add helpful message if text is very short
         if (len(model1_response.split()) < 50 or len(model2_response.split()) < 50):
             if "error" not in topic_results:
                 # Add a warning message about short text
                 results["analyses"][prompt_text]["topic_modeling"]["warning"] = "One or both texts are relatively short. Topic modeling works best with longer texts."
-
+
     except Exception as e:
         import traceback
         print(f"Topic modeling error: {str(e)}\n{traceback.format_exc()}")
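For readers tracing where these values end up, the stored entry after a successful run would look roughly like the sketch below. The fields inside topic_results depend on what compare_topics returns and are omitted; only the "info" string (always set on the all-datasets path) and the optional "warning" string are added by this commit, and the counts shown are placeholders.

    # Rough shape of results["analyses"][prompt_text] after this change (sketch only;
    # inner topic fields come from compare_topics and are not shown).
    example_entry = {
        "topic_modeling": {
            # ...topics/weights/etc. as returned by compare_topics...
            "info": "Topic modeling performed using N responses from model 1 and N responses from model 2 for better results.",
            "warning": "One or both texts are relatively short. Topic modeling works best with longer texts.",  # only when either text has < 50 words
        }
    }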