Spaces:
Sleeping
Sleeping
Ryan
committed on
Commit
·
765cde4
1
Parent(s):
fe6b103
update
Browse files
- processors/topic_modeling.py +47 -22
- ui/analysis_screen.py +23 -1
processors/topic_modeling.py
CHANGED
@@ -13,6 +13,7 @@ from scipy.spatial import distance
|
|
13 |
def load_all_datasets_for_topic_modeling():
|
14 |
"""
|
15 |
Load all dataset files and prepare them for topic modeling.
|
|
|
16 |
|
17 |
Returns:
|
18 |
tuple: (all_model1_responses, all_model2_responses, all_model_names)
|
@@ -21,34 +22,56 @@ def load_all_datasets_for_topic_modeling():
|
|
21 |
from pathlib import Path
|
22 |
from utils.text_dataset_parser import parse_text_file
|
23 |
|
24 |
-
dataset_dir = "dataset"
|
25 |
-
if not os.path.exists(dataset_dir):
|
26 |
-
print(f"Dataset directory '{dataset_dir}' not found")
|
27 |
-
return [], [], []
|
28 |
-
|
29 |
-
# Get all text files
|
30 |
-
path = Path(dataset_dir)
|
31 |
-
text_files = list(path.glob('*.txt'))
|
32 |
-
|
33 |
all_model1_responses = []
|
34 |
all_model2_responses = []
|
35 |
all_model_names = set()
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# Convert set to list for model names
|
54 |
model_names_list = list(all_model_names)
|
@@ -56,6 +79,8 @@ def load_all_datasets_for_topic_modeling():
|
|
56 |
# If we couldn't find enough model names, use defaults
|
57 |
model_names_list = ["Model 1", "Model 2"]
|
58 |
|
|
|
|
|
59 |
return all_model1_responses, all_model2_responses, model_names_list
|
60 |
|
61 |
def download_nltk_resources():
|
|
|
13 |
def load_all_datasets_for_topic_modeling():
|
14 |
"""
|
15 |
Load all dataset files and prepare them for topic modeling.
|
16 |
+
Uses multiple approaches to ensure files are found.
|
17 |
|
18 |
Returns:
|
19 |
tuple: (all_model1_responses, all_model2_responses, all_model_names)
|
|
|
22 |
from pathlib import Path
|
23 |
from utils.text_dataset_parser import parse_text_file
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
all_model1_responses = []
|
26 |
all_model2_responses = []
|
27 |
all_model_names = set()
|
28 |
|
29 |
+
# APPROACH 1: Try loading specific known files
|
30 |
+
known_files = [
|
31 |
+
"person-harris.txt",
|
32 |
+
"person-trump.txt",
|
33 |
+
"topic-foreign_policy.txt",
|
34 |
+
"topic-the_economy.txt"
|
35 |
+
]
|
36 |
+
|
37 |
+
# Try different possible paths
|
38 |
+
possible_paths = [
|
39 |
+
"dataset",
|
40 |
+
os.path.join(os.path.dirname(__file__), "..", "dataset"),
|
41 |
+
os.path.abspath("dataset")
|
42 |
+
]
|
43 |
+
|
44 |
+
dataset_dir = None
|
45 |
+
for path in possible_paths:
|
46 |
+
if os.path.exists(path) and os.path.isdir(path):
|
47 |
+
dataset_dir = path
|
48 |
+
print(f"Found dataset directory at: {path}")
|
49 |
|
50 |
+
# Try to load each known file
|
51 |
+
for file_name in known_files:
|
52 |
+
file_path = os.path.join(path, file_name)
|
53 |
|
54 |
+
if os.path.exists(file_path):
|
55 |
+
try:
|
56 |
+
print(f"Loading known dataset: {file_name}")
|
57 |
+
dataset = parse_text_file(file_path)
|
58 |
+
|
59 |
+
if dataset.get("response1") and dataset.get("response2"):
|
60 |
+
all_model1_responses.append(dataset.get("response1"))
|
61 |
+
all_model2_responses.append(dataset.get("response2"))
|
62 |
+
|
63 |
+
# Collect model names
|
64 |
+
if dataset.get("model1"):
|
65 |
+
all_model_names.add(dataset.get("model1"))
|
66 |
+
if dataset.get("model2"):
|
67 |
+
all_model_names.add(dataset.get("model2"))
|
68 |
+
|
69 |
+
print(f"Successfully loaded {file_name}")
|
70 |
+
except Exception as e:
|
71 |
+
print(f"Error loading file {file_name}: {e}")
|
72 |
+
|
73 |
+
# We've found a dataset directory, no need to check other paths
|
74 |
+
break
|
75 |
|
76 |
# Convert set to list for model names
|
77 |
model_names_list = list(all_model_names)
|
|
|
79 |
# If we couldn't find enough model names, use defaults
|
80 |
model_names_list = ["Model 1", "Model 2"]
|
81 |
|
82 |
+
print(f"Total loaded: {len(all_model1_responses)} response1 entries and {len(all_model2_responses)} response2 entries")
|
83 |
+
|
84 |
return all_model1_responses, all_model2_responses, model_names_list
|
85 |
|
86 |
def download_nltk_resources():
|
ui/analysis_screen.py
CHANGED
@@ -288,17 +288,29 @@ def process_analysis_request(dataset, selected_analysis, parameters):
|
|
288 |
# Import the enhanced topic modeling function
|
289 |
from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
|
290 |
|
|
|
|
|
291 |
# Get all responses from dataset directory
|
292 |
all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
|
293 |
|
294 |
# Add current responses to the collection if they're not empty
|
295 |
if model1_response.strip():
|
296 |
all_model1_responses.append(model1_response)
|
|
|
297 |
if model2_response.strip():
|
298 |
all_model2_responses.append(model2_response)
|
|
|
|
|
|
|
|
|
299 |
|
300 |
# If we have data, perform topic modeling with all available responses
|
301 |
if all_model1_responses and all_model2_responses:
|
|
|
|
|
|
|
|
|
|
|
302 |
topic_results = compare_topics(
|
303 |
texts_set_1=all_model1_responses,
|
304 |
texts_set_2=all_model2_responses,
|
@@ -309,8 +321,17 @@ def process_analysis_request(dataset, selected_analysis, parameters):
|
|
309 |
|
310 |
# Add helpful message about using all datasets
|
311 |
results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
312 |
else:
|
313 |
# Fallback to original implementation if no data found
|
|
|
314 |
topic_results = compare_topics(
|
315 |
texts_set_1=[model1_response],
|
316 |
texts_set_2=[model2_response],
|
@@ -327,7 +348,8 @@ def process_analysis_request(dataset, selected_analysis, parameters):
|
|
327 |
|
328 |
except Exception as e:
|
329 |
import traceback
|
330 |
-
|
|
|
331 |
results["analyses"][prompt_text]["topic_modeling"] = {
|
332 |
"models": [model1_name, model2_name],
|
333 |
"error": str(e),
|
|
|
288 |
# Import the enhanced topic modeling function
|
289 |
from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
|
290 |
|
291 |
+
print("Starting topic modeling analysis...")
|
292 |
+
|
293 |
# Get all responses from dataset directory
|
294 |
all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
|
295 |
|
296 |
# Add current responses to the collection if they're not empty
|
297 |
if model1_response.strip():
|
298 |
all_model1_responses.append(model1_response)
|
299 |
+
print(f"Added current model1 response ({len(model1_response.split())} words)")
|
300 |
if model2_response.strip():
|
301 |
all_model2_responses.append(model2_response)
|
302 |
+
print(f"Added current model2 response ({len(model2_response.split())} words)")
|
303 |
+
|
304 |
+
# Ensure we're using all loaded responses
|
305 |
+
print(f"Using {len(all_model1_responses)} model1 responses and {len(all_model2_responses)} model2 responses")
|
306 |
|
307 |
# If we have data, perform topic modeling with all available responses
|
308 |
if all_model1_responses and all_model2_responses:
|
309 |
+
# Calculate total word count for diagnostics
|
310 |
+
total_words_model1 = sum(len(text.split()) for text in all_model1_responses)
|
311 |
+
total_words_model2 = sum(len(text.split()) for text in all_model2_responses)
|
312 |
+
print(f"Total words: Model1={total_words_model1}, Model2={total_words_model2}")
|
313 |
+
|
314 |
topic_results = compare_topics(
|
315 |
texts_set_1=all_model1_responses,
|
316 |
texts_set_2=all_model2_responses,
|
|
|
321 |
|
322 |
# Add helpful message about using all datasets
|
323 |
results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
|
324 |
+
|
325 |
+
# Add corpus details to help users understand the analysis
|
326 |
+
results["analyses"][prompt_text]["topic_modeling"]["corpus_stats"] = {
|
327 |
+
"model1_documents": len(all_model1_responses),
|
328 |
+
"model2_documents": len(all_model2_responses),
|
329 |
+
"model1_total_words": total_words_model1,
|
330 |
+
"model2_total_words": total_words_model2
|
331 |
+
}
|
332 |
else:
|
333 |
# Fallback to original implementation if no data found
|
334 |
+
print("No dataset responses loaded, falling back to current responses only")
|
335 |
topic_results = compare_topics(
|
336 |
texts_set_1=[model1_response],
|
337 |
texts_set_2=[model2_response],
|
|
|
348 |
|
349 |
except Exception as e:
|
350 |
import traceback
|
351 |
+
error_trace = traceback.format_exc()
|
352 |
+
print(f"Topic modeling error: {str(e)}\n{error_trace}")
|
353 |
results["analyses"][prompt_text]["topic_modeling"] = {
|
354 |
"models": [model1_name, model2_name],
|
355 |
"error": str(e),
|