Ryan committed on
Commit
765cde4
·
1 Parent(s): fe6b103
Files changed (2) hide show
  1. processors/topic_modeling.py +47 -22
  2. ui/analysis_screen.py +23 -1
processors/topic_modeling.py CHANGED
@@ -13,6 +13,7 @@ from scipy.spatial import distance
13
  def load_all_datasets_for_topic_modeling():
14
  """
15
  Load all dataset files and prepare them for topic modeling.
 
16
 
17
  Returns:
18
  tuple: (all_model1_responses, all_model2_responses, all_model_names)
@@ -21,34 +22,56 @@ def load_all_datasets_for_topic_modeling():
21
  from pathlib import Path
22
  from utils.text_dataset_parser import parse_text_file
23
 
24
- dataset_dir = "dataset"
25
- if not os.path.exists(dataset_dir):
26
- print(f"Dataset directory '{dataset_dir}' not found")
27
- return [], [], []
28
-
29
- # Get all text files
30
- path = Path(dataset_dir)
31
- text_files = list(path.glob('*.txt'))
32
-
33
  all_model1_responses = []
34
  all_model2_responses = []
35
  all_model_names = set()
36
 
37
- for file_path in text_files:
38
- try:
39
- dataset = parse_text_file(str(file_path))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- if dataset.get("response1") and dataset.get("response2"):
42
- all_model1_responses.append(dataset.get("response1"))
43
- all_model2_responses.append(dataset.get("response2"))
44
 
45
- # Collect model names
46
- if dataset.get("model1"):
47
- all_model_names.add(dataset.get("model1"))
48
- if dataset.get("model2"):
49
- all_model_names.add(dataset.get("model2"))
50
- except Exception as e:
51
- print(f"Error loading dataset file {file_path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  # Convert set to list for model names
54
  model_names_list = list(all_model_names)
@@ -56,6 +79,8 @@ def load_all_datasets_for_topic_modeling():
56
  # If we couldn't find enough model names, use defaults
57
  model_names_list = ["Model 1", "Model 2"]
58
 
 
 
59
  return all_model1_responses, all_model2_responses, model_names_list
60
 
61
  def download_nltk_resources():
 
13
def load_all_datasets_for_topic_modeling():
    """
    Load all dataset files and prepare them for topic modeling.
    Uses multiple approaches to ensure files are found.

    Searches several candidate locations for the dataset directory (relative
    working directory, project root relative to this file, absolute path),
    then loads every ``*.txt`` dataset file found in the first directory that
    exists — not just a hard-coded list of file names, so newly added dataset
    files are picked up automatically.

    Returns:
        tuple: (all_model1_responses, all_model2_responses, model_names_list)
            The first two are lists of response strings; the last is a list
            of model names, defaulting to ["Model 1", "Model 2"] when fewer
            than two distinct names were found in the datasets.
    """
    import os
    from pathlib import Path
    from utils.text_dataset_parser import parse_text_file

    all_model1_responses = []
    all_model2_responses = []
    all_model_names = set()

    # Candidate locations for the dataset directory. Duplicates (e.g.
    # "dataset" vs. os.path.abspath("dataset")) are removed while preserving
    # order so each location is probed at most once.
    candidates = [
        "dataset",
        os.path.join(os.path.dirname(__file__), "..", "dataset"),
        os.path.abspath("dataset"),
    ]
    seen = set()
    possible_paths = []
    for candidate in candidates:
        resolved = os.path.abspath(candidate)
        if resolved not in seen:
            seen.add(resolved)
            possible_paths.append(candidate)

    for path in possible_paths:
        if os.path.isdir(path):
            print(f"Found dataset directory at: {path}")

            # Load every text dataset in the directory (sorted for a
            # deterministic load order) rather than a fixed list of names,
            # so new dataset files are included without code changes.
            for file_path in sorted(Path(path).glob("*.txt")):
                try:
                    print(f"Loading known dataset: {file_path.name}")
                    dataset = parse_text_file(str(file_path))

                    if dataset.get("response1") and dataset.get("response2"):
                        all_model1_responses.append(dataset.get("response1"))
                        all_model2_responses.append(dataset.get("response2"))

                        # Collect model names
                        if dataset.get("model1"):
                            all_model_names.add(dataset.get("model1"))
                        if dataset.get("model2"):
                            all_model_names.add(dataset.get("model2"))

                        print(f"Successfully loaded {file_path.name}")
                except Exception as e:
                    # Best-effort: one malformed file must not abort loading
                    # of the remaining datasets.
                    print(f"Error loading file {file_path.name}: {e}")

            # We've found a dataset directory, no need to check other paths
            break

    # Convert set to list for model names
    model_names_list = list(all_model_names)
    if len(model_names_list) < 2:
        # If we couldn't find enough model names, use defaults
        model_names_list = ["Model 1", "Model 2"]

    print(f"Total loaded: {len(all_model1_responses)} response1 entries and {len(all_model2_responses)} response2 entries")

    return all_model1_responses, all_model2_responses, model_names_list
85
 
86
  def download_nltk_resources():
ui/analysis_screen.py CHANGED
@@ -288,17 +288,29 @@ def process_analysis_request(dataset, selected_analysis, parameters):
288
  # Import the enhanced topic modeling function
289
  from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
290
 
 
 
291
  # Get all responses from dataset directory
292
  all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
293
 
294
  # Add current responses to the collection if they're not empty
295
  if model1_response.strip():
296
  all_model1_responses.append(model1_response)
 
297
  if model2_response.strip():
298
  all_model2_responses.append(model2_response)
 
 
 
 
299
 
300
  # If we have data, perform topic modeling with all available responses
301
  if all_model1_responses and all_model2_responses:
 
 
 
 
 
302
  topic_results = compare_topics(
303
  texts_set_1=all_model1_responses,
304
  texts_set_2=all_model2_responses,
@@ -309,8 +321,17 @@ def process_analysis_request(dataset, selected_analysis, parameters):
309
 
310
  # Add helpful message about using all datasets
311
  results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
 
 
 
 
 
 
 
 
312
  else:
313
  # Fallback to original implementation if no data found
 
314
  topic_results = compare_topics(
315
  texts_set_1=[model1_response],
316
  texts_set_2=[model2_response],
@@ -327,7 +348,8 @@ def process_analysis_request(dataset, selected_analysis, parameters):
327
 
328
  except Exception as e:
329
  import traceback
330
- print(f"Topic modeling error: {str(e)}\n{traceback.format_exc()}")
 
331
  results["analyses"][prompt_text]["topic_modeling"] = {
332
  "models": [model1_name, model2_name],
333
  "error": str(e),
 
288
  # Import the enhanced topic modeling function
289
  from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
290
 
291
+ print("Starting topic modeling analysis...")
292
+
293
  # Get all responses from dataset directory
294
  all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
295
 
296
  # Add current responses to the collection if they're not empty
297
  if model1_response.strip():
298
  all_model1_responses.append(model1_response)
299
+ print(f"Added current model1 response ({len(model1_response.split())} words)")
300
  if model2_response.strip():
301
  all_model2_responses.append(model2_response)
302
+ print(f"Added current model2 response ({len(model2_response.split())} words)")
303
+
304
+ # Ensure we're using all loaded responses
305
+ print(f"Using {len(all_model1_responses)} model1 responses and {len(all_model2_responses)} model2 responses")
306
 
307
  # If we have data, perform topic modeling with all available responses
308
  if all_model1_responses and all_model2_responses:
309
+ # Calculate total word count for diagnostics
310
+ total_words_model1 = sum(len(text.split()) for text in all_model1_responses)
311
+ total_words_model2 = sum(len(text.split()) for text in all_model2_responses)
312
+ print(f"Total words: Model1={total_words_model1}, Model2={total_words_model2}")
313
+
314
  topic_results = compare_topics(
315
  texts_set_1=all_model1_responses,
316
  texts_set_2=all_model2_responses,
 
321
 
322
  # Add helpful message about using all datasets
323
  results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
324
+
325
+ # Add corpus details to help users understand the analysis
326
+ results["analyses"][prompt_text]["topic_modeling"]["corpus_stats"] = {
327
+ "model1_documents": len(all_model1_responses),
328
+ "model2_documents": len(all_model2_responses),
329
+ "model1_total_words": total_words_model1,
330
+ "model2_total_words": total_words_model2
331
+ }
332
  else:
333
  # Fallback to original implementation if no data found
334
+ print("No dataset responses loaded, falling back to current responses only")
335
  topic_results = compare_topics(
336
  texts_set_1=[model1_response],
337
  texts_set_2=[model2_response],
 
348
 
349
  except Exception as e:
350
  import traceback
351
+ error_trace = traceback.format_exc()
352
+ print(f"Topic modeling error: {str(e)}\n{error_trace}")
353
  results["analyses"][prompt_text]["topic_modeling"] = {
354
  "models": [model1_name, model2_name],
355
  "error": str(e),