Spaces:

RyanS974
/

525GradioApp

Sleeping

App Files Files Community

Ryan committed on Apr 22

Commit

765cde4

1 Parent(s): fe6b103

update

Browse files

Files changed (2) hide show

processors/topic_modeling.py +47 -22
ui/analysis_screen.py +23 -1

processors/topic_modeling.py CHANGED Viewed

@@ -13,6 +13,7 @@ from scipy.spatial import distance
 def load_all_datasets_for_topic_modeling():
     """
     Load all dataset files and prepare them for topic modeling.
     Returns:
         tuple: (all_model1_responses, all_model2_responses, all_model_names)
@@ -21,34 +22,56 @@ def load_all_datasets_for_topic_modeling():
     from pathlib import Path
     from utils.text_dataset_parser import parse_text_file
-    dataset_dir = "dataset"
-    if not os.path.exists(dataset_dir):
-        print(f"Dataset directory '{dataset_dir}' not found")
-        return [], [], []
-    # Get all text files
-    path = Path(dataset_dir)
-    text_files = list(path.glob('*.txt'))
     all_model1_responses = []
     all_model2_responses = []
     all_model_names = set()
-    for file_path in text_files:
-        try:
-            dataset = parse_text_file(str(file_path))
-            if dataset.get("response1") and dataset.get("response2"):
-                all_model1_responses.append(dataset.get("response1"))
-                all_model2_responses.append(dataset.get("response2"))
-                # Collect model names
-                if dataset.get("model1"):
-                    all_model_names.add(dataset.get("model1"))
-                if dataset.get("model2"):
-                    all_model_names.add(dataset.get("model2"))
-        except Exception as e:
-            print(f"Error loading dataset file {file_path}: {e}")
     # Convert set to list for model names
     model_names_list = list(all_model_names)
@@ -56,6 +79,8 @@ def load_all_datasets_for_topic_modeling():
         # If we couldn't find enough model names, use defaults
         model_names_list = ["Model 1", "Model 2"]
     return all_model1_responses, all_model2_responses, model_names_list
 def download_nltk_resources():

 def load_all_datasets_for_topic_modeling():
     """
     Load all dataset files and prepare them for topic modeling.
+    Uses multiple approaches to ensure files are found.
     Returns:
         tuple: (all_model1_responses, all_model2_responses, all_model_names)
     from pathlib import Path
     from utils.text_dataset_parser import parse_text_file
     all_model1_responses = []
     all_model2_responses = []
     all_model_names = set()
+    # APPROACH 1: Try loading specific known files
+    known_files = [
+        "person-harris.txt",
+        "person-trump.txt",
+        "topic-foreign_policy.txt",
+        "topic-the_economy.txt"
+    ]
+    # Try different possible paths
+    possible_paths = [
+        "dataset",
+        os.path.join(os.path.dirname(__file__), "..", "dataset"),
+        os.path.abspath("dataset")
+    ]
+    dataset_dir = None
+    for path in possible_paths:
+        if os.path.exists(path) and os.path.isdir(path):
+            dataset_dir = path
+            print(f"Found dataset directory at: {path}")
+            # Try to load each known file
+            for file_name in known_files:
+                file_path = os.path.join(path, file_name)
+                if os.path.exists(file_path):
+                    try:
+                        print(f"Loading known dataset: {file_name}")
+                        dataset = parse_text_file(file_path)
+                        if dataset.get("response1") and dataset.get("response2"):
+                            all_model1_responses.append(dataset.get("response1"))
+                            all_model2_responses.append(dataset.get("response2"))
+                            # Collect model names
+                            if dataset.get("model1"):
+                                all_model_names.add(dataset.get("model1"))
+                            if dataset.get("model2"):
+                                all_model_names.add(dataset.get("model2"))
+                            print(f"Successfully loaded {file_name}")
+                    except Exception as e:
+                        print(f"Error loading file {file_name}: {e}")
+            # We've found a dataset directory, no need to check other paths
+            break
     # Convert set to list for model names
     model_names_list = list(all_model_names)
         # If we couldn't find enough model names, use defaults
         model_names_list = ["Model 1", "Model 2"]
+    print(f"Total loaded: {len(all_model1_responses)} response1 entries and {len(all_model2_responses)} response2 entries")
     return all_model1_responses, all_model2_responses, model_names_list
 def download_nltk_resources():

ui/analysis_screen.py CHANGED Viewed

@@ -288,17 +288,29 @@ def process_analysis_request(dataset, selected_analysis, parameters):
             # Import the enhanced topic modeling function
             from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
             # Get all responses from dataset directory
             all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
             # Add current responses to the collection if they're not empty
             if model1_response.strip():
                 all_model1_responses.append(model1_response)
             if model2_response.strip():
                 all_model2_responses.append(model2_response)
             # If we have data, perform topic modeling with all available responses
             if all_model1_responses and all_model2_responses:
                 topic_results = compare_topics(
                     texts_set_1=all_model1_responses,
                     texts_set_2=all_model2_responses,
@@ -309,8 +321,17 @@ def process_analysis_request(dataset, selected_analysis, parameters):
                 # Add helpful message about using all datasets
                 results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
             else:
                 # Fallback to original implementation if no data found
                 topic_results = compare_topics(
                     texts_set_1=[model1_response],
                     texts_set_2=[model2_response],
@@ -327,7 +348,8 @@ def process_analysis_request(dataset, selected_analysis, parameters):
         except Exception as e:
             import traceback
-            print(f"Topic modeling error: {str(e)}\n{traceback.format_exc()}")
             results["analyses"][prompt_text]["topic_modeling"] = {
                 "models": [model1_name, model2_name],
                 "error": str(e),

             # Import the enhanced topic modeling function
             from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
+            print("Starting topic modeling analysis...")
             # Get all responses from dataset directory
             all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
             # Add current responses to the collection if they're not empty
             if model1_response.strip():
                 all_model1_responses.append(model1_response)
+                print(f"Added current model1 response ({len(model1_response.split())} words)")
             if model2_response.strip():
                 all_model2_responses.append(model2_response)
+                print(f"Added current model2 response ({len(model2_response.split())} words)")
+            # Ensure we're using all loaded responses
+            print(f"Using {len(all_model1_responses)} model1 responses and {len(all_model2_responses)} model2 responses")
             # If we have data, perform topic modeling with all available responses
             if all_model1_responses and all_model2_responses:
+                # Calculate total word count for diagnostics
+                total_words_model1 = sum(len(text.split()) for text in all_model1_responses)
+                total_words_model2 = sum(len(text.split()) for text in all_model2_responses)
+                print(f"Total words: Model1={total_words_model1}, Model2={total_words_model2}")
                 topic_results = compare_topics(
                     texts_set_1=all_model1_responses,
                     texts_set_2=all_model2_responses,
                 # Add helpful message about using all datasets
                 results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
+                # Add corpus details to help users understand the analysis
+                results["analyses"][prompt_text]["topic_modeling"]["corpus_stats"] = {
+                    "model1_documents": len(all_model1_responses),
+                    "model2_documents": len(all_model2_responses),
+                    "model1_total_words": total_words_model1,
+                    "model2_total_words": total_words_model2
+                }
             else:
                 # Fallback to original implementation if no data found
+                print("No dataset responses loaded, falling back to current responses only")
                 topic_results = compare_topics(
                     texts_set_1=[model1_response],
                     texts_set_2=[model2_response],
         except Exception as e:
             import traceback
+            error_trace = traceback.format_exc()
+            print(f"Topic modeling error: {str(e)}\n{error_trace}")
             results["analyses"][prompt_text]["topic_modeling"] = {
                 "models": [model1_name, model2_name],
                 "error": str(e),