Ryan committed on
Commit fe6b103 · 1 Parent(s): 42b16a4
Files changed (2)
  1. processors/topic_modeling.py +55 -10
  2. ui/analysis_screen.py +31 -8
processors/topic_modeling.py CHANGED
@@ -10,6 +10,54 @@ from nltk.stem import WordNetLemmatizer
 import re
 from scipy.spatial import distance
 
+def load_all_datasets_for_topic_modeling():
+    """
+    Load all dataset files and prepare them for topic modeling.
+
+    Returns:
+        tuple: (all_model1_responses, all_model2_responses, all_model_names)
+    """
+    import os
+    from pathlib import Path
+    from utils.text_dataset_parser import parse_text_file
+
+    dataset_dir = "dataset"
+    if not os.path.exists(dataset_dir):
+        print(f"Dataset directory '{dataset_dir}' not found")
+        return [], [], []
+
+    # Get all text files
+    path = Path(dataset_dir)
+    text_files = list(path.glob('*.txt'))
+
+    all_model1_responses = []
+    all_model2_responses = []
+    all_model_names = set()
+
+    for file_path in text_files:
+        try:
+            dataset = parse_text_file(str(file_path))
+
+            if dataset.get("response1") and dataset.get("response2"):
+                all_model1_responses.append(dataset.get("response1"))
+                all_model2_responses.append(dataset.get("response2"))
+
+                # Collect model names
+                if dataset.get("model1"):
+                    all_model_names.add(dataset.get("model1"))
+                if dataset.get("model2"):
+                    all_model_names.add(dataset.get("model2"))
+        except Exception as e:
+            print(f"Error loading dataset file {file_path}: {e}")
+
+    # Convert set to list for model names
+    model_names_list = list(all_model_names)
+    if len(model_names_list) < 2:
+        # If we couldn't find enough model names, use defaults
+        model_names_list = ["Model 1", "Model 2"]
+
+    return all_model1_responses, all_model2_responses, model_names_list
+
 def download_nltk_resources():
     """Download required NLTK resources if not already downloaded"""
     try:
@@ -35,7 +83,7 @@ def preprocess_text(text):
     # Convert to lowercase
     text = text.lower()
 
-    # Remove special characters and digits
+    # Remove special characters and digits but keep spaces (fixed regex)
     text = re.sub(r'[^a-zA-Z\s]', '', text)
 
     # Tokenize
@@ -44,20 +92,17 @@ def preprocess_text(text):
     # Remove stopwords
     stop_words = set(stopwords.words('english'))
 
-    # Add custom stopwords (common in political discourse but low information)
-    custom_stopwords = {'the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'on',
-                        'with', 'as', 'by', 'at', 'an', 'this', 'these', 'those', 'from',
-                        'or', 'not', 'be', 'are', 'it', 'was', 'were', 'been', 'being',
-                        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
-                        'would', 'should', 'could', 'might', 'will', 'shall', 'can', 'may',
-                        'political', 'generally', 'policy', 'policies', 'also'}
+    # Reduced custom stopwords list - keep more meaningful political terms
+    custom_stopwords = {'the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for',
+                        'with', 'as', 'by', 'at', 'an', 'this', 'these', 'those'}
 
     stop_words.update(custom_stopwords)
 
-    # Lemmatize tokens
+    # Lemmatize tokens - CHANGED from len(token) > 3 to len(token) > 2
+    # This keeps more meaningful short words like "tax", "war", "law", etc.
     lemmatizer = WordNetLemmatizer()
     tokens = [lemmatizer.lemmatize(token) for token in tokens
-              if token not in stop_words and len(token) > 3]
+              if token not in stop_words and len(token) > 2]
 
     return ' '.join(tokens)
 
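Note on the preprocessing change above: relaxing the token-length filter from len(token) > 3 to len(token) > 2 is what lets three-letter policy terms reach the topic model at all. A minimal standalone sketch of the effect (the sample tokens below are invented for illustration, not taken from the repo):

    # Illustration only: compare the old and new length filters on made-up tokens.
    tokens = ['tax', 'war', 'law', 'economy', 'of', 'by']
    stop_words = {'of', 'by'}
    old_kept = [t for t in tokens if t not in stop_words and len(t) > 3]  # old rule
    new_kept = [t for t in tokens if t not in stop_words and len(t) > 2]  # new rule
    print(old_kept)  # ['economy']
    print(new_kept)  # ['tax', 'war', 'law', 'economy']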
ui/analysis_screen.py CHANGED
@@ -286,22 +286,45 @@ def process_analysis_request(dataset, selected_analysis, parameters):
 
         try:
             # Import the enhanced topic modeling function
-            from processors.topic_modeling import compare_topics
+            from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
 
-            topic_results = compare_topics(
-                texts_set_1=[model1_response],
-                texts_set_2=[model2_response],
-                n_topics=topic_count,
-                model_names=[model1_name, model2_name])
+            # Get all responses from dataset directory
+            all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
 
-            results["analyses"][prompt_text]["topic_modeling"] = topic_results
+            # Add current responses to the collection if they're not empty
+            if model1_response.strip():
+                all_model1_responses.append(model1_response)
+            if model2_response.strip():
+                all_model2_responses.append(model2_response)
+
+            # If we have data, perform topic modeling with all available responses
+            if all_model1_responses and all_model2_responses:
+                topic_results = compare_topics(
+                    texts_set_1=all_model1_responses,
+                    texts_set_2=all_model2_responses,
+                    n_topics=topic_count,
+                    model_names=[model1_name, model2_name])  # Keep original model names for output
+
+                results["analyses"][prompt_text]["topic_modeling"] = topic_results
+
+                # Add helpful message about using all datasets
+                results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
+            else:
+                # Fallback to original implementation if no data found
+                topic_results = compare_topics(
+                    texts_set_1=[model1_response],
+                    texts_set_2=[model2_response],
+                    n_topics=topic_count,
+                    model_names=[model1_name, model2_name])
+
+                results["analyses"][prompt_text]["topic_modeling"] = topic_results
 
             # Add helpful message if text is very short
             if (len(model1_response.split()) < 50 or len(model2_response.split()) < 50):
                 if "error" not in topic_results:
                     # Add a warning message about short text
                     results["analyses"][prompt_text]["topic_modeling"]["warning"] = "One or both texts are relatively short. Topic modeling works best with longer texts."
-
+
         except Exception as e:
             import traceback
             print(f"Topic modeling error: {str(e)}\n{traceback.format_exc()}")
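For reference, a rough sketch of how the new aggregated flow in ui/analysis_screen.py might be exercised outside the UI, assuming the dataset directory and compare_topics behave as shown in the diffs above; the literal response strings and n_topics value here are placeholders, not taken from the repo:

    # Hypothetical driver; only compare_topics and load_all_datasets_for_topic_modeling
    # come from this repo, everything else is made up for illustration.
    from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling

    m1_texts, m2_texts, model_names = load_all_datasets_for_topic_modeling()

    # Mirror the UI path: append the current pair of responses if non-empty.
    m1_texts.append('Placeholder response from model 1 ...')
    m2_texts.append('Placeholder response from model 2 ...')

    if m1_texts and m2_texts:
        topic_results = compare_topics(
            texts_set_1=m1_texts,
            texts_set_2=m2_texts,
            n_topics=3,
            model_names=model_names[:2])
        print(f'Compared topics over {len(m1_texts)} and {len(m2_texts)} responses')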