Ryan committed on
Commit fe6b103 · 1 Parent(s): 42b16a4
Files changed (2)
  1. processors/topic_modeling.py +55 -10
  2. ui/analysis_screen.py +31 -8
processors/topic_modeling.py CHANGED
@@ -10,6 +10,54 @@ from nltk.stem import WordNetLemmatizer
 import re
 from scipy.spatial import distance
 
+def load_all_datasets_for_topic_modeling():
+    """
+    Load all dataset files and prepare them for topic modeling.
+
+    Returns:
+        tuple: (all_model1_responses, all_model2_responses, all_model_names)
+    """
+    import os
+    from pathlib import Path
+    from utils.text_dataset_parser import parse_text_file
+
+    dataset_dir = "dataset"
+    if not os.path.exists(dataset_dir):
+        print(f"Dataset directory '{dataset_dir}' not found")
+        return [], [], []
+
+    # Get all text files
+    path = Path(dataset_dir)
+    text_files = list(path.glob('*.txt'))
+
+    all_model1_responses = []
+    all_model2_responses = []
+    all_model_names = set()
+
+    for file_path in text_files:
+        try:
+            dataset = parse_text_file(str(file_path))
+
+            if dataset.get("response1") and dataset.get("response2"):
+                all_model1_responses.append(dataset.get("response1"))
+                all_model2_responses.append(dataset.get("response2"))
+
+                # Collect model names
+                if dataset.get("model1"):
+                    all_model_names.add(dataset.get("model1"))
+                if dataset.get("model2"):
+                    all_model_names.add(dataset.get("model2"))
+        except Exception as e:
+            print(f"Error loading dataset file {file_path}: {e}")
+
+    # Convert set to list for model names
+    model_names_list = list(all_model_names)
+    if len(model_names_list) < 2:
+        # If we couldn't find enough model names, use defaults
+        model_names_list = ["Model 1", "Model 2"]
+
+    return all_model1_responses, all_model2_responses, model_names_list
+
 def download_nltk_resources():
     """Download required NLTK resources if not already downloaded"""
     try:
@@ -35,7 +83,7 @@ def preprocess_text(text):
     # Convert to lowercase
     text = text.lower()
 
-    # Remove special characters and digits
+    # Remove special characters and digits but keep spaces (fixed regex)
     text = re.sub(r'[^a-zA-Z\s]', '', text)
 
     # Tokenize
@@ -44,20 +92,17 @@ def preprocess_text(text):
     # Remove stopwords
     stop_words = set(stopwords.words('english'))
 
-    # Add custom stopwords (common in political discourse but low information)
-    custom_stopwords = {'the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'on',
-                        'with', 'as', 'by', 'at', 'an', 'this', 'these', 'those', 'from',
-                        'or', 'not', 'be', 'are', 'it', 'was', 'were', 'been', 'being',
-                        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
-                        'would', 'should', 'could', 'might', 'will', 'shall', 'can', 'may',
-                        'political', 'generally', 'policy', 'policies', 'also'}
+    # Reduced custom stopwords list - keep more meaningful political terms
+    custom_stopwords = {'the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for',
+                        'with', 'as', 'by', 'at', 'an', 'this', 'these', 'those'}
 
     stop_words.update(custom_stopwords)
 
-    # Lemmatize tokens
+    # Lemmatize tokens - CHANGED from len(token) > 3 to len(token) > 2
+    # This keeps more meaningful short words like "tax", "war", "law", etc.
     lemmatizer = WordNetLemmatizer()
     tokens = [lemmatizer.lemmatize(token) for token in tokens
-              if token not in stop_words and len(token) > 3]
+              if token not in stop_words and len(token) > 2]
 
     return ' '.join(tokens)
 
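Note on the preprocessing change above: relaxing the token-length filter from len(token) > 3 to len(token) > 2 is what lets three-letter policy terms reach the topic model at all. A minimal standalone sketch of the effect (the sample tokens below are invented for illustration, not taken from the repo):

    # Illustration only: compare the old and new length filters on made-up tokens.
    tokens = ['tax', 'war', 'law', 'economy', 'of', 'by']
    stop_words = {'of', 'by'}
    old_kept = [t for t in tokens if t not in stop_words and len(t) > 3]  # old rule
    new_kept = [t for t in tokens if t not in stop_words and len(t) > 2]  # new rule
    print(old_kept)  # ['economy']
    print(new_kept)  # ['tax', 'war', 'law', 'economy']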
ui/analysis_screen.py CHANGED
@@ -286,22 +286,45 @@ def process_analysis_request(dataset, selected_analysis, parameters):
 
         try:
             # Import the enhanced topic modeling function
-            from processors.topic_modeling import compare_topics
+            from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling
 
-            topic_results = compare_topics(
-                texts_set_1=[model1_response],
-                texts_set_2=[model2_response],
-                n_topics=topic_count,
-                model_names=[model1_name, model2_name])
+            # Get all responses from dataset directory
+            all_model1_responses, all_model2_responses, dataset_model_names = load_all_datasets_for_topic_modeling()
 
-            results["analyses"][prompt_text]["topic_modeling"] = topic_results
+            # Add current responses to the collection if they're not empty
+            if model1_response.strip():
+                all_model1_responses.append(model1_response)
+            if model2_response.strip():
+                all_model2_responses.append(model2_response)
+
+            # If we have data, perform topic modeling with all available responses
+            if all_model1_responses and all_model2_responses:
+                topic_results = compare_topics(
+                    texts_set_1=all_model1_responses,
+                    texts_set_2=all_model2_responses,
+                    n_topics=topic_count,
+                    model_names=[model1_name, model2_name])  # Keep original model names for output
+
+                results["analyses"][prompt_text]["topic_modeling"] = topic_results
+
+                # Add helpful message about using all datasets
+                results["analyses"][prompt_text]["topic_modeling"]["info"] = f"Topic modeling performed using {len(all_model1_responses)} responses from model 1 and {len(all_model2_responses)} responses from model 2 for better results."
+            else:
+                # Fallback to original implementation if no data found
+                topic_results = compare_topics(
+                    texts_set_1=[model1_response],
+                    texts_set_2=[model2_response],
+                    n_topics=topic_count,
+                    model_names=[model1_name, model2_name])
+
+                results["analyses"][prompt_text]["topic_modeling"] = topic_results
 
             # Add helpful message if text is very short
             if (len(model1_response.split()) < 50 or len(model2_response.split()) < 50):
                 if "error" not in topic_results:
                     # Add a warning message about short text
                     results["analyses"][prompt_text]["topic_modeling"]["warning"] = "One or both texts are relatively short. Topic modeling works best with longer texts."
-
+
         except Exception as e:
             import traceback
             print(f"Topic modeling error: {str(e)}\n{traceback.format_exc()}")
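For reference, a rough sketch of how the new aggregated flow in ui/analysis_screen.py might be exercised outside the UI, assuming the dataset directory and compare_topics behave as shown in the diffs above; the literal response strings and n_topics value here are placeholders, not taken from the repo:

    # Hypothetical driver; only compare_topics and load_all_datasets_for_topic_modeling
    # come from this repo, everything else is made up for illustration.
    from processors.topic_modeling import compare_topics, load_all_datasets_for_topic_modeling

    m1_texts, m2_texts, model_names = load_all_datasets_for_topic_modeling()

    # Mirror the UI path: append the current pair of responses if non-empty.
    m1_texts.append('Placeholder response from model 1 ...')
    m2_texts.append('Placeholder response from model 2 ...')

    if m1_texts and m2_texts:
        topic_results = compare_topics(
            texts_set_1=m1_texts,
            texts_set_2=m2_texts,
            n_topics=3,
            model_names=model_names[:2])
        print(f'Compared topics over {len(m1_texts)} and {len(m2_texts)} responses')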