Ryan committed on
Commit cc57712 · 1 Parent(s): 9a806ac
Files changed (3)
  1. .DS_Store +0 -0
  2. app.py +21 -25
  3. ui/analysis_screen.py +221 -41
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -559,31 +559,27 @@ def create_app():
             outputs=[llm_analysis_status, llm_analysis_result]
         )
 
-        # Visibility update functions - unchanged
-        def update_visibility(viz_visible, status_visible):
-            # ...existing code...
-
-        # Run analysis with proper parameters
-        run_analysis_btn.click(
-            fn=run_analysis,
-            inputs=[dataset_state, analysis_options, bow_top_slider, ngram_n, ngram_top, topic_count],
-            outputs=[
-                analysis_results_state,
-                analysis_output,
-                visualization_area_visible,
-                analysis_title,
-                prompt_title,
-                models_compared,
-                model1_title,
-                model1_words,
-                model2_title,
-                model2_words,
-                similarity_metrics_title,
-                similarity_metrics,
-                status_message_visible,
-                status_message
-            ]
-        )
+        # Run analysis with proper parameters
+        run_analysis_btn.click(
+            fn=run_analysis,
+            inputs=[dataset_state, analysis_options, bow_top_slider, ngram_n, ngram_top, topic_count],
+            outputs=[
+                analysis_results_state,
+                analysis_output,
+                visualization_area_visible,
+                analysis_title,
+                prompt_title,
+                models_compared,
+                model1_title,
+                model1_words,
+                model2_title,
+                model2_words,
+                similarity_metrics_title,
+                similarity_metrics,
+                status_message_visible,
+                status_message
+            ]
+        )
 
     return app
 
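The wiring above relies on the standard Gradio Blocks contract: the callback passed as `fn` must return one value per component listed in `outputs`, in the same order. A minimal, self-contained sketch of that pattern (illustrative only, not part of this commit; the component names `inp`, `btn`, `summary`, and `status` are made up for illustration):

```python
import gradio as gr

def toy_analysis(text):
    # Two components are wired as outputs below, so two values are returned, in order.
    word_count = len(text.split())
    return f"{word_count} words", gr.update(visible=True)

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    summary = gr.Textbox(label="Summary")
    status = gr.Markdown("Done", visible=False)
    btn.click(fn=toy_analysis, inputs=[inp], outputs=[summary, status])

if __name__ == "__main__":
    demo.launch()
```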
ui/analysis_screen.py CHANGED
@@ -3,13 +3,9 @@ import json
 from visualization.bow_visualizer import process_and_visualize_analysis
 
 # Import analysis modules
-from processors.topic_modeling import compare_topics  # Added import
+from processors.topic_modeling import compare_topics
 from processors.ngram_analysis import compare_ngrams
-# from processors.bias_detection import compare_bias
 from processors.bow_analysis import compare_bow
-# from processors.metrics import calculate_similarity
-# from processors.diff_highlighter import highlight_differences
-# Add this import at the top
 from processors.text_classifiers import classify_formality, classify_sentiment, classify_complexity, compare_classifications
 
 def create_analysis_screen():
@@ -31,8 +27,8 @@ def create_analysis_screen():
             "N-gram Analysis",
             "Topic Modeling",
             "Bias Detection",
-            "Classifier",  # New option for future development
-            "LLM Analysis"  # New option for future development
+            "Classifier"
+            # Removed "LLM Analysis" as requested
         ],
         value="Bag of Words",  # Default selection
         label="Select Analysis Type"
@@ -86,15 +82,10 @@ def create_analysis_screen():
             label="Bias Detection Methods"
         )
 
-        # Classifier parameters for future development
+        # Classifier parameters
         with gr.Group(visible=False) as classifier_params:
             gr.Markdown("### Classifier Parameters")
-            gr.Markdown("*Classifier options will be available in a future update*")
-
-        # LLM Analysis parameters for future development
-        with gr.Group(visible=False) as llm_params:
-            gr.Markdown("### LLM Analysis Parameters")
-            gr.Markdown("*LLM Analysis options will be available in a future update*")
+            gr.Markdown("Classifies responses based on formality, sentiment, and complexity")
 
         # Function to update parameter visibility based on selected analysis
         def update_params_visibility(selected):
@@ -103,7 +94,6 @@ def create_analysis_screen():
                 ngram_params: gr.update(visible=selected == "N-gram Analysis"),
                 bias_params: gr.update(visible=selected == "Bias Detection"),
                 classifier_params: gr.update(visible=selected == "Classifier"),
-                llm_params: gr.update(visible=selected == "LLM Analysis"),
                 ngram_n: gr.update(visible=selected == "N-gram Analysis"),
                 ngram_top: gr.update(visible=selected == "N-gram Analysis"),
                 topic_count: gr.update(visible=selected == "Topic Modeling"),
@@ -119,7 +109,6 @@ def create_analysis_screen():
                 ngram_params,
                 bias_params,
                 classifier_params,
-                llm_params,
                 ngram_n,
                 ngram_top,
                 topic_count,
@@ -136,6 +125,198 @@
     # Return the components needed by app.py
     return analysis_options, analysis_params, run_analysis_btn, analysis_output, bow_top_slider, ngram_n, ngram_top, topic_count
 
+# Add the implementation of these helper functions
+def extract_important_words(text, top_n=20):
+    """
+    Extract the most important words from a text.
+
+    Args:
+        text (str): Input text
+        top_n (int): Number of top words to return
+
+    Returns:
+        list: List of important words with their counts
+    """
+    # Import necessary modules
+    from collections import Counter
+    import re
+    import nltk
+    from nltk.corpus import stopwords
+    from nltk.tokenize import word_tokenize
+
+    # Make sure nltk resources are available
+    try:
+        stop_words = set(stopwords.words('english'))
+    except:
+        nltk.download('stopwords')
+        stop_words = set(stopwords.words('english'))
+
+    try:
+        tokens = word_tokenize(text.lower())
+    except:
+        nltk.download('punkt')
+        tokens = word_tokenize(text.lower())
+
+    # Remove stopwords and non-alphabetic tokens
+    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words and len(word) > 2]
+
+    # Count word frequencies
+    word_counts = Counter(filtered_tokens)
+
+    # Get the top N words
+    top_words = word_counts.most_common(top_n)
+
+    # Format the result
+    result = [{"word": word, "count": count} for word, count in top_words]
+
+    return result
+
+def calculate_text_similarity(text1, text2):
+    """
+    Calculate similarity metrics between two texts.
+
+    Args:
+        text1 (str): First text
+        text2 (str): Second text
+
+    Returns:
+        dict: Similarity metrics
+    """
+    from processors.metrics import calculate_similarity
+
+    # Calculate similarity using the metrics module
+    metrics = calculate_similarity(text1, text2)
+
+    # Add common word count
+    from collections import Counter
+    import nltk
+    from nltk.corpus import stopwords
+
+    # Make sure nltk resources are available
+    try:
+        stop_words = set(stopwords.words('english'))
+    except:
+        nltk.download('stopwords')
+        stop_words = set(stopwords.words('english'))
+
+    # Simple tokenization and filtering
+    words1 = set([w.lower() for w in nltk.word_tokenize(text1)
+                  if w.isalpha() and w.lower() not in stop_words])
+    words2 = set([w.lower() for w in nltk.word_tokenize(text2)
+                  if w.isalpha() and w.lower() not in stop_words])
+
+    # Calculate common words
+    common_words = words1.intersection(words2)
+
+    # Add to metrics
+    metrics["common_word_count"] = len(common_words)
+
+    return metrics
+
+def extract_ngrams(text, n=2, top_n=10):
+    """
+    Extract the most common n-grams from text.
+
+    Args:
+        text (str): Input text
+        n (int or str): Size of n-grams
+        top_n (int): Number of top n-grams to return
+
+    Returns:
+        list: List of important n-grams with their counts
+    """
+    import nltk
+    from nltk.util import ngrams
+    from collections import Counter
+
+    # Convert n to int if it's a string
+    if isinstance(n, str):
+        n = int(n)
+
+    # Make sure nltk resources are available
+    try:
+        tokens = nltk.word_tokenize(text.lower())
+    except:
+        nltk.download('punkt')
+        tokens = nltk.word_tokenize(text.lower())
+
+    # Generate n-grams
+    n_grams = list(ngrams(tokens, n))
+
+    # Convert n-grams to strings for easier handling
+    n_gram_strings = [' '.join(gram) for gram in n_grams]
+
+    # Count n-gram frequencies
+    n_gram_counts = Counter(n_gram_strings)
+
+    # Get the top N n-grams
+    top_n_grams = n_gram_counts.most_common(top_n)
+
+    # Format the result
+    result = [{"ngram": ngram, "count": count} for ngram, count in top_n_grams]
+
+    return result
+
+def compare_ngrams(text1, text2, n=2):
+    """
+    Compare n-grams between two texts.
+
+    Args:
+        text1 (str): First text
+        text2 (str): Second text
+        n (int or str): Size of n-grams
+
+    Returns:
+        dict: Comparison metrics
+    """
+    import nltk
+    from nltk.util import ngrams
+    from collections import Counter
+
+    # Convert n to int if it's a string
+    if isinstance(n, str):
+        n = int(n)
+
+    # Make sure nltk resources are available
+    try:
+        tokens1 = nltk.word_tokenize(text1.lower())
+        tokens2 = nltk.word_tokenize(text2.lower())
+    except:
+        nltk.download('punkt')
+        tokens1 = nltk.word_tokenize(text1.lower())
+        tokens2 = nltk.word_tokenize(text2.lower())
+
+    # Generate n-grams
+    n_grams1 = set([' '.join(gram) for gram in ngrams(tokens1, n)])
+    n_grams2 = set([' '.join(gram) for gram in ngrams(tokens2, n)])
+
+    # Calculate common n-grams
+    common_n_grams = n_grams1.intersection(n_grams2)
+
+    # Return comparison metrics
+    return {
+        "common_ngram_count": len(common_n_grams)
+    }
+
+def perform_topic_modeling(texts, model_names, n_topics=3):
+    """
+    Perform topic modeling on a list of texts.
+
+    Args:
+        texts (list): List of text documents
+        model_names (list): Names of the models
+        n_topics (int): Number of topics to extract
+
+    Returns:
+        dict: Topic modeling results
+    """
+    from processors.topic_modeling import compare_topics
+
+    # Use the topic modeling processor
+    result = compare_topics(texts, model_names, n_topics=n_topics)
+
+    return result
+
 # Process analysis request function
 def process_analysis_request(dataset, selected_analysis, parameters):
     """
@@ -172,41 +353,41 @@ def process_analysis_request(dataset, selected_analysis, parameters):
 
         # Process based on the selected analysis type
         if selected_analysis == "Bag of Words":
-            # Perform Bag of Words analysis
-            results["analyses"][prompt_text]["bag_of_words"] = {
-                "models": [model1_name, model2_name],
-                "important_words": {
-                    model1_name: extract_important_words(model1_response, top_n=parameters.get("bow_top", 20)),
-                    model2_name: extract_important_words(model2_response, top_n=parameters.get("bow_top", 20))
-                },
-                "comparisons": {
-                    f"{model1_name} vs {model2_name}": calculate_text_similarity(model1_response, model2_response)
-                }
-            }
+            # Perform Bag of Words analysis using the processor
+            bow_results = compare_bow(
+                [model1_response, model2_response],
+                [model1_name, model2_name],
+                top_n=parameters.get("bow_top", 25)
+            )
+            results["analyses"][prompt_text]["bag_of_words"] = bow_results
 
         elif selected_analysis == "N-gram Analysis":
             # Perform N-gram analysis
            ngram_size = parameters.get("ngram_n", 2)
+            if isinstance(ngram_size, str):
+                ngram_size = int(ngram_size)
+
             top_n = parameters.get("ngram_top", 15)
+            if isinstance(top_n, str):
+                top_n = int(top_n)
 
-            results["analyses"][prompt_text]["ngram_analysis"] = {
-                "models": [model1_name, model2_name],
-                "ngram_size": ngram_size,
-                "important_ngrams": {
-                    model1_name: extract_ngrams(model1_response, n=ngram_size, top_n=top_n),
-                    model2_name: extract_ngrams(model2_response, n=ngram_size, top_n=top_n)
-                },
-                "comparisons": {
-                    f"{model1_name} vs {model2_name}": compare_ngrams(model1_response, model2_response, n=ngram_size)
-                }
-            }
+            # Use the processor
+            ngram_results = compare_ngrams(
+                [model1_response, model2_response],
+                [model1_name, model2_name],
+                n=ngram_size,
+                top_n=top_n
+            )
+            results["analyses"][prompt_text]["ngram_analysis"] = ngram_results
 
         elif selected_analysis == "Topic Modeling":
             # Perform topic modeling analysis
             topic_count = parameters.get("topic_count", 3)
+            if isinstance(topic_count, str):
+                topic_count = int(topic_count)
 
             try:
-                topic_results = perform_topic_modeling(
+                topic_results = compare_topics(
                     [model1_response, model2_response],
                     model_names=[model1_name, model2_name],
                     n_topics=topic_count
@@ -223,7 +404,7 @@ def process_analysis_request(dataset, selected_analysis, parameters):
                 }
 
         elif selected_analysis == "Classifier":
-            # Perform classifier analysis (placeholder implementation)
+            # Perform classifier analysis
            results["analyses"][prompt_text]["classifier"] = {
                 "models": [model1_name, model2_name],
                 "classifications": {
@@ -247,4 +428,3 @@ def process_analysis_request(dataset, selected_analysis, parameters):
 
     # Return both the analysis results and a placeholder for visualization data
     return results, None
-
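The helper functions added in this commit each wrap their NLTK calls in a try/except and call `nltk.download(...)` when a resource is missing. A compact sketch of an alternative pattern that performs the same resource check once up front (illustrative only, not what this commit does; it assumes NLTK is installed):

```python
import nltk

# Ensure the tokenizer and stopword data are present before any analysis runs.
# nltk.data.find raises LookupError when a resource has not been downloaded yet.
for resource, path in [("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource, quiet=True)
```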