daniel-wojahn committed on
Commit
3651cd4
·
1 Parent(s): 117dd64

Fix progressive loading and UI button activation issues: 1) Fix KeyError: 'jaccard' in metrics computation 2) Fix 'Markdown' object has no attribute 'update' warning 3) Improve fuzzy matching method descriptions 4) Fix Structural Analysis button activation 5) Add consistent tooltip styling for all tabs

Browse files
Files changed (3) hide show
  1. app.py +41 -14
  2. pipeline/process.py +19 -6
  3. pipeline/progressive_ui.py +20 -2
app.py CHANGED
@@ -116,7 +116,7 @@ def main_interface():
116
  "ratio - Simple ratio matching"
117
  ],
118
  value="token_set - Order-independent matching",
119
- info="Select the fuzzy matching algorithm to use. Token set works best for Tibetan text with word order variations."
120
  )
121
 
122
  process_btn = gr.Button(
@@ -144,6 +144,7 @@ def main_interface():
144
  "Run Structural Analysis (time-consuming)",
145
  variant="secondary",
146
  interactive=False,
 
147
  )
148
 
149
  # LLM Interpretation components
@@ -226,9 +227,34 @@ Key points:
226
  - Context-aware embeddings capture nuanced meanings and relationships.
227
  - Designed for sentence/segment-level representations, not just words.
228
  - Works well alongside Jaccard and LCS for a holistic view.
 
 
 
 
 
 
 
229
 
230
- Stopword filtering: When enabled, common Tibetan particles and function words are filtered before embedding to focus on content-bearing terms.
 
 
 
 
231
  """,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
  }
234
  heatmap_tabs = {}
@@ -275,22 +301,21 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
275
  # Add the appropriate plot
276
  if metric_key == "Word Counts":
277
  word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
278
- else:
279
- heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
280
 
281
- # Structural Analysis Tab (moved after metric tabs so it appears after Word Counts)
282
  with gr.Tab("Structural Analysis"):
 
 
 
 
 
 
283
  gr.Markdown("""
284
  ### Structural Analysis for Tibetan Legal Manuscripts
285
 
286
- This tab provides detailed chapter-level structural analysis for Tibetan legal manuscript comparison.
287
-
288
- **Features:**
289
- - **Differential Highlighting**: Highlights significant textual variations
290
- - **Per-Chapter Analysis**: Detailed comparison for each chapter pair
291
 
292
- **Usage:**
293
- Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
294
  """)
295
 
296
  # Structural analysis outputs
@@ -355,7 +380,8 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
355
  semantic_heatmap=heatmap_tabs["Semantic Similarity"],
356
  warning_box=warning_box,
357
  progress_container=progress_container,
358
- heatmap_titles=heatmap_titles
 
359
  )
360
 
361
  # Make progress container visible during analysis
@@ -515,9 +541,10 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
515
  word_count_fig_res = generate_word_count_chart(word_counts_df_data)
516
 
517
  # Enable structural analysis button and store states for deferred run
518
- structural_btn_update_res = gr.update(interactive=True)
519
  state_text_data_res = text_data
520
  state_df_results_res = df_results
 
521
 
522
  # Save results to CSV
523
  if progress is not None:
 
116
  "ratio - Simple ratio matching"
117
  ],
118
  value="token_set - Order-independent matching",
119
+ info="Select the fuzzy matching algorithm to use:\n\n• token_set: Best for texts with different word orders and partial overlaps. Compares unique words regardless of their order (recommended for Tibetan texts).\n\n• token_sort: Good for texts with different word orders but similar content. Sorts words alphabetically before comparing.\n\n• partial: Best for finding shorter strings within longer ones. Useful when one text is a fragment of another.\n\n• ratio: Simple Levenshtein distance ratio. Best for detecting small edits and typos in otherwise identical texts."
120
  )
121
 
122
  process_btn = gr.Button(
 
144
  "Run Structural Analysis (time-consuming)",
145
  variant="secondary",
146
  interactive=False,
147
+ elem_id="structural-btn"
148
  )
149
 
150
  # LLM Interpretation components
 
227
  - Context-aware embeddings capture nuanced meanings and relationships.
228
  - Designed for sentence/segment-level representations, not just words.
229
  - Works well alongside Jaccard and LCS for a holistic view.
230
+ - Stopword filtering: When enabled, common Tibetan particles and function words are filtered before embedding to focus on content-bearing terms.
231
+ """,
232
+ "Word Counts": """
233
+ ### Word Counts per Segment
234
+ This chart displays the number of words in each segment of your texts after tokenization.
235
+
236
+ The word count is calculated after applying the selected tokenization and stopword filtering options. This visualization helps you understand the relative sizes of different text segments and can reveal patterns in text structure across your documents.
237
 
238
+ **Key points**:
239
+ - Longer bars indicate segments with more words
240
+ - Segments are grouped by source document
241
+ - Useful for identifying structural patterns and content distribution
242
+ - Can help explain similarity metric variations (longer texts may show different patterns)
243
  """,
244
+ "Structural Analysis": """
245
+ ### Structural Analysis
246
+ This advanced analysis examines the structural relationships between text segments across your documents. It identifies patterns of similarity and difference that may indicate textual dependencies, common sources, or editorial modifications.
247
+
248
+ The structural analysis combines multiple similarity metrics to create a comprehensive view of how text segments relate to each other, highlighting potential stemmatic relationships and textual transmission patterns.
249
+
250
+ **Key points**:
251
+ - Identifies potential source-target relationships between texts
252
+ - Visualizes text reuse patterns across segments
253
+ - Helps reconstruct possible stemmatic relationships
254
+ - Provides insights into textual transmission and editorial history
255
+
256
+ **Note**: This analysis is computationally intensive and only available after the initial metrics calculation is complete.
257
+ """
258
 
259
  }
260
  heatmap_tabs = {}
 
301
  # Add the appropriate plot
302
  if metric_key == "Word Counts":
303
  word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
 
 
304
 
305
+ # Structural Analysis Tab
306
  with gr.Tab("Structural Analysis"):
307
+ with gr.Accordion("ℹ️ About this metric", open=False, elem_classes="metric-info-accordion structural-info"):
308
+ if "Structural Analysis" in metric_tooltips:
309
+ gr.Markdown(value=metric_tooltips["Structural Analysis"], elem_classes="metric-description")
310
+ else:
311
+ gr.Markdown(value="### Structural Analysis\nDescription not found.")
312
+
313
  gr.Markdown("""
314
  ### Structural Analysis for Tibetan Legal Manuscripts
315
 
316
+ This analysis identifies potential source-target relationships between text segments, helping to reconstruct stemmatic relationships.
 
 
 
 
317
 
318
+ Click the "Run Structural Analysis" button below after computing the basic metrics to perform this advanced analysis.
 
319
  """)
320
 
321
  # Structural analysis outputs
 
380
  semantic_heatmap=heatmap_tabs["Semantic Similarity"],
381
  warning_box=warning_box,
382
  progress_container=progress_container,
383
+ heatmap_titles=heatmap_titles,
384
+ structural_btn=structural_btn
385
  )
386
 
387
  # Make progress container visible during analysis
 
541
  word_count_fig_res = generate_word_count_chart(word_counts_df_data)
542
 
543
  # Enable structural analysis button and store states for deferred run
544
+ structural_btn_update_res = gr.update(interactive=True, value="Run Structural Analysis (time-consuming)")
545
  state_text_data_res = text_data
546
  state_df_results_res = df_results
547
+ logger.info("Enabling structural analysis button")
548
 
549
  # Save results to CSV
550
  if progress is not None:
pipeline/process.py CHANGED
@@ -302,7 +302,7 @@ def process_texts(
302
 
303
  try:
304
  # Compute metrics for this chapter pair
305
- pair_metrics = compute_all_metrics(
306
  texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
307
  token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
308
  model=model,
@@ -313,6 +313,19 @@ def process_texts(
313
  use_lite_stopwords=use_lite_stopwords,
314
  )
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  # Format the results
317
  text_pair = f"{file1} vs {file2}"
318
  chapter_num = idx + 1
@@ -320,17 +333,17 @@ def process_texts(
320
  result_row = {
321
  "Text Pair": text_pair,
322
  "Chapter": chapter_num,
323
- "Jaccard Similarity (%)": pair_metrics["jaccard"] * 100, # Convert to percentage
324
- "Normalized LCS": pair_metrics["lcs"],
325
  }
326
 
327
  # Add fuzzy similarity if enabled
328
  if enable_fuzzy:
329
- result_row["Fuzzy Similarity"] = pair_metrics["fuzzy"]
330
 
331
  # Add semantic similarity if enabled and available
332
- if enable_semantic and "semantic" in pair_metrics:
333
- result_row["Semantic Similarity"] = pair_metrics["semantic"]
334
 
335
  results.append(result_row)
336
 
 
302
 
303
  try:
304
  # Compute metrics for this chapter pair
305
+ metrics_df = compute_all_metrics(
306
  texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
307
  token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
308
  model=model,
 
313
  use_lite_stopwords=use_lite_stopwords,
314
  )
315
 
316
+ # Extract metrics from the DataFrame (should have only one row)
317
+ if not metrics_df.empty:
318
+ pair_metrics = metrics_df.iloc[0].to_dict()
319
+ else:
320
+ # Handle empty DataFrame case
321
+ logger.error(f"No metrics computed for {seg1} vs {seg2}")
322
+ pair_metrics = {
323
+ "Jaccard Similarity (%)": 0.0,
324
+ "Normalized LCS": 0.0,
325
+ "Fuzzy Similarity": 0.0 if enable_fuzzy else np.nan,
326
+ "Semantic Similarity": 0.0 if enable_semantic else np.nan
327
+ }
328
+
329
  # Format the results
330
  text_pair = f"{file1} vs {file2}"
331
  chapter_num = idx + 1
 
333
  result_row = {
334
  "Text Pair": text_pair,
335
  "Chapter": chapter_num,
336
+ "Jaccard Similarity (%)": pair_metrics["Jaccard Similarity (%)"], # Already in percentage
337
+ "Normalized LCS": pair_metrics["Normalized LCS"],
338
  }
339
 
340
  # Add fuzzy similarity if enabled
341
  if enable_fuzzy:
342
+ result_row["Fuzzy Similarity"] = pair_metrics["Fuzzy Similarity"]
343
 
344
  # Add semantic similarity if enabled and available
345
+ if enable_semantic and "Semantic Similarity" in pair_metrics:
346
+ result_row["Semantic Similarity"] = pair_metrics["Semantic Similarity"]
347
 
348
  results.append(result_row)
349
 
pipeline/progressive_ui.py CHANGED
@@ -31,7 +31,8 @@ class ProgressiveUI:
31
  semantic_heatmap: gr.Plot,
32
  warning_box: gr.Markdown,
33
  progress_container: gr.Row,
34
- heatmap_titles: Dict[str, str]):
 
35
  """
36
  Initialize the ProgressiveUI.
37
 
@@ -55,6 +56,7 @@ class ProgressiveUI:
55
  self.warning_box = warning_box
56
  self.progress_container = progress_container
57
  self.heatmap_titles = heatmap_titles
 
58
 
59
  # Create progress indicators for each metric
60
  with self.progress_container:
@@ -160,6 +162,11 @@ class ProgressiveUI:
160
  updates[self.semantic_progress] = "⏳ **Semantic Similarity:** In progress..."
161
  if self.word_count_plot not in self.updated_components:
162
  updates[self.word_count_progress] = "⏳ **Word Counts:** In progress..."
 
 
 
 
 
163
 
164
  return updates
165
 
@@ -202,6 +209,17 @@ def create_progressive_callback(progressive_ui: ProgressiveUI) -> Callable:
202
 
203
  # Apply updates to UI components
204
  for component, value in updates.items():
205
- component.update(value=value)
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  return callback
 
31
  semantic_heatmap: gr.Plot,
32
  warning_box: gr.Markdown,
33
  progress_container: gr.Row,
34
+ heatmap_titles: Dict[str, str],
35
+ structural_btn=None):
36
  """
37
  Initialize the ProgressiveUI.
38
 
 
56
  self.warning_box = warning_box
57
  self.progress_container = progress_container
58
  self.heatmap_titles = heatmap_titles
59
+ self.structural_btn = structural_btn
60
 
61
  # Create progress indicators for each metric
62
  with self.progress_container:
 
162
  updates[self.semantic_progress] = "⏳ **Semantic Similarity:** In progress..."
163
  if self.word_count_plot not in self.updated_components:
164
  updates[self.word_count_progress] = "⏳ **Word Counts:** In progress..."
165
+ else:
166
+ # If computation is complete, enable structural button if available
167
+ if self.structural_btn is not None:
168
+ updates[self.structural_btn] = gr.update(interactive=True)
169
+ logger.info("Enabling structural analysis button via progressive UI")
170
 
171
  return updates
172
 
 
209
 
210
  # Apply updates to UI components
211
  for component, value in updates.items():
212
+ try:
213
+ # Handle different component types appropriately
214
+ if isinstance(component, gr.Markdown):
215
+ # For Markdown components, directly set the value
216
+ component.value = value
217
+ elif hasattr(component, 'update'):
218
+ # For components with update method
219
+ component.update(value=value)
220
+ else:
221
+ logger.warning(f"Cannot update component of type {type(component)}")
222
+ except Exception as e:
223
+ logger.warning(f"Error updating component: {e}")
224
 
225
  return callback