Commit 3651cd4 · Parent(s): 117dd64
Fix progressive loading and UI button activation issues:
1) Fix KeyError: 'jaccard' in metrics computation
2) Fix 'Markdown' object has no attribute 'update' warning
3) Improve fuzzy matching method descriptions
4) Fix Structural Analysis button activation
5) Add consistent tooltip styling for all tabs
Files changed:
- app.py (+41 −14)
- pipeline/process.py (+19 −6)
- pipeline/progressive_ui.py (+20 −2)
app.py CHANGED

@@ -116,7 +116,7 @@ def main_interface():
                 "ratio - Simple ratio matching"
             ],
             value="token_set - Order-independent matching",
-            info="Select the fuzzy matching algorithm to use.
+            info="Select the fuzzy matching algorithm to use:\n\n• token_set: Best for texts with different word orders and partial overlaps. Compares unique words regardless of their order (recommended for Tibetan texts).\n\n• token_sort: Good for texts with different word orders but similar content. Sorts words alphabetically before comparing.\n\n• partial: Best for finding shorter strings within longer ones. Useful when one text is a fragment of another.\n\n• ratio: Simple Levenshtein distance ratio. Best for detecting small edits and typos in otherwise identical texts."
         )
 
         process_btn = gr.Button(
@@ -144,6 +144,7 @@ def main_interface():
             "Run Structural Analysis (time-consuming)",
             variant="secondary",
             interactive=False,
+            elem_id="structural-btn"
         )
 
         # LLM Interpretation components
@@ -226,9 +227,34 @@ Key points:
 - Context-aware embeddings capture nuanced meanings and relationships.
 - Designed for sentence/segment-level representations, not just words.
 - Works well alongside Jaccard and LCS for a holistic view.
+- Stopword filtering: When enabled, common Tibetan particles and function words are filtered before embedding to focus on content-bearing terms.
+""",
+        "Word Counts": """
+### Word Counts per Segment
+This chart displays the number of words in each segment of your texts after tokenization.
+
+The word count is calculated after applying the selected tokenization and stopword filtering options. This visualization helps you understand the relative sizes of different text segments and can reveal patterns in text structure across your documents.
 
-
+**Key points**:
+- Longer bars indicate segments with more words
+- Segments are grouped by source document
+- Useful for identifying structural patterns and content distribution
+- Can help explain similarity metric variations (longer texts may show different patterns)
 """,
+        "Structural Analysis": """
+### Structural Analysis
+This advanced analysis examines the structural relationships between text segments across your documents. It identifies patterns of similarity and difference that may indicate textual dependencies, common sources, or editorial modifications.
+
+The structural analysis combines multiple similarity metrics to create a comprehensive view of how text segments relate to each other, highlighting potential stemmatic relationships and textual transmission patterns.
+
+**Key points**:
+- Identifies potential source-target relationships between texts
+- Visualizes text reuse patterns across segments
+- Helps reconstruct possible stemmatic relationships
+- Provides insights into textual transmission and editorial history
+
+**Note**: This analysis is computationally intensive and only available after the initial metrics calculation is complete.
+"""
 
     }
     heatmap_tabs = {}
@@ -275,22 +301,21 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
     # Add the appropriate plot
     if metric_key == "Word Counts":
         word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
-    else:
-        heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
 
-    # Structural Analysis Tab
+    # Structural Analysis Tab
     with gr.Tab("Structural Analysis"):
+        with gr.Accordion("ℹ️ About this metric", open=False, elem_classes="metric-info-accordion structural-info"):
+            if "Structural Analysis" in metric_tooltips:
+                gr.Markdown(value=metric_tooltips["Structural Analysis"], elem_classes="metric-description")
+            else:
+                gr.Markdown(value="### Structural Analysis\nDescription not found.")
+
         gr.Markdown("""
         ### Structural Analysis for Tibetan Legal Manuscripts
 
-        This
-
-        **Features:**
-        - **Differential Highlighting**: Highlights significant textual variations
-        - **Per-Chapter Analysis**: Detailed comparison for each chapter pair
+        This analysis identifies potential source-target relationships between text segments, helping to reconstruct stemmatic relationships.
 
-
-        Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
+        Click the "Run Structural Analysis" button below after computing the basic metrics to perform this advanced analysis.
         """)
 
         # Structural analysis outputs
@@ -355,7 +380,8 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
         semantic_heatmap=heatmap_tabs["Semantic Similarity"],
         warning_box=warning_box,
         progress_container=progress_container,
-        heatmap_titles=heatmap_titles
+        heatmap_titles=heatmap_titles,
+        structural_btn=structural_btn
     )
 
     # Make progress container visible during analysis
@@ -515,9 +541,10 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
     word_count_fig_res = generate_word_count_chart(word_counts_df_data)
 
     # Enable structural analysis button and store states for deferred run
-    structural_btn_update_res = gr.update(interactive=True)
+    structural_btn_update_res = gr.update(interactive=True, value="Run Structural Analysis (time-consuming)")
     state_text_data_res = text_data
    state_df_results_res = df_results
+    logger.info("Enabling structural analysis button")
 
    # Save results to CSV
    if progress is not None:
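Note on fix 3: the four options described in the expanded `info` tooltip correspond to the standard fuzzy-matching scorers. A minimal sketch of how they differ, assuming the rapidfuzz library (the app's actual matching backend is not shown in this diff):

```python
# Illustrative comparison of the four fuzzy matching modes named in the
# tooltip; assumes rapidfuzz, which exposes scorers under these names.
from rapidfuzz import fuzz

a = "king decreed the law"
b = "the law decreed king"

print(fuzz.ratio(a, b))             # plain Levenshtein ratio: low, order matters
print(fuzz.partial_ratio(a, b))     # best-matching substring alignment
print(fuzz.token_sort_ratio(a, b))  # sorts tokens before comparing: 100 here
print(fuzz.token_set_ratio(a, b))   # compares unique token sets: 100 here
```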
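Note on fix 4: the button-activation change spans several hunks — the button gains an `elem_id`, is handed to `ProgressiveUI`, and the processing callback returns a `gr.update` targeting it. A stripped-down, self-contained sketch of that wiring, with illustrative names:

```python
import gradio as gr

with gr.Blocks() as demo:
    process_btn = gr.Button("Process Texts")
    # Disabled at startup, like the structural analysis button in app.py.
    structural_btn = gr.Button("Run Structural Analysis (time-consuming)",
                               variant="secondary", interactive=False,
                               elem_id="structural-btn")
    status = gr.Markdown()

    def run_metrics():
        # ... compute the fast metrics here ...
        # Returning gr.update(interactive=True) in the button's output slot
        # is what re-enables it once the fast metrics are done.
        return "Metrics complete.", gr.update(interactive=True)

    process_btn.click(run_metrics, outputs=[status, structural_btn])

demo.launch()
```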
pipeline/process.py CHANGED

@@ -302,7 +302,7 @@ def process_texts(
 
     try:
         # Compute metrics for this chapter pair
-
+        metrics_df = compute_all_metrics(
             texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
             token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
             model=model,
@@ -313,6 +313,19 @@ def process_texts(
             use_lite_stopwords=use_lite_stopwords,
         )
 
+        # Extract metrics from the DataFrame (should have only one row)
+        if not metrics_df.empty:
+            pair_metrics = metrics_df.iloc[0].to_dict()
+        else:
+            # Handle empty DataFrame case
+            logger.error(f"No metrics computed for {seg1} vs {seg2}")
+            pair_metrics = {
+                "Jaccard Similarity (%)": 0.0,
+                "Normalized LCS": 0.0,
+                "Fuzzy Similarity": 0.0 if enable_fuzzy else np.nan,
+                "Semantic Similarity": 0.0 if enable_semantic else np.nan
+            }
+
         # Format the results
         text_pair = f"{file1} vs {file2}"
         chapter_num = idx + 1
@@ -320,17 +333,17 @@ def process_texts(
         result_row = {
             "Text Pair": text_pair,
             "Chapter": chapter_num,
-            "Jaccard Similarity (%)": pair_metrics["
-            "Normalized LCS": pair_metrics["
+            "Jaccard Similarity (%)": pair_metrics["Jaccard Similarity (%)"],  # Already in percentage
+            "Normalized LCS": pair_metrics["Normalized LCS"],
         }
 
         # Add fuzzy similarity if enabled
         if enable_fuzzy:
-            result_row["Fuzzy Similarity"] = pair_metrics["
+            result_row["Fuzzy Similarity"] = pair_metrics["Fuzzy Similarity"]
 
         # Add semantic similarity if enabled and available
-        if enable_semantic and "
-        result_row["Semantic Similarity"] = pair_metrics["
+        if enable_semantic and "Semantic Similarity" in pair_metrics:
+            result_row["Semantic Similarity"] = pair_metrics["Semantic Similarity"]
 
         results.append(result_row)
 
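Note on fix 1: `compute_all_metrics` returns a one-row DataFrame whose columns already carry the display names, so the row is flattened to a dict before any key lookup — this is what removes the old `KeyError: 'jaccard'`. The extraction-and-fallback logic, isolated as a runnable sketch (the helper name is illustrative):

```python
import numpy as np
import pandas as pd

def metrics_row_to_dict(metrics_df: pd.DataFrame,
                        enable_fuzzy: bool,
                        enable_semantic: bool) -> dict:
    """Flatten a one-row metrics DataFrame to a plain dict, falling back to
    zeroed values when no metrics were computed (illustrative helper)."""
    if not metrics_df.empty:
        # iloc[0] selects the single row; to_dict() keys it by column name,
        # so later lookups use display names like "Jaccard Similarity (%)".
        return metrics_df.iloc[0].to_dict()
    return {
        "Jaccard Similarity (%)": 0.0,
        "Normalized LCS": 0.0,
        "Fuzzy Similarity": 0.0 if enable_fuzzy else np.nan,
        "Semantic Similarity": 0.0 if enable_semantic else np.nan,
    }

# Example: a one-row frame shaped like compute_all_metrics() is expected to return.
df = pd.DataFrame([{"Jaccard Similarity (%)": 42.0, "Normalized LCS": 0.31}])
print(metrics_row_to_dict(df, enable_fuzzy=False, enable_semantic=False))
```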
pipeline/progressive_ui.py CHANGED

@@ -31,7 +31,8 @@ class ProgressiveUI:
                  semantic_heatmap: gr.Plot,
                  warning_box: gr.Markdown,
                  progress_container: gr.Row,
-                 heatmap_titles: Dict[str, str]
+                 heatmap_titles: Dict[str, str],
+                 structural_btn=None):
         """
         Initialize the ProgressiveUI.
 
@@ -55,6 +56,7 @@ class ProgressiveUI:
         self.warning_box = warning_box
         self.progress_container = progress_container
         self.heatmap_titles = heatmap_titles
+        self.structural_btn = structural_btn
 
         # Create progress indicators for each metric
         with self.progress_container:
@@ -160,6 +162,11 @@ class ProgressiveUI:
             updates[self.semantic_progress] = "⏳ **Semantic Similarity:** In progress..."
         if self.word_count_plot not in self.updated_components:
             updates[self.word_count_progress] = "⏳ **Word Counts:** In progress..."
+        else:
+            # If computation is complete, enable structural button if available
+            if self.structural_btn is not None:
+                updates[self.structural_btn] = gr.update(interactive=True)
+                logger.info("Enabling structural analysis button via progressive UI")
 
         return updates
 
@@ -202,6 +209,17 @@ def create_progressive_callback(progressive_ui: ProgressiveUI) -> Callable:
 
         # Apply updates to UI components
         for component, value in updates.items():
-
+            try:
+                # Handle different component types appropriately
+                if isinstance(component, gr.Markdown):
+                    # For Markdown components, directly set the value
+                    component.value = value
+                elif hasattr(component, 'update'):
+                    # For components with update method
+                    component.update(value=value)
+                else:
+                    logger.warning(f"Cannot update component of type {type(component)}")
+            except Exception as e:
+                logger.warning(f"Error updating component: {e}")
 
     return callback
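Note on fix 2: `gr.Markdown` instances expose no `update()` method, which produced the warning named in the commit message; the callback now assigns `.value` directly for Markdown and falls back to `update()` for components that have it. The dispatch rule as a standalone sketch (logger setup assumed; the helper name is illustrative):

```python
import logging
import gradio as gr

logger = logging.getLogger(__name__)

def apply_component_updates(updates: dict) -> None:
    """Apply {component: value} updates defensively (illustrative helper).

    gr.Markdown raises AttributeError on .update() — the warning this
    commit silences — so Markdown is handled by assigning .value.
    """
    for component, value in updates.items():
        try:
            if isinstance(component, gr.Markdown):
                component.value = value            # no update() on Markdown
            elif hasattr(component, "update"):
                component.update(value=value)      # components that support it
            else:
                logger.warning("Cannot update component of type %s", type(component))
        except Exception as e:
            logger.warning("Error updating component: %s", e)
```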