Commit 3651cd4 · Parent(s): 117dd64
Fix progressive loading and UI button activation issues:
1) Fix KeyError: 'jaccard' in metrics computation
2) Fix 'Markdown' object has no attribute 'update' warning
3) Improve fuzzy matching method descriptions
4) Fix Structural Analysis button activation
5) Add consistent tooltip styling for all tabs
Files changed:
- app.py (+41 −14)
- pipeline/process.py (+19 −6)
- pipeline/progressive_ui.py (+20 −2)
app.py CHANGED

@@ -116,7 +116,7 @@ def main_interface():
                 "ratio - Simple ratio matching"
             ],
             value="token_set - Order-independent matching",
-            info="Select the fuzzy matching algorithm to use.
+            info="Select the fuzzy matching algorithm to use:\n\n• token_set: Best for texts with different word orders and partial overlaps. Compares unique words regardless of their order (recommended for Tibetan texts).\n\n• token_sort: Good for texts with different word orders but similar content. Sorts words alphabetically before comparing.\n\n• partial: Best for finding shorter strings within longer ones. Useful when one text is a fragment of another.\n\n• ratio: Simple Levenshtein distance ratio. Best for detecting small edits and typos in otherwise identical texts."
         )
 
         process_btn = gr.Button(
@@ -144,6 +144,7 @@ def main_interface():
             "Run Structural Analysis (time-consuming)",
             variant="secondary",
             interactive=False,
+            elem_id="structural-btn"
         )
 
         # LLM Interpretation components
@@ -226,9 +227,34 @@ Key points:
 - Context-aware embeddings capture nuanced meanings and relationships.
 - Designed for sentence/segment-level representations, not just words.
 - Works well alongside Jaccard and LCS for a holistic view.
+- Stopword filtering: When enabled, common Tibetan particles and function words are filtered before embedding to focus on content-bearing terms.
+""",
+        "Word Counts": """
+### Word Counts per Segment
+This chart displays the number of words in each segment of your texts after tokenization.
+
+The word count is calculated after applying the selected tokenization and stopword filtering options. This visualization helps you understand the relative sizes of different text segments and can reveal patterns in text structure across your documents.
 
-
+**Key points**:
+- Longer bars indicate segments with more words
+- Segments are grouped by source document
+- Useful for identifying structural patterns and content distribution
+- Can help explain similarity metric variations (longer texts may show different patterns)
 """,
+        "Structural Analysis": """
+### Structural Analysis
+This advanced analysis examines the structural relationships between text segments across your documents. It identifies patterns of similarity and difference that may indicate textual dependencies, common sources, or editorial modifications.
+
+The structural analysis combines multiple similarity metrics to create a comprehensive view of how text segments relate to each other, highlighting potential stemmatic relationships and textual transmission patterns.
+
+**Key points**:
+- Identifies potential source-target relationships between texts
+- Visualizes text reuse patterns across segments
+- Helps reconstruct possible stemmatic relationships
+- Provides insights into textual transmission and editorial history
+
+**Note**: This analysis is computationally intensive and only available after the initial metrics calculation is complete.
+"""
 
     }
     heatmap_tabs = {}
@@ -275,22 +301,21 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
     # Add the appropriate plot
     if metric_key == "Word Counts":
         word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
-    else:
-        heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
 
-    # Structural Analysis Tab
+    # Structural Analysis Tab
     with gr.Tab("Structural Analysis"):
+        with gr.Accordion("ℹ️ About this metric", open=False, elem_classes="metric-info-accordion structural-info"):
+            if "Structural Analysis" in metric_tooltips:
+                gr.Markdown(value=metric_tooltips["Structural Analysis"], elem_classes="metric-description")
+            else:
+                gr.Markdown(value="### Structural Analysis\nDescription not found.")
+
         gr.Markdown("""
         ### Structural Analysis for Tibetan Legal Manuscripts
 
-        This
-
-        **Features:**
-        - **Differential Highlighting**: Highlights significant textual variations
-        - **Per-Chapter Analysis**: Detailed comparison for each chapter pair
+        This analysis identifies potential source-target relationships between text segments, helping to reconstruct stemmatic relationships.
 
-
-        Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
+        Click the "Run Structural Analysis" button below after computing the basic metrics to perform this advanced analysis.
         """)
 
         # Structural analysis outputs
@@ -355,7 +380,8 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
         semantic_heatmap=heatmap_tabs["Semantic Similarity"],
         warning_box=warning_box,
         progress_container=progress_container,
-        heatmap_titles=heatmap_titles
+        heatmap_titles=heatmap_titles,
+        structural_btn=structural_btn
     )
 
     # Make progress container visible during analysis
@@ -515,9 +541,10 @@ Stopword filtering: When enabled, common Tibetan particles and function words ar
     word_count_fig_res = generate_word_count_chart(word_counts_df_data)
 
     # Enable structural analysis button and store states for deferred run
-    structural_btn_update_res = gr.update(interactive=True)
+    structural_btn_update_res = gr.update(interactive=True, value="Run Structural Analysis (time-consuming)")
     state_text_data_res = text_data
    state_df_results_res = df_results
+    logger.info("Enabling structural analysis button")
 
    # Save results to CSV
    if progress is not None:
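Note on fix 3: the four options described in the expanded `info` tooltip correspond to the standard fuzzy-matching scorers. A minimal sketch of how they differ, assuming the rapidfuzz library (the app's actual matching backend is not shown in this diff):

```python
# Illustrative comparison of the four fuzzy matching modes named in the
# tooltip; assumes rapidfuzz, which exposes scorers under these names.
from rapidfuzz import fuzz

a = "king decreed the law"
b = "the law decreed king"

print(fuzz.ratio(a, b))             # plain Levenshtein ratio: low, order matters
print(fuzz.partial_ratio(a, b))     # best-matching substring alignment
print(fuzz.token_sort_ratio(a, b))  # sorts tokens before comparing: 100 here
print(fuzz.token_set_ratio(a, b))   # compares unique token sets: 100 here
```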
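Note on fix 4: the button-activation change spans several hunks — the button gains an `elem_id`, is handed to `ProgressiveUI`, and the processing callback returns a `gr.update` targeting it. A stripped-down, self-contained sketch of that wiring, with illustrative names:

```python
import gradio as gr

with gr.Blocks() as demo:
    process_btn = gr.Button("Process Texts")
    # Disabled at startup, like the structural analysis button in app.py.
    structural_btn = gr.Button("Run Structural Analysis (time-consuming)",
                               variant="secondary", interactive=False,
                               elem_id="structural-btn")
    status = gr.Markdown()

    def run_metrics():
        # ... compute the fast metrics here ...
        # Returning gr.update(interactive=True) in the button's output slot
        # is what re-enables it once the fast metrics are done.
        return "Metrics complete.", gr.update(interactive=True)

    process_btn.click(run_metrics, outputs=[status, structural_btn])

demo.launch()
```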
pipeline/process.py CHANGED

@@ -302,7 +302,7 @@ def process_texts(
 
     try:
         # Compute metrics for this chapter pair
-
+        metrics_df = compute_all_metrics(
             texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
             token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
             model=model,
@@ -313,6 +313,19 @@ def process_texts(
             use_lite_stopwords=use_lite_stopwords,
         )
 
+        # Extract metrics from the DataFrame (should have only one row)
+        if not metrics_df.empty:
+            pair_metrics = metrics_df.iloc[0].to_dict()
+        else:
+            # Handle empty DataFrame case
+            logger.error(f"No metrics computed for {seg1} vs {seg2}")
+            pair_metrics = {
+                "Jaccard Similarity (%)": 0.0,
+                "Normalized LCS": 0.0,
+                "Fuzzy Similarity": 0.0 if enable_fuzzy else np.nan,
+                "Semantic Similarity": 0.0 if enable_semantic else np.nan
+            }
+
         # Format the results
         text_pair = f"{file1} vs {file2}"
         chapter_num = idx + 1
@@ -320,17 +333,17 @@ def process_texts(
         result_row = {
             "Text Pair": text_pair,
             "Chapter": chapter_num,
-            "Jaccard Similarity (%)": pair_metrics["
-            "Normalized LCS": pair_metrics["
+            "Jaccard Similarity (%)": pair_metrics["Jaccard Similarity (%)"],  # Already in percentage
+            "Normalized LCS": pair_metrics["Normalized LCS"],
         }
 
         # Add fuzzy similarity if enabled
         if enable_fuzzy:
-            result_row["Fuzzy Similarity"] = pair_metrics["
+            result_row["Fuzzy Similarity"] = pair_metrics["Fuzzy Similarity"]
 
         # Add semantic similarity if enabled and available
-        if enable_semantic and "
-        result_row["Semantic Similarity"] = pair_metrics["
+        if enable_semantic and "Semantic Similarity" in pair_metrics:
+            result_row["Semantic Similarity"] = pair_metrics["Semantic Similarity"]
 
         results.append(result_row)
 
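Note on fix 1: `compute_all_metrics` returns a one-row DataFrame whose columns already carry the display names, so the row is flattened to a dict before any key lookup — this is what removes the old `KeyError: 'jaccard'`. The extraction-and-fallback logic, isolated as a runnable sketch (the helper name is illustrative):

```python
import numpy as np
import pandas as pd

def metrics_row_to_dict(metrics_df: pd.DataFrame,
                        enable_fuzzy: bool,
                        enable_semantic: bool) -> dict:
    """Flatten a one-row metrics DataFrame to a plain dict, falling back to
    zeroed values when no metrics were computed (illustrative helper)."""
    if not metrics_df.empty:
        # iloc[0] selects the single row; to_dict() keys it by column name,
        # so later lookups use display names like "Jaccard Similarity (%)".
        return metrics_df.iloc[0].to_dict()
    return {
        "Jaccard Similarity (%)": 0.0,
        "Normalized LCS": 0.0,
        "Fuzzy Similarity": 0.0 if enable_fuzzy else np.nan,
        "Semantic Similarity": 0.0 if enable_semantic else np.nan,
    }

# Example: a one-row frame shaped like compute_all_metrics() is expected to return.
df = pd.DataFrame([{"Jaccard Similarity (%)": 42.0, "Normalized LCS": 0.31}])
print(metrics_row_to_dict(df, enable_fuzzy=False, enable_semantic=False))
```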
pipeline/progressive_ui.py CHANGED

@@ -31,7 +31,8 @@ class ProgressiveUI:
                  semantic_heatmap: gr.Plot,
                  warning_box: gr.Markdown,
                  progress_container: gr.Row,
-                 heatmap_titles: Dict[str, str]
+                 heatmap_titles: Dict[str, str],
+                 structural_btn=None):
         """
         Initialize the ProgressiveUI.
 
@@ -55,6 +56,7 @@ class ProgressiveUI:
         self.warning_box = warning_box
         self.progress_container = progress_container
         self.heatmap_titles = heatmap_titles
+        self.structural_btn = structural_btn
 
         # Create progress indicators for each metric
         with self.progress_container:
@@ -160,6 +162,11 @@ class ProgressiveUI:
             updates[self.semantic_progress] = "⏳ **Semantic Similarity:** In progress..."
         if self.word_count_plot not in self.updated_components:
             updates[self.word_count_progress] = "⏳ **Word Counts:** In progress..."
+        else:
+            # If computation is complete, enable structural button if available
+            if self.structural_btn is not None:
+                updates[self.structural_btn] = gr.update(interactive=True)
+                logger.info("Enabling structural analysis button via progressive UI")
 
         return updates
 
@@ -202,6 +209,17 @@ def create_progressive_callback(progressive_ui: ProgressiveUI) -> Callable:
 
         # Apply updates to UI components
         for component, value in updates.items():
-
+            try:
+                # Handle different component types appropriately
+                if isinstance(component, gr.Markdown):
+                    # For Markdown components, directly set the value
+                    component.value = value
+                elif hasattr(component, 'update'):
+                    # For components with update method
+                    component.update(value=value)
+                else:
+                    logger.warning(f"Cannot update component of type {type(component)}")
+            except Exception as e:
+                logger.warning(f"Error updating component: {e}")
 
     return callback
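Note on fix 2: `gr.Markdown` instances expose no `update()` method, which produced the warning named in the commit message; the callback now assigns `.value` directly for Markdown and falls back to `update()` for components that have it. The dispatch rule as a standalone sketch (logger setup assumed; the helper name is illustrative):

```python
import logging
import gradio as gr

logger = logging.getLogger(__name__)

def apply_component_updates(updates: dict) -> None:
    """Apply {component: value} updates defensively (illustrative helper).

    gr.Markdown raises AttributeError on .update() — the warning this
    commit silences — so Markdown is handled by assigning .value.
    """
    for component, value in updates.items():
        try:
            if isinstance(component, gr.Markdown):
                component.value = value            # no update() on Markdown
            elif hasattr(component, "update"):
                component.update(value=value)      # components that support it
            else:
                logger.warning("Cannot update component of type %s", type(component))
        except Exception as e:
            logger.warning("Error updating component: %s", e)
```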