Commit bda2b5b
Parent(s): 28a74a6
maintenance and alignment prototype
Files changed:
- .DS_Store +0 -0
- .gitignore +3 -1
- academic_article.md +8 -0
- app.py +74 -6
- pipeline/.DS_Store +0 -0
- pipeline/differential_viz.py +283 -0
- pipeline/fast_lcs.pyx +18 -7
- pipeline/fasttext_embedding.py +23 -6
- pipeline/hf_embedding.py +19 -9
- pipeline/metrics.py +9 -5
- pipeline/process.py +2 -2
- pipeline/structural_analysis.py +241 -0
- pipeline/tibetan_stopwords.py +8 -7
- pipeline/visualize.py +4 -5
- theme.py +12 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitignore
CHANGED
@@ -1,3 +1,5 @@
 venv
 __pycache__
-academic_article.md
+academic_article.md
+#structural_analysis.py
+#differential_viz.py
academic_article.md
CHANGED
@@ -47,6 +47,14 @@ These metrics focus on the vocabulary and key terms within the texts.
 
 **Normalized Longest Common Subsequence (LCS):** This metric moves beyond vocabulary to assess structural parallels. The LCS algorithm finds the longest sequence of words that appears in both texts in the same relative order, though not necessarily contiguously. For example, the LCS of "the brown fox jumps" and "the lazy brown dog jumps" is "the brown jumps". TTM normalizes the length of this subsequence to produce a score that reflects shared phrasing and narrative structure. A high LCS score can indicate direct textual borrowing or a shared structural template. To ensure performance, the LCS calculation is optimized with a custom Cython implementation.
 
+**LCS vs. Levenshtein Distance:** While Levenshtein distance is another string similarity metric that measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into another, LCS is more appropriate for Tibetan text analysis for several reasons:
+
+1. Tibetan manuscripts often share common passages or structural elements: LCS is particularly effective at identifying these shared passages, which may be separated by varying amounts of different text.
+
+2. LCS focuses on meaningful shared content rather than character-level differences: Unlike Levenshtein distance, which is sensitive to every character change, LCS identifies the longest sequence of words in the same order, focusing on substantive content overlap.
+
+3. LCS is less sensitive to minor variations that might occur in handwritten or OCR-processed texts: Tibetan manuscripts often contain variations due to scribal errors, regional differences, or OCR artifacts. LCS can still identify shared structural elements despite these variations, whereas Levenshtein distance might be disproportionately affected by them.
+
 ### 2.4. Semantic Similarity
 
 To capture similarities in meaning that may not be apparent from lexical overlap, TTM employs semantic similarity using word and sentence embeddings.
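For reference, a minimal pure-Python sketch of the word-level LCS the article describes, using the article's own example. The normalization at the end is illustrative only; the excerpt does not state TTM's exact formula.

```python
def lcs_words(words1, words2):
    # Classic dynamic-programming LCS over word lists.
    m, n = len(words1), len(words2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if words1[i - 1] == words2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    # Backtrack to recover the subsequence itself.
    seq, i, j = [], m, n
    while i > 0 and j > 0:
        if words1[i - 1] == words2[j - 1]:
            seq.append(words1[i - 1])
            i, j = i - 1, j - 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(seq))

a = "the brown fox jumps".split()
b = "the lazy brown dog jumps".split()
print(lcs_words(a, b))                       # ['the', 'brown', 'jumps']
print(2 * len(lcs_words(a, b)) / (len(a) + len(b)))  # one possible normalization: ~0.667
```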
app.py
CHANGED
@@ -10,10 +10,10 @@ from datetime import datetime
 from dotenv import load_dotenv
 
 # Load environment variables from .env file
-load_dotenv()
-
 from theme import tibetan_theme
 
+load_dotenv()
+
 logger = logging.getLogger(__name__)
 def main_interface():
 with gr.Blocks(
@@ -67,12 +67,11 @@ def main_interface():
 model_dropdown = gr.Dropdown(
 choices=[
 "sentence-transformers/LaBSE",
-"intfloat/e5-base-v2",
 "Facebook FastText (Pre-trained)"
 ],
 label="Select Embedding Model",
 value="sentence-transformers/LaBSE",
-info="Select the embedding model to use for semantic similarity analysis."
+info="Select the embedding model to use for semantic similarity analysis. LaBSE v1.0 or FastText v1.0."
 )
 
 with gr.Accordion("Advanced Options", open=False):
@@ -208,6 +207,28 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 gr.Markdown("## Detailed Metric Analysis", elem_classes="gr-markdown")
 
 with gr.Tabs(elem_id="heatmap-tab-group"):
+# Structural Analysis Tab
+with gr.Tab("Structural Analysis"):
+with gr.Accordion("Understanding Structural Differences", open=False, elem_classes="structural-analysis-info"):
+gr.Markdown("""
+### Structural Analysis for Legal Manuscripts
+This enhanced analysis provides detailed insights into structural differences between chapters, specifically designed for Tibetan legal manuscript comparison.
+
+**Features:**
+- **Change Detection**: Identifies insertions, deletions, and modifications
+- **Structural Alignment**: Shows how chapters map structurally
+- **Differential Highlighting**: Highlights significant textual variations
+- **Per-Chapter Analysis**: Detailed comparison for each chapter pair
+
+**Usage:**
+Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
+""")
+
+# Structural analysis outputs
+structural_heatmap = gr.Plot(label="Structural Changes Heatmap", show_label=False, elem_classes="structural-heatmap")
+structural_report = gr.HTML(label="Differential Analysis Report")
+structural_export = gr.File(label="Export Structural Analysis Report", file_types=[".html", ".md", ".json"])
+
 # Process all metrics including Word Counts in a unified way
 for metric_key, descriptive_title in heatmap_titles.items():
 with gr.Tab(metric_key):
@@ -247,7 +268,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 if metric_key == "Word Counts":
 word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
 else:
-heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False)
+heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
 
 # The outputs in process_btn.click should use the short metric names as keys for heatmap_tabs
 # e.g., heatmap_tabs["Jaccard Similarity (%)"]
@@ -286,6 +307,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 semantic_heatmap_res = None
 tfidf_heatmap_res = None
 warning_update_res = gr.update(value="", visible=False)  # Default: no warning
+structural_heatmap_res = None
+structural_report_res = None
+structural_export_res = None
 
 """
 Processes uploaded files, computes metrics, generates visualizations, and prepares outputs for the UI.
@@ -428,6 +452,44 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 logger.warning(f"Progress update error (non-critical): {e}")
 word_count_fig_res = generate_word_count_chart(word_counts_df_data)
 
+# Generate structural analysis
+if progress_tracker is not None:
+try:
+progress_tracker(0.92, desc="Generating structural analysis...")
+except Exception as e:
+logger.warning(f"Progress update error (non-critical): {e}")
+
+# Create structural analysis
+from pipeline.differential_viz import create_differential_heatmap, create_change_detection_report
+
+# Create structural heatmap
+try:
+structural_heatmap_res = create_differential_heatmap(
+text_data, "all_chapters", df_results
+)
+except Exception as e:
+logger.warning(f"Could not generate structural heatmap: {e}")
+structural_heatmap_res = None
+
+# Create structural report
+try:
+structural_report_res = create_change_detection_report(
+text_data, "all_chapters", "html"
+)
+except Exception as e:
+logger.warning(f"Could not generate structural report: {e}")
+structural_report_res = "<p>Could not generate structural analysis report.</p>"
+
+# Save structural analysis report
+try:
+report_path = "structural_analysis_report.html"
+with open(report_path, 'w', encoding='utf-8') as f:
+f.write(structural_report_res if isinstance(structural_report_res, str) else "")
+structural_export_res = report_path
+except Exception as e:
+logger.warning(f"Could not save structural report: {e}")
+structural_export_res = None
+
 # Save results to CSV
 if progress_tracker is not None:
 try:
@@ -466,7 +528,10 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 lcs_heatmap_res,
 semantic_heatmap_res,
 tfidf_heatmap_res,
-warning_update_res
+warning_update_res,
+structural_heatmap_res,
+structural_report_res,
+structural_export_res
 )
 
 # Function to interpret results using LLM
@@ -515,6 +580,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 heatmap_tabs["Semantic Similarity"],
 heatmap_tabs["TF-IDF Cosine Sim"],
 warning_box,
+structural_heatmap,
+structural_report,
+structural_export,
 ]
 )
 
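The app.py changes extend both the processing function's return tuple and the click handler's outputs list, which must stay in the same order. A minimal, hypothetical Gradio sketch of that pattern (not the app's actual handler or component set):

```python
import gradio as gr

def analyze(files):
    # Placeholder computation; the real app builds heatmaps and an HTML report.
    return None, "<p>report</p>", None

with gr.Blocks() as demo:
    uploads = gr.File(file_count="multiple")
    structural_heatmap = gr.Plot()
    structural_report = gr.HTML()
    structural_export = gr.File()
    run = gr.Button("Run")
    # The returned tuple lines up positionally with this outputs list,
    # which is why the commit appends the three new results in the same order.
    run.click(analyze, inputs=uploads,
              outputs=[structural_heatmap, structural_report, structural_export])
```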
pipeline/.DS_Store
ADDED
Binary file (6.15 kB)
pipeline/differential_viz.py
ADDED
@@ -0,0 +1,283 @@
"""
Differential visualization enhancements for Tibetan legal manuscript analysis.
Provides enhanced heatmaps with structural change highlighting.
"""

import plotly.graph_objects as go
from typing import Dict, List
import pandas as pd
from plotly.subplots import make_subplots
from .structural_analysis import detect_structural_changes, generate_structural_alignment


def create_differential_heatmap(texts_dict: Dict[str, str],
                                chapter_key: str,
                                metric_results: pd.DataFrame,
                                highlight_threshold: float = 0.7) -> go.Figure:
    """
    Create enhanced heatmap with structural change highlighting.

    Args:
        texts_dict: Dictionary mapping text names to their content
        chapter_key: Chapter identifier being analyzed
        metric_results: DataFrame with similarity metrics
        highlight_threshold: Threshold for highlighting significant changes
    """

    # Get unique text pairs
    text_pairs = metric_results['Text Pair'].unique()

    # Create enhanced heatmap data
    enhanced_data = []

    for pair in text_pairs:
        texts = pair.split(' vs ')
        if len(texts) == 2:
            text1_name, text2_name = texts

            # Get actual text content
            text1_content = texts_dict.get(text1_name, '')
            text2_content = texts_dict.get(text2_name, '')

            # Perform structural analysis
            changes = detect_structural_changes(text1_content, text2_content)
            alignment = generate_structural_alignment(text1_content, text2_content)

            # Create enhanced metrics
            enhanced_row = {
                'Text Pair': pair,
                'Chapter': chapter_key,
                'structural_changes': len(changes['insertions']) + len(changes['deletions']) + len(changes['modifications']),
                'modification_score': len(changes['modifications']),
                'insertion_score': len(changes['insertions']),
                'deletion_score': len(changes['deletions']),
                'alignment_quality': len(alignment['matches']) / max(len(alignment['segments1']) + len(alignment['segments2']), 1),
                'significant_differences': len([c for c in changes['modifications'] if len(c['original']) > 10])
            }

            enhanced_data.append(enhanced_row)

    enhanced_df = pd.DataFrame(enhanced_data)

    # Create subplots for different aspects
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Structural Changes', 'Modifications', 'Insertions/Deletions', 'Alignment Quality'),
        specs=[[{"secondary_y": True}, {"secondary_y": True}],
               [{"secondary_y": True}, {"secondary_y": True}]]
    )

    # Structural changes heatmap
    pivot_changes = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='structural_changes')
    fig.add_trace(
        go.Heatmap(
            z=pivot_changes.values,
            x=pivot_changes.columns,
            y=pivot_changes.index,
            colorscale='Reds',
            name='Structural Changes',
            showscale=True,
            colorbar=dict(title="Changes", x=0.45)
        ),
        row=1, col=1
    )

    # Modifications heatmap
    pivot_mods = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='modification_score')
    fig.add_trace(
        go.Heatmap(
            z=pivot_mods.values,
            x=pivot_mods.columns,
            y=pivot_mods.index,
            colorscale='Blues',
            name='Modifications',
            showscale=True,
            colorbar=dict(title="Mods", x=1.0)
        ),
        row=1, col=2
    )

    # Insertions/Deletions combined heatmap
    pivot_ins_del = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='insertion_score')
    pivot_del = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='deletion_score')
    combined = pivot_ins_del + pivot_del

    fig.add_trace(
        go.Heatmap(
            z=combined.values,
            x=combined.columns,
            y=combined.index,
            colorscale='Greens',
            name='Insertions+Deletions',
            showscale=True,
            colorbar=dict(title="Ins+Del", x=0.45)
        ),
        row=2, col=1
    )

    # Alignment quality heatmap
    pivot_align = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='alignment_quality')
    fig.add_trace(
        go.Heatmap(
            z=pivot_align.values,
            x=pivot_align.columns,
            y=pivot_align.index,
            colorscale='Viridis',
            name='Alignment Quality',
            showscale=True,
            colorbar=dict(title="Quality", x=1.0)
        ),
        row=2, col=2
    )

    fig.update_layout(
        title=f"Structural Analysis - Chapter {chapter_key}",
        height=800,
        showlegend=False
    )

    return fig


def create_change_detection_report(texts_dict: Dict[str, str],
                                   chapter_key: str,
                                   output_format: str = 'html') -> str:
    """
    Create detailed change detection report for a chapter.

    Args:
        texts_dict: Dictionary mapping text names to content
        chapter_key: Chapter identifier
        output_format: Format for output ('html', 'json', 'markdown')
    """

    from .structural_analysis import generate_differential_report

    text_names = list(texts_dict.keys())
    reports = []

    for i, text1_name in enumerate(text_names):
        for text2_name in text_names[i+1:]:
            text1_content = texts_dict[text1_name]
            text2_content = texts_dict[text2_name]

            report = generate_differential_report(
                text1_content, text2_content, text1_name, text2_name
            )
            reports.append(report)

    if output_format == 'html':
        return create_html_report(reports, chapter_key)
    elif output_format == 'json':
        import json
        return json.dumps(reports, indent=2, ensure_ascii=False)
    else:
        return create_markdown_report(reports, chapter_key)


def create_html_report(reports: List[Dict], chapter_key: str) -> str:
    """Create HTML report for structural analysis."""

    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Structural Analysis Report - Chapter {chapter_key}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .report {{ max-width: 1200px; margin: 0 auto; }}
            .comparison {{ border: 1px solid #ddd; margin: 20px 0; padding: 15px; }}
            .changes {{ display: flex; gap: 20px; }}
            .change-type {{ flex: 1; padding: 10px; border: 1px solid #eee; }}
            .insertion {{ background-color: #e8f5e8; }}
            .deletion {{ background-color: #ffe8e8; }}
            .modification {{ background-color: #fff3e0; }}
            .highlight {{ background-color: yellow; padding: 2px 4px; }}
        </style>
    </head>
    <body>
        <div class="report">
            <h1>Structural Analysis Report - Chapter {chapter_key}</h1>
    """

    for report in reports:
        html += f"""
        <div class="comparison">
            <h2>{report['file1']} vs {report['file2']}</h2>
            <div class="scores">
                <p><strong>Structural Similarity:</strong> {report['scores']['structural_similarity']:.2f}</p>
                <p><strong>Alignment Score:</strong> {report['scores']['alignment_score']:.2f}</p>
            </div>

            <div class="changes">
                <div class="change-type insertion">
                    <h3>Insertions ({len(report['changes']['insertions'])})</h3>
                    {format_changes_html(report['changes']['insertions'])}
                </div>
                <div class="change-type deletion">
                    <h3>Deletions ({len(report['changes']['deletions'])})</h3>
                    {format_changes_html(report['changes']['deletions'])}
                </div>
                <div class="change-type modification">
                    <h3>Modifications ({len(report['changes']['modifications'])})</h3>
                    {format_changes_html(report['changes']['modifications'], is_modification=True)}
                </div>
            </div>
        </div>
        """

    html += """
        </div>
    </body>
    </html>
    """

    return html


def format_changes_html(changes: List[Dict], is_modification: bool = False) -> str:
    """Format changes for HTML display."""
    if not changes:
        return "<p>No changes detected.</p>"

    html = ""
    for change in changes[:5]:  # Limit to first 5 for brevity
        if is_modification:
            html += f"""
            <div class="change">
                <span class="highlight">{change.get('original', '')}</span> →
                <span class="highlight">{change.get('replacement', '')}</span>
            </div>
            """
        else:
            html += f"""
            <div class="change">
                <span class="highlight">{change.get('word', '')}</span>
            </div>
            """

    if len(changes) > 5:
        html += f"<p>... and {len(changes) - 5} more</p>"

    return html


def create_markdown_report(reports: List[Dict], chapter_key: str) -> str:
    """Create markdown report for structural analysis."""

    md = f"# Structural Analysis Report - Chapter {chapter_key}\n\n"

    for report in reports:
        md += f"## {report['file1']} vs {report['file2']}\n\n"
        md += f"- **Structural Similarity**: {report['scores']['structural_similarity']:.2f}\n"
        md += f"- **Alignment Score**: {report['scores']['alignment_score']:.2f}\n"
        md += f"- **Insertions**: {len(report['changes']['insertions'])}\n"
        md += f"- **Deletions**: {len(report['changes']['deletions'])}\n"
        md += f"- **Modifications**: {len(report['changes']['modifications'])}\n\n"

        if report['changes']['modifications']:
            md += "### Significant Modifications:\n"
            for mod in report['changes']['modifications'][:3]:
                md += f"- **{mod.get('original', '')}** → **{mod.get('replacement', '')}**\n"

    return md
pipeline/fast_lcs.pyx
CHANGED
@@ -3,9 +3,12 @@ import numpy as np
 cimport cython
 cimport numpy as np
 
+# Use memory views for better performance
+ctypedef np.int32_t DTYPE_t
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
+@cython.cdivision(True)
 def compute_lcs_fast(list words1, list words2):
 """
 Computes the Longest Common Subsequence (LCS) of two lists of words.
@@ -28,18 +31,26 @@ def compute_lcs_fast(list words1, list words2):
 return compute_lcs_fast(words2, words1)
 
 # We only need two rows for the DP table
-cdef np.ndarray[
-cdef np.ndarray[
+cdef np.ndarray[DTYPE_t, ndim=1] prev_row = np.zeros(n + 1, dtype=np.int32)
+cdef np.ndarray[DTYPE_t, ndim=1] curr_row = np.zeros(n + 1, dtype=np.int32)
+
+# Use memory views for better access performance
+cdef DTYPE_t[:] prev_view = prev_row
+cdef DTYPE_t[:] curr_view = curr_row
+
 cdef int i, j
+cdef DTYPE_t val1, val2
 
 for i in range(1, m + 1):
 for j in range(1, n + 1):
 if words1[i - 1] == words2[j - 1]:
-
+curr_view[j] = prev_view[j - 1] + 1
 else:
-
+val1 = prev_view[j]
+val2 = curr_view[j - 1]
+curr_view[j] = val1 if val1 > val2 else val2
 
-#
-
+# Swap views instead of copying for better performance
+prev_view, curr_view = curr_view, prev_view
 
-return int
+return <int>prev_view[n]
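These .pyx changes only take effect once the extension is compiled. The commit does not show the project's build configuration, so the following setup.py is a hypothetical sketch of a typical Cython build for this module; the package name is an assumption.

```python
# Hypothetical build script; the repository's actual build setup is not shown in this commit.
from setuptools import setup
from Cython.Build import cythonize
import numpy as np

setup(
    name="ttm-fast-lcs",  # assumed name, for illustration only
    ext_modules=cythonize("pipeline/fast_lcs.pyx", language_level=3),
    include_dirs=[np.get_include()],  # needed because the module cimports numpy
)
```

Built in place with, for example, `python setup.py build_ext --inplace`.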
pipeline/fasttext_embedding.py
CHANGED
@@ -10,7 +10,7 @@ import logging
 import numpy as np
 import fasttext
 from collections import Counter
-from typing import List,
+from typing import List, Optional, Tuple, Any, Set
 from huggingface_hub import hf_hub_download
 
 # Set up logging
@@ -26,6 +26,11 @@ DEFAULT_MINN = 3
 DEFAULT_MAXN = 6
 DEFAULT_NEG = 5
 
+# Model version information
+MODEL_VERSIONS = {
+    "facebook-fasttext-pretrained": "v1.0",
+}
+
 # Define paths for model storage
 DEFAULT_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
 DEFAULT_MODEL_PATH = str(Path(__file__).resolve().parent.parent / "fasttext-modelling" / "tibetan_cbow_model.bin")  # Updated to custom model
@@ -399,9 +404,12 @@ def get_batch_embeddings(
 return np.array(embeddings)
 
 
-def get_model(model_id: str):
+# Cache for loaded FastText models
+_fasttext_model_cache = {}
+
+def get_model(model_id: str) -> Tuple[Optional[Any], Optional[str]]:
 """
-Loads a FastText model
+Loads a FastText model with version tracking.
 
 Args:
 model_id (str): The identifier for the model to load.
@@ -410,13 +418,22 @@ def get_model(model_id: str):
 Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded model and its type ('fasttext'),
 or (None, None) if loading fails.
 """
-
+# Include version information in cache key
+model_version = MODEL_VERSIONS.get(model_id, "unknown")
+cache_key = f"{model_id}@{model_version}"
+
+if cache_key in _fasttext_model_cache:
+logger.info(f"Returning cached FastText model: {model_id} (version: {model_version})")
+return _fasttext_model_cache[cache_key], "fasttext"
+
+logger.info(f"Attempting to load FastText model: {model_id} (version: {model_version})")
 
 if model_id == "facebook-fasttext-pretrained":
 try:
 model = _load_facebook_official_tibetan_model()
 if model:
-
+_fasttext_model_cache[cache_key] = model
+logger.info(f"FastText model '{model_id}' (version: {model_version}) loaded successfully.")
 return model, "fasttext"
 else:
 logger.error(f"Model loading for '{model_id}' returned None.")
@@ -428,7 +445,7 @@ def get_model(model_id: str):
 # elif model_id == "custom-model-name":
 # ...
 else:
-logger.
+logger.warning(f"Unsupported FastText model ID: {model_id}")
 return None, None
 
 def generate_embeddings(
pipeline/hf_embedding.py
CHANGED
@@ -1,16 +1,22 @@
 import logging
-from typing import List,
+from typing import List, Optional, Tuple
 import numpy as np
 from sentence_transformers import SentenceTransformer
 
 logger = logging.getLogger(__name__)
 
-# Cache for loaded models
+# Cache for loaded models with version information
 _model_cache = {}
 
+# Model version mapping
+MODEL_VERSIONS = {
+    "sentence-transformers/LaBSE": "v1.0",
+    "intfloat/e5-base-v2": "v1.0",
+}
+
 def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[str]]:
 """
-Loads a SentenceTransformer model from the Hugging Face Hub.
+Loads a SentenceTransformer model from the Hugging Face Hub with version tracking.
 
 Args:
 model_id (str): The identifier for the model to load (e.g., 'sentence-transformers/LaBSE').
@@ -19,15 +25,19 @@ def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[st
 Tuple[Optional[SentenceTransformer], Optional[str]]: A tuple containing the loaded model and its type ('sentence-transformer'),
 or (None, None) if loading fails.
 """
-
-
-
+# Include version information in cache key
+model_version = MODEL_VERSIONS.get(model_id, "unknown")
+cache_key = f"{model_id}@{model_version}"
+
+if cache_key in _model_cache:
+logger.info(f"Returning cached model: {model_id} (version: {model_version})")
+return _model_cache[cache_key], "sentence-transformer"
 
-logger.info(f"Loading SentenceTransformer model: {model_id}")
+logger.info(f"Loading SentenceTransformer model: {model_id} (version: {model_version})")
 try:
 model = SentenceTransformer(model_id)
-_model_cache[
-logger.info(f"Model '{model_id}' loaded successfully.")
+_model_cache[cache_key] = model
+logger.info(f"Model '{model_id}' (version: {model_version}) loaded successfully.")
 return model, "sentence-transformer"
 except Exception as e:
 logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
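Both embedding loaders now key their caches on the model ID plus a version string, so bumping a version invalidates stale entries without clearing the whole cache. A small standalone illustration of that pattern (not the module's actual code path):

```python
_cache = {}
MODEL_VERSIONS = {"sentence-transformers/LaBSE": "v1.0"}

def load_cached(model_id: str, loader):
    version = MODEL_VERSIONS.get(model_id, "unknown")
    key = f"{model_id}@{version}"      # bump the version string to force a reload
    if key not in _cache:
        _cache[key] = loader(model_id)  # expensive load happens once per (model, version)
    return _cache[key]
```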
pipeline/metrics.py
CHANGED
@@ -2,11 +2,11 @@ import numpy as np
 import pandas as pd
 from typing import List, Dict, Union
 from itertools import combinations
+
 from sklearn.metrics.pairwise import cosine_similarity
 from .fasttext_embedding import generate_embeddings as generate_fasttext_embeddings
 from .hf_embedding import generate_embeddings as generate_hf_embeddings
 
-from .tokenize import tokenize_texts
 import logging
 from sklearn.feature_extraction.text import TfidfVectorizer
 from .stopwords_bo import TIBETAN_STOPWORDS
@@ -139,8 +139,10 @@ def compute_semantic_similarity(
 return np.nan
 
 # Ensure embeddings are numpy arrays (should be, but defensive)
-if not isinstance(emb1, np.ndarray):
-
+if not isinstance(emb1, np.ndarray):
+emb1 = np.array(emb1)
+if not isinstance(emb2, np.ndarray):
+emb2 = np.array(emb2)
 
 # Handle cases where embeddings are all zeros
 if np.all(emb1 == 0) and np.all(emb2 == 0):
@@ -157,8 +159,10 @@ def compute_semantic_similarity(
 return 0.0
 
 # Ensure embeddings are 2D for cosine_similarity: [1, dim]
-if emb1.ndim == 1:
-
+if emb1.ndim == 1:
+emb1 = emb1.reshape(1, -1)
+if emb2.ndim == 1:
+emb2 = emb2.reshape(1, -1)
 
 similarity_score = cosine_similarity(emb1, emb2)[0][0]
 
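The added reshape guards exist because scikit-learn's cosine_similarity expects 2-D input of shape (n_samples, n_features), so a single 1-D embedding must first become a one-row matrix. A standalone illustration:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

emb1 = np.array([0.1, 0.3, 0.6])  # 1-D vector, shape (3,)
emb2 = np.array([0.2, 0.1, 0.7])

# reshape(1, -1) turns each vector into a (1, dim) matrix; the result is a (1, 1) array.
score = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]
print(round(float(score), 4))
```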
pipeline/process.py
CHANGED
@@ -3,7 +3,6 @@ from typing import Dict, List, Tuple
 from .metrics import compute_all_metrics
 from .fasttext_embedding import get_model as get_fasttext_model
 from .hf_embedding import get_model as get_hf_model
-from .fasttext_embedding import load_fasttext_model
 from .tokenize import tokenize_texts
 import logging
 from itertools import combinations
@@ -90,6 +89,7 @@ def process_texts(
 """
 # Initialize model and model_type variables
 model, model_type = None, None  # st_device removed
+warning = ""
 model_warning = ""
 
 # Update progress if callback provided
@@ -118,7 +118,7 @@ def process_texts(
 else:
 model_warning = f"Model ('{model_name}') failed to load. Semantic similarity will be disabled."
 logger.warning(model_warning)
-warning
+warning = warning + f" {model_warning}" if 'warning' in locals() else model_warning
 enable_semantic = False
 if progress_callback is not None:
 try:
pipeline/structural_analysis.py
ADDED
@@ -0,0 +1,241 @@
"""
Chapter-level structural analysis for Tibetan legal manuscripts.
Provides differential highlighting, change detection, and structural alignment.
"""

import difflib
import re


def detect_structural_changes(text1: str, text2: str,
                              min_change_length: int = 5,
                              context_window: int = 10) -> dict:
    """
    Detect structural changes between two Tibetan text chapters.

    Args:
        text1: First text chapter
        text2: Second text chapter
        min_change_length: Minimum length of change to report
        context_window: Number of characters to include as context

    Returns:
        Dictionary with detected changes: insertions, deletions, modifications
    """

    # Clean texts for comparison
    def clean_text(text):
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        return text

    clean1 = clean_text(text1)
    clean2 = clean_text(text2)

    # Use difflib to detect changes
    differ = difflib.Differ()
    diff = list(differ.compare(clean1.split(), clean2.split()))

    changes = {
        'insertions': [],
        'deletions': [],
        'modifications': [],
        'unchanged': []
    }

    # Track current position in both texts
    pos1 = 0
    pos2 = 0

    for i, line in enumerate(diff):
        if line.startswith(' '):  # Unchanged
            word = line[2:]
            changes['unchanged'].append({
                'word': word,
                'position1': pos1,
                'position2': pos2,
                'length': len(word)
            })
            pos1 += len(word) + 1
            pos2 += len(word) + 1

        elif line.startswith('- '):  # Deletion
            word = line[2:]
            if len(word) >= min_change_length:
                changes['deletions'].append({
                    'word': word,
                    'position': pos1,
                    'length': len(word),
                    'context': get_context(clean1, pos1, context_window)
                })
            pos1 += len(word) + 1

        elif line.startswith('+ '):  # Insertion
            word = line[2:]
            if len(word) >= min_change_length:
                changes['insertions'].append({
                    'word': word,
                    'position': pos2,
                    'length': len(word),
                    'context': get_context(clean2, pos2, context_window)
                })
            pos2 += len(word) + 1

    # Detect modifications (adjacent deletions and insertions)
    modifications = detect_modifications(changes['deletions'], changes['insertions'])
    changes['modifications'] = modifications

    return changes


def get_context(text: str, position: int, window: int) -> str:
    """Get context around a position in text."""
    start = max(0, position - window)
    end = min(len(text), position + window)
    return text[start:end]


def detect_modifications(deletions: list[dict], insertions: list[dict]) -> list[dict]:
    """Detect modifications by pairing nearby deletions and insertions."""
    modifications = []

    for deletion in deletions[:]:  # Copy to avoid modification during iteration
        for insertion in insertions[:]:
            # If deletion and insertion are close (within 5 positions)
            if abs(deletion['position'] - insertion['position']) <= 5:
                modifications.append({
                    'original': deletion['word'],
                    'replacement': insertion['word'],
                    'position': deletion['position'],
                    'deletion_context': deletion['context'],
                    'insertion_context': insertion['context']
                })
                # Remove from original lists to avoid duplicates
                if deletion in deletions:
                    deletions.remove(deletion)
                if insertion in insertions:
                    insertions.remove(insertion)
                break

    return modifications


def generate_structural_alignment(text1: str, text2: str) -> dict[str, list]:
    """
    Generate structural alignment between two text chapters.

    Returns:
        Dictionary with alignment information including gaps and matches
    """

    # Split into sentences or clauses for alignment
    def split_into_segments(text):
        # Split on Tibetan punctuation
        segments = re.split(r'[།༎༏༐༑༔]', text)
        return [seg.strip() for seg in segments if seg.strip()]

    segments1 = split_into_segments(text1)
    segments2 = split_into_segments(text2)

    # Create alignment using sequence matcher
    matcher = difflib.SequenceMatcher(None, segments1, segments2)

    alignment = {
        'matches': [],
        'gaps': [],
        'mismatches': [],
        'segments1': segments1,
        'segments2': segments2
    }

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            alignment['matches'].append({
                'segments1': segments1[i1:i2],
                'segments2': segments2[j1:j2],
                'type': 'match'
            })
        elif tag == 'delete':
            alignment['gaps'].append({
                'segments': segments1[i1:i2],
                'type': 'deletion',
                'position': 'text1'
            })
        elif tag == 'insert':
            alignment['gaps'].append({
                'segments': segments2[j1:j2],
                'type': 'insertion',
                'position': 'text2'
            })
        elif tag == 'replace':
            alignment['mismatches'].append({
                'original': segments1[i1:i2],
                'replacement': segments2[j1:j2],
                'type': 'modification'
            })

    return alignment


def calculate_structural_similarity_score(text1: str, text2: str) -> dict[str, float]:
    """
    Calculate various structural similarity scores between two texts.

    Returns:
        Dictionary with multiple similarity metrics
    """

    changes = detect_structural_changes(text1, text2)
    alignment = generate_structural_alignment(text1, text2)

    # Calculate scores
    total_changes = len(changes['insertions']) + len(changes['deletions']) + len(changes['modifications'])

    # Structural similarity score (inverse of changes)
    text_length = max(len(text1.split()), len(text2.split()))
    structural_score = max(0, 1 - (total_changes / text_length)) if text_length > 0 else 0

    # Alignment-based score
    total_segments = len(alignment['segments1']) + len(alignment['segments2'])
    matches = len(alignment['matches'])
    alignment_score = matches / (total_segments / 2) if total_segments > 0 else 0

    return {
        'structural_similarity': structural_score,
        'alignment_score': alignment_score,
        'insertions': len(changes['insertions']),
        'deletions': len(changes['deletions']),
        'modifications': len(changes['modifications']),
        'total_changes': total_changes
    }


def generate_differential_report(text1: str, text2: str,
                                 file1_name: str = "Text 1",
                                 file2_name: str = "Text 2") -> dict[str, any]:
    """
    Generate a comprehensive differential report for two text chapters.

    Returns:
        Complete report with changes, alignment, and recommendations
    """

    changes = detect_structural_changes(text1, text2)
    alignment = generate_structural_alignment(text1, text2)
    scores = calculate_structural_similarity_score(text1, text2)

    report = {
        'file1': file1_name,
        'file2': file2_name,
        'changes': changes,
        'alignment': alignment,
        'scores': scores,
        'summary': {
            'significant_differences': len([c for c in changes['modifications'] if len(c['original']) > 10 or len(c['replacement']) > 10]),
            'minor_variants': len([c for c in changes['modifications'] if len(c['original']) <= 5 and len(c['replacement']) <= 5]),
            'structural_preservation': scores['alignment_score'] > 0.8,
            'recommendation': 'Manuscripts are structurally similar' if scores['alignment_score'] > 0.7 else 'Significant structural differences detected'
        }
    }

    return report
pipeline/tibetan_stopwords.py
CHANGED
@@ -23,19 +23,20 @@ def get_stopwords(use_lite: bool = False) -> set:
 from .stopwords_bo import STOPWORDS
 stopwords_set = STOPWORDS
 
-
+source_name = module_name.lstrip('.')
+logger.info(f"Successfully loaded {len(stopwords_set)} stopwords from {source_name}.py")
 except ImportError:
 logger.error(
-
-
-
+"Failed to import STOPWORDS from stopwords file. "
+"Ensure the file exists in the 'pipeline' directory, is a Python module (ends in .py), "
+"and is importable (e.g., no syntax errors)."
 )
 except AttributeError:
 logger.error(
-
-
+"Variable 'STOPWORDS' (all caps) not found in stopwords file. "
+"Please ensure the stopword set is defined with this name within the module."
 )
 except Exception as e:
-logger.error(f"An unexpected error occurred while loading stopwords
+logger.error(f"An unexpected error occurred while loading stopwords: {e}")
 
 return stopwords_set
pipeline/visualize.py
CHANGED
@@ -85,11 +85,10 @@ def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict =
 title=plot_title,
 xaxis_title="Text Pair",
 yaxis_title="Chapter",
-autosize=
-
-
-
-margin=dict(l=140, b=80, t=60),
+autosize=True,
+height=600,
+font=dict(size=14),
+margin=dict(l=100, b=100, t=50, r=50),
 )
 fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
 fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")
theme.py
CHANGED
@@ -212,6 +212,18 @@ class TibetanAppTheme(gr.themes.Soft):
 "width": "100% !important",
 },
 
+# Heatmap plot styling - responsive sizing
+".tabs > .tab-content > div[data-testid='tabitem'] > .plotly": {
+    "width": "100% !important",
+    "height": "auto !important",
+},
+
+# Specific heatmap container styling
+".metric-heatmap": {
+    "max-width": "100% !important",
+    "overflow-x": "auto !important",
+},
+
 # LLM Analysis styling
 ".llm-analysis": {
 "background-color": "#f8f9fa !important",
|