daniel-wojahn committed on
Commit
bda2b5b
·
1 Parent(s): 28a74a6

maintenance and alignment prototype

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  venv
2
  __pycache__
3
- academic_article.md
 
 
 
1
  venv
2
  __pycache__
3
+ academic_article.md
4
+ #structural_analysis.py
5
+ #differential_viz.py
academic_article.md CHANGED
@@ -47,6 +47,14 @@ These metrics focus on the vocabulary and key terms within the texts.
47
 
48
  **Normalized Longest Common Subsequence (LCS):** This metric moves beyond vocabulary to assess structural parallels. The LCS algorithm finds the longest sequence of words that appears in both texts in the same relative order, though not necessarily contiguously. For example, the LCS of "the brown fox jumps" and "the lazy brown dog jumps" is "the brown jumps". TTM normalizes the length of this subsequence to produce a score that reflects shared phrasing and narrative structure. A high LCS score can indicate direct textual borrowing or a shared structural template. To ensure performance, the LCS calculation is optimized with a custom Cython implementation.
49
 
 
 
 
 
 
 
 
 
50
  ### 2.4. Semantic Similarity
51
 
52
  To capture similarities in meaning that may not be apparent from lexical overlap, TTM employs semantic similarity using word and sentence embeddings.
 
47
 
48
  **Normalized Longest Common Subsequence (LCS):** This metric moves beyond vocabulary to assess structural parallels. The LCS algorithm finds the longest sequence of words that appears in both texts in the same relative order, though not necessarily contiguously. For example, the LCS of "the brown fox jumps" and "the lazy brown dog jumps" is "the brown jumps". TTM normalizes the length of this subsequence to produce a score that reflects shared phrasing and narrative structure. A high LCS score can indicate direct textual borrowing or a shared structural template. To ensure performance, the LCS calculation is optimized with a custom Cython implementation.
49
 
50
+ **LCS vs. Levenshtein Distance:** Levenshtein distance is another widely used string metric: it counts the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into another. For Tibetan text analysis, however, the word-level LCS used by TTM is more appropriate, for several reasons:
51
+
52
+ 1. Tibetan manuscripts often share common passages or structural elements: LCS is particularly effective at identifying these shared passages, which may be separated by varying amounts of different text.
53
+
54
+ 2. LCS focuses on meaningful shared content rather than character-level differences: Unlike Levenshtein distance, which is sensitive to every character change, LCS identifies the longest sequence of words in the same order, focusing on substantive content overlap.
55
+
56
+ 3. LCS is less sensitive to minor variations that might occur in handwritten or OCR-processed texts: Tibetan manuscripts often contain variations due to scribal errors, regional differences, or OCR artifacts. LCS can still identify shared structural elements despite these variations, whereas Levenshtein distance might be disproportionately affected by them.
57
+
58
  ### 2.4. Semantic Similarity
59
 
60
  To capture similarities in meaning that may not be apparent from lexical overlap, TTM employs semantic similarity using word and sentence embeddings.
app.py CHANGED
@@ -10,10 +10,10 @@ from datetime import datetime
10
  from dotenv import load_dotenv
11
 
12
  # Load environment variables from .env file
13
- load_dotenv()
14
-
15
  from theme import tibetan_theme
16
 
 
 
17
  logger = logging.getLogger(__name__)
18
  def main_interface():
19
  with gr.Blocks(
@@ -67,12 +67,11 @@ def main_interface():
67
  model_dropdown = gr.Dropdown(
68
  choices=[
69
  "sentence-transformers/LaBSE",
70
- "intfloat/e5-base-v2",
71
  "Facebook FastText (Pre-trained)"
72
  ],
73
  label="Select Embedding Model",
74
  value="sentence-transformers/LaBSE",
75
- info="Select the embedding model to use for semantic similarity analysis."
76
  )
77
 
78
  with gr.Accordion("Advanced Options", open=False):
@@ -208,6 +207,28 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
208
  gr.Markdown("## Detailed Metric Analysis", elem_classes="gr-markdown")
209
 
210
  with gr.Tabs(elem_id="heatmap-tab-group"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  # Process all metrics including Word Counts in a unified way
212
  for metric_key, descriptive_title in heatmap_titles.items():
213
  with gr.Tab(metric_key):
@@ -247,7 +268,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
247
  if metric_key == "Word Counts":
248
  word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
249
  else:
250
- heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False)
251
 
252
  # The outputs in process_btn.click should use the short metric names as keys for heatmap_tabs
253
  # e.g., heatmap_tabs["Jaccard Similarity (%)"]
@@ -286,6 +307,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
286
  semantic_heatmap_res = None
287
  tfidf_heatmap_res = None
288
  warning_update_res = gr.update(value="", visible=False) # Default: no warning
 
 
 
289
 
290
  """
291
  Processes uploaded files, computes metrics, generates visualizations, and prepares outputs for the UI.
@@ -428,6 +452,44 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
428
  logger.warning(f"Progress update error (non-critical): {e}")
429
  word_count_fig_res = generate_word_count_chart(word_counts_df_data)
430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  # Save results to CSV
432
  if progress_tracker is not None:
433
  try:
@@ -466,7 +528,10 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
466
  lcs_heatmap_res,
467
  semantic_heatmap_res,
468
  tfidf_heatmap_res,
469
- warning_update_res
 
 
 
470
  )
471
 
472
  # Function to interpret results using LLM
@@ -515,6 +580,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
515
  heatmap_tabs["Semantic Similarity"],
516
  heatmap_tabs["TF-IDF Cosine Sim"],
517
  warning_box,
 
 
 
518
  ]
519
  )
520
 
 
10
  from dotenv import load_dotenv
11
 
12
  # Load environment variables from .env file
 
 
13
  from theme import tibetan_theme
14
 
15
+ load_dotenv()
16
+
17
  logger = logging.getLogger(__name__)
18
  def main_interface():
19
  with gr.Blocks(
 
67
  model_dropdown = gr.Dropdown(
68
  choices=[
69
  "sentence-transformers/LaBSE",
 
70
  "Facebook FastText (Pre-trained)"
71
  ],
72
  label="Select Embedding Model",
73
  value="sentence-transformers/LaBSE",
74
+ info="Select the embedding model to use for semantic similarity analysis. LaBSE v1.0 or FastText v1.0."
75
  )
76
 
77
  with gr.Accordion("Advanced Options", open=False):
 
207
  gr.Markdown("## Detailed Metric Analysis", elem_classes="gr-markdown")
208
 
209
  with gr.Tabs(elem_id="heatmap-tab-group"):
210
+ # Structural Analysis Tab
211
+ with gr.Tab("Structural Analysis"):
212
+ with gr.Accordion("Understanding Structural Differences", open=False, elem_classes="structural-analysis-info"):
213
+ gr.Markdown("""
214
+ ### Structural Analysis for Legal Manuscripts
215
+ This enhanced analysis provides detailed insights into structural differences between chapters, specifically designed for Tibetan legal manuscript comparison.
216
+
217
+ **Features:**
218
+ - **Change Detection**: Identifies insertions, deletions, and modifications
219
+ - **Structural Alignment**: Shows how chapters map structurally
220
+ - **Differential Highlighting**: Highlights significant textual variations
221
+ - **Per-Chapter Analysis**: Detailed comparison for each chapter pair
222
+
223
+ **Usage:**
224
+ Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
225
+ """)
226
+
227
+ # Structural analysis outputs
228
+ structural_heatmap = gr.Plot(label="Structural Changes Heatmap", show_label=False, elem_classes="structural-heatmap")
229
+ structural_report = gr.HTML(label="Differential Analysis Report")
230
+ structural_export = gr.File(label="Export Structural Analysis Report", file_types=[".html", ".md", ".json"])
231
+
232
  # Process all metrics including Word Counts in a unified way
233
  for metric_key, descriptive_title in heatmap_titles.items():
234
  with gr.Tab(metric_key):
 
268
  if metric_key == "Word Counts":
269
  word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
270
  else:
271
+ heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
272
 
273
  # The outputs in process_btn.click should use the short metric names as keys for heatmap_tabs
274
  # e.g., heatmap_tabs["Jaccard Similarity (%)"]
 
307
  semantic_heatmap_res = None
308
  tfidf_heatmap_res = None
309
  warning_update_res = gr.update(value="", visible=False) # Default: no warning
310
+ structural_heatmap_res = None
311
+ structural_report_res = None
312
+ structural_export_res = None
313
 
314
  """
315
  Processes uploaded files, computes metrics, generates visualizations, and prepares outputs for the UI.
 
452
  logger.warning(f"Progress update error (non-critical): {e}")
453
  word_count_fig_res = generate_word_count_chart(word_counts_df_data)
454
 
455
+ # Generate structural analysis
456
+ if progress_tracker is not None:
457
+ try:
458
+ progress_tracker(0.92, desc="Generating structural analysis...")
459
+ except Exception as e:
460
+ logger.warning(f"Progress update error (non-critical): {e}")
461
+
462
+ # Create structural analysis
463
+ from pipeline.differential_viz import create_differential_heatmap, create_change_detection_report
464
+
465
+ # Create structural heatmap
466
+ try:
467
+ structural_heatmap_res = create_differential_heatmap(
468
+ text_data, "all_chapters", df_results
469
+ )
470
+ except Exception as e:
471
+ logger.warning(f"Could not generate structural heatmap: {e}")
472
+ structural_heatmap_res = None
473
+
474
+ # Create structural report
475
+ try:
476
+ structural_report_res = create_change_detection_report(
477
+ text_data, "all_chapters", "html"
478
+ )
479
+ except Exception as e:
480
+ logger.warning(f"Could not generate structural report: {e}")
481
+ structural_report_res = "<p>Could not generate structural analysis report.</p>"
482
+
483
+ # Save structural analysis report
484
+ try:
485
+ report_path = "structural_analysis_report.html"
486
+ with open(report_path, 'w', encoding='utf-8') as f:
487
+ f.write(structural_report_res if isinstance(structural_report_res, str) else "")
488
+ structural_export_res = report_path
489
+ except Exception as e:
490
+ logger.warning(f"Could not save structural report: {e}")
491
+ structural_export_res = None
492
+
493
  # Save results to CSV
494
  if progress_tracker is not None:
495
  try:
 
528
  lcs_heatmap_res,
529
  semantic_heatmap_res,
530
  tfidf_heatmap_res,
531
+ warning_update_res,
532
+ structural_heatmap_res,
533
+ structural_report_res,
534
+ structural_export_res
535
  )
536
 
537
  # Function to interpret results using LLM
 
580
  heatmap_tabs["Semantic Similarity"],
581
  heatmap_tabs["TF-IDF Cosine Sim"],
582
  warning_box,
583
+ structural_heatmap,
584
+ structural_report,
585
+ structural_export,
586
  ]
587
  )
588
 
pipeline/.DS_Store ADDED
Binary file (6.15 kB). View file
 
pipeline/differential_viz.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Differential visualization enhancements for Tibetan legal manuscript analysis.
3
+ Provides enhanced heatmaps with structural change highlighting.
4
+ """
5
+
6
+ import plotly.graph_objects as go
7
+ from typing import Dict, List
8
+ import pandas as pd
9
+ from plotly.subplots import make_subplots
10
+ from .structural_analysis import detect_structural_changes, generate_structural_alignment
11
+
12
+
13
def create_differential_heatmap(texts_dict: Dict[str, str],
                                chapter_key: str,
                                metric_results: pd.DataFrame,
                                highlight_threshold: float = 0.7) -> go.Figure:
    """
    Create a 2x2 grid of heatmaps summarising structural changes per text pair.

    For every distinct value in ``metric_results['Text Pair']`` of the form
    ``"<name1> vs <name2>"``, the two texts are looked up in ``texts_dict``
    (missing names fall back to an empty string), analysed with
    ``detect_structural_changes`` and ``generate_structural_alignment``, and
    the resulting counts are plotted as four heatmaps: total structural
    changes, modifications, insertions + deletions, and alignment quality.

    Args:
        texts_dict: Dictionary mapping text names to their content.
        chapter_key: Chapter identifier; used as the single heatmap row label
            and in the figure title.
        metric_results: DataFrame with a 'Text Pair' column.
        highlight_threshold: Currently unused; kept so existing callers that
            pass it keep working.

    Returns:
        A Plotly figure. If no pair label could be split into exactly two
        names, an empty figure carrying an explanatory annotation is returned
        instead of failing inside ``DataFrame.pivot``.
    """
    rows = []

    for pair in metric_results['Text Pair'].unique():
        names = pair.split(' vs ')
        if len(names) != 2:
            # Label is not of the form "A vs B"; nothing to compare.
            continue
        text1_name, text2_name = names

        text1_content = texts_dict.get(text1_name, '')
        text2_content = texts_dict.get(text2_name, '')

        changes = detect_structural_changes(text1_content, text2_content)
        alignment = generate_structural_alignment(text1_content, text2_content)

        insertion_count = len(changes['insertions'])
        deletion_count = len(changes['deletions'])
        modification_count = len(changes['modifications'])
        # max(..., 1) guards against division by zero when both texts are empty.
        segment_total = max(
            len(alignment['segments1']) + len(alignment['segments2']), 1
        )

        rows.append({
            'Text Pair': pair,
            'Chapter': chapter_key,
            'structural_changes': insertion_count + deletion_count + modification_count,
            'modification_score': modification_count,
            'insertion_score': insertion_count,
            'deletion_score': deletion_count,
            'indel_score': insertion_count + deletion_count,
            'alignment_quality': len(alignment['matches']) / segment_total,
            # .get() mirrors how the report formatters read modification entries.
            'significant_differences': sum(
                1 for c in changes['modifications']
                if len(c.get('original', '')) > 10
            ),
        })

    title = f"Structural Analysis - Chapter {chapter_key}"

    if not rows:
        # Pivoting an empty frame would raise KeyError; return a readable
        # placeholder figure instead.
        empty_fig = go.Figure()
        empty_fig.add_annotation(
            text="No comparable text pairs found.",
            xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False,
        )
        empty_fig.update_layout(title=title, height=800, showlegend=False)
        return empty_fig

    enhanced_df = pd.DataFrame(rows)

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Structural Changes', 'Modifications',
                        'Insertions/Deletions', 'Alignment Quality'),
    )

    # (row, col, value column, trace name, colorscale, colorbar title)
    panels = [
        (1, 1, 'structural_changes', 'Structural Changes', 'Reds', 'Changes'),
        (1, 2, 'modification_score', 'Modifications', 'Blues', 'Mods'),
        (2, 1, 'indel_score', 'Insertions+Deletions', 'Greens', 'Ins+Del'),
        (2, 2, 'alignment_quality', 'Alignment Quality', 'Viridis', 'Quality'),
    ]
    # Colorbar placement: x puts the bar to the right of its column, y/len
    # confine it to its own row so the two bars of a column do not overlap.
    colorbar_x = {1: 0.45, 2: 1.0}
    colorbar_y = {1: 0.79, 2: 0.21}

    for row, col, value_column, trace_name, colorscale, bar_title in panels:
        pivot = enhanced_df.pivot(
            index='Chapter', columns='Text Pair', values=value_column
        )
        fig.add_trace(
            go.Heatmap(
                z=pivot.values,
                x=pivot.columns,
                y=pivot.index,
                colorscale=colorscale,
                name=trace_name,
                showscale=True,
                colorbar=dict(
                    title=bar_title,
                    x=colorbar_x[col],
                    y=colorbar_y[row],
                    len=0.42,
                ),
            ),
            row=row, col=col,
        )

    fig.update_layout(title=title, height=800, showlegend=False)

    return fig
140
+
141
+
142
def create_change_detection_report(texts_dict: Dict[str, str],
                                   chapter_key: str,
                                   output_format: str = 'html') -> str:
    """
    Build a change detection report covering every unordered pair of texts.

    Each pair (in the key order of ``texts_dict``) is passed to
    ``generate_differential_report``; the collected reports are then rendered
    in the requested format.

    Args:
        texts_dict: Dictionary mapping text names to content.
        chapter_key: Chapter identifier forwarded to the HTML/markdown renderers.
        output_format: 'html' or 'json'; any other value produces markdown.

    Returns:
        The rendered report as a string.
    """
    from .structural_analysis import generate_differential_report

    names = list(texts_dict.keys())

    # Pair each text with every text that follows it, so each unordered
    # pair is compared exactly once.
    reports = [
        generate_differential_report(
            texts_dict[first_name], texts_dict[second_name],
            first_name, second_name
        )
        for position, first_name in enumerate(names)
        for second_name in names[position + 1:]
    ]

    if output_format == 'html':
        return create_html_report(reports, chapter_key)

    if output_format == 'json':
        import json
        return json.dumps(reports, indent=2, ensure_ascii=False)

    return create_markdown_report(reports, chapter_key)
176
+
177
+
178
def create_html_report(reports: List[Dict], chapter_key: str) -> str:
    """
    Create an HTML report for structural analysis.

    Args:
        reports: One dict per compared pair. Each is read for the keys
            'file1', 'file2', 'scores' (with 'structural_similarity' and
            'alignment_score') and 'changes' (with 'insertions', 'deletions'
            and 'modifications' lists).
        chapter_key: Chapter identifier shown in the page title and heading.

    Returns:
        A complete HTML document as a string. File names and the chapter key
        are HTML-escaped before being inserted into the markup.
    """
    # Imported under a distinct name so it cannot clash with HTML string locals.
    from html import escape

    safe_chapter = escape(str(chapter_key))

    parts = [f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Structural Analysis Report - Chapter {safe_chapter}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .report {{ max-width: 1200px; margin: 0 auto; }}
            .comparison {{ border: 1px solid #ddd; margin: 20px 0; padding: 15px; }}
            .changes {{ display: flex; gap: 20px; }}
            .change-type {{ flex: 1; padding: 10px; border: 1px solid #eee; }}
            .insertion {{ background-color: #e8f5e8; }}
            .deletion {{ background-color: #ffe8e8; }}
            .modification {{ background-color: #fff3e0; }}
            .highlight {{ background-color: yellow; padding: 2px 4px; }}
        </style>
    </head>
    <body>
        <div class="report">
            <h1>Structural Analysis Report - Chapter {safe_chapter}</h1>
    """]

    for report in reports:
        changes = report['changes']
        scores = report['scores']
        # File names come from user uploads; escape them before embedding.
        file1 = escape(str(report['file1']))
        file2 = escape(str(report['file2']))

        parts.append(f"""
            <div class="comparison">
                <h2>{file1} vs {file2}</h2>
                <div class="scores">
                    <p><strong>Structural Similarity:</strong> {scores['structural_similarity']:.2f}</p>
                    <p><strong>Alignment Score:</strong> {scores['alignment_score']:.2f}</p>
                </div>

                <div class="changes">
                    <div class="change-type insertion">
                        <h3>Insertions ({len(changes['insertions'])})</h3>
                        {format_changes_html(changes['insertions'])}
                    </div>
                    <div class="change-type deletion">
                        <h3>Deletions ({len(changes['deletions'])})</h3>
                        {format_changes_html(changes['deletions'])}
                    </div>
                    <div class="change-type modification">
                        <h3>Modifications ({len(changes['modifications'])})</h3>
                        {format_changes_html(changes['modifications'], is_modification=True)}
                    </div>
                </div>
            </div>
        """)

    parts.append("""
        </div>
    </body>
    </html>
    """)

    return "".join(parts)
236
+
237
+
238
def format_changes_html(changes: List[Dict], is_modification: bool = False) -> str:
    """
    Format a list of change records as HTML snippets.

    Args:
        changes: Change dicts. Modifications are read via the 'original' and
            'replacement' keys, other changes via the 'word' key; a missing
            key renders as an empty string.
        is_modification: When True, render each entry as "original → replacement".

    Returns:
        HTML for at most the first five changes, followed by a
        "... and N more" line when entries were omitted, or a
        "No changes detected." paragraph for an empty list. All change text
        is HTML-escaped.
    """
    # Imported under a distinct name so it cannot clash with HTML string locals.
    from html import escape

    if not changes:
        return "<p>No changes detected.</p>"

    max_shown = 5  # Limit the listing for brevity.
    parts = []

    for change in changes[:max_shown]:
        if is_modification:
            # Manuscript text is user-supplied; escape it before embedding.
            original = escape(str(change.get('original', '')))
            replacement = escape(str(change.get('replacement', '')))
            parts.append(f"""
            <div class="change">
                <span class="highlight">{original}</span> →
                <span class="highlight">{replacement}</span>
            </div>
            """)
        else:
            word = escape(str(change.get('word', '')))
            parts.append(f"""
            <div class="change">
                <span class="highlight">{word}</span>
            </div>
            """)

    omitted = len(changes) - max_shown
    if omitted > 0:
        parts.append(f"<p>... and {omitted} more</p>")

    return "".join(parts)
263
+
264
+
265
def create_markdown_report(reports: List[Dict], chapter_key: str) -> str:
    """
    Create a markdown report for structural analysis.

    Args:
        reports: One dict per compared pair. Each is read for the keys
            'file1', 'file2', 'scores' (with 'structural_similarity' and
            'alignment_score') and 'changes' (with 'insertions', 'deletions'
            and 'modifications' lists).
        chapter_key: Chapter identifier shown in the top-level heading.

    Returns:
        The markdown text. For each pair it lists the two scores, the three
        change counts and up to three modifications; every section ends with
        a blank line so the following heading is cleanly separated.
    """
    max_listed_modifications = 3

    lines = [f"# Structural Analysis Report - Chapter {chapter_key}\n\n"]

    for report in reports:
        scores = report['scores']
        changes = report['changes']
        modifications = changes['modifications']

        lines.append(f"## {report['file1']} vs {report['file2']}\n\n")
        lines.append(f"- **Structural Similarity**: {scores['structural_similarity']:.2f}\n")
        lines.append(f"- **Alignment Score**: {scores['alignment_score']:.2f}\n")
        lines.append(f"- **Insertions**: {len(changes['insertions'])}\n")
        lines.append(f"- **Deletions**: {len(changes['deletions'])}\n")
        lines.append(f"- **Modifications**: {len(modifications)}\n\n")

        if modifications:
            lines.append("### Significant Modifications:\n")
            lines.extend(
                f"- **{mod.get('original', '')}** → **{mod.get('replacement', '')}**\n"
                for mod in modifications[:max_listed_modifications]
            )
            # Blank line so the next pair's heading does not abut the list.
            lines.append("\n")

    return "".join(lines)
pipeline/fast_lcs.pyx CHANGED
@@ -3,9 +3,12 @@ import numpy as np
3
  cimport cython
4
  cimport numpy as np
5
 
 
 
6
 
7
  @cython.boundscheck(False)
8
  @cython.wraparound(False)
 
9
  def compute_lcs_fast(list words1, list words2):
10
  """
11
  Computes the Longest Common Subsequence (LCS) of two lists of words.
@@ -28,18 +31,26 @@ def compute_lcs_fast(list words1, list words2):
28
  return compute_lcs_fast(words2, words1)
29
 
30
  # We only need two rows for the DP table
31
- cdef np.ndarray[np.int32_t, ndim=1] prev_row = np.zeros(n + 1, dtype=np.int32)
32
- cdef np.ndarray[np.int32_t, ndim=1] curr_row = np.zeros(n + 1, dtype=np.int32)
 
 
 
 
 
33
  cdef int i, j
 
34
 
35
  for i in range(1, m + 1):
36
  for j in range(1, n + 1):
37
  if words1[i - 1] == words2[j - 1]:
38
- curr_row[j] = prev_row[j - 1] + 1
39
  else:
40
- curr_row[j] = max(prev_row[j], curr_row[j - 1])
 
 
41
 
42
- # Copy current row to previous row for the next iteration
43
- prev_row = curr_row.copy()
44
 
45
- return int(prev_row[n])
 
3
  cimport cython
4
  cimport numpy as np
5
 
6
+ # Use memory views for better performance
7
+ ctypedef np.int32_t DTYPE_t
8
 
9
  @cython.boundscheck(False)
10
  @cython.wraparound(False)
11
+ @cython.cdivision(True)
12
  def compute_lcs_fast(list words1, list words2):
13
  """
14
  Computes the Longest Common Subsequence (LCS) of two lists of words.
 
31
  return compute_lcs_fast(words2, words1)
32
 
33
  # We only need two rows for the DP table
34
+ cdef np.ndarray[DTYPE_t, ndim=1] prev_row = np.zeros(n + 1, dtype=np.int32)
35
+ cdef np.ndarray[DTYPE_t, ndim=1] curr_row = np.zeros(n + 1, dtype=np.int32)
36
+
37
+ # Use memory views for better access performance
38
+ cdef DTYPE_t[:] prev_view = prev_row
39
+ cdef DTYPE_t[:] curr_view = curr_row
40
+
41
  cdef int i, j
42
+ cdef DTYPE_t val1, val2
43
 
44
  for i in range(1, m + 1):
45
  for j in range(1, n + 1):
46
  if words1[i - 1] == words2[j - 1]:
47
+ curr_view[j] = prev_view[j - 1] + 1
48
  else:
49
+ val1 = prev_view[j]
50
+ val2 = curr_view[j - 1]
51
+ curr_view[j] = val1 if val1 > val2 else val2
52
 
53
+ # Swap views instead of copying for better performance
54
+ prev_view, curr_view = curr_view, prev_view
55
 
56
+ return <int>prev_view[n]
pipeline/fasttext_embedding.py CHANGED
@@ -10,7 +10,7 @@ import logging
10
  import numpy as np
11
  import fasttext
12
  from collections import Counter
13
- from typing import List, Set, Optional
14
  from huggingface_hub import hf_hub_download
15
 
16
  # Set up logging
@@ -26,6 +26,11 @@ DEFAULT_MINN = 3
26
  DEFAULT_MAXN = 6
27
  DEFAULT_NEG = 5
28
 
 
 
 
 
 
29
  # Define paths for model storage
30
  DEFAULT_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
31
  DEFAULT_MODEL_PATH = str(Path(__file__).resolve().parent.parent / "fasttext-modelling" / "tibetan_cbow_model.bin") # Updated to custom model
@@ -399,9 +404,12 @@ def get_batch_embeddings(
399
  return np.array(embeddings)
400
 
401
 
402
- def get_model(model_id: str):
 
 
 
403
  """
404
- Loads a FastText model based on the provided model ID.
405
 
406
  Args:
407
  model_id (str): The identifier for the model to load.
@@ -410,13 +418,22 @@ def get_model(model_id: str):
410
  Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded model and its type ('fasttext'),
411
  or (None, None) if loading fails.
412
  """
413
- logger.info(f"Attempting to load FastText model: {model_id}")
 
 
 
 
 
 
 
 
414
 
415
  if model_id == "facebook-fasttext-pretrained":
416
  try:
417
  model = _load_facebook_official_tibetan_model()
418
  if model:
419
- logger.info(f"FastText model '{model_id}' loaded successfully.")
 
420
  return model, "fasttext"
421
  else:
422
  logger.error(f"Model loading for '{model_id}' returned None.")
@@ -428,7 +445,7 @@ def get_model(model_id: str):
428
  # elif model_id == "custom-model-name":
429
  # ...
430
  else:
431
- logger.error(f"Unsupported model_id for get_model: '{model_id}'.")
432
  return None, None
433
 
434
  def generate_embeddings(
 
10
  import numpy as np
11
  import fasttext
12
  from collections import Counter
13
+ from typing import List, Optional, Tuple, Any, Set
14
  from huggingface_hub import hf_hub_download
15
 
16
  # Set up logging
 
26
  DEFAULT_MAXN = 6
27
  DEFAULT_NEG = 5
28
 
29
+ # Model version information
30
+ MODEL_VERSIONS = {
31
+ "facebook-fasttext-pretrained": "v1.0",
32
+ }
33
+
34
  # Define paths for model storage
35
  DEFAULT_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
36
  DEFAULT_MODEL_PATH = str(Path(__file__).resolve().parent.parent / "fasttext-modelling" / "tibetan_cbow_model.bin") # Updated to custom model
 
404
  return np.array(embeddings)
405
 
406
 
407
+ # Cache for loaded FastText models
408
+ _fasttext_model_cache = {}
409
+
410
+ def get_model(model_id: str) -> Tuple[Optional[Any], Optional[str]]:
411
  """
412
+ Loads a FastText model with version tracking.
413
 
414
  Args:
415
  model_id (str): The identifier for the model to load.
 
418
  Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded model and its type ('fasttext'),
419
  or (None, None) if loading fails.
420
  """
421
+ # Include version information in cache key
422
+ model_version = MODEL_VERSIONS.get(model_id, "unknown")
423
+ cache_key = f"{model_id}@{model_version}"
424
+
425
+ if cache_key in _fasttext_model_cache:
426
+ logger.info(f"Returning cached FastText model: {model_id} (version: {model_version})")
427
+ return _fasttext_model_cache[cache_key], "fasttext"
428
+
429
+ logger.info(f"Attempting to load FastText model: {model_id} (version: {model_version})")
430
 
431
  if model_id == "facebook-fasttext-pretrained":
432
  try:
433
  model = _load_facebook_official_tibetan_model()
434
  if model:
435
+ _fasttext_model_cache[cache_key] = model
436
+ logger.info(f"FastText model '{model_id}' (version: {model_version}) loaded successfully.")
437
  return model, "fasttext"
438
  else:
439
  logger.error(f"Model loading for '{model_id}' returned None.")
 
445
  # elif model_id == "custom-model-name":
446
  # ...
447
  else:
448
+ logger.warning(f"Unsupported FastText model ID: {model_id}")
449
  return None, None
450
 
451
  def generate_embeddings(
pipeline/hf_embedding.py CHANGED
@@ -1,16 +1,22 @@
1
  import logging
2
- from typing import List, Any, Optional, Tuple
3
  import numpy as np
4
  from sentence_transformers import SentenceTransformer
5
 
6
  logger = logging.getLogger(__name__)
7
 
8
- # Cache for loaded models
9
  _model_cache = {}
10
 
 
 
 
 
 
 
11
  def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[str]]:
12
  """
13
- Loads a SentenceTransformer model from the Hugging Face Hub.
14
 
15
  Args:
16
  model_id (str): The identifier for the model to load (e.g., 'sentence-transformers/LaBSE').
@@ -19,15 +25,19 @@ def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[st
19
  Tuple[Optional[SentenceTransformer], Optional[str]]: A tuple containing the loaded model and its type ('sentence-transformer'),
20
  or (None, None) if loading fails.
21
  """
22
- if model_id in _model_cache:
23
- logger.info(f"Returning cached model: {model_id}")
24
- return _model_cache[model_id], "sentence-transformer"
 
 
 
 
25
 
26
- logger.info(f"Loading SentenceTransformer model: {model_id}")
27
  try:
28
  model = SentenceTransformer(model_id)
29
- _model_cache[model_id] = model
30
- logger.info(f"Model '{model_id}' loaded successfully.")
31
  return model, "sentence-transformer"
32
  except Exception as e:
33
  logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
 
1
  import logging
2
+ from typing import List, Optional, Tuple
3
  import numpy as np
4
  from sentence_transformers import SentenceTransformer
5
 
6
  logger = logging.getLogger(__name__)
7
 
8
+ # Cache for loaded models with version information
9
  _model_cache = {}
10
 
11
+ # Model version mapping
12
+ MODEL_VERSIONS = {
13
+ "sentence-transformers/LaBSE": "v1.0",
14
+ "intfloat/e5-base-v2": "v1.0",
15
+ }
16
+
17
  def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[str]]:
18
  """
19
+ Loads a SentenceTransformer model from the Hugging Face Hub with version tracking.
20
 
21
  Args:
22
  model_id (str): The identifier for the model to load (e.g., 'sentence-transformers/LaBSE').
 
25
  Tuple[Optional[SentenceTransformer], Optional[str]]: A tuple containing the loaded model and its type ('sentence-transformer'),
26
  or (None, None) if loading fails.
27
  """
28
+ # Include version information in cache key
29
+ model_version = MODEL_VERSIONS.get(model_id, "unknown")
30
+ cache_key = f"{model_id}@{model_version}"
31
+
32
+ if cache_key in _model_cache:
33
+ logger.info(f"Returning cached model: {model_id} (version: {model_version})")
34
+ return _model_cache[cache_key], "sentence-transformer"
35
 
36
+ logger.info(f"Loading SentenceTransformer model: {model_id} (version: {model_version})")
37
  try:
38
  model = SentenceTransformer(model_id)
39
+ _model_cache[cache_key] = model
40
+ logger.info(f"Model '{model_id}' (version: {model_version}) loaded successfully.")
41
  return model, "sentence-transformer"
42
  except Exception as e:
43
  logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
pipeline/metrics.py CHANGED
@@ -2,11 +2,11 @@ import numpy as np
2
  import pandas as pd
3
  from typing import List, Dict, Union
4
  from itertools import combinations
 
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from .fasttext_embedding import generate_embeddings as generate_fasttext_embeddings
7
  from .hf_embedding import generate_embeddings as generate_hf_embeddings
8
 
9
- from .tokenize import tokenize_texts
10
  import logging
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from .stopwords_bo import TIBETAN_STOPWORDS
@@ -139,8 +139,10 @@ def compute_semantic_similarity(
139
  return np.nan
140
 
141
  # Ensure embeddings are numpy arrays (should be, but defensive)
142
- if not isinstance(emb1, np.ndarray): emb1 = np.array(emb1)
143
- if not isinstance(emb2, np.ndarray): emb2 = np.array(emb2)
 
 
144
 
145
  # Handle cases where embeddings are all zeros
146
  if np.all(emb1 == 0) and np.all(emb2 == 0):
@@ -157,8 +159,10 @@ def compute_semantic_similarity(
157
  return 0.0
158
 
159
  # Ensure embeddings are 2D for cosine_similarity: [1, dim]
160
- if emb1.ndim == 1: emb1 = emb1.reshape(1, -1)
161
- if emb2.ndim == 1: emb2 = emb2.reshape(1, -1)
 
 
162
 
163
  similarity_score = cosine_similarity(emb1, emb2)[0][0]
164
 
 
2
  import pandas as pd
3
  from typing import List, Dict, Union
4
  from itertools import combinations
5
+
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  from .fasttext_embedding import generate_embeddings as generate_fasttext_embeddings
8
  from .hf_embedding import generate_embeddings as generate_hf_embeddings
9
 
 
10
  import logging
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from .stopwords_bo import TIBETAN_STOPWORDS
 
139
  return np.nan
140
 
141
  # Ensure embeddings are numpy arrays (should be, but defensive)
142
+ if not isinstance(emb1, np.ndarray):
143
+ emb1 = np.array(emb1)
144
+ if not isinstance(emb2, np.ndarray):
145
+ emb2 = np.array(emb2)
146
 
147
  # Handle cases where embeddings are all zeros
148
  if np.all(emb1 == 0) and np.all(emb2 == 0):
 
159
  return 0.0
160
 
161
  # Ensure embeddings are 2D for cosine_similarity: [1, dim]
162
+ if emb1.ndim == 1:
163
+ emb1 = emb1.reshape(1, -1)
164
+ if emb2.ndim == 1:
165
+ emb2 = emb2.reshape(1, -1)
166
 
167
  similarity_score = cosine_similarity(emb1, emb2)[0][0]
168
 
pipeline/process.py CHANGED
@@ -3,7 +3,6 @@ from typing import Dict, List, Tuple
3
  from .metrics import compute_all_metrics
4
  from .fasttext_embedding import get_model as get_fasttext_model
5
  from .hf_embedding import get_model as get_hf_model
6
- from .fasttext_embedding import load_fasttext_model
7
  from .tokenize import tokenize_texts
8
  import logging
9
  from itertools import combinations
@@ -90,6 +89,7 @@ def process_texts(
90
  """
91
  # Initialize model and model_type variables
92
  model, model_type = None, None # st_device removed
 
93
  model_warning = ""
94
 
95
  # Update progress if callback provided
@@ -118,7 +118,7 @@ def process_texts(
118
  else:
119
  model_warning = f"Model ('{model_name}') failed to load. Semantic similarity will be disabled."
120
  logger.warning(model_warning)
121
- warning += f" {model_warning}"
122
  enable_semantic = False
123
  if progress_callback is not None:
124
  try:
 
3
  from .metrics import compute_all_metrics
4
  from .fasttext_embedding import get_model as get_fasttext_model
5
  from .hf_embedding import get_model as get_hf_model
 
6
  from .tokenize import tokenize_texts
7
  import logging
8
  from itertools import combinations
 
89
  """
90
  # Initialize model and model_type variables
91
  model, model_type = None, None # st_device removed
92
+ warning = ""
93
  model_warning = ""
94
 
95
  # Update progress if callback provided
 
118
  else:
119
  model_warning = f"Model ('{model_name}') failed to load. Semantic similarity will be disabled."
120
  logger.warning(model_warning)
121
+ warning = warning + f" {model_warning}" if 'warning' in locals() else model_warning
122
  enable_semantic = False
123
  if progress_callback is not None:
124
  try:
pipeline/structural_analysis.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chapter-level structural analysis for Tibetan legal manuscripts.
3
+ Provides differential highlighting, change detection, and structural alignment.
4
+ """
5
+
6
+ import difflib
7
+ import re
8
+
9
+
10
def detect_structural_changes(text1: str, text2: str,
                              min_change_length: int = 5,
                              context_window: int = 10) -> dict:
    """
    Diff two Tibetan text chapters word by word and classify the differences.

    Both inputs are stripped and have every whitespace run collapsed to a
    single space; the resulting space-separated words are compared with
    ``difflib.Differ``.

    Args:
        text1: First text chapter
        text2: Second text chapter
        min_change_length: Shortest word (in characters) that is reported as
            an insertion or deletion; shorter changed words are skipped but
            still advance the position counters
        context_window: Number of characters passed to ``get_context`` as the
            context radius around each reported change

    Returns:
        Dictionary with the keys 'insertions', 'deletions', 'modifications'
        and 'unchanged'. Unchanged entries carry 'word', 'position1',
        'position2' and 'length'; insertions and deletions carry 'word',
        'position', 'length' and 'context'. 'modifications' holds whatever
        ``detect_modifications`` returns for the collected deletions and
        insertions.
    """
    # Normalise whitespace so that "word length + 1 separator" is an exact
    # character offset into the cleaned strings.
    normalised1 = re.sub(r'\s+', ' ', text1.strip())
    normalised2 = re.sub(r'\s+', ' ', text2.strip())

    unchanged = []
    deletions = []
    insertions = []

    # Character offsets of the current word inside each cleaned text.
    offset1 = 0
    offset2 = 0

    word_diff = difflib.Differ().compare(normalised1.split(), normalised2.split())
    for entry in word_diff:
        token = entry[2:]
        step = len(token) + 1

        if entry.startswith(' '):  # present in both texts
            unchanged.append({
                'word': token,
                'position1': offset1,
                'position2': offset2,
                'length': len(token)
            })
            offset1 += step
            offset2 += step
        elif entry.startswith('- '):  # only in text1
            if len(token) >= min_change_length:
                deletions.append({
                    'word': token,
                    'position': offset1,
                    'length': len(token),
                    'context': get_context(normalised1, offset1, context_window)
                })
            offset1 += step
        elif entry.startswith('+ '):  # only in text2
            if len(token) >= min_change_length:
                insertions.append({
                    'word': token,
                    'position': offset2,
                    'length': len(token),
                    'context': get_context(normalised2, offset2, context_window)
                })
            offset2 += step
        # Any other prefix (Differ's '? ' hint lines) is ignored.

    changes = {
        'insertions': insertions,
        'deletions': deletions,
        'modifications': [],
        'unchanged': unchanged
    }

    # Pair up nearby deletions and insertions; the helper receives the very
    # lists stored in ``changes``.
    changes['modifications'] = detect_modifications(changes['deletions'], changes['insertions'])

    return changes
89
+
90
+
91
def get_context(text: str, position: int, window: int) -> str:
    """Return the slice of ``text`` reaching ``window`` characters either side of ``position``.

    The lower bound is clamped to 0 and the upper bound to ``len(text)``.
    """
    lower = position - window
    if lower < 0:
        lower = 0

    upper = position + window
    if upper > len(text):
        upper = len(text)

    return text[lower:upper]
96
+
97
+
98
def detect_modifications(deletions: list[dict], insertions: list[dict],
                         max_distance: int = 5) -> list[dict]:
    """Detect modifications by pairing nearby deletions and insertions.

    Deletions are visited in order; each one is paired with the first
    remaining insertion whose 'position' differs from the deletion's
    'position' by at most ``max_distance``.

    NOTE: both input lists are modified in place. Every paired entry is
    removed from ``deletions`` / ``insertions`` so that it is not reported
    twice; unpaired entries keep their original order.

    Args:
        deletions: Dicts with at least the keys 'word', 'position' and 'context'
        insertions: Dicts with at least the keys 'word', 'position' and 'context'
        max_distance: Largest allowed absolute difference between the two
            positions for a pair to count as a modification (default 5,
            the value that was previously hard-coded)

    Returns:
        List of dicts with the keys 'original', 'replacement', 'position'
        (taken from the deletion), 'deletion_context' and 'insertion_context'.
    """
    modifications = []
    unpaired_deletions = []

    for deletion in deletions:
        # Index of the first still-available insertion that is close enough.
        partner_index = next(
            (
                index
                for index, insertion in enumerate(insertions)
                if abs(deletion['position'] - insertion['position']) <= max_distance
            ),
            None,
        )
        if partner_index is None:
            unpaired_deletions.append(deletion)
            continue

        # Popping by index consumes the insertion without an equality scan.
        insertion = insertions.pop(partner_index)
        modifications.append({
            'original': deletion['word'],
            'replacement': insertion['word'],
            'position': deletion['position'],
            'deletion_context': deletion['context'],
            'insertion_context': insertion['context']
        })

    # Slice assignment keeps the caller's list object, leaving only the
    # deletions that found no partner.
    deletions[:] = unpaired_deletions

    return modifications
121
+
122
+
123
def generate_structural_alignment(text1: str, text2: str) -> dict[str, list]:
    """
    Align two text chapters segment by segment.

    Each text is split on Tibetan punctuation marks, the pieces are stripped
    and empty ones discarded, and the two segment lists are compared with
    ``difflib.SequenceMatcher``.

    Returns:
        Dictionary with the keys 'matches' (equal runs), 'gaps' (segments
        present in only one text, tagged 'deletion'/'text1' or
        'insertion'/'text2'), 'mismatches' (replaced runs), plus the full
        segment lists under 'segments1' and 'segments2'.
    """
    # Tibetan punctuation marks used as segment boundaries.
    boundary = re.compile(r'[།༎༏༐༑༔]')

    segments1 = [piece.strip() for piece in boundary.split(text1) if piece.strip()]
    segments2 = [piece.strip() for piece in boundary.split(text2) if piece.strip()]

    matches = []
    gaps = []
    mismatches = []

    opcodes = difflib.SequenceMatcher(None, segments1, segments2).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        left = segments1[i1:i2]
        right = segments2[j1:j2]

        if tag == 'equal':
            matches.append({
                'segments1': left,
                'segments2': right,
                'type': 'match'
            })
        elif tag == 'replace':
            mismatches.append({
                'original': left,
                'replacement': right,
                'type': 'modification'
            })
        elif tag == 'delete':
            gaps.append({
                'segments': left,
                'type': 'deletion',
                'position': 'text1'
            })
        elif tag == 'insert':
            gaps.append({
                'segments': right,
                'type': 'insertion',
                'position': 'text2'
            })

    return {
        'matches': matches,
        'gaps': gaps,
        'mismatches': mismatches,
        'segments1': segments1,
        'segments2': segments2
    }
178
+
179
+
180
def calculate_structural_similarity_score(text1: str, text2: str) -> dict[str, float]:
    """
    Calculate various structural similarity scores between two texts.

    Combines the word-level change counts from ``detect_structural_changes``
    with the segment-level alignment from ``generate_structural_alignment``.

    Returns:
        Dictionary with the keys:
            'structural_similarity': 1 minus (total changes / word count of
                the longer text), floored at 0; 0 when both texts are empty
            'alignment_score': matched segments divided by the mean segment
                count of the two texts; 0 when there are no segments
            'insertions', 'deletions', 'modifications': the respective counts
            'total_changes': sum of the three counts
    """

    changes = detect_structural_changes(text1, text2)
    alignment = generate_structural_alignment(text1, text2)

    insertion_count = len(changes['insertions'])
    deletion_count = len(changes['deletions'])
    modification_count = len(changes['modifications'])
    total_changes = insertion_count + deletion_count + modification_count

    # Structural similarity score (inverse of changes), relative to the
    # whitespace-separated word count of the longer text.
    text_length = max(len(text1.split()), len(text2.split()))
    structural_score = max(0, 1 - (total_changes / text_length)) if text_length > 0 else 0

    # Alignment-based score. Each entry in alignment['matches'] is one *run*
    # of equal segments, so the run lengths are summed; counting the runs
    # themselves would give two identical N-segment texts a score of 1/N
    # instead of 1.0.
    total_segments = len(alignment['segments1']) + len(alignment['segments2'])
    matched_segments = sum(len(block['segments1']) for block in alignment['matches'])
    alignment_score = matched_segments / (total_segments / 2) if total_segments > 0 else 0

    return {
        'structural_similarity': structural_score,
        'alignment_score': alignment_score,
        'insertions': insertion_count,
        'deletions': deletion_count,
        'modifications': modification_count,
        'total_changes': total_changes
    }
211
+
212
+
213
def generate_differential_report(text1: str, text2: str,
                                 file1_name: str = "Text 1",
                                 file2_name: str = "Text 2") -> dict[str, object]:
    """
    Generate a comprehensive differential report for two text chapters.

    Args:
        text1: First text chapter
        text2: Second text chapter
        file1_name: Label stored under 'file1' in the report
        file2_name: Label stored under 'file2' in the report

    Returns:
        Dictionary with the keys 'file1', 'file2', 'changes' (from
        ``detect_structural_changes``), 'alignment' (from
        ``generate_structural_alignment``), 'scores' (from
        ``calculate_structural_similarity_score``) and 'summary'. The summary
        counts modifications where either side is longer than 10 characters
        ('significant_differences') or both sides are at most 5 characters
        ('minor_variants'), flags 'structural_preservation' when the
        alignment score exceeds 0.8, and gives a textual 'recommendation'
        based on an alignment score threshold of 0.7.
    """

    changes = detect_structural_changes(text1, text2)
    alignment = generate_structural_alignment(text1, text2)
    # NOTE: the scoring helper derives its own changes/alignment from the raw
    # texts, so both analyses run a second time here.
    scores = calculate_structural_similarity_score(text1, text2)

    modifications = changes['modifications']
    significant_differences = sum(
        1 for mod in modifications
        if len(mod['original']) > 10 or len(mod['replacement']) > 10
    )
    minor_variants = sum(
        1 for mod in modifications
        if len(mod['original']) <= 5 and len(mod['replacement']) <= 5
    )

    if scores['alignment_score'] > 0.7:
        recommendation = 'Manuscripts are structurally similar'
    else:
        recommendation = 'Significant structural differences detected'

    report = {
        'file1': file1_name,
        'file2': file2_name,
        'changes': changes,
        'alignment': alignment,
        'scores': scores,
        'summary': {
            'significant_differences': significant_differences,
            'minor_variants': minor_variants,
            'structural_preservation': scores['alignment_score'] > 0.8,
            'recommendation': recommendation
        }
    }

    return report
pipeline/tibetan_stopwords.py CHANGED
@@ -23,19 +23,20 @@ def get_stopwords(use_lite: bool = False) -> set:
23
  from .stopwords_bo import STOPWORDS
24
  stopwords_set = STOPWORDS
25
 
26
- logger.info(f"Successfully loaded {len(stopwords_set)} stopwords from {source_module_name.lstrip('.')}.py")
 
27
  except ImportError:
28
  logger.error(
29
- f"Failed to import STOPWORDS from {source_module_name.lstrip('.')}.py. "
30
- f"Ensure the file exists in the 'pipeline' directory, is a Python module (ends in .py), "
31
- f"and is importable (e.g., no syntax errors)."
32
  )
33
  except AttributeError:
34
  logger.error(
35
- f"Variable 'STOPWORDS' (all caps) not found in {source_module_name.lstrip('.')}.py. "
36
- f"Please ensure the stopword set is defined with this name within the module."
37
  )
38
  except Exception as e:
39
- logger.error(f"An unexpected error occurred while loading stopwords from {source_module_name.lstrip('.')}.py: {e}")
40
 
41
  return stopwords_set
 
23
  from .stopwords_bo import STOPWORDS
24
  stopwords_set = STOPWORDS
25
 
26
+ source_name = module_name.lstrip('.')
27
+ logger.info(f"Successfully loaded {len(stopwords_set)} stopwords from {source_name}.py")
28
  except ImportError:
29
  logger.error(
30
+ "Failed to import STOPWORDS from stopwords file. "
31
+ "Ensure the file exists in the 'pipeline' directory, is a Python module (ends in .py), "
32
+ "and is importable (e.g., no syntax errors)."
33
  )
34
  except AttributeError:
35
  logger.error(
36
+ "Variable 'STOPWORDS' (all caps) not found in stopwords file. "
37
+ "Please ensure the stopword set is defined with this name within the module."
38
  )
39
  except Exception as e:
40
+ logger.error(f"An unexpected error occurred while loading stopwords: {e}")
41
 
42
  return stopwords_set
pipeline/visualize.py CHANGED
@@ -85,11 +85,10 @@ def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict =
85
  title=plot_title,
86
  xaxis_title="Text Pair",
87
  yaxis_title="Chapter",
88
- autosize=False,
89
- width=1350,
90
- height=1200,
91
- font=dict(size=16),
92
- margin=dict(l=140, b=80, t=60),
93
  )
94
  fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
95
  fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")
 
85
  title=plot_title,
86
  xaxis_title="Text Pair",
87
  yaxis_title="Chapter",
88
+ autosize=True,
89
+ height=600,
90
+ font=dict(size=14),
91
+ margin=dict(l=100, b=100, t=50, r=50),
 
92
  )
93
  fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
94
  fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")
theme.py CHANGED
@@ -212,6 +212,18 @@ class TibetanAppTheme(gr.themes.Soft):
212
  "width": "100% !important",
213
  },
214
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  # LLM Analysis styling
216
  ".llm-analysis": {
217
  "background-color": "#f8f9fa !important",
 
212
  "width": "100% !important",
213
  },
214
 
215
+ # Heatmap plot styling - responsive sizing
216
+ ".tabs > .tab-content > div[data-testid='tabitem'] > .plotly": {
217
+ "width": "100% !important",
218
+ "height": "auto !important",
219
+ },
220
+
221
+ # Specific heatmap container styling
222
+ ".metric-heatmap": {
223
+ "max-width": "100% !important",
224
+ "overflow-x": "auto !important",
225
+ },
226
+
227
  # LLM Analysis styling
228
  ".llm-analysis": {
229
  "background-color": "#f8f9fa !important",