Commit bda2b5b
Parent(s): 28a74a6
maintenance and alignment prototype
Files changed:
- .DS_Store +0 -0
- .gitignore +3 -1
- academic_article.md +8 -0
- app.py +74 -6
- pipeline/.DS_Store +0 -0
- pipeline/differential_viz.py +283 -0
- pipeline/fast_lcs.pyx +18 -7
- pipeline/fasttext_embedding.py +23 -6
- pipeline/hf_embedding.py +19 -9
- pipeline/metrics.py +9 -5
- pipeline/process.py +2 -2
- pipeline/structural_analysis.py +241 -0
- pipeline/tibetan_stopwords.py +8 -7
- pipeline/visualize.py +4 -5
- theme.py +12 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitignore
CHANGED
@@ -1,3 +1,5 @@
 venv
 __pycache__
-academic_article.md
+academic_article.md
+#structural_analysis.py
+#differential_viz.py
academic_article.md
CHANGED
@@ -47,6 +47,14 @@ These metrics focus on the vocabulary and key terms within the texts.
 
 **Normalized Longest Common Subsequence (LCS):** This metric moves beyond vocabulary to assess structural parallels. The LCS algorithm finds the longest sequence of words that appears in both texts in the same relative order, though not necessarily contiguously. For example, the LCS of "the brown fox jumps" and "the lazy brown dog jumps" is "the brown jumps". TTM normalizes the length of this subsequence to produce a score that reflects shared phrasing and narrative structure. A high LCS score can indicate direct textual borrowing or a shared structural template. To ensure performance, the LCS calculation is optimized with a custom Cython implementation.
 
+**LCS vs. Levenshtein Distance:** While Levenshtein distance is another string similarity metric that measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into another, LCS is more appropriate for Tibetan text analysis for several reasons:
+
+1. Tibetan manuscripts often share common passages or structural elements: LCS is particularly effective at identifying these shared passages, which may be separated by varying amounts of different text.
+
+2. LCS focuses on meaningful shared content rather than character-level differences: Unlike Levenshtein distance, which is sensitive to every character change, LCS identifies the longest sequence of words in the same order, focusing on substantive content overlap.
+
+3. LCS is less sensitive to minor variations that might occur in handwritten or OCR-processed texts: Tibetan manuscripts often contain variations due to scribal errors, regional differences, or OCR artifacts. LCS can still identify shared structural elements despite these variations, whereas Levenshtein distance might be disproportionately affected by them.
+
 ### 2.4. Semantic Similarity
 
 To capture similarities in meaning that may not be apparent from lexical overlap, TTM employs semantic similarity using word and sentence embeddings.
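For reference, a minimal pure-Python sketch of the word-level LCS the article describes, using the article's own example. The normalization at the end is illustrative only; the excerpt does not state TTM's exact formula.

```python
def lcs_words(words1, words2):
    # Classic dynamic-programming LCS over word lists.
    m, n = len(words1), len(words2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if words1[i - 1] == words2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    # Backtrack to recover the subsequence itself.
    seq, i, j = [], m, n
    while i > 0 and j > 0:
        if words1[i - 1] == words2[j - 1]:
            seq.append(words1[i - 1])
            i, j = i - 1, j - 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return list(reversed(seq))

a = "the brown fox jumps".split()
b = "the lazy brown dog jumps".split()
print(lcs_words(a, b))                       # ['the', 'brown', 'jumps']
print(2 * len(lcs_words(a, b)) / (len(a) + len(b)))  # one possible normalization: ~0.667
```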
app.py
CHANGED
@@ -10,10 +10,10 @@ from datetime import datetime
 from dotenv import load_dotenv
 
 # Load environment variables from .env file
-load_dotenv()
-
 from theme import tibetan_theme
 
+load_dotenv()
+
 logger = logging.getLogger(__name__)
 def main_interface():
 with gr.Blocks(
@@ -67,12 +67,11 @@ def main_interface():
 model_dropdown = gr.Dropdown(
 choices=[
 "sentence-transformers/LaBSE",
-"intfloat/e5-base-v2",
 "Facebook FastText (Pre-trained)"
 ],
 label="Select Embedding Model",
 value="sentence-transformers/LaBSE",
-info="Select the embedding model to use for semantic similarity analysis."
+info="Select the embedding model to use for semantic similarity analysis. LaBSE v1.0 or FastText v1.0."
 )
 
 with gr.Accordion("Advanced Options", open=False):
@@ -208,6 +207,28 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 gr.Markdown("## Detailed Metric Analysis", elem_classes="gr-markdown")
 
 with gr.Tabs(elem_id="heatmap-tab-group"):
+# Structural Analysis Tab
+with gr.Tab("Structural Analysis"):
+with gr.Accordion("Understanding Structural Differences", open=False, elem_classes="structural-analysis-info"):
+gr.Markdown("""
+### Structural Analysis for Legal Manuscripts
+This enhanced analysis provides detailed insights into structural differences between chapters, specifically designed for Tibetan legal manuscript comparison.
+
+**Features:**
+- **Change Detection**: Identifies insertions, deletions, and modifications
+- **Structural Alignment**: Shows how chapters map structurally
+- **Differential Highlighting**: Highlights significant textual variations
+- **Per-Chapter Analysis**: Detailed comparison for each chapter pair
+
+**Usage:**
+Results appear automatically when texts are processed. Use the export buttons to save detailed reports for philological analysis.
+""")
+
+# Structural analysis outputs
+structural_heatmap = gr.Plot(label="Structural Changes Heatmap", show_label=False, elem_classes="structural-heatmap")
+structural_report = gr.HTML(label="Differential Analysis Report")
+structural_export = gr.File(label="Export Structural Analysis Report", file_types=[".html", ".md", ".json"])
+
 # Process all metrics including Word Counts in a unified way
 for metric_key, descriptive_title in heatmap_titles.items():
 with gr.Tab(metric_key):
@@ -247,7 +268,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 if metric_key == "Word Counts":
 word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
 else:
-heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False)
+heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
 
 # The outputs in process_btn.click should use the short metric names as keys for heatmap_tabs
 # e.g., heatmap_tabs["Jaccard Similarity (%)"]
@@ -286,6 +307,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 semantic_heatmap_res = None
 tfidf_heatmap_res = None
 warning_update_res = gr.update(value="", visible=False)  # Default: no warning
+structural_heatmap_res = None
+structural_report_res = None
+structural_export_res = None
 
 """
 Processes uploaded files, computes metrics, generates visualizations, and prepares outputs for the UI.
@@ -428,6 +452,44 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 logger.warning(f"Progress update error (non-critical): {e}")
 word_count_fig_res = generate_word_count_chart(word_counts_df_data)
 
+# Generate structural analysis
+if progress_tracker is not None:
+try:
+progress_tracker(0.92, desc="Generating structural analysis...")
+except Exception as e:
+logger.warning(f"Progress update error (non-critical): {e}")
+
+# Create structural analysis
+from pipeline.differential_viz import create_differential_heatmap, create_change_detection_report
+
+# Create structural heatmap
+try:
+structural_heatmap_res = create_differential_heatmap(
+text_data, "all_chapters", df_results
+)
+except Exception as e:
+logger.warning(f"Could not generate structural heatmap: {e}")
+structural_heatmap_res = None
+
+# Create structural report
+try:
+structural_report_res = create_change_detection_report(
+text_data, "all_chapters", "html"
+)
+except Exception as e:
+logger.warning(f"Could not generate structural report: {e}")
+structural_report_res = "<p>Could not generate structural analysis report.</p>"
+
+# Save structural analysis report
+try:
+report_path = "structural_analysis_report.html"
+with open(report_path, 'w', encoding='utf-8') as f:
+f.write(structural_report_res if isinstance(structural_report_res, str) else "")
+structural_export_res = report_path
+except Exception as e:
+logger.warning(f"Could not save structural report: {e}")
+structural_export_res = None
+
 # Save results to CSV
 if progress_tracker is not None:
 try:
@@ -466,7 +528,10 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 lcs_heatmap_res,
 semantic_heatmap_res,
 tfidf_heatmap_res,
-warning_update_res
+warning_update_res,
+structural_heatmap_res,
+structural_report_res,
+structural_export_res
 )
 
 # Function to interpret results using LLM
@@ -515,6 +580,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 heatmap_tabs["Semantic Similarity"],
 heatmap_tabs["TF-IDF Cosine Sim"],
 warning_box,
+structural_heatmap,
+structural_report,
+structural_export,
 ]
 )
 
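The app.py changes extend both the processing function's return tuple and the click handler's outputs list, which must stay in the same order. A minimal, hypothetical Gradio sketch of that pattern (not the app's actual handler or component set):

```python
import gradio as gr

def analyze(files):
    # Placeholder computation; the real app builds heatmaps and an HTML report.
    return None, "<p>report</p>", None

with gr.Blocks() as demo:
    uploads = gr.File(file_count="multiple")
    structural_heatmap = gr.Plot()
    structural_report = gr.HTML()
    structural_export = gr.File()
    run = gr.Button("Run")
    # The returned tuple lines up positionally with this outputs list,
    # which is why the commit appends the three new results in the same order.
    run.click(analyze, inputs=uploads,
              outputs=[structural_heatmap, structural_report, structural_export])
```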
pipeline/.DS_Store
ADDED
Binary file (6.15 kB)
pipeline/differential_viz.py
ADDED
@@ -0,0 +1,283 @@
"""
Differential visualization enhancements for Tibetan legal manuscript analysis.
Provides enhanced heatmaps with structural change highlighting.
"""

import plotly.graph_objects as go
from typing import Dict, List
import pandas as pd
from plotly.subplots import make_subplots
from .structural_analysis import detect_structural_changes, generate_structural_alignment


def create_differential_heatmap(texts_dict: Dict[str, str],
                                chapter_key: str,
                                metric_results: pd.DataFrame,
                                highlight_threshold: float = 0.7) -> go.Figure:
    """
    Create enhanced heatmap with structural change highlighting.

    Args:
        texts_dict: Dictionary mapping text names to their content
        chapter_key: Chapter identifier being analyzed
        metric_results: DataFrame with similarity metrics
        highlight_threshold: Threshold for highlighting significant changes
    """

    # Get unique text pairs
    text_pairs = metric_results['Text Pair'].unique()

    # Create enhanced heatmap data
    enhanced_data = []

    for pair in text_pairs:
        texts = pair.split(' vs ')
        if len(texts) == 2:
            text1_name, text2_name = texts

            # Get actual text content
            text1_content = texts_dict.get(text1_name, '')
            text2_content = texts_dict.get(text2_name, '')

            # Perform structural analysis
            changes = detect_structural_changes(text1_content, text2_content)
            alignment = generate_structural_alignment(text1_content, text2_content)

            # Create enhanced metrics
            enhanced_row = {
                'Text Pair': pair,
                'Chapter': chapter_key,
                'structural_changes': len(changes['insertions']) + len(changes['deletions']) + len(changes['modifications']),
                'modification_score': len(changes['modifications']),
                'insertion_score': len(changes['insertions']),
                'deletion_score': len(changes['deletions']),
                'alignment_quality': len(alignment['matches']) / max(len(alignment['segments1']) + len(alignment['segments2']), 1),
                'significant_differences': len([c for c in changes['modifications'] if len(c['original']) > 10])
            }

            enhanced_data.append(enhanced_row)

    enhanced_df = pd.DataFrame(enhanced_data)

    # Create subplots for different aspects
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Structural Changes', 'Modifications', 'Insertions/Deletions', 'Alignment Quality'),
        specs=[[{"secondary_y": True}, {"secondary_y": True}],
               [{"secondary_y": True}, {"secondary_y": True}]]
    )

    # Structural changes heatmap
    pivot_changes = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='structural_changes')
    fig.add_trace(
        go.Heatmap(
            z=pivot_changes.values,
            x=pivot_changes.columns,
            y=pivot_changes.index,
            colorscale='Reds',
            name='Structural Changes',
            showscale=True,
            colorbar=dict(title="Changes", x=0.45)
        ),
        row=1, col=1
    )

    # Modifications heatmap
    pivot_mods = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='modification_score')
    fig.add_trace(
        go.Heatmap(
            z=pivot_mods.values,
            x=pivot_mods.columns,
            y=pivot_mods.index,
            colorscale='Blues',
            name='Modifications',
            showscale=True,
            colorbar=dict(title="Mods", x=1.0)
        ),
        row=1, col=2
    )

    # Insertions/Deletions combined heatmap
    pivot_ins_del = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='insertion_score')
    pivot_del = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='deletion_score')
    combined = pivot_ins_del + pivot_del

    fig.add_trace(
        go.Heatmap(
            z=combined.values,
            x=combined.columns,
            y=combined.index,
            colorscale='Greens',
            name='Insertions+Deletions',
            showscale=True,
            colorbar=dict(title="Ins+Del", x=0.45)
        ),
        row=2, col=1
    )

    # Alignment quality heatmap
    pivot_align = enhanced_df.pivot(index='Chapter', columns='Text Pair', values='alignment_quality')
    fig.add_trace(
        go.Heatmap(
            z=pivot_align.values,
            x=pivot_align.columns,
            y=pivot_align.index,
            colorscale='Viridis',
            name='Alignment Quality',
            showscale=True,
            colorbar=dict(title="Quality", x=1.0)
        ),
        row=2, col=2
    )

    fig.update_layout(
        title=f"Structural Analysis - Chapter {chapter_key}",
        height=800,
        showlegend=False
    )

    return fig


def create_change_detection_report(texts_dict: Dict[str, str],
                                   chapter_key: str,
                                   output_format: str = 'html') -> str:
    """
    Create detailed change detection report for a chapter.

    Args:
        texts_dict: Dictionary mapping text names to content
        chapter_key: Chapter identifier
        output_format: Format for output ('html', 'json', 'markdown')
    """

    from .structural_analysis import generate_differential_report

    text_names = list(texts_dict.keys())
    reports = []

    for i, text1_name in enumerate(text_names):
        for text2_name in text_names[i+1:]:
            text1_content = texts_dict[text1_name]
            text2_content = texts_dict[text2_name]

            report = generate_differential_report(
                text1_content, text2_content, text1_name, text2_name
            )
            reports.append(report)

    if output_format == 'html':
        return create_html_report(reports, chapter_key)
    elif output_format == 'json':
        import json
        return json.dumps(reports, indent=2, ensure_ascii=False)
    else:
        return create_markdown_report(reports, chapter_key)


def create_html_report(reports: List[Dict], chapter_key: str) -> str:
    """Create HTML report for structural analysis."""

    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Structural Analysis Report - Chapter {chapter_key}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .report {{ max-width: 1200px; margin: 0 auto; }}
            .comparison {{ border: 1px solid #ddd; margin: 20px 0; padding: 15px; }}
            .changes {{ display: flex; gap: 20px; }}
            .change-type {{ flex: 1; padding: 10px; border: 1px solid #eee; }}
            .insertion {{ background-color: #e8f5e8; }}
            .deletion {{ background-color: #ffe8e8; }}
            .modification {{ background-color: #fff3e0; }}
            .highlight {{ background-color: yellow; padding: 2px 4px; }}
        </style>
    </head>
    <body>
        <div class="report">
            <h1>Structural Analysis Report - Chapter {chapter_key}</h1>
    """

    for report in reports:
        html += f"""
        <div class="comparison">
            <h2>{report['file1']} vs {report['file2']}</h2>
            <div class="scores">
                <p><strong>Structural Similarity:</strong> {report['scores']['structural_similarity']:.2f}</p>
                <p><strong>Alignment Score:</strong> {report['scores']['alignment_score']:.2f}</p>
            </div>

            <div class="changes">
                <div class="change-type insertion">
                    <h3>Insertions ({len(report['changes']['insertions'])})</h3>
                    {format_changes_html(report['changes']['insertions'])}
                </div>
                <div class="change-type deletion">
                    <h3>Deletions ({len(report['changes']['deletions'])})</h3>
                    {format_changes_html(report['changes']['deletions'])}
                </div>
                <div class="change-type modification">
                    <h3>Modifications ({len(report['changes']['modifications'])})</h3>
                    {format_changes_html(report['changes']['modifications'], is_modification=True)}
                </div>
            </div>
        </div>
        """

    html += """
        </div>
    </body>
    </html>
    """

    return html


def format_changes_html(changes: List[Dict], is_modification: bool = False) -> str:
    """Format changes for HTML display."""
    if not changes:
        return "<p>No changes detected.</p>"

    html = ""
    for change in changes[:5]:  # Limit to first 5 for brevity
        if is_modification:
            html += f"""
            <div class="change">
                <span class="highlight">{change.get('original', '')}</span> →
                <span class="highlight">{change.get('replacement', '')}</span>
            </div>
            """
        else:
            html += f"""
            <div class="change">
                <span class="highlight">{change.get('word', '')}</span>
            </div>
            """

    if len(changes) > 5:
        html += f"<p>... and {len(changes) - 5} more</p>"

    return html


def create_markdown_report(reports: List[Dict], chapter_key: str) -> str:
    """Create markdown report for structural analysis."""

    md = f"# Structural Analysis Report - Chapter {chapter_key}\n\n"

    for report in reports:
        md += f"## {report['file1']} vs {report['file2']}\n\n"
        md += f"- **Structural Similarity**: {report['scores']['structural_similarity']:.2f}\n"
        md += f"- **Alignment Score**: {report['scores']['alignment_score']:.2f}\n"
        md += f"- **Insertions**: {len(report['changes']['insertions'])}\n"
        md += f"- **Deletions**: {len(report['changes']['deletions'])}\n"
        md += f"- **Modifications**: {len(report['changes']['modifications'])}\n\n"

        if report['changes']['modifications']:
            md += "### Significant Modifications:\n"
            for mod in report['changes']['modifications'][:3]:
                md += f"- **{mod.get('original', '')}** → **{mod.get('replacement', '')}**\n"

    return md
pipeline/fast_lcs.pyx
CHANGED
@@ -3,9 +3,12 @@ import numpy as np
 cimport cython
 cimport numpy as np
 
+# Use memory views for better performance
+ctypedef np.int32_t DTYPE_t
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
+@cython.cdivision(True)
 def compute_lcs_fast(list words1, list words2):
 """
 Computes the Longest Common Subsequence (LCS) of two lists of words.
@@ -28,18 +31,26 @@ def compute_lcs_fast(list words1, list words2):
 return compute_lcs_fast(words2, words1)
 
 # We only need two rows for the DP table
-cdef np.ndarray[
-cdef np.ndarray[
+cdef np.ndarray[DTYPE_t, ndim=1] prev_row = np.zeros(n + 1, dtype=np.int32)
+cdef np.ndarray[DTYPE_t, ndim=1] curr_row = np.zeros(n + 1, dtype=np.int32)
+
+# Use memory views for better access performance
+cdef DTYPE_t[:] prev_view = prev_row
+cdef DTYPE_t[:] curr_view = curr_row
+
 cdef int i, j
+cdef DTYPE_t val1, val2
 
 for i in range(1, m + 1):
 for j in range(1, n + 1):
 if words1[i - 1] == words2[j - 1]:
-
+curr_view[j] = prev_view[j - 1] + 1
 else:
-
+val1 = prev_view[j]
+val2 = curr_view[j - 1]
+curr_view[j] = val1 if val1 > val2 else val2
 
-#
-
+# Swap views instead of copying for better performance
+prev_view, curr_view = curr_view, prev_view
 
-return int
+return <int>prev_view[n]
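These .pyx changes only take effect once the extension is compiled. The commit does not show the project's build configuration, so the following setup.py is a hypothetical sketch of a typical Cython build for this module; the package name is an assumption.

```python
# Hypothetical build script; the repository's actual build setup is not shown in this commit.
from setuptools import setup
from Cython.Build import cythonize
import numpy as np

setup(
    name="ttm-fast-lcs",  # assumed name, for illustration only
    ext_modules=cythonize("pipeline/fast_lcs.pyx", language_level=3),
    include_dirs=[np.get_include()],  # needed because the module cimports numpy
)
```

Built in place with, for example, `python setup.py build_ext --inplace`.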
pipeline/fasttext_embedding.py
CHANGED
@@ -10,7 +10,7 @@ import logging
 import numpy as np
 import fasttext
 from collections import Counter
-from typing import List,
+from typing import List, Optional, Tuple, Any, Set
 from huggingface_hub import hf_hub_download
 
 # Set up logging
@@ -26,6 +26,11 @@ DEFAULT_MINN = 3
 DEFAULT_MAXN = 6
 DEFAULT_NEG = 5
 
+# Model version information
+MODEL_VERSIONS = {
+    "facebook-fasttext-pretrained": "v1.0",
+}
+
 # Define paths for model storage
 DEFAULT_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
 DEFAULT_MODEL_PATH = str(Path(__file__).resolve().parent.parent / "fasttext-modelling" / "tibetan_cbow_model.bin")  # Updated to custom model
@@ -399,9 +404,12 @@ def get_batch_embeddings(
 return np.array(embeddings)
 
 
-def get_model(model_id: str):
+# Cache for loaded FastText models
+_fasttext_model_cache = {}
+
+def get_model(model_id: str) -> Tuple[Optional[Any], Optional[str]]:
 """
-Loads a FastText model
+Loads a FastText model with version tracking.
 
 Args:
 model_id (str): The identifier for the model to load.
@@ -410,13 +418,22 @@ def get_model(model_id: str):
 Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded model and its type ('fasttext'),
 or (None, None) if loading fails.
 """
-
+# Include version information in cache key
+model_version = MODEL_VERSIONS.get(model_id, "unknown")
+cache_key = f"{model_id}@{model_version}"
+
+if cache_key in _fasttext_model_cache:
+logger.info(f"Returning cached FastText model: {model_id} (version: {model_version})")
+return _fasttext_model_cache[cache_key], "fasttext"
+
+logger.info(f"Attempting to load FastText model: {model_id} (version: {model_version})")
 
 if model_id == "facebook-fasttext-pretrained":
 try:
 model = _load_facebook_official_tibetan_model()
 if model:
-
+_fasttext_model_cache[cache_key] = model
+logger.info(f"FastText model '{model_id}' (version: {model_version}) loaded successfully.")
 return model, "fasttext"
 else:
 logger.error(f"Model loading for '{model_id}' returned None.")
@@ -428,7 +445,7 @@ def get_model(model_id: str):
 # elif model_id == "custom-model-name":
 # ...
 else:
-logger.
+logger.warning(f"Unsupported FastText model ID: {model_id}")
 return None, None
 
 def generate_embeddings(
pipeline/hf_embedding.py
CHANGED
@@ -1,16 +1,22 @@
 import logging
-from typing import List,
+from typing import List, Optional, Tuple
 import numpy as np
 from sentence_transformers import SentenceTransformer
 
 logger = logging.getLogger(__name__)
 
-# Cache for loaded models
+# Cache for loaded models with version information
 _model_cache = {}
 
+# Model version mapping
+MODEL_VERSIONS = {
+    "sentence-transformers/LaBSE": "v1.0",
+    "intfloat/e5-base-v2": "v1.0",
+}
+
 def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[str]]:
 """
-Loads a SentenceTransformer model from the Hugging Face Hub.
+Loads a SentenceTransformer model from the Hugging Face Hub with version tracking.
 
 Args:
 model_id (str): The identifier for the model to load (e.g., 'sentence-transformers/LaBSE').
@@ -19,15 +25,19 @@ def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[st
 Tuple[Optional[SentenceTransformer], Optional[str]]: A tuple containing the loaded model and its type ('sentence-transformer'),
 or (None, None) if loading fails.
 """
-
-
-
+# Include version information in cache key
+model_version = MODEL_VERSIONS.get(model_id, "unknown")
+cache_key = f"{model_id}@{model_version}"
+
+if cache_key in _model_cache:
+logger.info(f"Returning cached model: {model_id} (version: {model_version})")
+return _model_cache[cache_key], "sentence-transformer"
 
-logger.info(f"Loading SentenceTransformer model: {model_id}")
+logger.info(f"Loading SentenceTransformer model: {model_id} (version: {model_version})")
 try:
 model = SentenceTransformer(model_id)
-_model_cache[
-logger.info(f"Model '{model_id}' loaded successfully.")
+_model_cache[cache_key] = model
+logger.info(f"Model '{model_id}' (version: {model_version}) loaded successfully.")
 return model, "sentence-transformer"
 except Exception as e:
 logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
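Both embedding loaders now key their caches on the model ID plus a version string, so bumping a version invalidates stale entries without clearing the whole cache. A small standalone illustration of that pattern (not the module's actual code path):

```python
_cache = {}
MODEL_VERSIONS = {"sentence-transformers/LaBSE": "v1.0"}

def load_cached(model_id: str, loader):
    version = MODEL_VERSIONS.get(model_id, "unknown")
    key = f"{model_id}@{version}"      # bump the version string to force a reload
    if key not in _cache:
        _cache[key] = loader(model_id)  # expensive load happens once per (model, version)
    return _cache[key]
```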
pipeline/metrics.py
CHANGED
@@ -2,11 +2,11 @@ import numpy as np
 import pandas as pd
 from typing import List, Dict, Union
 from itertools import combinations
+
 from sklearn.metrics.pairwise import cosine_similarity
 from .fasttext_embedding import generate_embeddings as generate_fasttext_embeddings
 from .hf_embedding import generate_embeddings as generate_hf_embeddings
 
-from .tokenize import tokenize_texts
 import logging
 from sklearn.feature_extraction.text import TfidfVectorizer
 from .stopwords_bo import TIBETAN_STOPWORDS
@@ -139,8 +139,10 @@ def compute_semantic_similarity(
 return np.nan
 
 # Ensure embeddings are numpy arrays (should be, but defensive)
-if not isinstance(emb1, np.ndarray):
-
+if not isinstance(emb1, np.ndarray):
+emb1 = np.array(emb1)
+if not isinstance(emb2, np.ndarray):
+emb2 = np.array(emb2)
 
 # Handle cases where embeddings are all zeros
 if np.all(emb1 == 0) and np.all(emb2 == 0):
@@ -157,8 +159,10 @@ def compute_semantic_similarity(
 return 0.0
 
 # Ensure embeddings are 2D for cosine_similarity: [1, dim]
-if emb1.ndim == 1:
-
+if emb1.ndim == 1:
+emb1 = emb1.reshape(1, -1)
+if emb2.ndim == 1:
+emb2 = emb2.reshape(1, -1)
 
 similarity_score = cosine_similarity(emb1, emb2)[0][0]
 
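The added reshape guards exist because scikit-learn's cosine_similarity expects 2-D input of shape (n_samples, n_features), so a single 1-D embedding must first become a one-row matrix. A standalone illustration:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

emb1 = np.array([0.1, 0.3, 0.6])  # 1-D vector, shape (3,)
emb2 = np.array([0.2, 0.1, 0.7])

# reshape(1, -1) turns each vector into a (1, dim) matrix; the result is a (1, 1) array.
score = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]
print(round(float(score), 4))
```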
pipeline/process.py
CHANGED
@@ -3,7 +3,6 @@ from typing import Dict, List, Tuple
 from .metrics import compute_all_metrics
 from .fasttext_embedding import get_model as get_fasttext_model
 from .hf_embedding import get_model as get_hf_model
-from .fasttext_embedding import load_fasttext_model
 from .tokenize import tokenize_texts
 import logging
 from itertools import combinations
@@ -90,6 +89,7 @@ def process_texts(
 """
 # Initialize model and model_type variables
 model, model_type = None, None  # st_device removed
+warning = ""
 model_warning = ""
 
 # Update progress if callback provided
@@ -118,7 +118,7 @@ def process_texts(
 else:
 model_warning = f"Model ('{model_name}') failed to load. Semantic similarity will be disabled."
 logger.warning(model_warning)
-warning
+warning = warning + f" {model_warning}" if 'warning' in locals() else model_warning
 enable_semantic = False
 if progress_callback is not None:
 try:
pipeline/structural_analysis.py
ADDED
@@ -0,0 +1,241 @@
"""
Chapter-level structural analysis for Tibetan legal manuscripts.
Provides differential highlighting, change detection, and structural alignment.
"""

import difflib
import re


def detect_structural_changes(text1: str, text2: str,
                              min_change_length: int = 5,
                              context_window: int = 10) -> dict:
    """
    Detect structural changes between two Tibetan text chapters.

    Args:
        text1: First text chapter
        text2: Second text chapter
        min_change_length: Minimum length of change to report
        context_window: Number of characters to include as context

    Returns:
        Dictionary with detected changes: insertions, deletions, modifications
    """

    # Clean texts for comparison
    def clean_text(text):
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        return text

    clean1 = clean_text(text1)
    clean2 = clean_text(text2)

    # Use difflib to detect changes
    differ = difflib.Differ()
    diff = list(differ.compare(clean1.split(), clean2.split()))

    changes = {
        'insertions': [],
        'deletions': [],
        'modifications': [],
        'unchanged': []
    }

    # Track current position in both texts
    pos1 = 0
    pos2 = 0

    for i, line in enumerate(diff):
        if line.startswith(' '):  # Unchanged
            word = line[2:]
            changes['unchanged'].append({
                'word': word,
                'position1': pos1,
                'position2': pos2,
                'length': len(word)
            })
            pos1 += len(word) + 1
            pos2 += len(word) + 1

        elif line.startswith('- '):  # Deletion
            word = line[2:]
            if len(word) >= min_change_length:
                changes['deletions'].append({
                    'word': word,
                    'position': pos1,
                    'length': len(word),
                    'context': get_context(clean1, pos1, context_window)
                })
            pos1 += len(word) + 1

        elif line.startswith('+ '):  # Insertion
            word = line[2:]
            if len(word) >= min_change_length:
                changes['insertions'].append({
                    'word': word,
                    'position': pos2,
                    'length': len(word),
                    'context': get_context(clean2, pos2, context_window)
                })
            pos2 += len(word) + 1

    # Detect modifications (adjacent deletions and insertions)
    modifications = detect_modifications(changes['deletions'], changes['insertions'])
    changes['modifications'] = modifications

    return changes


def get_context(text: str, position: int, window: int) -> str:
    """Get context around a position in text."""
    start = max(0, position - window)
    end = min(len(text), position + window)
    return text[start:end]


def detect_modifications(deletions: list[dict], insertions: list[dict]) -> list[dict]:
    """Detect modifications by pairing nearby deletions and insertions."""
    modifications = []

    for deletion in deletions[:]:  # Copy to avoid modification during iteration
        for insertion in insertions[:]:
            # If deletion and insertion are close (within 5 positions)
            if abs(deletion['position'] - insertion['position']) <= 5:
                modifications.append({
                    'original': deletion['word'],
                    'replacement': insertion['word'],
                    'position': deletion['position'],
                    'deletion_context': deletion['context'],
                    'insertion_context': insertion['context']
                })
                # Remove from original lists to avoid duplicates
                if deletion in deletions:
                    deletions.remove(deletion)
                if insertion in insertions:
                    insertions.remove(insertion)
                break

    return modifications


def generate_structural_alignment(text1: str, text2: str) -> dict[str, list]:
    """
    Generate structural alignment between two text chapters.

    Returns:
        Dictionary with alignment information including gaps and matches
    """

    # Split into sentences or clauses for alignment
    def split_into_segments(text):
        # Split on Tibetan punctuation
        segments = re.split(r'[།༎༏༐༑༔]', text)
        return [seg.strip() for seg in segments if seg.strip()]

    segments1 = split_into_segments(text1)
    segments2 = split_into_segments(text2)

    # Create alignment using sequence matcher
    matcher = difflib.SequenceMatcher(None, segments1, segments2)

    alignment = {
        'matches': [],
        'gaps': [],
        'mismatches': [],
        'segments1': segments1,
        'segments2': segments2
    }

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            alignment['matches'].append({
                'segments1': segments1[i1:i2],
                'segments2': segments2[j1:j2],
                'type': 'match'
            })
        elif tag == 'delete':
            alignment['gaps'].append({
                'segments': segments1[i1:i2],
                'type': 'deletion',
                'position': 'text1'
            })
        elif tag == 'insert':
            alignment['gaps'].append({
                'segments': segments2[j1:j2],
                'type': 'insertion',
                'position': 'text2'
            })
        elif tag == 'replace':
            alignment['mismatches'].append({
                'original': segments1[i1:i2],
                'replacement': segments2[j1:j2],
                'type': 'modification'
            })

    return alignment


def calculate_structural_similarity_score(text1: str, text2: str) -> dict[str, float]:
    """
    Calculate various structural similarity scores between two texts.

    Returns:
        Dictionary with multiple similarity metrics
    """

    changes = detect_structural_changes(text1, text2)
    alignment = generate_structural_alignment(text1, text2)

    # Calculate scores
    total_changes = len(changes['insertions']) + len(changes['deletions']) + len(changes['modifications'])

    # Structural similarity score (inverse of changes)
    text_length = max(len(text1.split()), len(text2.split()))
    structural_score = max(0, 1 - (total_changes / text_length)) if text_length > 0 else 0

    # Alignment-based score
    total_segments = len(alignment['segments1']) + len(alignment['segments2'])
    matches = len(alignment['matches'])
    alignment_score = matches / (total_segments / 2) if total_segments > 0 else 0

    return {
        'structural_similarity': structural_score,
        'alignment_score': alignment_score,
        'insertions': len(changes['insertions']),
        'deletions': len(changes['deletions']),
        'modifications': len(changes['modifications']),
        'total_changes': total_changes
    }


def generate_differential_report(text1: str, text2: str,
                                 file1_name: str = "Text 1",
                                 file2_name: str = "Text 2") -> dict[str, any]:
    """
    Generate a comprehensive differential report for two text chapters.

    Returns:
        Complete report with changes, alignment, and recommendations
    """

    changes = detect_structural_changes(text1, text2)
    alignment = generate_structural_alignment(text1, text2)
    scores = calculate_structural_similarity_score(text1, text2)

    report = {
        'file1': file1_name,
        'file2': file2_name,
        'changes': changes,
        'alignment': alignment,
        'scores': scores,
        'summary': {
            'significant_differences': len([c for c in changes['modifications'] if len(c['original']) > 10 or len(c['replacement']) > 10]),
            'minor_variants': len([c for c in changes['modifications'] if len(c['original']) <= 5 and len(c['replacement']) <= 5]),
            'structural_preservation': scores['alignment_score'] > 0.8,
            'recommendation': 'Manuscripts are structurally similar' if scores['alignment_score'] > 0.7 else 'Significant structural differences detected'
        }
    }

    return report
pipeline/tibetan_stopwords.py
CHANGED
@@ -23,19 +23,20 @@ def get_stopwords(use_lite: bool = False) -> set:
 from .stopwords_bo import STOPWORDS
 stopwords_set = STOPWORDS
 
-
+source_name = module_name.lstrip('.')
+logger.info(f"Successfully loaded {len(stopwords_set)} stopwords from {source_name}.py")
 except ImportError:
 logger.error(
-
-
-
+"Failed to import STOPWORDS from stopwords file. "
+"Ensure the file exists in the 'pipeline' directory, is a Python module (ends in .py), "
+"and is importable (e.g., no syntax errors)."
 )
 except AttributeError:
 logger.error(
-
-
+"Variable 'STOPWORDS' (all caps) not found in stopwords file. "
+"Please ensure the stopword set is defined with this name within the module."
 )
 except Exception as e:
-logger.error(f"An unexpected error occurred while loading stopwords
+logger.error(f"An unexpected error occurred while loading stopwords: {e}")
 
 return stopwords_set
pipeline/visualize.py
CHANGED
@@ -85,11 +85,10 @@ def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict =
 title=plot_title,
 xaxis_title="Text Pair",
 yaxis_title="Chapter",
-autosize=
-
-
-
-margin=dict(l=140, b=80, t=60),
+autosize=True,
+height=600,
+font=dict(size=14),
+margin=dict(l=100, b=100, t=50, r=50),
 )
 fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
 fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")
theme.py
CHANGED
@@ -212,6 +212,18 @@ class TibetanAppTheme(gr.themes.Soft):
 "width": "100% !important",
 },
 
+# Heatmap plot styling - responsive sizing
+".tabs > .tab-content > div[data-testid='tabitem'] > .plotly": {
+    "width": "100% !important",
+    "height": "auto !important",
+},
+
+# Specific heatmap container styling
+".metric-heatmap": {
+    "max-width": "100% !important",
+    "overflow-x": "auto !important",
+},
+
 # LLM Analysis styling
 ".llm-analysis": {
 "background-color": "#f8f9fa !important",
|