"""
Advanced Tibetan Legal Manuscript Alignment Engine

Juxta/CollateX-inspired alignment with Tibetan-specific enhancements.
"""
import difflib
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple

# Module-level logger, configured by the host application.
logger = logging.getLogger(__name__)
@dataclass
class AlignmentSegment:
    """Represents an aligned segment between two witness texts.

    Fix: the fields were declared as bare annotations without the
    ``@dataclass`` decorator (imported but unused), so keyword construction
    such as ``AlignmentSegment(text1_content=..., ...)`` raised ``TypeError``.
    """

    text1_content: str   # segment content drawn from the first text
    text2_content: str   # segment content drawn from the second text
    # Produced by map_opcode_to_type: 'match', 'deletion', 'insertion',
    # 'mismatch' (or 'unknown' for an unrecognized opcode).
    alignment_type: str
    confidence: float    # alignment confidence in [0.0, 1.0]
    position_text1: int  # segment start index within text1's unit sequence
    position_text2: int  # segment start index within text2's unit sequence
    context: str = ""    # optional surrounding context (may be empty)
@dataclass
class TibetanAlignmentResult:
    """Complete alignment result for a pair of Tibetan manuscripts.

    Fix: the fields were declared without the ``@dataclass`` decorator
    (imported but unused), so keyword construction raised ``TypeError``.
    """

    segments: List["AlignmentSegment"]      # aligned segments, primary view
    transpositions: List[Tuple[int, int]]   # index pairs of moved content
    insertions: List[Dict]                  # content present only in text2
    deletions: List[Dict]                   # content present only in text1
    modifications: List[Dict]               # content differing between texts
    alignment_score: float                  # overall alignment quality
    structural_similarity: float            # paragraph-level similarity
    scholarly_apparatus: Dict               # critical-edition metadata
class TibetanLegalAligner:
    """
    Juxta/CollateX-inspired alignment engine for Tibetan legal manuscripts.

    Features:
    - Multi-level alignment (character -> syllable -> sentence -> paragraph)
    - Transposition detection (content moves)
    - Tibetan-specific punctuation handling
    - Scholarly apparatus generation
    - Confidence scoring
    """

    def __init__(self, min_segment_length: int = 3, context_window: int = 15):
        """
        Args:
            min_segment_length: Minimum unit count for a meaningful segment.
            context_window: Characters of surrounding context to retain.
        """
        self.min_segment_length = min_segment_length
        self.context_window = context_window
        # Character class of Tibetan shad (sentence) and tsheg (syllable) marks.
        self.tibetan_punctuation = r'[།༎༏༐༑༔་]'

    def tibetan_tokenize(self, text: str) -> List[str]:
        """Tokenize on Tibetan punctuation and whitespace, dropping empties."""
        tokens = re.split(rf'{self.tibetan_punctuation}|\s+', text)
        return [token.strip() for token in tokens if token.strip()]

    def segment_by_syllables(self, text: str) -> List[str]:
        """Segment text into Tibetan syllables.

        A syllable is a maximal run of non-tsheg characters plus its trailing
        tsheg (་) when present.
        """
        syllables = re.findall(r'[^་]+་?', text)
        return [s.strip() for s in syllables if s.strip()]

    def multi_level_alignment(self, text1: str, text2: str) -> "TibetanAlignmentResult":
        """
        Multi-level alignment inspired by Juxta/CollateX.

        Levels:
        1. Character level (for precise changes)
        2. Syllable level (Tibetan linguistic units)
        3. Sentence level (punctuation-based)
        4. Paragraph level (structural blocks)

        Returns:
            Combined result with sentence-level segments as the primary view.
        """
        char_alignment = self.character_level_alignment(text1, text2)
        syllable_alignment = self.syllable_level_alignment(text1, text2)
        sentence_alignment = self.sentence_level_alignment(text1, text2)
        structural_alignment = self.structural_level_alignment(text1, text2)
        return self.combine_alignments(
            char_alignment, syllable_alignment,
            sentence_alignment, structural_alignment
        )

    def _align_units(self, units1, units2, joiner: str, level: str) -> Dict:
        """Shared difflib alignment over two unit sequences.

        Works for plain strings (character level) as well as token lists,
        since both support slicing and str.join. Previously this loop was
        duplicated verbatim in all four level-specific methods.
        """
        matcher = difflib.SequenceMatcher(None, units1, units2)
        segments = []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            content1 = joiner.join(units1[i1:i2])
            content2 = joiner.join(units2[j1:j2])
            segments.append(AlignmentSegment(
                text1_content=content1,
                text2_content=content2,
                alignment_type=self.map_opcode_to_type(tag),
                confidence=self.calculate_confidence(content1, content2),
                position_text1=i1,
                position_text2=j1,
            ))
        return {'segments': segments, 'level': level}

    def character_level_alignment(self, text1: str, text2: str) -> Dict:
        """Character-level precise alignment."""
        return self._align_units(text1, text2, '', 'character')

    def syllable_level_alignment(self, text1: str, text2: str) -> Dict:
        """Tibetan syllable-level alignment."""
        return self._align_units(self.segment_by_syllables(text1),
                                 self.segment_by_syllables(text2),
                                 ' ', 'syllable')

    def sentence_level_alignment(self, text1: str, text2: str) -> Dict:
        """Sentence-level alignment using Tibetan punctuation."""
        return self._align_units(self.tibetan_tokenize(text1),
                                 self.tibetan_tokenize(text2),
                                 ' ', 'sentence')

    def structural_level_alignment(self, text1: str, text2: str) -> Dict:
        """Structural alignment over blank-line separated paragraph blocks."""
        return self._align_units(text1.split('\n\n'), text2.split('\n\n'),
                                 '\n\n', 'structural')

    def detect_transpositions(self, segments: "List[AlignmentSegment]") -> List[Tuple[int, int]]:
        """Detect content transpositions (moves) between texts.

        Returns index pairs of matched segments carrying identical text1
        content at different positions in the segment list.
        """
        content_map = defaultdict(list)
        for i, segment in enumerate(segments):
            if segment.alignment_type == 'match':
                content_map[segment.text1_content].append(i)
        transpositions = []
        for positions in content_map.values():
            if len(positions) > 1:
                # Same content matched at several positions: emit every pair.
                transpositions.extend(
                    (positions[i], positions[j])
                    for i in range(len(positions))
                    for j in range(i + 1, len(positions))
                )
        return transpositions

    def map_opcode_to_type(self, opcode: str) -> str:
        """Map a difflib opcode to this engine's alignment type."""
        mapping = {
            'equal': 'match',
            'delete': 'deletion',
            'insert': 'insertion',
            'replace': 'mismatch'
        }
        return mapping.get(opcode, 'unknown')

    def calculate_confidence(self, content1: str, content2: str) -> float:
        """Similarity score in [0, 1] from normalized Levenshtein distance."""
        if not content1 and not content2:
            return 1.0  # two empty sides agree trivially
        if not content1 or not content2:
            return 0.0  # pure insertion or deletion
        distance = self.levenshtein_distance(content1, content2)
        max_len = max(len(content1), len(content2))
        return max(0.0, 1.0 - (distance / max_len)) if max_len > 0 else 1.0

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Iterative two-row Levenshtein edit distance.

        The argument swap keeps the inner row sized by the shorter string
        (replaces the original's recursive self-call for the same effect).
        """
        if len(s1) < len(s2):
            s1, s2 = s2, s1
        if not s2:
            return len(s1)
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def generate_scholarly_apparatus(self, alignment: "TibetanAlignmentResult") -> Dict:
        """Generate a scholarly apparatus summary for a critical edition."""
        segments = alignment.segments
        return {
            'sigla': {
                'witness_a': 'Base text',
                'witness_b': 'Variant text'
            },
            'critical_notes': self.generate_critical_notes(alignment),
            'alignment_summary': {
                'total_segments': len(segments),
                'exact_matches': sum(1 for s in segments
                                     if s.alignment_type == 'match'),
                'variants': sum(1 for s in segments
                                if s.alignment_type in ('mismatch', 'modification')),
                'transpositions': len(alignment.transpositions),
                'confidence_score': (sum(s.confidence for s in segments)
                                     / len(segments)) if segments else 0
            }
        }

    def generate_critical_notes(self, alignment: "TibetanAlignmentResult") -> List[str]:
        """Generate critical notes in scholarly format, one per variant."""
        return [
            f"Variant: '{segment.text1_content}' → '{segment.text2_content}'"
            for segment in alignment.segments
            if segment.alignment_type in ['mismatch', 'modification']
        ]

    def combine_alignments(self, *alignments) -> "TibetanAlignmentResult":
        """Combine multi-level alignments into the final result.

        Fixes over the original:
        - Uses dict lookup with a fallback to the first supplied level
          instead of a bare ``next()`` that raised StopIteration when the
          sentence level was absent.
        - Populates transpositions / insertions / deletions / modifications
          from the primary segments (previously always returned empty).
        - Derives alignment_score and structural_similarity from segment
          confidences instead of hard-coded placeholder constants.
        """
        by_level = {a['level']: a for a in alignments}
        primary = by_level.get('sentence') or (alignments[0] if alignments
                                               else {'segments': []})
        segments = primary['segments']

        def mean_confidence(segs) -> float:
            # Average segment confidence; 0.0 for an empty segment list.
            return sum(s.confidence for s in segs) / len(segs) if segs else 0.0

        structural_segments = by_level.get('structural', {'segments': []})['segments']
        return TibetanAlignmentResult(
            segments=segments,
            transpositions=self.detect_transpositions(segments),
            insertions=[{'position': s.position_text2, 'content': s.text2_content}
                        for s in segments if s.alignment_type == 'insertion'],
            deletions=[{'position': s.position_text1, 'content': s.text1_content}
                       for s in segments if s.alignment_type == 'deletion'],
            modifications=[{'position': s.position_text1,
                            'original': s.text1_content,
                            'variant': s.text2_content}
                           for s in segments if s.alignment_type == 'mismatch'],
            alignment_score=mean_confidence(segments),
            structural_similarity=mean_confidence(structural_segments),
            scholarly_apparatus={
                'method': 'Juxta/CollateX-inspired multi-level alignment',
                'levels': ['character', 'syllable', 'sentence', 'structural']
            }
        )
# Integration function for existing codebase
def enhanced_structural_analysis(text1: str, text2: str,
                                 file1_name: str = "Text 1",
                                 file2_name: str = "Text 2") -> dict:
    """
    Enhanced structural analysis using Juxta/CollateX-inspired algorithms.

    Args:
        text1: First text to analyze
        text2: Second text to analyze
        file1_name: Display name for the first text
        file2_name: Display name for the second text

    Returns:
        Comprehensive alignment analysis as a plain, JSON-friendly dict.
    """
    aligner = TibetanLegalAligner()
    result = aligner.multi_level_alignment(text1, text2)
    return {
        # Fix: the name parameters were previously accepted but never used;
        # surface them so callers can label the witnesses in output.
        'witnesses': {'text1': file1_name, 'text2': file2_name},
        'alignment_segments': [{
            'type': segment.alignment_type,
            'content1': segment.text1_content,
            'content2': segment.text2_content,
            'confidence': segment.confidence
        } for segment in result.segments],
        'transpositions': result.transpositions,
        'scholarly_apparatus': result.scholarly_apparatus,
        'alignment_score': result.alignment_score,
        'structural_similarity': result.structural_similarity
    }