"""
Advanced Tibetan Legal Manuscript Alignment Engine
Juxta/CollateX-inspired alignment with Tibetan-specific enhancements
"""

import difflib
import re
from typing import Dict, List, Sequence, Tuple
from dataclasses import dataclass
from collections import defaultdict
import logging

logger = logging.getLogger(__name__)

# Alignment types counted as textual variants in the apparatus.
# 'modification' is never produced by map_opcode_to_type; it is kept so that
# segments built elsewhere with that label are still counted.
_VARIANT_TYPES = ('mismatch', 'modification')


@dataclass
class AlignmentSegment:
    """Represents an aligned segment between texts."""
    text1_content: str
    text2_content: str
    # One of the values produced by TibetanLegalAligner.map_opcode_to_type:
    # 'match', 'deletion', 'insertion', 'mismatch' (or 'unknown').
    alignment_type: str
    # 1.0 for identical content, 0.0 when one side is empty, otherwise
    # 1 - (Levenshtein distance / longer length).
    confidence: float
    # Start index of the segment in each text, counted in units of the
    # alignment level (characters, syllables, sentences or paragraphs).
    position_text1: int
    position_text2: int
    context: str = ""


@dataclass
class TibetanAlignmentResult:
    """Complete alignment result for Tibetan manuscripts."""
    segments: List[AlignmentSegment]
    transpositions: List[Tuple[int, int]]
    insertions: List[Dict]
    deletions: List[Dict]
    modifications: List[Dict]
    alignment_score: float
    structural_similarity: float
    scholarly_apparatus: Dict


class TibetanLegalAligner:
    """
    Juxta/CollateX-inspired alignment engine for Tibetan legal manuscripts.

    Features:
    - Multi-level alignment (character → syllable → sentence → paragraph)
    - Transposition detection (content moves)
    - Tibetan-specific punctuation handling
    - Scholarly apparatus generation
    - Confidence scoring
    """

    # Shad-type marks (U+0F0D..U+0F11) and gter tsheg (U+0F14). The tsheg
    # (U+0F0B) is deliberately absent: it separates syllables, not sentences.
    SENTENCE_PUNCTUATION = '[\u0f0d\u0f0e\u0f0f\u0f10\u0f11\u0f14]'

    def __init__(self, min_segment_length: int = 3, context_window: int = 15):
        """
        Args:
            min_segment_length: Minimum length (in characters) a piece of
                content must have to be considered by detect_transpositions.
            context_window: Stored on the instance; not read by any method
                in this module.
        """
        self.min_segment_length = min_segment_length
        self.context_window = context_window
        self.tibetan_punctuation = r'[།༎༏༐༑༔་]'

    def tibetan_tokenize(self, text: str) -> List[str]:
        """Tibetan-specific tokenization respecting syllable boundaries.

        Splits on every mark in ``self.tibetan_punctuation`` (which includes
        the tsheg) and on whitespace, dropping empty tokens.
        """
        tokens = re.split(rf'{self.tibetan_punctuation}|\s+', text)
        return [token.strip() for token in tokens if token.strip()]

    def split_sentences(self, text: str) -> List[str]:
        """Split text into sentence-like units on shad-type punctuation.

        Unlike tibetan_tokenize, the tsheg is not a delimiter here, so the
        syllables of one sentence stay together. Empty units are dropped and
        surrounding whitespace is stripped.
        """
        parts = re.split(self.SENTENCE_PUNCTUATION, text)
        return [part.strip() for part in parts if part.strip()]

    def segment_by_syllables(self, text: str) -> List[str]:
        """Segment text into Tibetan syllables.

        Each syllable is a run of non-tsheg characters plus its trailing
        tsheg when present; other punctuation stays attached to its syllable.
        """
        syllables = re.findall(r'[^་]+་?', text)
        return [s.strip() for s in syllables if s.strip()]

    def multi_level_alignment(self, text1: str, text2: str) -> TibetanAlignmentResult:
        """
        Multi-level alignment inspired by Juxta/CollateX.

        Levels:
        1. Character level (for precise changes)
        2. Syllable level (Tibetan linguistic units)
        3. Sentence level (punctuation-based)
        4. Paragraph level (structural blocks)

        Returns:
            The combined result produced by combine_alignments.
        """
        char_alignment = self.character_level_alignment(text1, text2)
        syllable_alignment = self.syllable_level_alignment(text1, text2)
        sentence_alignment = self.sentence_level_alignment(text1, text2)
        structural_alignment = self.structural_level_alignment(text1, text2)

        return self.combine_alignments(
            char_alignment,
            syllable_alignment,
            sentence_alignment,
            structural_alignment
        )

    def _align_units(self, units1: Sequence[str], units2: Sequence[str],
                     separator: str, level: str) -> Dict:
        """Align two unit sequences with difflib and wrap opcodes as segments.

        Args:
            units1: Units of the first text (a str is a sequence of chars).
            units2: Units of the second text.
            separator: String used to re-join the units of one segment.
            level: Label stored under the 'level' key of the result.

        Returns:
            ``{'segments': [AlignmentSegment, ...], 'level': level}``.
        """
        matcher = difflib.SequenceMatcher(None, units1, units2)
        segments = []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            content1 = separator.join(units1[i1:i2])
            content2 = separator.join(units2[j1:j2])
            segments.append(AlignmentSegment(
                text1_content=content1,
                text2_content=content2,
                alignment_type=self.map_opcode_to_type(tag),
                confidence=self.calculate_confidence(content1, content2),
                position_text1=i1,
                position_text2=j1
            ))
        return {'segments': segments, 'level': level}

    def character_level_alignment(self, text1: str, text2: str) -> Dict:
        """Character-level precise alignment."""
        return self._align_units(text1, text2, '', 'character')

    def syllable_level_alignment(self, text1: str, text2: str) -> Dict:
        """Tibetan syllable-level alignment."""
        return self._align_units(
            self.segment_by_syllables(text1),
            self.segment_by_syllables(text2),
            ' ',
            'syllable'
        )

    def sentence_level_alignment(self, text1: str, text2: str) -> Dict:
        """Sentence-level alignment using Tibetan sentence punctuation."""
        # split_sentences, not tibetan_tokenize: the latter also splits on
        # the tsheg and would reduce this level to syllable granularity.
        return self._align_units(
            self.split_sentences(text1),
            self.split_sentences(text2),
            ' ',
            'sentence'
        )

    def structural_level_alignment(self, text1: str, text2: str) -> Dict:
        """Structural-level alignment for blank-line separated paragraphs."""
        return self._align_units(
            text1.split('\n\n'),
            text2.split('\n\n'),
            '\n\n',
            'structural'
        )

    def detect_transpositions(self, segments: List[AlignmentSegment]) -> List[Tuple[int, int]]:
        """Detect content transpositions (moves) between texts.

        A move is content removed from text 1 in one segment ('deletion' or
        the text-1 side of a 'mismatch') that reappears verbatim as content
        added in text 2 in a different segment ('insertion' or the text-2
        side of a 'mismatch'). Content shorter than ``min_segment_length``
        characters is ignored.

        Returns:
            ``(source_index, destination_index)`` pairs of segment indices.
            Each added segment is paired at most once.
        """
        added_at = defaultdict(list)
        for index, segment in enumerate(segments):
            if segment.alignment_type in ('insertion', 'mismatch'):
                content = segment.text2_content.strip()
                if len(content) >= self.min_segment_length:
                    added_at[content].append(index)

        transpositions = []
        for index, segment in enumerate(segments):
            if segment.alignment_type not in ('deletion', 'mismatch'):
                continue
            candidates = added_at.get(segment.text1_content.strip())
            if not candidates:
                continue
            for slot, target in enumerate(candidates):
                if target != index:
                    # Consume the destination so it is not paired twice.
                    transpositions.append((index, candidates.pop(slot)))
                    break
        return transpositions

    def map_opcode_to_type(self, opcode: str) -> str:
        """Map difflib opcode to alignment type ('unknown' if unmapped)."""
        mapping = {
            'equal': 'match',
            'delete': 'deletion',
            'insert': 'insertion',
            'replace': 'mismatch'
        }
        return mapping.get(opcode, 'unknown')

    def calculate_confidence(self, content1: str, content2: str) -> float:
        """Calculate alignment confidence score in [0.0, 1.0].

        Identical content (including two empty strings) scores 1.0, exactly
        one empty side scores 0.0, anything else scores
        ``1 - distance / len(longer)`` using the Levenshtein distance.
        """
        if content1 == content2:
            # Skips the quadratic distance computation for 'match' segments.
            return 1.0
        if not content1 or not content2:
            return 0.0

        distance = self.levenshtein_distance(content1, content2)
        max_len = max(len(content1), len(content2))
        return max(0.0, 1.0 - (distance / max_len))

    def levenshtein_distance(self, s1: str, s2: str) -> int:
        """Calculate Levenshtein distance between two strings."""
        if s1 == s2:
            return 0

        # A shared prefix/suffix never contributes to the distance, so trim
        # it to shrink the quadratic table for near-identical strings.
        start = 0
        limit = min(len(s1), len(s2))
        while start < limit and s1[start] == s2[start]:
            start += 1
        end1, end2 = len(s1), len(s2)
        while end1 > start and end2 > start and s1[end1 - 1] == s2[end2 - 1]:
            end1 -= 1
            end2 -= 1
        s1, s2 = s1[start:end1], s2[start:end2]

        # Keep the shorter string on the row axis to minimise memory.
        if len(s1) < len(s2):
            s1, s2 = s2, s1
        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def generate_scholarly_apparatus(self, alignment: TibetanAlignmentResult) -> Dict:
        """Generate scholarly apparatus for critical edition.

        Returns:
            A dict with 'sigla', 'critical_notes' and 'alignment_summary'.
            The summary's 'confidence_score' is the unweighted mean segment
            confidence, or 0 when there are no segments.
        """
        segments = alignment.segments
        if segments:
            confidence_score = sum(s.confidence for s in segments) / len(segments)
        else:
            confidence_score = 0

        return {
            'sigla': {
                'witness_a': 'Base text',
                'witness_b': 'Variant text'
            },
            'critical_notes': self.generate_critical_notes(alignment),
            'alignment_summary': {
                'total_segments': len(segments),
                'exact_matches': sum(
                    1 for s in segments if s.alignment_type == 'match'),
                'variants': sum(
                    1 for s in segments if s.alignment_type in _VARIANT_TYPES),
                'transpositions': len(alignment.transpositions),
                'confidence_score': confidence_score
            }
        }

    def generate_critical_notes(self, alignment: TibetanAlignmentResult) -> List[str]:
        """Generate one critical note per variant ('mismatch') segment."""
        return [
            f"Variant: '{segment.text1_content}' → '{segment.text2_content}'"
            for segment in alignment.segments
            if segment.alignment_type in _VARIANT_TYPES
        ]

    @staticmethod
    def _weighted_confidence(segments: List[AlignmentSegment]) -> float:
        """Return the mean segment confidence weighted by content length.

        Each segment weighs as many characters as its longer side. With no
        weighable content (no segments, or only empty ones) nothing differs,
        so the score is 1.0.
        """
        total_weight = 0
        weighted_sum = 0.0
        for segment in segments:
            weight = max(len(segment.text1_content), len(segment.text2_content))
            total_weight += weight
            weighted_sum += segment.confidence * weight
        if total_weight == 0:
            return 1.0
        return min(1.0, weighted_sum / total_weight)

    def combine_alignments(self, *alignments) -> TibetanAlignmentResult:
        """Combine multi-level alignments into final result.

        The sentence-level alignment supplies the segments; insertions,
        deletions, modifications and transpositions are derived from them.
        ``alignment_score`` is the length-weighted confidence of those
        segments and ``structural_similarity`` the same measure over the
        structural alignment (falling back to ``alignment_score`` when no
        structural alignment was passed).

        Args:
            *alignments: Dicts with 'segments' and 'level' keys, as returned
                by the ``*_level_alignment`` methods.

        Raises:
            ValueError: If no alignment has level 'sentence'.
        """
        by_level: Dict[str, Dict] = {}
        for alignment in alignments:
            # setdefault keeps the first alignment of each level.
            by_level.setdefault(alignment['level'], alignment)
        if 'sentence' not in by_level:
            raise ValueError("combine_alignments requires a 'sentence' level alignment")

        segments = by_level['sentence']['segments']

        insertions: List[Dict] = []
        deletions: List[Dict] = []
        modifications: List[Dict] = []
        for index, segment in enumerate(segments):
            if segment.alignment_type == 'insertion':
                insertions.append({
                    'segment_index': index,
                    'position': segment.position_text2,
                    'content': segment.text2_content
                })
            elif segment.alignment_type == 'deletion':
                deletions.append({
                    'segment_index': index,
                    'position': segment.position_text1,
                    'content': segment.text1_content
                })
            elif segment.alignment_type in _VARIANT_TYPES:
                modifications.append({
                    'segment_index': index,
                    'position_text1': segment.position_text1,
                    'position_text2': segment.position_text2,
                    'original': segment.text1_content,
                    'variant': segment.text2_content,
                    'confidence': segment.confidence
                })

        alignment_score = self._weighted_confidence(segments)
        structural = by_level.get('structural')
        if structural is not None:
            structural_similarity = self._weighted_confidence(structural['segments'])
        else:
            structural_similarity = alignment_score

        result = TibetanAlignmentResult(
            segments=segments,
            transpositions=self.detect_transpositions(segments),
            insertions=insertions,
            deletions=deletions,
            modifications=modifications,
            alignment_score=alignment_score,
            structural_similarity=structural_similarity,
            scholarly_apparatus={
                'method': 'Juxta/CollateX-inspired multi-level alignment',
                'levels': [alignment['level'] for alignment in alignments]
            }
        )
        # Adds 'sigla', 'critical_notes' and 'alignment_summary'.
        result.scholarly_apparatus.update(self.generate_scholarly_apparatus(result))
        logger.debug(
            "Combined %d alignment levels into %d segments (score=%.3f)",
            len(alignments), len(segments), alignment_score
        )
        return result


# Integration function for existing codebase
def enhanced_structural_analysis(text1: str, text2: str,
                                 file1_name: str = "Text 1",
                                 file2_name: str = "Text 2") -> dict:
    """
    Enhanced structural analysis using Juxta/CollateX-inspired algorithms.

    Args:
        text1: First text to analyze
        text2: Second text to analyze
        file1_name: Name for first text
        file2_name: Name for second text

    Returns:
        Dict with 'alignment_segments', 'transpositions',
        'scholarly_apparatus' (including the two names under 'witnesses'),
        'alignment_score' and 'structural_similarity'.
    """
    aligner = TibetanLegalAligner()
    result = aligner.multi_level_alignment(text1, text2)

    # Copy so the names are added without mutating the result object.
    apparatus = dict(result.scholarly_apparatus)
    apparatus['witnesses'] = {'witness_a': file1_name, 'witness_b': file2_name}

    return {
        'alignment_segments': [{
            'type': segment.alignment_type,
            'content1': segment.text1_content,
            'content2': segment.text2_content,
            'confidence': segment.confidence
        } for segment in result.segments],
        'transpositions': result.transpositions,
        'scholarly_apparatus': apparatus,
        'alignment_score': result.alignment_score,
        'structural_similarity': result.structural_similarity
    }