Spaces:
Sleeping
Sleeping
File size: 12,962 Bytes
edd4b9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 |
"""
Advanced Tibetan Legal Manuscript Alignment Engine
Juxta/CollateX-inspired alignment with Tibetan-specific enhancements
"""
import difflib
import re
from typing import Dict, List, Tuple
from dataclasses import dataclass
from collections import defaultdict
import logging
logger = logging.getLogger(__name__)
@dataclass
class AlignmentSegment:
    """Represents an aligned segment between texts.

    One instance corresponds to a single difflib opcode span produced at
    some alignment level (character, syllable, sentence, or structural).
    """
    # Content covered by this segment in the first witness.
    text1_content: str
    # Content covered by this segment in the second witness.
    text2_content: str
    alignment_type: str  # 'match', 'gap', 'mismatch', 'transposition' (difflib levels also emit 'insertion'/'deletion')
    # Similarity score in [0.0, 1.0]; 1.0 means identical content.
    confidence: float
    # Start offset of the segment in text 1 (units depend on level: chars or tokens).
    position_text1: int
    # Start offset of the segment in text 2 (same units as position_text1).
    position_text2: int
    # Optional surrounding context; empty unless a caller fills it in.
    context: str = ""
@dataclass
class TibetanAlignmentResult:
    """Complete alignment result for Tibetan manuscripts."""
    # Primary aligned segments (sentence-level in the current pipeline).
    segments: List[AlignmentSegment]
    # Pairs of segment indices whose identical content appears in both places.
    transpositions: List[Tuple[int, int]]
    # Material present only in text 2.
    insertions: List[Dict]
    # Material present only in text 1.
    deletions: List[Dict]
    # Material changed between the two texts.
    modifications: List[Dict]
    # Overall alignment quality score in [0.0, 1.0].
    alignment_score: float
    # Paragraph/block-level similarity in [0.0, 1.0].
    structural_similarity: float
    # Critical-edition metadata (sigla, notes, summary statistics).
    scholarly_apparatus: Dict
class TibetanLegalAligner:
"""
Juxta/CollateX-inspired alignment engine for Tibetan legal manuscripts.
Features:
- Multi-level alignment (character → word → sentence → paragraph)
- Transposition detection (content moves)
- Tibetan-specific punctuation handling
- Scholarly apparatus generation
- Confidence scoring
"""
def __init__(self, min_segment_length: int = 3, context_window: int = 15):
self.min_segment_length = min_segment_length
self.context_window = context_window
self.tibetan_punctuation = r'[།༎༏༐༑༔་]'
def tibetan_tokenize(self, text: str) -> List[str]:
"""Tibetan-specific tokenization respecting syllable boundaries."""
# Split on Tibetan punctuation and spaces
tokens = re.split(rf'{self.tibetan_punctuation}|\s+', text)
return [token.strip() for token in tokens if token.strip()]
def segment_by_syllables(self, text: str) -> List[str]:
"""Segment text into Tibetan syllables."""
# Tibetan syllables typically end with ་ or punctuation
syllables = re.findall(r'[^་]+་?', text)
return [s.strip() for s in syllables if s.strip()]
def multi_level_alignment(self, text1: str, text2: str) -> TibetanAlignmentResult:
"""
Multi-level alignment inspired by Juxta/CollateX.
Levels:
1. Character level (for precise changes)
2. Syllable level (Tibetan linguistic units)
3. Sentence level (punctuation-based)
4. Paragraph level (structural blocks)
"""
# Level 1: Character-level alignment
char_alignment = self.character_level_alignment(text1, text2)
# Level 2: Syllable-level alignment
syllable_alignment = self.syllable_level_alignment(text1, text2)
# Level 3: Sentence-level alignment
sentence_alignment = self.sentence_level_alignment(text1, text2)
# Level 4: Structural alignment
structural_alignment = self.structural_level_alignment(text1, text2)
# Combine results with confidence scoring
return self.combine_alignments(
char_alignment, syllable_alignment,
sentence_alignment, structural_alignment
)
def character_level_alignment(self, text1: str, text2: str) -> Dict:
"""Character-level precise alignment."""
matcher = difflib.SequenceMatcher(None, text1, text2)
segments = []
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
segment = AlignmentSegment(
text1_content=text1[i1:i2],
text2_content=text2[j1:j2],
alignment_type=self.map_opcode_to_type(tag),
confidence=self.calculate_confidence(text1[i1:i2], text2[j1:j2]),
position_text1=i1,
position_text2=j1
)
segments.append(segment)
return {'segments': segments, 'level': 'character'}
def syllable_level_alignment(self, text1: str, text2: str) -> Dict:
"""Tibetan syllable-level alignment."""
syllables1 = self.segment_by_syllables(text1)
syllables2 = self.segment_by_syllables(text2)
matcher = difflib.SequenceMatcher(None, syllables1, syllables2)
segments = []
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
content1 = ' '.join(syllables1[i1:i2])
content2 = ' '.join(syllables2[j1:j2])
segment = AlignmentSegment(
text1_content=content1,
text2_content=content2,
alignment_type=self.map_opcode_to_type(tag),
confidence=self.calculate_confidence(content1, content2),
position_text1=i1,
position_text2=j1
)
segments.append(segment)
return {'segments': segments, 'level': 'syllable'}
def sentence_level_alignment(self, text1: str, text2: str) -> Dict:
"""Sentence-level alignment using Tibetan punctuation."""
sentences1 = self.tibetan_tokenize(text1)
sentences2 = self.tibetan_tokenize(text2)
matcher = difflib.SequenceMatcher(None, sentences1, sentences2)
segments = []
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
content1 = ' '.join(sentences1[i1:i2])
content2 = ' '.join(sentences2[j1:j2])
segment = AlignmentSegment(
text1_content=content1,
text2_content=content2,
alignment_type=self.map_opcode_to_type(tag),
confidence=self.calculate_confidence(content1, content2),
position_text1=i1,
position_text2=j1
)
segments.append(segment)
return {'segments': segments, 'level': 'sentence'}
def structural_level_alignment(self, text1: str, text2: str) -> Dict:
"""Structural-level alignment for larger text blocks."""
# Paragraph-level segmentation
paragraphs1 = text1.split('\n\n')
paragraphs2 = text2.split('\n\n')
matcher = difflib.SequenceMatcher(None, paragraphs1, paragraphs2)
segments = []
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
content1 = '\n\n'.join(paragraphs1[i1:i2])
content2 = '\n\n'.join(paragraphs2[j1:j2])
segment = AlignmentSegment(
text1_content=content1,
text2_content=content2,
alignment_type=self.map_opcode_to_type(tag),
confidence=self.calculate_confidence(content1, content2),
position_text1=i1,
position_text2=j1
)
segments.append(segment)
return {'segments': segments, 'level': 'structural'}
def detect_transpositions(self, segments: List[AlignmentSegment]) -> List[Tuple[int, int]]:
"""Detect content transpositions (moves) between texts."""
transpositions = []
# Look for identical content appearing in different positions
content_map = defaultdict(list)
for i, segment in enumerate(segments):
if segment.alignment_type == 'match':
content_map[segment.text1_content].append(i)
# Detect moves where same content appears at different positions
for content, positions in content_map.items():
if len(positions) > 1:
# Potential transposition detected
transpositions.extend([(positions[i], positions[j])
for i in range(len(positions))
for j in range(i+1, len(positions))])
return transpositions
def map_opcode_to_type(self, opcode: str) -> str:
"""Map difflib opcode to alignment type."""
mapping = {
'equal': 'match',
'delete': 'deletion',
'insert': 'insertion',
'replace': 'mismatch'
}
return mapping.get(opcode, 'unknown')
def calculate_confidence(self, content1: str, content2: str) -> float:
"""Calculate alignment confidence score."""
if not content1 and not content2:
return 1.0
if not content1 or not content2:
return 0.0
# Use Levenshtein distance for confidence
distance = self.levenshtein_distance(content1, content2)
max_len = max(len(content1), len(content2))
return max(0.0, 1.0 - (distance / max_len)) if max_len > 0 else 1.0
def levenshtein_distance(self, s1: str, s2: str) -> int:
"""Calculate Levenshtein distance between two strings."""
if len(s1) < len(s2):
return self.levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = list(range(len(s2) + 1))
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def generate_scholarly_apparatus(self, alignment: TibetanAlignmentResult) -> Dict:
"""Generate scholarly apparatus for critical edition."""
return {
'sigla': {
'witness_a': 'Base text',
'witness_b': 'Variant text'
},
'critical_notes': self.generate_critical_notes(alignment),
'alignment_summary': {
'total_segments': len(alignment.segments),
'exact_matches': len([s for s in alignment.segments if s.alignment_type == 'match']),
'variants': len([s for s in alignment.segments if s.alignment_type in ['mismatch', 'modification']]),
'transpositions': len(alignment.transpositions),
'confidence_score': sum(s.confidence for s in alignment.segments) / len(alignment.segments) if alignment.segments else 0
}
}
def generate_critical_notes(self, alignment: TibetanAlignmentResult) -> List[str]:
"""Generate critical notes in scholarly format."""
notes = []
for segment in alignment.segments:
if segment.alignment_type in ['mismatch', 'modification']:
note = f"Variant: '{segment.text1_content}' → '{segment.text2_content}'"
notes.append(note)
return notes
def combine_alignments(self, *alignments) -> TibetanAlignmentResult:
"""Combine multi-level alignments into final result."""
# This would implement sophisticated combination logic
# For now, return the highest confidence level
# Use sentence-level as primary
sentence_alignment = next(a for a in alignments if a['level'] == 'sentence')
return TibetanAlignmentResult(
segments=sentence_alignment['segments'],
transpositions=[],
insertions=[],
deletions=[],
modifications=[],
alignment_score=0.85, # Placeholder
structural_similarity=0.75, # Placeholder
scholarly_apparatus={
'method': 'Juxta/CollateX-inspired multi-level alignment',
'levels': ['character', 'syllable', 'sentence', 'structural']
}
)
# Integration function for existing codebase
def enhanced_structural_analysis(text1: str, text2: str,
                                 file1_name: str = "Text 1",
                                 file2_name: str = "Text 2") -> dict:
    """
    Enhanced structural analysis using Juxta/CollateX-inspired algorithms.

    Args:
        text1: First text to analyze
        text2: Second text to analyze
        file1_name: Name for first text (currently unused by the aligner)
        file2_name: Name for second text (currently unused by the aligner)

    Returns:
        Comprehensive alignment analysis as a plain dict suitable for
        JSON serialization.
    """
    alignment = TibetanLegalAligner().multi_level_alignment(text1, text2)

    segment_rows = []
    for seg in alignment.segments:
        segment_rows.append({
            'type': seg.alignment_type,
            'content1': seg.text1_content,
            'content2': seg.text2_content,
            'confidence': seg.confidence,
        })

    return {
        'alignment_segments': segment_rows,
        'transpositions': alignment.transpositions,
        'scholarly_apparatus': alignment.scholarly_apparatus,
        'alignment_score': alignment.alignment_score,
        'structural_similarity': alignment.structural_similarity,
    }
|