"""TextContent value object for representing text data with language and encoding validation.""" from dataclasses import dataclass from typing import Optional import re @dataclass(frozen=True) class TextContent: """Value object representing text content with language and encoding information.""" text: str language: str encoding: str = 'utf-8' def __post_init__(self): """Validate text content after initialization.""" self._validate() def _validate(self): """Validate text content properties.""" if not isinstance(self.text, str): raise TypeError("Text must be a string") if not self.text.strip(): raise ValueError("Text content cannot be empty or whitespace only") if len(self.text) > 50000: # Reasonable limit for TTS processing raise ValueError("Text content too long (maximum 50,000 characters)") if not isinstance(self.language, str): raise TypeError("Language must be a string") if not self.language.strip(): raise ValueError("Language cannot be empty") # Validate language code format (ISO 639-1 or ISO 639-3) if not re.match(r'^[a-z]{2,3}(-[A-Z]{2})?$', self.language): raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.") if not isinstance(self.encoding, str): raise TypeError("Encoding must be a string") if self.encoding not in ['utf-8', 'utf-16', 'ascii', 'latin-1']: raise ValueError(f"Unsupported encoding: {self.encoding}. Supported: utf-8, utf-16, ascii, latin-1") # Validate that text can be encoded with specified encoding try: self.text.encode(self.encoding) except UnicodeEncodeError: raise ValueError(f"Text cannot be encoded with {self.encoding} encoding") @property def word_count(self) -> int: """Get the approximate word count of the text.""" return len(self.text.split()) @property def character_count(self) -> int: """Get the character count of the text.""" return len(self.text) @property def is_empty(self) -> bool: """Check if the text content is effectively empty.""" return not self.text.strip() def truncate(self, max_length: int) -> 'TextContent': """Create a new TextContent with truncated text.""" if max_length <= 0: raise ValueError("Max length must be positive") if len(self.text) <= max_length: return self truncated_text = self.text[:max_length].rstrip() return TextContent( text=truncated_text, language=self.language, encoding=self.encoding )