"""TextContent value object for representing text data with language and encoding validation."""
import re
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class TextContent:
    """Value object representing text content with language and encoding information.

    Instances are validated on construction and are immutable
    (``frozen=True``), so a ``TextContent`` that exists is always valid and
    can safely be shared, compared by value, and hashed.

    Attributes:
        text: The text itself. Must be a non-blank ``str`` of at most
            50,000 characters that is encodable with ``encoding``.
        language: Language code such as ``'en'`` or ``'en-US'``: two or three
            lowercase letters, optionally followed by ``-`` and two
            uppercase letters.
        encoding: One of ``'utf-8'``, ``'utf-16'``, ``'ascii'``,
            ``'latin-1'``. Defaults to ``'utf-8'``.

    Raises:
        TypeError: If ``text``, ``language`` or ``encoding`` is not a ``str``.
        ValueError: If any of the constraints above is violated.
    """

    text: str
    language: str
    encoding: str = 'utf-8'

    # Deliberately un-annotated so that @dataclass does not turn these
    # class-level constants into instance fields.
    _MAX_TEXT_LENGTH = 50000  # Reasonable limit for TTS processing
    _SUPPORTED_ENCODINGS = ('utf-8', 'utf-16', 'ascii', 'latin-1')
    _LANGUAGE_PATTERN = re.compile(r'[a-z]{2,3}(-[A-Z]{2})?')

    def __post_init__(self):
        """Validate text content after initialization."""
        self._validate()

    def _validate(self):
        """Validate text content properties.

        Raises:
            TypeError: If a field is not a ``str``.
            ValueError: If the text is blank, too long, or not encodable
                with ``encoding``; if the language code is blank or
                malformed; or if the encoding is unsupported.
        """
        if not isinstance(self.text, str):
            raise TypeError("Text must be a string")
        if not self.text.strip():
            raise ValueError("Text content cannot be empty or whitespace only")
        if len(self.text) > self._MAX_TEXT_LENGTH:
            raise ValueError("Text content too long (maximum 50,000 characters)")

        if not isinstance(self.language, str):
            raise TypeError("Language must be a string")
        if not self.language.strip():
            raise ValueError("Language cannot be empty")
        # fullmatch (rather than match with '^...$') so that a trailing
        # newline such as 'en\n' is rejected: '$' also matches just before
        # a final '\n'.
        if not self._LANGUAGE_PATTERN.fullmatch(self.language):
            raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.")

        if not isinstance(self.encoding, str):
            raise TypeError("Encoding must be a string")
        if self.encoding not in self._SUPPORTED_ENCODINGS:
            raise ValueError(f"Unsupported encoding: {self.encoding}. Supported: utf-8, utf-16, ascii, latin-1")

        # Validate that text can be encoded with specified encoding
        try:
            self.text.encode(self.encoding)
        except UnicodeEncodeError as err:
            # Chain the original error so the offending character/position
            # stays visible in the traceback.
            raise ValueError(f"Text cannot be encoded with {self.encoding} encoding") from err

    def word_count(self) -> int:
        """Get the approximate word count of the text.

        Words are runs of non-whitespace characters as produced by
        ``str.split()``.
        """
        return len(self.text.split())

    def character_count(self) -> int:
        """Get the character count of the text (``len(text)``)."""
        return len(self.text)

    def is_empty(self) -> bool:
        """Check if the text content is effectively empty.

        Returns ``True`` when the text is empty or whitespace only. For an
        instance that passed validation this is always ``False``.
        """
        return not self.text.strip()

    def truncate(self, max_length: int) -> 'TextContent':
        """Create a new TextContent with truncated text.

        Args:
            max_length: Maximum number of characters to keep; must be
                positive.

        Returns:
            ``self`` when the text already fits within ``max_length``;
            otherwise a new ``TextContent`` holding the first ``max_length``
            characters with trailing whitespace stripped, and the same
            language and encoding.

        Raises:
            ValueError: If ``max_length`` is not positive, or if the
                truncated text would be empty (the kept prefix consists
                only of whitespace), since the result is validated like any
                other instance.
        """
        if max_length <= 0:
            raise ValueError("Max length must be positive")
        if len(self.text) <= max_length:
            return self

        truncated_text = self.text[:max_length].rstrip()
        return TextContent(
            text=truncated_text,
            language=self.language,
            encoding=self.encoding,
        )