Spaces:
Build error
Build error
File size: 2,808 Bytes
5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 0f99c8d 5009cb8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
"""TextContent value object for representing text data with language and encoding validation."""
from dataclasses import dataclass
from typing import Optional
import re
@dataclass(frozen=True)
class TextContent:
    """Immutable value object holding text with its language and encoding.

    Every field is validated in ``__post_init__``, so an instance built
    through the normal constructor always satisfies the rules below.

    Attributes:
        text: Non-blank text of at most 50,000 characters.
        language: Language code of two or three lowercase letters,
            optionally followed by ``-`` and a two-letter uppercase region
            (for example ``'en'`` or ``'en-US'``).
        encoding: One of ``'utf-8'``, ``'utf-16'``, ``'ascii'`` or
            ``'latin-1'``; the text must be encodable with it.

    Raises:
        TypeError: If any field is not a ``str``.
        ValueError: If any field violates the rules described above.
    """

    text: str
    language: str
    encoding: str = 'utf-8'

    # The three constants below are deliberately un-annotated so that the
    # dataclass machinery does not turn them into instance fields.
    _MAX_TEXT_LENGTH = 50000  # Reasonable limit for TTS processing
    _SUPPORTED_ENCODINGS = ('utf-8', 'utf-16', 'ascii', 'latin-1')
    # Language code format (ISO 639-1 or ISO 639-3) with optional region.
    _LANGUAGE_CODE_RE = re.compile(r'[a-z]{2,3}(-[A-Z]{2})?')

    def __post_init__(self):
        """Validate text content after initialization."""
        self._validate()

    def _validate(self):
        """Check every field and raise on the first violation found.

        Raises:
            TypeError: If ``text``, ``language`` or ``encoding`` is not a
                string.
            ValueError: If the text is blank or too long, the language code
                is blank or malformed, the encoding is unsupported, or the
                text cannot be encoded with the chosen encoding.
        """
        if not isinstance(self.text, str):
            raise TypeError("Text must be a string")
        if not self.text.strip():
            raise ValueError("Text content cannot be empty or whitespace only")
        if len(self.text) > self._MAX_TEXT_LENGTH:
            raise ValueError("Text content too long (maximum 50,000 characters)")

        if not isinstance(self.language, str):
            raise TypeError("Language must be a string")
        if not self.language.strip():
            raise ValueError("Language cannot be empty")
        # fullmatch() rather than match() with a '$' anchor: '$' also matches
        # just before a trailing newline, which would let 'en\n' through.
        if not self._LANGUAGE_CODE_RE.fullmatch(self.language):
            raise ValueError(
                f"Invalid language code format: {self.language}. "
                "Expected format: 'en', 'en-US', etc."
            )

        if not isinstance(self.encoding, str):
            raise TypeError("Encoding must be a string")
        if self.encoding not in self._SUPPORTED_ENCODINGS:
            raise ValueError(
                f"Unsupported encoding: {self.encoding}. "
                "Supported: utf-8, utf-16, ascii, latin-1"
            )

        # Validate that text can be encoded with specified encoding
        try:
            self.text.encode(self.encoding)
        except UnicodeEncodeError as err:
            # Chain the original error so the offending character and
            # position remain visible in the traceback.
            raise ValueError(
                f"Text cannot be encoded with {self.encoding} encoding"
            ) from err

    @property
    def word_count(self) -> int:
        """Return the approximate word count (whitespace-separated tokens)."""
        return len(self.text.split())

    @property
    def character_count(self) -> int:
        """Return the number of characters in the text."""
        return len(self.text)

    @property
    def is_empty(self) -> bool:
        """Return True if the text is empty or whitespace only.

        Validation rejects blank text at construction, so this is False for
        any instance created through the normal constructor.
        """
        return not self.text.strip()

    def truncate(self, max_length: int) -> 'TextContent':
        """Return a TextContent whose text is at most ``max_length`` long.

        Args:
            max_length: Maximum number of characters to keep; must be
                positive.

        Returns:
            ``self`` when the text already fits, otherwise a new instance
            holding the first ``max_length`` characters with trailing
            whitespace removed, and the same language and encoding.

        Raises:
            ValueError: If ``max_length`` is not positive, or if the kept
                prefix is whitespace only (the new instance then fails
                validation).
        """
        if max_length <= 0:
            raise ValueError("Max length must be positive")
        if len(self.text) <= max_length:
            return self

        # Strip trailing whitespace so a cut that lands just after a space
        # does not leave a dangling blank at the end.
        truncated_text = self.text[:max_length].rstrip()
        return TextContent(
            text=truncated_text,
            language=self.language,
            encoding=self.encoding,
        )