File size: 2,808 Bytes
5009cb8
 
 
 
 
 
 
 
 
 
0f99c8d
5009cb8
 
 
0f99c8d
5009cb8
 
 
0f99c8d
5009cb8
 
 
 
0f99c8d
5009cb8
 
0f99c8d
5009cb8
 
0f99c8d
5009cb8
 
0f99c8d
5009cb8
 
0f99c8d
5009cb8
 
 
0f99c8d
5009cb8
 
0f99c8d
5009cb8
 
0f99c8d
5009cb8
 
 
 
 
0f99c8d
5009cb8
 
 
 
0f99c8d
5009cb8
 
 
 
0f99c8d
5009cb8
 
 
 
0f99c8d
5009cb8
 
 
 
0f99c8d
5009cb8
 
0f99c8d
5009cb8
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""TextContent value object for representing text data with language and encoding validation."""

from dataclasses import dataclass
from typing import Optional
import re


@dataclass(frozen=True)
class TextContent:
    """Immutable value object holding text with its language and encoding.

    Every field is validated right after construction, so an instance that
    was built through the normal constructor always holds non-blank text of
    at most 50,000 characters, a well-formed language code and a supported
    encoding that can represent the text.

    Attributes:
        text: The text content; must be a ``str`` that is not blank.
        language: Language code such as ``'en'``, ``'eng'`` or ``'en-US'``.
        encoding: One of ``'utf-8'``, ``'utf-16'``, ``'ascii'``, ``'latin-1'``.

    Raises:
        TypeError: If ``text``, ``language`` or ``encoding`` is not a ``str``.
        ValueError: If any field fails the checks described in ``_validate``.
    """

    text: str
    language: str
    encoding: str = 'utf-8'

    def __post_init__(self):
        """Validate text content after initialization."""
        self._validate()

    def _validate(self):
        """Check every field and raise on the first violation found.

        Raises:
            TypeError: If a field is not a ``str``.
            ValueError: If the text is blank or longer than 50,000
                characters, the language is blank or malformed, the encoding
                is unsupported, or the text cannot be encoded with it.
        """
        if not isinstance(self.text, str):
            raise TypeError("Text must be a string")

        if not self.text.strip():
            raise ValueError("Text content cannot be empty or whitespace only")

        if len(self.text) > 50000:  # Reasonable limit for TTS processing
            raise ValueError("Text content too long (maximum 50,000 characters)")

        if not isinstance(self.language, str):
            raise TypeError("Language must be a string")

        if not self.language.strip():
            raise ValueError("Language cannot be empty")

        # Two or three lowercase letters, optionally followed by "-" and a
        # two-letter uppercase region. fullmatch is used instead of
        # match(r'^...$') because "$" also matches just before a trailing
        # newline, which would let a value like "en\n" slip through.
        if not re.fullmatch(r'[a-z]{2,3}(-[A-Z]{2})?', self.language):
            raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.")

        if not isinstance(self.encoding, str):
            raise TypeError("Encoding must be a string")

        if self.encoding not in ('utf-8', 'utf-16', 'ascii', 'latin-1'):
            raise ValueError(f"Unsupported encoding: {self.encoding}. Supported: utf-8, utf-16, ascii, latin-1")

        # Validate that text can be encoded with specified encoding
        try:
            self.text.encode(self.encoding)
        except UnicodeEncodeError as err:
            # Chain explicitly so the offending character/position is kept
            # as the cause of the ValueError.
            raise ValueError(f"Text cannot be encoded with {self.encoding} encoding") from err

    @property
    def word_count(self) -> int:
        """Get the approximate word count (whitespace-separated tokens)."""
        return len(self.text.split())

    @property
    def character_count(self) -> int:
        """Get the character count of the text."""
        return len(self.text)

    @property
    def is_empty(self) -> bool:
        """Check if the text content is effectively empty.

        Validation rejects blank text, so this is ``False`` for any instance
        created through the normal constructor.
        """
        return not self.text.strip()

    def truncate(self, max_length: int) -> 'TextContent':
        """Create a new TextContent with truncated text.

        Args:
            max_length: Maximum number of characters to keep; must be positive.

        Returns:
            This same instance when the text already fits, otherwise a new
            ``TextContent`` holding the first ``max_length`` characters with
            trailing whitespace removed, and the same language and encoding.

        Raises:
            ValueError: If ``max_length`` is not positive, or if the truncated
                prefix is whitespace only (the new instance fails validation).
        """
        if max_length <= 0:
            raise ValueError("Max length must be positive")

        if len(self.text) <= max_length:
            return self

        # Strip trailing whitespace left behind by cutting mid-text.
        truncated_text = self.text[:max_length].rstrip()
        return TextContent(
            text=truncated_text,
            language=self.language,
            encoding=self.encoding
        )