File size: 8,783 Bytes
5009cb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
"""Unit tests for TextContent value object."""

import pytest
from src.domain.models.text_content import TextContent


class TestTextContent:
    """Unit tests for the ``TextContent`` value object.

    The tests are grouped in the order of the constructor arguments
    (``text``, ``language``, ``encoding``), followed by the derived
    properties (``word_count``, ``character_count``, ``is_empty``), the
    ``truncate`` method, and immutability.

    Note: ``pytest.raises(..., match=...)`` treats the pattern as a regular
    expression searched in the exception message, so the expected messages
    below only need to appear somewhere in the raised message.
    """

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    def test_valid_text_content_creation(self):
        """Construct with all arguments given and check every public attribute."""
        text = TextContent(
            text="Hello, world!",
            language="en",
            encoding="utf-8"
        )

        assert text.text == "Hello, world!"
        assert text.language == "en"
        assert text.encoding == "utf-8"
        assert text.word_count == 2
        assert text.character_count == 13
        assert text.is_empty is False

    def test_text_content_with_default_encoding(self):
        """Omitting ``encoding`` must fall back to ``"utf-8"``."""
        text = TextContent(
            text="Hello, world!",
            language="en"
        )

        assert text.encoding == "utf-8"

    # ------------------------------------------------------------------
    # ``text`` validation
    # ------------------------------------------------------------------

    def test_non_string_text_raises_error(self):
        """A non-string ``text`` must raise ``TypeError``."""
        with pytest.raises(TypeError, match="Text must be a string"):
            TextContent(
                text=123,  # type: ignore
                language="en"
            )

    def test_empty_text_raises_error(self):
        """An empty ``text`` must raise ``ValueError``."""
        with pytest.raises(ValueError, match="Text content cannot be empty or whitespace only"):
            TextContent(
                text="",
                language="en"
            )

    def test_whitespace_only_text_raises_error(self):
        """A ``text`` made only of whitespace must raise ``ValueError``."""
        with pytest.raises(ValueError, match="Text content cannot be empty or whitespace only"):
            TextContent(
                text="   \n\t  ",
                language="en"
            )

    def test_text_too_long_raises_error(self):
        """A ``text`` of 50,001 characters (one over the limit) must raise ``ValueError``."""
        long_text = "a" * 50001
        with pytest.raises(ValueError, match="Text content too long"):
            TextContent(
                text=long_text,
                language="en"
            )

    def test_text_at_max_length(self):
        """A ``text`` of exactly 50,000 characters must be accepted unchanged in length."""
        max_text = "a" * 50000
        text = TextContent(
            text=max_text,
            language="en"
        )
        assert len(text.text) == 50000

    # ------------------------------------------------------------------
    # ``language`` validation
    # ------------------------------------------------------------------

    def test_non_string_language_raises_error(self):
        """A non-string ``language`` must raise ``TypeError``."""
        with pytest.raises(TypeError, match="Language must be a string"):
            TextContent(
                text="Hello",
                language=123  # type: ignore
            )

    def test_empty_language_raises_error(self):
        """An empty ``language`` must raise ``ValueError``."""
        with pytest.raises(ValueError, match="Language cannot be empty"):
            TextContent(
                text="Hello",
                language=""
            )

    def test_whitespace_language_raises_error(self):
        """A whitespace-only ``language`` must raise ``ValueError``."""
        with pytest.raises(ValueError, match="Language cannot be empty"):
            TextContent(
                text="Hello",
                language="   "
            )

    def test_invalid_language_code_format_raises_error(self):
        """Malformed language codes must raise ``ValueError``.

        The rejected samples cover: too short, upper-case primary tag,
        lower-case region, three-letter region, digits, and an underscore
        separator.
        """
        invalid_codes = ["e", "ENG", "en-us", "en-USA", "123", "en_US"]

        for code in invalid_codes:
            with pytest.raises(ValueError, match="Invalid language code format"):
                TextContent(
                    text="Hello",
                    language=code
                )

    def test_valid_language_codes(self):
        """Two-letter codes, with or without an upper-case region, are stored as given."""
        valid_codes = ["en", "fr", "de", "es", "zh", "ja", "en-US", "fr-FR", "zh-CN"]

        for code in valid_codes:
            text = TextContent(
                text="Hello",
                language=code
            )
            assert text.language == code

    # ------------------------------------------------------------------
    # ``encoding`` validation
    # ------------------------------------------------------------------

    def test_non_string_encoding_raises_error(self):
        """A non-string ``encoding`` must raise ``TypeError``."""
        with pytest.raises(TypeError, match="Encoding must be a string"):
            TextContent(
                text="Hello",
                language="en",
                encoding=123  # type: ignore
            )

    def test_unsupported_encoding_raises_error(self):
        """An unknown ``encoding`` must raise ``ValueError`` naming the encoding."""
        with pytest.raises(ValueError, match="Unsupported encoding: xyz"):
            TextContent(
                text="Hello",
                language="en",
                encoding="xyz"
            )

    def test_supported_encodings(self):
        """Each encoding listed here must be accepted and stored as given."""
        supported_encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1']

        for encoding in supported_encodings:
            text = TextContent(
                text="Hello",
                language="en",
                encoding=encoding
            )
            assert text.encoding == encoding

    def test_text_encoding_compatibility(self):
        """The text must be representable in the requested encoding."""
        # Pure-ASCII text declared as ASCII should be accepted.
        text = TextContent(
            text="Hello",
            language="en",
            encoding="ascii"
        )
        assert text.encoding == "ascii"

        # Text with a non-ASCII character declared as ASCII should be rejected.
        with pytest.raises(ValueError, match="Text cannot be encoded with ascii encoding"):
            TextContent(
                text="Héllo",  # Contains non-ASCII character
                language="en",
                encoding="ascii"
            )

    # ------------------------------------------------------------------
    # Derived properties
    # ------------------------------------------------------------------

    def test_word_count_property(self):
        """``word_count`` counts whitespace-separated words."""
        # An empty string is intentionally absent: it cannot be constructed
        # at all (see test_empty_text_raises_error), so its word count is
        # not observable through the public API.
        test_cases = [
            ("Hello world", 2),
            ("Hello", 1),
            ("Hello world test", 3),
            ("Hello,  world!  Test.", 3),  # Repeated spaces and punctuation
        ]

        for text_str, expected_count in test_cases:
            text = TextContent(text=text_str, language="en")
            assert text.word_count == expected_count

    def test_character_count_property(self):
        """``character_count`` equals ``len()`` of the original text."""
        text_str = "Hello, world!"
        text = TextContent(text=text_str, language="en")
        assert text.character_count == len(text_str)

    def test_is_empty_property(self):
        """``is_empty`` is ``False`` for any text that passes validation here."""
        # Plain non-empty text.
        text = TextContent(text="Hello", language="en")
        assert text.is_empty is False

        # Surrounding whitespace does not make otherwise non-blank text empty.
        text2 = TextContent(text="  Hello  ", language="en")
        assert text2.is_empty is False

    # ------------------------------------------------------------------
    # ``truncate``
    # ------------------------------------------------------------------

    def test_truncate_method(self):
        """``truncate`` returns a new ``TextContent`` no longer than ``max_length``."""
        text = TextContent(text="Hello, world! This is a test.", language="en")

        # Shorter limit: result is cut and keeps language/encoding.
        truncated = text.truncate(10)
        assert len(truncated.text) <= 10
        assert truncated.language == text.language
        assert truncated.encoding == text.encoding
        assert isinstance(truncated, TextContent)

        # Limit beyond the text length: the text is left unchanged.
        not_truncated = text.truncate(100)
        assert not_truncated.text == text.text

    def test_truncate_with_invalid_length(self):
        """Zero and negative ``max_length`` values must raise ``ValueError``."""
        text = TextContent(text="Hello", language="en")

        with pytest.raises(ValueError, match="Max length must be positive"):
            text.truncate(0)

        with pytest.raises(ValueError, match="Max length must be positive"):
            text.truncate(-1)

    # ------------------------------------------------------------------
    # Immutability
    # ------------------------------------------------------------------

    def test_text_content_is_immutable(self):
        """Assigning to an attribute must raise ``AttributeError``.

        NOTE(review): presumably ``TextContent`` is a frozen dataclass, whose
        ``FrozenInstanceError`` is a subclass of ``AttributeError`` -- confirm
        against the model definition.
        """
        text = TextContent(text="Hello", language="en")

        with pytest.raises(AttributeError):
            text.text = "Goodbye"  # type: ignore

    def test_truncate_preserves_word_boundaries(self):
        """``truncate`` must not leave trailing whitespace at the cut point."""
        text = TextContent(text="Hello world test", language="en")

        # The first 12 characters are "Hello world " (cut lands just after the
        # space following "world"); the trailing space is expected to be
        # stripped, leaving "Hello world".
        truncated = text.truncate(12)
        assert not truncated.text.endswith(" ")
        assert truncated.text == "Hello world"