"""
Unit tests for StatsService
"""
import pytest
from unittest.mock import Mock

from app.services.stats_service import StatsService
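
# NOTE: the tests below rely on `sample_tokens`, `sample_text`, and
# `mock_tokenizer` fixtures that are not defined in this module; in the full
# project they are presumably provided by a shared conftest.py. The fixtures
# below are only a minimal, assumed sketch so the module reads (and runs) on
# its own; the real fixtures may differ.


@pytest.fixture
def sample_tokens():
    """Assumed sample token list corresponding to sample_text."""
    return ['Hello', ',', ' world', '!']


@pytest.fixture
def sample_text():
    """Assumed sample text that sample_tokens was derived from."""
    return "Hello, world!"


@pytest.fixture
def mock_tokenizer():
    """Assumed stand-in for a Hugging Face tokenizer interface."""
    return Mock()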


class TestStatsService:
    """Test cases for StatsService."""

    def setup_method(self):
        """Set up test fixtures."""
        self.service = StatsService()

    def test_get_varied_color_basic(self):
        """Test basic color generation."""
        color = self.service.get_varied_color(0, 10)

        assert isinstance(color, dict)
        assert 'background' in color
        assert 'text' in color
        assert color['background'].startswith('#')
        assert color['text'].startswith('#')
        assert len(color['background']) == 7  # #RRGGBB format
        assert len(color['text']) == 7

    def test_get_varied_color_different_indices(self):
        """Test that different indices produce different colors."""
        color1 = self.service.get_varied_color(0, 10)
        color2 = self.service.get_varied_color(1, 10)
        color3 = self.service.get_varied_color(5, 10)

        # Colors should be different
        assert color1['background'] != color2['background']
        assert color2['background'] != color3['background']
        assert color1['background'] != color3['background']

    def test_get_varied_color_edge_cases(self):
        """Test color generation with edge cases."""
        # Single token
        color = self.service.get_varied_color(0, 1)
        assert isinstance(color, dict)

        # Large number of tokens
        color = self.service.get_varied_color(999, 1000)
        assert isinstance(color, dict)

        # Zero index
        color = self.service.get_varied_color(0, 5)
        assert isinstance(color, dict)

    def test_fix_token_basic(self):
        """Test basic token fixing."""
        assert self.service.fix_token("hello") == "hello"
        assert self.service.fix_token("world") == "world"

    def test_fix_token_special_characters(self):
        """Test token fixing with special characters."""
        # Test whitespace replacement with visible symbols
        assert self.service.fix_token(" ") == "␣"
        assert self.service.fix_token("\t") == "→"
        assert self.service.fix_token("\n") == "↵"

        # Test Ġ prefix (used by GPT-2-style BPE tokenizers to mark a leading space)
        assert self.service.fix_token("Ġhello") == " hello"
        assert self.service.fix_token("Ġworld") == " world"

        # Test a lone Ġ with no following text
        assert self.service.fix_token("Ġ") == " "

    def test_fix_token_edge_cases(self):
        """Test token fixing edge cases."""
        # Empty string
        assert self.service.fix_token("") == ""

        # None (shouldn't happen, but test defensive programming)
        result = self.service.fix_token(None)
        assert result is None or result == ""

        # Multiple special characters
        assert self.service.fix_token("\n\t ") == "↵→␣"

        # Multiple Ġ prefixes (edge case)
        assert self.service.fix_token("ĠĠhello") == " hello"

    def test_get_token_stats_basic(self, sample_tokens, sample_text):
        """Test basic token statistics calculation."""
        stats = self.service.get_token_stats(sample_tokens, sample_text)

        assert isinstance(stats, dict)
        assert 'basic_stats' in stats
        assert 'length_stats' in stats

        basic = stats['basic_stats']
        length = stats['length_stats']

        # Check basic stats structure
        assert 'total_tokens' in basic
        assert 'unique_tokens' in basic
        assert 'unique_percentage' in basic
        assert 'special_tokens' in basic
        assert 'space_tokens' in basic
        assert 'newline_tokens' in basic
        assert 'compression_ratio' in basic

        # Check length stats structure
        assert 'avg_length' in length
        assert 'median_length' in length
        assert 'std_dev' in length

    def test_get_token_stats_calculations(self):
        """Test specific statistics calculations."""
        tokens = ['Hello', ' world', '!', ' test']
        text = "Hello world! test"

        stats = self.service.get_token_stats(tokens, text)
        basic = stats['basic_stats']

        # Test total tokens
        assert basic['total_tokens'] == 4

        # Test unique tokens (all are unique in this case)
        assert basic['unique_tokens'] == 4
        assert basic['unique_percentage'] == "100.0"

        # Test compression ratio
        expected_ratio = len(text) / len(tokens)
        assert float(basic['compression_ratio']) == pytest.approx(expected_ratio, rel=1e-2)

    def test_get_token_stats_special_tokens(self):
        """Test special token counting."""
        tokens = ['<s>', 'Hello', ' world', '</s>', '<pad>']
        text = "Hello world"

        stats = self.service.get_token_stats(tokens, text)
        basic = stats['basic_stats']

        # Should detect the angle-bracketed special tokens (<s>, </s>, <pad>)
        assert basic['special_tokens'] >= 2

    def test_get_token_stats_whitespace_tokens(self):
        """Test whitespace token counting."""
        tokens = ['Hello', ' ', 'world', '\n', 'test', '\t']
        text = "Hello world\ntest\t"

        stats = self.service.get_token_stats(tokens, text)
        basic = stats['basic_stats']

        # Should count the space and newline tokens
        assert basic['space_tokens'] >= 1
        assert basic['newline_tokens'] >= 1

    def test_get_token_stats_length_calculations(self):
        """Test token length statistics."""
        tokens = ['a', 'bb', 'ccc', 'dddd']  # Lengths: 1, 2, 3, 4
        text = "a bb ccc dddd"

        stats = self.service.get_token_stats(tokens, text)
        length = stats['length_stats']

        # Average length should be 2.5
        assert float(length['avg_length']) == pytest.approx(2.5, rel=1e-2)

        # Median should be 2.5 (between 2 and 3)
        assert float(length['median_length']) == pytest.approx(2.5, rel=1e-2)

    def test_get_token_stats_empty_input(self):
        """Test statistics with empty input."""
        stats = self.service.get_token_stats([], "")
        basic = stats['basic_stats']
        length = stats['length_stats']

        assert basic['total_tokens'] == 0
        assert basic['unique_tokens'] == 0
        assert basic['unique_percentage'] == "0.0"
        assert basic['compression_ratio'] == "0.0"

        # Length stats should handle the empty case gracefully
        assert length['avg_length'] == "0.0"
        assert length['median_length'] == "0.0"
        assert length['std_dev'] == "0.0"

    def test_format_tokens_for_display_basic(self, mock_tokenizer):
        """Test basic token formatting for display."""
        tokens = ['Hello', ' world', '!']

        # Mock the tokenizer.convert_ids_to_tokens method
        mock_tokenizer.convert_ids_to_tokens.return_value = tokens

        formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)

        assert isinstance(formatted, list)
        assert len(formatted) == len(tokens)

        for i, token_data in enumerate(formatted):
            assert isinstance(token_data, dict)
            assert 'display' in token_data
            assert 'original' in token_data
            assert 'token_id' in token_data
            assert 'colors' in token_data
            assert 'newline' in token_data
            assert token_data['original'] == tokens[i]
            assert isinstance(token_data['colors'], dict)
            assert 'background' in token_data['colors']
            assert 'text' in token_data['colors']

    def test_format_tokens_newline_detection(self, mock_tokenizer):
        """Test newline detection in token formatting."""
        tokens = ['Hello', '\n', 'world']
        mock_tokenizer.convert_ids_to_tokens.return_value = tokens

        formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)

        # Second token should be marked as a newline
        assert formatted[1]['newline'] is True
        assert formatted[0]['newline'] is False
        assert formatted[2]['newline'] is False

    def test_format_tokens_color_consistency(self, mock_tokenizer):
        """Test that same tokens get same colors."""
        tokens = ['hello', 'world', 'hello']  # 'hello' appears twice
        mock_tokenizer.convert_ids_to_tokens.return_value = tokens

        formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)

        # Same tokens should have same colors
        assert formatted[0]['colors']['background'] == formatted[2]['colors']['background']
        assert formatted[0]['colors']['text'] == formatted[2]['colors']['text']

        # Different tokens should have different colors
        assert formatted[0]['colors']['background'] != formatted[1]['colors']['background']

    def test_format_tokens_special_character_handling(self, mock_tokenizer):
        """Test special character handling in token formatting."""
        tokens = [' ', '\t', '\n', 'Ġhello']
        mock_tokenizer.convert_ids_to_tokens.return_value = tokens

        formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)

        # Check that special characters are properly converted
        assert formatted[0]['display'] == '␣'  # Space
        assert formatted[1]['display'] == '→'  # Tab
        assert formatted[2]['display'] == '↵'  # Newline
        assert formatted[3]['display'] == ' hello'  # Ġ prefix