""" Unit tests for StatsService """ import pytest from unittest.mock import Mock from app.services.stats_service import StatsService class TestStatsService: """Test cases for StatsService.""" def setup_method(self): """Set up test fixtures.""" self.service = StatsService() def test_get_varied_color_basic(self): """Test basic color generation.""" color = self.service.get_varied_color(0, 10) assert isinstance(color, dict) assert 'background' in color assert 'text' in color assert color['background'].startswith('#') assert color['text'].startswith('#') assert len(color['background']) == 7 # #RRGGBB format assert len(color['text']) == 7 def test_get_varied_color_different_indices(self): """Test that different indices produce different colors.""" color1 = self.service.get_varied_color(0, 10) color2 = self.service.get_varied_color(1, 10) color3 = self.service.get_varied_color(5, 10) # Colors should be different assert color1['background'] != color2['background'] assert color2['background'] != color3['background'] assert color1['background'] != color3['background'] def test_get_varied_color_edge_cases(self): """Test color generation with edge cases.""" # Single token color = self.service.get_varied_color(0, 1) assert isinstance(color, dict) # Large number of tokens color = self.service.get_varied_color(999, 1000) assert isinstance(color, dict) # Zero index color = self.service.get_varied_color(0, 5) assert isinstance(color, dict) def test_fix_token_basic(self): """Test basic token fixing.""" assert self.service.fix_token("hello") == "hello" assert self.service.fix_token("world") == "world" def test_fix_token_special_characters(self): """Test token fixing with special characters.""" # Test space replacement assert self.service.fix_token(" ") == "␣" assert self.service.fix_token("\t") == "→" assert self.service.fix_token("\n") == "↵" # Test Ġ prefix (common in tokenizers) assert self.service.fix_token("Ġhello") == " hello" assert self.service.fix_token("Ġworld") == " world" # Test combination assert self.service.fix_token("Ġ") == " " def test_fix_token_edge_cases(self): """Test token fixing edge cases.""" # Empty string assert self.service.fix_token("") == "" # None (shouldn't happen but test defensive programming) result = self.service.fix_token(None) assert result is None or result == "" # Multiple special characters assert self.service.fix_token("\n\t ") == "↵→␣" # Multiple Ġ prefixes (edge case) assert self.service.fix_token("ĠĠhello") == " hello" def test_get_token_stats_basic(self, sample_tokens, sample_text): """Test basic token statistics calculation.""" stats = self.service.get_token_stats(sample_tokens, sample_text) assert isinstance(stats, dict) assert 'basic_stats' in stats assert 'length_stats' in stats basic = stats['basic_stats'] length = stats['length_stats'] # Check basic stats structure assert 'total_tokens' in basic assert 'unique_tokens' in basic assert 'unique_percentage' in basic assert 'special_tokens' in basic assert 'space_tokens' in basic assert 'newline_tokens' in basic assert 'compression_ratio' in basic # Check length stats structure assert 'avg_length' in length assert 'median_length' in length assert 'std_dev' in length def test_get_token_stats_calculations(self): """Test specific statistics calculations.""" tokens = ['Hello', ' world', '!', ' test'] text = "Hello world! 
test" stats = self.service.get_token_stats(tokens, text) basic = stats['basic_stats'] # Test total tokens assert basic['total_tokens'] == 4 # Test unique tokens (all are unique in this case) assert basic['unique_tokens'] == 4 assert basic['unique_percentage'] == "100.0" # Test compression ratio expected_ratio = len(text) / len(tokens) assert float(basic['compression_ratio']) == pytest.approx(expected_ratio, rel=1e-2) def test_get_token_stats_special_tokens(self): """Test special token counting.""" tokens = ['', 'Hello', ' world', '', ''] text = "Hello world" stats = self.service.get_token_stats(tokens, text) basic = stats['basic_stats'] # Should detect special tokens (those with < >) assert basic['special_tokens'] >= 2 # , , def test_get_token_stats_whitespace_tokens(self): """Test whitespace token counting.""" tokens = ['Hello', ' ', 'world', '\n', 'test', '\t'] text = "Hello world\ntest\t" stats = self.service.get_token_stats(tokens, text) basic = stats['basic_stats'] # Should count space and tab tokens assert basic['space_tokens'] >= 1 assert basic['newline_tokens'] >= 1 def test_get_token_stats_length_calculations(self): """Test token length statistics.""" tokens = ['a', 'bb', 'ccc', 'dddd'] # Lengths: 1, 2, 3, 4 text = "a bb ccc dddd" stats = self.service.get_token_stats(tokens, text) length = stats['length_stats'] # Average length should be 2.5 assert float(length['avg_length']) == pytest.approx(2.5, rel=1e-2) # Median should be 2.5 (between 2 and 3) assert float(length['median_length']) == pytest.approx(2.5, rel=1e-2) def test_get_token_stats_empty_input(self): """Test statistics with empty input.""" stats = self.service.get_token_stats([], "") basic = stats['basic_stats'] length = stats['length_stats'] assert basic['total_tokens'] == 0 assert basic['unique_tokens'] == 0 assert basic['unique_percentage'] == "0.0" assert basic['compression_ratio'] == "0.0" # Length stats should handle empty case gracefully assert length['avg_length'] == "0.0" assert length['median_length'] == "0.0" assert length['std_dev'] == "0.0" def test_format_tokens_for_display_basic(self, mock_tokenizer): """Test basic token formatting for display.""" tokens = ['Hello', ' world', '!'] # Mock the tokenizer.convert_ids_to_tokens method mock_tokenizer.convert_ids_to_tokens.return_value = tokens formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer) assert isinstance(formatted, list) assert len(formatted) == len(tokens) for i, token_data in enumerate(formatted): assert isinstance(token_data, dict) assert 'display' in token_data assert 'original' in token_data assert 'token_id' in token_data assert 'colors' in token_data assert 'newline' in token_data assert token_data['original'] == tokens[i] assert isinstance(token_data['colors'], dict) assert 'background' in token_data['colors'] assert 'text' in token_data['colors'] def test_format_tokens_newline_detection(self, mock_tokenizer): """Test newline detection in token formatting.""" tokens = ['Hello', '\n', 'world'] mock_tokenizer.convert_ids_to_tokens.return_value = tokens formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer) # Second token should be marked as newline assert formatted[1]['newline'] is True assert formatted[0]['newline'] is False assert formatted[2]['newline'] is False def test_format_tokens_color_consistency(self, mock_tokenizer): """Test that same tokens get same colors.""" tokens = ['hello', 'world', 'hello'] # 'hello' appears twice mock_tokenizer.convert_ids_to_tokens.return_value = tokens formatted = 
    def test_format_tokens_special_character_handling(self, mock_tokenizer):
        """Test special character handling in token formatting."""
        tokens = [' ', '\t', '\n', 'Ġhello']
        mock_tokenizer.convert_ids_to_tokens.return_value = tokens

        formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)

        # Check that special characters are properly converted
        assert formatted[0]['display'] == '␣'       # Space
        assert formatted[1]['display'] == '→'       # Tab
        assert formatted[2]['display'] == '↵'       # Newline
        assert formatted[3]['display'] == ' hello'  # Ġ prefix
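
# The fixtures referenced above (sample_tokens, sample_text, mock_tokenizer)
# are expected to be provided by a conftest.py next to this module. If they
# are not defined yet, a minimal sketch could look like the block below; the
# sample values are hypothetical and only need to satisfy the structural
# assertions in test_get_token_stats_basic and the formatting tests.
#
#     import pytest
#     from unittest.mock import Mock
#
#     @pytest.fixture
#     def sample_tokens():
#         return ['Hello', ' world', '!']
#
#     @pytest.fixture
#     def sample_text():
#         return "Hello world!"
#
#     @pytest.fixture
#     def mock_tokenizer():
#         # A bare Mock suffices: tests set convert_ids_to_tokens.return_value
#         # themselves where the return value matters.
#         return Mock()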