# Source: Hugging Face Space "bartar/tokenizers" (status: Running), revision d66ab65, file size 11,006 bytes.

"""

Unit tests for the Validators utility.

"""
import pytest
from app.utils.validators import Validators, ValidationError


class TestValidators:
    """Test cases for the Validators utility.

    Every test runs against a fresh ``Validators`` instance created in
    ``setup_method``.  The tests only check whether a validator call raises
    ``ValidationError``; validator return values are never inspected.
    """

    def setup_method(self):
        """Create a fresh ``Validators`` instance before each test."""
        self.validators = Validators()

    def _assert_all_rejected(self, validate, values, *extra_args):
        """Assert that ``validate`` raises ``ValidationError`` for each value.

        A bare ``pytest.raises`` inside a loop only reports "DID NOT RAISE"
        and hides which input slipped through; this helper names the
        offending value in the failure message instead.

        Args:
            validate: Validator callable under test.
            values: Iterable of inputs, each of which must be rejected.
            *extra_args: Additional positional arguments passed to
                ``validate`` after the value (e.g. allowed extensions).
        """
        for value in values:
            try:
                validate(value, *extra_args)
            except ValidationError:
                continue
            pytest.fail(f"Expected ValidationError for input {value!r}")

    def test_validate_filename_valid(self):
        """Test filename validation with valid filenames."""
        # Valid filenames should not raise
        self.validators.validate_filename('test.txt')
        self.validators.validate_filename('document.md')
        self.validators.validate_filename('script_file.py')
        self.validators.validate_filename('My Document.txt')
        self.validators.validate_filename('file-name.json')
        self.validators.validate_filename('data123.csv')

    def test_validate_filename_invalid(self):
        """Test filename validation with invalid filenames."""
        invalid_filenames = [
            # Empty or None filename
            '',
            None,
            # Dangerous characters
            '../../../etc/passwd',
            'file\\with\\backslashes.txt',
            # Null bytes
            'file\x00.txt',
            # Control characters
            'file\x01\x02.txt',
            # Reserved names on Windows
            'CON.txt',
            'PRN.txt',
            'AUX.txt',
        ]

        self._assert_all_rejected(
            self.validators.validate_filename, invalid_filenames
        )

    def test_validate_file_extension_valid(self):
        """Test file extension validation with valid extensions."""
        allowed_extensions = {'.txt', '.md', '.py', '.js', '.json'}

        # Valid extensions should not raise
        self.validators.validate_file_extension('test.txt', allowed_extensions)
        self.validators.validate_file_extension('document.md', allowed_extensions)
        self.validators.validate_file_extension('script.py', allowed_extensions)
        self.validators.validate_file_extension('data.json', allowed_extensions)

        # Case insensitive
        self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
        self.validators.validate_file_extension('Document.MD', allowed_extensions)

    def test_validate_file_extension_invalid(self):
        """Test file extension validation with invalid extensions."""
        allowed_extensions = {'.txt', '.md', '.py'}

        invalid_filenames = [
            # Extensions outside the allowed set
            'virus.exe',
            'archive.zip',
            'image.jpg',
            # No extension
            'filename',
            # Empty filename
            '',
        ]

        self._assert_all_rejected(
            self.validators.validate_file_extension,
            invalid_filenames,
            allowed_extensions,
        )

    def test_validate_model_path_valid(self):
        """Test model path validation with valid paths."""
        # Valid HuggingFace model paths
        valid_paths = [
            'microsoft/DialoGPT-medium',
            'google/bert-base-uncased',
            'meta-llama/Llama-2-7b-hf',
            'mistralai/Mistral-7B-Instruct-v0.1',
            'Qwen/Qwen2.5-72B-Instruct',
            'THUDM/chatglm-6b',
            'deepseek-ai/deepseek-coder-6.7b-base',
            'unsloth/llama-2-7b-bnb-4bit',
            'google-bert/bert-base-uncased',
            'bartar/SPLM-2',  # User's specific case
        ]

        for path in valid_paths:
            self.validators.validate_model_path(path)  # Should not raise

    def test_validate_model_path_invalid_format(self):
        """Test model path validation with invalid formats."""
        # Invalid formats should raise
        invalid_paths = [
            '',  # Empty
            'invalid-path',  # No slash
            'user/',  # Empty model name
            '/model-name',  # Empty user
            'user//model',  # Double slash
            'user/model/extra',  # Too many parts
            'user name/model',  # Space in user
            # Space in model name. NOTE(review): the original author was
            # unsure this should be rejected; this test requires rejection.
            'user/model name',
            'user@domain/model',  # Invalid characters
            '../malicious/path',  # Path traversal
            'user\\model',  # Backslash
        ]

        self._assert_all_rejected(
            self.validators.validate_model_path, invalid_paths
        )

    def test_validate_model_path_untrusted_prefix(self):
        """Test model path validation with untrusted prefixes."""
        # Paths with untrusted prefixes should raise
        untrusted_paths = [
            'random-user/some-model',
            'untrusted/malicious-model',
            'hacker/backdoor-model',
            'suspicious/model',
        ]

        self._assert_all_rejected(
            self.validators.validate_model_path, untrusted_paths
        )

    def test_validate_model_path_edge_cases(self):
        """Test model path validation edge cases."""
        # None input
        with pytest.raises(ValidationError):
            self.validators.validate_model_path(None)

        # Very long path (trusted prefix followed by 1000 characters)
        long_path = 'microsoft/' + 'a' * 1000
        with pytest.raises(ValidationError):
            self.validators.validate_model_path(long_path)

        # Special characters in the model name under an allowed prefix
        self.validators.validate_model_path('microsoft/model-with-dashes')
        self.validators.validate_model_path('microsoft/model_with_underscores')
        self.validators.validate_model_path('microsoft/model.with.dots')

    def test_validate_text_input_valid(self):
        """Test text input validation with valid inputs."""
        # Valid text inputs should not raise
        self.validators.validate_text_input('Hello world!')
        self.validators.validate_text_input('A' * 1000)  # Long but reasonable text
        self.validators.validate_text_input('Text with\nnewlines\nand\ttabs')
        self.validators.validate_text_input('Unicode: 你好世界 🌍')
        self.validators.validate_text_input('')  # Empty text must be accepted

    def test_validate_text_input_invalid(self):
        """Test text input validation with invalid inputs."""
        # None input
        with pytest.raises(ValidationError):
            self.validators.validate_text_input(None)

        # Extremely long text must be rejected
        very_long_text = 'A' * (10 * 1024 * 1024)  # 10 MiB of text
        with pytest.raises(ValidationError):
            self.validators.validate_text_input(very_long_text)

    def test_validate_text_input_malicious_content(self):
        """Test text input validation with potentially malicious content."""
        # Null bytes
        with pytest.raises(ValidationError):
            self.validators.validate_text_input('text\x00with\x00nulls')

        # Other control characters: this check deliberately tolerates both
        # outcomes (accepted, or rejected with ValidationError).  It only
        # guards against the validator failing with a different exception
        # type, which would still propagate and fail the test.
        try:
            self.validators.validate_text_input('text\x01with\x02controls')
        except ValidationError:
            pass

    def test_validation_error_messages(self):
        """Test that ValidationError contains meaningful messages."""
        # Test filename validation error message
        with pytest.raises(ValidationError) as excinfo:
            self.validators.validate_filename('../../../etc/passwd')
        message = str(excinfo.value).lower()
        assert 'filename' in message or 'path' in message

        # Test file extension error message
        with pytest.raises(ValidationError) as excinfo:
            self.validators.validate_file_extension('virus.exe', {'.txt'})
        message = str(excinfo.value).lower()
        assert 'extension' in message or 'allowed' in message

        # Test model path error message
        with pytest.raises(ValidationError) as excinfo:
            self.validators.validate_model_path('invalid-path')
        message = str(excinfo.value).lower()
        assert 'model' in message or 'path' in message

    def test_allowed_model_prefixes_coverage(self):
        """Test that all common model prefixes are covered."""
        # This test ensures we have good coverage of trusted model prefixes
        common_prefixes = [
            'microsoft/',
            'google/',
            'meta-llama/',
            'mistralai/',
            'openai-community/',
            'Qwen/',
            'THUDM/',
            'deepseek-ai/',
            'unsloth/',
            'google-bert/',
        ]

        for prefix in common_prefixes:
            # Should be able to validate models with these prefixes
            test_path = prefix + 'test-model'
            try:
                self.validators.validate_model_path(test_path)
            except ValidationError as exc:
                # Surface the validator's own message to show why it refused.
                pytest.fail(f"Trusted prefix {prefix} should be allowed: {exc}")

    def test_case_sensitivity(self):
        """Test case sensitivity in various validations."""
        # File extensions should be case insensitive
        allowed_extensions = {'.txt', '.md'}
        self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
        self.validators.validate_file_extension('Document.MD', allowed_extensions)

        # A trusted organisation written with different capitalization is
        # expected to be accepted, i.e. this test requires prefix matching
        # to ignore case.
        # NOTE(review): the previous comment claimed prefixes are case
        # sensitive, which contradicts this assertion; confirm the intended
        # behavior against Validators.
        self.validators.validate_model_path('Microsoft/model')  # Capital M

        # But random capitalization in untrusted prefixes should still fail
        with pytest.raises(ValidationError):
            self.validators.validate_model_path('RANDOM/model')