tokenizers / tests /test_validators.py
bartar's picture
Upload 26 files
d66ab65 verified
"""
Unit tests for Validators utility
"""
import pytest
from app.utils.validators import Validators, ValidationError
class TestValidators:
    """Test cases for the Validators utility.

    Every test works on a fresh ``Validators`` instance created in
    ``setup_method``. A validator call that is expected to succeed is simply
    invoked (any exception fails the test); a call that is expected to be
    rejected must raise ``ValidationError``.
    """

    def setup_method(self):
        """Create a fresh Validators instance before each test."""
        self.validators = Validators()

    def _assert_model_paths_rejected(self, paths):
        """Assert that validate_model_path raises ValidationError for each path.

        All candidates are checked before failing, so the assertion message
        lists every wrongly accepted path rather than only the first one.
        Exceptions other than ValidationError propagate and fail the test.
        """
        accepted = []
        for path in paths:
            try:
                self.validators.validate_model_path(path)
            except ValidationError:
                continue
            accepted.append(path)
        assert not accepted, f"Expected ValidationError for: {accepted!r}"

    def test_validate_filename_valid(self):
        """Test filename validation with valid filenames."""
        # Valid filenames should not raise
        valid_filenames = [
            'test.txt',
            'document.md',
            'script_file.py',
            'My Document.txt',
            'file-name.json',
            'data123.csv',
        ]
        for filename in valid_filenames:
            self.validators.validate_filename(filename)

    def test_validate_filename_invalid(self):
        """Test filename validation with invalid filenames."""
        invalid_filenames = [
            # Empty or None filename
            '',
            None,
            # Dangerous characters
            '../../../etc/passwd',
            'file\\with\\backslashes.txt',
            # Null bytes
            'file\x00.txt',
            # Control characters
            'file\x01\x02.txt',
            # Reserved names on Windows
            'CON.txt',
            'PRN.txt',
            'AUX.txt',
        ]
        accepted = []
        for filename in invalid_filenames:
            try:
                self.validators.validate_filename(filename)
            except ValidationError:
                continue
            accepted.append(filename)
        # Report every offender at once instead of stopping at the first.
        assert not accepted, f"Expected ValidationError for: {accepted!r}"

    def test_validate_file_extension_valid(self):
        """Test file extension validation with valid extensions."""
        allowed_extensions = {'.txt', '.md', '.py', '.js', '.json'}

        # Valid extensions should not raise
        self.validators.validate_file_extension('test.txt', allowed_extensions)
        self.validators.validate_file_extension('document.md', allowed_extensions)
        self.validators.validate_file_extension('script.py', allowed_extensions)
        self.validators.validate_file_extension('data.json', allowed_extensions)

        # Case insensitive
        self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
        self.validators.validate_file_extension('Document.MD', allowed_extensions)

    def test_validate_file_extension_invalid(self):
        """Test file extension validation with invalid extensions."""
        allowed_extensions = {'.txt', '.md', '.py'}

        # Invalid extensions should raise
        with pytest.raises(ValidationError):
            self.validators.validate_file_extension('virus.exe', allowed_extensions)
        with pytest.raises(ValidationError):
            self.validators.validate_file_extension('archive.zip', allowed_extensions)
        with pytest.raises(ValidationError):
            self.validators.validate_file_extension('image.jpg', allowed_extensions)

        # No extension
        with pytest.raises(ValidationError):
            self.validators.validate_file_extension('filename', allowed_extensions)

        # Empty filename
        with pytest.raises(ValidationError):
            self.validators.validate_file_extension('', allowed_extensions)

    def test_validate_model_path_valid(self):
        """Test model path validation with valid paths."""
        # Valid HuggingFace model paths
        valid_paths = [
            'microsoft/DialoGPT-medium',
            'google/bert-base-uncased',
            'meta-llama/Llama-2-7b-hf',
            'mistralai/Mistral-7B-Instruct-v0.1',
            'Qwen/Qwen2.5-72B-Instruct',
            'THUDM/chatglm-6b',
            'deepseek-ai/deepseek-coder-6.7b-base',
            'unsloth/llama-2-7b-bnb-4bit',
            'google-bert/bert-base-uncased',
            'bartar/SPLM-2',  # User's specific case
        ]
        for path in valid_paths:
            self.validators.validate_model_path(path)  # Should not raise

    def test_validate_model_path_invalid_format(self):
        """Test model path validation with invalid formats."""
        # Invalid formats should raise
        invalid_paths = [
            '',                   # Empty
            'invalid-path',       # No slash
            'user/',              # Empty model name
            '/model-name',        # Empty user
            'user//model',        # Double slash
            'user/model/extra',   # Too many parts
            'user name/model',    # Space in user
            'user/model name',    # Space in model name
            'user@domain/model',  # Invalid characters
            '../malicious/path',  # Path traversal
            'user\\model',        # Backslash
        ]
        self._assert_model_paths_rejected(invalid_paths)

    def test_validate_model_path_untrusted_prefix(self):
        """Test model path validation with untrusted prefixes."""
        # Paths with untrusted prefixes should raise
        untrusted_paths = [
            'random-user/some-model',
            'untrusted/malicious-model',
            'hacker/backdoor-model',
            'suspicious/model',
        ]
        self._assert_model_paths_rejected(untrusted_paths)

    def test_validate_model_path_edge_cases(self):
        """Test model path validation edge cases."""
        # None input
        with pytest.raises(ValidationError):
            self.validators.validate_model_path(None)

        # Very long path
        long_path = 'microsoft/' + 'a' * 1000
        with pytest.raises(ValidationError):
            self.validators.validate_model_path(long_path)

        # Dashes, underscores and dots in the model name under a trusted prefix
        self.validators.validate_model_path('microsoft/model-with-dashes')
        self.validators.validate_model_path('microsoft/model_with_underscores')
        self.validators.validate_model_path('microsoft/model.with.dots')

    def test_validate_text_input_valid(self):
        """Test text input validation with valid inputs."""
        # Valid text inputs should not raise
        self.validators.validate_text_input('Hello world!')
        self.validators.validate_text_input('A' * 1000)  # Long but reasonable text
        self.validators.validate_text_input('Text with\nnewlines\nand\ttabs')
        self.validators.validate_text_input('Unicode: 你好世界 🌍')
        self.validators.validate_text_input('')  # Empty text is expected to pass

    def test_validate_text_input_invalid(self):
        """Test text input validation with invalid inputs."""
        # None input
        with pytest.raises(ValidationError):
            self.validators.validate_text_input(None)

        # Extremely long text is expected to exceed the validator's size limit
        very_long_text = 'A' * (10 * 1024 * 1024)  # 10MB of text
        with pytest.raises(ValidationError):
            self.validators.validate_text_input(very_long_text)

    def test_validate_text_input_malicious_content(self):
        """Test text input validation with potentially malicious content."""
        # Null bytes
        with pytest.raises(ValidationError):
            self.validators.validate_text_input('text\x00with\x00nulls')

        # Control characters: this test deliberately accepts either outcome
        # (accepted, or rejected with ValidationError). Any other exception
        # type still propagates and fails the test.
        try:
            self.validators.validate_text_input('text\x01with\x02controls')
        except ValidationError:
            pass

    def test_validation_error_messages(self):
        """Test that ValidationError contains meaningful messages."""
        # Test filename validation error message
        with pytest.raises(ValidationError) as excinfo:
            self.validators.validate_filename('../../../etc/passwd')
        message = str(excinfo.value).lower()
        assert 'filename' in message or 'path' in message

        # Test file extension error message
        with pytest.raises(ValidationError) as excinfo:
            self.validators.validate_file_extension('virus.exe', {'.txt'})
        message = str(excinfo.value).lower()
        assert 'extension' in message or 'allowed' in message

        # Test model path error message
        with pytest.raises(ValidationError) as excinfo:
            self.validators.validate_model_path('invalid-path')
        message = str(excinfo.value).lower()
        assert 'model' in message or 'path' in message

    def test_allowed_model_prefixes_coverage(self):
        """Test that all common model prefixes are covered."""
        # This test ensures we have good coverage of trusted model prefixes
        common_prefixes = [
            'microsoft/',
            'google/',
            'meta-llama/',
            'mistralai/',
            'openai-community/',
            'Qwen/',
            'THUDM/',
            'deepseek-ai/',
            'unsloth/',
            'google-bert/',
        ]
        for prefix in common_prefixes:
            # Should be able to validate models with these prefixes
            test_path = prefix + 'test-model'
            try:
                self.validators.validate_model_path(test_path)
            except ValidationError:
                pytest.fail(f"Trusted prefix {prefix} should be allowed")

    def test_case_sensitivity(self):
        """Test case sensitivity in various validations."""
        # File extensions should be case insensitive
        allowed_extensions = {'.txt', '.md'}
        self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
        self.validators.validate_file_extension('Document.MD', allowed_extensions)

        # A trusted prefix written with different capitalization
        # ('Microsoft' vs. 'microsoft') is expected to be accepted
        self.validators.validate_model_path('Microsoft/model')  # Capital M

        # But random capitalization in untrusted prefixes should still fail
        with pytest.raises(ValidationError):
            self.validators.validate_model_path('RANDOM/model')