bartar commited on
Commit
d66ab65
·
verified ·
1 Parent(s): 90671e3

Upload 26 files

Browse files
EXAMPLES CHANGED
@@ -1,20 +1,20 @@
1
- Mistral Small 3
2
- <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]
3
-
4
- R1
5
- <|begin▁of▁sentence|><|User|>Hello<|Assistant|>Hello, how can I help you?<|end▁of▁sentence|><|User|>Tell me a fact<|Assistant|>
6
-
7
- Llama 3.3
8
- <|start_header_id|>system<|end_header_id|>You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>Hello, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>Tell me a fact<|eot_id|>
9
-
10
- Gemma3
11
- <start_of_turn>user
12
- You are a helpful assistant.
13
-
14
- Hello<end_of_turn>
15
- <start_of_turn>model
16
- Hello, how can I help you?<end_of_turn>
17
- <start_of_turn>user
18
- Tell me a fact<end_of_turn>
19
- <start_of_turn>model
20
  A fact<end_of_turn>
 
1
+ Mistral Small 3
2
+ <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]
3
+
4
+ R1
5
+ <|begin▁of▁sentence|><|User|>Hello<|Assistant|>Hello, how can I help you?<|end▁of▁sentence|><|User|>Tell me a fact<|Assistant|>
6
+
7
+ Llama 3.3
8
+ <|start_header_id|>system<|end_header_id|>You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>Hello, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>Tell me a fact<|eot_id|>
9
+
10
+ Gemma3
11
+ <start_of_turn>user
12
+ You are a helpful assistant.
13
+
14
+ Hello<end_of_turn>
15
+ <start_of_turn>model
16
+ Hello, how can I help you?<end_of_turn>
17
+ <start_of_turn>user
18
+ Tell me a fact<end_of_turn>
19
+ <start_of_turn>model
20
  A fact<end_of_turn>
Instruct-vs-Base.txt CHANGED
@@ -1,34 +1,34 @@
1
- __Mistral 24B base:__
2
-
3
- Tell me a fact|| about yourself that makes you unique.
4
- I have a twin sister who is also a dancer and teacher. It's very cool being able to do this together!
5
-
6
- What are your interests outside of dance?
7
- Reading, baking, watching TV (Grey's Anatomy) and spending time with family.
8
-
9
- How long have you been dancing?
10
- For 20 years!
11
-
12
- Why did you become an instructor? How does teaching help you grow as a dancer?
13
- I'm so grateful to be able to teach at such a wonderful studio. I love being able to share my passion of dance for the kids in our community! It's amazing seeing their love of dance grow through the years.
14
-
15
- What is your favorite part about teaching?
16
- Being a role model and watching the students grow into beautiful dancers!
17
-
18
- If you could choreograph any song, what would it be? Why?
19
- I would love to choreograph to a Taylor Swift song. I'm a huge fan and her songs are so catchy!
20
- .......
21
-
22
-
23
-
24
-
25
-
26
- __Mistral 24B Instruct:__
27
-
28
-
29
- <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]||Here's an interesting fact for you: A day on Venus is longer than a year on Venus. This is because Venus has an extremely slow rotation on its axis. It takes about 243 Earth days for Venus to complete one rotation, but it only takes around 225 Earth days for Venus to orbit the Sun. Isn't that amazing?</s>
30
-
31
-
32
-
33
-
34
-
 
1
+ __Mistral 24B base:__
2
+
3
+ Tell me a fact|| about yourself that makes you unique.
4
+ I have a twin sister who is also a dancer and teacher. It's very cool being able to do this together!
5
+
6
+ What are your interests outside of dance?
7
+ Reading, baking, watching TV (Grey's Anatomy) and spending time with family.
8
+
9
+ How long have you been dancing?
10
+ For 20 years!
11
+
12
+ Why did you become an instructor? How does teaching help you grow as a dancer?
13
+ I'm so grateful to be able to teach at such a wonderful studio. I love being able to share my passion of dance for the kids in our community! It's amazing seeing their love of dance grow through the years.
14
+
15
+ What is your favorite part about teaching?
16
+ Being a role model and watching the students grow into beautiful dancers!
17
+
18
+ If you could choreograph any song, what would it be? Why?
19
+ I would love to choreograph to a Taylor Swift song. I'm a huge fan and her songs are so catchy!
20
+ .......
21
+
22
+
23
+
24
+
25
+
26
+ __Mistral 24B Instruct:__
27
+
28
+
29
+ <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]||Here's an interesting fact for you: A day on Venus is longer than a year on Venus. This is because Venus has an extremely slow rotation on its axis. It takes about 243 Earth days for Venus to complete one rotation, but it only takes around 225 Earth days for Venus to orbit the Sun. Isn't that amazing?</s>
30
+
31
+
32
+
33
+
34
+
app.py CHANGED
@@ -1,1837 +1,33 @@
1
- from transformers import AutoTokenizer
2
- from flask import Flask, request, render_template_string, jsonify
3
- import hashlib
4
- import sys
5
- import math
6
- import os
7
- import time
8
-
9
app = Flask(__name__)
# Allow uploads up to 25MB so larger text files can be tokenized.
app.config['MAX_CONTENT_LENGTH'] = 25 * 1024 * 1024

# Directory for uploaded files awaiting tokenization.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
# exist_ok avoids the check-then-create race of the previous
# `if not os.path.exists(...)` guard (another worker could create the
# directory between the check and the makedirs call).
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
18
-
19
# Predefined tokenizer models with aliases.
# 'name'  — the Hugging Face repo id passed to AutoTokenizer.from_pretrained.
# 'alias' — the human-readable label shown in the UI.
TOKENIZER_MODELS = {
    'qwen3': {
        'name': 'Qwen/Qwen3-0.6B', #same as other sizes like Qwen/Qwen3-8B, , Qwen/Qwen3-14B, MOE Qwen/Qwen3-30B-A3B, Qwen/Qwen3-32B, Qwen/Qwen3-235B-A22B or quants/ggufs unsloth/Qwen3-32B-GGUF, unsloth/Qwen3-14B-GGUF, unsloth/Qwen3-8B-GGUF
        'alias': 'Qwen 3'
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B'
    },
    'glm4': {
        'name': 'THUDM/GLM-4-32B-0414', #gguf unsloth/GLM-4-32B-0414-GGUF
        'alias': 'GLM 4'
    },
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
        'alias': 'Mistral Small 3.1'
    },
    'llama4': {
        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', #same as meta-llama/Llama-4-Maverick-17B-128E-Instruct or meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 or meta-llama/Llama-4-Scout-17B-16E etc
        'alias': 'Llama 4'
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1'
    },
    # NOTE(review): alias 'QWQ 32B' does not match the repo id below
    # (Qwen2.5-72B-Instruct). Possibly intentional — QwQ-32B shares the
    # Qwen2.5 tokenizer family — but confirm the label is what users expect.
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        'alias': 'QWQ 32B'
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B'
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B'
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased'
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2'
    }
}
66
-
67
# Loaded tokenizers for the predefined models, keyed by TOKENIZER_MODELS id.
tokenizers = {}
# Last load error per custom model path (cleared on a later successful load).
custom_model_errors = {}
# Custom-path tokenizers, keyed by HF path -> (tokenizer, load_timestamp).
custom_tokenizers = {}
# Cached get_tokenizer_info() results, keyed by model id or custom path.
tokenizer_info_cache = {}
# How long a cached custom tokenizer stays valid (1 hour).
CACHE_EXPIRATION = 3600 # seconds
77
-
78
def get_tokenizer_info(tokenizer):
    """
    Collect display metadata about a tokenizer.

    Returns a dict that may contain 'vocab_size', 'model_max_length',
    'tokenizer_type' and 'special_tokens'. Never raises: any failure is
    reported under an 'error' key instead.
    """
    details = {}
    try:
        # Vocabulary size: prefer the attribute, else count the vocab mapping.
        if hasattr(tokenizer, 'vocab_size'):
            details['vocab_size'] = tokenizer.vocab_size
        elif hasattr(tokenizer, 'get_vocab'):
            details['vocab_size'] = len(tokenizer.get_vocab())

        # Context window; skip the huge sentinel values some tokenizers report.
        if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:
            details['model_max_length'] = tokenizer.model_max_length

        # Concrete tokenizer class name (e.g. a fast vs. slow implementation).
        details['tokenizer_type'] = type(tokenizer).__name__

        # Gather the non-empty special tokens that are actually set.
        token_attrs = ('pad_token', 'eos_token', 'bos_token', 'sep_token',
                       'cls_token', 'unk_token', 'mask_token')
        specials = {}
        for attr in token_attrs:
            value = getattr(tokenizer, attr, None)
            # Keep only truthy tokens whose string form is non-blank.
            if value and str(value).strip():
                specials[attr] = str(value)
        details['special_tokens'] = specials

    except Exception as e:
        details['error'] = f"Error extracting tokenizer info: {str(e)}"

    return details
113
# NOTE(review): this string is Python source that is exec()'d later at import
# time. It tries to load every predefined tokenizer and, on any failure, reads
# address 0 via ctypes — a deliberate hard crash (segfault), presumably so the
# hosting platform restarts the app. The bare `except` swallows the real error
# and the null-pointer read kills the process; consider logging and raising
# a normal exception instead.
c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
114
def load_tokenizer(model_id_or_name):
    """
    Load a tokenizer by predefined id or custom Hugging Face path.

    Predefined ids (keys of TOKENIZER_MODELS) are cached forever in
    `tokenizers`; custom paths are cached in `custom_tokenizers` with a
    timestamp and reloaded after CACHE_EXPIRATION seconds. Extracted
    metadata is memoized in `tokenizer_info_cache`.

    Returns a tuple (tokenizer, tokenizer_info, error_message); on failure
    tokenizer is None and error_message describes the problem (also recorded
    in `custom_model_errors`). Never raises.
    """
    error_message = None
    tokenizer_info = {}

    # Reuse previously extracted metadata if we have it (may be refined below).
    if model_id_or_name in tokenizer_info_cache:
        tokenizer_info = tokenizer_info_cache[model_id_or_name]

    try:
        # Predefined model id: resolve to its HF repo name and cache the load.
        if model_id_or_name in TOKENIZER_MODELS:
            model_name = TOKENIZER_MODELS[model_id_or_name]['name']
            if model_id_or_name not in tokenizers:
                tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name)
            tokenizer = tokenizers[model_id_or_name]

            # Extract metadata once and memoize it.
            if model_id_or_name not in tokenizer_info_cache:
                tokenizer_info = get_tokenizer_info(tokenizer)
                tokenizer_info_cache[model_id_or_name] = tokenizer_info

            return tokenizer, tokenizer_info, None

        # Otherwise treat the argument as a custom HF path.
        # Serve from the timed cache when the entry has not expired.
        current_time = time.time()
        if model_id_or_name in custom_tokenizers:
            cached_tokenizer, timestamp = custom_tokenizers[model_id_or_name]
            if current_time - timestamp < CACHE_EXPIRATION:
                # Metadata may be missing if the cache was populated elsewhere.
                if model_id_or_name not in tokenizer_info_cache:
                    tokenizer_info = get_tokenizer_info(cached_tokenizer)
                    tokenizer_info_cache[model_id_or_name] = tokenizer_info
                return cached_tokenizer, tokenizer_info, None

        # Not cached (or expired): load fresh from the Hub.
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_name)
        # Store in cache with timestamp so it expires after CACHE_EXPIRATION.
        custom_tokenizers[model_id_or_name] = (tokenizer, current_time)
        # A successful load supersedes any previously recorded error.
        if model_id_or_name in custom_model_errors:
            del custom_model_errors[model_id_or_name]

        # Extract and memoize metadata for the fresh tokenizer.
        tokenizer_info = get_tokenizer_info(tokenizer)
        tokenizer_info_cache[model_id_or_name] = tokenizer_info

        return tokenizer, tokenizer_info, None

    except Exception as e:
        error_message = f"Failed to load tokenizer: {str(e)}"
        # Remember the failure so the UI can surface it for this model.
        custom_model_errors[model_id_or_name] = error_message
        return None, tokenizer_info, error_message
172
-
173
- exec(c)
174
def get_varied_color(token: str) -> dict:
    """
    Derive a stable, vivid HSL color pair for a token.

    The token's MD5 digest seeds hue/saturation/lightness, so the same token
    always maps to the same colors. Returns {'background': ..., 'text': ...}
    as CSS hsl() strings.
    """
    digest = hashlib.md5(token.encode()).hexdigest()
    # Fixed slices of the hex digest drive each HSL channel.
    hue = int(digest[0:3], 16) % 360
    sat = 70 + int(digest[3:5], 16) % 20       # 70–89%
    light = 80 + int(digest[5:7], 16) % 10     # 80–89%
    # Dark text on light backgrounds; light text otherwise
    # (with light always >= 80 here, the dark branch is always taken).
    fg_light = 20 if light > 50 else 90
    return {
        'background': f'hsl({hue}, {sat}%, {light}%)',
        'text': f'hsl({hue}, {sat}%, {fg_light}%)',
    }
186
-
187
def fix_token(token: str) -> str:
    """
    Prepare a raw BPE token for display.

    BPE tokenizers mark a leading space with 'Ġ'; each *leading* marker is
    replaced by a visible middle dot. Tokens without a leading marker are
    returned unchanged.

    Bug fix: the previous version used token.count('Ġ'), which also counted
    interior markers and then stripped that many characters off the front,
    corrupting tokens like 'ĠaĠb'. Only the leading run is counted now.
    """
    if token.startswith('Ġ'):
        # Length of the leading run of 'Ġ' markers only.
        leading = len(token) - len(token.lstrip('Ġ'))
        return '·' * leading + token[leading:]
    return token
193
-
194
def get_token_stats(tokens: list, original_text: str) -> dict:
    """
    Compute summary statistics for a tokenized text.

    Returns {} for an empty token list. Otherwise returns a dict with
    'basic_stats' (counts, compression ratio, per-category tallies) and
    'length_stats' (token-length distribution, population std deviation).
    """
    if not tokens:
        return {}

    count = len(tokens)
    distinct = len(set(tokens))
    lengths = [len(tok) for tok in tokens]

    # Category tallies, classified per token by the characters it contains.
    leading_space = sum(1 for tok in tokens if tok.startswith('Ġ'))
    newline_count = sum(1 for tok in tokens if 'Ċ' in tok)
    special_count = sum(1 for tok in tokens if any(ch in tok for ch in ['<', '>', '[', ']', '{', '}']))
    punct_count = sum(1 for tok in tokens if any(ch in tok for ch in '.,!?;:()'))

    # Length distribution: mean and population variance.
    mean_len = sum(lengths) / len(lengths)
    variance = sum((length - mean_len) ** 2 for length in lengths) / len(lengths)

    return {
        'basic_stats': {
            'total_tokens': count,
            'unique_tokens': distinct,
            'compression_ratio': round(len(original_text) / count, 2),
            'space_tokens': leading_space,
            'newline_tokens': newline_count,
            'special_tokens': special_count,
            'punctuation_tokens': punct_count,
            'unique_percentage': round(distinct / count * 100, 1)
        },
        'length_stats': {
            'avg_length': round(mean_len, 2),
            'std_dev': round(math.sqrt(variance), 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            # Upper median of the sorted lengths.
            'median_length': sorted(lengths)[len(lengths) // 2]
        }
    }
235
-
236
def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """
    Tokenize input and build the payload rendered by the UI.

    Two modes:
      * file mode (file_path set and is_full_file True): tokenizes a short
        preview for display and streams the whole file in 1MB chunks for
        statistics;
      * text mode: tokenizes `text` directly, previewing the first 8096
        characters when is_full_file is True.

    Returns a dict with per-token display data ('tokens'), aggregate 'stats',
    total counts, display-limit flags, and the tokenizer metadata.

    Raises Exception if the tokenizer cannot be loaded.
    """
    tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name)

    if error:
        raise Exception(error)

    # File mode: display only a preview, but compute stats over the full file.
    if file_path and is_full_file:
        # Read the preview for display.
        # NOTE(review): 8096 looks like it was meant to be 8192 (8 KiB) — confirm.
        with open(file_path, 'r', errors='replace') as f:
            preview_text = f.read(8096)

        # Tokenize preview for display; cap the rendered tokens at 50k.
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Stream the full file in chunks so large uploads don't blow up memory.
        total_tokens = []
        token_set = set()
        total_length = 0
        chunk_size = 1024 * 1024 # 1MB chunks

        with open(file_path, 'r', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                chunk_tokens = tokenizer.tokenize(chunk)
                total_tokens.extend(chunk_tokens)
                token_set.update(chunk_tokens)

        # Stats over all tokens; the original text is approximated by its
        # length only (a run of spaces), which is all the stats need.
        stats = get_token_stats(total_tokens, ' ' * total_length) # Approximation for original text
    else:
        # Text mode: tokenize the full input directly.
        all_tokens = tokenizer.tokenize(text)
        total_token_count = len(all_tokens)

        # For display: if it's a preview, only take the first 8096 chars.
        preview_text = text[:8096] if is_full_file else text
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Always use the full text for stats.
        stats = get_token_stats(all_tokens, text)

    # Build per-token display records: colors, printable form, id, position.
    token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token = fix_token(token)
        # Numerical token id from the tokenizer's vocabulary.
        token_id = tokenizer.convert_tokens_to_ids(token)
        token_data.append({
            'original': token,
            # Trailing 'Ċ' (newline marker) is dropped from the display text
            # and surfaced via the 'newline' flag instead.
            'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
            'colors': colors,
            'newline': fixed_token.endswith('Ċ'),
            'token_id': token_id,
            'token_index': idx
        })


    # Pick the token count matching the processing mode used above.
    total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
        'tokenizer_info': tokenizer_info # Include tokenizer info
    }
313
-
314
- # HTML template with enhanced modern styling
315
- HTML_TEMPLATE = """
316
- <!DOCTYPE html>
317
- <html>
318
- <head>
319
- <title>Token Visualizer</title>
320
- <meta charset="UTF-8">
321
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
322
- <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
323
- <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
324
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
325
- <style>
326
- :root {
327
- --primary-color: #0f4f9b; /* Blue accent */
328
- --primary-hover: #0c3e7a; /* Darker blue accent */
329
- --bg-color: #121212; /* Dark background */
330
- --card-bg: #1e1e1e; /* Dark card background */
331
- --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
332
- 0 2px 4px -1px rgba(0, 0, 0, 0.6);
333
- --transition: all 0.3s ease;
334
- --text-color: #E0E0E0; /* Main text color */
335
- --secondary-text: #A0A0A0;/* Secondary text color */
336
- --input-bg: #2a2a2a; /* Input/textarea background */
337
- --input-border: #444444; /* Input/textarea border */
338
- --input-focus: #0f4f9b; /* Focus border color */
339
- }
340
-
341
- * {
342
- margin: 0;
343
- padding: 0;
344
- box-sizing: border-box;
345
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
346
- scrollbar-width: thin;
347
- scrollbar-color: #0f4f9b #121212
348
- }
349
-
350
- /* Width and height of the scrollbar */
351
- ::-webkit-scrollbar {
352
- width: 12px;
353
- height: 12px;
354
- }
355
-
356
- @keyframes spin {
357
- from { transform: rotate(0deg); }
358
- to { transform: rotate(360deg); }
359
- }
360
-
361
- /* Track (background) */
362
- ::-webkit-scrollbar-track {
363
- background: #121212;
364
- border-radius: 10px;
365
- }
366
-
367
- /* Handle (draggable part) */
368
- ::-webkit-scrollbar-thumb {
369
- background: #0f4f9b;
370
- border-radius: 10px;
371
- border: 2px solid #121212;
372
- }
373
-
374
- /* Handle on hover */
375
- ::-webkit-scrollbar-thumb:hover {
376
- background: #0c3e7a;
377
- }
378
-
379
-
380
- body {
381
- background-color: var(--bg-color);
382
- padding: 2rem;
383
- min-height: 100vh;
384
- background-image:
385
- radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
386
- radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
387
- color: var(--text-color);
388
- }
389
-
390
- .container {
391
- max-width: 1200px;
392
- margin: 0 auto;
393
- }
394
-
395
- .header {
396
- display: flex;
397
- justify-content: space-between;
398
- align-items: center;
399
- margin-bottom: 2rem;
400
- position: relative;
401
- }
402
-
403
- .title-section {
404
- flex-grow: 1;
405
- }
406
-
407
- .title {
408
- font-size: 2.5rem;
409
- font-weight: 800;
410
- color: var(--primary-color);
411
- margin-bottom: 0.5rem;
412
- }
413
-
414
- .subtitle {
415
- color: var(--secondary-text);
416
- font-size: 1.1rem;
417
- }
418
-
419
- .model-selector {
420
- position: relative;
421
- min-width: 200px;
422
- }
423
-
424
- .model-selector-header {
425
- display: flex;
426
- gap: 0.5rem;
427
- margin-bottom: 0.5rem;
428
- }
429
-
430
- .model-type-toggle {
431
- display: flex;
432
- background-color: var(--card-bg);
433
- border-radius: 0.5rem;
434
- padding: 0.25rem;
435
- overflow: hidden;
436
- }
437
-
438
- .toggle-option {
439
- padding: 0.5rem 0.75rem;
440
- font-size: 0.8rem;
441
- font-weight: 500;
442
- cursor: pointer;
443
- transition: var(--transition);
444
- border-radius: 0.375rem;
445
- color: var(--secondary-text);
446
- }
447
-
448
- .toggle-option.active {
449
- background-color: var(--primary-color);
450
- color: white;
451
- }
452
-
453
- select {
454
- width: 100%;
455
- padding: 0.75rem 1rem;
456
- border: 2px solid var(--input-border);
457
- border-radius: 0.5rem;
458
- font-size: 1rem;
459
- color: var(--text-color);
460
- background-color: var(--input-bg);
461
- cursor: pointer;
462
- transition: var(--transition);
463
- appearance: none;
464
- background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
465
- background-repeat: no-repeat;
466
- background-position: right 1rem center;
467
- background-size: 1.5rem;
468
- }
469
-
470
- select:hover, .custom-model-input:hover {
471
- border-color: var(--primary-color);
472
- }
473
-
474
- select:focus, .custom-model-input:focus {
475
- outline: none;
476
- border-color: var(--primary-color);
477
- box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
478
- }
479
-
480
- .custom-model-input {
481
- width: 100%;
482
- padding: 0.75rem 1rem;
483
- border: 2px solid var(--input-border);
484
- border-radius: 0.5rem;
485
- font-size: 1rem;
486
- color: var(--text-color);
487
- background-color: var(--input-bg);
488
- transition: var(--transition);
489
- }
490
-
491
- .input-section {
492
- margin-bottom: 2rem;
493
- }
494
-
495
- textarea {
496
- width: 100%;
497
- height: 150px;
498
- padding: 1.25rem;
499
- border: 2px solid var(--input-border);
500
- border-radius: 0.75rem;
501
- resize: vertical;
502
- font-size: 1rem;
503
- margin-bottom: 1rem;
504
- transition: var(--transition);
505
- background-color: var(--input-bg);
506
- color: var(--text-color);
507
- }
508
-
509
- textarea:focus {
510
- outline: none;
511
- border-color: var(--input-focus);
512
- box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
513
- }
514
-
515
- .button-container {
516
- display: flex;
517
- justify-content: center;
518
- width: 100%;
519
- gap: 1rem;
520
- }
521
-
522
- button {
523
- padding: 0.875rem 2.5rem;
524
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
525
- color: #fff;
526
- border: none;
527
- border-radius: 0.75rem;
528
- font-size: 1.1rem;
529
- font-weight: 600;
530
- cursor: pointer;
531
- transition: var(--transition);
532
- box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
533
- }
534
-
535
- button:hover {
536
- transform: translateY(-2px);
537
- box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
538
- }
539
-
540
- button:active {
541
- transform: translateY(0);
542
- }
543
-
544
- button:disabled {
545
- opacity: 0.7;
546
- cursor: not-allowed;
547
- }
548
-
549
- .card {
550
- background-color: var(--card-bg);
551
- border-radius: 1rem;
552
- box-shadow: var(--card-shadow);
553
- padding: 1.5rem;
554
- margin-bottom: 2rem;
555
- transition: var(--transition);
556
- }
557
-
558
- .card:hover {
559
- transform: translateY(-2px);
560
- box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
561
- }
562
-
563
- .card-title {
564
- font-size: 1.25rem;
565
- font-weight: 700;
566
- color: var(--text-color);
567
- margin-bottom: 1.25rem;
568
- display: flex;
569
- align-items: center;
570
- gap: 0.5rem;
571
- cursor: pointer;
572
- }
573
-
574
- .card-title::before {
575
- content: '';
576
- display: block;
577
- width: 4px;
578
- height: 1.25rem;
579
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
580
- border-radius: 2px;
581
- }
582
-
583
- .token-container {
584
- display: flex;
585
- flex-wrap: wrap;
586
- gap: 0.375rem;
587
- margin-bottom: 1rem;
588
- padding: 1rem;
589
- background-color: #2a2a2a;
590
- border-radius: 0.5rem;
591
- max-height: 200px;
592
- overflow-y: auto;
593
- transition: max-height 0.3s ease;
594
- }
595
-
596
- .token-container.expanded {
597
- max-height: none;
598
- }
599
-
600
- .token {
601
- padding: 0.375rem 0.75rem;
602
- border-radius: 0.375rem;
603
- background-color: var(--input-bg);
604
- font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
605
- font-size: 0.875rem;
606
- color: var(--text-color);
607
- cursor: default;
608
- transition: var(--transition);
609
- box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
610
- }
611
-
612
- .token:hover {
613
- transform: translateY(-1px);
614
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
615
- }
616
-
617
- .stats-grid {
618
- display: grid;
619
- grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
620
- gap: 1.5rem;
621
- margin-bottom: 2rem;
622
- }
623
-
624
- .stat-card {
625
- background-color: var(--card-bg);
626
- padding: 1.5rem;
627
- border-radius: 1rem;
628
- box-shadow: var(--card-shadow);
629
- transition: var(--transition);
630
- }
631
-
632
- .stat-card:hover {
633
- transform: translateY(-2px);
634
- box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
635
- }
636
-
637
- .stat-title {
638
- color: var(--secondary-text);
639
- font-size: 0.875rem;
640
- font-weight: 500;
641
- margin-bottom: 0.5rem;
642
- text-transform: uppercase;
643
- letter-spacing: 0.05em;
644
- }
645
-
646
- .stat-value {
647
- color: var(--text-color);
648
- font-size: 2rem;
649
- font-weight: 700;
650
- line-height: 1.2;
651
- margin-bottom: 0.25rem;
652
- }
653
-
654
- .stat-description {
655
- color: var(--secondary-text);
656
- font-size: 0.875rem;
657
- }
658
-
659
- .expand-button {
660
- background: none;
661
- border: none;
662
- color: var(--primary-color);
663
- font-size: 0.875rem;
664
- padding: 0.5rem;
665
- cursor: pointer;
666
- display: block;
667
- margin: 0 auto;
668
- box-shadow: none;
669
- }
670
-
671
- .expand-button:hover {
672
- text-decoration: underline;
673
- transform: none;
674
- box-shadow: none;
675
- }
676
-
677
- .error-message {
678
- color: #EF4444;
679
- background-color: #3a1f1f;
680
- border: 1px solid #562626;
681
- padding: 1rem;
682
- border-radius: 0.5rem;
683
- margin-bottom: 1rem;
684
- display: none;
685
- }
686
-
687
- .display-limit-notice {
688
- background-color: #4b2b07;
689
- border: 1px solid #7c4a02;
690
- color: #FFD591;
691
- padding: 0.75rem;
692
- border-radius: 0.5rem;
693
- margin-top: 1rem;
694
- font-size: 0.875rem;
695
- display: none;
696
- }
697
-
698
- /* File drop zone styles */
699
- .file-drop-zone {
700
- position: fixed;
701
- top: 0;
702
- left: 0;
703
- width: 100%;
704
- height: 100%;
705
- background-color: rgba(15, 79, 155, 0.15);
706
- z-index: 1000;
707
- display: flex;
708
- justify-content: center;
709
- align-items: center;
710
- opacity: 0;
711
- pointer-events: none;
712
- transition: opacity 0.3s ease;
713
- }
714
-
715
- .file-drop-zone.active {
716
- opacity: 1;
717
- pointer-events: all;
718
- }
719
-
720
- .drop-indicator {
721
- background-color: var(--card-bg);
722
- border: 2px dashed var(--primary-color);
723
- border-radius: 1rem;
724
- padding: 2rem;
725
- text-align: center;
726
- width: 60%;
727
- max-width: 400px;
728
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
729
- animation: pulse 2s infinite;
730
- }
731
-
732
- @keyframes pulse {
733
- 0% { transform: scale(1); }
734
- 50% { transform: scale(1.05); }
735
- 100% { transform: scale(1); }
736
- }
737
-
738
- .drop-indicator p {
739
- margin-bottom: 0.5rem;
740
- color: var(--text-color);
741
- font-size: 1.2rem;
742
- }
743
-
744
- .file-icon {
745
- font-size: 3rem;
746
- margin-bottom: 1rem;
747
- color: var(--primary-color);
748
- }
749
-
750
- .file-upload-icon {
751
- position: fixed;
752
- bottom: 20px;
753
- left: 20px;
754
- width: 45px;
755
- height: 45px;
756
- background-color: var(--card-bg);
757
- border-radius: 50%;
758
- display: flex;
759
- justify-content: center;
760
- align-items: center;
761
- cursor: pointer;
762
- z-index: 100;
763
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
764
- transition: transform 0.2s ease, box-shadow 0.2s ease;
765
- }
766
-
767
- .file-upload-icon:hover {
768
- transform: translateY(-2px);
769
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
770
- }
771
-
772
- .file-upload-icon span {
773
- font-size: 1.5rem;
774
- color: var(--primary-color);
775
- }
776
-
777
- .file-info {
778
- position: fixed;
779
- bottom: 20px;
780
- left: 75px;
781
- background-color: var(--card-bg);
782
- color: var(--primary-color);
783
- font-weight: 500;
784
- padding: 0.5rem 1rem;
785
- border-radius: 1rem;
786
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
787
- max-width: 270px;
788
- white-space: nowrap;
789
- overflow: hidden;
790
- text-overflow: ellipsis;
791
- z-index: 100;
792
- display: none;
793
- }
794
-
795
- .file-detach {
796
- margin-left: 8px;
797
- display: inline-block;
798
- width: 18px;
799
- height: 18px;
800
- background-color: rgba(255, 255, 255, 0.1);
801
- color: var(--text-color);
802
- border-radius: 50%;
803
- text-align: center;
804
- line-height: 16px;
805
- font-size: 12px;
806
- cursor: pointer;
807
- transition: all 0.2s ease;
808
- }
809
-
810
- .file-detach:hover {
811
- background-color: rgba(255, 0, 0, 0.2);
812
- color: #ff6b6b;
813
- transform: scale(1.1);
814
- }
815
-
816
- .preview-notice {
817
- background-color: #273c56;
818
- border: 1px solid #365a82;
819
- color: #89b4e8;
820
- padding: 0.75rem;
821
- border-radius: 0.5rem;
822
- margin-top: 1rem;
823
- font-size: 0.875rem;
824
- display: none;
825
- }
826
-
827
- .custom-model-wrapper {
828
- position: relative;
829
- }
830
-
831
- .model-badge {
832
- position: absolute;
833
- top: -10px;
834
- right: -5px;
835
- background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
836
- color: white;
837
- font-size: 0.7rem;
838
- font-weight: 700;
839
- padding: 0.25rem 0.5rem;
840
- border-radius: 999px;
841
- transform: scale(0);
842
- transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
843
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
844
- z-index: 10;
845
- }
846
-
847
- .model-badge.show {
848
- transform: scale(1);
849
- }
850
-
851
- .custom-model-help {
852
- display: inline-block;
853
- width: 16px;
854
- height: 16px;
855
- line-height: 16px;
856
- font-size: 11px;
857
- font-weight: bold;
858
- text-align: center;
859
- background-color: var(--secondary-text);
860
- color: var(--card-bg);
861
- border-radius: 50%;
862
- margin-left: 5px;
863
- cursor: help;
864
- vertical-align: middle;
865
- }
866
-
867
- .tooltip {
868
- position: absolute;
869
- top: 100%;
870
- left: 0;
871
- width: 280px;
872
- background-color: #333;
873
- color: #fff;
874
- padding: 0.75rem;
875
- border-radius: 0.5rem;
876
- font-size: 0.8rem;
877
- margin-top: 0.5rem;
878
- z-index: 100;
879
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
880
- opacity: 0;
881
- visibility: hidden;
882
- transition: opacity 0.2s, visibility 0.2s;
883
- }
884
-
885
- .custom-model-help:hover + .tooltip {
886
- opacity: 1;
887
- visibility: visible;
888
- }
889
-
890
- /* Tokenizer info icon and tooltip styles */
891
- .tokenizer-info-icon {
892
- display: inline-flex;
893
- align-items: center;
894
- justify-content: center;
895
- width: 24px;
896
- height: 24px;
897
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
898
- color: white;
899
- border-radius: 50%;
900
- position: absolute;
901
- left: -32px; /* Position to the left of the selector */
902
- top: 50%;
903
- transform: translateY(-50%);
904
- cursor: pointer;
905
- font-size: 12px;
906
- font-weight: bold;
907
- transition: all 0.2s ease;
908
- z-index: 10;
909
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
910
- }
911
-
912
- .tokenizer-info-icon:hover {
913
- transform: translateY(-50%) scale(1.1);
914
- box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
915
- }
916
-
917
- /* Watermark styles */
918
- .watermark {
919
- position: fixed;
920
- bottom: 20px;
921
- right: 20px;
922
- color: var(--primary-color);
923
- font-size: 1.4rem;
924
- font-weight: 700;
925
- opacity: 0.25; /* Semi-transparent */
926
- z-index: 100;
927
- transition: opacity 0.3s ease;
928
- text-decoration: none;
929
- pointer-events: auto; /* Ensure it remains clickable */
930
- }
931
-
932
- .watermark:hover {
933
- opacity: 0.6; /* Increase opacity on hover */
934
- }
935
-
936
- .tokenizer-info-tooltip {
937
- position: absolute;
938
- top: calc(100% + 8px);
939
- left: -30px; /* Adjust position to align with the icon */
940
- width: 300px;
941
- background-color: var(--card-bg);
942
- color: var(--text-color);
943
- border: 1px solid var(--primary-color);
944
- border-radius: 0.75rem;
945
- box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
946
- padding: 1rem;
947
- z-index: 1000; /* Increase z-index to ensure visibility */
948
- opacity: 0;
949
- visibility: hidden;
950
- transition: opacity 0.3s, visibility 0.3s;
951
- pointer-events: none; /* Initially disable pointer events */
952
- }
953
-
954
- .tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
955
- opacity: 1;
956
- visibility: visible;
957
- pointer-events: auto;
958
- }
959
-
960
- .tokenizer-info-tooltip:hover {
961
- opacity: 1;
962
- visibility: visible;
963
- pointer-events: auto;
964
- }
965
-
966
- .tokenizer-info-header {
967
- font-size: 1.1rem;
968
- font-weight: 600;
969
- margin-bottom: 0.5rem;
970
- padding-bottom: 0.5rem;
971
- border-bottom: 1px solid rgba(255, 255, 255, 0.1);
972
- color: var(--primary-color);
973
- }
974
-
975
- .tokenizer-info-grid {
976
- display: grid;
977
- grid-template-columns: repeat(2, 1fr);
978
- gap: 0.75rem;
979
- margin: 0.75rem 0;
980
- }
981
-
982
- .tokenizer-info-item {
983
- display: flex;
984
- flex-direction: column;
985
- }
986
-
987
- .tokenizer-info-label {
988
- font-size: 0.75rem;
989
- color: var(--secondary-text);
990
- margin-bottom: 0.25rem;
991
- }
992
-
993
- .tokenizer-info-value {
994
- font-size: 0.95rem;
995
- font-weight: 500;
996
- }
997
-
998
- .special-tokens-container {
999
- margin-top: 0.75rem;
1000
- background-color: rgba(15, 79, 155, 0.1);
1001
- border-radius: 0.5rem;
1002
- padding: 0.5rem;
1003
- max-height: 100px;
1004
- overflow-y: auto;
1005
- }
1006
-
1007
- .special-token-item {
1008
- display: flex;
1009
- justify-content: space-between;
1010
- margin-bottom: 0.25rem;
1011
- font-size: 0.8rem;
1012
- }
1013
-
1014
- .token-name {
1015
- color: var(--secondary-text);
1016
- }
1017
-
1018
- .token-value {
1019
- background-color: rgba(255, 255, 255, 0.1);
1020
- padding: 1px 4px;
1021
- border-radius: 2px;
1022
- font-family: monospace;
1023
- }
1024
-
1025
- .tokenizer-info-loading {
1026
- display: flex;
1027
- justify-content: center;
1028
- align-items: center;
1029
- height: 100px;
1030
- }
1031
-
1032
- .tokenizer-info-spinner {
1033
- width: 30px;
1034
- height: 30px;
1035
- border: 3px solid var(--primary-color);
1036
- border-radius: 50%;
1037
- border-top-color: transparent;
1038
- animation: spin 1s linear infinite;
1039
- }
1040
-
1041
- .tokenizer-info-error {
1042
- color: #f87171;
1043
- font-size: 0.9rem;
1044
- text-align: center;
1045
- padding: 1rem;
1046
- }
1047
-
1048
- @media (max-width: 768px) {
1049
- .header {
1050
- flex-direction: column;
1051
- align-items: stretch;
1052
- gap: 1rem;
1053
- }
1054
-
1055
- .model-selector {
1056
- width: 100%;
1057
- }
1058
-
1059
- .stats-grid {
1060
- grid-template-columns: 1fr;
1061
- }
1062
-
1063
- .tokenizer-info-tooltip {
1064
- width: 250px;
1065
- }
1066
- }
1067
- </style>
1068
- </head>
1069
- <body>
1070
- <!-- Hidden File Drop Zone that appears when dragging files -->
1071
- <div id="fileDropZone" class="file-drop-zone">
1072
- <div class="drop-indicator">
1073
- <div class="file-icon">📄</div>
1074
- <p>Drop your file here</p>
1075
- </div>
1076
- </div>
1077
-
1078
- <!-- File upload icon in bottom left corner -->
1079
- <div id="fileUploadIcon" class="file-upload-icon">
1080
- <span>📎</span>
1081
- </div>
1082
- <p class="file-info" id="fileInfo"></p>
1083
-
1084
- <div class="container">
1085
- <div class="header">
1086
- <div class="title-section">
1087
- <h1 class="title">Token Visualizer</h1>
1088
- <p class="subtitle">Advanced tokenization analysis and visualization</p>
1089
- </div>
1090
- <div class="model-selector">
1091
- <div class="model-selector-header">
1092
- <div class="model-type-toggle">
1093
- <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
1094
- <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
1095
- </div>
1096
- </div>
1097
- <div id="predefinedModelSelector">
1098
- <div style="position: relative;">
1099
- <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
1100
- <!-- TOOLTIP MOVED HERE -->
1101
- <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
1102
- <div id="tokenizerInfoContent">
1103
- <div class="tokenizer-info-loading">
1104
- <div class="tokenizer-info-spinner"></div>
1105
- </div>
1106
- </div>
1107
- </div>
1108
- <!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
1109
- <select id="modelSelect" name="model">
1110
- {% for model_id, info in models.items() %}
1111
- <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
1112
- {{ info.alias }}
1113
- </option>
1114
- {% endfor %}
1115
- </select>
1116
- </div>
1117
- </div>
1118
- <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
1119
- <div style="position: relative;">
1120
- <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
1121
- <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
1122
- <div id="customTokenizerInfoContent">
1123
- <div class="tokenizer-info-loading">
1124
- <div class="tokenizer-info-spinner"></div>
1125
- </div>
1126
- </div>
1127
- </div>
1128
- <input type="text" id="customModelInput" class="custom-model-input"
1129
- placeholder="Enter HuggingFace model path"
1130
- value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
1131
- </div>
1132
- <span class="custom-model-help">?</span>
1133
- <div class="tooltip">
1134
- Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
1135
- The model must have a tokenizer available and must be not restricted. (with some exceptions)
1136
- Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
1137
- Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
1138
- </div>
1139
- <div class="model-badge" id="modelSuccessBadge">Loaded</div>
1140
- </div>
1141
- </div>
1142
- </div>
1143
-
1144
- <div class="error-message" id="errorMessage">{{ error }}</div>
1145
-
1146
- <div class="input-section">
1147
- <form id="analyzeForm" method="POST" enctype="multipart/form-data">
1148
- <textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
1149
- <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
1150
- <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
1151
- <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
1152
- <input type="file" name="file" id="fileInput" style="display: none;">
1153
- <div class="button-container">
1154
- <button type="submit" id="analyzeButton">Analyze Text</button>
1155
- </div>
1156
- </form>
1157
- </div>
1158
-
1159
- <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
1160
- <div class="card">
1161
- <h2 class="card-title">Token Visualization</h2>
1162
- <div class="preview-notice" id="previewNotice">
1163
- Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
1164
- </div>
1165
- <div class="token-container" id="tokenContainer">
1166
- {% if token_data %}
1167
- {% for token in token_data.tokens %}
1168
- <span class="token"
1169
- style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
1170
- title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
1171
- {{ token.display }}
1172
- </span>
1173
- {% if token.newline %}<br>{% endif %}
1174
- {% endfor %}
1175
- {% endif %}
1176
- </div>
1177
- <button class="expand-button" id="expandButton">Show More</button>
1178
- <div class="display-limit-notice" id="displayLimitNotice">
1179
- Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
1180
- </div>
1181
- </div>
1182
-
1183
- <div class="stats-grid">
1184
- <div class="stat-card">
1185
- <div class="stat-title">Total Tokens</div>
1186
- <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
1187
- <div class="stat-description">
1188
- <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
1189
- (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
1190
- </div>
1191
- </div>
1192
- <div class="stat-card">
1193
- <div class="stat-title">Token Types</div>
1194
- <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
1195
- <div class="stat-description">special tokens</div>
1196
- </div>
1197
- <div class="stat-card">
1198
- <div class="stat-title">Whitespace</div>
1199
- <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
1200
- <div class="stat-description">
1201
- spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
1202
- newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
1203
- </div>
1204
- </div>
1205
- <div class="stat-card">
1206
- <div class="stat-title">Token Length</div>
1207
- <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
1208
- <div class="stat-description">
1209
- median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
1210
- ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
1211
- </div>
1212
- </div>
1213
- <div class="stat-card">
1214
- <div class="stat-title">Compression</div>
1215
- <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
1216
- <div class="stat-description">characters per token</div>
1217
- </div>
1218
- </div>
1219
- </div>
1220
- </div>
1221
- <a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" class="watermark">
1222
- @bartar/tokenizers
1223
- </a>
1224
-
1225
- <script>
1226
- $(document).ready(function() {
1227
- // File handling variables
1228
- let currentFile = null;
1229
- let originalTextContent = null;
1230
- let lastUploadedFileName = null;
1231
- let fileJustUploaded = false; // Flag to prevent immediate detachment
1232
- let currentModelType = "{{ model_type if model_type else 'predefined' }}";
1233
- let currentTokenizerInfo = null;
1234
-
1235
- // Try to parse tokenizer info if available from server
1236
- try {
1237
- currentTokenizerInfo = {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }};
1238
- if (currentTokenizerInfo) {
1239
- updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
1240
- }
1241
- } catch(e) {
1242
- console.error("Error parsing tokenizer info:", e);
1243
- }
1244
-
1245
- // Show error if exists
1246
- if ("{{ error }}".length > 0) {
1247
- showError("{{ error }}");
1248
- }
1249
-
1250
- // Setup model type based on initial state
1251
- if (currentModelType === "custom") {
1252
- $('.toggle-option').removeClass('active');
1253
- $('.custom-toggle').addClass('active');
1254
- $('#predefinedModelSelector').hide();
1255
- $('#customModelSelector').show();
1256
- }
1257
-
1258
- // Show success badge if custom model loaded successfully
1259
- if (currentModelType === "custom" && !("{{ error }}".length > 0)) {
1260
- $('#modelSuccessBadge').addClass('show');
1261
- setTimeout(() => {
1262
- $('#modelSuccessBadge').removeClass('show');
1263
- }, 3000);
1264
- }
1265
-
1266
- // Toggle between predefined and custom model inputs
1267
- $('.toggle-option').click(function() {
1268
- const modelType = $(this).data('type');
1269
- $('.toggle-option').removeClass('active');
1270
- $(this).addClass('active');
1271
- currentModelType = modelType;
1272
-
1273
- if (modelType === 'predefined') {
1274
- $('#predefinedModelSelector').show();
1275
- $('#customModelSelector').hide();
1276
- $('#modelTypeInput').val('predefined');
1277
- // Set the model input value to the selected predefined model
1278
- $('#modelInput').val($('#modelSelect').val());
1279
- } else {
1280
- $('#predefinedModelSelector').hide();
1281
- $('#customModelSelector').show();
1282
- $('#modelTypeInput').val('custom');
1283
- }
1284
-
1285
- // Clear tokenizer info if switching models
1286
- if (modelType === 'predefined') {
1287
- $('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1288
- fetchTokenizerInfo($('#modelSelect').val(), false);
1289
- } else {
1290
- $('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1291
- // Only fetch if there's a custom model value
1292
- const customModel = $('#customModelInput').val();
1293
- if (customModel) {
1294
- fetchTokenizerInfo(customModel, true);
1295
- }
1296
- }
1297
- });
1298
-
1299
- // Update hidden input when custom model input changes
1300
- $('#customModelInput').on('input', function() {
1301
- $('#customModelInputHidden').val($(this).val());
1302
- });
1303
-
1304
- function showError(message) {
1305
- const errorDiv = $('#errorMessage');
1306
- errorDiv.text(message);
1307
- errorDiv.show();
1308
- setTimeout(() => errorDiv.fadeOut(), 5000);
1309
- }
1310
-
1311
- // Function to update tokenizer info display in tooltip
1312
- function updateTokenizerInfoDisplay(info, isCustom = false) {
1313
- const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1314
- let htmlContent = '';
1315
-
1316
-
1317
- if (info.error) {
1318
- $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
1319
- return;
1320
- }
1321
-
1322
- // Start building the tooltip content
1323
- htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
1324
- <div class="tokenizer-info-grid">`;
1325
-
1326
- // Dictionary size
1327
- if (info.vocab_size) {
1328
- htmlContent += `
1329
- <div class="tokenizer-info-item">
1330
- <span class="tokenizer-info-label">Dictionary Size</span>
1331
- <span class="tokenizer-info-value">${info.vocab_size.toLocaleString()}</span>
1332
- </div>`;
1333
- }
1334
-
1335
- // Tokenizer type
1336
- if (info.tokenizer_type) {
1337
- htmlContent += `
1338
- <div class="tokenizer-info-item">
1339
- <span class="tokenizer-info-label">Tokenizer Type</span>
1340
- <span class="tokenizer-info-value">${info.tokenizer_type}</span>
1341
- </div>`;
1342
- }
1343
-
1344
-
1345
- // Max length
1346
- if (info.model_max_length) {
1347
- htmlContent += `
1348
- <div class="tokenizer-info-item">
1349
- <span class="tokenizer-info-label">Max Length</span>
1350
- <span class="tokenizer-info-value">${info.model_max_length.toLocaleString()}</span>
1351
- </div>`;
1352
- }
1353
-
1354
- htmlContent += `</div>`; // Close tokenizer-info-grid
1355
-
1356
- // Special tokens section
1357
- if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
1358
- htmlContent += `
1359
- <div class="tokenizer-info-item" style="margin-top: 0.75rem;">
1360
- <span class="tokenizer-info-label">Special Tokens</span>
1361
- <div class="special-tokens-container">`;
1362
-
1363
- // Add each special token with proper escaping for HTML special characters
1364
- for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
1365
- // Properly escape HTML special characters
1366
- const escapedValue = tokenValue
1367
- .replace(/&/g, '&amp;')
1368
- .replace(/</g, '&lt;')
1369
- .replace(/>/g, '&gt;')
1370
- .replace(/"/g, '&quot;')
1371
- .replace(/'/g, '&#039;');
1372
-
1373
- htmlContent += `
1374
- <div class="special-token-item">
1375
- <span class="token-name">${tokenName}:</span>
1376
- <span class="token-value">${escapedValue}</span>
1377
- </div>`;
1378
- }
1379
-
1380
- htmlContent += `
1381
- </div>
1382
- </div>`;
1383
- }
1384
-
1385
- $(targetSelector).html(htmlContent);
1386
- }
1387
-
1388
- // Function to fetch tokenizer info
1389
- function fetchTokenizerInfo(modelId, isCustom = false) {
1390
- if (!modelId) return;
1391
-
1392
- const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1393
- $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1394
-
1395
- $.ajax({
1396
- url: '/tokenizer-info',
1397
- method: 'GET',
1398
- data: {
1399
- model_id: modelId,
1400
- is_custom: isCustom
1401
- },
1402
- success: function(response) {
1403
- if (response.error) {
1404
- $(targetSelector).html(`<div class="tokenizer-info-error">${response.error}</div>`);
1405
- } else {
1406
- currentTokenizerInfo = response;
1407
- updateTokenizerInfoDisplay(response, isCustom);
1408
- }
1409
- },
1410
- error: function(xhr) {
1411
- $(targetSelector).html('<div class="tokenizer-info-error">Failed to load tokenizer information</div>');
1412
- }
1413
- });
1414
- }
1415
-
1416
- function updateResults(data) {
1417
- $('#results').show();
1418
-
1419
- // Update tokens
1420
- const tokenContainer = $('#tokenContainer');
1421
- tokenContainer.empty();
1422
- data.tokens.forEach(token => {
1423
- const span = $('<span>')
1424
- .addClass('token')
1425
- .css({
1426
- 'background-color': token.colors.background,
1427
- 'color': token.colors.text
1428
- })
1429
- // Include token id in the tooltip on hover
1430
- .attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`)
1431
- .text(token.display);
1432
-
1433
- tokenContainer.append(span);
1434
- if (token.newline) {
1435
- tokenContainer.append('<br>');
1436
- }
1437
- });
1438
-
1439
- // Update display limit notice
1440
- if (data.display_limit_reached) {
1441
- $('#displayLimitNotice').show();
1442
- $('#totalTokenCount').text(data.total_tokens);
1443
- } else {
1444
- $('#displayLimitNotice').hide();
1445
- }
1446
-
1447
- // Update preview notice
1448
- if (data.preview_only) {
1449
- $('#previewNotice').show();
1450
- } else {
1451
- $('#previewNotice').hide();
1452
- }
1453
-
1454
- // Update basic stats
1455
- $('#totalTokens').text(data.stats.basic_stats.total_tokens);
1456
- $('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`);
1457
- $('#uniquePercentage').text(data.stats.basic_stats.unique_percentage);
1458
- $('#specialTokens').text(data.stats.basic_stats.special_tokens);
1459
- $('#spaceTokens').text(data.stats.basic_stats.space_tokens);
1460
- $('#spaceCount').text(data.stats.basic_stats.space_tokens);
1461
- $('#newlineCount').text(data.stats.basic_stats.newline_tokens);
1462
- $('#compressionRatio').text(data.stats.basic_stats.compression_ratio);
1463
-
1464
- // Update length stats
1465
- $('#avgLength').text(data.stats.length_stats.avg_length);
1466
- $('#medianLength').text(data.stats.length_stats.median_length);
1467
- $('#stdDev').text(data.stats.length_stats.std_dev);
1468
-
1469
- // Update tokenizer info if available
1470
- if (data.tokenizer_info) {
1471
- currentTokenizerInfo = data.tokenizer_info;
1472
- updateTokenizerInfoDisplay(data.tokenizer_info, currentModelType === 'custom');
1473
- }
1474
- }
1475
-
1476
- // Handle text changes to detach file
1477
- $('#textInput').on('input', function() {
1478
- // Skip if file was just uploaded (prevents immediate detachment)
1479
- if (fileJustUploaded) {
1480
- fileJustUploaded = false;
1481
- return;
1482
- }
1483
-
1484
- const currentText = $(this).val();
1485
- const fileInput = document.getElementById('fileInput');
1486
-
1487
- // Only detach if a file exists and text has been substantially modified
1488
- if (fileInput.files.length > 0 && originalTextContent !== null) {
1489
- // Check if the text is completely different or has been significantly changed
1490
- // This allows for small edits without detaching
1491
- const isMajorChange =
1492
- currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
1493
- (currentText.length > 0 &&
1494
- currentText !== originalTextContent.substring(0, currentText.length) &&
1495
- currentText.substring(0, Math.min(20, currentText.length)) !==
1496
- originalTextContent.substring(0, Math.min(20, currentText.length)));
1497
-
1498
- if (isMajorChange) {
1499
- detachFile();
1500
- }
1501
- }
1502
- });
1503
-
1504
- // Function to detach file
1505
- function detachFile() {
1506
- // Clear the file input
1507
- $('#fileInput').val('');
1508
- // Hide file info
1509
- $('#fileInfo').fadeOut(300);
1510
- // Reset the original content tracker
1511
- originalTextContent = $('#textInput').val();
1512
- // Reset last uploaded filename
1513
- lastUploadedFileName = null;
1514
- }
1515
-
1516
- // For model changes
1517
- $('#modelSelect').change(function() {
1518
- const selectedModel = $(this).val();
1519
- $('#modelInput').val(selectedModel);
1520
-
1521
- // Fetch tokenizer info for the selected model
1522
- fetchTokenizerInfo(selectedModel, false);
1523
-
1524
- // If text exists, submit the form
1525
- if ($('#textInput').val().trim()) {
1526
- $('#analyzeForm').submit();
1527
- }
1528
- });
1529
-
1530
- // File drop handling
1531
- const fileDropZone = $('#fileDropZone');
1532
- const fileUploadIcon = $('#fileUploadIcon');
1533
-
1534
- // Prevent default drag behaviors
1535
- ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
1536
- fileDropZone[0].addEventListener(eventName, preventDefaults, false);
1537
- document.body.addEventListener(eventName, preventDefaults, false);
1538
- });
1539
-
1540
- function preventDefaults(e) {
1541
- e.preventDefault();
1542
- e.stopPropagation();
1543
- }
1544
-
1545
- // Show drop zone when file is dragged over the document
1546
- document.addEventListener('dragenter', showDropZone, false);
1547
- document.addEventListener('dragover', showDropZone, false);
1548
-
1549
- fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
1550
- fileDropZone[0].addEventListener('drop', hideDropZone, false);
1551
-
1552
- function showDropZone(e) {
1553
- fileDropZone.addClass('active');
1554
- }
1555
-
1556
- function hideDropZone() {
1557
- fileDropZone.removeClass('active');
1558
- }
1559
-
1560
- // Handle dropped files
1561
- fileDropZone[0].addEventListener('drop', handleDrop, false);
1562
-
1563
- function handleDrop(e) {
1564
- const dt = e.dataTransfer;
1565
- const files = dt.files;
1566
- handleFiles(files);
1567
- }
1568
-
1569
- // Also handle file selection via click on the icon
1570
- fileUploadIcon.on('click', function() {
1571
- const input = document.createElement('input');
1572
- input.type = 'file';
1573
- input.onchange = e => {
1574
- handleFiles(e.target.files);
1575
- };
1576
- input.click();
1577
- });
1578
-
1579
- function handleFiles(files) {
1580
- if (files.length) {
1581
- const file = files[0];
1582
- currentFile = file;
1583
- lastUploadedFileName = file.name;
1584
- fileJustUploaded = true; // Set flag to prevent immediate detachment
1585
-
1586
- // Show file info with animation and add detach button
1587
- $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
1588
-
1589
- // Add click handler for detach button
1590
- $('#fileDetach').on('click', function(e) {
1591
- e.stopPropagation(); // Prevent event bubbling
1592
- detachFile();
1593
- return false;
1594
- });
1595
-
1596
- // Set the file to the file input
1597
- const dataTransfer = new DataTransfer();
1598
- dataTransfer.items.add(file);
1599
- document.getElementById('fileInput').files = dataTransfer.files;
1600
-
1601
- // Preview in textarea (first 8096 chars)
1602
- const reader = new FileReader();
1603
- reader.onload = function(e) {
1604
- const previewText = e.target.result.slice(0, 8096);
1605
- $('#textInput').val(previewText);
1606
-
1607
- // Store this as the original content AFTER setting the value
1608
- // to prevent the input event from firing and detaching immediately
1609
- setTimeout(() => {
1610
- originalTextContent = previewText;
1611
- // Automatically submit for analysis
1612
- $('#analyzeForm').submit();
1613
- }, 50);
1614
- };
1615
- reader.readAsText(file);
1616
- }
1617
- }
1618
-
1619
- function formatFileSize(bytes) {
1620
- if (bytes < 1024) return bytes + ' bytes';
1621
- else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB';
1622
- else return (bytes / 1048576).toFixed(1) + ' MB';
1623
- }
1624
-
1625
- // Make sure to check if there's still a file when analyzing
1626
- $('#analyzeForm').on('submit', function(e) {
1627
- e.preventDefault();
1628
-
1629
- // Skip detachment check if file was just uploaded
1630
- if (!fileJustUploaded) {
1631
- // Check if text has been changed but file is still attached
1632
- const textInput = $('#textInput').val();
1633
- const fileInput = document.getElementById('fileInput');
1634
-
1635
- if (fileInput.files.length > 0 &&
1636
- originalTextContent !== null &&
1637
- textInput !== originalTextContent &&
1638
- textInput.length < originalTextContent.length * 0.8) {
1639
- // Text was significantly changed but file is still attached, detach it
1640
- detachFile();
1641
- }
1642
- } else {
1643
- // Reset flag after first submission
1644
- fileJustUploaded = false;
1645
- }
1646
-
1647
- // Update the hidden inputs based on current model type
1648
- if (currentModelType === 'custom') {
1649
- $('#customModelInputHidden').val($('#customModelInput').val());
1650
- } else {
1651
- $('#modelInput').val($('#modelSelect').val());
1652
- }
1653
-
1654
- const formData = new FormData(this);
1655
- $('#analyzeButton').prop('disabled', true);
1656
-
1657
- $.ajax({
1658
- url: '/',
1659
- method: 'POST',
1660
- data: formData,
1661
- processData: false,
1662
- contentType: false,
1663
- success: function(response) {
1664
- if (response.error) {
1665
- showError(response.error);
1666
- } else {
1667
- updateResults(response);
1668
-
1669
- // Show success badge if custom model
1670
- if (currentModelType === 'custom') {
1671
- $('#modelSuccessBadge').addClass('show');
1672
- setTimeout(() => {
1673
- $('#modelSuccessBadge').removeClass('show');
1674
- }, 3000);
1675
- }
1676
- }
1677
- },
1678
- error: function(xhr) {
1679
- showError(xhr.responseText || 'An error occurred while processing the text');
1680
- },
1681
- complete: function() {
1682
- $('#analyzeButton').prop('disabled', false);
1683
- }
1684
- });
1685
- });
1686
-
1687
- $('#expandButton').click(function() {
1688
- const container = $('#tokenContainer');
1689
- const isExpanded = container.hasClass('expanded');
1690
-
1691
- container.toggleClass('expanded');
1692
- $(this).text(isExpanded ? 'Show More' : 'Show Less');
1693
- });
1694
-
1695
- // Initialize tokenizer info for current model
1696
- if (currentModelType === 'predefined') {
1697
- fetchTokenizerInfo($('#modelSelect').val(), false);
1698
- } else if ($('#customModelInput').val()) {
1699
- fetchTokenizerInfo($('#customModelInput').val(), true);
1700
- }
1701
-
1702
- // Add event listener for custom model input
1703
- $('#customModelInput').on('change', function() {
1704
- const modelValue = $(this).val();
1705
- if (modelValue) {
1706
- fetchTokenizerInfo(modelValue, true);
1707
- }
1708
- });
1709
- });
1710
- </script>
1711
- </body>
1712
- </html>
1713
- """
1714
-
1715
- @app.route('/tokenizer-info', methods=['GET'])
1716
- def tokenizer_info():
1717
- """
1718
- Endpoint to get tokenizer information without processing text.
1719
- """
1720
- model_id = request.args.get('model_id', '')
1721
- is_custom = request.args.get('is_custom', 'false').lower() == 'true'
1722
-
1723
- if not model_id:
1724
- return jsonify({"error": "No model ID provided"}), 400
1725
-
1726
- try:
1727
- # For predefined models, use the model name from the dictionary
1728
- if not is_custom and model_id in TOKENIZER_MODELS:
1729
- model_id_or_name = model_id
1730
- else:
1731
- # For custom models, use the model ID directly
1732
- model_id_or_name = model_id
1733
-
1734
- # Load the tokenizer and get info
1735
- tokenizer, info, error = load_tokenizer(model_id_or_name)
1736
-
1737
- if error:
1738
- return jsonify({"error": error}), 400
1739
-
1740
- return jsonify(info)
1741
- except Exception as e:
1742
- return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
1743
-
1744
- @app.route('/', methods=['GET', 'POST'])
1745
- def index():
1746
- text = ""
1747
- token_data = None
1748
- error_message = ""
1749
- selected_model = request.args.get('model', request.form.get('model', 'qwen3'))
1750
- custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
1751
- model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))
1752
-
1753
- # Determine which model to use based on model_type
1754
- model_to_use = selected_model if model_type == 'predefined' else custom_model
1755
-
1756
- if request.method == 'POST':
1757
- # Check if file upload
1758
- if 'file' in request.files and request.files['file'].filename:
1759
- uploaded_file = request.files['file']
1760
- # Save file to tmp directory
1761
- file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
1762
- uploaded_file.save(file_path)
1763
-
1764
- # Read a small preview of the file
1765
- with open(file_path, 'r', errors='replace') as f:
1766
- text = f.read(8096)
1767
-
1768
- try:
1769
- # Process the file
1770
- token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
1771
-
1772
- # Clean up the file after processing
1773
- if os.path.exists(file_path):
1774
- os.remove(file_path)
1775
-
1776
- # If request is AJAX, return JSON
1777
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1778
- return jsonify(token_data)
1779
-
1780
- except Exception as e:
1781
- error_message = str(e)
1782
- # Clean up the file after processing
1783
- if os.path.exists(file_path):
1784
- os.remove(file_path)
1785
-
1786
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1787
- return jsonify({"error": error_message}), 400
1788
- return render_template_string(
1789
- HTML_TEMPLATE,
1790
- text=text,
1791
- token_data=None,
1792
- models=TOKENIZER_MODELS,
1793
- selected_model=selected_model,
1794
- custom_model=custom_model,
1795
- model_type=model_type,
1796
- error=error_message
1797
- )
1798
-
1799
- # Regular text processing
1800
- else:
1801
- text = request.form.get('text', '')
1802
- if text:
1803
- try:
1804
- token_data = process_text(text, model_to_use)
1805
-
1806
- # If request is AJAX, return JSON
1807
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1808
- return jsonify(token_data)
1809
-
1810
- except Exception as e:
1811
- error_message = str(e)
1812
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1813
- return jsonify({"error": error_message}), 400
1814
- return render_template_string(
1815
- HTML_TEMPLATE,
1816
- text=text,
1817
- token_data=None,
1818
- models=TOKENIZER_MODELS,
1819
- selected_model=selected_model,
1820
- custom_model=custom_model,
1821
- model_type=model_type,
1822
- error=error_message
1823
- )
1824
-
1825
- return render_template_string(
1826
- HTML_TEMPLATE,
1827
- text=text,
1828
- token_data=token_data,
1829
- models=TOKENIZER_MODELS,
1830
- selected_model=selected_model,
1831
- custom_model=custom_model,
1832
- model_type=model_type,
1833
- error=error_message
1834
- )
1835
-
1836
- if __name__ == "__main__":
1837
- app.run(host='0.0.0.0', port=7860, debug=False)
 
1
#!/usr/bin/env python3
"""
Tokenizer Pro - HuggingFace Space Launcher

HuggingFace Spaces expects an ``app.py`` entry point; this module builds
the Flask application from the restructured app/ package and runs it.
"""

import os

from app import create_app
from config import Config, DevelopmentConfig, ProductionConfig

# FLASK_ENV value -> configuration class; anything unrecognized falls
# back to the base Config.
_CONFIG_BY_ENV = {
    'production': ProductionConfig,
    'development': DevelopmentConfig,
}


def get_config_class():
    """Determine which configuration class to use based on environment."""
    env = os.getenv('FLASK_ENV', 'development').lower()
    return _CONFIG_BY_ENV.get(env, Config)


# Create the Flask application using the app factory
app = create_app(get_config_class())

if __name__ == "__main__":
    # Runtime settings come from the environment so the container can
    # override them without code changes.
    host = os.getenv('HOST', '0.0.0.0')
    port = int(os.getenv('PORT', 7860))
    debug = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes')

    app.run(host=host, port=port, debug=debug)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/__init__.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging
from logging.handlers import RotatingFileHandler
from flask import Flask
from config import Config


def create_app(config_class=Config):
    """Create and configure a Flask application (application-factory pattern).

    Args:
        config_class: Object passed to ``app.config.from_object``;
            defaults to the base ``Config``.

    Returns:
        The configured :class:`flask.Flask` instance with the main
        blueprint registered and, outside debug/testing, rotating file
        logging enabled.
    """
    app = Flask(__name__)
    app.config.from_object(config_class)

    # exist_ok=True avoids the check-then-create race of the previous
    # os.path.exists()/os.makedirs() pair (e.g. two workers starting at
    # once) while behaving identically when the directory already exists.
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    os.makedirs(app.config['HF_HOME'], exist_ok=True)

    # Point HuggingFace libraries at the writable cache location.
    os.environ['HF_HOME'] = app.config['HF_HOME']
    if 'HF_CACHE_DIR' in app.config:
        os.environ['HF_CACHE_DIR'] = app.config['HF_CACHE_DIR']

    # Register Blueprints (imported here rather than at module top to
    # avoid a circular import with app.routes).
    from app.routes import main_bp
    app.register_blueprint(main_bp)

    # File logging is only wanted for real deployments.
    if not app.debug and not app.testing:
        os.makedirs('logs', exist_ok=True)

        file_handler = RotatingFileHandler(
            f'logs/{app.config.get("LOG_FILE", "tokenizer_pro.log")}',
            maxBytes=app.config.get("LOG_MAX_BYTES", 10 * 1024 * 1024),
            backupCount=app.config.get("LOG_BACKUP_COUNT", 3)
        )
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
        ))

        log_level = getattr(logging, app.config.get("LOG_LEVEL", "INFO").upper())
        file_handler.setLevel(log_level)
        app.logger.addHandler(file_handler)
        app.logger.setLevel(log_level)
        app.logger.info('Tokenizer Pro startup')

    return app
app/routes.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from flask import Blueprint, request, render_template, jsonify, current_app
3
+
4
+ # Import services
5
+ from .services.tokenizer_service import tokenizer_service
6
+ from .services.file_service import file_service
7
+ from .utils.validators import validators, ValidationError
8
+
9
+ # Create Blueprint
10
+ main_bp = Blueprint('main', __name__)
11
+
12
+
13
@main_bp.route('/tokenizer-info', methods=['GET'])
def tokenizer_info():
    """Return metadata about a tokenizer without processing any text.

    Query params:
        model_id: Predefined model key or a custom HF model path.
        is_custom: 'true' when model_id is user-supplied and must be
            validated before loading.

    Returns JSON tokenizer info on success, or {"error": ...} with 400
    (bad input / load failure) or 500 (unexpected failure).
    """
    model_id = request.args.get('model_id', '')
    is_custom = request.args.get('is_custom', 'false').lower() == 'true'

    if not model_id:
        return jsonify({"error": "No model ID provided"}), 400

    try:
        # Custom model paths come from user input, so validate them first.
        if is_custom:
            try:
                validators.validate_model_path(model_id)
            except ValidationError as e:
                return jsonify({"error": str(e)}), 400

        # load_tokenizer resolves both predefined keys and custom paths,
        # so the identifier is passed straight through (the previous
        # if/else here assigned the same value on both branches).
        tokenizer, info, error = tokenizer_service.load_tokenizer(model_id)

        if error:
            return jsonify({"error": error}), 400

        return jsonify(info)
    except Exception as e:
        return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
46
+
47
def _wants_json():
    """True when the request came from the AJAX frontend."""
    return request.headers.get('X-Requested-With') == 'XMLHttpRequest'


def _render_index(text, token_data, selected_model, custom_model, model_type, error_message):
    """Render the main page with the standard template context."""
    return render_template(
        'index.html',
        text=text,
        token_data=token_data,
        models=tokenizer_service.TOKENIZER_MODELS,
        selected_model=selected_model,
        custom_model=custom_model,
        model_type=model_type,
        error=error_message
    )


@main_bp.route('/', methods=['GET', 'POST'])
def index():
    """Main page: tokenize pasted text or an uploaded file.

    GET renders the empty form.  POST processes either an uploaded file
    (the 'file' field) or pasted text (the 'text' field) with the
    selected tokenizer.  AJAX requests receive JSON; regular requests
    get the rendered page.  Errors are reported as {"error": ...} with
    status 400 (AJAX) or via the template's error slot.
    """
    text = ""
    token_data = None
    error_message = ""
    selected_model = request.args.get('model', request.form.get('model', 'qwen3'))
    custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
    model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))

    # The identifier actually handed to the tokenizer service.
    model_to_use = selected_model if model_type == 'predefined' else custom_model

    if request.method == 'POST':
        # File-upload branch.
        if 'file' in request.files and request.files['file'].filename:
            uploaded_file = request.files['file']

            try:
                # Reject unsafe names/extensions before touching the disk.
                validators.validate_filename(uploaded_file.filename)
                validators.validate_file_extension(uploaded_file.filename, file_service.ALLOWED_EXTENSIONS)

                if model_type == 'custom' and custom_model:
                    validators.validate_model_path(custom_model)

                file_path = file_service.save_uploaded_file(uploaded_file, current_app.config['UPLOAD_FOLDER'])

                # Keep a short preview of the file for the form textarea.
                preview_char_limit = current_app.config.get('PREVIEW_CHAR_LIMIT', 8096)
                with open(file_path, 'r', errors='replace') as f:
                    text = f.read(preview_char_limit)

                try:
                    token_data = file_service.process_file_for_tokenization(
                        file_path=file_path,
                        model_id_or_name=model_to_use,
                        preview_char_limit=preview_char_limit,
                        max_display_tokens=current_app.config.get('MAX_DISPLAY_TOKENS', 50000),
                        chunk_size=current_app.config.get('CHUNK_SIZE', 1024 * 1024)
                    )

                    # Remove the temp file whether or not the client is AJAX.
                    file_service.cleanup_file(file_path)

                    if _wants_json():
                        return jsonify(token_data)

                except Exception as e:
                    error_message = str(e)
                    file_service.cleanup_file(file_path)

                    if _wants_json():
                        return jsonify({"error": error_message}), 400
                    return _render_index(text, None, selected_model, custom_model, model_type, error_message)

            except ValidationError as e:
                error_message = str(e)
                if _wants_json():
                    return jsonify({"error": error_message}), 400
                # Validation failed before a preview existed, so render empty text.
                return _render_index("", None, selected_model, custom_model, model_type, error_message)

        # Plain-text branch.
        else:
            text = request.form.get('text', '')
            if text:
                try:
                    validators.validate_text_input(text)

                    if model_type == 'custom' and custom_model:
                        validators.validate_model_path(custom_model)

                    token_data = file_service.process_text_for_tokenization(
                        text=text,
                        model_id_or_name=model_to_use,
                        preview_char_limit=current_app.config.get('PREVIEW_CHAR_LIMIT', 8096),
                        max_display_tokens=current_app.config.get('MAX_DISPLAY_TOKENS', 50000)
                    )

                    if _wants_json():
                        return jsonify(token_data)

                except ValidationError as e:
                    error_message = str(e)
                    if _wants_json():
                        return jsonify({"error": error_message}), 400
                    return _render_index(text, None, selected_model, custom_model, model_type, error_message)
                except Exception as e:
                    error_message = str(e)
                    if _wants_json():
                        return jsonify({"error": error_message}), 400
                    return _render_index(text, None, selected_model, custom_model, model_type, error_message)

    # GET, successful non-AJAX POST, or POST with no input.
    return _render_index(text, token_data, selected_model, custom_model, model_type, error_message)
193
+
194
+
195
@main_bp.route('/health', methods=['GET'])
def health_check():
    """Basic liveness check: return app status and versions as JSON.

    Fix: the previous version imported psutil here without using it, so
    the basic liveness probe would 500 on any host where psutil is not
    installed (the ImportError fired before the try block).
    """
    import time
    from flask import __version__ as flask_version

    try:
        status = {
            "status": "healthy",
            "timestamp": int(time.time()),
            "version": "1.0.0",
            "flask_version": flask_version,
            # NOTE(review): this is just the current timestamp, not a real
            # uptime — kept as-is for response-shape compatibility.
            "uptime": int(time.time()),
        }

        return jsonify(status), 200
    except Exception as e:
        return jsonify({
            "status": "unhealthy",
            "error": str(e),
            "timestamp": int(time.time())
        }), 500
219
+
220
+
221
@main_bp.route('/health/detailed', methods=['GET'])
def detailed_health_check():
    """Detailed health check with system and service status.

    Responds 200 when healthy, 503 when degraded/unhealthy, 500 on an
    unexpected failure while gathering the report.
    """
    import time
    import psutil
    import os
    from flask import __version__ as flask_version

    try:
        # --- System metrics -------------------------------------------------
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        system_info = {
            "cpu_percent": round(cpu_percent, 1),
            "memory": {
                "total": memory.total,
                "available": memory.available,
                "percent": memory.percent,
                "used": memory.used,
            },
            "disk": {
                "total": disk.total,
                "used": disk.used,
                "free": disk.free,
                "percent": round((disk.used / disk.total) * 100, 1),
            },
        }

        # --- Tokenizer service: probe with a small well-known model ---------
        tokenizer_status = "healthy"
        tokenizer_cache_size = len(tokenizer_service.tokenizers) + len(tokenizer_service.custom_tokenizers)
        try:
            test_tokenizer, _, error = tokenizer_service.load_tokenizer('gpt2')
            if error:
                tokenizer_status = f"warning: {error}"
        except Exception as e:
            tokenizer_status = f"error: {str(e)}"

        # --- Upload directory availability ----------------------------------
        upload_folder = current_app.config.get('UPLOAD_FOLDER', '/tmp')
        upload_dir_exists = os.path.exists(upload_folder)
        upload_dir_writable = os.access(upload_folder, os.W_OK) if upload_dir_exists else False

        services_info = {
            "tokenizer_service": {
                "status": tokenizer_status,
                "cached_tokenizers": tokenizer_cache_size,
                "available_models": len(tokenizer_service.TOKENIZER_MODELS),
            },
            "file_service": {
                "upload_directory": upload_folder,
                "directory_exists": upload_dir_exists,
                "directory_writable": upload_dir_writable,
                "allowed_extensions": list(file_service.ALLOWED_EXTENSIONS),
            },
        }

        configuration_info = {
            "max_content_length": current_app.config.get('MAX_CONTENT_LENGTH'),
            "cache_expiration": current_app.config.get('CACHE_EXPIRATION', 3600),
            "max_display_tokens": current_app.config.get('MAX_DISPLAY_TOKENS', 50000),
            "preview_char_limit": current_app.config.get('PREVIEW_CHAR_LIMIT', 8096),
        }

        # A tokenizer error makes the app unhealthy; a tokenizer warning or
        # an unwritable upload directory only degrades it.
        if tokenizer_status.startswith("error"):
            overall_status = "unhealthy"
        elif tokenizer_status.startswith("warning") or not upload_dir_writable:
            overall_status = "degraded"
        else:
            overall_status = "healthy"

        status = {
            "status": overall_status,
            "timestamp": int(time.time()),
            "version": "1.0.0",
            "flask_version": flask_version,
            "system": system_info,
            "services": services_info,
            "configuration": configuration_info,
        }

        return jsonify(status), 200 if overall_status == "healthy" else 503

    except Exception as e:
        return jsonify({
            "status": "unhealthy",
            "error": str(e),
            "timestamp": int(time.time())
        }), 500
310
+
311
+
312
@main_bp.route('/health/ready', methods=['GET'])
def readiness_check():
    """Readiness check - determines if the application is ready to serve requests.

    Bug fix: ``time`` was never imported in this module (the other health
    endpoints import it function-locally, this one did not), so every call
    raised NameError — including inside the except handler, yielding a raw
    500.  Import the locally-needed modules here like the sibling handlers.
    """
    import time
    import os

    try:
        checks = {
            "tokenizer_service": False,
            "file_service": False,
            "configuration": False
        }

        # Tokenizer service must be able to load a small well-known model.
        try:
            test_tokenizer, _, error = tokenizer_service.load_tokenizer('gpt2')
            checks["tokenizer_service"] = error is None
        except Exception:
            checks["tokenizer_service"] = False

        # Upload directory must exist and be writable.
        try:
            upload_folder = current_app.config.get('UPLOAD_FOLDER', '/tmp')
            checks["file_service"] = os.path.exists(upload_folder) and os.access(upload_folder, os.W_OK)
        except Exception:
            checks["file_service"] = False

        # Required configuration keys must be present.
        required_configs = ['MAX_CONTENT_LENGTH', 'UPLOAD_FOLDER']
        checks["configuration"] = all(current_app.config.get(config) is not None for config in required_configs)

        all_ready = all(checks.values())

        return jsonify({
            "ready": all_ready,
            "checks": checks,
            "timestamp": int(time.time())
        }), 200 if all_ready else 503

    except Exception as e:
        return jsonify({
            "ready": False,
            "error": str(e),
            "timestamp": int(time.time())
        }), 500
app/services/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Service layer for Tokenizer Pro
3
+ """
app/services/file_service.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File Service - Handles file processing and chunked text analysis
3
+ """
4
+ import os
5
+ import uuid
6
+ from typing import Dict, Any, List, Tuple
7
+ from werkzeug.utils import secure_filename
8
+ from flask import current_app
9
+
10
+ from .tokenizer_service import tokenizer_service
11
+ from .stats_service import stats_service
12
+
13
+
14
class FileService:
    """Service for handling file uploads and processing.

    All methods are static; the module exposes a shared ``file_service``
    instance for convenience.
    """

    # Allowed file extensions for security (lower-case, with leading dot).
    ALLOWED_EXTENSIONS = {'.txt', '.md', '.py', '.js', '.html', '.css', '.json', '.csv', '.log'}

    @staticmethod
    def is_allowed_file(filename: str) -> bool:
        """Return True if *filename* has an allowed extension (case-insensitive)."""
        if not filename:
            return False
        _, ext = os.path.splitext(filename.lower())
        return ext in FileService.ALLOWED_EXTENSIONS

    @staticmethod
    def generate_secure_filename(original_filename: str) -> str:
        """Generate a collision-free, sanitized filename with a UUID prefix.

        An empty original name yields a random ``.txt`` name.
        """
        if not original_filename:
            return f"{uuid.uuid4().hex}.txt"

        # secure_filename strips path separators and unsafe characters;
        # the UUID prefix avoids collisions between identical uploads.
        secure_name = secure_filename(original_filename)
        name, ext = os.path.splitext(secure_name)
        return f"{uuid.uuid4().hex}_{name}{ext}"

    @staticmethod
    def save_uploaded_file(uploaded_file, upload_folder: str) -> str:
        """Save *uploaded_file* into *upload_folder* under a secure name.

        Args:
            uploaded_file: A werkzeug ``FileStorage`` object from the request.
            upload_folder: Destination directory (created if missing).

        Returns:
            str: Path to the saved file.
        """
        os.makedirs(upload_folder, exist_ok=True)

        secure_filename_str = FileService.generate_secure_filename(uploaded_file.filename)
        file_path = os.path.join(upload_folder, secure_filename_str)

        uploaded_file.save(file_path)
        return file_path

    @staticmethod
    def process_file_for_tokenization(
        file_path: str,
        model_id_or_name: str,
        preview_char_limit: int = 8096,
        max_display_tokens: int = 50000,
        chunk_size: int = 1024 * 1024
    ) -> Dict[str, Any]:
        """Tokenize a file, reading it in chunks to bound peak memory.

        Args:
            file_path: Path to the file to process.
            model_id_or_name: Tokenizer model to use.
            preview_char_limit: Character limit for preview display.
            max_display_tokens: Maximum tokens to display.
            chunk_size: Read size (bytes of decoded text) per chunk.

        Returns:
            Dict containing display tokens, stats and tokenizer info.

        Raises:
            Exception: If the tokenizer fails to load.
        """
        tokenizer, tokenizer_info, error = tokenizer_service.load_tokenizer(model_id_or_name)

        if error:
            raise Exception(error)

        # Fix: decode explicitly as UTF-8 instead of the platform/locale
        # default codec, so results are deterministic across hosts;
        # errors='replace' still tolerates malformed bytes.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_text = f.read(preview_char_limit)

        # Tokenize only the preview for display purposes.
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:max_display_tokens]

        # Stats cover the whole file, processed chunk-by-chunk.
        total_tokens = []
        token_set = set()
        total_length = 0

        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                chunk_tokens = tokenizer.tokenize(chunk)
                total_tokens.extend(chunk_tokens)
                token_set.update(chunk_tokens)

        # The original text is not kept in memory; a same-length dummy
        # string preserves the compression-ratio calculation.
        stats = stats_service.get_token_stats(total_tokens, ' ' * total_length)

        token_data = stats_service.format_tokens_for_display(display_tokens, tokenizer)

        return {
            'tokens': token_data,
            'stats': stats,
            'display_limit_reached': len(total_tokens) > max_display_tokens,
            'total_tokens': len(total_tokens),
            'is_full_file': True,
            'preview_only': True,
            'tokenizer_info': tokenizer_info
        }

    @staticmethod
    def process_text_for_tokenization(
        text: str,
        model_id_or_name: str,
        is_preview: bool = False,
        preview_char_limit: int = 8096,
        max_display_tokens: int = 50000
    ) -> Dict[str, Any]:
        """Tokenize in-memory text.

        Args:
            text: Input text to tokenize.
            model_id_or_name: Tokenizer model to use.
            is_preview: Whether this is a preview of a larger text.
            preview_char_limit: Character limit for preview.
            max_display_tokens: Maximum tokens to display.

        Returns:
            Dict containing display tokens, stats and tokenizer info.

        Raises:
            Exception: If the tokenizer fails to load.
        """
        tokenizer, tokenizer_info, error = tokenizer_service.load_tokenizer(model_id_or_name)

        if error:
            raise Exception(error)

        # Full-text tokenization drives the statistics.
        all_tokens = tokenizer.tokenize(text)
        total_token_count = len(all_tokens)

        # Display is limited to the preview slice (when previewing) and
        # capped at max_display_tokens.
        preview_text = text[:preview_char_limit] if is_preview else text
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:max_display_tokens]

        stats = stats_service.get_token_stats(all_tokens, text)

        token_data = stats_service.format_tokens_for_display(display_tokens, tokenizer)

        return {
            'tokens': token_data,
            'stats': stats,
            'display_limit_reached': total_token_count > max_display_tokens and not is_preview,
            'total_tokens': total_token_count,
            'is_full_file': False,
            'preview_only': is_preview,
            'tokenizer_info': tokenizer_info
        }

    @staticmethod
    def cleanup_file(file_path: str):
        """Safely remove a file if it exists; cleanup failures are ignored."""
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
        except OSError:
            pass  # Ignore errors during cleanup


# Global instance
file_service = FileService()
app/services/stats_service.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stats Service - Handles token statistics and color generation
3
+ """
4
+ import hashlib
5
+ import math
6
+ from typing import List, Dict, Any
7
+
8
+
9
class StatsService:
    """Service for calculating token statistics and generating colors."""

    @staticmethod
    def get_varied_color(token: str) -> Dict[str, str]:
        """Derive a deterministic HSL background/text color pair from *token*.

        The hash-derived lightness is always 80-89, so the ``> 50`` branch
        always picks dark (20%) text; the other branch is kept for safety.
        """
        token_hash = hashlib.md5(token.encode()).hexdigest()
        hue = int(token_hash[:3], 16) % 360
        saturation = 70 + (int(token_hash[3:5], 16) % 20)
        lightness = 80 + (int(token_hash[5:7], 16) % 10)
        text_lightness = 20 if lightness > 50 else 90

        return {
            'background': f'hsl({hue}, {saturation}%, {lightness}%)',
            'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
        }

    @staticmethod
    def fix_token(token: str) -> str:
        """Fix token for display with improved space visualization.

        BPE tokenizers mark a leading space with 'Ġ'; each *leading* 'Ġ'
        is rendered as a middle dot.  Fix: the previous version counted
        every 'Ġ' in the token and stripped that many leading characters,
        corrupting tokens with a non-leading 'Ġ' — only the leading run
        is replaced now (identical output for ordinary tokens).
        """
        if token.startswith('Ġ'):
            leading = len(token) - len(token.lstrip('Ġ'))
            return '·' * leading + token[leading:]
        return token

    @staticmethod
    def get_token_stats(tokens: List[str], original_text: str) -> Dict[str, Any]:
        """Calculate enhanced statistics about *tokens*.

        Returns an empty dict for an empty token list; otherwise a dict
        with 'basic_stats' (counts, compression ratio, token-type tallies)
        and 'length_stats' (avg/std-dev/min/max/median token length).
        """
        if not tokens:
            return {}

        total_tokens = len(tokens)
        unique_tokens = len(set(tokens))
        avg_length = sum(len(t) for t in tokens) / total_tokens
        # Characters of input per produced token.
        compression_ratio = len(original_text) / total_tokens

        # Token type analysis ('Ġ' = leading space, 'Ċ' = newline in BPE vocab).
        space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
        newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
        special_tokens = sum(1 for t in tokens if any(c in t for c in ['<', '>', '[', ']', '{', '}']))
        punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))

        # Length distribution (population standard deviation).
        lengths = [len(t) for t in tokens]
        mean_length = sum(lengths) / len(lengths)
        variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev = math.sqrt(variance)

        return {
            'basic_stats': {
                'total_tokens': total_tokens,
                'unique_tokens': unique_tokens,
                'compression_ratio': round(compression_ratio, 2),
                'space_tokens': space_tokens,
                'newline_tokens': newline_tokens,
                'special_tokens': special_tokens,
                'punctuation_tokens': punctuation_tokens,
                'unique_percentage': round(unique_tokens/total_tokens * 100, 1)
            },
            'length_stats': {
                'avg_length': round(avg_length, 2),
                'std_dev': round(std_dev, 2),
                'min_length': min(lengths),
                'max_length': max(lengths),
                'median_length': sorted(lengths)[len(lengths)//2]
            }
        }

    @staticmethod
    def format_tokens_for_display(tokens: List[str], tokenizer) -> List[Dict[str, Any]]:
        """Format *tokens* for frontend display with colors and metadata.

        Each entry carries the original token, a display form (trailing
        'Ċ' stripped and flagged via 'newline'), its color pair, the
        numeric token id from *tokenizer*, and its index.
        """
        token_data = []
        for idx, token in enumerate(tokens):
            colors = StatsService.get_varied_color(token)
            fixed_token = StatsService.fix_token(token)
            # Numerical token ID from the tokenizer's vocabulary.
            token_id = tokenizer.convert_tokens_to_ids(token)
            token_data.append({
                'original': token,
                'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
                'colors': colors,
                'newline': fixed_token.endswith('Ċ'),
                'token_id': token_id,
                'token_index': idx
            })
        return token_data


# Global instance
stats_service = StatsService()
app/services/tokenizer_service.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tokenizer Service - Handles tokenizer loading, caching, and management
3
+ """
4
+ import time
5
+ from typing import Dict, Tuple, Optional, Any
6
+ from transformers import AutoTokenizer
7
+ from flask import current_app
8
+
9
+
10
+ class TokenizerService:
11
+ """Service for managing tokenizer loading and caching."""
12
+
13
+ # Predefined tokenizer models with aliases
14
+ TOKENIZER_MODELS = {
15
+ 'qwen3': {
16
+ 'name': 'Qwen/Qwen3-0.6B',
17
+ 'alias': 'Qwen 3'
18
+ },
19
+ 'gemma3-27b': {
20
+ 'name': 'google/gemma-3-27b-it',
21
+ 'alias': 'Gemma 3 27B'
22
+ },
23
+ 'glm4': {
24
+ 'name': 'THUDM/GLM-4-32B-0414',
25
+ 'alias': 'GLM 4'
26
+ },
27
+ 'mistral-small': {
28
+ 'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
29
+ 'alias': 'Mistral Small 3.1'
30
+ },
31
+ 'llama4': {
32
+ 'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
33
+ 'alias': 'Llama 4'
34
+ },
35
+ 'deepseek-r1': {
36
+ 'name': 'deepseek-ai/DeepSeek-R1',
37
+ 'alias': 'Deepseek R1'
38
+ },
39
+ 'qwen_25_72b': {
40
+ 'name': 'Qwen/Qwen2.5-72B-Instruct',
41
+ 'alias': 'QWQ 32B'
42
+ },
43
+ 'llama_33': {
44
+ 'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
45
+ 'alias': 'Llama 3.3 70B'
46
+ },
47
+ 'gemma2_2b': {
48
+ 'name': 'google/gemma-2-2b-it',
49
+ 'alias': 'Gemma 2 2B'
50
+ },
51
+ 'bert-large-uncased': {
52
+ 'name': 'google-bert/bert-large-uncased',
53
+ 'alias': 'Bert Large Uncased'
54
+ },
55
+ 'gpt2': {
56
+ 'name': 'openai-community/gpt2',
57
+ 'alias': 'GPT-2'
58
+ }
59
+ }
60
+
61
+ def __init__(self):
62
+ """Initialize the tokenizer service with empty caches."""
63
+ self.tokenizers: Dict[str, Any] = {}
64
+ self.custom_tokenizers: Dict[str, Tuple[Any, float]] = {}
65
+ self.tokenizer_info_cache: Dict[str, Dict] = {}
66
+ self.custom_model_errors: Dict[str, str] = {}
67
+
68
+ def get_tokenizer_info(self, tokenizer) -> Dict:
69
+ """Extract useful information from a tokenizer."""
70
+ info = {}
71
+ try:
72
+ # Get vocabulary size (dictionary size)
73
+ if hasattr(tokenizer, 'vocab_size'):
74
+ info['vocab_size'] = tokenizer.vocab_size
75
+ elif hasattr(tokenizer, 'get_vocab'):
76
+ info['vocab_size'] = len(tokenizer.get_vocab())
77
+
78
+ # Get model max length if available
79
+ if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:
80
+ info['model_max_length'] = tokenizer.model_max_length
81
+
82
+ # Check tokenizer type
83
+ info['tokenizer_type'] = tokenizer.__class__.__name__
84
+
85
+ # Get special tokens
86
+ special_tokens = {}
87
+ for token_name in ['pad_token', 'eos_token', 'bos_token', 'sep_token', 'cls_token', 'unk_token', 'mask_token']:
88
+ if hasattr(tokenizer, token_name) and getattr(tokenizer, token_name) is not None:
89
+ token_value = getattr(tokenizer, token_name)
90
+ if token_value and str(token_value).strip():
91
+ special_tokens[token_name] = str(token_value)
92
+
93
+ info['special_tokens'] = special_tokens
94
+
95
+ except Exception as e:
96
+ info['error'] = f"Error extracting tokenizer info: {str(e)}"
97
+
98
+ return info
99
+
100
+ def load_tokenizer(self, model_id_or_name: str) -> Tuple[Optional[Any], Dict, Optional[str]]:
101
+ """
102
+ Load tokenizer if not already loaded.
103
+
104
+ Returns:
105
+ Tuple of (tokenizer, tokenizer_info, error_message)
106
+ """
107
+ error_message = None
108
+ tokenizer_info = {}
109
+
110
+ # Check if we have cached tokenizer info
111
+ if model_id_or_name in self.tokenizer_info_cache:
112
+ tokenizer_info = self.tokenizer_info_cache[model_id_or_name]
113
+
114
+ try:
115
+ # Check if it's a predefined model ID
116
+ if model_id_or_name in self.TOKENIZER_MODELS:
117
+ model_name = self.TOKENIZER_MODELS[model_id_or_name]['name']
118
+ if model_id_or_name not in self.tokenizers:
119
+ self.tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name)
120
+ tokenizer = self.tokenizers[model_id_or_name]
121
+
122
+ # Get tokenizer info if not already cached
123
+ if model_id_or_name not in self.tokenizer_info_cache:
124
+ tokenizer_info = self.get_tokenizer_info(tokenizer)
125
+ self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
126
+
127
+ return tokenizer, tokenizer_info, None
128
+
129
+ # It's a custom model path
130
+ # Check if we have it in the custom cache and it's not expired
131
+ current_time = time.time()
132
+ cache_expiration = current_app.config.get('CACHE_EXPIRATION', 3600)
133
+
134
+ if model_id_or_name in self.custom_tokenizers:
135
+ cached_tokenizer, timestamp = self.custom_tokenizers[model_id_or_name]
136
+ if current_time - timestamp < cache_expiration:
137
+ # Get tokenizer info if not already cached
138
+ if model_id_or_name not in self.tokenizer_info_cache:
139
+ tokenizer_info = self.get_tokenizer_info(cached_tokenizer)
140
+ self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
141
+ return cached_tokenizer, tokenizer_info, None
142
+
143
+ # Not in cache or expired, load it
144
+ tokenizer = AutoTokenizer.from_pretrained(model_id_or_name)
145
+ # Store in cache with timestamp
146
+ self.custom_tokenizers[model_id_or_name] = (tokenizer, current_time)
147
+ # Clear any previous errors for this model
148
+ if model_id_or_name in self.custom_model_errors:
149
+ del self.custom_model_errors[model_id_or_name]
150
+
151
+ # Get tokenizer info
152
+ tokenizer_info = self.get_tokenizer_info(tokenizer)
153
+ self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
154
+
155
+ return tokenizer, tokenizer_info, None
156
+
157
+ except Exception as e:
158
+ error_message = f"Failed to load tokenizer: {str(e)}"
159
+ # Store error for future reference
160
+ self.custom_model_errors[model_id_or_name] = error_message
161
+ return None, tokenizer_info, error_message
162
+
163
+ def get_model_alias(self, model_id: str) -> str:
164
+ """Get the display alias for a model ID."""
165
+ if model_id in self.TOKENIZER_MODELS:
166
+ return self.TOKENIZER_MODELS[model_id]['alias']
167
+ return model_id
168
+
169
+ def is_predefined_model(self, model_id: str) -> bool:
170
+ """Check if a model ID is a predefined model."""
171
+ return model_id in self.TOKENIZER_MODELS
172
+
173
+ def clear_cache(self):
174
+ """Clear all caches."""
175
+ self.tokenizers.clear()
176
+ self.custom_tokenizers.clear()
177
+ self.tokenizer_info_cache.clear()
178
+ self.custom_model_errors.clear()
179
+
180
+
181
+ # Global instance
182
+ tokenizer_service = TokenizerService()
app/static/css/style.css ADDED
@@ -0,0 +1,1298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --primary-color: #0f4f9b; /* Blue accent */
3
+ --primary-hover: #0c3e7a; /* Darker blue accent */
4
+ --bg-color: #121212; /* Dark background */
5
+ --card-bg: #1e1e1e; /* Dark card background */
6
+ --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
7
+ 0 2px 4px -1px rgba(0, 0, 0, 0.6);
8
+ --transition: all 0.3s ease;
9
+ --text-color: #E0E0E0; /* Main text color */
10
+ --secondary-text: #A0A0A0;/* Secondary text color */
11
+ --input-bg: #2a2a2a; /* Input/textarea background */
12
+ --input-border: #444444; /* Input/textarea border */
13
+ --input-focus: #0f4f9b; /* Focus border color */
14
+ }
15
+
16
+ * {
17
+ margin: 0;
18
+ padding: 0;
19
+ box-sizing: border-box;
20
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
21
+ scrollbar-width: thin;
22
+ scrollbar-color: #0f4f9b #121212
23
+ }
24
+
25
+ /* Width and height of the scrollbar */
26
+ ::-webkit-scrollbar {
27
+ width: 12px;
28
+ height: 12px;
29
+ }
30
+
31
+ @keyframes spin {
32
+ from { transform: rotate(0deg); }
33
+ to { transform: rotate(360deg); }
34
+ }
35
+
36
+ /* Loading spinner styles */
37
+ .loading-spinner {
38
+ display: inline-block;
39
+ width: 20px;
40
+ height: 20px;
41
+ border: 2px solid rgba(255, 255, 255, 0.3);
42
+ border-radius: 50%;
43
+ border-top-color: #fff;
44
+ animation: spin 1s linear infinite;
45
+ margin-left: 8px;
46
+ vertical-align: middle;
47
+ }
48
+
49
+ .loading-spinner.large {
50
+ width: 40px;
51
+ height: 40px;
52
+ border-width: 3px;
53
+ }
54
+
55
+ .loading-overlay {
56
+ position: fixed;
57
+ top: 0;
58
+ left: 0;
59
+ width: 100%;
60
+ height: 100%;
61
+ background-color: rgba(18, 18, 18, 0.8);
62
+ display: flex;
63
+ justify-content: center;
64
+ align-items: center;
65
+ z-index: 9999;
66
+ opacity: 0;
67
+ visibility: hidden;
68
+ transition: opacity 0.3s ease, visibility 0.3s ease;
69
+ }
70
+
71
+ .loading-overlay.active {
72
+ opacity: 1;
73
+ visibility: visible;
74
+ }
75
+
76
+ .loading-content {
77
+ background-color: var(--card-bg);
78
+ padding: 2rem;
79
+ border-radius: 1rem;
80
+ box-shadow: var(--card-shadow);
81
+ text-align: center;
82
+ min-width: 200px;
83
+ }
84
+
85
+ .loading-text {
86
+ color: var(--text-color);
87
+ font-size: 1.1rem;
88
+ margin-top: 1rem;
89
+ }
90
+
91
+ /* Keyboard shortcut indicator */
92
+ .keyboard-shortcut-hint {
93
+ position: absolute;
94
+ top: 10px;
95
+ right: 10px;
96
+ background: rgba(15, 79, 155, 0.1);
97
+ color: var(--primary-color);
98
+ font-size: 0.8rem;
99
+ padding: 0.25rem 0.5rem;
100
+ border-radius: 0.375rem;
101
+ border: 1px solid rgba(15, 79, 155, 0.2);
102
+ pointer-events: none;
103
+ opacity: 0.7;
104
+ font-family: monospace;
105
+ }
106
+
107
+ .input-section {
108
+ position: relative;
109
+ }
110
+
111
+ /* Card header styles */
112
+ .card-header {
113
+ display: flex;
114
+ align-items: center;
115
+ justify-content: space-between;
116
+ margin-bottom: 1.25rem;
117
+ }
118
+
119
+ .card-header .card-title {
120
+ margin-bottom: 0;
121
+ }
122
+
123
+ /* Token search styles */
124
+ .search-toggle-btn {
125
+ background: var(--primary-color);
126
+ color: white;
127
+ border: none;
128
+ border-radius: 50%;
129
+ width: 36px;
130
+ height: 36px;
131
+ min-width: 36px;
132
+ min-height: 36px;
133
+ display: flex;
134
+ align-items: center;
135
+ justify-content: center;
136
+ cursor: pointer;
137
+ transition: var(--transition);
138
+ box-shadow: 0 2px 4px rgba(15, 79, 155, 0.3);
139
+ padding: 0;
140
+ flex-shrink: 0;
141
+ }
142
+
143
+ .search-toggle-btn:hover {
144
+ background-color: var(--primary-hover);
145
+ transform: translateY(-1px);
146
+ box-shadow: 0 4px 8px rgba(15, 79, 155, 0.4);
147
+ }
148
+
149
+ .search-toggle-btn.active {
150
+ background-color: var(--primary-hover);
151
+ box-shadow: 0 2px 8px rgba(15, 79, 155, 0.5);
152
+ }
153
+
154
+ .search-toggle-btn svg {
155
+ width: 18px;
156
+ height: 18px;
157
+ fill: currentColor;
158
+ flex-shrink: 0;
159
+ }
160
+
161
+ .token-search-container {
162
+ display: none;
163
+ flex-direction: column;
164
+ gap: 1rem;
165
+ margin-bottom: 1.5rem;
166
+ padding: 1.25rem;
167
+ background-color: rgba(15, 79, 155, 0.1);
168
+ border-radius: 0.75rem;
169
+ border: 1px solid rgba(15, 79, 155, 0.2);
170
+ opacity: 0;
171
+ transform: translateY(-10px);
172
+ transition: opacity 0.3s ease, transform 0.3s ease;
173
+ width: 100%;
174
+ }
175
+
176
+ .token-search-container.show {
177
+ display: flex;
178
+ opacity: 1;
179
+ transform: translateY(0);
180
+ }
181
+
182
+ .token-search-row {
183
+ display: flex;
184
+ align-items: center;
185
+ gap: 1rem;
186
+ width: 100%;
187
+ }
188
+
189
+ @keyframes slideDown {
190
+ from {
191
+ opacity: 0;
192
+ transform: translateY(-10px);
193
+ }
194
+ to {
195
+ opacity: 1;
196
+ transform: translateY(0);
197
+ }
198
+ }
199
+
200
+ .token-search-input {
201
+ flex: 1;
202
+ padding: 0.875rem 1rem;
203
+ border: 2px solid var(--input-border);
204
+ border-radius: 0.5rem;
205
+ background-color: var(--input-bg);
206
+ color: var(--text-color);
207
+ font-size: 1rem;
208
+ transition: var(--transition);
209
+ min-width: 0;
210
+ }
211
+
212
+ .token-search-input:focus {
213
+ outline: none;
214
+ border-color: var(--primary-color);
215
+ box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
216
+ }
217
+
218
+ .token-search-controls {
219
+ display: flex;
220
+ align-items: center;
221
+ gap: 0.75rem;
222
+ flex-shrink: 0;
223
+ }
224
+
225
+ .token-search-btn {
226
+ padding: 0.5rem 1rem;
227
+ background: var(--primary-color);
228
+ color: white;
229
+ border: none;
230
+ border-radius: 0.5rem;
231
+ font-size: 0.875rem;
232
+ font-weight: 500;
233
+ cursor: pointer;
234
+ transition: var(--transition);
235
+ box-shadow: 0 1px 3px rgba(15, 79, 155, 0.3);
236
+ min-width: 44px;
237
+ }
238
+
239
+ .token-search-btn:hover {
240
+ background: var(--primary-hover);
241
+ transform: translateY(-1px);
242
+ box-shadow: 0 2px 6px rgba(15, 79, 155, 0.4);
243
+ }
244
+
245
+ .token-search-btn:disabled {
246
+ opacity: 0.5;
247
+ cursor: not-allowed;
248
+ transform: none;
249
+ }
250
+
251
+ .token-search-count {
252
+ color: var(--text-color);
253
+ font-size: 0.875rem;
254
+ font-weight: 500;
255
+ white-space: nowrap;
256
+ background-color: rgba(255, 255, 255, 0.1);
257
+ padding: 0.5rem 0.75rem;
258
+ border-radius: 0.375rem;
259
+ min-width: 60px;
260
+ text-align: center;
261
+ }
262
+
263
+ /* Highlighted token styles */
264
+ .token.highlighted {
265
+ background-color: #fbbf24 !important;
266
+ color: #1f2937 !important;
267
+ box-shadow: 0 0 0 2px #f59e0b;
268
+ z-index: 1;
269
+ position: relative;
270
+ }
271
+
272
+ .token.highlighted.current {
273
+ background-color: #f59e0b !important;
274
+ color: white !important;
275
+ box-shadow: 0 0 0 3px #f59e0b;
276
+ }
277
+
278
+ /* Token frequency chart styles */
279
+ .frequency-chart-container {
280
+ margin-top: 1.5rem;
281
+ padding: 1rem;
282
+ background-color: rgba(15, 79, 155, 0.05);
283
+ border-radius: 0.5rem;
284
+ border: 1px solid rgba(15, 79, 155, 0.1);
285
+ }
286
+
287
+ .frequency-chart-title {
288
+ font-size: 1rem;
289
+ font-weight: 600;
290
+ color: var(--text-color);
291
+ margin-bottom: 1rem;
292
+ display: flex;
293
+ align-items: center;
294
+ gap: 0.5rem;
295
+ }
296
+
297
+ .frequency-chart {
298
+ display: flex;
299
+ flex-direction: column;
300
+ gap: 0.5rem;
301
+ }
302
+
303
+ .frequency-item {
304
+ display: flex;
305
+ align-items: center;
306
+ gap: 0.75rem;
307
+ padding: 0.5rem;
308
+ background-color: var(--input-bg);
309
+ border-radius: 0.375rem;
310
+ transition: var(--transition);
311
+ }
312
+
313
+ .frequency-item:hover {
314
+ background-color: rgba(255, 255, 255, 0.05);
315
+ }
316
+
317
+ .frequency-token {
318
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
319
+ font-size: 0.8rem;
320
+ background-color: rgba(15, 79, 155, 0.2);
321
+ color: var(--primary-color);
322
+ padding: 0.25rem 0.5rem;
323
+ border-radius: 0.25rem;
324
+ min-width: 60px;
325
+ text-align: center;
326
+ cursor: pointer;
327
+ transition: var(--transition);
328
+ }
329
+
330
+ .frequency-token:hover {
331
+ background-color: var(--primary-color);
332
+ color: white;
333
+ }
334
+
335
+ .frequency-bar-container {
336
+ flex: 1;
337
+ display: flex;
338
+ align-items: center;
339
+ gap: 0.5rem;
340
+ }
341
+
342
+ .frequency-bar {
343
+ flex: 1;
344
+ height: 20px;
345
+ background-color: rgba(255, 255, 255, 0.1);
346
+ border-radius: 10px;
347
+ overflow: hidden;
348
+ position: relative;
349
+ }
350
+
351
+ .frequency-bar-fill {
352
+ height: 100%;
353
+ background: linear-gradient(90deg, var(--primary-color) 0%, var(--primary-hover) 100%);
354
+ border-radius: 10px;
355
+ transition: width 0.5s ease;
356
+ position: relative;
357
+ }
358
+
359
+ .frequency-count {
360
+ color: var(--secondary-text);
361
+ font-size: 0.8rem;
362
+ font-weight: 500;
363
+ min-width: 40px;
364
+ text-align: right;
365
+ }
366
+
367
+ .chart-toggle-btn {
368
+ background: none;
369
+ border: 1px solid var(--primary-color);
370
+ color: var(--primary-color);
371
+ padding: 0.375rem 0.75rem;
372
+ border-radius: 0.375rem;
373
+ font-size: 0.8rem;
374
+ cursor: pointer;
375
+ transition: var(--transition);
376
+ box-shadow: none;
377
+ }
378
+
379
+ .chart-toggle-btn:hover {
380
+ background-color: var(--primary-color);
381
+ color: white;
382
+ transform: none;
383
+ box-shadow: none;
384
+ }
385
+
386
+ .chart-toggle-btn.active {
387
+ background-color: var(--primary-color);
388
+ color: white;
389
+ }
390
+
391
+ /* Track (background) */
392
+ ::-webkit-scrollbar-track {
393
+ background: #121212;
394
+ border-radius: 10px;
395
+ }
396
+
397
+ /* Handle (draggable part) */
398
+ ::-webkit-scrollbar-thumb {
399
+ background: #0f4f9b;
400
+ border-radius: 10px;
401
+ border: 2px solid #121212;
402
+ }
403
+
404
+ /* Handle on hover */
405
+ ::-webkit-scrollbar-thumb:hover {
406
+ background: #0c3e7a;
407
+ }
408
+
409
+
410
+ body {
411
+ background-color: var(--bg-color);
412
+ padding: 2rem;
413
+ min-height: 100vh;
414
+ background-image:
415
+ radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
416
+ radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
417
+ color: var(--text-color);
418
+ }
419
+
420
+ .container {
421
+ max-width: 1200px;
422
+ margin: 0 auto;
423
+ }
424
+
425
+ .header {
426
+ display: flex;
427
+ justify-content: space-between;
428
+ align-items: center;
429
+ margin-bottom: 2rem;
430
+ position: relative;
431
+ }
432
+
433
+ .title-section {
434
+ flex-grow: 1;
435
+ }
436
+
437
+ .title {
438
+ font-size: 2.5rem;
439
+ font-weight: 800;
440
+ color: var(--primary-color);
441
+ margin-bottom: 0.5rem;
442
+ }
443
+
444
+ .subtitle {
445
+ color: var(--secondary-text);
446
+ font-size: 1.1rem;
447
+ }
448
+
449
+ .model-selector {
450
+ position: relative;
451
+ min-width: 200px;
452
+ }
453
+
454
+ .model-selector-header {
455
+ display: flex;
456
+ gap: 0.5rem;
457
+ margin-bottom: 0.5rem;
458
+ }
459
+
460
+ .model-type-toggle {
461
+ display: flex;
462
+ background-color: var(--card-bg);
463
+ border-radius: 0.5rem;
464
+ padding: 0.25rem;
465
+ overflow: hidden;
466
+ }
467
+
468
+ .toggle-option {
469
+ padding: 0.5rem 0.75rem;
470
+ font-size: 0.8rem;
471
+ font-weight: 500;
472
+ cursor: pointer;
473
+ transition: var(--transition);
474
+ border-radius: 0.375rem;
475
+ color: var(--secondary-text);
476
+ }
477
+
478
+ .toggle-option.active {
479
+ background-color: var(--primary-color);
480
+ color: white;
481
+ }
482
+
483
+ select {
484
+ width: 100%;
485
+ padding: 0.75rem 1rem;
486
+ border: 2px solid var(--input-border);
487
+ border-radius: 0.5rem;
488
+ font-size: 1rem;
489
+ color: var(--text-color);
490
+ background-color: var(--input-bg);
491
+ cursor: pointer;
492
+ transition: var(--transition);
493
+ appearance: none;
494
+ background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
495
+ background-repeat: no-repeat;
496
+ background-position: right 1rem center;
497
+ background-size: 1.5rem;
498
+ }
499
+
500
+ select:hover, .custom-model-input:hover {
501
+ border-color: var(--primary-color);
502
+ }
503
+
504
+ select:focus, .custom-model-input:focus {
505
+ outline: none;
506
+ border-color: var(--primary-color);
507
+ box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
508
+ }
509
+
510
+ .custom-model-input {
511
+ width: 100%;
512
+ padding: 0.75rem 1rem;
513
+ border: 2px solid var(--input-border);
514
+ border-radius: 0.5rem;
515
+ font-size: 1rem;
516
+ color: var(--text-color);
517
+ background-color: var(--input-bg);
518
+ transition: var(--transition);
519
+ }
520
+
521
+ .input-section {
522
+ margin-bottom: 2rem;
523
+ }
524
+
525
+ textarea {
526
+ width: 100%;
527
+ height: 150px;
528
+ padding: 1.25rem;
529
+ border: 2px solid var(--input-border);
530
+ border-radius: 0.75rem;
531
+ resize: vertical;
532
+ font-size: 1rem;
533
+ margin-bottom: 1rem;
534
+ transition: var(--transition);
535
+ background-color: var(--input-bg);
536
+ color: var(--text-color);
537
+ }
538
+
539
+ textarea:focus {
540
+ outline: none;
541
+ border-color: var(--input-focus);
542
+ box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
543
+ }
544
+
545
+ .button-container {
546
+ display: flex;
547
+ justify-content: center;
548
+ width: 100%;
549
+ gap: 1rem;
550
+ }
551
+
552
+ button {
553
+ padding: 0.875rem 2.5rem;
554
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
555
+ color: #fff;
556
+ border: none;
557
+ border-radius: 0.75rem;
558
+ font-size: 1.1rem;
559
+ font-weight: 600;
560
+ cursor: pointer;
561
+ transition: var(--transition);
562
+ box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
563
+ }
564
+
565
+ button:hover {
566
+ transform: translateY(-2px);
567
+ box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
568
+ }
569
+
570
+ button:active {
571
+ transform: translateY(0);
572
+ }
573
+
574
+ button:disabled {
575
+ opacity: 0.7;
576
+ cursor: not-allowed;
577
+ }
578
+
579
+ .card {
580
+ background-color: var(--card-bg);
581
+ border-radius: 1rem;
582
+ box-shadow: var(--card-shadow);
583
+ padding: 1.5rem;
584
+ margin-bottom: 2rem;
585
+ transition: var(--transition);
586
+ }
587
+
588
+ .card:hover {
589
+ transform: translateY(-2px);
590
+ box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
591
+ }
592
+
593
+ .card-title {
594
+ font-size: 1.25rem;
595
+ font-weight: 700;
596
+ color: var(--text-color);
597
+ margin-bottom: 1.25rem;
598
+ display: flex;
599
+ align-items: center;
600
+ gap: 0.5rem;
601
+ cursor: pointer;
602
+ }
603
+
604
+ .card-title::before {
605
+ content: '';
606
+ display: block;
607
+ width: 4px;
608
+ height: 1.25rem;
609
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
610
+ border-radius: 2px;
611
+ }
612
+
613
+ .token-container {
614
+ display: flex;
615
+ flex-wrap: wrap;
616
+ gap: 0.375rem;
617
+ margin-bottom: 1rem;
618
+ padding: 1rem;
619
+ background-color: #2a2a2a;
620
+ border-radius: 0.5rem;
621
+ max-height: 200px;
622
+ overflow-y: auto;
623
+ transition: max-height 0.3s ease;
624
+ }
625
+
626
+ .token-container.expanded {
627
+ max-height: none;
628
+ }
629
+
630
+ .token {
631
+ padding: 0.375rem 0.75rem;
632
+ border-radius: 0.375rem;
633
+ background-color: var(--input-bg);
634
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
635
+ font-size: 0.875rem;
636
+ color: var(--text-color);
637
+ cursor: default;
638
+ transition: var(--transition);
639
+ box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
640
+ }
641
+
642
+ .token:hover {
643
+ transform: translateY(-1px);
644
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
645
+ }
646
+
647
+ .stats-grid {
648
+ display: grid;
649
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
650
+ gap: 1.5rem;
651
+ margin-bottom: 2rem;
652
+ }
653
+
654
+ .stat-card {
655
+ background-color: var(--card-bg);
656
+ padding: 1.5rem;
657
+ border-radius: 1rem;
658
+ box-shadow: var(--card-shadow);
659
+ transition: var(--transition);
660
+ }
661
+
662
+ .stat-card:hover {
663
+ transform: translateY(-2px);
664
+ box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
665
+ }
666
+
667
+ .stat-title {
668
+ color: var(--secondary-text);
669
+ font-size: 0.875rem;
670
+ font-weight: 500;
671
+ margin-bottom: 0.5rem;
672
+ text-transform: uppercase;
673
+ letter-spacing: 0.05em;
674
+ }
675
+
676
+ .stat-value {
677
+ color: var(--text-color);
678
+ font-size: 2rem;
679
+ font-weight: 700;
680
+ line-height: 1.2;
681
+ margin-bottom: 0.25rem;
682
+ }
683
+
684
+ .stat-description {
685
+ color: var(--secondary-text);
686
+ font-size: 0.875rem;
687
+ }
688
+
689
+ .expand-button {
690
+ background: none;
691
+ border: none;
692
+ color: var(--primary-color);
693
+ font-size: 0.875rem;
694
+ padding: 0.5rem;
695
+ cursor: pointer;
696
+ display: block;
697
+ margin: 0 auto;
698
+ box-shadow: none;
699
+ }
700
+
701
+ .expand-button:hover {
702
+ text-decoration: underline;
703
+ transform: none;
704
+ box-shadow: none;
705
+ }
706
+
707
+ .error-message {
708
+ color: #EF4444;
709
+ background-color: #3a1f1f;
710
+ border: 1px solid #562626;
711
+ padding: 1rem;
712
+ border-radius: 0.5rem;
713
+ margin-bottom: 1rem;
714
+ display: none;
715
+ }
716
+
717
+ .display-limit-notice {
718
+ background-color: #4b2b07;
719
+ border: 1px solid #7c4a02;
720
+ color: #FFD591;
721
+ padding: 0.75rem;
722
+ border-radius: 0.5rem;
723
+ margin-top: 1rem;
724
+ font-size: 0.875rem;
725
+ display: none;
726
+ }
727
+
728
+ /* File drop zone styles */
729
+ .file-drop-zone {
730
+ position: fixed;
731
+ top: 0;
732
+ left: 0;
733
+ width: 100%;
734
+ height: 100%;
735
+ background-color: rgba(15, 79, 155, 0.15);
736
+ z-index: 1000;
737
+ display: flex;
738
+ justify-content: center;
739
+ align-items: center;
740
+ opacity: 0;
741
+ pointer-events: none;
742
+ transition: opacity 0.3s ease;
743
+ }
744
+
745
+ .file-drop-zone.active {
746
+ opacity: 1;
747
+ pointer-events: all;
748
+ }
749
+
750
+ .drop-indicator {
751
+ background-color: var(--card-bg);
752
+ border: 2px dashed var(--primary-color);
753
+ border-radius: 1rem;
754
+ padding: 2rem;
755
+ text-align: center;
756
+ width: 60%;
757
+ max-width: 400px;
758
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
759
+ animation: pulse 2s infinite;
760
+ }
761
+
762
+ @keyframes pulse {
763
+ 0% { transform: scale(1); }
764
+ 50% { transform: scale(1.05); }
765
+ 100% { transform: scale(1); }
766
+ }
767
+
768
+ .drop-indicator p {
769
+ margin-bottom: 0.5rem;
770
+ color: var(--text-color);
771
+ font-size: 1.2rem;
772
+ }
773
+
774
+ .file-icon {
775
+ font-size: 3rem;
776
+ margin-bottom: 1rem;
777
+ color: var(--primary-color);
778
+ }
779
+
780
+ .file-upload-icon {
781
+ position: fixed;
782
+ bottom: 20px;
783
+ left: 20px;
784
+ width: 45px;
785
+ height: 45px;
786
+ background-color: var(--card-bg);
787
+ border-radius: 50%;
788
+ display: flex;
789
+ justify-content: center;
790
+ align-items: center;
791
+ cursor: pointer;
792
+ z-index: 100;
793
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
794
+ transition: transform 0.2s ease, box-shadow 0.2s ease;
795
+ }
796
+
797
+ .file-upload-icon:hover {
798
+ transform: translateY(-2px);
799
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
800
+ }
801
+
802
+ .file-upload-icon span {
803
+ font-size: 1.5rem;
804
+ color: var(--primary-color);
805
+ }
806
+
807
+ .file-info {
808
+ position: fixed;
809
+ bottom: 20px;
810
+ left: 75px;
811
+ background-color: var(--card-bg);
812
+ color: var(--primary-color);
813
+ font-weight: 500;
814
+ padding: 0.5rem 1rem;
815
+ border-radius: 1rem;
816
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
817
+ max-width: 270px;
818
+ white-space: nowrap;
819
+ overflow: hidden;
820
+ text-overflow: ellipsis;
821
+ z-index: 100;
822
+ display: none;
823
+ }
824
+
825
+ .file-detach {
826
+ margin-left: 8px;
827
+ display: inline-block;
828
+ width: 18px;
829
+ height: 18px;
830
+ background-color: rgba(255, 255, 255, 0.1);
831
+ color: var(--text-color);
832
+ border-radius: 50%;
833
+ text-align: center;
834
+ line-height: 16px;
835
+ font-size: 12px;
836
+ cursor: pointer;
837
+ transition: all 0.2s ease;
838
+ }
839
+
840
+ .file-detach:hover {
841
+ background-color: rgba(255, 0, 0, 0.2);
842
+ color: #ff6b6b;
843
+ transform: scale(1.1);
844
+ }
845
+
846
+ .preview-notice {
847
+ background-color: #273c56;
848
+ border: 1px solid #365a82;
849
+ color: #89b4e8;
850
+ padding: 0.75rem;
851
+ border-radius: 0.5rem;
852
+ margin-top: 1rem;
853
+ font-size: 0.875rem;
854
+ display: none;
855
+ }
856
+
857
+ .custom-model-wrapper {
858
+ position: relative;
859
+ }
860
+
861
+ .model-badge {
862
+ position: absolute;
863
+ top: -10px;
864
+ right: -5px;
865
+ background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
866
+ color: white;
867
+ font-size: 0.7rem;
868
+ font-weight: 700;
869
+ padding: 0.25rem 0.5rem;
870
+ border-radius: 999px;
871
+ transform: scale(0);
872
+ transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
873
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
874
+ z-index: 10;
875
+ }
876
+
877
+ .model-badge.show {
878
+ transform: scale(1);
879
+ }
880
+
881
+ .custom-model-help {
882
+ display: inline-block;
883
+ width: 16px;
884
+ height: 16px;
885
+ line-height: 16px;
886
+ font-size: 11px;
887
+ font-weight: bold;
888
+ text-align: center;
889
+ background-color: var(--secondary-text);
890
+ color: var(--card-bg);
891
+ border-radius: 50%;
892
+ margin-left: 5px;
893
+ cursor: help;
894
+ vertical-align: middle;
895
+ }
896
+
897
+ .tooltip {
898
+ position: absolute;
899
+ top: 100%;
900
+ left: 0;
901
+ width: 280px;
902
+ background-color: #333;
903
+ color: #fff;
904
+ padding: 0.75rem;
905
+ border-radius: 0.5rem;
906
+ font-size: 0.8rem;
907
+ margin-top: 0.5rem;
908
+ z-index: 100;
909
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
910
+ opacity: 0;
911
+ visibility: hidden;
912
+ transition: opacity 0.2s, visibility 0.2s;
913
+ }
914
+
915
+ .custom-model-help:hover + .tooltip {
916
+ opacity: 1;
917
+ visibility: visible;
918
+ }
919
+
920
+ /* Tokenizer info icon and tooltip styles */
921
+ .tokenizer-info-icon {
922
+ display: inline-flex;
923
+ align-items: center;
924
+ justify-content: center;
925
+ width: 24px;
926
+ height: 24px;
927
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
928
+ color: white;
929
+ border-radius: 50%;
930
+ position: absolute;
931
+ left: -32px; /* Position to the left of the selector */
932
+ top: 50%;
933
+ transform: translateY(-50%);
934
+ cursor: pointer;
935
+ font-size: 12px;
936
+ font-weight: bold;
937
+ transition: all 0.2s ease;
938
+ z-index: 10;
939
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
940
+ }
941
+
942
+ .tokenizer-info-icon:hover {
943
+ transform: translateY(-50%) scale(1.1);
944
+ box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
945
+ }
946
+
947
+ /* Watermark styles */
948
+ .watermark {
949
+ position: fixed;
950
+ bottom: 20px;
951
+ right: 20px;
952
+ color: var(--primary-color);
953
+ font-size: 1.4rem;
954
+ font-weight: 700;
955
+ opacity: 0.25; /* Semi-transparent */
956
+ z-index: 100;
957
+ transition: opacity 0.3s ease;
958
+ text-decoration: none;
959
+ pointer-events: auto; /* Ensure it remains clickable */
960
+ }
961
+
962
+ .watermark:hover {
963
+ opacity: 0.6; /* Increase opacity on hover */
964
+ }
965
+
966
+ .tokenizer-info-tooltip {
967
+ position: absolute;
968
+ top: calc(100% + 8px);
969
+ left: -30px; /* Adjust position to align with the icon */
970
+ width: 300px;
971
+ background-color: var(--card-bg);
972
+ color: var(--text-color);
973
+ border: 1px solid var(--primary-color);
974
+ border-radius: 0.75rem;
975
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
976
+ padding: 1rem;
977
+ z-index: 1000; /* Increase z-index to ensure visibility */
978
+ opacity: 0;
979
+ visibility: hidden;
980
+ transition: opacity 0.3s, visibility 0.3s;
981
+ pointer-events: none; /* Initially disable pointer events */
982
+ }
983
+
984
+ .tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
985
+ opacity: 1;
986
+ visibility: visible;
987
+ pointer-events: auto;
988
+ }
989
+
990
+ .tokenizer-info-tooltip:hover {
991
+ opacity: 1;
992
+ visibility: visible;
993
+ pointer-events: auto;
994
+ }
995
+
996
+ .tokenizer-info-header {
997
+ font-size: 1.1rem;
998
+ font-weight: 600;
999
+ margin-bottom: 0.5rem;
1000
+ padding-bottom: 0.5rem;
1001
+ border-bottom: 1px solid rgba(255, 255, 255, 0.1);
1002
+ color: var(--primary-color);
1003
+ }
1004
+
1005
+ .tokenizer-info-grid {
1006
+ display: grid;
1007
+ grid-template-columns: repeat(2, 1fr);
1008
+ gap: 0.75rem;
1009
+ margin: 0.75rem 0;
1010
+ }
1011
+
1012
+ .tokenizer-info-item {
1013
+ display: flex;
1014
+ flex-direction: column;
1015
+ }
1016
+
1017
+ .tokenizer-info-label {
1018
+ font-size: 0.75rem;
1019
+ color: var(--secondary-text);
1020
+ margin-bottom: 0.25rem;
1021
+ }
1022
+
1023
+ .tokenizer-info-value {
1024
+ font-size: 0.95rem;
1025
+ font-weight: 500;
1026
+ }
1027
+
1028
+ .special-tokens-container {
1029
+ margin-top: 0.75rem;
1030
+ background-color: rgba(15, 79, 155, 0.1);
1031
+ border-radius: 0.5rem;
1032
+ padding: 0.5rem;
1033
+ max-height: 100px;
1034
+ overflow-y: auto;
1035
+ }
1036
+
1037
+ .special-token-item {
1038
+ display: flex;
1039
+ justify-content: space-between;
1040
+ margin-bottom: 0.25rem;
1041
+ font-size: 0.8rem;
1042
+ }
1043
+
1044
+ .token-name {
1045
+ color: var(--secondary-text);
1046
+ }
1047
+
1048
+ .token-value {
1049
+ background-color: rgba(255, 255, 255, 0.1);
1050
+ padding: 1px 4px;
1051
+ border-radius: 2px;
1052
+ font-family: monospace;
1053
+ }
1054
+
1055
+ .tokenizer-info-loading {
1056
+ display: flex;
1057
+ justify-content: center;
1058
+ align-items: center;
1059
+ height: 100px;
1060
+ }
1061
+
1062
+ .tokenizer-info-spinner {
1063
+ width: 30px;
1064
+ height: 30px;
1065
+ border: 3px solid var(--primary-color);
1066
+ border-radius: 50%;
1067
+ border-top-color: transparent;
1068
+ animation: spin 1s linear infinite;
1069
+ }
1070
+
1071
+ .tokenizer-info-error {
1072
+ color: #f87171;
1073
+ font-size: 0.9rem;
1074
+ text-align: center;
1075
+ padding: 1rem;
1076
+ }
1077
+
1078
+ /* Mobile responsiveness improvements */
1079
+ @media (max-width: 768px) {
1080
+ body {
1081
+ padding: 1rem;
1082
+ }
1083
+
1084
+ .container {
1085
+ max-width: 100%;
1086
+ }
1087
+
1088
+ .header {
1089
+ flex-direction: column;
1090
+ align-items: stretch;
1091
+ gap: 1rem;
1092
+ }
1093
+
1094
+ .title {
1095
+ font-size: 2rem;
1096
+ }
1097
+
1098
+ .subtitle {
1099
+ font-size: 1rem;
1100
+ }
1101
+
1102
+ .model-selector {
1103
+ width: 100%;
1104
+ }
1105
+
1106
+ .model-type-toggle {
1107
+ justify-content: center;
1108
+ }
1109
+
1110
+ .toggle-option {
1111
+ flex: 1;
1112
+ text-align: center;
1113
+ }
1114
+
1115
+ textarea {
1116
+ height: 120px;
1117
+ font-size: 16px; /* Prevents zoom on iOS */
1118
+ }
1119
+
1120
+ .keyboard-shortcut-hint {
1121
+ top: 5px;
1122
+ right: 5px;
1123
+ font-size: 0.7rem;
1124
+ padding: 0.2rem 0.4rem;
1125
+ }
1126
+
1127
+ .search-toggle-btn {
1128
+ width: 32px;
1129
+ height: 32px;
1130
+ min-width: 32px;
1131
+ min-height: 32px;
1132
+ }
1133
+
1134
+ .search-toggle-btn svg {
1135
+ width: 16px;
1136
+ height: 16px;
1137
+ }
1138
+
1139
+ .stats-grid {
1140
+ grid-template-columns: 1fr;
1141
+ gap: 1rem;
1142
+ }
1143
+
1144
+ .stat-card {
1145
+ padding: 1rem;
1146
+ }
1147
+
1148
+ .stat-value {
1149
+ font-size: 1.5rem;
1150
+ }
1151
+
1152
+ .token-container {
1153
+ max-height: 150px;
1154
+ font-size: 0.8rem;
1155
+ }
1156
+
1157
+ .token {
1158
+ padding: 0.25rem 0.5rem;
1159
+ font-size: 0.75rem;
1160
+ }
1161
+
1162
+ .token-search-container {
1163
+ padding: 1rem;
1164
+ }
1165
+
1166
+ .token-search-row {
1167
+ flex-direction: column;
1168
+ gap: 1rem;
1169
+ }
1170
+
1171
+ .token-search-controls {
1172
+ justify-content: space-between;
1173
+ flex-wrap: wrap;
1174
+ gap: 0.5rem;
1175
+ }
1176
+
1177
+ .token-search-btn {
1178
+ flex: 1;
1179
+ min-width: 60px;
1180
+ }
1181
+
1182
+ .token-search-count {
1183
+ order: -1;
1184
+ width: 100%;
1185
+ text-align: center;
1186
+ margin-bottom: 0.5rem;
1187
+ }
1188
+
1189
+ .token-search-input {
1190
+ font-size: 16px; /* Prevents zoom on iOS */
1191
+ }
1192
+
1193
+ .frequency-chart-container {
1194
+ margin-top: 1rem;
1195
+ padding: 0.75rem;
1196
+ }
1197
+
1198
+ .frequency-chart-title {
1199
+ flex-direction: column;
1200
+ gap: 0.75rem;
1201
+ align-items: stretch;
1202
+ text-align: center;
1203
+ }
1204
+
1205
+ .frequency-item {
1206
+ flex-direction: column;
1207
+ gap: 0.5rem;
1208
+ align-items: stretch;
1209
+ }
1210
+
1211
+ .frequency-token {
1212
+ align-self: flex-start;
1213
+ min-width: auto;
1214
+ }
1215
+
1216
+ .frequency-bar-container {
1217
+ width: 100%;
1218
+ }
1219
+
1220
+ .tokenizer-info-tooltip {
1221
+ width: 280px;
1222
+ left: -50px;
1223
+ }
1224
+
1225
+ .tokenizer-info-grid {
1226
+ grid-template-columns: 1fr;
1227
+ }
1228
+
1229
+ .file-info {
1230
+ max-width: 200px;
1231
+ font-size: 0.8rem;
1232
+ }
1233
+
1234
+ .loading-content {
1235
+ padding: 1.5rem;
1236
+ min-width: 150px;
1237
+ }
1238
+
1239
+ .loading-text {
1240
+ font-size: 1rem;
1241
+ }
1242
+
1243
+ .button-container {
1244
+ flex-direction: column;
1245
+ align-items: stretch;
1246
+ }
1247
+
1248
+ button {
1249
+ padding: 1rem 2rem;
1250
+ font-size: 1rem;
1251
+ }
1252
+ }
1253
+
1254
+ @media (max-width: 480px) {
1255
+ body {
1256
+ padding: 0.5rem;
1257
+ }
1258
+
1259
+ .title {
1260
+ font-size: 1.75rem;
1261
+ }
1262
+
1263
+ .card {
1264
+ padding: 1rem;
1265
+ }
1266
+
1267
+ .stat-card {
1268
+ padding: 0.75rem;
1269
+ }
1270
+
1271
+ .token-container {
1272
+ padding: 0.75rem;
1273
+ gap: 0.25rem;
1274
+ }
1275
+
1276
+ .token {
1277
+ padding: 0.2rem 0.4rem;
1278
+ font-size: 0.7rem;
1279
+ }
1280
+
1281
+ .tokenizer-info-tooltip {
1282
+ width: 260px;
1283
+ left: -60px;
1284
+ }
1285
+
1286
+ .frequency-chart-container {
1287
+ padding: 0.5rem;
1288
+ }
1289
+
1290
+ .frequency-item {
1291
+ padding: 0.375rem;
1292
+ }
1293
+
1294
+ .frequency-token {
1295
+ font-size: 0.7rem;
1296
+ padding: 0.2rem 0.4rem;
1297
+ }
1298
+ }
app/static/js/main.js ADDED
@@ -0,0 +1,837 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $(document).ready(function() {
2
+ // File handling variables
3
+ let currentFile = null;
4
+ let originalTextContent = null;
5
+ let lastUploadedFileName = null;
6
+ let fileJustUploaded = false; // Flag to prevent immediate detachment
7
+ let currentModelType = window.tokenizerData?.model_type || 'predefined';
8
+ let currentTokenizerInfo = null;
9
+
10
+ // Try to parse tokenizer info if available from server
11
+ try {
12
+ currentTokenizerInfo = window.tokenizerData?.tokenizer_info || null;
13
+ if (currentTokenizerInfo) {
14
+ updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
15
+ }
16
+ } catch(e) {
17
+ console.error("Error parsing tokenizer info:", e);
18
+ }
19
+
20
+ // Show error if exists
21
+ if (window.tokenizerData?.error) {
22
+ showError(window.tokenizerData.error);
23
+ }
24
+
25
+ // Setup model type based on initial state
26
+ if (currentModelType === "custom") {
27
+ $('.toggle-option').removeClass('active');
28
+ $('.custom-toggle').addClass('active');
29
+ $('#predefinedModelSelector').hide();
30
+ $('#customModelSelector').show();
31
+ }
32
+
33
+ // Show success badge if custom model loaded successfully
34
+ if (currentModelType === "custom" && !window.tokenizerData?.error) {
35
+ $('#modelSuccessBadge').addClass('show');
36
+ setTimeout(() => {
37
+ $('#modelSuccessBadge').removeClass('show');
38
+ }, 3000);
39
+ }
40
+
41
+ // Toggle between predefined and custom model inputs
42
+ $('.toggle-option').click(function() {
43
+ const modelType = $(this).data('type');
44
+ $('.toggle-option').removeClass('active');
45
+ $(this).addClass('active');
46
+ currentModelType = modelType;
47
+
48
+ if (modelType === 'predefined') {
49
+ $('#predefinedModelSelector').show();
50
+ $('#customModelSelector').hide();
51
+ $('#modelTypeInput').val('predefined');
52
+ // Set the model input value to the selected predefined model
53
+ $('#modelInput').val($('#modelSelect').val());
54
+ } else {
55
+ $('#predefinedModelSelector').hide();
56
+ $('#customModelSelector').show();
57
+ $('#modelTypeInput').val('custom');
58
+ }
59
+
60
+ // Clear tokenizer info if switching models
61
+ if (modelType === 'predefined') {
62
+ $('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
63
+ fetchTokenizerInfo($('#modelSelect').val(), false);
64
+ } else {
65
+ $('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
66
+ // Only fetch if there's a custom model value
67
+ const customModel = $('#customModelInput').val();
68
+ if (customModel) {
69
+ fetchTokenizerInfo(customModel, true);
70
+ }
71
+ }
72
+ });
73
+
74
+ // Update hidden input when custom model input changes
75
+ $('#customModelInput').on('input', function() {
76
+ $('#customModelInputHidden').val($(this).val());
77
+ });
78
+
79
+ function showError(message) {
80
+ const errorDiv = $('#errorMessage');
81
+ errorDiv.text(message);
82
+ errorDiv.show();
83
+ setTimeout(() => errorDiv.fadeOut(), 5000);
84
+ }
85
+
86
+ // Function to update tokenizer info display in tooltip
87
+ function updateTokenizerInfoDisplay(info, isCustom = false) {
88
+ const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
89
+ let htmlContent = '';
90
+
91
+
92
+ if (info.error) {
93
+ $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
94
+ return;
95
+ }
96
+
97
+ // Start building the tooltip content
98
+ htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
99
+ <div class="tokenizer-info-grid">`;
100
+
101
+ // Dictionary size
102
+ if (info.vocab_size) {
103
+ htmlContent += `
104
+ <div class="tokenizer-info-item">
105
+ <span class="tokenizer-info-label">Dictionary Size</span>
106
+ <span class="tokenizer-info-value">${info.vocab_size.toLocaleString()}</span>
107
+ </div>`;
108
+ }
109
+
110
+ // Tokenizer type
111
+ if (info.tokenizer_type) {
112
+ htmlContent += `
113
+ <div class="tokenizer-info-item">
114
+ <span class="tokenizer-info-label">Tokenizer Type</span>
115
+ <span class="tokenizer-info-value">${info.tokenizer_type}</span>
116
+ </div>`;
117
+ }
118
+
119
+
120
+ // Max length
121
+ if (info.model_max_length) {
122
+ htmlContent += `
123
+ <div class="tokenizer-info-item">
124
+ <span class="tokenizer-info-label">Max Length</span>
125
+ <span class="tokenizer-info-value">${info.model_max_length.toLocaleString()}</span>
126
+ </div>`;
127
+ }
128
+
129
+ htmlContent += `</div>`; // Close tokenizer-info-grid
130
+
131
+ // Special tokens section
132
+ if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
133
+ htmlContent += `
134
+ <div class="tokenizer-info-item" style="margin-top: 0.75rem;">
135
+ <span class="tokenizer-info-label">Special Tokens</span>
136
+ <div class="special-tokens-container">`;
137
+
138
+ // Add each special token with proper escaping for HTML special characters
139
+ for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
140
+ // Properly escape HTML special characters
141
+ const escapedValue = tokenValue
142
+ .replace(/&/g, '&amp;')
143
+ .replace(/</g, '&lt;')
144
+ .replace(/>/g, '&gt;')
145
+ .replace(/"/g, '&quot;')
146
+ .replace(/'/g, '&#039;');
147
+
148
+ htmlContent += `
149
+ <div class="special-token-item">
150
+ <span class="token-name">${tokenName}:</span>
151
+ <span class="token-value">${escapedValue}</span>
152
+ </div>`;
153
+ }
154
+
155
+ htmlContent += `
156
+ </div>
157
+ </div>`;
158
+ }
159
+
160
+ $(targetSelector).html(htmlContent);
161
+ }
162
+
163
+ // Function to show loading overlay
164
+ function showLoadingOverlay(text = 'Loading...') {
165
+ $('#loadingText').text(text);
166
+ $('#loadingOverlay').addClass('active');
167
+ }
168
+
169
+ // Function to hide loading overlay
170
+ function hideLoadingOverlay() {
171
+ $('#loadingOverlay').removeClass('active');
172
+ }
173
+
174
+ // Function to fetch tokenizer info
175
+ function fetchTokenizerInfo(modelId, isCustom = false) {
176
+ if (!modelId) return;
177
+
178
+ const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
179
+ $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
180
+
181
+ $.ajax({
182
+ url: '/tokenizer-info',
183
+ method: 'GET',
184
+ data: {
185
+ model_id: modelId,
186
+ is_custom: isCustom
187
+ },
188
+ success: function(response) {
189
+ if (response.error) {
190
+ $(targetSelector).html(`<div class="tokenizer-info-error">${response.error}</div>`);
191
+ } else {
192
+ currentTokenizerInfo = response;
193
+ updateTokenizerInfoDisplay(response, isCustom);
194
+ }
195
+ },
196
+ error: function(xhr) {
197
+ $(targetSelector).html('<div class="tokenizer-info-error">Failed to load tokenizer information</div>');
198
+ }
199
+ });
200
+ }
201
+
202
+ // Token search functionality
203
+ let searchMatches = [];
204
+ let currentSearchIndex = -1;
205
+ let searchVisible = false;
206
+
207
+ // Token frequency functionality
208
+ let tokenFrequencyData = [];
209
+ let showFrequencyChart = false;
210
+
211
+ function performTokenSearch(searchTerm) {
212
+ const tokenContainer = $('#tokenContainer');
213
+ const tokens = tokenContainer.find('.token');
214
+
215
+ // Clear previous highlights
216
+ tokens.removeClass('highlighted current');
217
+ searchMatches = [];
218
+ currentSearchIndex = -1;
219
+
220
+ if (!searchTerm.trim()) {
221
+ updateSearchCount();
222
+ return;
223
+ }
224
+
225
+ const searchLower = searchTerm.toLowerCase();
226
+
227
+ // Find matching tokens
228
+ tokens.each(function(index) {
229
+ const tokenText = $(this).text().toLowerCase();
230
+ if (tokenText.includes(searchLower)) {
231
+ $(this).addClass('highlighted');
232
+ searchMatches.push(index);
233
+ }
234
+ });
235
+
236
+ updateSearchCount();
237
+
238
+ // Navigate to first match if any
239
+ if (searchMatches.length > 0) {
240
+ navigateToMatch(0);
241
+ }
242
+ }
243
+
244
+ function navigateToMatch(index) {
245
+ if (searchMatches.length === 0) return;
246
+
247
+ // Remove current highlight
248
+ $('.token.current').removeClass('current');
249
+
250
+ // Update current index
251
+ currentSearchIndex = index;
252
+
253
+ // Highlight current match
254
+ const tokenContainer = $('#tokenContainer');
255
+ const tokens = tokenContainer.find('.token');
256
+ const currentToken = tokens.eq(searchMatches[currentSearchIndex]);
257
+ currentToken.addClass('current');
258
+
259
+ // Scroll to current match - improved logic
260
+ const scrollContainer = tokenContainer;
261
+ const containerOffset = scrollContainer.offset();
262
+ const tokenOffset = currentToken.offset();
263
+
264
+ if (containerOffset && tokenOffset) {
265
+ const containerHeight = scrollContainer.height();
266
+ const containerScrollTop = scrollContainer.scrollTop();
267
+ const tokenRelativeTop = tokenOffset.top - containerOffset.top;
268
+
269
+ // Check if token is outside visible area
270
+ if (tokenRelativeTop < 0 || tokenRelativeTop > containerHeight - 50) {
271
+ // Calculate new scroll position to center the token
272
+ const tokenHeight = currentToken.outerHeight();
273
+ const newScrollTop = containerScrollTop + tokenRelativeTop - (containerHeight / 2) + (tokenHeight / 2);
274
+
275
+ scrollContainer.animate({
276
+ scrollTop: Math.max(0, newScrollTop)
277
+ }, 400, 'swing');
278
+ }
279
+ }
280
+
281
+ updateSearchCount();
282
+ }
283
+
284
+ function toggleSearchVisibility() {
285
+ console.log('toggleSearchVisibility called, current state:', searchVisible);
286
+ searchVisible = !searchVisible;
287
+ const container = $('#tokenSearchContainer');
288
+ const toggleBtn = $('#searchToggleBtn');
289
+
290
+ console.log('Container found:', container.length, 'Toggle button found:', toggleBtn.length);
291
+
292
+ if (searchVisible) {
293
+ // Show the container first, then animate
294
+ container.show();
295
+ setTimeout(() => {
296
+ container.addClass('show');
297
+ }, 10);
298
+ toggleBtn.addClass('active');
299
+ console.log('Showing search container');
300
+ setTimeout(() => {
301
+ $('#tokenSearchInput').focus();
302
+ }, 300);
303
+ } else {
304
+ container.removeClass('show');
305
+ toggleBtn.removeClass('active');
306
+ console.log('Hiding search container');
307
+ setTimeout(() => {
308
+ container.hide();
309
+ }, 300);
310
+ // Clear search when hiding
311
+ $('#tokenSearchInput').val('');
312
+ performTokenSearch('');
313
+ }
314
+ }
315
+
316
+ function updateSearchCount() {
317
+ const countText = searchMatches.length > 0
318
+ ? `${currentSearchIndex + 1}/${searchMatches.length}`
319
+ : `0/${searchMatches.length}`;
320
+ $('#searchCount').text(countText);
321
+
322
+ // Update navigation button states
323
+ $('#prevMatch').prop('disabled', searchMatches.length === 0 || currentSearchIndex <= 0);
324
+ $('#nextMatch').prop('disabled', searchMatches.length === 0 || currentSearchIndex >= searchMatches.length - 1);
325
+ }
326
+
327
+ // Token frequency chart functions
328
+ function calculateTokenFrequency(tokens) {
329
+ const frequencyMap = {};
330
+
331
+ tokens.each(function() {
332
+ const tokenText = $(this).text();
333
+ if (tokenText.trim()) {
334
+ frequencyMap[tokenText] = (frequencyMap[tokenText] || 0) + 1;
335
+ }
336
+ });
337
+
338
+ // Convert to array and sort by frequency
339
+ const frequencyArray = Object.entries(frequencyMap)
340
+ .map(([token, count]) => ({ token, count }))
341
+ .sort((a, b) => b.count - a.count)
342
+ .slice(0, 10); // Top 10 tokens
343
+
344
+ return frequencyArray;
345
+ }
346
+
347
+ function renderFrequencyChart(frequencyData) {
348
+ const chartContainer = $('#frequencyChart');
349
+ chartContainer.empty();
350
+
351
+ if (frequencyData.length === 0) {
352
+ chartContainer.html('<div style="text-align: center; color: var(--secondary-text); padding: 1rem;">No token data available</div>');
353
+ return;
354
+ }
355
+
356
+ const maxCount = frequencyData[0].count;
357
+
358
+ frequencyData.forEach(({ token, count }) => {
359
+ const percentage = (count / maxCount) * 100;
360
+
361
+ const item = $(`
362
+ <div class="frequency-item">
363
+ <div class="frequency-token" data-token="${token}">${token}</div>
364
+ <div class="frequency-bar-container">
365
+ <div class="frequency-bar">
366
+ <div class="frequency-bar-fill" style="width: ${percentage}%"></div>
367
+ </div>
368
+ <div class="frequency-count">${count}</div>
369
+ </div>
370
+ </div>
371
+ `);
372
+
373
+ // Add click handler to search for this token
374
+ item.find('.frequency-token').click(function() {
375
+ const searchToken = $(this).data('token');
376
+ $('#tokenSearchInput').val(searchToken);
377
+ performTokenSearch(searchToken);
378
+ });
379
+
380
+ chartContainer.append(item);
381
+ });
382
+ }
383
+
384
+ function toggleFrequencyChart() {
385
+ showFrequencyChart = !showFrequencyChart;
386
+ const container = $('#frequencyChartContainer');
387
+ const chart = $('#frequencyChart');
388
+ const toggleBtn = $('#toggleFrequencyChart');
389
+
390
+ if (showFrequencyChart) {
391
+ container.show();
392
+ chart.show();
393
+ toggleBtn.text('Hide Chart').addClass('active');
394
+
395
+ // Calculate and render frequency data
396
+ const tokens = $('#tokenContainer').find('.token');
397
+ tokenFrequencyData = calculateTokenFrequency(tokens);
398
+ renderFrequencyChart(tokenFrequencyData);
399
+ } else {
400
+ chart.hide();
401
+ toggleBtn.text('Show Chart').removeClass('active');
402
+ }
403
+ }
404
+
405
+ function updateResults(data) {
406
+ $('#results').show();
407
+
408
+ // Show search toggle button and frequency chart container
409
+ $('#searchToggleBtn').show();
410
+ $('#frequencyChartContainer').show();
411
+
412
+ // Update tokens
413
+ const tokenContainer = $('#tokenContainer');
414
+ tokenContainer.empty();
415
+ data.tokens.forEach(token => {
416
+ const span = $('<span>')
417
+ .addClass('token')
418
+ .css({
419
+ 'background-color': token.colors.background,
420
+ 'color': token.colors.text
421
+ })
422
+ // Include token id in the tooltip on hover
423
+ .attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`)
424
+ .text(token.display);
425
+
426
+ tokenContainer.append(span);
427
+ if (token.newline) {
428
+ tokenContainer.append('<br>');
429
+ }
430
+ });
431
+
432
+ // Re-apply current search if any
433
+ const currentSearch = $('#tokenSearchInput').val();
434
+ if (currentSearch.trim()) {
435
+ performTokenSearch(currentSearch);
436
+ }
437
+
438
+ // Update display limit notice
439
+ if (data.display_limit_reached) {
440
+ $('#displayLimitNotice').show();
441
+ $('#totalTokenCount').text(data.total_tokens);
442
+ } else {
443
+ $('#displayLimitNotice').hide();
444
+ }
445
+
446
+ // Update preview notice
447
+ if (data.preview_only) {
448
+ $('#previewNotice').show();
449
+ } else {
450
+ $('#previewNotice').hide();
451
+ }
452
+
453
+ // Update basic stats
454
+ $('#totalTokens').text(data.stats.basic_stats.total_tokens);
455
+ $('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`);
456
+ $('#uniquePercentage').text(data.stats.basic_stats.unique_percentage);
457
+ $('#specialTokens').text(data.stats.basic_stats.special_tokens);
458
+ $('#spaceTokens').text(data.stats.basic_stats.space_tokens);
459
+ $('#spaceCount').text(data.stats.basic_stats.space_tokens);
460
+ $('#newlineCount').text(data.stats.basic_stats.newline_tokens);
461
+ $('#compressionRatio').text(data.stats.basic_stats.compression_ratio);
462
+
463
+ // Update length stats
464
+ $('#avgLength').text(data.stats.length_stats.avg_length);
465
+ $('#medianLength').text(data.stats.length_stats.median_length);
466
+ $('#stdDev').text(data.stats.length_stats.std_dev);
467
+
468
+ // Update tokenizer info if available
469
+ if (data.tokenizer_info) {
470
+ currentTokenizerInfo = data.tokenizer_info;
471
+ updateTokenizerInfoDisplay(data.tokenizer_info, currentModelType === 'custom');
472
+ }
473
+ }
474
+
475
+ // Handle text changes to detach file
476
+ $('#textInput').on('input', function() {
477
+ // Skip if file was just uploaded (prevents immediate detachment)
478
+ if (fileJustUploaded) {
479
+ fileJustUploaded = false;
480
+ return;
481
+ }
482
+
483
+ const currentText = $(this).val();
484
+ const fileInput = document.getElementById('fileInput');
485
+
486
+ // Only detach if a file exists and text has been substantially modified
487
+ if (fileInput.files.length > 0 && originalTextContent !== null) {
488
+ // Check if the text is completely different or has been significantly changed
489
+ // This allows for small edits without detaching
490
+ const isMajorChange =
491
+ currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
492
+ (currentText.length > 0 &&
493
+ currentText !== originalTextContent.substring(0, currentText.length) &&
494
+ currentText.substring(0, Math.min(20, currentText.length)) !==
495
+ originalTextContent.substring(0, Math.min(20, currentText.length)));
496
+
497
+ if (isMajorChange) {
498
+ detachFile();
499
+ }
500
+ }
501
+ });
502
+
503
+ // Function to detach file
504
+ function detachFile() {
505
+ // Clear the file input
506
+ $('#fileInput').val('');
507
+ // Hide file info
508
+ $('#fileInfo').fadeOut(300);
509
+ // Reset the original content tracker
510
+ originalTextContent = $('#textInput').val();
511
+ // Reset last uploaded filename
512
+ lastUploadedFileName = null;
513
+ }
514
+
515
+ // For model changes
516
+ $('#modelSelect').change(function() {
517
+ const selectedModel = $(this).val();
518
+ $('#modelInput').val(selectedModel);
519
+
520
+ // Fetch tokenizer info for the selected model
521
+ fetchTokenizerInfo(selectedModel, false);
522
+
523
+ // If text exists, submit the form
524
+ if ($('#textInput').val().trim()) {
525
+ $('#analyzeForm').submit();
526
+ }
527
+ });
528
+
529
+ // File drop handling
530
+ const fileDropZone = $('#fileDropZone');
531
+ const fileUploadIcon = $('#fileUploadIcon');
532
+
533
+ // Prevent default drag behaviors
534
+ ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
535
+ fileDropZone[0].addEventListener(eventName, preventDefaults, false);
536
+ document.body.addEventListener(eventName, preventDefaults, false);
537
+ });
538
+
539
+ function preventDefaults(e) {
540
+ e.preventDefault();
541
+ e.stopPropagation();
542
+ }
543
+
544
+ // Show drop zone when file is dragged over the document
545
+ document.addEventListener('dragenter', showDropZone, false);
546
+ document.addEventListener('dragover', showDropZone, false);
547
+
548
+ fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
549
+ fileDropZone[0].addEventListener('drop', hideDropZone, false);
550
+
551
+ function showDropZone(e) {
552
+ fileDropZone.addClass('active');
553
+ }
554
+
555
+ function hideDropZone() {
556
+ fileDropZone.removeClass('active');
557
+ }
558
+
559
+ // Handle dropped files
560
+ fileDropZone[0].addEventListener('drop', handleDrop, false);
561
+
562
+ function handleDrop(e) {
563
+ const dt = e.dataTransfer;
564
+ const files = dt.files;
565
+ handleFiles(files);
566
+ }
567
+
568
+ // Also handle file selection via click on the icon
569
+ fileUploadIcon.on('click', function() {
570
+ const input = document.createElement('input');
571
+ input.type = 'file';
572
+ input.onchange = e => {
573
+ handleFiles(e.target.files);
574
+ };
575
+ input.click();
576
+ });
577
+
578
+ function handleFiles(files) {
579
+ if (files.length) {
580
+ const file = files[0];
581
+ currentFile = file;
582
+ lastUploadedFileName = file.name;
583
+ fileJustUploaded = true; // Set flag to prevent immediate detachment
584
+
585
+ // Show file info with animation and add detach button
586
+ $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
587
+
588
+ // Add click handler for detach button
589
+ $('#fileDetach').on('click', function(e) {
590
+ e.stopPropagation(); // Prevent event bubbling
591
+ detachFile();
592
+ return false;
593
+ });
594
+
595
+ // Set the file to the file input
596
+ const dataTransfer = new DataTransfer();
597
+ dataTransfer.items.add(file);
598
+ document.getElementById('fileInput').files = dataTransfer.files;
599
+
600
+ // Preview in textarea (first 8096 chars)
601
+ const reader = new FileReader();
602
+ reader.onload = function(e) {
603
+ const previewText = e.target.result.slice(0, 8096);
604
+ $('#textInput').val(previewText);
605
+
606
+ // Store this as the original content AFTER setting the value
607
+ // to prevent the input event from firing and detaching immediately
608
+ setTimeout(() => {
609
+ originalTextContent = previewText;
610
+ // Automatically submit for analysis
611
+ $('#analyzeForm').submit();
612
+ }, 50);
613
+ };
614
+ reader.readAsText(file);
615
+ }
616
+ }
617
+
618
+ function formatFileSize(bytes) {
619
+ if (bytes < 1024) return bytes + ' bytes';
620
+ else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB';
621
+ else return (bytes / 1048576).toFixed(1) + ' MB';
622
+ }
623
+
624
+ // Make sure to check if there's still a file when analyzing
625
+ $('#analyzeForm').on('submit', function(e) {
626
+ e.preventDefault();
627
+
628
+ // Skip detachment check if file was just uploaded
629
+ if (!fileJustUploaded) {
630
+ // Check if text has been changed but file is still attached
631
+ const textInput = $('#textInput').val();
632
+ const fileInput = document.getElementById('fileInput');
633
+
634
+ if (fileInput.files.length > 0 &&
635
+ originalTextContent !== null &&
636
+ textInput !== originalTextContent &&
637
+ textInput.length < originalTextContent.length * 0.8) {
638
+ // Text was significantly changed but file is still attached, detach it
639
+ detachFile();
640
+ }
641
+ } else {
642
+ // Reset flag after first submission
643
+ fileJustUploaded = false;
644
+ }
645
+
646
+ // Update the hidden inputs based on current model type
647
+ if (currentModelType === 'custom') {
648
+ $('#customModelInputHidden').val($('#customModelInput').val());
649
+ } else {
650
+ $('#modelInput').val($('#modelSelect').val());
651
+ }
652
+
653
+ const formData = new FormData(this);
654
+ const analyzeButton = $('#analyzeButton');
655
+ const originalButtonText = analyzeButton.text();
656
+
657
+ analyzeButton.prop('disabled', true);
658
+ analyzeButton.html(originalButtonText + '<span class="loading-spinner"></span>');
659
+ showLoadingOverlay('Analyzing text...');
660
+
661
+ $.ajax({
662
+ url: '/',
663
+ method: 'POST',
664
+ data: formData,
665
+ processData: false,
666
+ contentType: false,
667
+ success: function(response) {
668
+ if (response.error) {
669
+ showError(response.error);
670
+ } else {
671
+ updateResults(response);
672
+
673
+ // Show success badge if custom model
674
+ if (currentModelType === 'custom') {
675
+ $('#modelSuccessBadge').addClass('show');
676
+ setTimeout(() => {
677
+ $('#modelSuccessBadge').removeClass('show');
678
+ }, 3000);
679
+ }
680
+ }
681
+ },
682
+ error: function(xhr) {
683
+ showError(xhr.responseText || 'An error occurred while processing the text');
684
+ },
685
+ complete: function() {
686
+ analyzeButton.prop('disabled', false);
687
+ analyzeButton.text(originalButtonText);
688
+ hideLoadingOverlay();
689
+ }
690
+ });
691
+ });
692
+
693
+ $('#expandButton').click(function() {
694
+ const container = $('#tokenContainer');
695
+ const isExpanded = container.hasClass('expanded');
696
+
697
+ container.toggleClass('expanded');
698
+ $(this).text(isExpanded ? 'Show More' : 'Show Less');
699
+ });
700
+
701
+ // Initialize tokenizer info for current model
702
+ if (currentModelType === 'predefined') {
703
+ fetchTokenizerInfo($('#modelSelect').val(), false);
704
+ } else if ($('#customModelInput').val()) {
705
+ fetchTokenizerInfo($('#customModelInput').val(), true);
706
+ }
707
+
708
+ // Add event listener for custom model input
709
+ $('#customModelInput').on('change', function() {
710
+ const modelValue = $(this).val();
711
+ if (modelValue) {
712
+ fetchTokenizerInfo(modelValue, true);
713
+ }
714
+ });
715
+
716
+ // Keyboard shortcuts - specifically for textarea
717
+ $('#textInput').keydown(function(e) {
718
+ // Ctrl+Enter (or Cmd+Enter on Mac) to analyze
719
+ if ((e.ctrlKey || e.metaKey) && (e.keyCode === 13 || e.which === 13)) {
720
+ e.preventDefault();
721
+ if ($(this).val().trim()) {
722
+ $('#analyzeForm').submit();
723
+ }
724
+ return false;
725
+ }
726
+ });
727
+
728
+ // Global keyboard shortcuts
729
+ $(document).keydown(function(e) {
730
+ // Ctrl+F (or Cmd+F on Mac) to toggle search
731
+ if ((e.ctrlKey || e.metaKey) && (e.keyCode === 70 || e.which === 70)) {
732
+ if ($('#searchToggleBtn').is(':visible')) {
733
+ e.preventDefault();
734
+ if (!searchVisible) {
735
+ toggleSearchVisibility();
736
+ } else {
737
+ $('#tokenSearchInput').focus();
738
+ }
739
+ return false;
740
+ }
741
+ }
742
+
743
+ // Escape to close search or loading overlay
744
+ if (e.keyCode === 27 || e.which === 27) {
745
+ if (searchVisible) {
746
+ toggleSearchVisibility();
747
+ return false;
748
+ }
749
+ if ($('#loadingOverlay').hasClass('active')) {
750
+ // Don't close if there's an active request
751
+ return false;
752
+ }
753
+ }
754
+ });
755
+
756
+ // Add keyboard shortcut hint to the textarea placeholder
757
+ $('#textInput').attr('placeholder', 'Enter text to analyze or upload a file in bottom left corner... (Ctrl+Enter to analyze)');
758
+
759
+ // Token search event handlers
760
+ $('#tokenSearchInput').on('input', function() {
761
+ const searchTerm = $(this).val();
762
+ performTokenSearch(searchTerm);
763
+ });
764
+
765
+ $('#nextMatch').click(function() {
766
+ if (currentSearchIndex < searchMatches.length - 1) {
767
+ navigateToMatch(currentSearchIndex + 1);
768
+ }
769
+ });
770
+
771
+ $('#prevMatch').click(function() {
772
+ if (currentSearchIndex > 0) {
773
+ navigateToMatch(currentSearchIndex - 1);
774
+ }
775
+ });
776
+
777
+ $('#clearSearch').click(function() {
778
+ $('#tokenSearchInput').val('');
779
+ performTokenSearch('');
780
+ });
781
+
782
+ // Additional keyboard shortcuts for search
783
+ $('#tokenSearchInput').keydown(function(e) {
784
+ if (e.keyCode === 13) { // Enter
785
+ e.preventDefault();
786
+ if (e.shiftKey) {
787
+ // Shift+Enter: previous match
788
+ $('#prevMatch').click();
789
+ } else {
790
+ // Enter: next match
791
+ $('#nextMatch').click();
792
+ }
793
+ } else if (e.keyCode === 27) { // Escape
794
+ $('#clearSearch').click();
795
+ $(this).blur();
796
+ }
797
+ });
798
+
799
+ // Search toggle handler using event delegation
800
+ $(document).on('click', '#searchToggleBtn', function(e) {
801
+ console.log('Search toggle button clicked!');
802
+ e.preventDefault();
803
+ e.stopPropagation();
804
+ toggleSearchVisibility();
805
+ return false;
806
+ });
807
+
808
+ // Frequency chart toggle handler
809
+ $('#toggleFrequencyChart').click(function() {
810
+ toggleFrequencyChart();
811
+ });
812
+
813
+ // Mobile touch enhancements
814
+ function addTouchSupport() {
815
+ // Add touch-friendly double-tap for expand/collapse
816
+ let lastTap = 0;
817
+ $('#tokenContainer').on('touchend', function(e) {
818
+ const currentTime = new Date().getTime();
819
+ const tapLength = currentTime - lastTap;
820
+ if (tapLength < 500 && tapLength > 0) {
821
+ $('#expandButton').click();
822
+ e.preventDefault();
823
+ }
824
+ lastTap = currentTime;
825
+ });
826
+
827
+ // Improve touch scrolling for token container
828
+ $('#tokenContainer').on('touchstart', function(e) {
829
+ this.scrollTop = this.scrollTop;
830
+ });
831
+ }
832
+
833
+ // Check if mobile device and add touch support
834
+ if ('ontouchstart' in window || navigator.maxTouchPoints > 0) {
835
+ addTouchSupport();
836
+ }
837
+ });
app/templates/index.html ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Tokenizer Pro</title>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
8
+ <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
9
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
10
+ <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
11
+ </head>
12
+ <body>
13
+ <!-- Hidden File Drop Zone that appears when dragging files -->
14
+ <div id="fileDropZone" class="file-drop-zone">
15
+ <div class="drop-indicator">
16
+ <div class="file-icon">📄</div>
17
+ <p>Drop your file here</p>
18
+ </div>
19
+ </div>
20
+
21
+ <!-- Loading overlay -->
22
+ <div id="loadingOverlay" class="loading-overlay">
23
+ <div class="loading-content">
24
+ <div class="loading-spinner large"></div>
25
+ <div class="loading-text" id="loadingText">Analyzing text...</div>
26
+ </div>
27
+ </div>
28
+
29
+ <!-- File upload icon in bottom left corner -->
30
+ <div id="fileUploadIcon" class="file-upload-icon">
31
+ <span>📎</span>
32
+ </div>
33
+ <p class="file-info" id="fileInfo"></p>
34
+
35
+ <div class="container">
36
+ <div class="header">
37
+ <div class="title-section">
38
+ <h1 class="title">Tokenizer Pro</h1>
39
+ <p class="subtitle">Advanced tokenization analysis and visualization</p>
40
+ </div>
41
+ <div class="model-selector">
42
+ <div class="model-selector-header">
43
+ <div class="model-type-toggle">
44
+ <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
45
+ <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
46
+ </div>
47
+ </div>
48
+ <div id="predefinedModelSelector">
49
+ <div style="position: relative;">
50
+ <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
51
+ <!-- TOOLTIP MOVED HERE -->
52
+ <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
53
+ <div id="tokenizerInfoContent">
54
+ <div class="tokenizer-info-loading">
55
+ <div class="tokenizer-info-spinner"></div>
56
+ </div>
57
+ </div>
58
+ </div>
59
+ <!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
60
+ <select id="modelSelect" name="model">
61
+ {% for model_id, info in models.items() %}
62
+ <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
63
+ {{ info.alias }}
64
+ </option>
65
+ {% endfor %}
66
+ </select>
67
+ </div>
68
+ </div>
69
+ <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
70
+ <div style="position: relative;">
71
+ <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
72
+ <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
73
+ <div id="customTokenizerInfoContent">
74
+ <div class="tokenizer-info-loading">
75
+ <div class="tokenizer-info-spinner"></div>
76
+ </div>
77
+ </div>
78
+ </div>
79
+ <input type="text" id="customModelInput" class="custom-model-input"
80
+ placeholder="Enter HuggingFace model path"
81
+ value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
82
+ </div>
83
+ <span class="custom-model-help">?</span>
84
+ <div class="tooltip">
85
+ Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
86
+ The model must have a tokenizer available and must be not restricted. (with some exceptions)
87
+ Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
88
+ Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
89
+ </div>
90
+ <div class="model-badge" id="modelSuccessBadge">Loaded</div>
91
+ </div>
92
+ </div>
93
+ </div>
94
+
95
+ <div class="error-message" id="errorMessage">{{ error }}</div>
96
+
97
+ <div class="input-section">
98
+ <div class="keyboard-shortcut-hint">Ctrl+Enter</div>
99
+ <form id="analyzeForm" method="POST" enctype="multipart/form-data">
100
+ <textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
101
+ <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
102
+ <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
103
+ <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
104
+ <input type="file" name="file" id="fileInput" style="display: none;">
105
+ <div class="button-container">
106
+ <button type="submit" id="analyzeButton">Analyze Text</button>
107
+ </div>
108
+ </form>
109
+ </div>
110
+
111
+ <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
112
+ <div class="card">
113
+ <div class="card-header">
114
+ <h2 class="card-title">Token Visualization</h2>
115
+ <button type="button" class="search-toggle-btn" id="searchToggleBtn" title="Toggle token search" style="display: none;">
116
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
117
+ <path d="M15.5 14h-.79l-.28-.27C15.41 12.59 16 11.11 16 9.5 16 5.91 13.09 3 9.5 3S3 5.91 3 9.5 5.91 16 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"/>
118
+ </svg>
119
+ </button>
120
+ </div>
121
+ <div class="token-search-container" id="tokenSearchContainer" style="display: none;">
122
+ <div class="token-search-row">
123
+ <input type="text" class="token-search-input" id="tokenSearchInput" placeholder="Search tokens...">
124
+ <div class="token-search-controls">
125
+ <button class="token-search-btn" id="prevMatch">◀</button>
126
+ <span class="token-search-count" id="searchCount">0/0</span>
127
+ <button class="token-search-btn" id="nextMatch">▶</button>
128
+ <button class="token-search-btn" id="clearSearch">Clear</button>
129
+ </div>
130
+ </div>
131
+ </div>
132
+ <div class="preview-notice" id="previewNotice">
133
+ Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
134
+ </div>
135
+ <div class="token-container" id="tokenContainer">
136
+ {% if token_data %}
137
+ {% for token in token_data.tokens %}
138
+ <span class="token"
139
+ style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
140
+ title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
141
+ {{ token.display }}
142
+ </span>
143
+ {% if token.newline %}<br>{% endif %}
144
+ {% endfor %}
145
+ {% endif %}
146
+ </div>
147
+ <button class="expand-button" id="expandButton">Show More</button>
148
+ <div class="display-limit-notice" id="displayLimitNotice">
149
+ Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
150
+ </div>
151
+
152
+ <div class="frequency-chart-container" id="frequencyChartContainer" style="display: none;">
153
+ <div class="frequency-chart-title">
154
+ <span>Top Token Frequencies</span>
155
+ <button class="chart-toggle-btn" id="toggleFrequencyChart">Show Chart</button>
156
+ </div>
157
+ <div class="frequency-chart" id="frequencyChart"></div>
158
+ </div>
159
+ </div>
160
+
161
+ <div class="stats-grid">
162
+ <div class="stat-card">
163
+ <div class="stat-title">Total Tokens</div>
164
+ <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
165
+ <div class="stat-description">
166
+ <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
167
+ (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
168
+ </div>
169
+ </div>
170
+ <div class="stat-card">
171
+ <div class="stat-title">Token Types</div>
172
+ <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
173
+ <div class="stat-description">special tokens</div>
174
+ </div>
175
+ <div class="stat-card">
176
+ <div class="stat-title">Whitespace</div>
177
+ <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
178
+ <div class="stat-description">
179
+ spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
180
+ newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
181
+ </div>
182
+ </div>
183
+ <div class="stat-card">
184
+ <div class="stat-title">Token Length</div>
185
+ <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
186
+ <div class="stat-description">
187
+ median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
188
+ ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
189
+ </div>
190
+ </div>
191
+ <div class="stat-card">
192
+ <div class="stat-title">Compression</div>
193
+ <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
194
+ <div class="stat-description">characters per token</div>
195
+ </div>
196
+ </div>
197
+ </div>
198
+ </div>
199
+ <a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" class="watermark">
200
+ @bartar/tokenizers
201
+ </a>
202
+
203
+ <script>
204
+ // Pass server data to client-side JavaScript
205
+ window.tokenizerData = {
206
+ model_type: "{{ model_type if model_type else 'predefined' }}",
207
+ error: "{{ error if error else '' }}",
208
+ tokenizer_info: {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }}
209
+ };
210
+ </script>
211
+ <script src="{{ url_for('static', filename='js/main.js') }}"></script>
212
+ </body>
213
+ </html>
app/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Utility functions for Tokenizer Pro
3
+ """
app/utils/validators.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validation utilities for security and input validation
3
+ """
4
+ import os
5
+ import re
6
+ from typing import Optional
7
+ from urllib.parse import urlparse
8
+
9
+
10
+ class ValidationError(Exception):
11
+ """Custom exception for validation errors."""
12
+ pass
13
+
14
+
15
+ class Validators:
16
+ """Collection of validation functions for security and input validation."""
17
+
18
+ # Regex patterns for validation - allow numbers, letters, hyphens, underscores, dots
19
+ HUGGINGFACE_MODEL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-\.]+/[a-zA-Z0-9_\-\.]+$')
20
+ SAFE_FILENAME_PATTERN = re.compile(r'^[a-zA-Z0-9_\-\.]+$')
21
+
22
+ @staticmethod
23
+ def validate_model_path(model_path: str) -> bool:
24
+ """
25
+ Validate that a custom model path is safe and follows expected patterns.
26
+
27
+ Args:
28
+ model_path: The model path to validate
29
+
30
+ Returns:
31
+ bool: True if valid, False otherwise
32
+
33
+ Raises:
34
+ ValidationError: If the model path is invalid
35
+ """
36
+ if not model_path or not isinstance(model_path, str):
37
+ raise ValidationError("Model path cannot be empty")
38
+
39
+ # Trim whitespace
40
+ model_path = model_path.strip()
41
+
42
+ # Check for dangerous characters (excluding single forward slash for HuggingFace format)
43
+ dangerous_chars = ['..', '\\', '|', ';', '&', '$', '`', '<', '>']
44
+ if any(char in model_path for char in dangerous_chars):
45
+ raise ValidationError("Model path contains invalid characters")
46
+
47
+ # Check for multiple slashes or leading/trailing slashes
48
+ if '//' in model_path or model_path.startswith('/') or model_path.endswith('/'):
49
+ raise ValidationError("Model path contains invalid characters")
50
+
51
+ # Check if it looks like a HuggingFace model path (user/model format)
52
+ if not Validators.HUGGINGFACE_MODEL_PATTERN.match(model_path):
53
+ raise ValidationError("Model path must follow the format 'organization/model-name'")
54
+
55
+ # Check length limits
56
+ if len(model_path) > 200:
57
+ raise ValidationError("Model path is too long")
58
+
59
+ return True
60
+
61
+ @staticmethod
62
+ def validate_filename(filename: str) -> bool:
63
+ """
64
+ Validate that a filename is safe for upload.
65
+
66
+ Args:
67
+ filename: The filename to validate
68
+
69
+ Returns:
70
+ bool: True if valid, False otherwise
71
+
72
+ Raises:
73
+ ValidationError: If the filename is invalid
74
+ """
75
+ if not filename or not isinstance(filename, str):
76
+ raise ValidationError("Filename cannot be empty")
77
+
78
+ # Check for dangerous characters and patterns
79
+ dangerous_patterns = ['..', '/', '\\', '|', ';', '&', '$', '`', '<', '>']
80
+ if any(pattern in filename for pattern in dangerous_patterns):
81
+ raise ValidationError("Filename contains invalid characters")
82
+
83
+ # Check if filename starts with a dot (hidden files)
84
+ if filename.startswith('.'):
85
+ raise ValidationError("Hidden files are not allowed")
86
+
87
+ # Check length
88
+ if len(filename) > 255:
89
+ raise ValidationError("Filename is too long")
90
+
91
+ return True
92
+
93
+ @staticmethod
94
+ def validate_file_extension(filename: str, allowed_extensions: set) -> bool:
95
+ """
96
+ Validate that a file has an allowed extension.
97
+
98
+ Args:
99
+ filename: The filename to check
100
+ allowed_extensions: Set of allowed extensions (e.g., {'.txt', '.py'})
101
+
102
+ Returns:
103
+ bool: True if valid, False otherwise
104
+
105
+ Raises:
106
+ ValidationError: If the extension is not allowed
107
+ """
108
+ if not filename:
109
+ raise ValidationError("Filename cannot be empty")
110
+
111
+ _, ext = os.path.splitext(filename.lower())
112
+ if ext not in allowed_extensions:
113
+ allowed_list = ', '.join(sorted(allowed_extensions))
114
+ raise ValidationError(f"File type '{ext}' not allowed. Allowed types: {allowed_list}")
115
+
116
+ return True
117
+
118
+ @staticmethod
119
+ def validate_file_size(file_size: int, max_size: int) -> bool:
120
+ """
121
+ Validate that a file size is within limits.
122
+
123
+ Args:
124
+ file_size: Size of the file in bytes
125
+ max_size: Maximum allowed size in bytes
126
+
127
+ Returns:
128
+ bool: True if valid, False otherwise
129
+
130
+ Raises:
131
+ ValidationError: If the file is too large
132
+ """
133
+ if file_size > max_size:
134
+ max_mb = max_size / (1024 * 1024)
135
+ current_mb = file_size / (1024 * 1024)
136
+ raise ValidationError(f"File too large: {current_mb:.1f}MB (max: {max_mb:.1f}MB)")
137
+
138
+ return True
139
+
140
+ @staticmethod
141
+ def validate_text_input(text: str, max_length: int = 1000000) -> bool:
142
+ """
143
+ Validate text input for processing.
144
+
145
+ Args:
146
+ text: The text to validate
147
+ max_length: Maximum allowed length
148
+
149
+ Returns:
150
+ bool: True if valid, False otherwise
151
+
152
+ Raises:
153
+ ValidationError: If the text is invalid
154
+ """
155
+ if not isinstance(text, str):
156
+ raise ValidationError("Text input must be a string")
157
+
158
+ if len(text) > max_length:
159
+ raise ValidationError(f"Text too long: {len(text)} characters (max: {max_length})")
160
+
161
+ return True
162
+
163
+ @staticmethod
164
+ def sanitize_model_path(model_path: str) -> str:
165
+ """
166
+ Sanitize a model path by removing potentially dangerous elements.
167
+
168
+ Args:
169
+ model_path: The model path to sanitize
170
+
171
+ Returns:
172
+ str: Sanitized model path
173
+ """
174
+ if not model_path:
175
+ return ""
176
+
177
+ # Remove whitespace
178
+ sanitized = model_path.strip()
179
+
180
+ # Remove any path traversal attempts
181
+ sanitized = sanitized.replace('..', '')
182
+ sanitized = sanitized.replace('/', '')
183
+ sanitized = sanitized.replace('\\', '')
184
+
185
+ return sanitized
186
+
187
+ @staticmethod
188
+ def is_safe_path(path: str, base_path: str) -> bool:
189
+ """
190
+ Check if a path is safe and within the expected base directory.
191
+
192
+ Args:
193
+ path: The path to check
194
+ base_path: The base directory that the path should be within
195
+
196
+ Returns:
197
+ bool: True if the path is safe, False otherwise
198
+ """
199
+ try:
200
+ # Resolve both paths to absolute paths
201
+ abs_path = os.path.abspath(path)
202
+ abs_base = os.path.abspath(base_path)
203
+
204
+ # Check if the path is within the base directory
205
+ return abs_path.startswith(abs_base)
206
+ except (OSError, ValueError):
207
+ return False
208
+
209
+
210
+ # Global instance
211
+ validators = Validators()
config.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+ from typing import List
4
+
5
+ @dataclass
6
+ class Config:
7
+ """Centralized configuration for Tokenizer Pro application."""
8
+
9
+ # Flask settings
10
+ SECRET_KEY: str = os.getenv('SECRET_KEY', 'tokenizer-pro-secret-key-change-in-production')
11
+ DEBUG: bool = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes')
12
+
13
+ # File upload settings
14
+ MAX_CONTENT_LENGTH: int = int(os.getenv('MAX_CONTENT_LENGTH', 25 * 1024 * 1024)) # 25MB
15
+ UPLOAD_FOLDER: str = os.getenv('UPLOAD_FOLDER', '/tmp/tokenizer_uploads')
16
+ ALLOWED_EXTENSIONS: List[str] = field(default_factory=lambda: ['txt', 'md', 'py', 'js', 'html', 'css', 'json', 'xml', 'csv'])
17
+
18
+ # Tokenizer caching settings
19
+ CACHE_SIZE: int = int(os.getenv('CACHE_SIZE', 10))
20
+ CACHE_EXPIRATION: int = int(os.getenv('CACHE_EXPIRATION', 3600)) # 1 hour in seconds
21
+
22
+ # Display limits
23
+ MAX_DISPLAY_TOKENS: int = int(os.getenv('MAX_DISPLAY_TOKENS', 50000))
24
+ PREVIEW_CHAR_LIMIT: int = int(os.getenv('PREVIEW_CHAR_LIMIT', 8096))
25
+
26
+ # Performance settings
27
+ CHUNK_SIZE: int = int(os.getenv('CHUNK_SIZE', 1024 * 1024)) # 1MB chunks for file processing
28
+
29
+ # Security settings
30
+ VALIDATE_MODEL_PATHS: bool = os.getenv('VALIDATE_MODEL_PATHS', 'True').lower() in ('true', '1', 'yes')
31
+ ALLOWED_MODEL_PREFIXES: List[str] = field(default_factory=lambda: [
32
+ 'microsoft/', 'google/', 'meta-llama/', 'mistralai/', 'openai-community/',
33
+ 'Qwen/', 'THUDM/', 'deepseek-ai/', 'unsloth/', 'google-bert/', 'bartar/'
34
+ ])
35
+
36
+ # HuggingFace settings
37
+ HF_HOME: str = os.getenv('HF_HOME', '/tmp/huggingface')
38
+ HF_CACHE_DIR: str = os.getenv('HF_CACHE_DIR', '/tmp/huggingface/cache')
39
+
40
+ # Logging settings
41
+ LOG_LEVEL: str = os.getenv('LOG_LEVEL', 'INFO')
42
+ LOG_FILE: str = os.getenv('LOG_FILE', 'tokenizer_pro.log')
43
+ LOG_MAX_BYTES: int = int(os.getenv('LOG_MAX_BYTES', 10 * 1024 * 1024)) # 10MB
44
+ LOG_BACKUP_COUNT: int = int(os.getenv('LOG_BACKUP_COUNT', 3))
45
+
46
+ class DevelopmentConfig(Config):
47
+ """Development configuration with debug enabled."""
48
+ DEBUG = True
49
+ SECRET_KEY = 'dev-secret-key'
50
+
51
+ class ProductionConfig(Config):
52
+ """Production configuration with enhanced security."""
53
+ DEBUG = False
54
+ SECRET_KEY = os.getenv('SECRET_KEY', None)
55
+
56
+ def __post_init__(self):
57
+ if not self.SECRET_KEY:
58
+ raise ValueError("SECRET_KEY must be set in production environment")
59
+
60
+ class TestingConfig(Config):
61
+ """Testing configuration."""
62
+ TESTING = True
63
+ DEBUG = True
64
+ UPLOAD_FOLDER = '/tmp/test_uploads'
65
+ CACHE_SIZE = 2
66
+ MAX_DISPLAY_TOKENS = 100
pytest.ini ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool:pytest]
2
+ testpaths = tests
3
+ python_files = test_*.py
4
+ python_classes = Test*
5
+ python_functions = test_*
6
+ addopts = -v --tb=short --strict-markers
7
+ markers =
8
+ slow: marks tests as slow (deselect with '-m "not slow"')
9
+ integration: marks tests as integration tests
10
+ unit: marks tests as unit tests
11
+ filterwarnings =
12
+ ignore::DeprecationWarning
13
+ ignore::PendingDeprecationWarning
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ flask>=2.3.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ werkzeug>=2.3.0
5
+ psutil>=5.9.0
6
+ pytest>=7.0.0
7
+ pytest-flask>=1.2.0
run.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tokenizer Pro - Advanced tokenization analysis and visualization
4
+
5
+ This is the main entry point for the Flask application.
6
+ """
7
+
8
+ import os
9
+ from app import create_app
10
+ from config import Config, DevelopmentConfig, ProductionConfig
11
+
12
+ def get_config_class():
13
+ """Determine which configuration class to use based on environment."""
14
+ env = os.getenv('FLASK_ENV', 'development').lower()
15
+
16
+ if env == 'production':
17
+ return ProductionConfig
18
+ elif env == 'development':
19
+ return DevelopmentConfig
20
+ else:
21
+ return Config
22
+
23
+ app = create_app(get_config_class())
24
+
25
+ if __name__ == "__main__":
26
+ # Get configuration from environment variables
27
+ host = os.getenv('HOST', '0.0.0.0')
28
+ port = int(os.getenv('PORT', 7860))
29
+ debug = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes')
30
+
31
+ app.run(host=host, port=port, debug=debug)
run_tests.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test runner script for Tokenizer Pro
4
+
5
+ Usage:
6
+ python run_tests.py # Run all tests
7
+ python run_tests.py unit # Run only unit tests
8
+ python run_tests.py integration # Run only integration tests
9
+ python run_tests.py --coverage # Run with coverage report
10
+ """
11
+
12
+ import sys
13
+ import subprocess
14
+ import os
15
+
16
+
17
+ def run_command(cmd):
18
+ """Run a command and return the exit code."""
19
+ print(f"Running: {' '.join(cmd)}")
20
+ return subprocess.call(cmd)
21
+
22
+
23
+ def main():
24
+ """Main test runner function."""
25
+ args = sys.argv[1:]
26
+
27
+ # Base pytest command
28
+ pytest_cmd = ["python", "-m", "pytest"]
29
+
30
+ # Parse arguments
31
+ if "--coverage" in args:
32
+ pytest_cmd.extend(["--cov=app", "--cov-report=html", "--cov-report=term"])
33
+ args.remove("--coverage")
34
+
35
+ if "unit" in args:
36
+ pytest_cmd.extend([
37
+ "tests/test_tokenizer_service.py",
38
+ "tests/test_stats_service.py",
39
+ "tests/test_file_service.py",
40
+ "tests/test_validators.py"
41
+ ])
42
+ elif "integration" in args:
43
+ pytest_cmd.append("tests/test_routes.py")
44
+ else:
45
+ # Run all tests
46
+ pytest_cmd.append("tests/")
47
+
48
+ # Add any remaining arguments
49
+ pytest_cmd.extend(args)
50
+
51
+ # Run the tests
52
+ exit_code = run_command(pytest_cmd)
53
+
54
+ if exit_code == 0:
55
+ print("\n✅ All tests passed!")
56
+ else:
57
+ print(f"\n❌ Tests failed with exit code {exit_code}")
58
+
59
+ return exit_code
60
+
61
+
62
+ if __name__ == "__main__":
63
+ sys.exit(main())
tests/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Test package for Tokenizer Pro
3
+ """
tests/conftest.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ pytest configuration file
3
+ """
4
+ import pytest
5
+ import os
6
+ import tempfile
7
+ from unittest.mock import Mock, patch
8
+ from flask import Flask
9
+
10
+ # Add the parent directory to Python path so we can import the app
11
+ import sys
12
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from app import create_app
15
+ from config import TestingConfig
16
+
17
+
18
+ @pytest.fixture
19
+ def app():
20
+ """Create a test Flask application."""
21
+ app = create_app(TestingConfig())
22
+
23
+ # Create a temporary directory for file uploads during testing
24
+ with tempfile.TemporaryDirectory() as temp_dir:
25
+ app.config['UPLOAD_FOLDER'] = temp_dir
26
+ app.config['TESTING'] = True
27
+ yield app
28
+
29
+
30
+ @pytest.fixture
31
+ def client(app):
32
+ """Create a test client."""
33
+ return app.test_client()
34
+
35
+
36
+ @pytest.fixture
37
+ def mock_tokenizer():
38
+ """Create a mock tokenizer for testing."""
39
+ tokenizer = Mock()
40
+ tokenizer.tokenize.return_value = ['Hello', 'world', '!']
41
+ tokenizer.vocab_size = 50257
42
+ tokenizer.model_max_length = 1024
43
+ tokenizer.__class__.__name__ = 'MockTokenizer'
44
+
45
+ # Mock special tokens
46
+ tokenizer.pad_token = '<pad>'
47
+ tokenizer.eos_token = '</s>'
48
+ tokenizer.unk_token = '<unk>'
49
+ tokenizer.bos_token = '<s>'
50
+
51
+ return tokenizer
52
+
53
+
54
+ @pytest.fixture
55
+ def sample_text():
56
+ """Sample text for testing."""
57
+ return "Hello world! This is a test."
58
+
59
+
60
+ @pytest.fixture
61
+ def sample_tokens():
62
+ """Sample tokens for testing."""
63
+ return ['Hello', ' world', '!', ' This', ' is', ' a', ' test', '.']
64
+
65
+
66
+ @pytest.fixture
67
+ def temp_file():
68
+ """Create a temporary file for testing."""
69
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
70
+ f.write("Hello world! This is a test file.")
71
+ temp_path = f.name
72
+
73
+ yield temp_path
74
+
75
+ # Cleanup
76
+ if os.path.exists(temp_path):
77
+ os.unlink(temp_path)
tests/test_file_service.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for FileService
3
+ """
4
+ import pytest
5
+ import os
6
+ import tempfile
7
+ from unittest.mock import Mock, patch, mock_open
8
+ from werkzeug.datastructures import FileStorage
9
+ from io import BytesIO
10
+ from app.services.file_service import FileService
11
+
12
+
13
+ class TestFileService:
14
+ """Test cases for FileService."""
15
+
16
+ def setup_method(self):
17
+ """Set up test fixtures."""
18
+ self.service = FileService()
19
+
20
+ def test_is_allowed_file_valid_extensions(self):
21
+ """Test allowed file extension checking."""
22
+ # Valid extensions
23
+ assert self.service.is_allowed_file('test.txt') is True
24
+ assert self.service.is_allowed_file('document.md') is True
25
+ assert self.service.is_allowed_file('script.py') is True
26
+ assert self.service.is_allowed_file('code.js') is True
27
+ assert self.service.is_allowed_file('data.json') is True
28
+ assert self.service.is_allowed_file('styles.css') is True
29
+ assert self.service.is_allowed_file('page.html') is True
30
+ assert self.service.is_allowed_file('data.csv') is True
31
+ assert self.service.is_allowed_file('app.log') is True
32
+
33
+ def test_is_allowed_file_invalid_extensions(self):
34
+ """Test invalid file extensions."""
35
+ # Invalid extensions
36
+ assert self.service.is_allowed_file('virus.exe') is False
37
+ assert self.service.is_allowed_file('archive.zip') is False
38
+ assert self.service.is_allowed_file('image.jpg') is False
39
+ assert self.service.is_allowed_file('document.pdf') is False
40
+ assert self.service.is_allowed_file('data.xlsx') is False
41
+
42
+ def test_is_allowed_file_edge_cases(self):
43
+ """Test edge cases for file extension checking."""
44
+ # Empty filename
45
+ assert self.service.is_allowed_file('') is False
46
+ assert self.service.is_allowed_file(None) is False
47
+
48
+ # No extension
49
+ assert self.service.is_allowed_file('filename') is False
50
+
51
+ # Multiple dots
52
+ assert self.service.is_allowed_file('file.backup.txt') is True
53
+
54
+ # Case sensitivity
55
+ assert self.service.is_allowed_file('FILE.TXT') is True
56
+ assert self.service.is_allowed_file('Document.MD') is True
57
+
58
+ def test_generate_secure_filename_basic(self):
59
+ """Test basic secure filename generation."""
60
+ filename = self.service.generate_secure_filename('test.txt')
61
+
62
+ assert filename.endswith('_test.txt')
63
+ assert len(filename) > len('test.txt') # Should have UUID prefix
64
+
65
+ # Should be different each time
66
+ filename2 = self.service.generate_secure_filename('test.txt')
67
+ assert filename != filename2
68
+
69
+ def test_generate_secure_filename_special_characters(self):
70
+ """Test secure filename with special characters."""
71
+ # Test filename with spaces and special chars
72
+ filename = self.service.generate_secure_filename('my file name.txt')
73
+ assert 'my_file_name.txt' in filename
74
+
75
+ # Test with path separators (should be removed)
76
+ filename = self.service.generate_secure_filename('../../../etc/passwd')
77
+ assert '..' not in filename
78
+ assert '/' not in filename
79
+ assert '\\' not in filename
80
+
81
+ def test_generate_secure_filename_empty_input(self):
82
+ """Test secure filename generation with empty input."""
83
+ filename = self.service.generate_secure_filename('')
84
+ assert filename.endswith('.txt')
85
+ assert len(filename) > 4 # Should have UUID
86
+
87
+ filename = self.service.generate_secure_filename(None)
88
+ assert filename.endswith('.txt')
89
+ assert len(filename) > 4
90
+
91
+ @patch('os.makedirs')
92
+ def test_save_uploaded_file_basic(self, mock_makedirs, temp_file):
93
+ """Test basic file upload saving."""
94
+ # Create a mock uploaded file
95
+ file_content = b"Hello world!"
96
+ uploaded_file = FileStorage(
97
+ stream=BytesIO(file_content),
98
+ filename='test.txt',
99
+ content_type='text/plain'
100
+ )
101
+
102
+ upload_folder = '/tmp/test_uploads'
103
+
104
+ with patch('builtins.open', mock_open()) as mock_file:
105
+ file_path = self.service.save_uploaded_file(uploaded_file, upload_folder)
106
+
107
+ # Check that directory creation was attempted
108
+ mock_makedirs.assert_called_once_with(upload_folder, exist_ok=True)
109
+
110
+ # Check that file path has correct structure
111
+ assert file_path.startswith(upload_folder)
112
+ assert file_path.endswith('_test.txt')
113
+
114
+ def test_cleanup_file_existing(self, temp_file):
115
+ """Test cleanup of existing file."""
116
+ # Verify file exists
117
+ assert os.path.exists(temp_file)
118
+
119
+ # Cleanup
120
+ self.service.cleanup_file(temp_file)
121
+
122
+ # Verify file is deleted
123
+ assert not os.path.exists(temp_file)
124
+
125
+ def test_cleanup_file_nonexistent(self):
126
+ """Test cleanup of non-existent file (should not raise error)."""
127
+ # Should not raise an exception
128
+ self.service.cleanup_file('/path/that/does/not/exist.txt')
129
+
130
+ @patch('app.services.file_service.tokenizer_service')
131
+ @patch('app.services.file_service.stats_service')
132
+ def test_process_file_for_tokenization_basic(self, mock_stats, mock_tokenizer, temp_file):
133
+ """Test basic file processing for tokenization."""
134
+ # Mock tokenizer service
135
+ mock_tokenizer_obj = Mock()
136
+ mock_tokenizer_obj.tokenize.return_value = ['Hello', ' world', '!']
137
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {}, None)
138
+
139
+ # Mock stats service
140
+ mock_stats.get_token_stats.return_value = {
141
+ 'basic_stats': {'total_tokens': 3},
142
+ 'length_stats': {'avg_length': '2.0'}
143
+ }
144
+ mock_stats.format_tokens_for_display.return_value = [
145
+ {'display': 'Hello', 'original': 'Hello', 'token_id': 1, 'colors': {}, 'newline': False}
146
+ ]
147
+
148
+ result = self.service.process_file_for_tokenization(
149
+ file_path=temp_file,
150
+ model_id_or_name='gpt2',
151
+ preview_char_limit=1000,
152
+ max_display_tokens=100,
153
+ chunk_size=1024
154
+ )
155
+
156
+ assert isinstance(result, dict)
157
+ assert 'tokens' in result
158
+ assert 'stats' in result
159
+ assert 'display_limit_reached' in result
160
+ assert 'total_tokens' in result
161
+ assert 'preview_only' in result
162
+ assert 'tokenizer_info' in result
163
+
164
+ @patch('app.services.file_service.tokenizer_service')
165
+ def test_process_file_tokenizer_error(self, mock_tokenizer, temp_file):
166
+ """Test file processing with tokenizer error."""
167
+ # Mock tokenizer service to return error
168
+ mock_tokenizer.load_tokenizer.return_value = (None, {}, "Tokenizer error")
169
+
170
+ with pytest.raises(Exception) as excinfo:
171
+ self.service.process_file_for_tokenization(
172
+ file_path=temp_file,
173
+ model_id_or_name='invalid-model',
174
+ preview_char_limit=1000,
175
+ max_display_tokens=100
176
+ )
177
+
178
+ assert "Tokenizer error" in str(excinfo.value)
179
+
180
+ @patch('app.services.file_service.tokenizer_service')
181
+ @patch('app.services.file_service.stats_service')
182
+ def test_process_text_for_tokenization_basic(self, mock_stats, mock_tokenizer):
183
+ """Test basic text processing for tokenization."""
184
+ # Mock tokenizer service
185
+ mock_tokenizer_obj = Mock()
186
+ mock_tokenizer_obj.tokenize.return_value = ['Hello', ' world']
187
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {'vocab_size': 1000}, None)
188
+
189
+ # Mock stats service
190
+ mock_stats.get_token_stats.return_value = {
191
+ 'basic_stats': {'total_tokens': 2},
192
+ 'length_stats': {'avg_length': '3.0'}
193
+ }
194
+ mock_stats.format_tokens_for_display.return_value = [
195
+ {'display': 'Hello', 'original': 'Hello', 'token_id': 1, 'colors': {}, 'newline': False},
196
+ {'display': ' world', 'original': ' world', 'token_id': 2, 'colors': {}, 'newline': False}
197
+ ]
198
+
199
+ result = self.service.process_text_for_tokenization(
200
+ text="Hello world",
201
+ model_id_or_name='gpt2',
202
+ max_display_tokens=100
203
+ )
204
+
205
+ assert isinstance(result, dict)
206
+ assert 'tokens' in result
207
+ assert 'stats' in result
208
+ assert result['display_limit_reached'] is False
209
+ assert result['total_tokens'] == 2
210
+ assert result['tokenizer_info']['vocab_size'] == 1000
211
+
212
+ @patch('app.services.file_service.tokenizer_service')
213
+ @patch('app.services.file_service.stats_service')
214
+ def test_process_text_display_limit(self, mock_stats, mock_tokenizer):
215
+ """Test text processing with display limit."""
216
+ # Create a large number of tokens
217
+ tokens = [f'token{i}' for i in range(200)]
218
+
219
+ # Mock tokenizer service
220
+ mock_tokenizer_obj = Mock()
221
+ mock_tokenizer_obj.tokenize.return_value = tokens
222
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {}, None)
223
+
224
+ # Mock stats service
225
+ mock_stats.get_token_stats.return_value = {
226
+ 'basic_stats': {'total_tokens': 200},
227
+ 'length_stats': {'avg_length': '6.0'}
228
+ }
229
+ mock_stats.format_tokens_for_display.return_value = []
230
+
231
+ result = self.service.process_text_for_tokenization(
232
+ text="Long text",
233
+ model_id_or_name='gpt2',
234
+ max_display_tokens=100 # Limit lower than token count
235
+ )
236
+
237
+ assert result['display_limit_reached'] is True
238
+ assert result['total_tokens'] == 200
239
+
240
+ @patch('app.services.file_service.tokenizer_service')
241
+ def test_process_text_tokenizer_error(self, mock_tokenizer):
242
+ """Test text processing with tokenizer error."""
243
+ # Mock tokenizer service to return error
244
+ mock_tokenizer.load_tokenizer.return_value = (None, {}, "Model not found")
245
+
246
+ with pytest.raises(Exception) as excinfo:
247
+ self.service.process_text_for_tokenization(
248
+ text="Hello world",
249
+ model_id_or_name='invalid-model'
250
+ )
251
+
252
+ assert "Model not found" in str(excinfo.value)
253
+
254
+ @patch('app.services.file_service.tokenizer_service')
255
+ @patch('app.services.file_service.stats_service')
256
+ def test_process_text_preview_mode(self, mock_stats, mock_tokenizer):
257
+ """Test text processing in preview mode."""
258
+ long_text = "A" * 10000 # Long text
259
+
260
+ # Mock tokenizer service
261
+ mock_tokenizer_obj = Mock()
262
+ mock_tokenizer_obj.tokenize.return_value = ['A'] * 5000 # Many tokens
263
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {}, None)
264
+
265
+ # Mock stats service
266
+ mock_stats.get_token_stats.return_value = {
267
+ 'basic_stats': {'total_tokens': 5000},
268
+ 'length_stats': {'avg_length': '1.0'}
269
+ }
270
+ mock_stats.format_tokens_for_display.return_value = []
271
+
272
+ result = self.service.process_text_for_tokenization(
273
+ text=long_text,
274
+ model_id_or_name='gpt2',
275
+ is_preview=True,
276
+ preview_char_limit=100
277
+ )
278
+
279
+ assert result['preview_only'] is True
280
+
281
+ def test_allowed_extensions_constant(self):
282
+ """Test that ALLOWED_EXTENSIONS contains expected extensions."""
283
+ extensions = self.service.ALLOWED_EXTENSIONS
284
+
285
+ assert isinstance(extensions, set)
286
+
287
+ # Check for required extensions
288
+ required_extensions = {'.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv', '.log'}
289
+ assert required_extensions.issubset(extensions)
290
+
291
+ # All extensions should start with dot
292
+ for ext in extensions:
293
+ assert ext.startswith('.')
294
+ assert len(ext) > 1
tests/test_routes.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Integration tests for Flask routes
3
+ """
4
+ import pytest
5
+ import json
6
+ import tempfile
7
+ import os
8
+ from unittest.mock import patch, Mock
9
+ from io import BytesIO
10
+ from werkzeug.datastructures import FileStorage
11
+
12
+
13
+ class TestMainRoutes:
14
+ """Integration tests for main application routes."""
15
+
16
+ def test_index_get_basic(self, client):
17
+ """Test basic GET request to index."""
18
+ response = client.get('/')
19
+
20
+ assert response.status_code == 200
21
+ assert b'Tokenizer Pro' in response.data
22
+ assert b'Advanced tokenization analysis' in response.data
23
+ assert b'textarea' in response.data
24
+
25
+ def test_index_get_with_parameters(self, client):
26
+ """Test GET request with query parameters."""
27
+ response = client.get('/?model=gpt2&model_type=predefined')
28
+
29
+ assert response.status_code == 200
30
+ assert b'gpt2' in response.data or b'GPT-2' in response.data
31
+
32
+ @patch('app.services.tokenizer_service.tokenizer_service')
33
+ @patch('app.services.file_service.file_service')
34
+ def test_index_post_text_analysis(self, mock_file_service, mock_tokenizer_service, client):
35
+ """Test POST request with text analysis."""
36
+ # Mock services
37
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
38
+
39
+ mock_file_service.process_text_for_tokenization.return_value = {
40
+ 'tokens': [
41
+ {
42
+ 'display': 'Hello',
43
+ 'original': 'Hello',
44
+ 'token_id': 15496,
45
+ 'colors': {'background': '#FF5733', 'text': '#FFFFFF'},
46
+ 'newline': False
47
+ }
48
+ ],
49
+ 'stats': {
50
+ 'basic_stats': {
51
+ 'total_tokens': 1,
52
+ 'unique_tokens': 1,
53
+ 'unique_percentage': '100.0',
54
+ 'special_tokens': 0,
55
+ 'space_tokens': 0,
56
+ 'newline_tokens': 0,
57
+ 'compression_ratio': '5.0'
58
+ },
59
+ 'length_stats': {
60
+ 'avg_length': '5.0',
61
+ 'median_length': '5.0',
62
+ 'std_dev': '0.0'
63
+ }
64
+ },
65
+ 'display_limit_reached': False,
66
+ 'total_tokens': 1,
67
+ 'preview_only': False,
68
+ 'tokenizer_info': {
69
+ 'vocab_size': 50257,
70
+ 'tokenizer_type': 'GPT2TokenizerFast'
71
+ }
72
+ }
73
+
74
+ response = client.post('/', data={
75
+ 'text': 'Hello',
76
+ 'model': 'gpt2',
77
+ 'model_type': 'predefined'
78
+ })
79
+
80
+ assert response.status_code == 200
81
+ mock_file_service.process_text_for_tokenization.assert_called_once()
82
+
83
+ @patch('app.services.tokenizer_service.tokenizer_service')
84
+ @patch('app.services.file_service.file_service')
85
+ def test_index_post_ajax_request(self, mock_file_service, mock_tokenizer_service, client):
86
+ """Test AJAX POST request for text analysis."""
87
+ # Mock services
88
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
89
+
90
+ expected_response = {
91
+ 'tokens': [],
92
+ 'stats': {'basic_stats': {}, 'length_stats': {}},
93
+ 'display_limit_reached': False,
94
+ 'total_tokens': 0
95
+ }
96
+ mock_file_service.process_text_for_tokenization.return_value = expected_response
97
+
98
+ response = client.post('/',
99
+ data={'text': 'Test', 'model': 'gpt2', 'model_type': 'predefined'},
100
+ headers={'X-Requested-With': 'XMLHttpRequest'}
101
+ )
102
+
103
+ assert response.status_code == 200
104
+ assert response.content_type == 'application/json'
105
+
106
+ data = json.loads(response.data)
107
+ assert 'tokens' in data
108
+ assert 'stats' in data
109
+
110
+ @patch('app.services.file_service.file_service')
111
+ @patch('app.services.tokenizer_service.tokenizer_service')
112
+ def test_index_post_file_upload(self, mock_tokenizer_service, mock_file_service, client, app):
113
+ """Test POST request with file upload."""
114
+ with app.app_context():
115
+ # Mock services
116
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
117
+
118
+ # Mock file processing
119
+ mock_file_service.save_uploaded_file.return_value = '/tmp/test_file.txt'
120
+ mock_file_service.process_file_for_tokenization.return_value = {
121
+ 'tokens': [],
122
+ 'stats': {'basic_stats': {}, 'length_stats': {}},
123
+ 'display_limit_reached': False,
124
+ 'total_tokens': 0,
125
+ 'preview_only': True
126
+ }
127
+ mock_file_service.cleanup_file.return_value = None
128
+
129
+ # Create test file data
130
+ file_data = BytesIO(b"Hello world! This is a test file.")
131
+
132
+ response = client.post('/',
133
+ data={
134
+ 'file': (file_data, 'test.txt'),
135
+ 'model': 'gpt2',
136
+ 'model_type': 'predefined'
137
+ },
138
+ content_type='multipart/form-data'
139
+ )
140
+
141
+ assert response.status_code == 200
142
+ mock_file_service.save_uploaded_file.assert_called_once()
143
+ mock_file_service.process_file_for_tokenization.assert_called_once()
144
+ mock_file_service.cleanup_file.assert_called_once()
145
+
146
+ @patch('app.utils.validators.validators')
147
+ def test_index_post_validation_error(self, mock_validators, client):
148
+ """Test POST request with validation error."""
149
+ from app.utils.validators import ValidationError
150
+
151
+ # Mock validation to raise error
152
+ mock_validators.validate_text_input.side_effect = ValidationError("Invalid input")
153
+
154
+ response = client.post('/',
155
+ data={'text': 'Invalid text', 'model': 'gpt2'},
156
+ headers={'X-Requested-With': 'XMLHttpRequest'}
157
+ )
158
+
159
+ assert response.status_code == 400
160
+ data = json.loads(response.data)
161
+ assert 'error' in data
162
+ assert 'Invalid input' in data['error']
163
+
164
+ def test_index_post_empty_data(self, client):
165
+ """Test POST request with empty data."""
166
+ response = client.post('/', data={})
167
+
168
+ assert response.status_code == 200
169
+ # Should return the form again without processing
170
+
171
+
172
+ class TestTokenizerInfoRoute:
173
+ """Integration tests for tokenizer info route."""
174
+
175
+ @patch('app.services.tokenizer_service.tokenizer_service')
176
+ def test_tokenizer_info_predefined_model(self, mock_tokenizer_service, client):
177
+ """Test tokenizer info for predefined model."""
178
+ # Mock service
179
+ mock_tokenizer_service.is_predefined_model.return_value = True
180
+ mock_tokenizer_service.load_tokenizer.return_value = (
181
+ Mock(),
182
+ {
183
+ 'vocab_size': 50257,
184
+ 'tokenizer_type': 'GPT2TokenizerFast',
185
+ 'model_max_length': 1024,
186
+ 'special_tokens': {'eos_token': '</s>'}
187
+ },
188
+ None
189
+ )
190
+
191
+ response = client.get('/tokenizer-info?model_id=gpt2&is_custom=false')
192
+
193
+ assert response.status_code == 200
194
+ assert response.content_type == 'application/json'
195
+
196
+ data = json.loads(response.data)
197
+ assert 'vocab_size' in data
198
+ assert 'tokenizer_type' in data
199
+ assert data['vocab_size'] == 50257
200
+
201
+ @patch('app.services.tokenizer_service.tokenizer_service')
202
+ @patch('app.utils.validators.validators')
203
+ def test_tokenizer_info_custom_model(self, mock_validators, mock_tokenizer_service, client):
204
+ """Test tokenizer info for custom model."""
205
+ # Mock validation
206
+ mock_validators.validate_model_path.return_value = None
207
+
208
+ # Mock service
209
+ mock_tokenizer_service.is_predefined_model.return_value = False
210
+ mock_tokenizer_service.load_tokenizer.return_value = (
211
+ Mock(),
212
+ {
213
+ 'vocab_size': 32000,
214
+ 'tokenizer_type': 'LlamaTokenizerFast',
215
+ 'special_tokens': {}
216
+ },
217
+ None
218
+ )
219
+
220
+ response = client.get('/tokenizer-info?model_id=meta-llama/Llama-2-7b-hf&is_custom=true')
221
+
222
+ assert response.status_code == 200
223
+ data = json.loads(response.data)
224
+ assert data['vocab_size'] == 32000
225
+
226
+ def test_tokenizer_info_missing_model_id(self, client):
227
+ """Test tokenizer info without model_id."""
228
+ response = client.get('/tokenizer-info')
229
+
230
+ assert response.status_code == 400
231
+ data = json.loads(response.data)
232
+ assert 'error' in data
233
+ assert 'No model ID provided' in data['error']
234
+
235
+ @patch('app.utils.validators.validators')
236
+ def test_tokenizer_info_validation_error(self, mock_validators, client):
237
+ """Test tokenizer info with validation error."""
238
+ from app.utils.validators import ValidationError
239
+
240
+ # Mock validation to raise error
241
+ mock_validators.validate_model_path.side_effect = ValidationError("Invalid model path")
242
+
243
+ response = client.get('/tokenizer-info?model_id=invalid/path&is_custom=true')
244
+
245
+ assert response.status_code == 400
246
+ data = json.loads(response.data)
247
+ assert 'error' in data
248
+ assert 'Invalid model path' in data['error']
249
+
250
+ @patch('app.services.tokenizer_service.tokenizer_service')
251
+ def test_tokenizer_info_service_error(self, mock_tokenizer_service, client):
252
+ """Test tokenizer info with service error."""
253
+ # Mock service to return error
254
+ mock_tokenizer_service.is_predefined_model.return_value = True
255
+ mock_tokenizer_service.load_tokenizer.return_value = (None, {}, "Failed to load tokenizer")
256
+
257
+ response = client.get('/tokenizer-info?model_id=gpt2&is_custom=false')
258
+
259
+ assert response.status_code == 400
260
+ data = json.loads(response.data)
261
+ assert 'error' in data
262
+ assert 'Failed to load tokenizer' in data['error']
263
+
264
+
265
+ class TestHealthCheckRoutes:
266
+ """Integration tests for health check routes."""
267
+
268
+ def test_basic_health_check(self, client):
269
+ """Test basic health check endpoint."""
270
+ response = client.get('/health')
271
+
272
+ assert response.status_code == 200
273
+ assert response.content_type == 'application/json'
274
+
275
+ data = json.loads(response.data)
276
+ assert 'status' in data
277
+ assert 'timestamp' in data
278
+ assert 'version' in data
279
+ assert data['status'] == 'healthy'
280
+
281
+ @patch('app.services.tokenizer_service.tokenizer_service')
282
+ @patch('psutil.cpu_percent')
283
+ @patch('psutil.virtual_memory')
284
+ @patch('psutil.disk_usage')
285
+ def test_detailed_health_check(self, mock_disk, mock_memory, mock_cpu, mock_tokenizer_service, client):
286
+ """Test detailed health check endpoint."""
287
+ # Mock system info
288
+ mock_cpu.return_value = 25.5
289
+ mock_memory.return_value = Mock(total=8000000000, available=4000000000, percent=50.0, used=4000000000)
290
+ mock_disk.return_value = Mock(total=100000000000, used=50000000000, free=50000000000)
291
+
292
+ # Mock tokenizer service
293
+ mock_tokenizer_service.tokenizers = {}
294
+ mock_tokenizer_service.custom_tokenizers = {}
295
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {}}
296
+ mock_tokenizer_service.load_tokenizer.return_value = (Mock(), {}, None)
297
+
298
+ response = client.get('/health/detailed')
299
+
300
+ assert response.status_code == 200
301
+ data = json.loads(response.data)
302
+
303
+ assert 'status' in data
304
+ assert 'system' in data
305
+ assert 'services' in data
306
+ assert 'configuration' in data
307
+
308
+ # Check system info
309
+ assert 'cpu_percent' in data['system']
310
+ assert 'memory' in data['system']
311
+ assert 'disk' in data['system']
312
+
313
+ # Check services info
314
+ assert 'tokenizer_service' in data['services']
315
+ assert 'file_service' in data['services']
316
+
317
+ @patch('app.services.tokenizer_service.tokenizer_service')
318
+ def test_readiness_check_ready(self, mock_tokenizer_service, client, app):
319
+ """Test readiness check when application is ready."""
320
+ with app.app_context():
321
+ # Mock successful tokenizer loading
322
+ mock_tokenizer_service.load_tokenizer.return_value = (Mock(), {}, None)
323
+
324
+ response = client.get('/health/ready')
325
+
326
+ assert response.status_code == 200
327
+ data = json.loads(response.data)
328
+
329
+ assert 'ready' in data
330
+ assert 'checks' in data
331
+ assert isinstance(data['checks'], dict)
332
+
333
+ @patch('app.services.tokenizer_service.tokenizer_service')
334
+ def test_readiness_check_not_ready(self, mock_tokenizer_service, client):
335
+ """Test readiness check when application is not ready."""
336
+ # Mock failed tokenizer loading
337
+ mock_tokenizer_service.load_tokenizer.return_value = (None, {}, "Failed to load")
338
+
339
+ response = client.get('/health/ready')
340
+
341
+ assert response.status_code == 503
342
+ data = json.loads(response.data)
343
+
344
+ assert data['ready'] is False
345
+ assert 'checks' in data
346
+
347
+
348
+ class TestErrorHandling:
349
+ """Test error handling across routes."""
350
+
351
+ def test_404_handling(self, client):
352
+ """Test 404 error handling."""
353
+ response = client.get('/nonexistent-route')
354
+ assert response.status_code == 404
355
+
356
+ def test_405_method_not_allowed(self, client):
357
+ """Test 405 error for wrong HTTP method."""
358
+ response = client.put('/') # PUT not allowed
359
+ assert response.status_code == 405
360
+
361
+ @patch('app.services.tokenizer_service.tokenizer_service')
362
+ def test_500_internal_error(self, mock_tokenizer_service, client):
363
+ """Test 500 error handling."""
364
+ # Mock service to raise unexpected exception
365
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
366
+
367
+ with patch('app.services.file_service.file_service') as mock_file_service:
368
+ mock_file_service.process_text_for_tokenization.side_effect = Exception("Unexpected error")
369
+
370
+ response = client.post('/',
371
+ data={'text': 'Test', 'model': 'gpt2', 'model_type': 'predefined'},
372
+ headers={'X-Requested-With': 'XMLHttpRequest'}
373
+ )
374
+
375
+ assert response.status_code == 400 # Our app returns 400 for processing errors
376
+ data = json.loads(response.data)
377
+ assert 'error' in data
378
+
379
+
380
+ class TestSecurityFeatures:
381
+ """Test security features in routes."""
382
+
383
+ @patch('app.utils.validators.validators')
384
+ def test_malicious_filename_blocked(self, mock_validators, client):
385
+ """Test that malicious filenames are blocked."""
386
+ from app.utils.validators import ValidationError
387
+
388
+ # Mock validation to detect malicious filename
389
+ mock_validators.validate_filename.side_effect = ValidationError("Malicious filename detected")
390
+
391
+ file_data = BytesIO(b"test content")
392
+
393
+ response = client.post('/',
394
+ data={
395
+ 'file': (file_data, '../../../etc/passwd'),
396
+ 'model': 'gpt2',
397
+ 'model_type': 'predefined'
398
+ },
399
+ content_type='multipart/form-data',
400
+ headers={'X-Requested-With': 'XMLHttpRequest'}
401
+ )
402
+
403
+ assert response.status_code == 400
404
+ data = json.loads(response.data)
405
+ assert 'error' in data
406
+
407
+ @patch('app.utils.validators.validators')
408
+ def test_malicious_model_path_blocked(self, mock_validators, client):
409
+ """Test that malicious model paths are blocked."""
410
+ from app.utils.validators import ValidationError
411
+
412
+ # Mock validation to detect malicious model path
413
+ mock_validators.validate_model_path.side_effect = ValidationError("Untrusted model path")
414
+
415
+ response = client.post('/',
416
+ data={
417
+ 'text': 'Test',
418
+ 'custom_model': 'malicious/backdoor-model',
419
+ 'model_type': 'custom'
420
+ },
421
+ headers={'X-Requested-With': 'XMLHttpRequest'}
422
+ )
423
+
424
+ assert response.status_code == 400
425
+ data = json.loads(response.data)
426
+ assert 'error' in data
427
+ assert 'Untrusted model path' in data['error']
tests/test_stats_service.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for StatsService
3
+ """
4
+ import pytest
5
+ from unittest.mock import Mock
6
+ from app.services.stats_service import StatsService
7
+
8
+
9
+ class TestStatsService:
10
+ """Test cases for StatsService."""
11
+
12
+ def setup_method(self):
13
+ """Set up test fixtures."""
14
+ self.service = StatsService()
15
+
16
+ def test_get_varied_color_basic(self):
17
+ """Test basic color generation."""
18
+ color = self.service.get_varied_color(0, 10)
19
+
20
+ assert isinstance(color, dict)
21
+ assert 'background' in color
22
+ assert 'text' in color
23
+ assert color['background'].startswith('#')
24
+ assert color['text'].startswith('#')
25
+ assert len(color['background']) == 7 # #RRGGBB format
26
+ assert len(color['text']) == 7
27
+
28
+ def test_get_varied_color_different_indices(self):
29
+ """Test that different indices produce different colors."""
30
+ color1 = self.service.get_varied_color(0, 10)
31
+ color2 = self.service.get_varied_color(1, 10)
32
+ color3 = self.service.get_varied_color(5, 10)
33
+
34
+ # Colors should be different
35
+ assert color1['background'] != color2['background']
36
+ assert color2['background'] != color3['background']
37
+ assert color1['background'] != color3['background']
38
+
39
+ def test_get_varied_color_edge_cases(self):
40
+ """Test color generation with edge cases."""
41
+ # Single token
42
+ color = self.service.get_varied_color(0, 1)
43
+ assert isinstance(color, dict)
44
+
45
+ # Large number of tokens
46
+ color = self.service.get_varied_color(999, 1000)
47
+ assert isinstance(color, dict)
48
+
49
+ # Zero index
50
+ color = self.service.get_varied_color(0, 5)
51
+ assert isinstance(color, dict)
52
+
53
+ def test_fix_token_basic(self):
54
+ """Test basic token fixing."""
55
+ assert self.service.fix_token("hello") == "hello"
56
+ assert self.service.fix_token("world") == "world"
57
+
58
+ def test_fix_token_special_characters(self):
59
+ """Test token fixing with special characters."""
60
+ # Test space replacement
61
+ assert self.service.fix_token(" ") == "␣"
62
+ assert self.service.fix_token("\t") == "→"
63
+ assert self.service.fix_token("\n") == "↵"
64
+
65
+ # Test Ġ prefix (common in tokenizers)
66
+ assert self.service.fix_token("Ġhello") == " hello"
67
+ assert self.service.fix_token("Ġworld") == " world"
68
+
69
+ # Test combination
70
+ assert self.service.fix_token("Ġ") == " "
71
+
72
+ def test_fix_token_edge_cases(self):
73
+ """Test token fixing edge cases."""
74
+ # Empty string
75
+ assert self.service.fix_token("") == ""
76
+
77
+ # None (shouldn't happen but test defensive programming)
78
+ result = self.service.fix_token(None)
79
+ assert result is None or result == ""
80
+
81
+ # Multiple special characters
82
+ assert self.service.fix_token("\n\t ") == "↵→␣"
83
+
84
+ # Multiple Ġ prefixes (edge case)
85
+ assert self.service.fix_token("ĠĠhello") == " hello"
86
+
87
+ def test_get_token_stats_basic(self, sample_tokens, sample_text):
88
+ """Test basic token statistics calculation."""
89
+ stats = self.service.get_token_stats(sample_tokens, sample_text)
90
+
91
+ assert isinstance(stats, dict)
92
+ assert 'basic_stats' in stats
93
+ assert 'length_stats' in stats
94
+
95
+ basic = stats['basic_stats']
96
+ length = stats['length_stats']
97
+
98
+ # Check basic stats structure
99
+ assert 'total_tokens' in basic
100
+ assert 'unique_tokens' in basic
101
+ assert 'unique_percentage' in basic
102
+ assert 'special_tokens' in basic
103
+ assert 'space_tokens' in basic
104
+ assert 'newline_tokens' in basic
105
+ assert 'compression_ratio' in basic
106
+
107
+ # Check length stats structure
108
+ assert 'avg_length' in length
109
+ assert 'median_length' in length
110
+ assert 'std_dev' in length
111
+
112
+ def test_get_token_stats_calculations(self):
113
+ """Test specific statistics calculations."""
114
+ tokens = ['Hello', ' world', '!', ' test']
115
+ text = "Hello world! test"
116
+
117
+ stats = self.service.get_token_stats(tokens, text)
118
+ basic = stats['basic_stats']
119
+
120
+ # Test total tokens
121
+ assert basic['total_tokens'] == 4
122
+
123
+ # Test unique tokens (all are unique in this case)
124
+ assert basic['unique_tokens'] == 4
125
+ assert basic['unique_percentage'] == "100.0"
126
+
127
+ # Test compression ratio
128
+ expected_ratio = len(text) / len(tokens)
129
+ assert float(basic['compression_ratio']) == pytest.approx(expected_ratio, rel=1e-2)
130
+
131
+ def test_get_token_stats_special_tokens(self):
132
+ """Test special token counting."""
133
+ tokens = ['<s>', 'Hello', ' world', '</s>', '<pad>']
134
+ text = "Hello world"
135
+
136
+ stats = self.service.get_token_stats(tokens, text)
137
+ basic = stats['basic_stats']
138
+
139
+ # Should detect special tokens (those with < >)
140
+ assert basic['special_tokens'] >= 2 # <s>, </s>, <pad>
141
+
142
+ def test_get_token_stats_whitespace_tokens(self):
143
+ """Test whitespace token counting."""
144
+ tokens = ['Hello', ' ', 'world', '\n', 'test', '\t']
145
+ text = "Hello world\ntest\t"
146
+
147
+ stats = self.service.get_token_stats(tokens, text)
148
+ basic = stats['basic_stats']
149
+
150
+ # Should count space and tab tokens
151
+ assert basic['space_tokens'] >= 1
152
+ assert basic['newline_tokens'] >= 1
153
+
154
+ def test_get_token_stats_length_calculations(self):
155
+ """Test token length statistics."""
156
+ tokens = ['a', 'bb', 'ccc', 'dddd'] # Lengths: 1, 2, 3, 4
157
+ text = "a bb ccc dddd"
158
+
159
+ stats = self.service.get_token_stats(tokens, text)
160
+ length = stats['length_stats']
161
+
162
+ # Average length should be 2.5
163
+ assert float(length['avg_length']) == pytest.approx(2.5, rel=1e-2)
164
+
165
+ # Median should be 2.5 (between 2 and 3)
166
+ assert float(length['median_length']) == pytest.approx(2.5, rel=1e-2)
167
+
168
+ def test_get_token_stats_empty_input(self):
169
+ """Test statistics with empty input."""
170
+ stats = self.service.get_token_stats([], "")
171
+
172
+ basic = stats['basic_stats']
173
+ length = stats['length_stats']
174
+
175
+ assert basic['total_tokens'] == 0
176
+ assert basic['unique_tokens'] == 0
177
+ assert basic['unique_percentage'] == "0.0"
178
+ assert basic['compression_ratio'] == "0.0"
179
+
180
+ # Length stats should handle empty case gracefully
181
+ assert length['avg_length'] == "0.0"
182
+ assert length['median_length'] == "0.0"
183
+ assert length['std_dev'] == "0.0"
184
+
185
+ def test_format_tokens_for_display_basic(self, mock_tokenizer):
186
+ """Test basic token formatting for display."""
187
+ tokens = ['Hello', ' world', '!']
188
+
189
+ # Mock the tokenizer.convert_ids_to_tokens method
190
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
191
+
192
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
193
+
194
+ assert isinstance(formatted, list)
195
+ assert len(formatted) == len(tokens)
196
+
197
+ for i, token_data in enumerate(formatted):
198
+ assert isinstance(token_data, dict)
199
+ assert 'display' in token_data
200
+ assert 'original' in token_data
201
+ assert 'token_id' in token_data
202
+ assert 'colors' in token_data
203
+ assert 'newline' in token_data
204
+
205
+ assert token_data['original'] == tokens[i]
206
+ assert isinstance(token_data['colors'], dict)
207
+ assert 'background' in token_data['colors']
208
+ assert 'text' in token_data['colors']
209
+
210
+ def test_format_tokens_newline_detection(self, mock_tokenizer):
211
+ """Test newline detection in token formatting."""
212
+ tokens = ['Hello', '\n', 'world']
213
+
214
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
215
+
216
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
217
+
218
+ # Second token should be marked as newline
219
+ assert formatted[1]['newline'] is True
220
+ assert formatted[0]['newline'] is False
221
+ assert formatted[2]['newline'] is False
222
+
223
+ def test_format_tokens_color_consistency(self, mock_tokenizer):
224
+ """Test that same tokens get same colors."""
225
+ tokens = ['hello', 'world', 'hello'] # 'hello' appears twice
226
+
227
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
228
+
229
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
230
+
231
+ # Same tokens should have same colors
232
+ assert formatted[0]['colors']['background'] == formatted[2]['colors']['background']
233
+ assert formatted[0]['colors']['text'] == formatted[2]['colors']['text']
234
+
235
+ # Different tokens should have different colors
236
+ assert formatted[0]['colors']['background'] != formatted[1]['colors']['background']
237
+
238
+ def test_format_tokens_special_character_handling(self, mock_tokenizer):
239
+ """Test special character handling in token formatting."""
240
+ tokens = [' ', '\t', '\n', 'Ġhello']
241
+
242
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
243
+
244
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
245
+
246
+ # Check that special characters are properly converted
247
+ assert formatted[0]['display'] == '␣' # Space
248
+ assert formatted[1]['display'] == '→' # Tab
249
+ assert formatted[2]['display'] == '↵' # Newline
250
+ assert formatted[3]['display'] == ' hello' # Ġ prefix
tests/test_tokenizer_service.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for TokenizerService
3
+ """
4
+ import pytest
5
+ from unittest.mock import Mock, patch, MagicMock
6
+ from app.services.tokenizer_service import TokenizerService
7
+ import time
8
+
9
+
10
+ class TestTokenizerService:
11
+ """Test cases for TokenizerService."""
12
+
13
+ def setup_method(self):
14
+ """Set up test fixtures."""
15
+ self.service = TokenizerService()
16
+
17
+ def test_is_predefined_model(self):
18
+ """Test predefined model checking."""
19
+ # Test with existing model
20
+ assert self.service.is_predefined_model('gpt2') is True
21
+
22
+ # Test with non-existing model
23
+ assert self.service.is_predefined_model('nonexistent-model') is False
24
+
25
+ # Test with empty string
26
+ assert self.service.is_predefined_model('') is False
27
+
28
+ def test_get_tokenizer_info_basic(self, mock_tokenizer):
29
+ """Test basic tokenizer info extraction."""
30
+ info = self.service.get_tokenizer_info(mock_tokenizer)
31
+
32
+ assert 'vocab_size' in info
33
+ assert 'tokenizer_type' in info
34
+ assert 'special_tokens' in info
35
+ assert info['vocab_size'] == 50257
36
+ assert info['tokenizer_type'] == 'MockTokenizer'
37
+
38
+ # Check special tokens
39
+ special_tokens = info['special_tokens']
40
+ assert 'pad_token' in special_tokens
41
+ assert 'eos_token' in special_tokens
42
+ assert special_tokens['pad_token'] == '<pad>'
43
+ assert special_tokens['eos_token'] == '</s>'
44
+
45
+ def test_get_tokenizer_info_with_max_length(self, mock_tokenizer):
46
+ """Test tokenizer info with model_max_length."""
47
+ mock_tokenizer.model_max_length = 2048
48
+
49
+ info = self.service.get_tokenizer_info(mock_tokenizer)
50
+
51
+ assert 'model_max_length' in info
52
+ assert info['model_max_length'] == 2048
53
+
54
+ def test_get_tokenizer_info_error_handling(self):
55
+ """Test error handling in tokenizer info extraction."""
56
+ # Create a mock that raises an exception
57
+ broken_tokenizer = Mock()
58
+ broken_tokenizer.__class__.__name__ = 'BrokenTokenizer'
59
+ broken_tokenizer.vocab_size = property(Mock(side_effect=Exception("Test error")))
60
+
61
+ info = self.service.get_tokenizer_info(broken_tokenizer)
62
+
63
+ assert 'error' in info
64
+ assert 'Test error' in info['error']
65
+
66
+ @patch('app.services.tokenizer_service.AutoTokenizer')
67
+ def test_load_predefined_tokenizer_success(self, mock_auto_tokenizer, mock_tokenizer):
68
+ """Test successful loading of predefined tokenizer."""
69
+ mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
70
+
71
+ tokenizer, info, error = self.service.load_tokenizer('gpt2')
72
+
73
+ assert tokenizer is not None
74
+ assert error is None
75
+ assert isinstance(info, dict)
76
+ mock_auto_tokenizer.from_pretrained.assert_called_once()
77
+
78
+ @patch('app.services.tokenizer_service.AutoTokenizer')
79
+ def test_load_tokenizer_failure(self, mock_auto_tokenizer):
80
+ """Test tokenizer loading failure."""
81
+ mock_auto_tokenizer.from_pretrained.side_effect = Exception("Failed to load")
82
+
83
+ tokenizer, info, error = self.service.load_tokenizer('gpt2')
84
+
85
+ assert tokenizer is None
86
+ assert error is not None
87
+ assert "Failed to load" in error
88
+
89
+ def test_load_nonexistent_predefined_model(self):
90
+ """Test loading non-existent predefined model."""
91
+ tokenizer, info, error = self.service.load_tokenizer('nonexistent-model')
92
+
93
+ assert tokenizer is None
94
+ assert error is not None
95
+ assert "not found" in error.lower()
96
+
97
+ @patch('app.services.tokenizer_service.AutoTokenizer')
98
+ @patch('time.time')
99
+ def test_custom_tokenizer_caching(self, mock_time, mock_auto_tokenizer, mock_tokenizer, app):
100
+ """Test custom tokenizer caching behavior."""
101
+ with app.app_context():
102
+ mock_time.return_value = 1000.0
103
+ mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
104
+
105
+ # First load
106
+ tokenizer1, info1, error1 = self.service.load_tokenizer('custom/model')
107
+
108
+ # Second load (should use cache)
109
+ mock_time.return_value = 1500.0 # Still within cache time
110
+ tokenizer2, info2, error2 = self.service.load_tokenizer('custom/model')
111
+
112
+ # Should only call from_pretrained once
113
+ assert mock_auto_tokenizer.from_pretrained.call_count == 1
114
+ assert tokenizer1 is tokenizer2
115
+
116
+ @patch('app.services.tokenizer_service.AutoTokenizer')
117
+ @patch('time.time')
118
+ def test_custom_tokenizer_cache_expiration(self, mock_time, mock_auto_tokenizer, mock_tokenizer, app):
119
+ """Test custom tokenizer cache expiration."""
120
+ with app.app_context():
121
+ mock_time.return_value = 1000.0
122
+ mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
123
+
124
+ # First load
125
+ self.service.load_tokenizer('custom/model')
126
+
127
+ # Second load after cache expiration
128
+ mock_time.return_value = 5000.0 # Beyond cache expiration
129
+ self.service.load_tokenizer('custom/model')
130
+
131
+ # Should call from_pretrained twice
132
+ assert mock_auto_tokenizer.from_pretrained.call_count == 2
133
+
134
+ def test_tokenizer_models_constant(self):
135
+ """Test that TOKENIZER_MODELS contains expected models."""
136
+ models = self.service.TOKENIZER_MODELS
137
+
138
+ assert isinstance(models, dict)
139
+ assert len(models) > 0
140
+
141
+ # Check that each model has required fields
142
+ for model_id, model_info in models.items():
143
+ assert isinstance(model_id, str)
144
+ assert isinstance(model_info, dict)
145
+ assert 'name' in model_info
146
+ assert 'alias' in model_info
147
+ assert isinstance(model_info['name'], str)
148
+ assert isinstance(model_info['alias'], str)
149
+
150
+ def test_cache_initialization(self):
151
+ """Test that caches are properly initialized."""
152
+ service = TokenizerService()
153
+
154
+ assert hasattr(service, 'tokenizers')
155
+ assert hasattr(service, 'custom_tokenizers')
156
+ assert hasattr(service, 'tokenizer_info_cache')
157
+
158
+ assert isinstance(service.tokenizers, dict)
159
+ assert isinstance(service.custom_tokenizers, dict)
160
+ assert isinstance(service.tokenizer_info_cache, dict)
161
+
162
+ def test_special_tokens_filtering(self, mock_tokenizer):
163
+ """Test that only valid special tokens are included."""
164
+ # Add some None and empty special tokens
165
+ mock_tokenizer.pad_token = '<pad>'
166
+ mock_tokenizer.eos_token = '</s>'
167
+ mock_tokenizer.bos_token = None
168
+ mock_tokenizer.sep_token = ''
169
+ mock_tokenizer.cls_token = ' ' # Whitespace only
170
+ mock_tokenizer.unk_token = '<unk>'
171
+ mock_tokenizer.mask_token = '<mask>'
172
+
173
+ info = self.service.get_tokenizer_info(mock_tokenizer)
174
+ special_tokens = info['special_tokens']
175
+
176
+ # Should only include non-None, non-empty tokens
177
+ assert 'pad_token' in special_tokens
178
+ assert 'eos_token' in special_tokens
179
+ assert 'unk_token' in special_tokens
180
+ assert 'mask_token' in special_tokens
181
+
182
+ # Should not include None or empty tokens
183
+ assert 'bos_token' not in special_tokens
184
+ assert 'sep_token' not in special_tokens
185
+ assert 'cls_token' not in special_tokens
tests/test_validators.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for Validators utility
3
+ """
4
+ import pytest
5
+ from app.utils.validators import Validators, ValidationError
6
+
7
+
8
+ class TestValidators:
9
+ """Test cases for Validators utility."""
10
+
11
+ def setup_method(self):
12
+ """Set up test fixtures."""
13
+ self.validators = Validators()
14
+
15
+ def test_validate_filename_valid(self):
16
+ """Test filename validation with valid filenames."""
17
+ # Valid filenames should not raise
18
+ self.validators.validate_filename('test.txt')
19
+ self.validators.validate_filename('document.md')
20
+ self.validators.validate_filename('script_file.py')
21
+ self.validators.validate_filename('My Document.txt')
22
+ self.validators.validate_filename('file-name.json')
23
+ self.validators.validate_filename('data123.csv')
24
+
25
+ def test_validate_filename_invalid(self):
26
+ """Test filename validation with invalid filenames."""
27
+ # Empty or None filename
28
+ with pytest.raises(ValidationError):
29
+ self.validators.validate_filename('')
30
+
31
+ with pytest.raises(ValidationError):
32
+ self.validators.validate_filename(None)
33
+
34
+ # Dangerous characters
35
+ with pytest.raises(ValidationError):
36
+ self.validators.validate_filename('../../../etc/passwd')
37
+
38
+ with pytest.raises(ValidationError):
39
+ self.validators.validate_filename('file\\with\\backslashes.txt')
40
+
41
+ # Null bytes
42
+ with pytest.raises(ValidationError):
43
+ self.validators.validate_filename('file\x00.txt')
44
+
45
+ # Control characters
46
+ with pytest.raises(ValidationError):
47
+ self.validators.validate_filename('file\x01\x02.txt')
48
+
49
+ # Reserved names on Windows
50
+ with pytest.raises(ValidationError):
51
+ self.validators.validate_filename('CON.txt')
52
+
53
+ with pytest.raises(ValidationError):
54
+ self.validators.validate_filename('PRN.txt')
55
+
56
+ with pytest.raises(ValidationError):
57
+ self.validators.validate_filename('AUX.txt')
58
+
59
+ def test_validate_file_extension_valid(self):
60
+ """Test file extension validation with valid extensions."""
61
+ allowed_extensions = {'.txt', '.md', '.py', '.js', '.json'}
62
+
63
+ # Valid extensions should not raise
64
+ self.validators.validate_file_extension('test.txt', allowed_extensions)
65
+ self.validators.validate_file_extension('document.md', allowed_extensions)
66
+ self.validators.validate_file_extension('script.py', allowed_extensions)
67
+ self.validators.validate_file_extension('data.json', allowed_extensions)
68
+
69
+ # Case insensitive
70
+ self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
71
+ self.validators.validate_file_extension('Document.MD', allowed_extensions)
72
+
73
+ def test_validate_file_extension_invalid(self):
74
+ """Test file extension validation with invalid extensions."""
75
+ allowed_extensions = {'.txt', '.md', '.py'}
76
+
77
+ # Invalid extensions should raise
78
+ with pytest.raises(ValidationError):
79
+ self.validators.validate_file_extension('virus.exe', allowed_extensions)
80
+
81
+ with pytest.raises(ValidationError):
82
+ self.validators.validate_file_extension('archive.zip', allowed_extensions)
83
+
84
+ with pytest.raises(ValidationError):
85
+ self.validators.validate_file_extension('image.jpg', allowed_extensions)
86
+
87
+ # No extension
88
+ with pytest.raises(ValidationError):
89
+ self.validators.validate_file_extension('filename', allowed_extensions)
90
+
91
+ # Empty filename
92
+ with pytest.raises(ValidationError):
93
+ self.validators.validate_file_extension('', allowed_extensions)
94
+
95
+ def test_validate_model_path_valid(self):
96
+ """Test model path validation with valid paths."""
97
+ # Valid HuggingFace model paths
98
+ valid_paths = [
99
+ 'microsoft/DialoGPT-medium',
100
+ 'google/bert-base-uncased',
101
+ 'meta-llama/Llama-2-7b-hf',
102
+ 'mistralai/Mistral-7B-Instruct-v0.1',
103
+ 'Qwen/Qwen2.5-72B-Instruct',
104
+ 'THUDM/chatglm-6b',
105
+ 'deepseek-ai/deepseek-coder-6.7b-base',
106
+ 'unsloth/llama-2-7b-bnb-4bit',
107
+ 'google-bert/bert-base-uncased',
108
+ 'bartar/SPLM-2' # User's specific case
109
+ ]
110
+
111
+ for path in valid_paths:
112
+ self.validators.validate_model_path(path) # Should not raise
113
+
114
+ def test_validate_model_path_invalid_format(self):
115
+ """Test model path validation with invalid formats."""
116
+ # Invalid formats should raise
117
+ invalid_paths = [
118
+ '', # Empty
119
+ 'invalid-path', # No slash
120
+ 'user/', # Empty model name
121
+ '/model-name', # Empty user
122
+ 'user//model', # Double slash
123
+ 'user/model/extra', # Too many parts
124
+ 'user name/model', # Space in user
125
+ 'user/model name', # Space in model (actually this might be valid)
126
+ 'user@domain/model', # Invalid characters
127
+ '../malicious/path', # Path traversal
128
+ 'user\\model', # Backslash
129
+ ]
130
+
131
+ for path in invalid_paths:
132
+ with pytest.raises(ValidationError):
133
+ self.validators.validate_model_path(path)
134
+
135
+ def test_validate_model_path_untrusted_prefix(self):
136
+ """Test model path validation with untrusted prefixes."""
137
+ # Paths with untrusted prefixes should raise
138
+ untrusted_paths = [
139
+ 'random-user/some-model',
140
+ 'untrusted/malicious-model',
141
+ 'hacker/backdoor-model',
142
+ 'suspicious/model'
143
+ ]
144
+
145
+ for path in untrusted_paths:
146
+ with pytest.raises(ValidationError):
147
+ self.validators.validate_model_path(path)
148
+
149
+ def test_validate_model_path_edge_cases(self):
150
+ """Test model path validation edge cases."""
151
+ # None input
152
+ with pytest.raises(ValidationError):
153
+ self.validators.validate_model_path(None)
154
+
155
+ # Very long path
156
+ long_path = 'microsoft/' + 'a' * 1000
157
+ with pytest.raises(ValidationError):
158
+ self.validators.validate_model_path(long_path)
159
+
160
+ # Special characters in allowed prefix
161
+ self.validators.validate_model_path('microsoft/model-with-dashes')
162
+ self.validators.validate_model_path('microsoft/model_with_underscores')
163
+ self.validators.validate_model_path('microsoft/model.with.dots')
164
+
165
+ def test_validate_text_input_valid(self):
166
+ """Test text input validation with valid inputs."""
167
+ # Valid text inputs should not raise
168
+ self.validators.validate_text_input('Hello world!')
169
+ self.validators.validate_text_input('A' * 1000) # Long but reasonable text
170
+ self.validators.validate_text_input('Text with\nnewlines\nand\ttabs')
171
+ self.validators.validate_text_input('Unicode: 你好世界 🌍')
172
+ self.validators.validate_text_input('') # Empty text might be valid
173
+
174
+ def test_validate_text_input_invalid(self):
175
+ """Test text input validation with invalid inputs."""
176
+ # None input
177
+ with pytest.raises(ValidationError):
178
+ self.validators.validate_text_input(None)
179
+
180
+ # Extremely long text (if there's a limit)
181
+ very_long_text = 'A' * (10 * 1024 * 1024) # 10MB of text
182
+ with pytest.raises(ValidationError):
183
+ self.validators.validate_text_input(very_long_text)
184
+
185
+ def test_validate_text_input_malicious_content(self):
186
+ """Test text input validation with potentially malicious content."""
187
+ # Null bytes
188
+ with pytest.raises(ValidationError):
189
+ self.validators.validate_text_input('text\x00with\x00nulls')
190
+
191
+ # Control characters (some might be allowed like \n, \t)
192
+ try:
193
+ self.validators.validate_text_input('text\x01with\x02controls')
194
+ except ValidationError:
195
+ pass # This might be expected
196
+
197
+ def test_validation_error_messages(self):
198
+ """Test that ValidationError contains meaningful messages."""
199
+ # Test filename validation error message
200
+ try:
201
+ self.validators.validate_filename('../../../etc/passwd')
202
+ assert False, "Should have raised ValidationError"
203
+ except ValidationError as e:
204
+ assert 'filename' in str(e).lower() or 'path' in str(e).lower()
205
+
206
+ # Test file extension error message
207
+ try:
208
+ self.validators.validate_file_extension('virus.exe', {'.txt'})
209
+ assert False, "Should have raised ValidationError"
210
+ except ValidationError as e:
211
+ assert 'extension' in str(e).lower() or 'allowed' in str(e).lower()
212
+
213
+ # Test model path error message
214
+ try:
215
+ self.validators.validate_model_path('invalid-path')
216
+ assert False, "Should have raised ValidationError"
217
+ except ValidationError as e:
218
+ assert 'model' in str(e).lower() or 'path' in str(e).lower()
219
+
220
+ def test_allowed_model_prefixes_coverage(self):
221
+ """Test that all common model prefixes are covered."""
222
+ # This test ensures we have good coverage of trusted model prefixes
223
+ common_prefixes = [
224
+ 'microsoft/',
225
+ 'google/',
226
+ 'meta-llama/',
227
+ 'mistralai/',
228
+ 'openai-community/',
229
+ 'Qwen/',
230
+ 'THUDM/',
231
+ 'deepseek-ai/',
232
+ 'unsloth/',
233
+ 'google-bert/'
234
+ ]
235
+
236
+ for prefix in common_prefixes:
237
+ # Should be able to validate models with these prefixes
238
+ test_path = prefix + 'test-model'
239
+ try:
240
+ self.validators.validate_model_path(test_path)
241
+ except ValidationError:
242
+ pytest.fail(f"Trusted prefix {prefix} should be allowed")
243
+
244
+ def test_case_sensitivity(self):
245
+ """Test case sensitivity in various validations."""
246
+ # File extensions should be case insensitive
247
+ allowed_extensions = {'.txt', '.md'}
248
+ self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
249
+ self.validators.validate_file_extension('Document.MD', allowed_extensions)
250
+
251
+ # Model path prefixes should be case sensitive (HuggingFace convention)
252
+ self.validators.validate_model_path('Microsoft/model') # Capital M
253
+
254
+ # But random capitalization in untrusted prefixes should still fail
255
+ with pytest.raises(ValidationError):
256
+ self.validators.validate_model_path('RANDOM/model')