import gradio as gr
from transformers import AutoTokenizer, T5Tokenizer
from concurrent.futures import ThreadPoolExecutor

# Fixed list of custom tokenizers (left)
TOKENIZER_CUSTOM = {
    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
    "Google mT5": "google/mt5-base",
    "Google mT5 Extended": "alakxender/mt5-dhivehi-tokenizer-extended",
    "DeBERTa Extended": "alakxender/deberta-dhivehi-tokenizer-extended",
    "XLM-RoBERTa Extended": "alakxender/xlmr-dhivehi-tokenizer-extended",
    "Bert Extended": "alakxender/bert-dhivehi-tokenizer-extended",
    "Bert Extended Fast": "alakxender/bert-fast-dhivehi-tokenizer-extended"
}

# Suggested stock model paths for the right input
SUGGESTED_STOCK_PATHS = [
    "google/flan-t5-base",
    "t5-small",
    "t5-base",
    "t5-large",
    "google/mt5-base",
    "microsoft/trocr-base-handwritten",
    "microsoft/trocr-base-printed",
    "microsoft/deberta-v3-base",  # comma was missing here, silently concatenating this entry with the next
    "xlm-roberta-base",
    "naver-clova-ix/donut-base",
    "bert-base-multilingual-cased"
]

# Cache for loaded tokenizers to avoid reloading
tokenizer_cache = {}


# Load tokenizer with fallback to slow T5
def load_tokenizer(tokenizer_path):
    if tokenizer_path in tokenizer_cache:
        return tokenizer_cache[tokenizer_path]
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        tokenizer_cache[tokenizer_path] = tokenizer
        return tokenizer
    except Exception:
        # Some T5/mT5 checkpoints only ship a slow (sentencepiece) tokenizer
        if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
            tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
            tokenizer_cache[tokenizer_path] = tokenizer
            return tokenizer
        raise


# Tokenize and decode with enhanced visualization
def tokenize_display(text, tokenizer_path):
    try:
        tokenizer = load_tokenizer(tokenizer_path)
        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
        ids = encoding.input_ids
        decoded = tokenizer.decode(ids, skip_special_tokens=False)
        return tokens, ids, decoded
    except Exception as e:
        return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
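# Illustrative usage of tokenize_display outside the UI (a sketch, not part of
# the app's flow; "t5-small" is just an example checkpoint). The first call
# downloads the tokenizer from the Hugging Face Hub, so it needs network access:
#
#   tokens, ids, decoded = tokenize_display("hello world", "t5-small")
#   print(len(tokens), tokens)  # sentencepiece pieces plus the trailing '</s>'
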
def create_token_visualization(tokens, ids):
    """Create a visual representation of tokens with colors and spacing"""
    if not tokens or not ids:
        return "❌ No tokens to display"

    # Create colored token blocks
    token_blocks = []
    colors = ["🟦", "🟩", "🟨", "🟪", "🟧", "🟫"]

    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        color = colors[i % len(colors)]
        # Clean token display: show sentencepiece word boundaries and
        # special tokens in a readable form
        clean_token = token.replace('▁', '_').replace('</s>', '[END]').replace('<s>', '[START]')
        token_blocks.append(f"{color} `{clean_token}` ({token_id})")

    return " ".join(token_blocks)


# Side-by-side comparison with progress updates
def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
    def format_block(title, tokenizer_path):
        dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
        en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)

        return f"""\
## 🔤 {title}

### 🈁 Dhivehi: `{dv_text}`

**🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens

{create_token_visualization(dv_tokens, dv_ids)}

**🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`

**🔄 Decoded:** `{dv_decoded}`

---

### 🇬🇧 English: `{en_text}`

**🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens

{create_token_visualization(en_tokens, en_ids)}

**🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`

**🔄 Decoded:** `{en_decoded}`

---
"""

    try:
        custom_path = TOKENIZER_CUSTOM[custom_label]
    except KeyError:
        return "[ERROR] Invalid custom tokenizer selected", ""

    # Show loading progress
    progress(0.1, desc="Loading custom tokenizer...")

    # Load custom tokenizer
    try:
        custom_result = format_block("Custom Tokenizer", custom_path)
        progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
    except Exception as e:
        custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
        progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")

    # Load stock tokenizer
    try:
        stock_result = format_block("Stock Tokenizer", stock_path)
        progress(1.0, desc="Complete!")
    except Exception as e:
        stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
        progress(1.0, desc="Complete with errors!")

    return custom_result, stock_result


# Non-blocking, generator-based comparison (defined but not wired to the UI below)
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
    # Yield an immediate loading message
    loading_msg = """\
## ⏳ Loading Tokenizer...

🚀 **Status:** Downloading and initializing tokenizer...

*This may take a moment for first-time downloads*
"""

    # Use ThreadPoolExecutor for non-blocking execution
    with ThreadPoolExecutor(max_workers=2) as executor:
        future = executor.submit(compare_side_by_side_with_progress,
                                 dv_text, en_text, custom_label, stock_path)

        # Yield the loading state first
        yield loading_msg, loading_msg

        # Then yield the actual results
        try:
            custom_result, stock_result = future.result(timeout=120)  # 2-minute timeout
            yield custom_result, stock_result
        except Exception as e:
            error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
            yield error_msg, error_msg


# Gradio UI with better UX
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=2,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True,
            placeholder="Enter Dhivehi text here..."
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=2,
            value="The quick brown fox jumps over the lazy dog",
            placeholder="Enter English text here..."
        )

    with gr.Row():
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended",
            info="Pre-trained Dhivehi tokenizers"
        )
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,
            info="Standard HuggingFace tokenizers (or paste a path)"
        )

    compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")

    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
        output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)

    # Run the progress-reporting comparison on click
    compare_button.click(
        compare_side_by_side_with_progress,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
        show_progress=True
    )

if __name__ == "__main__":
    demo.launch()
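
# Note (an assumption, Gradio-version dependent): the generator-based
# compare_tokenizers_async handler above streams a loading message before the
# results; wiring it to the button on Gradio 3.x would require enabling the
# queue before launching, e.g.:
#
#   compare_button.click(compare_tokenizers_async, inputs=[...], outputs=[...])
#   demo.queue()
#   demo.launch()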