import gradio as gr
from transformers import AutoTokenizer, T5Tokenizer

# Fixed list of custom tokenizers (left)
TOKENIZER_CUSTOM = {
    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
    "Google mT5": "google/mt5-base",
    "DeBERTa": "alakxender/deberta-dhivehi-tokenizer-extended"
}

# Suggested stock model paths for the right input
SUGGESTED_STOCK_PATHS = [
    "google/flan-t5-base",
    "t5-small",
    "t5-base",
    "t5-large",
    "google/mt5-base",
    "microsoft/trocr-base-handwritten",
    "microsoft/trocr-base-printed",
    "microsoft/deberta-v3-base"
]

# Load tokenizer with fallback to the slow T5 tokenizer
def load_tokenizer(tokenizer_path):
    try:
        return AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
    except Exception:
        # Fast-tokenizer loading can fail for SentencePiece-based T5/mT5 checkpoints;
        # fall back to the slow implementation for those, otherwise re-raise.
        if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
            return T5Tokenizer.from_pretrained(tokenizer_path)
        raise

# Tokenize and decode with error handling
def tokenize_display(text, tokenizer_path):
    try:
        tokenizer = load_tokenizer(tokenizer_path)
        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
        ids = encoding.input_ids
        decoded = tokenizer.decode(ids, skip_special_tokens=False)
        return tokens, ids, decoded
    except Exception as e:
        return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"

# Comparison logic
def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
    def format_block(title, tokenizer_path):
        dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
        en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
        return f"""\
### 🔤 {title}

#### 🈁 Dhivehi Text
`{dv_text}`

**Tokenized:**
{' '.join(dv_tokens)}

**Number of tokens:** {len(dv_tokens) if dv_ids else 'N/A'}

**IDs:** {dv_ids or '[ERROR]'}

**Decoded:** `{dv_decoded}`

---

#### 🇬🇧 English Text
`{en_text}`

**Tokenized:**
{' '.join(en_tokens)}

**Number of tokens:** {len(en_tokens) if en_ids else 'N/A'}

**IDs:** {en_ids or '[ERROR]'}

**Decoded:** `{en_decoded}`
"""

    try:
        custom_path = TOKENIZER_CUSTOM[custom_label]
    except KeyError:
        return "[ERROR] Invalid custom tokenizer selected", ""

    return (
        format_block("Custom Tokenizer", custom_path),
        format_block("Stock Tokenizer", stock_path)
    )

# Gradio UI
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool") as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=1,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=1,
            value="The quick brown fox jumps over the lazy dog"
        )

    with gr.Row():
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended"
        )
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True
        )

    compare_button = gr.Button("Compare Tokenizers")

    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output")
        output_stock = gr.Markdown(label="Stock Tokenizer Output")

    compare_button.click(
        compare_side_by_side,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock]
    )

demo.launch()
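
For a quick sanity check of the same comparison outside the Space, the tokenizer paths used above can be loaded directly with AutoTokenizer. This is a minimal sketch, not part of the app; run it as a separate script, and it assumes transformers and sentencepiece are installed and the Hugging Face Hub is reachable:

# Standalone sketch: compare token counts for one custom and one stock tokenizer
# on the same Dhivehi sentence used as the default input above.
from transformers import AutoTokenizer

sample = "އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ"
for path in ("alakxender/dhivehi-T5-tokenizer-extended", "google/flan-t5-base"):
    tok = AutoTokenizer.from_pretrained(path)
    ids = tok(sample, add_special_tokens=True).input_ids
    print(f"{path}: {len(ids)} tokens")
    print(tok.convert_ids_to_tokens(ids))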