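# NOTE(review): the next lines appear to be web file-viewer residue captured
# together with the source; they are not part of the program.
# alakxender's picture
# d
# fee5e46
# raw
# history blame
# 4.06 kB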
import gradio as gr
from transformers import AutoTokenizer, T5Tokenizer
# Display label -> tokenizer path for the entries offered in the
# "Select Custom Tokenizer" dropdown. The labels are what the user sees;
# the values are what gets handed to the tokenizer loader.
TOKENIZER_CUSTOM = {
    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
    "Google mT5": "google/mt5-base",
    "DeBERTa": "alakxender/deberta-dhivehi-tokenizer-extended",
}

# Pre-filled suggestions for the stock-tokenizer dropdown. That dropdown also
# accepts free-form values, so this list is a convenience, not a whitelist.
SUGGESTED_STOCK_PATHS = [
    "google/flan-t5-base",
    "t5-small",
    "t5-base",
    "t5-large",
    "google/mt5-base",
    "microsoft/trocr-base-handwritten",
    "microsoft/trocr-base-printed",
    "microsoft/deberta-v3-base",
]
# Load tokenizer with fallback to slow T5
def load_tokenizer(tokenizer_path):
    """Load the tokenizer identified by *tokenizer_path*.

    ``AutoTokenizer`` is tried first with ``use_fast=True``. If that raises
    and the lower-cased path contains ``"t5"`` (which also matches ``"mt5"``),
    ``T5Tokenizer`` is tried as a fallback; otherwise the original exception
    propagates.

    Args:
        tokenizer_path: Model id or path handed to ``from_pretrained``.
            Surrounding whitespace is ignored.

    Returns:
        The tokenizer object returned by ``from_pretrained``.

    Raises:
        ValueError: If ``tokenizer_path`` is not a non-empty string.
        Exception: Whatever ``from_pretrained`` raised when no fallback
            applies, or when the fallback itself fails.
    """
    # Validate up front: the fallback branch below calls ``.lower()`` on the
    # path, so a ``None``/non-string value would otherwise blow up inside the
    # ``except`` handler and hide the real reason loading failed.
    if not isinstance(tokenizer_path, str) or not tokenizer_path.strip():
        raise ValueError("Tokenizer path must be a non-empty string")
    tokenizer_path = tokenizer_path.strip()

    try:
        return AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
    except Exception:
        # "mt5" contains "t5", so a single substring test covers both.
        if "t5" in tokenizer_path.lower():
            return T5Tokenizer.from_pretrained(tokenizer_path)
        raise
# Tokenize and decode; failures are reported through the return value
def tokenize_display(text, tokenizer_path):
    """Tokenize *text* with the tokenizer at *tokenizer_path*.

    Args:
        text: The text to encode (special tokens are added).
        tokenizer_path: Path passed to ``load_tokenizer``.

    Returns:
        A ``(tokens, ids, decoded)`` tuple: the token strings for the ids,
        the input ids, and the ids decoded back to text with special tokens
        kept. If anything raises, the tuple is
        ``(["[ERROR] <message>"], [], "[Tokenizer Error]")`` instead, so the
        caller never sees an exception.
    """
    try:
        tok = load_tokenizer(tokenizer_path)
        encoded = tok(text, return_offsets_mapping=False, add_special_tokens=True)
        token_ids = encoded.input_ids
        pieces = tok.convert_ids_to_tokens(token_ids)
        round_trip = tok.decode(token_ids, skip_special_tokens=False)
    except Exception as err:
        return [f"[ERROR] {err!s}"], [], "[Tokenizer Error]"
    return pieces, token_ids, round_trip
# Comparison logic
def _code_span(text):
    """Return *text* wrapped as a Markdown inline code span.

    The backtick fence is one longer than the longest backtick run inside the
    text, and the content is space-padded whenever it contains backticks, so
    the value is shown literally instead of being parsed as Markdown or HTML
    (tokens such as ``<s>`` / ``</s>`` would otherwise be treated as tags).
    Line breaks are collapsed to spaces to keep the span on one line.
    """
    text = " ".join(str(text).splitlines())
    longest = run = 0
    for ch in text:
        run = run + 1 if ch == "`" else 0
        longest = max(longest, run)
    fence = "`" * (longest + 1)
    pad = " " if longest else ""
    return f"{fence}{pad}{text}{pad}{fence}"


def _format_section(heading, text, tokenizer_path):
    """Render the Markdown report for one input text under one tokenizer."""
    tokens, ids, decoded = tokenize_display(text, tokenizer_path)
    # tokenize_display signals failure with an empty id list; in that case the
    # single "token" is the error message, so its length is not a token count.
    token_count = len(tokens) if ids else 'N/A'
    return "\n\n".join([
        f"#### {heading}",
        _code_span(text),
        "**Tokenized:**",
        _code_span(' '.join(tokens)),
        f"**Number of tokens:** {token_count}",
        f"**IDs:** {ids or '[ERROR]'}",
        f"**Decoded:** {_code_span(decoded)}",
    ])


def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
    """Build the two Markdown reports shown side by side in the UI.

    Args:
        dv_text: Dhivehi sample text.
        en_text: English sample text.
        custom_label: Key into ``TOKENIZER_CUSTOM`` selecting the custom
            tokenizer.
        stock_path: Tokenizer path for the stock tokenizer.

    Returns:
        A ``(custom_report, stock_report)`` tuple of Markdown strings. If
        ``custom_label`` is not a key of ``TOKENIZER_CUSTOM`` the tuple is
        ``("[ERROR] Invalid custom tokenizer selected", "")``.
    """
    try:
        custom_path = TOKENIZER_CUSTOM[custom_label]
    except KeyError:
        return "[ERROR] Invalid custom tokenizer selected", ""

    def format_block(title, tokenizer_path):
        # Blank lines between the parts keep each one its own Markdown block;
        # without them "---" right after a text line is read as a heading
        # underline instead of a horizontal rule.
        parts = [
            f"### 🔤 {title}",
            _format_section("🈁 Dhivehi Text", dv_text, tokenizer_path),
            "---",
            _format_section("🇬🇧 English Text", en_text, tokenizer_path),
        ]
        return "\n\n".join(parts) + "\n"

    return (
        format_block("Custom Tokenizer", custom_path),
        format_block("Stock Tokenizer", stock_path),
    )
# Gradio UI
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool") as demo:
    # Page header.
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    # First row: the two sample texts that get tokenized.
    with gr.Row():
        dhivehi_text = gr.Textbox(
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            label="Dhivehi Text",
            lines=1,
            rtl=True,
        )
        english_text = gr.Textbox(
            value="The quick brown fox jumps over the lazy dog",
            label="English Text",
            lines=1,
        )

    # Second row: which tokenizers to compare. The stock dropdown accepts
    # free-form values in addition to the suggested ones.
    with gr.Row():
        tokenizer_a = gr.Dropdown(
            choices=list(TOKENIZER_CUSTOM),
            value="T5 Extended",
            label="Select Custom Tokenizer",
        )
        tokenizer_b = gr.Dropdown(
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,
            label="Enter or Select Stock Tokenizer Path",
        )

    compare_button = gr.Button("Compare Tokenizers")

    # Third row: one Markdown report per tokenizer.
    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output")
        output_stock = gr.Markdown(label="Stock Tokenizer Output")

    # Clicking the button fills both reports from the four inputs.
    compare_button.click(
        fn=compare_side_by_side,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
    )

demo.launch()