import gradio as gr
from transformers import AutoTokenizer, T5Tokenizer

# Fixed list of custom tokenizers (left)
TOKENIZER_CUSTOM = {
    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
    "Google mT5": "google/mt5-base",
    "DeBERTa": "alakxender/deberta-dhivehi-tokenizer-extended"
}

# Suggested stock model paths for the right input
SUGGESTED_STOCK_PATHS = [
    "google/flan-t5-base",
    "t5-small",
    "t5-base",
    "t5-large",
    "google/mt5-base",
    "microsoft/trocr-base-handwritten",
    "microsoft/trocr-base-printed",
    "microsoft/deberta-v3-base"
]

# Load tokenizer with fallback to the slow T5 tokenizer
def load_tokenizer(tokenizer_path):
    try:
        return AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
    except Exception:
        # Fast-tokenizer loading can fail for SentencePiece-based T5/mT5 checkpoints;
        # fall back to the slow implementation for those, otherwise re-raise.
        if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
            return T5Tokenizer.from_pretrained(tokenizer_path)
        raise

# Tokenize and decode with error handling
def tokenize_display(text, tokenizer_path):
    try:
        tokenizer = load_tokenizer(tokenizer_path)
        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
        ids = encoding.input_ids
        decoded = tokenizer.decode(ids, skip_special_tokens=False)
        return tokens, ids, decoded
    except Exception as e:
        return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"

# Comparison logic
def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
    def format_block(title, tokenizer_path):
        dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
        en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
        return f"""\
### 🔤 {title}

#### 🈁 Dhivehi Text
`{dv_text}`

**Tokenized:**
{' '.join(dv_tokens)}

**Number of tokens:** {len(dv_tokens) if dv_ids else 'N/A'}

**IDs:** {dv_ids or '[ERROR]'}

**Decoded:** `{dv_decoded}`

---

#### 🇬🇧 English Text
`{en_text}`

**Tokenized:**
{' '.join(en_tokens)}

**Number of tokens:** {len(en_tokens) if en_ids else 'N/A'}

**IDs:** {en_ids or '[ERROR]'}

**Decoded:** `{en_decoded}`
"""

    try:
        custom_path = TOKENIZER_CUSTOM[custom_label]
    except KeyError:
        return "[ERROR] Invalid custom tokenizer selected", ""

    return (
        format_block("Custom Tokenizer", custom_path),
        format_block("Stock Tokenizer", stock_path)
    )

# Gradio UI
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool") as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=1,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=1,
            value="The quick brown fox jumps over the lazy dog"
        )

    with gr.Row():
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended"
        )
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True
        )

    compare_button = gr.Button("Compare Tokenizers")

    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output")
        output_stock = gr.Markdown(label="Stock Tokenizer Output")

    compare_button.click(
        compare_side_by_side,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock]
    )

demo.launch()
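
For a quick sanity check of the same comparison outside the Space, the tokenizer paths used above can be loaded directly with AutoTokenizer. This is a minimal sketch, not part of the app; run it as a separate script, and it assumes transformers and sentencepiece are installed and the Hugging Face Hub is reachable:

# Standalone sketch: compare token counts for one custom and one stock tokenizer
# on the same Dhivehi sentence used as the default input above.
from transformers import AutoTokenizer

sample = "އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ"
for path in ("alakxender/dhivehi-T5-tokenizer-extended", "google/flan-t5-base"):
    tok = AutoTokenizer.from_pretrained(path)
    ids = tok(sample, add_special_tokens=True).input_ids
    print(f"{path}: {len(ids)} tokens")
    print(tok.convert_ids_to_tokens(ids))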