Spaces:

alakxender
/

dhivehi-tokenizers

Running

App Files Community

alakxender committed on Jun 7

Commit

249b1cb

1 Parent(s): 9405745

n

Browse files

Files changed (1) hide show

app.py +50 -33

app.py CHANGED Viewed

@@ -1,26 +1,39 @@
 import gradio as gr
 from transformers import AutoTokenizer
-# Tokenizer options
-TOKENIZER_PATHS = {
-    "Custom Tokenizer (alakxender/flan-t5-dhivehi-tokenizer)": "alakxender/flan-t5-dhivehi-tokenizer",
-    "Stock Tokenizer (google/flan-t5-base)": "google/flan-t5-base",
-    "T5 Small (t5-small)": "t5-small"
 }
 def tokenize_display(text, tokenizer_path):
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-    encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
-    tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
-    ids = encoding.input_ids
-    decoded = tokenizer.decode(ids, skip_special_tokens=False)
-    return tokens, ids, decoded
-def compare_side_by_side(dv_text, en_text, custom_label, stock_label):
     def format_block(title, tokenizer_path):
-        # Dhivehi
         dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
-        # English
         en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
         return f"""\
@@ -32,8 +45,8 @@ def compare_side_by_side(dv_text, en_text, custom_label, stock_label):
 **Tokenized:**
 {' '.join(dv_tokens)}
-**Number of tokens:** {len(dv_tokens)}
-**IDs:** {dv_ids}
 **Decoded:** `{dv_decoded}`
 ---
@@ -44,46 +57,50 @@ def compare_side_by_side(dv_text, en_text, custom_label, stock_label):
 **Tokenized:**
 {' '.join(en_tokens)}
-**Number of tokens:** {len(en_tokens)}
-**IDs:** {en_ids}
 **Decoded:** `{en_decoded}`
 """
-    custom_path = TOKENIZER_PATHS[custom_label]
-    stock_path = TOKENIZER_PATHS[stock_label]
     return (
         format_block("Custom Tokenizer", custom_path),
         format_block("Stock Tokenizer", stock_path)
     )
-# Gradio app
-with gr.Blocks(title="Tokenizer Comparison Tool") as demo:
-    gr.Markdown("## 🧠 Tokenizer Comparison (Custom vs Stock)")
-    gr.Markdown("Compare how different tokenizers handle Dhivehi and English input text.")
     with gr.Row():
         dhivehi_text = gr.Textbox(
             label="Dhivehi Text",
-            lines=2,
-            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ"
         )
         english_text = gr.Textbox(
             label="English Text",
-            lines=2,
             value="The quick brown fox jumps over the lazy dog"
         )
     with gr.Row():
         tokenizer_a = gr.Dropdown(
             label="Select Custom Tokenizer",
-            choices=list(TOKENIZER_PATHS.keys()),
-            value="Custom Tokenizer (alakxender/flan-t5-dhivehi-tokenizer)"
         )
         tokenizer_b = gr.Dropdown(
-            label="Select Stock Tokenizer",
-            choices=list(TOKENIZER_PATHS.keys()),
-            value="Stock Tokenizer (google/flan-t5-base)"
         )
     compare_button = gr.Button("Compare Tokenizers")

 import gradio as gr
 from transformers import AutoTokenizer
+# Fixed list of custom tokenizers (left)
+TOKENIZER_CUSTOM = {
+    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
+    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended"
 }
+# Suggested stock model paths for the right input
+SUGGESTED_STOCK_PATHS = [
+    "google/flan-t5-base",
+    "t5-small",
+    "t5-base",
+    "t5-large",
+    "google/mt5-base",
+    "microsoft/trocr-base-handwritten",
+    "microsoft/trocr-base-printed"
+]
+# Tokenize and decode with error handling
 def tokenize_display(text, tokenizer_path):
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
+        tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
+        ids = encoding.input_ids
+        decoded = tokenizer.decode(ids, skip_special_tokens=False)
+        return tokens, ids, decoded
+    except Exception as e:
+        return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
+# Comparison logic
+def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
     def format_block(title, tokenizer_path):
         dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
         en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
         return f"""\
 **Tokenized:**
 {' '.join(dv_tokens)}
+**Number of tokens:** {len(dv_tokens) if dv_ids else 'N/A'}
+**IDs:** {dv_ids or '[ERROR]'}
 **Decoded:** `{dv_decoded}`
 ---
 **Tokenized:**
 {' '.join(en_tokens)}
+**Number of tokens:** {len(en_tokens) if en_ids else 'N/A'}
+**IDs:** {en_ids or '[ERROR]'}
 **Decoded:** `{en_decoded}`
 """
+    try:
+        custom_path = TOKENIZER_CUSTOM[custom_label]
+    except KeyError:
+        return "[ERROR] Invalid custom tokenizer selected", ""
     return (
         format_block("Custom Tokenizer", custom_path),
         format_block("Stock Tokenizer", stock_path)
     )
+# Gradio UI
+with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool") as demo:
+    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
+    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")
     with gr.Row():
         dhivehi_text = gr.Textbox(
             label="Dhivehi Text",
+            lines=1,
+            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
+            rtl=True
         )
         english_text = gr.Textbox(
             label="English Text",
+            lines=1,
             value="The quick brown fox jumps over the lazy dog"
         )
     with gr.Row():
         tokenizer_a = gr.Dropdown(
             label="Select Custom Tokenizer",
+            choices=list(TOKENIZER_CUSTOM.keys()),
+            value="T5 Extended"
         )
         tokenizer_b = gr.Dropdown(
+            label="Enter or Select Stock Tokenizer Path",
+            choices=SUGGESTED_STOCK_PATHS,
+            value="google/flan-t5-base",
+            allow_custom_value=True
         )
     compare_button = gr.Button("Compare Tokenizers")