alakxender committed on
Commit
249b1cb
·
1 Parent(s): 9405745
Files changed (1) hide show
  1. app.py +50 -33
app.py CHANGED
@@ -1,26 +1,39 @@
1
  import gradio as gr
2
  from transformers import AutoTokenizer
3
 
4
- # Tokenizer options
5
- TOKENIZER_PATHS = {
6
- "Custom Tokenizer (alakxender/flan-t5-dhivehi-tokenizer)": "alakxender/flan-t5-dhivehi-tokenizer",
7
- "Stock Tokenizer (google/flan-t5-base)": "google/flan-t5-base",
8
- "T5 Small (t5-small)": "t5-small"
9
  }
10
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def tokenize_display(text, tokenizer_path):
12
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
13
- encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
14
- tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
15
- ids = encoding.input_ids
16
- decoded = tokenizer.decode(ids, skip_special_tokens=False)
17
- return tokens, ids, decoded
18
-
19
- def compare_side_by_side(dv_text, en_text, custom_label, stock_label):
 
 
 
 
20
  def format_block(title, tokenizer_path):
21
- # Dhivehi
22
  dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
23
- # English
24
  en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
25
 
26
  return f"""\
@@ -32,8 +45,8 @@ def compare_side_by_side(dv_text, en_text, custom_label, stock_label):
32
  **Tokenized:**
33
  {' '.join(dv_tokens)}
34
 
35
- **Number of tokens:** {len(dv_tokens)}
36
- **IDs:** {dv_ids}
37
  **Decoded:** `{dv_decoded}`
38
 
39
  ---
@@ -44,46 +57,50 @@ def compare_side_by_side(dv_text, en_text, custom_label, stock_label):
44
  **Tokenized:**
45
  {' '.join(en_tokens)}
46
 
47
- **Number of tokens:** {len(en_tokens)}
48
- **IDs:** {en_ids}
49
  **Decoded:** `{en_decoded}`
50
  """
51
 
52
- custom_path = TOKENIZER_PATHS[custom_label]
53
- stock_path = TOKENIZER_PATHS[stock_label]
 
 
54
 
55
  return (
56
  format_block("Custom Tokenizer", custom_path),
57
  format_block("Stock Tokenizer", stock_path)
58
  )
59
 
60
- # Gradio app
61
- with gr.Blocks(title="Tokenizer Comparison Tool") as demo:
62
- gr.Markdown("## 🧠 Tokenizer Comparison (Custom vs Stock)")
63
- gr.Markdown("Compare how different tokenizers handle Dhivehi and English input text.")
64
 
65
  with gr.Row():
66
  dhivehi_text = gr.Textbox(
67
  label="Dhivehi Text",
68
- lines=2,
69
- value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ"
 
70
  )
71
  english_text = gr.Textbox(
72
  label="English Text",
73
- lines=2,
74
  value="The quick brown fox jumps over the lazy dog"
75
  )
76
 
77
  with gr.Row():
78
  tokenizer_a = gr.Dropdown(
79
  label="Select Custom Tokenizer",
80
- choices=list(TOKENIZER_PATHS.keys()),
81
- value="Custom Tokenizer (alakxender/flan-t5-dhivehi-tokenizer)"
82
  )
83
  tokenizer_b = gr.Dropdown(
84
- label="Select Stock Tokenizer",
85
- choices=list(TOKENIZER_PATHS.keys()),
86
- value="Stock Tokenizer (google/flan-t5-base)"
 
87
  )
88
 
89
  compare_button = gr.Button("Compare Tokenizers")
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer
3
 
4
+ # Fixed list of custom tokenizers (left)
5
+ TOKENIZER_CUSTOM = {
6
+ "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
7
+ "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended"
 
8
  }
9
 
10
+ # Suggested stock model paths for the right input
11
+ SUGGESTED_STOCK_PATHS = [
12
+ "google/flan-t5-base",
13
+ "t5-small",
14
+ "t5-base",
15
+ "t5-large",
16
+ "google/mt5-base",
17
+ "microsoft/trocr-base-handwritten",
18
+ "microsoft/trocr-base-printed"
19
+ ]
20
+
21
+ # Tokenize and decode with error handling
22
  def tokenize_display(text, tokenizer_path):
23
+ try:
24
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
25
+ encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
26
+ tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
27
+ ids = encoding.input_ids
28
+ decoded = tokenizer.decode(ids, skip_special_tokens=False)
29
+ return tokens, ids, decoded
30
+ except Exception as e:
31
+ return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
32
+
33
+ # Comparison logic
34
+ def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
35
  def format_block(title, tokenizer_path):
 
36
  dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
 
37
  en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
38
 
39
  return f"""\
 
45
  **Tokenized:**
46
  {' '.join(dv_tokens)}
47
 
48
+ **Number of tokens:** {len(dv_tokens) if dv_ids else 'N/A'}
49
+ **IDs:** {dv_ids or '[ERROR]'}
50
  **Decoded:** `{dv_decoded}`
51
 
52
  ---
 
57
  **Tokenized:**
58
  {' '.join(en_tokens)}
59
 
60
+ **Number of tokens:** {len(en_tokens) if en_ids else 'N/A'}
61
+ **IDs:** {en_ids or '[ERROR]'}
62
  **Decoded:** `{en_decoded}`
63
  """
64
 
65
+ try:
66
+ custom_path = TOKENIZER_CUSTOM[custom_label]
67
+ except KeyError:
68
+ return "[ERROR] Invalid custom tokenizer selected", ""
69
 
70
  return (
71
  format_block("Custom Tokenizer", custom_path),
72
  format_block("Stock Tokenizer", stock_path)
73
  )
74
 
75
+ # Gradio UI
76
+ with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool") as demo:
77
+ gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
78
+ gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")
79
 
80
  with gr.Row():
81
  dhivehi_text = gr.Textbox(
82
  label="Dhivehi Text",
83
+ lines=1,
84
+ value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
85
+ rtl=True
86
  )
87
  english_text = gr.Textbox(
88
  label="English Text",
89
+ lines=1,
90
  value="The quick brown fox jumps over the lazy dog"
91
  )
92
 
93
  with gr.Row():
94
  tokenizer_a = gr.Dropdown(
95
  label="Select Custom Tokenizer",
96
+ choices=list(TOKENIZER_CUSTOM.keys()),
97
+ value="T5 Extended"
98
  )
99
  tokenizer_b = gr.Dropdown(
100
+ label="Enter or Select Stock Tokenizer Path",
101
+ choices=SUGGESTED_STOCK_PATHS,
102
+ value="google/flan-t5-base",
103
+ allow_custom_value=True
104
  )
105
 
106
  compare_button = gr.Button("Compare Tokenizers")