alakxender committed on
Commit
9405745
·
1 Parent(s): 60c8da8
Files changed (2) hide show
  1. app.py +97 -3
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,7 +1,101 @@
1
  import gradio as gr
 
2
 
3
def greet(name):
    """Return the greeting "Hello <name>!!" for the given name string."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
# Single text-in / text-out interface wrapped around greet.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
demo.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoTokenizer
3
 
4
# Tokenizer options: dropdown label shown in the UI -> tokenizer identifier.
# Insertion order is the order in which the choices are listed.
TOKENIZER_PATHS = dict(
    [
        (
            "Custom Tokenizer (alakxender/flan-t5-dhivehi-tokenizer)",
            "alakxender/flan-t5-dhivehi-tokenizer",
        ),
        ("Stock Tokenizer (google/flan-t5-base)", "google/flan-t5-base"),
        ("T5 Small (t5-small)", "t5-small"),
    ]
)
10
+
11
# Tokenizers already loaded in this process, keyed by the identifier handed to
# AutoTokenizer.from_pretrained. Without this, every button click reloads the
# same tokenizer once per text and per column.
# NOTE(review): if two requests race on a cold key the tokenizer is simply
# loaded twice and the last one wins; no locking is attempted here.
_TOKENIZER_CACHE = {}


def tokenize_display(text, tokenizer_path):
    """Encode *text* with the tokenizer identified by *tokenizer_path*.

    Args:
        text: The string to tokenize.
        tokenizer_path: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(tokens, ids, decoded)`` tuple:
        ``ids`` are the encoded input ids (special tokens included),
        ``tokens`` are the token strings for those ids, and
        ``decoded`` is the ids decoded back to text with special tokens kept.
    """
    tokenizer = _TOKENIZER_CACHE.get(tokenizer_path)
    if tokenizer is None:
        # First use of this identifier: load once and remember it.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        _TOKENIZER_CACHE[tokenizer_path] = tokenizer

    encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
    ids = encoding.input_ids
    tokens = tokenizer.convert_ids_to_tokens(ids)
    decoded = tokenizer.decode(ids, skip_special_tokens=False)
    return tokens, ids, decoded
18
+
19
def compare_side_by_side(dv_text, en_text, custom_label, stock_label):
    """Build the two Markdown reports shown side by side in the UI.

    Args:
        dv_text: Dhivehi input text.
        en_text: English input text.
        custom_label: Key of ``TOKENIZER_PATHS`` chosen in the first dropdown.
        stock_label: Key of ``TOKENIZER_PATHS`` chosen in the second dropdown.

    Returns:
        A ``(first_markdown, second_markdown)`` tuple; each string reports the
        tokens, token count, ids and decoded text for both inputs under the
        corresponding tokenizer.

    Raises:
        KeyError: If either label is not a key of ``TOKENIZER_PATHS``.
    """

    def code_span(value):
        # Wrap value in a Markdown code span. The fence is grown until it no
        # longer occurs inside the value, so text containing backticks cannot
        # terminate the span early.
        value = str(value)
        fence = "`"
        while fence in value:
            fence += "`"
        # A value that starts or ends with a backtick needs space padding so
        # its own backticks do not merge with the fence.
        pad = " " if value.startswith("`") or value.endswith("`") else ""
        return f"{fence}{pad}{value}{pad}{fence}"

    def format_section(heading, text, tokenizer_path):
        # One per-language section: original text, tokens, count, ids, decoded.
        tokens, ids, decoded = tokenize_display(text, tokenizer_path)
        # Each token goes in its own code span: special tokens such as "</s>"
        # or "<unk>" would otherwise be parsed as inline HTML and vanish from
        # the rendered output.
        rendered_tokens = " ".join(code_span(token) for token in tokens)
        return "\n".join(
            [
                f"#### {heading}",
                code_span(text),
                "",
                "**Tokenized:**",
                rendered_tokens,
                "",
                f"**Number of tokens:** {len(tokens)}",
                f"**IDs:** {ids}",
                f"**Decoded:** {code_span(decoded)}",
            ]
        )

    def format_block(title, tokenizer_path):
        # Dhivehi section first, then English, separated by a horizontal rule.
        return "\n".join(
            [
                f"### 🔤 {title}",
                "",
                format_section("🈁 Dhivehi Text", dv_text, tokenizer_path),
                "",
                "---",
                "",
                format_section("🇬🇧 English Text", en_text, tokenizer_path),
                "",
            ]
        )

    custom_path = TOKENIZER_PATHS[custom_label]
    stock_path = TOKENIZER_PATHS[stock_label]

    # Head each column with the label actually selected: either dropdown can
    # hold any tokenizer, so fixed "Custom"/"Stock" titles could mislabel it.
    return (
        format_block(custom_label, custom_path),
        format_block(stock_label, stock_path),
    )
59
+
60
# Gradio app: two text inputs, two tokenizer pickers, one button, two outputs.
with gr.Blocks(title="Tokenizer Comparison Tool") as demo:
    gr.Markdown("## 🧠 Tokenizer Comparison (Custom vs Stock)")
    gr.Markdown("Compare how different tokenizers handle Dhivehi and English input text.")

    # Input texts, pre-filled with one sample sentence per language.
    with gr.Row():
        dhivehi_text = gr.Textbox(
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            lines=2,
            label="Dhivehi Text",
        )
        english_text = gr.Textbox(
            value="The quick brown fox jumps over the lazy dog",
            lines=2,
            label="English Text",
        )

    # Both dropdowns offer every entry of TOKENIZER_PATHS; only the defaults differ.
    with gr.Row():
        tokenizer_a = gr.Dropdown(
            choices=list(TOKENIZER_PATHS),
            value="Custom Tokenizer (alakxender/flan-t5-dhivehi-tokenizer)",
            label="Select Custom Tokenizer",
        )
        tokenizer_b = gr.Dropdown(
            choices=list(TOKENIZER_PATHS),
            value="Stock Tokenizer (google/flan-t5-base)",
            label="Select Stock Tokenizer",
        )

    compare_button = gr.Button("Compare Tokenizers")

    # One Markdown panel per selected tokenizer.
    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output")
        output_stock = gr.Markdown(label="Stock Tokenizer Output")

    # Clicking the button fills both panels from the four inputs.
    compare_button.click(
        fn=compare_side_by_side,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
    )


demo.launch()
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ transformers