Commit fee5e46
Parent(s): 249b1cb
- app.py +16 -4
- requirements.txt +2 -1
app.py CHANGED
@@ -1,10 +1,12 @@
 import gradio as gr
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, T5Tokenizer
 
 # Fixed list of custom tokenizers (left)
 TOKENIZER_CUSTOM = {
     "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
-    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended"
+    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
+    "Google mT5": "google/mt5-base",
+    "DeBERTa": "alakxender/deberta-dhivehi-tokenizer-extended"
 }
 
 # Suggested stock model paths for the right input
@@ -15,13 +17,23 @@ SUGGESTED_STOCK_PATHS = [
     "t5-large",
     "google/mt5-base",
     "microsoft/trocr-base-handwritten",
-    "microsoft/trocr-base-printed"
+    "microsoft/trocr-base-printed",
+    "microsoft/deberta-v3-base"
 ]
 
+# Load tokenizer with fallback to slow T5
+def load_tokenizer(tokenizer_path):
+    try:
+        return AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
+    except Exception:
+        if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
+            return T5Tokenizer.from_pretrained(tokenizer_path)
+        raise
+
 # Tokenize and decode with error handling
 def tokenize_display(text, tokenizer_path):
     try:
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        tokenizer = load_tokenizer(tokenizer_path)
         encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
         tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
         ids = encoding.input_ids
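The new load_tokenizer helper prefers the fast (Rust-backed) tokenizer and retries with the slow SentencePiece-based T5Tokenizer only when the fast load fails for a T5-family path; any other failure is re-raised. A minimal usage sketch of the fallback, assuming app.py is importable from the Space's working directory (the sample text is illustrative):

    from app import load_tokenizer

    # Fast tokenizer when available; slow T5Tokenizer if the fast load raises.
    tok = load_tokenizer("google/mt5-base")
    enc = tok("Dhivehi tokenizer comparison", add_special_tokens=True)
    print(tok.convert_ids_to_tokens(enc.input_ids))
    print(enc.input_ids)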
requirements.txt CHANGED
@@ -1 +1,2 @@
-transformers
+transformers
+sentencepiece
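The added sentencepiece dependency is what the slow T5Tokenizer fallback relies on; transformers raises an import error at from_pretrained time when it is missing, so listing it here keeps the fallback path usable in the Space.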