alakxender committed
Commit fee5e46 · Parent(s): 249b1cb
Files changed (2)
  1. app.py +16 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,10 +1,12 @@
 import gradio as gr
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, T5Tokenizer

 # Fixed list of custom tokenizers (left)
 TOKENIZER_CUSTOM = {
     "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
-    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended"
+    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
+    "Google mT5": "google/mt5-base",
+    "DeBERTa": "alakxender/deberta-dhivehi-tokenizer-extended"
 }

 # Suggested stock model paths for the right input
@@ -15,13 +17,23 @@ SUGGESTED_STOCK_PATHS = [
     "t5-large",
     "google/mt5-base",
     "microsoft/trocr-base-handwritten",
-    "microsoft/trocr-base-printed"
+    "microsoft/trocr-base-printed",
+    "microsoft/deberta-v3-base"
 ]

+# Load tokenizer with fallback to slow T5
+def load_tokenizer(tokenizer_path):
+    try:
+        return AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
+    except Exception:
+        if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
+            return T5Tokenizer.from_pretrained(tokenizer_path)
+        raise
+
 # Tokenize and decode with error handling
 def tokenize_display(text, tokenizer_path):
     try:
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        tokenizer = load_tokenizer(tokenizer_path)
         encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
         tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
         ids = encoding.input_ids
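The new load_tokenizer helper tries the fast (Rust-backed) tokenizer first and, if that fails for a T5/mT5 checkpoint, falls back to the slow, sentencepiece-based T5Tokenizer, which loads the sentencepiece model directly instead of converting it. A minimal self-contained sketch of the same pattern; the model path comes from the suggested list above, and the sample text is illustrative, not from the commit:

    from transformers import AutoTokenizer, T5Tokenizer

    path = "google/mt5-base"  # illustrative checkpoint from SUGGESTED_STOCK_PATHS
    try:
        # Prefer the fast (Rust) tokenizer when one can be built
        tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)
    except Exception:
        # Fast conversion can fail for some T5/mT5 checkpoints;
        # the slow path needs the sentencepiece package (see requirements.txt)
        tokenizer = T5Tokenizer.from_pretrained(path)

    encoding = tokenizer("hello world", add_special_tokens=True)
    print(tokenizer.convert_ids_to_tokens(encoding.input_ids))
    print(encoding.input_ids)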
requirements.txt CHANGED
@@ -1 +1,2 @@
-transformers
+transformers
+sentencepiece
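sentencepiece is added because the slow T5Tokenizer fallback in app.py depends on it; without the package, T5Tokenizer.from_pretrained raises an ImportError asking for sentencepiece. A quick sanity check that the dependency is installed (illustrative, not part of the commit):

    # Confirms the backend the slow T5 tokenizer needs is importable
    import sentencepiece
    print(sentencepiece.__version__)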