Zihan428 committed
Commit c612a94 · Parent: 364e836

Rename and cleanup

app.py CHANGED

@@ -102,7 +102,7 @@ LANGUAGE_CONFIG = {
     },
     "zh": {
         "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
-        "text": "上个月,我们达到了一个新的里程碑. 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
+        "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
     },
 }
 
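The only substantive change in app.py swaps the ASCII full stop after 里程碑 for the ideographic full stop, making the punctuation in the Chinese demo prompt consistent. The two marks are distinct code points; a quick standard-library check (nothing from the repo assumed):

    # ASCII full stop vs. CJK ideographic full stop - different code points:
    print(hex(ord(".")))   # 0x2e   (FULL STOP)
    print(hex(ord("。")))  # 0x3002 (IDEOGRAPHIC FULL STOP)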
src/chatterbox/models/tokenizers/tokenizer.py CHANGED

@@ -1,10 +1,9 @@
 import logging
 import json
-import re
 
 import torch
 from pathlib import Path
-from unicodedata import category
+from unicodedata import category, normalize
 from tokenizers import Tokenizer
 from huggingface_hub import hf_hub_download
 
 
@@ -33,7 +32,7 @@ class EnTokenizer:
         text_tokens = torch.IntTensor(text_tokens).unsqueeze(0)
         return text_tokens
 
-    def encode( self, txt: str, verbose=False):
+    def encode(self, txt: str):
         """
         clean_text > (append `lang_id`) > replace SPACE > encode text using Tokenizer
         """
@@ -46,8 +45,7 @@ class EnTokenizer:
         if isinstance(seq, torch.Tensor):
             seq = seq.cpu().numpy()
 
-        txt: str = self.tokenizer.decode(seq,
-                                         skip_special_tokens=False)
+        txt: str = self.tokenizer.decode(seq, skip_special_tokens=False)
         txt = txt.replace(' ', '')
         txt = txt.replace(SPACE, ' ')
         txt = txt.replace(EOT, '')
@@ -61,6 +59,7 @@ REPO_ID = "ResembleAI/chatterbox"
 # Global instances for optional dependencies
 _kakasi = None
 _dicta = None
+_russian_stresser = None
 
 
 def is_kanji(c: str) -> bool:
@@ -207,7 +206,6 @@ class ChineseCangjieConverter:
         index = str(index) if index > 0 else ""
         return code + str(index)
 
-
 
     def __call__(self, text):
         """Convert Chinese characters in text to Cangjie tokens."""
@@ -235,53 +233,30 @@ class ChineseCangjieConverter:
         return "".join(output)
 
 
-class RussianStressLabeler:
-    """Adds stress marks to Russian text when the optional dependency is available."""
-
-    def __init__(self):
-        self._stresser = None
-        self._available = False
-        self._error_logged = False
-        self._initialize()
-
-    def _initialize(self):
-        try:
+def add_russian_stress(text: str) -> str:
+    """Russian text normalization: adds stress marks to Russian text."""
+    global _russian_stresser
+
+    try:
+        if _russian_stresser is None:
             from russian_text_stresser.text_stresser import RussianTextStresser
-        except ImportError:
-            logger.warning("russian_text_stresser not available - Russian stress labeling skipped")
-            self._error_logged = True
-            return
-        except Exception as exc:
-            logger.warning(f"Failed to import RussianTextStresser: {exc}")
-            self._error_logged = True
-            return
-
-        try:
-            self._stresser = RussianTextStresser()
-            self._available = True
-        except Exception as exc:
-            logger.warning(f"Failed to initialize RussianTextStresser: {exc}")
-            self._error_logged = True
-
-    def __call__(self, text: str) -> str:
-        if not text or not self._available:
-            return text
-
-        try:
-            return self._stresser.stress_text(text)
-        except Exception as exc:
-            if not self._error_logged:
-                logger.warning(f"Russian stress labeling failed: {exc}")
-                self._error_logged = True
-            return text
+            _russian_stresser = RussianTextStresser()
+
+        return _russian_stresser.stress_text(text)
+
+    except ImportError:
+        logger.warning("russian_text_stresser not available - Russian stress labeling skipped")
+        return text
+    except Exception as e:
+        logger.warning(f"Russian stress labeling failed: {e}")
+        return text
 
 
 class MTLTokenizer:
     def __init__(self, vocab_file_path):
         self.tokenizer: Tokenizer = Tokenizer.from_file(vocab_file_path)
         model_dir = Path(vocab_file_path).parent
         self.cangjie_converter = ChineseCangjieConverter(model_dir)
-        self.russian_stress_labeler = RussianStressLabeler()
         self.check_vocabset_sot_eot()
 
     def check_vocabset_sot_eot(self):
@@ -289,12 +264,26 @@ class MTLTokenizer:
         assert SOT in voc
         assert EOT in voc
 
-    def text_to_tokens(self, text: str, language_id: str = None):
-        text_tokens = self.encode(text, language_id=language_id)
+    def preprocess_text(self, raw_text: str, language_id: str = None, lowercase: bool = True, nfkd_normalize: bool = True):
+        """
+        Text preprocessor that handles lowercase conversion and NFKD normalization.
+        """
+        preprocessed_text = raw_text
+        if lowercase:
+            preprocessed_text = preprocessed_text.lower()
+        if nfkd_normalize:
+            preprocessed_text = normalize("NFKD", preprocessed_text)
+
+        return preprocessed_text
+
+    def text_to_tokens(self, text: str, language_id: str = None, lowercase: bool = True, nfkd_normalize: bool = True):
+        text_tokens = self.encode(text, language_id=language_id, lowercase=lowercase, nfkd_normalize=nfkd_normalize)
         text_tokens = torch.IntTensor(text_tokens).unsqueeze(0)
         return text_tokens
 
-    def encode(self, txt: str, language_id: str = None):
+    def encode(self, txt: str, language_id: str = None, lowercase: bool = True, nfkd_normalize: bool = True):
+        txt = self.preprocess_text(txt, language_id=language_id, lowercase=lowercase, nfkd_normalize=nfkd_normalize)
+
         # Language-specific text processing
         if language_id == 'zh':
             txt = self.cangjie_converter(txt)
@@ -305,11 +294,7 @@ class MTLTokenizer:
         elif language_id == 'ko':
             txt = korean_normalize(txt)
         elif language_id == 'ru':
-            txt = self.russian_stress_labeler(txt)
-        elif language_id == 'pl':
-            # Polish text normalization: ensure diacritic characters are preserved
-            import unicodedata
-            txt = unicodedata.normalize('NFC', txt)
+            txt = add_russian_stress(txt)
 
         # Prepend language token
         if language_id:
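The new preprocess_text lowercases and NFKD-normalizes every input by default, which is why the dedicated Polish branch could be dropped. Note, though, that the removed branch used NFC (composing) while the new default is NFKD (decomposing), so precomposed diacritics now reach the tokenizer as a base character plus a combining mark. A standard-library illustration:

    from unicodedata import normalize

    # NFKD decomposes precomposed characters where a decomposition exists:
    assert normalize("NFKD", "ó") == "o\u0301"   # base "o" + COMBINING ACUTE ACCENT
    assert normalize("NFKD", "ł") == "ł"         # U+0142 has no decomposition mapping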
 
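From a caller's perspective, the refactored entry point now looks like this; a minimal sketch, assuming a local vocab file (the path is a placeholder):

    from chatterbox.models.tokenizers.tokenizer import MTLTokenizer

    tok = MTLTokenizer("mtl_tokenizer.json")   # placeholder vocab path
    # Lowercasing and NFKD normalization are on by default and can be disabled per call:
    ids = tok.text_to_tokens("Привет, мир!", language_id="ru", lowercase=False)
    print(ids.shape)   # torch.IntTensor of shape (1, sequence_length)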