Spaces:
Runtime error
Runtime error
File size: 1,271 Bytes
05b45a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
def get_vocab():
    """Build the symbol-to-token-ID lookup table.

    The alphabet is the pad symbol, followed by punctuation, ASCII letters
    and IPA symbols, in that order. Each character is assigned its position
    in that sequence as its ID.

    Returns:
        Dict mapping each single-character symbol to its integer ID. A symbol
        that occurs more than once in the alphabet keeps the index of its
        last occurrence, so some indices are left unused.
    """
    pad = "$"
    punctuation = ';:,.!?¡¿—…"«»"" '
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # Order matters: the position in this concatenation IS the token ID.
    alphabet = pad + punctuation + letters + letters_ipa

    vocab = {}
    for index, char in enumerate(alphabet):
        # Plain assignment: a repeated character overwrites its earlier index.
        vocab[char] = index
    return vocab
# Module-level vocabulary, built once at import time; tokenize() and
# decode_tokens() both read from this shared symbol -> token-ID mapping.
VOCAB = get_vocab()
def tokenize(phonemes: str) -> list[int]:
    """Map a phoneme string to its sequence of token IDs.

    Characters that have no entry in ``VOCAB`` are silently skipped, so the
    result may be shorter than the input.

    Args:
        phonemes: String of phoneme characters to convert.

    Returns:
        Token IDs for the recognised characters, in input order.
    """
    token_ids = []
    for char in phonemes:
        # Unknown symbols are dropped rather than raising.
        if char in VOCAB:
            token_ids.append(VOCAB[char])
    return token_ids
def decode_tokens(tokens: list[int]) -> str:
    """Turn a sequence of token IDs back into a phoneme string.

    Args:
        tokens: Token IDs to convert.

    Returns:
        The concatenation of the symbols the IDs stand for.

    Raises:
        KeyError: If a token ID has no corresponding symbol in ``VOCAB``.
    """
    # Invert VOCAB (symbol -> ID) into an ID -> symbol lookup.
    symbol_by_id = {}
    for symbol, token_id in VOCAB.items():
        symbol_by_id[token_id] = symbol

    pieces = [symbol_by_id[token_id] for token_id in tokens]
    return "".join(pieces)
|