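# NOTE(review): the text "Spaces: Runtime error" preceded this module in the
# captured copy; it appears to be hosting-page status text, not program code.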
def get_vocab() -> dict[str, int]:
    """Build the vocabulary mapping each symbol to its integer token ID.

    The symbol table is laid out as: the pad symbol, then punctuation,
    then ASCII letters, then IPA symbols. A symbol's token ID is its
    position in that table, so the order of the strings below must not
    be changed.

    If a symbol occurs more than once in the table, the mapping keeps the
    position of its last occurrence; the earlier position is then not
    assigned to any symbol.

    Returns:
        Dictionary mapping single-character symbols to token IDs.
    """
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»"" '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # list(str) splits each table into individual code points, so every
    # vocabulary key is exactly one character long.
    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

    # Later duplicates overwrite earlier ones (normal dict semantics).
    return {symbol: i for i, symbol in enumerate(symbols)}
# Module-level vocabulary, built once at import time so that tokenize() and
# decode_tokens() do not have to rebuild the symbol table on every call.
VOCAB = get_vocab()
def tokenize(phonemes: str) -> list[int]:
    """Convert a phoneme string to a list of token IDs.

    Each character of ``phonemes`` is looked up in ``VOCAB``. Characters
    that are not in the vocabulary are silently dropped, so the result may
    be shorter than the input.

    Args:
        phonemes: String of phonemes to tokenize.

    Returns:
        List of token IDs, in input order, for the characters found in
        ``VOCAB``.
    """
    # VOCAB.get yields None for unknown characters; compare against None
    # explicitly because token ID 0 (the pad symbol) is a valid, falsy ID.
    return [i for i in map(VOCAB.get, phonemes) if i is not None]
def decode_tokens(tokens: list[int]) -> str:
    """Convert token IDs back to a phoneme string.

    Args:
        tokens: List of token IDs.

    Returns:
        String formed by concatenating the symbol for each token ID, in
        order. An empty list yields an empty string.

    Raises:
        KeyError: If a token ID is not the ID of any symbol in ``VOCAB``.
    """
    # Reverse mapping is rebuilt from VOCAB on each call, so it always
    # reflects the current contents of VOCAB.
    id_to_symbol = {i: s for s, i in VOCAB.items()}
    return "".join(id_to_symbol[t] for t in tokens)