Michael Hu
initial check in
05b45a5
def get_vocab():
    """Build the character-to-token-ID table.

    The symbol inventory is the pad character, then punctuation, then ASCII
    letters, then IPA letters, numbered from 0 in that order.

    Returns:
        Dict mapping each single-character symbol to its integer ID. A
        character that occurs more than once in the inventory keeps the
        index of its last occurrence.
    """
    pad = "$"
    punctuation = ';:,.!?¡¿—…"«»"" '
    ascii_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    ipa_letters = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    # Iterating the concatenated string yields one symbol per code point,
    # in inventory order; pairing with a running index assigns the IDs.
    alphabet = pad + punctuation + ascii_letters + ipa_letters
    return dict(zip(alphabet, range(len(alphabet))))
# Module-level character -> token ID table, built once at import time and
# read by tokenize() and decode_tokens().
VOCAB: dict[str, int] = get_vocab()
def tokenize(phonemes: str) -> list[int]:
    """Map a phoneme string to its sequence of token IDs.

    Args:
        phonemes: String of phoneme characters to look up in VOCAB.

    Returns:
        Token IDs in input order. Characters that are not in VOCAB are
        skipped, so the result may be shorter than the input.
    """
    lookup = VOCAB.get
    token_ids = []
    for symbol in phonemes:
        token_id = lookup(symbol)
        if token_id is None:
            # Unknown character: drop it rather than fail.
            continue
        token_ids.append(token_id)
    return token_ids
def decode_tokens(tokens: list[int]) -> str:
    """Turn a sequence of token IDs back into a phoneme string.

    Args:
        tokens: Token IDs to look up.

    Returns:
        The symbols for the given IDs, concatenated in order.

    Raises:
        KeyError: If an ID has no entry in the inverted VOCAB table.
    """
    # Invert VOCAB (symbol -> id) into id -> symbol.
    symbol_for_id = dict(zip(VOCAB.values(), VOCAB.keys()))
    pieces = []
    for token_id in tokens:
        pieces.append(symbol_for_id[token_id])
    return "".join(pieces)