SanderGi commited on
Commit
7796889
·
1 Parent(s): f718747

powsm support; multiple output formats

Browse files
Files changed (8) hide show
  1. DEVELOPMENT.md +1 -0
  2. app/app.py +37 -4
  3. app/codes.py +394 -0
  4. app/hf.py +16 -4
  5. app/inference.py +50 -2
  6. app/tasks.py +16 -4
  7. requirements.txt +4 -2
  8. requirements_lock.txt +52 -2
DEVELOPMENT.md CHANGED
@@ -85,6 +85,7 @@ IPA-Transcription-EN/
85
  ├── app/ # All application code lives here
86
  │ ├── data/ # Phoneme transcription test set
87
  │ ├── app.py # Main Gradio UI
 
88
  │ ├── hf.py # Interface with the Huggingface API
89
  │ ├── inference.py # Model inference
90
  │ └── metrics.py # Evaluation metrics
 
85
  ├── app/ # All application code lives here
86
  │ ├── data/ # Phoneme transcription test set
87
  │ ├── app.py # Main Gradio UI
88
+ │ ├── codes.py # Phonetic Alphabet conversions
89
  │ ├── hf.py # Interface with the Huggingface API
90
  │ ├── inference.py # Model inference
91
  │ └── metrics.py # Evaluation metrics
app/app.py CHANGED
@@ -7,6 +7,25 @@ import pandas as pd
7
  from tasks import start_eval_task, get_status
8
  from hf import get_or_create_leaderboard
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def get_latest_leaderboard_html(datasets: list[str], sort_option: str) -> str:
12
  try:
@@ -28,6 +47,7 @@ def get_latest_leaderboard_html(datasets: list[str], sort_option: str) -> str:
28
  lambda r: f'<a href="https://huggingface.co/{r["repo_id"]}" target="_blank">{r["display_name"]}</a>',
29
  axis=1,
30
  ),
 
31
  "Average PER ⬇️": df["average_per"].apply(lambda x: f"{100 * x:.2f}%"),
32
  }
33
  | {
@@ -53,14 +73,16 @@ def get_latest_leaderboard_html(datasets: list[str], sort_option: str) -> str:
53
  return f"Error updating leaderboard: {type(e).__name__} - {e}"
54
 
55
 
56
- def submit_evaluation(model_id: str, display_name: str, url: str) -> str:
 
 
57
  model_id = model_id.strip()
58
  display_name = display_name.strip()
59
  if not model_id or not display_name:
60
  return "⚠️ Please provide both model name and submission name."
61
 
62
  try:
63
- task_id = start_eval_task(display_name, model_id, url)
64
  return f"✅ Evaluation submitted successfully! Task ID: {task_id}"
65
  except Exception as e:
66
  return f"❌ Error: {str(e)}"
@@ -100,7 +122,7 @@ with gr.Blocks(
100
  - **PER (Phoneme Error Rate)**: The Levenshtein distance calculated between phoneme sequences of the predicted and actual transcriptions.
101
  - **FER (Feature Error Rate)**: The edit distance between the predicted and actual phoneme sequences, weighted by the phonetic features from [panphon](https://github.com/dmort27/panphon).
102
 
103
- Models are evaluated on a variety of English speech: native, non-native, and impaired. Read more about evaluations on [our blog](https://www.koellabs.com/blog/phonemic-transcription-metrics)
104
 
105
  ## Compute
106
  This leaderboard uses the free basic plan (16GB RAM, 2vCPUs) to allow for reproducibility. The evaluation may take several hours to complete. Please be patient and do not submit the same model multiple times.
@@ -163,12 +185,23 @@ with gr.Blocks(
163
  label="Github/Kaggle/HF URL (optional)",
164
  placeholder="https://github.com/username/repo",
165
  )
 
 
 
 
 
 
 
 
 
 
 
166
  submit_btn = gr.Button("Submit")
167
  result = gr.Textbox(label="Submission Status")
168
 
169
  submit_btn.click(
170
  fn=submit_evaluation,
171
- inputs=[model_id, display_name, url],
172
  outputs=result,
173
  )
174
 
 
7
  from tasks import start_eval_task, get_status
8
  from hf import get_or_create_leaderboard
9
 
10
+ from codes import CODES
11
+ from inference import MODEL_TYPES
12
+
13
+ from math import log
14
+
15
+ unit_list = list(zip(["B", "KB", "MB", "GB", "TB", "PB"], [0, 0, 1, 2, 2, 2]))
16
+
17
+
18
+ def sizeof_fmt(num):
19
+ """Human friendly file size"""
20
+ if isinstance(num, int):
21
+ exponent = min(int(log(num, 1024)), len(unit_list) - 1)
22
+ quotient = float(num) / 1024**exponent
23
+ unit, num_decimals = unit_list[exponent]
24
+ format_string = "{:.%sf} {}" % (num_decimals)
25
+ return format_string.format(quotient, unit)
26
+ else:
27
+ return "unknown"
28
+
29
 
30
  def get_latest_leaderboard_html(datasets: list[str], sort_option: str) -> str:
31
  try:
 
47
  lambda r: f'<a href="https://huggingface.co/{r["repo_id"]}" target="_blank">{r["display_name"]}</a>',
48
  axis=1,
49
  ),
50
+ "Size": df["model_bytes"].apply(sizeof_fmt),
51
  "Average PER ⬇️": df["average_per"].apply(lambda x: f"{100 * x:.2f}%"),
52
  }
53
  | {
 
73
  return f"Error updating leaderboard: {type(e).__name__} - {e}"
74
 
75
 
76
+ def submit_evaluation(
77
+ model_id: str, display_name: str, url: str, model_type: str, phone_code: str
78
+ ) -> str:
79
  model_id = model_id.strip()
80
  display_name = display_name.strip()
81
  if not model_id or not display_name:
82
  return "⚠️ Please provide both model name and submission name."
83
 
84
  try:
85
+ task_id = start_eval_task(display_name, model_id, url, model_type, phone_code)
86
  return f"✅ Evaluation submitted successfully! Task ID: {task_id}"
87
  except Exception as e:
88
  return f"❌ Error: {str(e)}"
 
122
  - **PER (Phoneme Error Rate)**: The Levenshtein distance calculated between phoneme sequences of the predicted and actual transcriptions.
123
  - **FER (Feature Error Rate)**: The edit distance between the predicted and actual phoneme sequences, weighted by the phonetic features from [panphon](https://github.com/dmort27/panphon).
124
 
125
+ Models are evaluated on a variety of English speech: native, non-native, and impaired. Read more about [evaluations](https://www.koellabs.com/blog/phonemic-transcription-metrics) or [how to build your own leaderboards](https://www.koellabs.com/blog/building-open-source-leaderboards) on our blog.
126
 
127
  ## Compute
128
  This leaderboard uses the free basic plan (16GB RAM, 2vCPUs) to allow for reproducibility. The evaluation may take several hours to complete. Please be patient and do not submit the same model multiple times.
 
185
  label="Github/Kaggle/HF URL (optional)",
186
  placeholder="https://github.com/username/repo",
187
  )
188
+ model_type = gr.Dropdown(
189
+ choices=["Transformers CTC"]
190
+ + [c for c in sorted(MODEL_TYPES) if c != "Transformers CTC"],
191
+ label="Model Type",
192
+ interactive=True,
193
+ )
194
+ output_code = gr.Dropdown(
195
+ choices=["ipa"] + [c for c in sorted(CODES) if c != "ipa"],
196
+ label="Model Output Phonetic Code",
197
+ interactive=True,
198
+ )
199
  submit_btn = gr.Button("Submit")
200
  result = gr.Textbox(label="Submission Status")
201
 
202
  submit_btn.click(
203
  fn=submit_evaluation,
204
+ inputs=[model_id, display_name, url, model_type, output_code],
205
  outputs=result,
206
  )
207
 
app/codes.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Conversion between different phonetic codes
4
+ # Modified from https://github.com/jhasegaw/phonecodes/blob/master/src/phonecodes.py
5
+
6
+ # Canonical version of this file lives in https://github.com/KoelLabs/ML
7
+
8
+ import sys
9
+
10
+ # CODES = set(("ipa", "timit", "arpabet", "xsampa", "buckeye", "epadb", "isle", "disc", "callhome"))
11
+ CODES = set(("ipa", "timit", "arpabet", "xsampa", "buckeye", "epadb", "isle"))
12
+
13
+
14
+ def convert(phoneme_string, from_code, to_code):
15
+ assert from_code in CODES, f"from_code must be one of {CODES}"
16
+ assert to_code in CODES, f"to_code must be one of {CODES}"
17
+
18
+ if from_code == "ipa":
19
+ return globals()[f"ipa2{to_code}"](phoneme_string)
20
+ elif to_code == "ipa":
21
+ return globals()[f"{from_code}2ipa"](phoneme_string)
22
+ else:
23
+ return globals()[f"ipa2{to_code}"](
24
+ globals()[f"{from_code}2ipa"](phoneme_string)
25
+ )
26
+
27
+
28
+ def string2symbols(string, symbols):
29
+ """Converts a string of symbols into a list of symbols, minimizing the number of untranslatable symbols, then minimizing the number of translated symbols."""
30
+ N = len(string)
31
+ symcost = 1 # path cost per translated symbol
32
+ oovcost = len(string) # path cost per untranslatable symbol
33
+ maxsym = max(len(k) for k in symbols) # max input symbol length
34
+ # (pathcost to s[(n-m):n], n-m, translation[s[(n-m):n]], True/False)
35
+ lattice = [(0, 0, "", True)]
36
+ for n in range(1, N + 1):
37
+ # Initialize on the assumption that s[n-1] is untranslatable
38
+ lattice.append((oovcost + lattice[n - 1][0], n - 1, string[(n - 1) : n], False))
39
+ # Search for translatable sequences s[(n-m):n], and keep the best
40
+ for m in range(1, min(n + 1, maxsym + 1)):
41
+ if (
42
+ string[(n - m) : n] in symbols
43
+ and symcost + lattice[n - m][0] < lattice[n][0]
44
+ ):
45
+ lattice[n] = (
46
+ symcost + lattice[n - m][0],
47
+ n - m,
48
+ string[(n - m) : n],
49
+ True,
50
+ )
51
+ # Back-trace
52
+ tl = []
53
+ translated = []
54
+ n = N
55
+ while n > 0:
56
+ tl.append(lattice[n][2])
57
+ translated.append(lattice[n][3])
58
+ n = lattice[n][1]
59
+ return (tl[::-1], translated[::-1])
60
+
61
+
62
+ #####################################################################
63
+ # Handle tones/stress markers
64
+ # fmt: off
65
+ TONE2IPA = {
66
+ 'arz' : { '0':'', '1':'ˈ', '2':'ˌ', '3': '', '4': '', '5': '', '6': '' },
67
+ 'eng' : { '0':'', '1':'ˈ', '2':'ˌ', '3': '', '4': '', '5': '', '6': '' },
68
+ 'yue' : { '0':'', '1':'˥', '2':'˧˥', '3':'˧', '4':'˨˩', '5':'˩˧', '6':'˨' },
69
+ 'lao' : { '0':'', '1':'˧', '2':'˥˧', '3':'˧˩', '4':'˥', '5':'˩˧', '6':'˩' },
70
+ 'cmn' : { '0':'', '1':'˥', '2':'˧˥', '3':'˨˩˦', '4':'˥˩', '5': '', '6': '' },
71
+ 'spa' : { '0':'', '1':'ˈ', '2':'ˌ', '3': '', '4': '', '5': '', '6': '' },
72
+ 'vie' : { '0':'', '1':'˧', '2':'˨˩h', '3':'˧˥', '4':'˨˩˨', '5':'˧ʔ˥', '6':'˧˨ʔ' },
73
+ }
74
+ IPA2TONE = {key: {v: k for k, v in val.items()} for key, val in TONE2IPA.items()}
75
+ # fmt: on
76
+
77
+
78
+ def update_dict_with_tones(code2ipa: dict, ipa2code: dict, lang):
79
+ code2ipa.update(TONE2IPA[lang])
80
+ ipa2code.update(IPA2TONE[lang])
81
+
82
+
83
+ #####################################################################
84
+ # X-SAMPA
85
+ # XSAMPA2IPA = {"_": "͡", "a": "a", "b": "b", "b_<": "ɓ", "c": "c", "d": "d", "d`": "ɖ", "d_<": "ɗ", "e": "e", "f": "f", "g": "ɡ", "g_<": "ɠ", "h": "h", "h\\": "ɦ", "i": "i", "j": "j", "j\\": "ʝ", "k": "k", "l": "l", "l`": "ɭ", "l\\": "ɺ", "m": "m", "n": "n", "n`": "ɳ", "o": "o", "p": "p", "p\\": "ɸ", "q": "q", "r": "r", "r`": "ɽ", "r\\": "ɹ", "r\\`": "ɻ", "s": "s", "s`": "ʂ", "s\\": "ɕ", "t": "t", "t`": "ʈ", "u": "u", "v": "v", "v\\": "ʋ", "P": "ʋ", "w": "w", "x": "x", "x\\": "ɧ", "y": "y", "z": "z", "z`": "ʐ", "z\\": "ʑ", "A": "ɑ", "B": "β", "B\\": "ʙ", "C": "ç", "D": "ð", "E": "ɛ", "F": "ɱ", "G": "ɣ", "G\\": "ɢ", "G\\_<": "ʛ", "H": "ɥ", "H\\": "ʜ", "I": "ɪ", "I\\": "ᵻ", "J": "ɲ", "J\\": "ɟ", "J\\_<": "ʄ", "K": "ɬ", "K\\": "ɮ", "L": "ʎ", "L\\": "ʟ", "M": "ɯ", "M\\": "ɰ", "N": "ŋ", "N\\": "ɴ", "O": "ɔ", "O\\": "ʘ", "Q": "ɒ", "R": "ʁ", "R\\": "ʀ", "S": "ʃ", "T": "θ", "U": "ʊ", "U\\": "ᵿ", "V": "ʌ", "W": "ʍ", "X": "χ", "X\\": "ħ", "Y": "ʏ", "Z": "ʒ", ".": ".", '"': "ˈ", "%": "ˌ", "'": "ʲ", "_j": "ʲ", ":": "ː", ":\\": "ˑ", "@": "ə", "@\\": "ɘ", "@`": "ɚ", "{": "æ", "}": "ʉ", "1": "ɨ", "2": "ø", "3": "ɜ", "3\\": "ɞ", "4": "ɾ", "5": "ɫ", "6": "ɐ", "7": "ɤ", "8": "ɵ", "9": "œ", "&": "ɶ", "?": "ʔ", "?\\": "ʕ", "/": "/", "<": "⟨", "<\\": "ʢ", ">": "⟩", ">\\": "ʡ", "^": "ꜛ", "!": "ꜜ", "!\\": "ǃ", "|": "|", "|\\": "ǀ", "||": "‖", "|\\|\\": "ǁ", "=\\": "ǂ", "-\\": "‿", '_"': '̈', "_+": " ", "_-": " ", "_/": " ", "_0": " ", "=": " ", "_=": " ", "_>": "ʼ", "_?\\": "ˤ", "_^": " ", "_}": " ", "`": "˞", "~": " ", "_~": " ", "_A": " ", "_a": " ̺", "_B": " ̏", "_B_L": " ᷅", "_c": " ", "_d": " ̪", "_e": " ̴", "<F>": "↘", "_F": " ", "_\\": " ", "_G": "ˠ", "_H": " ", "_H_T": " ᷄", "_h": "ʰ", "_k": " ̰", "_L": " ̀", "_l": "ˡ", "_M": " ̄", "_m": " ", "_N": " ̼", "_n": "ⁿ", "_O": " ", "_o": " ", "_q": " ", "<R>": "↗", "_R": " ", "_R_F": " ᷈", "_r": " ", "_T": " ", "_t": " ", "_v": " ", "_w": "ʷ", "_X": " ", "_x": " "} # fmt: skip
86
+ XSAMPA2IPA = {"_": "͡", "a": "a", "b": "b", "b_<": "ɓ", "c": "c", "d": "d", "d`": "ɖ", "d_<": "ɗ", "e": "e", "f": "f", "g": "ɡ", "g_<": "ɠ", "h": "h", "h\\": "ɦ", "i": "i", "j": "j", "j\\": "ʝ", "k": "k", "l": "l", "l`": "ɭ", "l\\": "ɺ", "m": "m", "n": "n", "n`": "ɳ", "o": "o", "p": "p", "p\\": "ɸ", "q": "q", "r": "r", "r`": "ɽ", "r\\": "ɹ", "r\\`": "ɻ", "s": "s", "s`": "ʂ", "s\\": "ɕ", "t": "t", "t`": "ʈ", "u": "u", "v": "v", "v\\": "ʋ", "P": "ʋ", "w": "w", "x": "x", "x\\": "ɧ", "y": "y", "z": "z", "z`": "ʐ", "z\\": "ʑ", "A": "ɑ", "B": "β", "B\\": "ʙ", "C": "ç", "D": "ð", "E": "ɛ", "F": "ɱ", "G": "ɣ", "G\\": "ɢ", "G\\_<": "ʛ", "H": "ɥ", "H\\": "ʜ", "I": "ɪ", "I\\": "ᵻ", "J": "ɲ", "J\\": "ɟ", "J\\<": "ʄ", "K": "ɬ", "K\\": "ɮ", "L": "ʎ", "L\\": "ʟ", "M": "ɯ", "M\\": "ɰ", "N": "ŋ", "N\\": "ɴ", "O": "ɔ", "O\\": "ʘ", "Q": "ɒ", "R": "ʁ", "R\\": "ʀ", "S": "ʃ", "T": "θ", "U": "ʊ", "U\\": "ᵿ", "V": "ʌ", "W": "ʍ", "X": "χ", "X\\": "ħ", "Y": "ʏ", "Z": "ʒ", ".": ".", '"': "ˈ", "%": "ˌ", "'": "ʲ", "_j": "ʲ", ":": "ː", ":\\": "ˑ", "@": "ə", "@\\": "ɘ", "@`": "ɚ", "{": "æ", "}": "ʉ", "1": "ɨ", "2": "ø", "3": "ɜ", "3\\": "ɞ", "4": "ɾ", "5": "ɫ", "6": "ɐ", "7": "ɤ", "8": "ɵ", "9": "œ", "&": "ɶ", "?": "ʔ", "?\\": "ʕ", "/": "/", "<": "⟨", "<\\": "ʢ", ">": "⟩", ">\\": "ʡ", "^": "ꜛ", "!": "ꜜ", "!\\": "ǃ", "|": "|", "|\\": "ǀ", "||": "‖", "|\\|\\": "ǁ", "=\\": "ǂ", "-\\": "‿", '_"': '̈', "_+": " ", "_-": " ", "_/": " ", "_0": " ", "=": " ", "_=": " ", "_>": "ʼ", "_?\\": "ˤ", "_^": " ", "_}": " ", "`": "˞", "~": " ", "_~": " ", "_A": " ", "_a": " ̺", "_B": " ̏", "_B_L": " ᷅", "_c": " ", "_d": " ̪", "_e": " ̴", "<f>": "↘", "_F": " ", "_\\": " ", "_G": "ˠ", "_H": " ", "_H_T": " ᷄", "_h": "ʰ", "_k": " ̰", "_L": " ̀", "_l": "ˡ", "_M": " ̄", "_m": " ", "_N": " ̼", "_n": "ⁿ", "_O": " ", "_o": " ", "_q": " ", "<r>": "↗", "_R": " ", "_R_F": " ᷈", "_r": " ", "_T": " ", "_t": " ", "_v": " ", "_w": "ʷ", "_X": " ", "_x": " "} # fmt: skip
87
+ # Not supported yet:
88
+ # _<
89
+ # -
90
+ # *
91
+ # rhotization for consonants
92
+ IPA2XSAMPA = {v: k for k, v in XSAMPA2IPA.items()}
93
+
94
+
95
+ def ipa2xsampa(ipa_string, lang="eng"):
96
+ ipa_symbols = string2symbols(ipa_string, IPA2XSAMPA.keys())[0]
97
+ xsampa_symbols = [IPA2XSAMPA[x] for x in ipa_symbols]
98
+ return " ".join(xsampa_symbols)
99
+
100
+
101
+ def xsampa2ipa(xsampa_string, lang="eng"):
102
+ if " " in xsampa_string:
103
+ xsampa_symbols = xsampa_string.split()
104
+ else:
105
+ xsampa_symbols = string2symbols(xsampa_string, XSAMPA2IPA.keys())[0]
106
+ return "".join([XSAMPA2IPA[x] for x in xsampa_symbols])
107
+
108
+
109
+ #####################################################################
110
+ # DISC, the system used by CELEX
111
+ def ipa2disc(ipa_string, lang="eng"):
112
+ raise NotImplementedError
113
+
114
+
115
+ def disc2ipa(disc_string, lang="eng"):
116
+ raise NotImplementedError
117
+
118
+
119
+ #####################################################################
120
+ # Kirshenbaum
121
+ def ipa2kirshenbaum(ipa_string, lang="eng"):
122
+ raise NotImplementedError
123
+
124
+
125
+ def kirshenbaum2ipa(kirshenbaum_string, lang="eng"):
126
+ raise NotImplementedError
127
+
128
+
129
+ #######################################################################
130
+ # Callhome phone codes
131
+ def ipa2callhome(ipa_string, lang="eng"):
132
+ raise NotImplementedError
133
+
134
+
135
+ def callhome2ipa(callhome_string, lang="eng"):
136
+ raise NotImplementedError
137
+
138
+
139
+ #########################################################################
140
+ # Buckeye
141
+ BUCKEYE2IPA = {'aa':'ɑ', 'ae':'æ', 'ay':'aɪ', 'aw':'aʊ', 'ao':'ɔ', 'oy':'ɔɪ', 'ow':'oʊ', 'eh':'ɛ', 'ey':'eɪ', 'er':'ɝ', 'ah':'ʌ', 'uw':'u', 'uh':'ʊ', 'ih':'ɪ', 'iy':'i', 'm':'m', 'n':'n', 'en':'n̩', 'ng':'ŋ', 'l':'l', 'el':'l̩', 't':'t', 'd':'d', 'ch':'tʃ', 'jh':'dʒ', 'th':'θ', 'dh':'ð', 'sh':'ʃ', 'zh':'ʒ', 's':'s', 'z':'z', 'k':'k', 'g':'ɡ', 'p':'p', 'b':'b', 'f':'f', 'v':'v', 'w':'w', 'hh':'h', 'y':'j', 'r':'ɹ', 'dx':'ɾ', 'nx':'ɾ̃', 'tq':'ʔ', 'er':'ɚ', 'em':'m̩', 'ihn': 'ĩ', 'ehn': 'ɛ̃', 'own': 'oʊ̃', 'ayn': 'aɪ̃', 'aen': 'æ̃', 'aan': 'ɑ̃', 'ahn': 'ə̃', 'eng': 'ŋ̍', 'iyn': 'ĩ', 'uhn': 'ʊ̃'} # fmt: skip
142
+ IPA2BUCKEYE = {v: k for k, v in BUCKEYE2IPA.items()}
143
+ # 'Vn':'◌̃'
144
+
145
+
146
+ def ipa2buckeye(ipa_string, lang="eng"):
147
+ update_dict_with_tones(BUCKEYE2IPA, IPA2BUCKEYE, lang)
148
+ ipa_symbols = string2symbols(ipa_string, IPA2BUCKEYE.keys())[0]
149
+ buckeye_symbols = [IPA2BUCKEYE[x] for x in ipa_symbols]
150
+ return " ".join(buckeye_symbols)
151
+
152
+
153
+ def buckeye2ipa(buckeye_string, lang="eng"):
154
+ update_dict_with_tones(BUCKEYE2IPA, IPA2BUCKEYE, lang)
155
+ if " " in buckeye_string:
156
+ buckeye_symbols = buckeye_string.split()
157
+ else:
158
+ buckeye_symbols = string2symbols(buckeye_string, BUCKEYE2IPA.keys())[0]
159
+ return "".join([BUCKEYE2IPA[x] for x in buckeye_symbols])
160
+
161
+
162
+ #########################################################################
163
+ # ARPABET
164
+ ARPABET2IPA = {'AA':'ɑ','AE':'æ','AH':'ʌ','AO':'ɔ','IX':'ɨ','AW':'aʊ','AX':'ə','AXR':'ɚ','AY':'aɪ','EH':'ɛ','ER':'ɝ','EY':'eɪ','IH':'ɪ','IY':'i','OW':'oʊ','OY':'ɔɪ','UH':'ʊ','UW':'u','UX':'ʉ','B':'b','CH':'tʃ','D':'d','DH':'ð','EL':'l̩','EM':'m̩','EN':'n̩','F':'f','G':'ɡ','HH':'h','H':'h','JH':'dʒ','K':'k','L':'l','M':'m','N':'n','NG':'ŋ','NX':'ɾ̃','P':'p','Q':'ʔ','R':'ɹ','S':'s','SH':'ʃ','T':'t','TH':'θ','V':'v','W':'w','WH':'ʍ','Y':'j','Z':'z','ZH':'ʒ','DX':'ɾ'} # fmt: skip
165
+ IPA2ARPABET = {v: k for k, v in ARPABET2IPA.items()}
166
+
167
+
168
+ def ipa2arpabet(ipa_string, lang="eng"):
169
+ update_dict_with_tones(ARPABET2IPA, IPA2ARPABET, lang)
170
+ ipa_symbols = string2symbols(ipa_string, IPA2ARPABET.keys())[0]
171
+ arpabet_symbols = [IPA2ARPABET[x] for x in ipa_symbols]
172
+ return " ".join(arpabet_symbols)
173
+
174
+
175
+ def arpabet2ipa(arpabet_string, lang="eng"):
176
+ update_dict_with_tones(ARPABET2IPA, IPA2ARPABET, lang)
177
+ if " " in arpabet_string:
178
+ arpabet_symbols = arpabet_string.split()
179
+ else:
180
+ arpabet_symbols = string2symbols(arpabet_string, ARPABET2IPA.keys())[0]
181
+ return "".join([ARPABET2IPA[x] for x in arpabet_symbols])
182
+
183
+
184
+ #########################################################################
185
+ # EpaDB
186
+ # We simplify 'A' to 'a' instead of 'ä'
187
+ EPADB2IPA = dict(ARPABET2IPA, **{"PH": "pʰ", "TH": "θʰ", "SH": "sʰ", "KH": "kʰ", "DH": "ð", 'BH': 'β', 'GH': 'ɣ', 'RR': 'r', 'DX': 'ɾ', 'X': 'x', 'A': 'a', 'E': 'e', 'O': 'o', 'U': ARPABET2IPA['UW'], 'I': ARPABET2IPA['IY'], 'LL': 'ʟ'}) # fmt: skip
188
+ IPA2EPADB = {v: k for k, v in EPADB2IPA.items()}
189
+
190
+
191
+ def ipa2epadb(ipa_string, lang="eng"):
192
+ update_dict_with_tones(EPADB2IPA, IPA2EPADB, lang)
193
+ ipa_symbols = string2symbols(ipa_string, IPA2EPADB.keys())[0]
194
+ epadb_symbols = [IPA2EPADB[x] for x in ipa_symbols]
195
+ return " ".join(epadb_symbols)
196
+
197
+
198
+ def epadb2ipa(epadb_string, lang="eng"):
199
+ update_dict_with_tones(EPADB2IPA, IPA2EPADB, lang)
200
+ if " " in epadb_string:
201
+ epadb_symbols = epadb_string.split()
202
+ else:
203
+ epadb_symbols = string2symbols(epadb_string, EPADB2IPA.keys())[0]
204
+ return "".join([EPADB2IPA[x] for x in epadb_symbols])
205
+
206
+
207
+ #########################################################################
208
+ # TIMIT
209
+ CLOSURE_INTERVALS = {
210
+ "BCL": ["B"],
211
+ "DCL": ["D", "JH"],
212
+ "GCL": ["G"],
213
+ "PCL": ["P"],
214
+ "TCL": ["T", "CH"],
215
+ "KCL": ["K"],
216
+ }
217
+ TIMIT2IPA = {'AA': 'ɑ', 'AE': 'æ', 'AH': 'ʌ', 'AO': 'ɔ', 'AW': 'aʊ', 'AX': 'ə', 'AXR': 'ɚ', 'AX-H': 'ə̥', 'AY': 'aɪ', 'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'IH': 'ɪ', 'IY': 'i', 'OW': 'oʊ', 'OY': 'ɔɪ', 'UH': 'ʊ', 'UW': 'u', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð', 'EL': 'l̩', 'EM': 'm̩', 'EN': 'n̩', 'F': 'f', 'G': 'ɡ', 'HH': 'h', 'JH': 'dʒ', 'K': 'k', 'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'P': 'p', 'Q': 'ʔ', 'R': 'ɹ', 'S': 's', 'SH': 'ʃ', 'T': 't', 'TH': 'θ', 'V': 'v', 'W': 'w', 'WH': 'ʍ', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ', 'ENG': 'ŋ̍', 'EPI': '', 'HV': 'ɦ', 'H#': '', 'IX': 'ɨ', 'NX': 'ɾ̃', 'PAU': '', 'UX': 'ʉ'} # fmt: skip
218
+ IPA2TIMIT = {v: k for k, v in TIMIT2IPA.items()}
219
+ INVERSE_CLOSURE_INTERVALS = {v: k for k, val in CLOSURE_INTERVALS.items() for v in val}
220
+
221
+
222
+ def parse_timit(lines):
223
+ # parses the format of a TIMIT .PHN file, handling edge cases where the closure interval and stops are not always paired
224
+ timestamped_phonemes = []
225
+ closure_interval_start = None
226
+ for line in lines:
227
+ if line == "":
228
+ continue
229
+ start, end, phoneme = line.split()
230
+ phoneme = phoneme.upper()
231
+
232
+ if closure_interval_start:
233
+ cl_start, cl_end, cl_phoneme = closure_interval_start
234
+ if phoneme not in CLOSURE_INTERVALS[cl_phoneme]:
235
+ ipa_phoneme = TIMIT2IPA[CLOSURE_INTERVALS[cl_phoneme][0]]
236
+ timestamped_phonemes.append((ipa_phoneme, int(cl_start), int(cl_end)))
237
+ else:
238
+ assert phoneme not in CLOSURE_INTERVALS
239
+ start = cl_start
240
+
241
+ if phoneme in CLOSURE_INTERVALS:
242
+ closure_interval_start = (start, end, phoneme)
243
+ continue
244
+
245
+ ipa_phoneme = TIMIT2IPA[phoneme]
246
+ timestamped_phonemes.append((ipa_phoneme, int(start), int(end)))
247
+
248
+ closure_interval_start = None
249
+
250
+ if closure_interval_start:
251
+ cl_start, cl_end, cl_phoneme = closure_interval_start
252
+ ipa_phoneme = TIMIT2IPA[CLOSURE_INTERVALS[cl_phoneme][0]]
253
+ timestamped_phonemes.append((ipa_phoneme, int(cl_start), int(cl_end)))
254
+
255
+ return timestamped_phonemes
256
+
257
+
258
+ def ipa2timit(ipa_string, lang="eng"):
259
+ update_dict_with_tones(TIMIT2IPA, IPA2TIMIT, lang)
260
+ ipa_symbols = string2symbols(ipa_string, IPA2TIMIT.keys())[0]
261
+ timit_symbols = [IPA2TIMIT[x] for x in ipa_symbols]
262
+ # insert closure intervals before each stop
263
+ timit_symbols_with_closures = []
264
+ for timit_symbol in timit_symbols:
265
+ if timit_symbol in INVERSE_CLOSURE_INTERVALS:
266
+ timit_symbols_with_closures.append(INVERSE_CLOSURE_INTERVALS[timit_symbol])
267
+ timit_symbols_with_closures.append(timit_symbol)
268
+ return " ".join(timit_symbols_with_closures)
269
+
270
+
271
+ def timit2ipa(timit_string, lang="eng"):
272
+ update_dict_with_tones(TIMIT2IPA, IPA2TIMIT, lang)
273
+ if " " in timit_string:
274
+ timit_symbols = timit_string.split()
275
+ else:
276
+ timit_symbols = string2symbols(
277
+ timit_string, TIMIT2IPA.keys() | CLOSURE_INTERVALS.keys()
278
+ )[0]
279
+ timestamped_phonemes = parse_timit((f"0 0 {x}" for x in timit_symbols))
280
+ return "".join([x[0] for x in timestamped_phonemes])
281
+
282
+
283
+ #########################################################################
284
+ # Isle (adaptation of deprecated Entropic GrapHvite UK Phone Set), see http://www.lrec-conf.org/proceedings/lrec2000/pdf/313.pdf
285
+ #
286
+ # Closely matches ARPABet but:
287
+ # - Some simplifications:
288
+ # - Only use HH to denote h, not also H
289
+ # - Drop IX (ɨ), UX (ʉ), EL (l̩), EM (m̩), EN (n̩), NX (ɾ̃), Q (ʔ), WH (ʍ), DX (ɾ)
290
+ # - Some adaptations to UK dialect:
291
+ # - Distinguish ɑ vs ɒ by adding OH for ɒ and restricting AA to ɑ
292
+ # - Map OW to əʊ instead of oʊ
293
+ # - ER maps to ɜ instead of ɝ because British English is non-rhotic (the r sound is dropped at the end of syllables)
294
+ # - Some adaptations to Italian/German dialects:
295
+ # - ER (ɜ) explicitly followed by R (ɹ) maps to ɝ because most Italian/German dialects are rhotic
296
+ # - We also keep AXR from ARPABet even though it is not in the UK Phone set, so we now map AX (ə) explicitly followed by R (ɹ) to AXR (ɚ) for the same reason
297
+ #
298
+ # symbol : example - UK G2P / US G2P | UK / US / ARPABet | comments
299
+ # Aa : balm - bɑːm / bɑm | ɑ / ɑ / ɑ |
300
+ # Aa : barn - bɑːn / bɑrn | ɑ / ɑ / ɑ |
301
+ # Ae : bat - bæt / bæt | æ / æ / æ |
302
+ # Ah : bat - bæt / bæt | æ / æ / ʌ |
303
+ # Ao : bought - bɔːt / bɑt | ɔ / ɑ / ɔ |
304
+ # Aw : bout - baʊt / baʊt | aʊ / aʊ / aʊ |
305
+ # Ax : about - əˈbaʊt / əˈbaʊt | ə / ə / ə |
306
+ # Ay : bite - baɪt / baɪt | aɪ / aɪ / aɪ |
307
+ # Eh : bet - bɛt / bɛt | ɛ / ɛ / ɛ |
308
+ # Er : bird - bɜːd / bɜrd | ɜ / ɝ / ɝ | different, ER represents non-r-colored ɜ in UK English because it is non-rhotic unlike American English which is what ARPABet is based on
309
+ # Ey : bait - beɪt / beɪt | eɪ / eɪ / eɪ |
310
+ # Ih : bit - bɪt / bɪt | ɪ / ɪ / ɪ |
311
+ # Iy : beet - biːt / bit | i / i / i |
312
+ # Ow : boat - bəʊt / boʊt | əʊ / oʊ / oʊ | different, map OW to əʊ
313
+ # Oy : boy - bɔɪ / bɔɪ | ɔɪ / ɔɪ / ɔɪ |
314
+ # Oh : box - bɒks / bɑks | ɒ / ɑ / - | added OH to disambiguate ɒ
315
+ # Uh : book - bʊk / bʊk | ʊ / ʊ / ʊ |
316
+ # Uw : boot - buːt / but | u / u / u |
317
+ # B : bet - bɛt / bɛt | b / b / b |
318
+ # Ch : cheap - ʧiːp / ʧip | ʧ / ʧ / tʃ |
319
+ # D : debt - dɛt / dɛt | d / d / d |
320
+ # Dh : that - ðæt / ðæt | ð / ð / ð |
321
+ # F : fan - fæn / fæn | f / f / f |
322
+ # G : get - ɡɛt / ɡɛt | ɡ / ɡ / ɡ |
323
+ # Hh : hat - hæt / hæt | h / h / h | match, but drop alternative H
324
+ # Jh : jeep - ʤiːp / ʤip | ʤ / ʤ / dʒ |
325
+ # K : cat - kæt / kæt | k / k / k |
326
+ # L : led - lɛd / lɛd | l / l / l |
327
+ # M : met - mɛt / mɛt | m / m / m |
328
+ # N : net - nɛt / nɛt | n / n / n |
329
+ # Ng : thing - θɪŋ / θɪŋ | ŋ / ŋ / ŋ |
330
+ # P : pet - pɛt / pɛt | p / p / p |
331
+ # R : red - rɛd / ˈɹɛd | r / ɹ / ɹ | different, but due to broad vs narrow and other annotation conventions; the sounds are actually different too but not sure how to model this
332
+ # S : sue - sjuː / su | s / s / s |
333
+ # Sh : shoe - ʃuː / ʃu | ʃ / ʃ / ʃ |
334
+ # T : tat - tæt / tæt | t / t / t |
335
+ # Th : thin - θɪn / θɪn | θ / θ / θ |
336
+ # V : van - væn / væn | v / v / v |
337
+ # W : wed - wɛd / wɛd | w / w / w |
338
+ # Y : yet - jɛt / jɛt | j / j / j |
339
+ # Z : zoo - zuː / zu | z / z / z |
340
+ # Zh : measure - ˈmɛʒə / ˈmɛʒər | ʒ / ʒ / ʒ |
341
+
342
+ ISLE2IPA = {'AA':'ɑ','AE':'æ','AH':'ʌ','AO':'ɔ','AW':'aʊ','AX':'ə','AXR':'ɚ','AY':'aɪ','EH':'ɛ','ER':'ɜ','ERR':'ɝ','EY':'eɪ','IH':'ɪ','IY':'i','OW':'əʊ','OY':'ɔɪ','OH':'ɒ','UH':'ʊ','UW':'u','B':'b','CH':'tʃ','D':'d','DH':'ð','F':'f','G':'ɡ','HH':'h','JH':'dʒ','K':'k','L':'l','M':'m','N':'n','NG':'ŋ','P':'p','R':'ɹ','S':'s','SH':'ʃ','T':'t','TH':'θ','V':'v','W':'w','Y':'j','Z':'z','ZH':'ʒ'} # fmt: skip
343
+ IPA2ISLE = {v: k for k, v in ISLE2IPA.items()}
344
+
345
+
346
+ def ipa2isle(ipa_string, lang="eng"):
347
+ update_dict_with_tones(ISLE2IPA, IPA2ISLE, lang)
348
+ ipa_symbols = string2symbols(ipa_string, IPA2ISLE.keys())[0]
349
+ isle_symbols = [IPA2ISLE[x] for x in ipa_symbols]
350
+ return " ".join(isle_symbols)
351
+
352
+
353
+ def isle2ipa(isle_string, lang="eng"):
354
+ update_dict_with_tones(ISLE2IPA, IPA2ISLE, lang)
355
+ if " " in isle_string:
356
+ isle_symbols = isle_string.split()
357
+ else:
358
+ isle_symbols = string2symbols(isle_string, ISLE2IPA.keys())[0]
359
+ return "".join([ISLE2IPA[x] for x in isle_symbols])
360
+
361
+
362
+ #########################################################################
363
+ # CLI
364
+ def usage():
365
+ print("Usage: python ./scripts/core/codes.py <src> <tgt> <phoneme_string>")
366
+ print("Supported codes:", CODES)
367
+
368
+
369
+ ALL_ANNOTATED_IPA_SYMBOLS = set()
370
+ for code in CODES:
371
+ if code == "ipa":
372
+ continue
373
+ ALL_ANNOTATED_IPA_SYMBOLS |= set(globals()[f"{code.upper()}2IPA"].values())
374
+ ALL_ANNOTATED_IPA_SYMBOLS.discard("")
375
+ ALL_ANNOTATED_IPA_SYMBOLS.discard(" ")
376
+ ALL_ANNOTATED_IPA_SYMBOLS.discard("ʰ")
377
+ ALL_ANNOTATED_IPA_SYMBOLS |= set(f"{s}ʰ" for s in ALL_ANNOTATED_IPA_SYMBOLS)
378
+
379
+
380
+ def main(args):
381
+ if len(args) != 3:
382
+ usage()
383
+ return
384
+
385
+ src, tgt, phoneme_string = args
386
+ print(phoneme_string, "=>", convert(phoneme_string, src, tgt))
387
+
388
+
389
+ if __name__ == "__main__":
390
+ try:
391
+ main(sys.argv[1:])
392
+ except Exception as e:
393
+ print(f"Line {e.__traceback__.tb_lineno}:", e) # type: ignore
394
+ usage()
app/hf.py CHANGED
@@ -26,6 +26,7 @@ LEADERBOARD_FEATURES = Features(
26
  "fer_PSST": Value("float32"),
27
  "fer_SpeechOcean": Value("float32"),
28
  "fer_ISLE": Value("float32"),
 
29
  }
30
  )
31
  LEADERBOARD_DEFAULTS = {
@@ -35,17 +36,26 @@ LEADERBOARD_DEFAULTS = {
35
  "fer_PSST": None,
36
  "fer_SpeechOcean": None,
37
  "fer_ISLE": None,
 
38
  }
39
 
40
 
 
 
 
 
 
 
 
 
41
  def get_repo_info(
42
  repo_id, type: Literal["model", "dataset", "space"] = "model"
43
- ) -> tuple[str, datetime]:
44
  try:
45
- repo_info = api.repo_info(repo_id=repo_id, repo_type=type)
46
- return repo_info.sha, repo_info.last_modified # type: ignore
47
  except RepositoryNotFoundError:
48
- return "", datetime(year=1970, month=1, day=1)
49
 
50
 
51
  def get_or_create_leaderboard() -> Dataset:
@@ -81,6 +91,7 @@ def add_leaderboard_entry(
81
  average_per: float,
82
  average_fer: float,
83
  url: str,
 
84
  per_dataset_fers: dict = {},
85
  ):
86
  existing_dataset = get_or_create_leaderboard()
@@ -99,6 +110,7 @@ def add_leaderboard_entry(
99
  fer_PSST=[per_dataset_fers.get("PSST")],
100
  fer_SpeechOcean=[per_dataset_fers.get("SpeechOcean")],
101
  fer_ISLE=[per_dataset_fers.get("ISLE")],
 
102
  ),
103
  features=LEADERBOARD_FEATURES,
104
  )
 
26
  "fer_PSST": Value("float32"),
27
  "fer_SpeechOcean": Value("float32"),
28
  "fer_ISLE": Value("float32"),
29
+ "model_bytes": Value("int64"),
30
  }
31
  )
32
  LEADERBOARD_DEFAULTS = {
 
36
  "fer_PSST": None,
37
  "fer_SpeechOcean": None,
38
  "fer_ISLE": None,
39
+ "model_bytes": None,
40
  }
41
 
42
 
43
+ def get_size(repo_info):
44
+ total_size_bytes = 0
45
+ for sibling in repo_info.siblings:
46
+ size_in_bytes = sibling.size or 0
47
+ total_size_bytes += size_in_bytes
48
+ return total_size_bytes
49
+
50
+
51
  def get_repo_info(
52
  repo_id, type: Literal["model", "dataset", "space"] = "model"
53
+ ) -> tuple[str, datetime, int | None]:
54
  try:
55
+ repo_info = api.repo_info(repo_id=repo_id, repo_type=type, files_metadata=True)
56
+ return repo_info.sha, repo_info.last_modified, get_size(repo_info) or None # type: ignore
57
  except RepositoryNotFoundError:
58
+ return "", datetime(year=1970, month=1, day=1), None
59
 
60
 
61
  def get_or_create_leaderboard() -> Dataset:
 
91
  average_per: float,
92
  average_fer: float,
93
  url: str,
94
+ model_bytes: int | None,
95
  per_dataset_fers: dict = {},
96
  ):
97
  existing_dataset = get_or_create_leaderboard()
 
110
  fer_PSST=[per_dataset_fers.get("PSST")],
111
  fer_SpeechOcean=[per_dataset_fers.get("SpeechOcean")],
112
  fer_ISLE=[per_dataset_fers.get("ISLE")],
113
+ model_bytes=[model_bytes],
114
  ),
115
  features=LEADERBOARD_FEATURES,
116
  )
app/inference.py CHANGED
@@ -2,6 +2,9 @@
2
 
3
  import torch
4
  from transformers import AutoProcessor, AutoModelForCTC
 
 
 
5
 
6
  DEVICE = (
7
  "cuda"
@@ -27,13 +30,37 @@ def clear_cache():
27
  torch.mps.empty_cache()
28
 
29
 
30
- def load_model(model_id, device=DEVICE):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  processor = AutoProcessor.from_pretrained(model_id)
32
  model = AutoModelForCTC.from_pretrained(model_id).to(device)
33
  return model, processor
34
 
35
 
36
- def transcribe(audio, model, processor) -> str:
 
37
  input_values = (
38
  processor(
39
  [audio],
@@ -49,3 +76,24 @@ def transcribe(audio, model, processor) -> str:
49
 
50
  predicted_ids = torch.argmax(logits, dim=-1)
51
  return processor.decode(predicted_ids[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import torch
4
  from transformers import AutoProcessor, AutoModelForCTC
5
+ from espnet2.bin.s2t_inference import Speech2Text
6
+
7
+ MODEL_TYPES = ["Transformers CTC", "POWSM"]
8
 
9
  DEVICE = (
10
  "cuda"
 
30
  torch.mps.empty_cache()
31
 
32
 
33
+ # ================================== POWSM ==================================
34
+ def load_powsm(model_id, language="<eng>", device=DEVICE):
35
+ s2t = Speech2Text.from_pretrained(
36
+ model_id,
37
+ device=device.replace("mps", "cpu"),
38
+ lang_sym=language,
39
+ task_sym="<pr>",
40
+ )
41
+ if device == "mps":
42
+ s2t.s2t_model.to(device=device, dtype=torch.float32)
43
+ s2t.beam_search.to(device=device, dtype=torch.float32)
44
+ s2t.dtype = "float32"
45
+ s2t.device = device
46
+ return s2t
47
+
48
+
49
+ def transcribe_powsm(audio, model):
50
+ pred = model(audio, text_prev="<na>")[0][0]
51
+ return pred.split("<notimestamps>")[1].strip().replace("/", "")
52
+
53
+
54
+ # ===========================================================================
55
+ # ============================= Transformers CTC ============================
56
+ def load_transformers_ctc(model_id, device=DEVICE):
57
  processor = AutoProcessor.from_pretrained(model_id)
58
  model = AutoModelForCTC.from_pretrained(model_id).to(device)
59
  return model, processor
60
 
61
 
62
+ def transcribe_transformers_ctc(audio, model) -> str:
63
+ model, processor = model
64
  input_values = (
65
  processor(
66
  [audio],
 
76
 
77
  predicted_ids = torch.argmax(logits, dim=-1)
78
  return processor.decode(predicted_ids[0])
79
+
80
+
81
+ # ===========================================================================
82
+
83
+
84
+ def load_model(model_id, type, device=DEVICE):
85
+ if type == "POWSM":
86
+ return load_powsm(model_id, device=device)
87
+ elif type == "Transformers CTC":
88
+ return load_transformers_ctc(model_id, device=device)
89
+ else:
90
+ raise ValueError("Unsupported model type: " + str(type))
91
+
92
+
93
+ def transcribe(audio, type, model) -> str:
94
+ if type == "POWSM":
95
+ return transcribe_powsm(audio, model)
96
+ elif type == "Transformers CTC":
97
+ return transcribe_transformers_ctc(audio, model)
98
+ else:
99
+ raise ValueError("Unsupported model type: " + str(type))
app/tasks.py CHANGED
@@ -10,6 +10,7 @@ from metrics import per, fer
10
  from datasets import load_from_disk
11
  from hf import get_repo_info, add_leaderboard_entry
12
  from inference import clear_cache, load_model, transcribe
 
13
 
14
  leaderboard_lock = multiprocessing.Lock()
15
 
@@ -21,6 +22,9 @@ class Task(TypedDict):
21
  repo_hash: str
22
  repo_last_modified: datetime
23
  submission_timestamp: datetime
 
 
 
24
  url: str
25
  error: str | None
26
 
@@ -42,10 +46,12 @@ def get_status(query: str) -> dict:
42
  return {"error": f"No results found for '{query}'"}
43
 
44
 
45
- def start_eval_task(display_name: str, repo_id: str, url: str) -> str:
 
 
46
  """Start evaluation task in background. Returns task ID that can be used to check status."""
47
 
48
- repo_hash, last_modified = get_repo_info(repo_id)
49
  # TODO: check if hash is different from the most recent submission if any for repo_id, otherwise don't recompute
50
  task = Task(
51
  status="submitted",
@@ -54,6 +60,9 @@ def start_eval_task(display_name: str, repo_id: str, url: str) -> str:
54
  repo_hash=repo_hash,
55
  repo_last_modified=last_modified,
56
  submission_timestamp=datetime.now(),
 
 
 
57
  url=url,
58
  error=None,
59
  )
@@ -83,9 +92,11 @@ def _eval_task(task: Task, leaderboard_lock):
83
  per_dataset_fers = {}
84
 
85
  clear_cache()
86
- model, processor = load_model(task["repo_id"])
87
  for row in test_ds:
88
- transcript = transcribe(row["audio"]["array"], model, processor) # type: ignore
 
 
89
  row_per = per(transcript, row["ipa"]) # type: ignore
90
  row_fer = fer(transcript, row["ipa"]) # type: ignore
91
  average_per += row_per
@@ -107,6 +118,7 @@ def _eval_task(task: Task, leaderboard_lock):
107
  average_per=average_per,
108
  average_fer=average_fer,
109
  url=task["url"],
 
110
  per_dataset_fers=per_dataset_fers,
111
  )
112
 
 
10
  from datasets import load_from_disk
11
  from hf import get_repo_info, add_leaderboard_entry
12
  from inference import clear_cache, load_model, transcribe
13
+ from codes import convert
14
 
15
  leaderboard_lock = multiprocessing.Lock()
16
 
 
22
  repo_hash: str
23
  repo_last_modified: datetime
24
  submission_timestamp: datetime
25
+ model_type: str
26
+ phone_code: str
27
+ model_bytes: int | None
28
  url: str
29
  error: str | None
30
 
 
46
  return {"error": f"No results found for '{query}'"}
47
 
48
 
49
+ def start_eval_task(
50
+ display_name: str, repo_id: str, url: str, model_type: str, phone_code: str
51
+ ) -> str:
52
  """Start evaluation task in background. Returns task ID that can be used to check status."""
53
 
54
+ repo_hash, last_modified, size_bytes = get_repo_info(repo_id)
55
  # TODO: check if hash is different from the most recent submission if any for repo_id, otherwise don't recompute
56
  task = Task(
57
  status="submitted",
 
60
  repo_hash=repo_hash,
61
  repo_last_modified=last_modified,
62
  submission_timestamp=datetime.now(),
63
+ model_type=model_type,
64
+ phone_code=phone_code,
65
+ model_bytes=size_bytes,
66
  url=url,
67
  error=None,
68
  )
 
92
  per_dataset_fers = {}
93
 
94
  clear_cache()
95
+ model = load_model(task["repo_id"], task["model_type"])
96
  for row in test_ds:
97
+ transcript = transcribe(row["audio"]["array"], task["model_type"], model) # type: ignore
98
+ if task["phone_code"] != "ipa":
99
+ transcript = convert(transcript, task["phone_code"], "ipa")
100
  row_per = per(transcript, row["ipa"]) # type: ignore
101
  row_fer = fer(transcript, row["ipa"]) # type: ignore
102
  average_per += row_per
 
118
  average_per=average_per,
119
  average_fer=average_fer,
120
  url=task["url"],
121
+ model_bytes=task["model_bytes"],
122
  per_dataset_fers=per_dataset_fers,
123
  )
124
 
requirements.txt CHANGED
@@ -3,14 +3,16 @@ huggingface_hub==0.34.4
3
  datasets==4.0.0
4
 
5
  # Data processing
6
- pandas==2.0.3
7
- numpy==1.25.2
8
  panphon==0.21.2
9
  torch==2.8.0
10
  torchaudio==2.8.0
11
  torchcodec==0.6.0
12
  transformers==4.56.0
13
  phonemizer==3.3.0
 
 
14
 
15
  # UI
16
  gradio==5.12.0
 
3
  datasets==4.0.0
4
 
5
  # Data processing
6
+ pandas==2.3.3
7
+ numpy==2.0.2
8
  panphon==0.21.2
9
  torch==2.8.0
10
  torchaudio==2.8.0
11
  torchcodec==0.6.0
12
  transformers==4.56.0
13
  phonemizer==3.3.0
14
+ espnet==202509
15
+ espnet-model-zoo==0.1.7
16
 
17
  # UI
18
  gradio==5.12.0
requirements_lock.txt CHANGED
@@ -3,65 +3,104 @@ aiohappyeyeballs==2.6.1
3
  aiohttp==3.12.15
4
  aiosignal==1.4.0
5
  annotated-types==0.7.0
 
6
  anyio==4.10.0
 
7
  async-timeout==5.0.1
8
  attrs==25.3.0
 
9
  babel==2.17.0
10
  certifi==2025.8.3
 
11
  charset-normalizer==3.4.3
 
12
  click==8.2.1
13
  colorama==0.4.6
 
14
  csvw==3.5.1
15
  datasets==4.0.0
 
16
  dill==0.3.8
 
17
  dlinfo==2.0.0
18
  editdistance==0.8.1
 
 
 
 
19
  exceptiongroup==1.3.0
 
20
  fastapi==0.116.1
21
  ffmpy==0.6.1
22
  filelock==3.19.1
23
  frozenlist==1.7.0
24
  fsspec==2025.3.0
 
25
  gradio==5.12.0
26
  gradio_client==1.5.4
27
  h11==0.16.0
 
28
  hf-xet==1.1.9
29
  httpcore==1.0.9
30
  httpx==0.28.1
31
  huggingface-hub==0.34.4
 
 
32
  idna==3.10
 
 
33
  isodate==0.7.2
 
 
34
  Jinja2==3.1.6
35
  joblib==1.5.2
36
  jsonschema==4.25.1
37
  jsonschema-specifications==2025.4.1
 
38
  language-tags==1.2.0
 
 
 
 
 
39
  markdown-it-py==4.0.0
40
  MarkupSafe==2.1.5
41
  mdurl==0.1.2
 
42
  mpmath==1.3.0
 
43
  multidict==6.6.4
44
  multiprocess==0.70.16
45
  munkres==1.1.4
46
  networkx==3.4.2
47
- numpy==1.25.2
 
 
 
 
48
  orjson==3.11.3
49
  packaging==25.0
50
- pandas==2.0.3
51
  panphon==0.21.2
52
  phonemizer==3.3.0
53
  pillow==11.3.0
 
 
54
  propcache==0.3.2
55
  protobuf==6.32.0
56
  pyarrow==21.0.0
 
57
  pydantic==2.11.7
58
  pydantic_core==2.33.2
59
  pydub==0.25.1
60
  Pygments==2.19.2
61
  pyparsing==3.2.3
 
62
  python-dateutil==2.9.0.post0
63
  python-multipart==0.0.20
 
64
  pytz==2025.2
 
65
  PyYAML==6.0.2
66
  rdflib==7.1.4
67
  referencing==0.36.2
@@ -73,28 +112,39 @@ rpds-py==0.27.1
73
  ruff==0.12.11
74
  safehttpx==0.1.6
75
  safetensors==0.6.2
 
 
76
  segments==2.3.0
77
  semantic-version==2.10.0
 
78
  shellingham==1.5.4
79
  six==1.17.0
80
  sniffio==1.3.1
 
 
81
  starlette==0.47.3
82
  sympy==1.14.0
 
83
  tokenizers==0.22.0
84
  tomlkit==0.13.3
85
  torch==2.8.0
 
86
  torchaudio==2.8.0
87
  torchcodec==0.6.0
 
88
  tqdm==4.67.1
89
  transformers==4.56.0
 
90
  typer==0.17.3
91
  typing-inspection==0.4.1
92
  typing_extensions==4.15.0
93
  tzdata==2025.2
94
  unicodecsv==0.14.1
 
95
  uritemplate==4.2.0
96
  urllib3==2.5.0
97
  uvicorn==0.35.0
98
  websockets==14.2
99
  xxhash==3.5.0
100
  yarl==1.20.1
 
 
3
  aiohttp==3.12.15
4
  aiosignal==1.4.0
5
  annotated-types==0.7.0
6
+ antlr4-python3-runtime==4.9.3
7
  anyio==4.10.0
8
+ asteroid-filterbanks==0.4.0
9
  async-timeout==5.0.1
10
  attrs==25.3.0
11
+ audioread==3.1.0
12
  babel==2.17.0
13
  certifi==2025.8.3
14
+ cffi==2.0.0
15
  charset-normalizer==3.4.3
16
+ ci-sdr==0.0.2
17
  click==8.2.1
18
  colorama==0.4.6
19
+ ConfigArgParse==1.7.1
20
  csvw==3.5.1
21
  datasets==4.0.0
22
+ decorator==5.2.1
23
  dill==0.3.8
24
+ Distance==0.1.3
25
  dlinfo==2.0.0
26
  editdistance==0.8.1
27
+ einops==0.8.1
28
+ espnet==202509
29
+ espnet-model-zoo==0.1.7
30
+ espnet-tts-frontend==0.0.3
31
  exceptiongroup==1.3.0
32
+ fast-bss-eval==0.1.3
33
  fastapi==0.116.1
34
  ffmpy==0.6.1
35
  filelock==3.19.1
36
  frozenlist==1.7.0
37
  fsspec==2025.3.0
38
+ g2p-en==2.1.0
39
  gradio==5.12.0
40
  gradio_client==1.5.4
41
  h11==0.16.0
42
+ h5py==3.15.1
43
  hf-xet==1.1.9
44
  httpcore==1.0.9
45
  httpx==0.28.1
46
  huggingface-hub==0.34.4
47
+ humanfriendly==10.0
48
+ hydra-core==1.3.2
49
  idna==3.10
50
+ importlib-metadata==4.13.0
51
+ inflect==7.5.0
52
  isodate==0.7.2
53
+ jaconv==0.4.0
54
+ jamo==0.4.1
55
  Jinja2==3.1.6
56
  joblib==1.5.2
57
  jsonschema==4.25.1
58
  jsonschema-specifications==2025.4.1
59
+ kaldiio==2.18.1
60
  language-tags==1.2.0
61
+ lazy_loader==0.4
62
+ librosa==0.11.0
63
+ lightning==2.5.5
64
+ lightning-utilities==0.15.2
65
+ llvmlite==0.45.1
66
  markdown-it-py==4.0.0
67
  MarkupSafe==2.1.5
68
  mdurl==0.1.2
69
+ more-itertools==10.8.0
70
  mpmath==1.3.0
71
+ msgpack==1.1.2
72
  multidict==6.6.4
73
  multiprocess==0.70.16
74
  munkres==1.1.4
75
  networkx==3.4.2
76
+ nltk==3.9.2
77
+ numba==0.62.1
78
+ numpy==2.0.2
79
+ omegaconf==2.3.0
80
+ opt_einsum==3.4.0
81
  orjson==3.11.3
82
  packaging==25.0
83
+ pandas==2.3.3
84
  panphon==0.21.2
85
  phonemizer==3.3.0
86
  pillow==11.3.0
87
+ platformdirs==4.5.0
88
+ pooch==1.8.2
89
  propcache==0.3.2
90
  protobuf==6.32.0
91
  pyarrow==21.0.0
92
+ pycparser==2.23
93
  pydantic==2.11.7
94
  pydantic_core==2.33.2
95
  pydub==0.25.1
96
  Pygments==2.19.2
97
  pyparsing==3.2.3
98
+ pypinyin==0.44.0
99
  python-dateutil==2.9.0.post0
100
  python-multipart==0.0.20
101
+ pytorch-lightning==2.5.5
102
  pytz==2025.2
103
+ pyworld==0.3.5
104
  PyYAML==6.0.2
105
  rdflib==7.1.4
106
  referencing==0.36.2
 
112
  ruff==0.12.11
113
  safehttpx==0.1.6
114
  safetensors==0.6.2
115
+ scikit-learn==1.7.2
116
+ scipy==1.15.3
117
  segments==2.3.0
118
  semantic-version==2.10.0
119
+ sentencepiece==0.2.0
120
  shellingham==1.5.4
121
  six==1.17.0
122
  sniffio==1.3.1
123
+ soundfile==0.13.1
124
+ soxr==1.0.0
125
  starlette==0.47.3
126
  sympy==1.14.0
127
+ threadpoolctl==3.6.0
128
  tokenizers==0.22.0
129
  tomlkit==0.13.3
130
  torch==2.8.0
131
+ torch-complex==0.4.4
132
  torchaudio==2.8.0
133
  torchcodec==0.6.0
134
+ torchmetrics==1.8.2
135
  tqdm==4.67.1
136
  transformers==4.56.0
137
+ typeguard==4.4.4
138
  typer==0.17.3
139
  typing-inspection==0.4.1
140
  typing_extensions==4.15.0
141
  tzdata==2025.2
142
  unicodecsv==0.14.1
143
+ Unidecode==1.4.0
144
  uritemplate==4.2.0
145
  urllib3==2.5.0
146
  uvicorn==0.35.0
147
  websockets==14.2
148
  xxhash==3.5.0
149
  yarl==1.20.1
150
+ zipp==3.23.0