# Source: Hugging Face Space "ginic/multipa-english-to-ipa" (status: Running; file size: 15,865 bytes)

# Imports
from pathlib import Path
import tempfile
import os
import gradio as gr
import librosa
import tgt.core
import tgt.io3
import soundfile as sf
import zipfile
from transformers import pipeline

# Constants
# Scratch directory, created once at import time, where generated TextGrid
# files are written before being offered for download.
TEXTGRID_DIR = tempfile.mkdtemp()
# Model that is loaded at startup and preselected in the model dropdown.
DEFAULT_MODEL = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"
# Label shown on the TextGrid download buttons.
TEXTGRID_DOWNLOAD_TEXT = "Download TextGrid file"
# Label for a TextGrid file-name text box.
TEXTGRID_NAME_INPUT_LABEL = "TextGrid file name"

# Selection of models
# Model identifiers offered as choices in the model dropdown; each one is
# passed as the `model` argument of `transformers.pipeline`.
VALID_MODELS = [
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000",
    "ginic/data_seed_bs64_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_30_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_4_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/gender_split_70_female_5_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_old_only_3_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_1_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_2_wav2vec2-large-xlsr-53-buckeye-ipa",
    "ginic/vary_individuals_young_only_3_wav2vec2-large-xlsr-53-buckeye-ipa",
]


def load_model_and_predict(
    model_name: str,
    audio_in: str,
    model_state: dict,
):
    """Transcribe one audio file, loading the requested model first if needed.

    Args:
        model_name: Identifier of the ASR model to use for the prediction.
        audio_in: Path to the audio file, or None when no audio is available.
        model_state: Dict with the keys "loaded_model" (a callable pipeline)
            and "model_name" (the identifier of that loaded pipeline).

    Returns:
        A ``(prediction, model_state)`` pair. ``prediction`` is the "text"
        field of the pipeline output, or an empty string when ``audio_in`` is
        None. ``model_state`` is the (possibly replaced) state dict.

    Raises:
        gr.Error: If loading the model or running the prediction fails.
    """
    try:
        if audio_in is None:
            # Return the same two-value shape as the success path below so the
            # number of returned values always matches the handler's outputs.
            return "", model_state

        # Only build a new pipeline when the requested model differs from the
        # one cached in the state.
        if model_state["model_name"] != model_name:
            model_state = {
                "loaded_model": pipeline(task="automatic-speech-recognition", model=model_name),
                "model_name": model_name,
            }

        prediction = model_state["loaded_model"](audio_in)["text"]
        return prediction, model_state
    except Exception as e:
        # Chain the original exception so the full traceback stays available.
        raise gr.Error(f"Failed to load model: {str(e)}") from e


def get_textgrid_contents(audio_in, textgrid_tier_name, transcription_prediction):
    """Build long-format TextGrid text holding one interval for the whole audio.

    The single interval runs from 0 to the audio duration and carries
    ``transcription_prediction`` as its text, inside a tier named
    ``textgrid_tier_name``. Returns an empty string when either the audio path
    or the prediction is None.
    """
    if any(arg is None for arg in (audio_in, transcription_prediction)):
        return ""

    total_seconds = librosa.get_duration(path=audio_in)

    # One tier spanning the full recording, holding one full-length interval.
    tier = tgt.core.IntervalTier(
        start_time=0, end_time=total_seconds, name=textgrid_tier_name
    )
    tier.add_annotation(
        tgt.core.Interval(0, total_seconds, transcription_prediction)
    )

    grid = tgt.core.TextGrid()
    grid.add_tier(tier)
    return tgt.io3.export_to_long_textgrid(grid)


def write_textgrid(textgrid_contents, textgrid_filename):
    """Writes the text grid contents to a named file in the temporary directory.
    Returns the path for download.

    Only the final path component of ``textgrid_filename`` is used, so the
    file is always created directly inside ``TEXTGRID_DIR``.
    """
    textgrid_path = Path(TEXTGRID_DIR) / Path(textgrid_filename).name
    # The contents are IPA transcriptions, i.e. non-ASCII text. Write UTF-8
    # explicitly instead of relying on the platform's locale encoding.
    textgrid_path.write_text(textgrid_contents, encoding="utf-8")
    return textgrid_path


def get_interactive_download_button(textgrid_contents, textgrid_filename):
    """Write the TextGrid to disk and return an enabled download button for it."""
    download_path = write_textgrid(textgrid_contents, textgrid_filename)
    button_options = {
        "label": TEXTGRID_DOWNLOAD_TEXT,
        "variant": "primary",
        "interactive": True,
        "value": download_path,
    }
    return gr.DownloadButton(**button_options)


def transcribe_intervals(audio_in, textgrid_path, source_tier, target_tier, model_state):
    """Transcribe every non-empty interval of a TextGrid tier into a new tier.

    Args:
        audio_in: Path to the audio file.
        textgrid_path: Uploaded TextGrid file object; its ``.name`` attribute
            is used as the path to read.
        source_tier: Name of the existing tier whose intervals are transcribed.
        target_tier: Name of the new tier that receives the predictions.
        model_state: Dict whose "loaded_model" entry is called with a wav file
            path and returns a mapping with a "text" key.

    Returns:
        The long-format TextGrid text including the new tier, or an
        explanatory message string when the audio or TextGrid is missing.
    """
    if audio_in is None or textgrid_path is None:
        return "Missing audio or TextGrid input file."

    tg = tgt.io.read_textgrid(textgrid_path.name)
    tier = tg.get_tier_by_name(source_tier)
    ipa_tier = tgt.core.IntervalTier(name=target_tier)

    for interval in tier.intervals:
        if not interval.text.strip():  # Skip empty text intervals
            continue

        start, end = interval.start_time, interval.end_time
        try:
            y, sr = librosa.load(audio_in, sr=None, offset=start, duration=end - start)
            # Create the clip file and close our handle right away: the file
            # is written and read by path, and an open handle would block the
            # removal below on platforms that lock open files.
            fd, clip_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                sf.write(clip_path, y, sr)
                prediction = model_state["loaded_model"](clip_path)["text"]
            finally:
                # Always remove the clip, including when writing or inference
                # fails, so failed intervals do not leak temporary files.
                os.remove(clip_path)
            ipa_tier.add_annotation(tgt.core.Interval(start, end, prediction))
        except Exception as e:
            # Best effort: record the failure in the tier and keep going.
            ipa_tier.add_annotation(tgt.core.Interval(start, end, f"[Error]: {str(e)}"))

    tg.add_tier(ipa_tier)
    tgt_str = tgt.io3.export_to_long_textgrid(tg)

    return tgt_str


def extract_tier_names(textgrid_file):
    """Return a dropdown update listing the tier names of the given TextGrid.

    The first tier name, if any, becomes the selected value. Any failure while
    reading the file (including a missing file object) yields an empty
    dropdown instead of an error.
    """
    try:
        grid = tgt.io.read_textgrid(textgrid_file.name)
        names = [layer.name for layer in grid.tiers]
        first_name = next(iter(names), None)
        return gr.update(choices=names, value=first_name)
    except Exception:
        return gr.update(choices=[], value=None)
    

def validate_textgrid_for_intervals(audio_path, textgrid_file):
    """Check that an uploaded TextGrid fits the uploaded audio.

    Args:
        audio_path: Path to the audio file, or a falsy value when missing.
        textgrid_file: Uploaded TextGrid file object (its ``.name`` attribute
            is read as a path), or a falsy value when missing.

    Returns:
        A ``gr.update`` that sets ``interactive`` to True when both files are
        present and compatible, and to False when either one is missing.

    Raises:
        gr.Error: If either file cannot be read, or if the TextGrid ends more
            than the tolerance after the end of the audio.
    """
    if not audio_path or not textgrid_file:
        return gr.update(interactive=False)

    # Only the file reading is wrapped, so the specific duration error raised
    # below reaches the user as-is instead of being re-wrapped as a generic
    # "Invalid TextGrid or audio file" message.
    try:
        audio_duration = librosa.get_duration(path=audio_path)
        tg = tgt.io.read_textgrid(textgrid_file.name)
        tg_end_time = max(tier.end_time for tier in tg.tiers)
    except Exception as e:
        raise gr.Error(f"Invalid TextGrid or audio file:\n{str(e)}") from e

    # Tolerance in seconds, applied in both directions so that tiny rounding
    # differences between the two durations do not reject matching files.
    epsilon = 0.01
    if tg_end_time - audio_duration > epsilon:
        raise gr.Error(
            f"TextGrid ends at {tg_end_time:.2f}s but audio is only {audio_duration:.2f}s. "
            "Please upload matching files."
        )

    if audio_duration - tg_end_time > epsilon:
        gr.Warning(
            f"TextGrid ends at {tg_end_time:.2f}s but audio is {audio_duration:.2f}s. "
            "Only the annotated portion will be transcribed."
        )

    return gr.update(interactive=True)


def transcribe_multiple_files(model_name, audio_files, model_state, tier_name):
    """Transcribe several audio files and bundle one TextGrid per file in a ZIP.

    Args:
        model_name: Identifier of the ASR model to use.
        audio_files: Iterable of audio file paths; may be empty or None.
        model_state: Dict with the keys "loaded_model" (a callable pipeline)
            and "model_name" (the identifier of that loaded pipeline).
        tier_name: Name of the single tier written into each TextGrid.

    Returns:
        A ``(table_data, zip_path, model_state)`` triple. ``table_data`` is a
        list of ``[filename, prediction]`` rows and ``zip_path`` is the path
        of the ZIP archive as a string. With no input files the result is
        ``([], None, model_state)``.

    Raises:
        gr.Error: If model loading, transcription, or file writing fails.
    """
    try:
        if not audio_files:
            return [], None, model_state

        # Only build a new pipeline when the requested model differs from the
        # one cached in the state.
        if model_state["model_name"] != model_name:
            model_state = {
                "loaded_model": pipeline(task="automatic-speech-recognition", model=model_name),
                "model_name": model_name,
            }

        table_data = []
        tg_paths = []

        for file in audio_files:
            prediction = model_state["loaded_model"](file)["text"]
            duration = librosa.get_duration(path=file)

            # One tier with a single interval covering the whole file.
            annotation = tgt.core.Interval(0, duration, prediction)
            transcription_tier = tgt.core.IntervalTier(0, duration, tier_name)
            transcription_tier.add_annotation(annotation)

            tg = tgt.core.TextGrid()
            tg.add_tier(transcription_tier)

            tg_str = tgt.io3.export_to_long_textgrid(tg)
            tg_filename = Path(file).with_suffix(".TextGrid").name
            tg_path = Path(TEXTGRID_DIR) / tg_filename
            # IPA output is non-ASCII; write UTF-8 explicitly rather than the
            # platform's locale encoding.
            tg_path.write_text(tg_str, encoding="utf-8")

            table_data.append([Path(file).name, prediction])
            tg_paths.append(tg_path)

        # ZIP generation
        zip_path = Path(tempfile.mkdtemp()) / "textgrids.zip"
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for written_path in tg_paths:
                zipf.write(written_path, arcname=written_path.name)

        return table_data, str(zip_path), model_state

    except Exception as e:
        # Chain the original exception so the full traceback stays available.
        raise gr.Error(f"Transcription failed: {str(e)}") from e


def launch_demo():
    """Build the Gradio interface, wire up its event handlers, and launch it.

    The default model is loaded once up front and stored in a ``gr.State`` so
    that handlers only reload a pipeline when a different model is selected.
    The interface offers three modes, chosen from a dropdown: transcription of
    one full audio file, of multiple full audio files, or of the intervals of
    an existing TextGrid tier.
    """
    # Defaults shared by the tier-name text boxes and their reset handlers.
    default_tier_name = "IPA"
    default_interval_tier_name = "IPATier"

    initial_model = {
        "loaded_model": pipeline(
            task="automatic-speech-recognition", model=DEFAULT_MODEL
        ),
        "model_name": DEFAULT_MODEL,
    }

    def full_download_update(tg_text, audio_path):
        """Enable the full-audio download button only when there is content."""
        if not tg_text:
            # Happens after a reset: do not offer an empty file for download.
            return gr.update(interactive=False)
        return get_interactive_download_button(
            tg_text,
            Path(audio_path).with_suffix(".TextGrid").name if audio_path else "output.TextGrid",
        )

    def interval_download_update(tg_text, audio_path):
        """Enable the interval download button only when there is content."""
        if not tg_text or not audio_path:
            # After a reset both values are empty and Path(None) would raise.
            return gr.update(interactive=False)
        return gr.update(
            value=write_textgrid(
                tg_text,
                Path(audio_path).with_suffix("").name + "_IPA.TextGrid",
            ),
            interactive=True,
        )

    def transcribe_intervals_with_selected_model(
        selected_model, audio_in, textgrid_file, source_tier, new_tier, state
    ):
        """Load the selected model if needed, then transcribe the intervals."""
        # Without this check the interval mode would always use whichever
        # model is currently cached, ignoring the dropdown selection.
        if state["model_name"] != selected_model:
            try:
                state = {
                    "loaded_model": pipeline(
                        task="automatic-speech-recognition", model=selected_model
                    ),
                    "model_name": selected_model,
                }
            except Exception as e:
                raise gr.Error(f"Failed to load model: {str(e)}") from e
        result = transcribe_intervals(audio_in, textgrid_file, source_tier, new_tier, state)
        return result, state

    with gr.Blocks() as demo:
        gr.Markdown("""# Automatic International Phonetic Alphabet Transcription
        This demo allows you to experiment with producing phonetic transcriptions of uploaded or recorded audio using a selected automatic speech recognition (ASR) model.""")

        # Dropdown for model selection
        model_name = gr.Dropdown(
            VALID_MODELS,
            value=DEFAULT_MODEL,
            label="IPA transcription ASR model",
            info="Select the model to use for prediction.",
        )

        # Dropdown for transcription type selection
        transcription_type = gr.Dropdown(
            choices=["Full Audio", "Multiple Full Audio", "TextGrid Interval"],
            label="Transcription Type",
            value=None,
            interactive=True,
        )

        model_state = gr.State(value=initial_model)

        # Full audio transcription section
        with gr.Column(visible=False) as full_audio_section:
            full_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
            full_transcribe_btn = gr.Button("Transcribe Full Audio", interactive=False, variant="primary")
            full_prediction = gr.Textbox(label="IPA Transcription", show_copy_button=True)

            full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value=default_tier_name, interactive=True)

            full_textgrid_contents = gr.Textbox(label="TextGrid Contents", show_copy_button=True)
            full_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
            full_reset_btn = gr.Button("Reset", variant="secondary")

        # Multiple full audio transcription section
        with gr.Column(visible=False) as multiple_full_audio_section:
            multiple_full_audio = gr.File(file_types=[".wav"], label="Upload Audio File(s)", file_count="multiple")
            multiple_full_textgrid_tier = gr.Textbox(label="TextGrid Tier Name", value=default_tier_name)
            multiple_full_transcribe_btn = gr.Button("Transcribe Audio Files", interactive=False, variant="primary")

            multiple_full_table = gr.Dataframe(
                headers=["Filename", "Transcription"],
                interactive=False,
                label="IPA Transcriptions",
                datatype=["str", "str"]
            )

            multiple_full_zip_download_btn = gr.File(label="Download All as ZIP", interactive=False)
            multiple_full_reset_btn = gr.Button("Reset", variant="secondary")

        # Interval transcription section
        with gr.Column(visible=False) as interval_section:
            interval_audio = gr.Audio(type="filepath", show_download_button=True, label="Upload Audio File")
            interval_textgrid_file = gr.File(file_types=[".TextGrid"], label="Upload TextGrid File")
            tier_names = gr.Dropdown(label="Source Tier (existing)", choices=[], interactive=True)
            target_tier = gr.Textbox(label="Target Tier (new)", value=default_interval_tier_name, placeholder="e.g. IPATier")

            interval_transcribe_btn = gr.Button("Transcribe Intervals", interactive=False, variant="primary")
            interval_result = gr.Textbox(label="IPA Interval Transcription", show_copy_button=True, interactive=False)
            interval_download_btn = gr.DownloadButton(label=TEXTGRID_DOWNLOAD_TEXT, interactive=False, variant="primary")
            interval_reset_btn = gr.Button("Reset", variant="secondary")

        # Section visibility toggle
        transcription_type.change(
            fn=lambda t: (
                gr.update(visible=t == "Full Audio"),
                gr.update(visible=t == "Multiple Full Audio"),
                gr.update(visible=t == "TextGrid Interval"),
            ),
            inputs=transcription_type,
            outputs=[full_audio_section, multiple_full_audio_section, interval_section],
        )

        # Enable full transcribe button after audio uploaded
        full_audio.change(
            fn=lambda audio: gr.update(interactive=audio is not None),
            inputs=full_audio,
            outputs=full_transcribe_btn,
        )

        # Full transcription logic
        full_transcribe_btn.click(
            fn=load_model_and_predict,
            inputs=[model_name, full_audio, model_state],
            outputs=[full_prediction, model_state],
        )

        full_prediction.change(
            fn=get_textgrid_contents,
            inputs=[full_audio, full_textgrid_tier, full_prediction],
            outputs=[full_textgrid_contents],
        )

        full_textgrid_contents.change(
            fn=full_download_update,
            inputs=[full_textgrid_contents, full_audio],
            outputs=[full_download_btn],
        )

        # One returned value per listed output component.
        full_reset_btn.click(
            fn=lambda: (None, "", "", gr.update(interactive=False)),
            outputs=[full_audio, full_prediction, full_textgrid_contents, full_download_btn],
        )

        # Enable interval transcribe button only when both files are uploaded
        interval_audio.change(
            fn=validate_textgrid_for_intervals,
            inputs=[interval_audio, interval_textgrid_file],
            outputs=[interval_transcribe_btn],
        )

        interval_textgrid_file.change(
            fn=validate_textgrid_for_intervals,
            inputs=[interval_audio, interval_textgrid_file],
            outputs=[interval_transcribe_btn],
        )

        # Interval logic
        interval_textgrid_file.change(
            fn=extract_tier_names,
            inputs=[interval_textgrid_file],
            outputs=[tier_names],
        )

        interval_transcribe_btn.click(
            fn=transcribe_intervals_with_selected_model,
            inputs=[model_name, interval_audio, interval_textgrid_file, tier_names, target_tier, model_state],
            outputs=[interval_result, model_state],
        )

        interval_result.change(
            fn=interval_download_update,
            inputs=[interval_result, interval_audio],
            outputs=[interval_download_btn],
        )

        interval_reset_btn.click(
            fn=lambda: (
                None,
                None,
                gr.update(choices=[]),
                default_interval_tier_name,
                "",
                gr.update(interactive=False),
            ),
            outputs=[interval_audio, interval_textgrid_file, tier_names, target_tier, interval_result, interval_download_btn],
        )

        # Multiple full audio transcription logic
        multiple_full_audio.change(
            fn=lambda files: gr.update(interactive=bool(files)),
            inputs=multiple_full_audio,
            outputs=multiple_full_transcribe_btn,
        )

        multiple_full_transcribe_btn.click(
            fn=transcribe_multiple_files,
            inputs=[model_name, multiple_full_audio, model_state, multiple_full_textgrid_tier],
            outputs=[multiple_full_table, multiple_full_zip_download_btn, model_state],
        )

        # Restore the default tier name rather than blanking it, matching the
        # interval section's reset behavior.
        multiple_full_reset_btn.click(
            fn=lambda: (None, default_tier_name, [], None, gr.update(interactive=False)),
            outputs=[multiple_full_audio, multiple_full_textgrid_tier, multiple_full_table, multiple_full_zip_download_btn, multiple_full_transcribe_btn],
        )

    demo.launch(max_file_size="100mb")

# Start the demo only when this file is run as a script, not when imported.
if __name__ == "__main__":
    launch_demo()