Spaces:

nvidia
/

canary-1b-v2

Running on Zero

App Files Files Community

msekoyan committed on Aug 10

Commit

986541a

1 Parent(s): 63f740b

adding app.py and supporting files

Browse files

Signed-off-by: monica-sekoyan <[email protected]>

Files changed (7) hide show

.gitattributes +3 -0
README.md +5 -5
app.py +409 -0
data/example-yt_saTD1u8PorI.mp3 +3 -0
packages.txt +2 -0
requirements.txt +2 -0
supported_languages.py +27 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.nemo filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,14 +1,14 @@
 ---
 title: Canary 1b V2
-emoji: 👁
-colorFrom: red
-colorTo: indigo
 sdk: gradio
 sdk_version: 5.42.0
 app_file: app.py
 pinned: false
 license: cc-by-4.0
-short_description: Space for multilingual and multitask Canary-1b-v2
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Canary 1b V2
+emoji: 🐤🇪🇺
+colorFrom: orange
+colorTo: red
 sdk: gradio
 sdk_version: 5.42.0
 app_file: app.py
 pinned: false
 license: cc-by-4.0
+short_description: Transcribe and Translate in 25 European Languages
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,409 @@

+from nemo.collections.asr.models import ASRModel
+import torch
+import gradio as gr
+import spaces
+import gc
+import shutil
+from pathlib import Path
+from pydub import AudioSegment
+import numpy as np
+import os
+import gradio.themes as gr_themes
+import csv
+from supported_languages import SUPPORTED_LANGS_MAP
+# Run on the GPU when torch reports one is available, otherwise on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_NAME="nvidia/canary-1b-v2"
+# Load the pretrained checkpoint once at import time and put it in eval mode.
+model = ASRModel.from_pretrained(model_name=MODEL_NAME)
+model.eval()
+# Source and target dropdowns are both seeded from the same language map;
+# English is the default selection for each.
+AVAILABLE_SRC_LANGS = list(SUPPORTED_LANGS_MAP.keys())
+DEFAULT_SRC_LANG = "English"
+AVAILABLE_TGT_LANGS = list(SUPPORTED_LANGS_MAP.keys())
+DEFAULT_TGT_LANG = "English"
+def start_session(request: gr.Request):
+    session_hash = request.session_hash
+    session_dir = Path(f'/tmp/{session_hash}')
+    session_dir.mkdir(parents=True, exist_ok=True)
+    print(f"Session with hash {session_hash} started.")
+    return session_dir.as_posix()
+def end_session(request: gr.Request):
+    session_hash = request.session_hash
+    session_dir = Path(f'/tmp/{session_hash}')
+    if session_dir.exists():
+        shutil.rmtree(session_dir)
+    print(f"Session with hash {session_hash} ended.")
+def update_src_lang_dropdown(selected_value):
+    if selected_value == DEFAULT_SRC_LANG:
+        tgt_langs = AVAILABLE_TGT_LANGS
+        default_tgt_lang = DEFAULT_TGT_LANG
+    else:
+        tgt_langs = [DEFAULT_TGT_LANG, selected_value]
+        default_tgt_lang = selected_value
+    return gr.Dropdown(choices=tgt_langs, value=default_tgt_lang, interactive=True)
+def update_button_intstruction(src_lang, tgt_lang):
+    if src_lang == tgt_lang:
+        instruction = "Transcribe"
+    else:
+        instruction = "Translate"
+    return (gr.Button(f"{instruction} Uploaded File", variant="primary"), gr.Button(f"{instruction} Microphone Input", variant="primary"))
+def get_audio_segment(audio_path, start_second, end_second):
+    if not audio_path or not Path(audio_path).exists():
+        print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
+        return None
+    try:
+        start_ms = int(start_second * 1000)
+        end_ms = int(end_second * 1000)
+        start_ms = max(0, start_ms)
+        if end_ms <= start_ms:
+            print(f"Warning: End time ({end_second}s) is not after start time ({start_second}s). Adjusting end time.")
+            end_ms = start_ms + 100
+        audio = AudioSegment.from_file(audio_path)
+        clipped_audio = audio[start_ms:end_ms]
+        samples = np.array(clipped_audio.get_array_of_samples())
+        if clipped_audio.channels == 2:
+            samples = samples.reshape((-1, 2)).mean(axis=1).astype(samples.dtype)
+        frame_rate = clipped_audio.frame_rate
+        if frame_rate <= 0:
+             print(f"Warning: Invalid frame rate ({frame_rate}) detected for clipped audio.")
+             frame_rate = audio.frame_rate
+        if samples.size == 0:
+             print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
+             return None
+        return (frame_rate, samples)
+    except FileNotFoundError:
+        print(f"Error: Audio file not found at path: {audio_path}")
+        return None
+    except Exception as e:
+        print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
+        return None
+@spaces.GPU
+def get_transcripts_and_raw_times(audio_path, session_dir, source_lang, target_lang):
+    if not audio_path:
+        gr.Error("No audio file path provided for transcription.", duration=None)
+        # Return an update to hide the button
+        return [], [], None, gr.DownloadButton(visible=False)
+    vis_data = [["N/A", "N/A", "Processing failed"]]
+    raw_times_data = [[0.0, 0.0]]
+    processed_audio_path = None
+    csv_file_path = None
+    original_path_name = Path(audio_path).name
+    audio_name = Path(audio_path).stem
+    try:
+        try:
+            gr.Info(f"Loading audio: {original_path_name}", duration=2)
+            audio = AudioSegment.from_file(audio_path)
+            print('Audio loaded successfully')
+        except Exception as load_e:
+            gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
+            # Return an update to hide the button
+            return [["Error", "Error", "Load failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        resampled = False
+        mono = False
+        target_sr = 16000
+        if audio.frame_rate != target_sr:
+            try:
+                audio = audio.set_frame_rate(target_sr)
+                resampled = True
+            except Exception as resample_e:
+                 gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
+                 # Return an update to hide the button
+                 return [["Error", "Error", "Resample failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        if audio.channels == 2:
+            try:
+                audio = audio.set_channels(1)
+                mono = True
+            except Exception as mono_e:
+                 gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
+                 # Return an update to hide the button
+                 return [["Error", "Error", "Mono conversion failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        elif audio.channels > 2:
+             gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
+             # Return an update to hide the button
+             return [["Error", "Error", f"{audio.channels}-channel audio not supported"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        if resampled or mono:
+            try:
+                processed_audio_path = Path(session_dir, f"{audio_name}_resampled.wav")
+                audio.export(processed_audio_path, format="wav")
+                transcribe_path = processed_audio_path.as_posix()
+                info_path_name = f"{original_path_name} (processed)"
+            except Exception as export_e:
+                gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
+                if processed_audio_path and os.path.exists(processed_audio_path):
+                    os.remove(processed_audio_path)
+                # Return an update to hide the button
+                return [["Error", "Error", "Export failed"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        else:
+            transcribe_path = audio_path
+            info_path_name = original_path_name
+        try:
+            model.to(device)
+            gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)
+            output = model.transcribe([transcribe_path], timestamps=True, source_lang=SUPPORTED_LANGS_MAP[source_lang], target_lang=SUPPORTED_LANGS_MAP[target_lang])
+            if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
+                 gr.Error("Transcription failed or produced unexpected output format.", duration=None)
+                 # Return an update to hide the button
+                 return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+            segment_timestamps = output[0].timestamp['segment']
+            csv_headers = ["Start (s)", "End (s)", "Segment"]
+            vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
+            raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]
+            # Default button update (hidden) in case CSV writing fails
+            button_update = gr.DownloadButton(visible=False)
+            try:
+                csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
+                writer = csv.writer(open(csv_file_path, 'w'))
+                writer.writerow(csv_headers)
+                writer.writerows(vis_data)
+                print(f"CSV transcript saved to temporary file: {csv_file_path}")
+                # If CSV is saved, create update to show button with path
+                button_update = gr.DownloadButton(value=csv_file_path, visible=True)
+            except Exception as csv_e:
+                gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)
+                print(f"Error writing CSV: {csv_e}")
+                # csv_file_path remains None, button_update remains hidden
+            gr.Info("Transcription complete.", duration=2)
+            # Return the data and the button update dictionary
+            return vis_data, raw_times_data, audio_path, button_update
+        except torch.cuda.OutOfMemoryError as e:
+            error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
+            print(f"CUDA OutOfMemoryError: {e}")
+            gr.Error(error_msg, duration=None)
+            # Return an update to hide the button
+            return [["OOM", "OOM", error_msg]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        except FileNotFoundError:
+            error_msg = f"Audio file for transcription not found: {Path(transcribe_path).name}."
+            print(f"Error: Transcribe audio file not found at path: {transcribe_path}")
+            gr.Error(error_msg, duration=None)
+            # Return an update to hide the button
+            return [["Error", "Error", "File not found for transcription"]], [[0.0, 0.0]], audio_path, gr.DownloadButton(visible=False)
+        except Exception as e:
+            error_msg = f"Transcription failed: {e}"
+            print(f"Error during transcription processing: {e}")
+            gr.Error(error_msg, duration=None)
+            vis_data = [["Error", "Error", error_msg]]
+            raw_times_data = [[0.0, 0.0]]
+            # Return an update to hide the button
+            return vis_data, raw_times_data, audio_path, gr.DownloadButton(visible=False)
+        finally:
+            try:
+                if 'model' in locals() and hasattr(model, 'cpu'):
+                     if device == 'cuda':
+                          model.cpu()
+                gc.collect()
+                if device == 'cuda':
+                    torch.cuda.empty_cache()
+            except Exception as cleanup_e:
+                print(f"Error during model cleanup: {cleanup_e}")
+                gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
+    finally:
+        if processed_audio_path and os.path.exists(processed_audio_path):
+            try:
+                os.remove(processed_audio_path)
+                print(f"Temporary audio file {processed_audio_path} removed.")
+            except Exception as e:
+                print(f"Error removing temporary audio file {processed_audio_path}: {e}")
+def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
+    if not isinstance(raw_ts_list, list):
+        print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
+        return gr.Audio(value=None, label="Selected Segment")
+    if not current_audio_path:
+        print("No audio path available to play segment from.")
+        return gr.Audio(value=None, label="Selected Segment")
+    selected_index = evt.index[0]
+    if selected_index < 0 or selected_index >= len(raw_ts_list):
+         print(f"Invalid index {selected_index} selected for list of length {len(raw_ts_list)}.")
+         return gr.Audio(value=None, label="Selected Segment")
+    if not isinstance(raw_ts_list[selected_index], (list, tuple)) or len(raw_ts_list[selected_index]) != 2:
+         print(f"Warning: Data at index {selected_index} is not in the expected format [start, end].")
+         return gr.Audio(value=None, label="Selected Segment")
+    start_time_s, end_time_s = raw_ts_list[selected_index]
+    print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")
+    segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)
+    if segment_data:
+        print("Segment data retrieved successfully.")
+        return gr.Audio(value=segment_data, autoplay=True, label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s", interactive=False)
+    else:
+        print("Failed to get audio segment data.")
+        return gr.Audio(value=None, label="Selected Segment")
+# HTML blurb rendered at the top of the page via gr.HTML: model summary,
+# feature list, usage note, and links to the model card and NeMo repository.
+article = (
+    "<p style='font-size: 1.1em;'>"
+    "This demo showcases <code><a href='https://huggingface.co/nvidia/canary-1b-v2'>canary-1b-v2</a></code>, a 1-billion-parameter model built for high-quality speech transcription and translation across 25 European languages."
+    "</p>"
+    "<p><strong style='color: #ffb300; font-size: 1.2em;'>Key Features:</strong></p>"
+    "<ul style='font-size: 1.1em;'>"
+    "    <li>Support for <strong>25 European languages</strong></li>"
+    "    <li>Automatic <strong>punctuation and capitalization</strong></li>"
+    "    <li>Accurate <strong>word-level and segment-level timestamps</strong></li>"
+    "    <li><strong>Segment-level timestamps</strong> for translated outputs</li>"
+    "</ul>"
+    "<p style='font-size: 1.1em;'>"
+    "This model is <strong>available for commercial and non-commercial use</strong>."
+    "</p>"
+    "<p style='text-align: center;'>"
+    "<a href='https://huggingface.co/nvidia/canary-1b-v2' target='_blank'>🎙️ Learn more about the Model</a> | "
+    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 NeMo Repository</a>"
+    "</p>"
+)
+# Example inputs for gr.Examples; each inner list holds one value per input
+# component (here just the audio file path).
+examples = [
+    ["data/example-yt_saTD1u8PorI.mp3"],
+]
+# Define an NVIDIA-inspired theme: the Default Gradio theme with a custom
+# yellow primary palette, gray neutrals and the Inter font (with fallbacks).
+nvidia_theme = gr_themes.Default(
+    primary_hue=gr_themes.Color(
+        c50="#FFF9E6", # Lightest yellow
+        c100="#FFF2CC",
+        c200="#FFEB99",
+        c300="#FFE066",
+        c400="#FFD633",
+        c500="#FFCC00", # Canary Yellow
+        c600="#E6B800",
+        c700="#CC9900",
+        c800="#B38600",
+        c900="#996600", # Orange-brown
+        c950="#805500"
+    ),
+    neutral_hue="gray", # Use gray for neutral elements
+    font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
+).set()  # .set() is called with no overrides
+# Apply the custom theme and lay out the whole demo page.
+with gr.Blocks(theme=nvidia_theme) as demo:
+    # Show only the part of MODEL_NAME after the last '/' in the page title.
+    model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
+    gr.Markdown(f"<h1 style='text-align: center; margin: 0 auto;'>🐤 Transcribe and Translate with {model_display_name}</h1>")
+    gr.Markdown(f"<h2 style='text-align: center; margin: 0 auto;'>in 25 European Languages</h2>")
+    gr.HTML(article)
+    # State holders: the audio path of the last run, the raw [start, end]
+    # pairs of its segments, and the session's scratch directory.
+    current_audio_path_state = gr.State(None)
+    raw_timestamps_list_state = gr.State([])
+    session_dir = gr.State()
+    # On page load, start_session's return value is stored in session_dir.
+    demo.load(start_session, outputs=[session_dir])
+    with gr.Row():
+        source_lang_dropdown = gr.Dropdown(
+                        choices=AVAILABLE_SRC_LANGS,
+                        value=DEFAULT_SRC_LANG,
+                        label="Select Source Language (Audio)",
+                        interactive=True)
+        target_lang_dropdown = gr.Dropdown(
+            choices=AVAILABLE_TGT_LANGS,
+            value=DEFAULT_TGT_LANG,
+            label="Select Target Language (Output Text)",
+            interactive=True)
+    # Two input tabs: file upload (with a clickable example) and microphone.
+    with gr.Tabs():
+        with gr.TabItem("Audio File"):
+            file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
+            gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
+            file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
+        with gr.TabItem("Microphone"):
+            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
+            mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
+    gr.Markdown("---")
+    gr.Markdown(f"""
+                <div style="text-align: center; font-size: 1.2em; font-weight: bold; padding: 10px; margin: 10px 0; border-top: 0px solid #ccc; border-bottom: 1px solid #ccc;">
+                    Ready to dive in? Just <span style="color:orange">click</span> on the text to jump to the part you need!
+                </div>
+                """)
+    # Define the DownloadButton *before* the DataFrame; it starts hidden and is
+    # made visible by the transcription handler's fourth output.
+    download_btn = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
+    vis_timestamps_df = gr.DataFrame(
+        headers=["Start (s)", "End (s)", "Segment"],
+        datatype=["number", "number", "str"],
+        wrap=True,
+        label="Transcription Segments"
+    )
+    # selected_segment_player was defined after download_btn previously, keep it after df for layout
+    selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
+    # Selecting a source language rebuilds the target dropdown.
+    source_lang_dropdown.select(
+                    fn=update_src_lang_dropdown,
+                    inputs=[source_lang_dropdown],
+                    outputs=[target_lang_dropdown]
+                    )
+    # Selecting a target language relabels both action buttons.
+    target_lang_dropdown.select(
+                    fn=update_button_intstruction,
+                    inputs=[source_lang_dropdown, target_lang_dropdown],
+                    outputs=[file_transcribe_btn, mic_transcribe_btn]
+                    )
+    # Both buttons run the same handler; only the audio input component differs.
+    mic_transcribe_btn.click(
+        fn=get_transcripts_and_raw_times,
+        inputs=[mic_input, session_dir, source_lang_dropdown, target_lang_dropdown],
+        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
+        api_name="transcribe_mic"
+    )
+    file_transcribe_btn.click(
+        fn=get_transcripts_and_raw_times,
+        inputs=[file_input, session_dir, source_lang_dropdown, target_lang_dropdown],
+        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
+        api_name="transcribe_file"
+    )
+    # Selecting a table cell loads the matching segment into the player.
+    vis_timestamps_df.select(
+        fn=play_segment,
+        inputs=[raw_timestamps_list_state, current_audio_path_state],
+        outputs=[selected_segment_player],
+    )
+    # end_session removes the session's scratch directory on unload.
+    demo.unload(end_session)
+# Script entry point: enable the request queue, then start the Gradio server.
+if __name__ == "__main__":
+    print("Launching Gradio Demo...")
+    demo.queue()
+    demo.launch()

data/example-yt_saTD1u8PorI.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3cb340c3b868eb3695cdb06683decbff217331c2459a69394be8d3ad3b53bdf0
+size 2493472

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ffmpeg
2	+ libsndfile1

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Cython
2	+ git+https://github.com/NVIDIA/NeMo.git@msekoyan/canary2_timestamps#egg=nemo_toolkit[asr]

supported_languages.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Maps the language names shown in the UI dropdowns to the two-letter codes
+# passed to the model as source_lang / target_lang. Insertion order is the
+# order the dropdowns display (Russian and Ukrainian are listed last).
+SUPPORTED_LANGS_MAP = {
+    "Bulgarian": "bg",
+    "Croatian": "hr",
+    "Czech": "cs",
+    "Danish": "da",
+    "Dutch": "nl",
+    "English": "en",
+    "Estonian": "et",
+    "Finnish": "fi",
+    "French": "fr",
+    "German": "de",
+    "Greek": "el",
+    "Hungarian": "hu",
+    "Italian": "it",
+    "Latvian": "lv",
+    "Lithuanian": "lt",
+    "Maltese": "mt",
+    "Polish": "pl",
+    "Portuguese": "pt",
+    "Romanian": "ro",
+    "Slovak": "sk",
+    "Slovenian": "sl",
+    "Spanish": "es",
+    "Swedish": "sv",
+    "Russian": "ru",
+    "Ukrainian": "uk"
+}