Spaces:

rohanmiriyala
/

Audio_Translate

Sleeping

App Files Files Community

rohanmiriyala committed on Jul 15

Commit

a05081b

verified ·

1 Parent(s): e6699ca

Update app.py

Browse files

Files changed (1) hide show

app.py +213 -115

app.py CHANGED Viewed

@@ -1,144 +1,242 @@
-# main.py
 from __future__ import annotations
 import os
-import io
 import torch
-import numpy as np
 import torchaudio
-import nltk
-import gradio as gr
-from pydub import AudioSegment
 from transformers import (
     SeamlessM4TFeatureExtractor,
     SeamlessM4TTokenizer,
     SeamlessM4Tv2ForSpeechToText,
-    AutoTokenizer,
-    AutoFeatureExtractor
 )
-from parler_tts import ParlerTTSForConditionalGeneration
-nltk.download('punkt')
-# === CONFIG ===
-HF_TOKEN = os.getenv("HF_TOKEN")
-device = "cuda" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32
-SAMPLE_RATE = 16000
 DEFAULT_TARGET_LANGUAGE = "Hindi"
-# === Load translation model ===
-trans_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
-    "ai4bharat/indic-seamless", torch_dtype=torch_dtype, token=HF_TOKEN
-).to(device)
-processor = SeamlessM4TFeatureExtractor.from_pretrained("ai4bharat/indic-seamless", token=HF_TOKEN)
-tokenizer = SeamlessM4TTokenizer.from_pretrained("ai4bharat/indic-seamless", token=HF_TOKEN)
-# === Load TTS models ===
-tts_repo = "ai4bharat/indic-parler-tts-pretrained"
-tts_finetuned_repo = "ai4bharat/indic-parler-tts"
-tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
-    tts_repo, attn_implementation="eager", torch_dtype=torch_dtype
-).to(device)
-tts_finetuned_model = ParlerTTSForConditionalGeneration.from_pretrained(
-    tts_finetuned_repo, attn_implementation="eager", torch_dtype=torch_dtype
-).to(device)
-desc_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
-text_tokenizer = AutoTokenizer.from_pretrained(tts_repo)
-tts_sampling_rate = tts_model.audio_encoder.config.sampling_rate
-# === Utilities ===
-def numpy_to_mp3(audio_array, sampling_rate):
-    if np.issubdtype(audio_array.dtype, np.floating):
-        audio_array = (audio_array / np.max(np.abs(audio_array))) * 32767
-        audio_array = audio_array.astype(np.int16)
-    segment = AudioSegment(
-        audio_array.tobytes(),
-        frame_rate=sampling_rate,
-        sample_width=audio_array.dtype.itemsize,
-        channels=1
-    )
-    mp3_io = io.BytesIO()
-    segment.export(mp3_io, format="mp3", bitrate="320k")
-    return mp3_io.getvalue()
-def chunk_text(text, max_words=25):
-    sentences = nltk.sent_tokenize(text)
-    chunks, curr = [], ""
-    for s in sentences:
-        candidate = f"{curr} {s}".strip()
-        if len(candidate.split()) > max_words:
-            if curr: chunks.append(curr)
-            curr = s
-        else:
-            curr = candidate
-    if curr: chunks.append(curr)
-    return chunks
-# === Translation ===
-def translate_audio(input_audio, target_language):
-    audio, orig_sr = torchaudio.load(input_audio)
-    audio = torchaudio.functional.resample(audio, orig_sr, SAMPLE_RATE)
-    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(device, dtype=torch_dtype)
-    target_lang_code = "hin"  # default Hindi, change as needed
-    gen_ids = trans_model.generate(**inputs, tgt_lang=target_lang_code)[0]
-    return tokenizer.decode(gen_ids, skip_special_tokens=True)
-# === TTS generation ===
-def generate_tts(text, description, use_finetuned=False):
-    model = tts_finetuned_model if use_finetuned else tts_model
-    inputs = desc_tokenizer(description, return_tensors="pt").to(device)
-    chunks = chunk_text(text)
-    all_audio = []
-    for chunk in chunks:
-        prompt = text_tokenizer(chunk, return_tensors="pt").to(device)
-        gen = model.generate(
-            input_ids=inputs.input_ids,
-            attention_mask=inputs.attention_mask,
-            prompt_input_ids=prompt.input_ids,
-            prompt_attention_mask=prompt.attention_mask,
-            do_sample=True,
-            return_dict_in_generate=True
-        )
-        if hasattr(gen, 'sequences') and hasattr(gen, 'audios_length'):
-            audio = gen.sequences[0, :gen.audios_length[0]]
-            audio_np = audio.float().cpu().numpy().flatten()
-            all_audio.append(audio_np)
-    combined = np.concatenate(all_audio)
-    return numpy_to_mp3(combined, sampling_rate=tts_sampling_rate)
-# === Gradio UI ===
-with gr.Blocks() as demo:
-    gr.Markdown("## 🎙️ Speech-to-Text → Text-to-Speech Demo")
     with gr.Row():
         with gr.Column():
-            input_audio = gr.Audio(label="Upload or record audio", type="filepath")
-            target_language = gr.Textbox(label="Target language (default Hindi)", value="Hindi")
-            btn_translate = gr.Button("Translate to text")
         with gr.Column():
-            translated_text = gr.Textbox(label="Translated text")
-    btn_translate.click(
-        translate_audio,
-        inputs=[input_audio, target_language],
-        outputs=translated_text
     )
     with gr.Row():
         with gr.Column():
-            voice_desc = gr.Textbox(label="Voice description", value="A calm, neutral Indian voice, clear audio.")
-            use_finetuned = gr.Checkbox(label="Use fine-tuned TTS", value=True)
-            btn_tts = gr.Button("Generate speech")
         with gr.Column():
-            generated_audio = gr.Audio(label="Generated speech", format="mp3", autoplay=True)
-    btn_tts.click(
-        generate_tts,
-        inputs=[translated_text, voice_desc, use_finetuned],
-        outputs=generated_audio
     )
 demo.launch(share=True)

 from __future__ import annotations
 import os
+import gradio as gr
+import spaces
 import torch
 import torchaudio
 from transformers import (
     SeamlessM4TFeatureExtractor,
     SeamlessM4TTokenizer,
     SeamlessM4Tv2ForSpeechToText,
 )
+from lang_list import (
+    ASR_TARGET_LANGUAGE_NAMES,
+    LANGUAGE_NAME_TO_CODE,
+    S2ST_TARGET_LANGUAGE_NAMES,
+    S2TT_TARGET_LANGUAGE_NAMES,
+    T2ST_TARGET_LANGUAGE_NAMES,
+    TEXT_SOURCE_LANGUAGE_NAMES,
+)
+DESCRIPTION = """\
+### **IndicSeamless: Speech-to-Text Translation Model for Indian Languages** 🎙️➡️📜
+This Gradio demo showcases **IndicSeamless**, a fine-tuned **SeamlessM4T-v2-large** model for **speech-to-text translation** across **13 Indian languages and English**. Trained on **BhasaAnuvaad**, the largest open-source speech translation dataset for Indian languages, it delivers **accurate and robust translations** across diverse linguistic and acoustic conditions.
+🔗 **Model Checkpoint:** [ai4bharat/indic-seamless](https://huggingface.co/ai4bharat/indic-seamless)
+#### **How to Use:**
+1. **Upload or record** an audio clip in any supported Indian language.
+2. Click **"Translate"** to generate the corresponding text in the target language.
+3. View or copy the output for further use.
+🚀 Try it out and experience seamless speech translation for Indian languages!
+"""
+hf_token = os.getenv("HF_TOKEN")  # Hub token taken from the environment; None when the variable is unset
+device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"  # prefer CUDA, then MPS, else CPU
 torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32  # bfloat16 on every non-CPU device, float32 on CPU
+model = SeamlessM4Tv2ForSpeechToText.from_pretrained("ai4bharat/indic-seamless", torch_dtype=torch_dtype, token=hf_token).to(device)  # loaded once at import time
+processor = SeamlessM4TFeatureExtractor.from_pretrained("ai4bharat/indic-seamless", token=hf_token)  # turns waveforms into model inputs
+tokenizer = SeamlessM4TTokenizer.from_pretrained("ai4bharat/indic-seamless", token=hf_token)  # decodes generated ids to text
+CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()  # caching needs both the env flag and CUDA
+AUDIO_SAMPLE_RATE = 16000  # sample rate (Hz) that preprocess_audio resamples to
+MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
 DEFAULT_TARGET_LANGUAGE = "Hindi"  # initial value of the target-language dropdowns
+def preprocess_audio(input_audio: str) -> None:
+    arr, org_sr = torchaudio.load(input_audio)
+    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+    if new_arr.shape[1] > max_length:
+        new_arr = new_arr[:, :max_length]
+        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+    torchaudio.save(input_audio, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
+@spaces.GPU
+def run_s2tt(input_audio: str, source_language: str, target_language: str) -> str:
+    # preprocess_audio(input_audio)
+    # source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
+    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+    input_audio, orig_freq = torchaudio.load(input_audio)
+    input_audio = torchaudio.functional.resample(input_audio, orig_freq=orig_freq, new_freq=16000)
+    audio_inputs= processor(input_audio, sampling_rate=16000, return_tensors="pt").to(device=device, dtype=torch_dtype)
+    text_out = model.generate(**audio_inputs, tgt_lang=target_language_code)[0].float().cpu().numpy().squeeze()
+    return tokenizer.decode(text_out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
+@spaces.GPU
+def run_asr(input_audio: str, target_language: str) -> str:
+    # preprocess_audio(input_audio)
+    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+    input_audio, orig_freq = torchaudio.load(input_audio)
+    input_audio = torchaudio.functional.resample(input_audio, orig_freq=orig_freq, new_freq=16000)
+    audio_inputs= processor(input_audio, sampling_rate=16000, return_tensors="pt").to(device=device, dtype=torch_dtype)
+    text_out = model.generate(**audio_inputs, tgt_lang=target_language_code)[0].float().cpu().numpy().squeeze()
+    return tokenizer.decode(text_out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
+with gr.Blocks() as demo_s2st:  # speech-to-speech layout; no handler is attached here and its tab below is commented out
     with gr.Row():
         with gr.Column():
+            with gr.Group():
+                input_audio = gr.Audio(label="Input speech", type="filepath")  # component value is a file path
+                source_language = gr.Dropdown(
+                    label="Source language",
+                    choices=ASR_TARGET_LANGUAGE_NAMES,
+                    value="English",
+                )
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=S2ST_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            btn = gr.Button("Translate")  # not wired to any function in this block
         with gr.Column():
+            with gr.Group():
+                output_audio = gr.Audio(
+                    label="Translated speech",
+                    autoplay=False,
+                    streaming=False,
+                    type="numpy",
+                )
+                output_text = gr.Textbox(label="Translated text")
+with gr.Blocks() as demo_s2tt:  # speech-to-text translation tab, rendered inside `demo`
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                input_audio = gr.Audio(label="Input speech", type="filepath")  # run_s2tt receives a file path
+                source_language = gr.Dropdown(  # passed to run_s2tt, which currently does not use it
+                    label="Source language",
+                    choices=ASR_TARGET_LANGUAGE_NAMES,
+                    value="English",
+                )
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=S2TT_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            btn = gr.Button("Translate")
+        with gr.Column():
+            output_text = gr.Textbox(label="Translated text")
+    gr.Examples(  # each row is [audio path, source language, target language], matching `inputs`
+        examples=[
+            ["assets/Bengali.wav", "Bengali", "English"],
+            ["assets/Gujarati.wav", "Gujarati", "Hindi"],
+            ["assets/Punjabi.wav", "Punjabi", "Hindi"],
+        ],
+        inputs=[input_audio, source_language, target_language],
+        outputs=output_text,
+        fn=run_s2tt,
+        cache_examples=CACHE_EXAMPLES,
+        api_name=False,
+    )
+    btn.click(  # button runs the translation and fills the text box
+        fn=run_s2tt,
+        inputs=[input_audio, source_language, target_language],
+        outputs=output_text,
+        api_name="s2tt",
     )
+with gr.Blocks() as demo_t2st:  # text-to-speech layout; no handler is attached here and its tab below is commented out
     with gr.Row():
         with gr.Column():
+            with gr.Group():
+                input_text = gr.Textbox(label="Input text")
+                with gr.Row():  # source and target dropdowns share one row
+                    source_language = gr.Dropdown(
+                        label="Source language",
+                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                        value="English",
+                    )
+                    target_language = gr.Dropdown(
+                        label="Target language",
+                        choices=T2ST_TARGET_LANGUAGE_NAMES,
+                        value=DEFAULT_TARGET_LANGUAGE,
+                    )
+            btn = gr.Button("Translate")  # not wired to any function in this block
         with gr.Column():
+            with gr.Group():
+                output_audio = gr.Audio(
+                    label="Translated speech",
+                    autoplay=False,
+                    streaming=False,
+                    type="numpy",
+                )
+                output_text = gr.Textbox(label="Translated text")
+with gr.Blocks() as demo_asr:
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                input_audio = gr.Audio(label="Input speech", type="filepath")
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=ASR_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            btn = gr.Button("Transcribe")
+        with gr.Column():
+            output_text = gr.Textbox(label="Transcribed text")
+    gr.Examples(
+        examples=[
+            ["assets/Bengali.wav", "Bengali", "English"],
+            ["assets/Gujarati.wav", "Gujarati", "Hindi"],
+            ["assets/Punjabi.wav", "Punjabi", "Hindi"],
+        ],
+        inputs=[input_audio, target_language],
+        outputs=output_text,
+        fn=run_asr,
+        cache_examples=CACHE_EXAMPLES,
+        api_name=False,
+    )
+    btn.click(
+        fn=run_asr,
+        inputs=[input_audio, target_language],
+        outputs=output_text,
+        api_name="asr",
+    )
+with gr.Blocks(css="style.css") as demo:  # top-level app that hosts the per-task tabs
+    gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_id="duplicate-button",
+        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",  # shown only when the env flag is exactly "1"
     )
+    with gr.Tabs():  # only the S2TT and ASR tabs are active; the others are commented out
+        # with gr.Tab(label="S2ST"):
+        #     demo_s2st.render()
+        with gr.Tab(label="S2TT"):
+            demo_s2tt.render()
+        # with gr.Tab(label="T2ST"):
+        #     demo_t2st.render()
+        # with gr.Tab(label="T2TT"):
+        #     demo_t2tt.render()
+        with gr.Tab(label="ASR"):
+            demo_asr.render()
 demo.launch(share=True)  # starts the app at import time with share=True