Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Athspi committed on Feb 23

Commit

9dbf879

verified ·

1 Parent(s): 413a70d

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -21

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ from faster_whisper import WhisperModel
 import google.generativeai as genai
 from gtts import gTTS, lang
 import tempfile
 # Configure Gemini API (use environment variable for Hugging Face Spaces)
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
@@ -19,6 +21,19 @@ except ValueError:
     print("Float16 not supported, falling back to int8 on CPU")
     whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
 # Function to transcribe audio using faster-whisper
 def transcribe_audio(audio_file):
     try:
@@ -40,20 +55,36 @@ def translate_text(text, target_language):
     except Exception as e:
         return None, f"Translation error: {str(e)}"
-# Function to convert text to speech using gTTS with full language support
-def text_to_speech(text, language):
     try:
-        lang_map = lang.tts_langs()
-        tts_lang = language.lower() if language.lower() in lang_map else "en"
-        tts = gTTS(text=text, lang=tts_lang, slow=False)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
-            tts.save(fp.name)
-            return fp.name, None
     except Exception as e:
         return None, f"TTS error: {str(e)}"
 # Main function to process audio input and return outputs
-def process_audio(audio_file, target_language):
     if audio_file is None:
         return "Please upload an audio file or record audio.", None, None, None
@@ -65,9 +96,7 @@ def process_audio(audio_file, target_language):
     if error:
         return error, transcription, None, None
-    lang_map = lang.tts_langs()
-    lang_key = next((k for k, v in lang_map.items() if v.lower() == target_language.lower()), "en")
-    audio_output, error = text_to_speech(translated_text, lang_key)
     if error:
         return error, transcription, translated_text, None
@@ -76,18 +105,23 @@ def process_audio(audio_file, target_language):
 # Gradio interface
 with gr.Blocks(title="AI Audio Translator") as demo:
     gr.Markdown("# AI Audio Translator")
-    gr.Markdown("Upload an audio file or record via microphone, select a target language, and get the transcription, translation, and translated audio!")
-    supported_langs = {v: k for k, v in lang.tts_langs().items()}
-    language_choices = list(supported_langs.keys())
     with gr.Row():
         audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
-        target_lang = gr.Dropdown(
-            choices=sorted(language_choices),
-            value="Spanish",
-            label="Target Language"
-        )
     submit_btn = gr.Button("Translate")
@@ -99,7 +133,7 @@ with gr.Blocks(title="AI Audio Translator") as demo:
     submit_btn.click(
         fn=process_audio,
-        inputs=[audio_input, target_lang],
         outputs=[error_output, transcription_output, translation_output, audio_output]
     )

 import google.generativeai as genai
 from gtts import gTTS, lang
 import tempfile
+import soundfile as sf
+from kokoro import KPipeline
 # Configure Gemini API (use environment variable for Hugging Face Spaces)
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
     print("Float16 not supported, falling back to int8 on CPU")
     whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
+# Kokoro TTS language codes: display name -> value passed to KPipeline(lang_code=...)
+KOKORO_LANGUAGES = {
+    "American English": "a",
+    "British English": "b",
+    "Japanese": "j",
+    "Mandarin Chinese": "z",
+    "Spanish": "e",
+    "French": "f",
+    "Hindi": "h",
+    "Italian": "i",
+    "Brazilian Portuguese": "p"
+}
 # Function to transcribe audio using faster-whisper
 def transcribe_audio(audio_file):
     try:
     except Exception as e:
         return None, f"Translation error: {str(e)}"
+# Function to convert text to speech using Kokoro or gTTS
+def text_to_speech(text, language, tts_engine):
+    """Synthesize ``text`` into a temporary audio file with Kokoro or gTTS.
+
+    Args:
+        text: Text to speak.
+        language: Display name of the target language. It is looked up in
+            ``KOKORO_LANGUAGES`` for Kokoro, or matched case-insensitively
+            against the names returned by ``lang.tts_langs()`` for gTTS
+            (unknown names fall back to ``"en"``).
+        tts_engine: ``"Kokoro"`` selects Kokoro when ``language`` is a key of
+            ``KOKORO_LANGUAGES``; every other combination uses gTTS.
+
+    Returns:
+        ``(path, None)`` on success, where ``path`` is a ``.wav`` (Kokoro) or
+        ``.mp3`` (gTTS) temporary file that is NOT deleted automatically, or
+        ``(None, "TTS error: ...")`` if anything raises.
+    """
     try:
+        if tts_engine == "Kokoro" and language in KOKORO_LANGUAGES:
+            # Use Kokoro TTS
+            lang_code = KOKORO_LANGUAGES[language]
+            pipeline = KPipeline(lang_code=lang_code)
+            # NOTE(review): "af_heart" looks like an American English voice;
+            # confirm it is appropriate for the other lang codes.
+            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
+            # The generator yields (gs, ps, audio) tuples. Keep every audio
+            # chunk so multi-line text is not truncated to its first chunk.
+            segments = [audio for _, _, audio in generator if audio is not None]
+            if not segments:
+                raise ValueError("No audio generated by Kokoro")
+            # Reserve a path and close the handle before writing to it by name.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
+                wav_path = fp.name
+            # Append the chunks in order into a single 24 kHz file.
+            # Assumes each chunk is a 1-D mono array — TODO confirm.
+            with sf.SoundFile(wav_path, mode="w", samplerate=24000, channels=1) as wav_file:
+                for segment in segments:
+                    wav_file.write(segment)
+            return wav_path, None
+
+        # Fallback to gTTS: map the display name to a gTTS language code,
+        # defaulting to English when the name is not recognised.
+        lang_map = lang.tts_langs()
+        wanted = language.lower()
+        tts_lang = next((k for k, v in lang_map.items() if v.lower() == wanted), "en")
+        tts = gTTS(text=text, lang=tts_lang, slow=False)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
+            mp3_path = fp.name
+        tts.save(mp3_path)
+        return mp3_path, None
     except Exception as e:
         return None, f"TTS error: {str(e)}"
 # Main function to process audio input and return outputs
+def process_audio(audio_file, target_language, tts_engine):
     if audio_file is None:
         return "Please upload an audio file or record audio.", None, None, None
     if error:
         return error, transcription, None, None
+    audio_output, error = text_to_speech(translated_text, target_language, tts_engine)
     if error:
         return error, transcription, translated_text, None
 # Gradio interface
 with gr.Blocks(title="AI Audio Translator") as demo:
     gr.Markdown("# AI Audio Translator")
+    gr.Markdown("Upload an audio file or record via microphone, select a target language and TTS engine, and get the transcription, translation, and translated audio!")
+    supported_langs = list(set(list(KOKORO_LANGUAGES.keys()) + list({v: k for k, v in lang.tts_langs().items()}.keys())))
     with gr.Row():
         audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
+        with gr.Column():
+            target_lang = gr.Dropdown(
+                choices=sorted(supported_langs),
+                value="Spanish",
+                label="Target Language"
+            )
+            tts_engine = gr.Radio(
+                choices=["Kokoro", "gTTS"],
+                value="gTTS",
+                label="Text-to-Speech Engine"
+            )
     submit_btn = gr.Button("Translate")
     submit_btn.click(
         fn=process_audio,
+        inputs=[audio_input, target_lang, tts_engine],
         outputs=[error_output, transcription_output, translation_output, audio_output]
     )