import os
import tempfile

import gradio as gr
import google.generativeai as genai
import soundfile as sf
from faster_whisper import WhisperModel
from gtts import gTTS, lang
from kokoro import KPipeline

# Configure the Gemini API (use an environment variable for Hugging Face Spaces)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set. Please set it in the Hugging Face Spaces Secrets.")
genai.configure(api_key=GEMINI_API_KEY)

# Initialize the faster-whisper model, falling back to int8 on CPU if float16 is unsupported
model_size = "Systran/faster-whisper-large-v3"
try:
    whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
except ValueError:
    print("Float16 not supported, falling back to int8 on CPU")
    whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")

# Language codes for Kokoro TTS
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Japanese": "j",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# Transcribe audio using faster-whisper
def transcribe_audio(audio_file):
    try:
        segments, info = whisper_model.transcribe(audio_file, beam_size=5)
        transcription = " ".join(segment.text for segment in segments)
        return transcription, info.language, None
    except Exception as e:
        return None, None, f"Transcription error: {str(e)}"

# Translate text using the Gemini API with a strict "translation only" prompt
def translate_text(text, target_language):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            f"Translate the following text to {target_language} and return only the "
            f"translated text with no additional explanation or commentary:\n\n{text}"
        )
        response = model.generate_content(prompt)
        return response.text.strip(), None
    except Exception as e:
        return None, f"Translation error: {str(e)}"

# Convert text to speech with Kokoro when the language is supported, otherwise gTTS
def text_to_speech(text, language, tts_engine):
    try:
        if tts_engine == "Kokoro" and language in KOKORO_LANGUAGES:
            lang_code = KOKORO_LANGUAGES[language]
            pipeline = KPipeline(lang_code=lang_code)
            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r"\n+")
            # The pipeline yields (graphemes, phonemes, audio) tuples; keep only
            # the first generated segment for simplicity
            audio_data = None
            for _, _, audio in generator:
                audio_data = audio
                break
            if audio_data is None:
                raise ValueError("No audio generated by Kokoro")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
                sf.write(fp.name, audio_data, 24000)  # Kokoro outputs 24 kHz audio
            return fp.name, None
        else:
            # Fall back to gTTS, mapping the language name to its gTTS code
            lang_map = lang.tts_langs()
            tts_lang = next((k for k, v in lang_map.items() if v.lower() == language.lower()), "en")
            tts = gTTS(text=text, lang=tts_lang, slow=False)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
                tts.save(fp.name)
            return fp.name, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"
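# Optional optimization (a sketch, not part of the original flow): KPipeline is
# re-created on every Kokoro request above, which reloads model weights each
# time. A per-language cache avoids that; get_kokoro_pipeline is a hypothetical
# helper and is not wired into text_to_speech here.
_kokoro_pipelines = {}

def get_kokoro_pipeline(lang_code):
    # Reuse one pipeline per Kokoro language code instead of rebuilding it
    if lang_code not in _kokoro_pipelines:
        _kokoro_pipelines[lang_code] = KPipeline(lang_code=lang_code)
    return _kokoro_pipelines[lang_code]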
# Process the audio end to end: transcribe, translate, then synthesize speech
def process_audio(audio_file, target_language, tts_engine):
    if audio_file is None:
        return "Please upload an audio file or record audio.", None, None, None

    transcription, detected_language, error = transcribe_audio(audio_file)
    if error:
        return error, None, None, None

    translated_text, error = translate_text(transcription, target_language)
    if error:
        return error, transcription, None, None

    audio_output, error = text_to_speech(translated_text, target_language, tts_engine)
    if error:
        return error, transcription, translated_text, None

    return None, transcription, translated_text, audio_output

# Gradio interface
with gr.Blocks(title="AI Audio Translator") as demo:
    gr.Markdown("# AI Audio Translator")
    gr.Markdown(
        "Upload an audio file or record via microphone, select a target language "
        "and TTS engine, and get the transcription, translation, and translated audio!"
    )

    # Offer every language name Kokoro or gTTS can speak, deduplicated
    supported_langs = sorted(set(KOKORO_LANGUAGES) | set(lang.tts_langs().values()))

    with gr.Row():
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
        with gr.Column():
            target_lang = gr.Dropdown(
                choices=supported_langs,
                value="Spanish",
                label="Target Language"
            )
            tts_engine = gr.Radio(
                choices=["Kokoro", "gTTS"],
                value="gTTS",
                label="Text-to-Speech Engine"
            )
    submit_btn = gr.Button("Translate")

    with gr.Row():
        error_output = gr.Textbox(label="Error", visible=True)
        transcription_output = gr.Textbox(label="Transcription")
        translation_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Translated Audio")

    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, target_lang, tts_engine],
        outputs=[error_output, transcription_output, translation_output, audio_output]
    )

# Launch the app
demo.launch()
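# Running locally (a sketch; package names are taken from the imports above,
# and "app.py" is the assumed entry-file name for the Space):
#   pip install gradio faster-whisper google-generativeai gTTS soundfile kokoro
#   export GEMINI_API_KEY="your-key"
#   python app.py
# Kokoro may additionally need the espeak-ng system package for some languages.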