# Hugging Face Spaces page banner (scraper artifact): "Spaces: Running"
# --- Standard library ---
import os
import tempfile

# --- Third-party ---
import gradio as gr
import soundfile as sf
import google.generativeai as genai
from faster_whisper import WhisperModel
from gtts import gTTS, lang
from kokoro import KPipeline

# Configure Gemini API (use environment variable for Hugging Face Spaces).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError(
        "GEMINI_API_KEY environment variable not set. "
        "Please set it in the Hugging Face Spaces Secrets."
    )
genai.configure(api_key=GEMINI_API_KEY)

# Initialize the faster-whisper model.  Prefer float16 (GPU); fall back to
# int8 on CPU when float16 is not supported by the runtime.
model_size = "Systran/faster-whisper-large-v3"
try:
    whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
except ValueError:
    print("Float16 not supported, falling back to int8 on CPU")
    whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")

# Languages handled natively by Kokoro TTS: display name -> Kokoro lang_code.
# Anything not listed here falls back to gTTS (see text_to_speech).
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Japanese": "j",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}
# Function to transcribe audio using faster-whisper
def transcribe_audio(audio_file):
    """Transcribe an audio file with the module-level faster-whisper model.

    Args:
        audio_file: Path to the audio file to transcribe.

    Returns:
        A ``(transcription, detected_language, error)`` tuple.  On success
        ``error`` is None; on failure the first two slots are None and
        ``error`` carries a human-readable message.
    """
    try:
        segments, info = whisper_model.transcribe(audio_file, beam_size=5)
        # ``segments`` is a lazy generator; joining consumes it fully.
        transcription = " ".join(segment.text for segment in segments)
        detected_language = info.language
        return transcription, detected_language, None
    except Exception as e:
        return None, None, f"Transcription error: {str(e)}"
# Function to translate text using Gemini API with a magic prompt
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` via the Gemini API.

    Args:
        text: Source text to translate.
        target_language: Human-readable target language name (e.g. "Spanish").

    Returns:
        A ``(translated_text, error)`` tuple; exactly one of the two is None.
    """
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        # The prompt instructs the model to emit the translation only, so the
        # response text can be used verbatim.
        prompt = (
            f"Translate the following text to {target_language} and return only "
            f"the translated text with no additional explanation or commentary:\n\n{text}"
        )
        response = model.generate_content(prompt)
        translated_text = response.text.strip()
        return translated_text, None
    except Exception as e:
        return None, f"Translation error: {str(e)}"
# Function to convert text to speech using Kokoro or gTTS based on language
# Cache of Kokoro pipelines keyed by lang_code: constructing a KPipeline loads
# a model, so reuse one per language instead of rebuilding it on every call.
_KOKORO_PIPELINES = {}


def text_to_speech(text, language):
    """Synthesize ``text`` to an audio file, preferring Kokoro over gTTS.

    Args:
        text: Text to speak.
        language: Human-readable language name; Kokoro is used when the name
            appears in KOKORO_LANGUAGES, otherwise gTTS.

    Returns:
        A ``(audio_path, error)`` tuple; exactly one of the two is None.
        The file (.wav for Kokoro, .mp3 for gTTS) is created with
        ``delete=False`` so the caller/UI can read it afterwards.
    """
    try:
        if language in KOKORO_LANGUAGES:
            # --- Kokoro TTS path ---
            lang_code = KOKORO_LANGUAGES[language]
            pipeline = _KOKORO_PIPELINES.get(lang_code)
            if pipeline is None:
                pipeline = _KOKORO_PIPELINES[lang_code] = KPipeline(lang_code=lang_code)
            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
            # Keep only the first generated segment (matches original behavior;
            # multi-segment texts are truncated to their first chunk).
            audio_data = None
            for _gs, _ps, audio in generator:
                audio_data = audio
                break
            if audio_data is None:
                raise ValueError("No audio generated by Kokoro")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
                # 24000 Hz sample rate, as in the original code — presumably
                # Kokoro's output rate; confirm against kokoro docs.
                sf.write(fp.name, audio_data, 24000)
            return fp.name, None
        # --- gTTS fallback path ---
        # tts_langs() maps gTTS code -> display name; invert the lookup to
        # find the code for our display name, defaulting to English.
        lang_map = lang.tts_langs()
        tts_lang = next(
            (k for k, v in lang_map.items() if v.lower() == language.lower()),
            "en",
        )
        tts = gTTS(text=text, lang=tts_lang, slow=False)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            tts.save(fp.name)
        return fp.name, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"
# Main function to process audio input and return outputs
def process_audio(audio_file, target_language):
    """Run the full pipeline: transcribe -> translate -> synthesize.

    Args:
        audio_file: Path to the uploaded/recorded audio, or None.
        target_language: Target language display name from the dropdown.

    Returns:
        ``(error, transcription, translated_text, audio_path)``.  ``error``
        is None on full success; on failure it holds a message and the slots
        after the failing stage are None.
    """
    if audio_file is None:
        return "Please upload an audio file or record audio.", None, None, None
    # Detected source language is not surfaced in the UI; ignore it.
    transcription, _detected_language, error = transcribe_audio(audio_file)
    if error:
        return error, None, None, None
    translated_text, error = translate_text(transcription, target_language)
    if error:
        return error, transcription, None, None
    audio_output, error = text_to_speech(translated_text, target_language)
    if error:
        return error, transcription, translated_text, None
    return None, transcription, translated_text, audio_output
# Gradio interface: input row (audio + target language), a submit button, and
# an output row wired to process_audio's 4-tuple return.
with gr.Blocks(title="AI Audio Translator") as demo:
    gr.Markdown("# AI Audio Translator")
    gr.Markdown(
        "Upload an audio file or record via microphone, select a target language, "
        "and get the transcription, translation, and translated audio! "
        "Uses Kokoro TTS for supported languages, otherwise gTTS."
    )
    # Offer every language either engine can speak: Kokoro display names
    # plus the display names gTTS reports.
    supported_langs = sorted(set(KOKORO_LANGUAGES) | set(lang.tts_langs().values()))
    with gr.Row():
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
        target_lang = gr.Dropdown(
            choices=supported_langs,
            value="Spanish",
            label="Target Language",
        )
    submit_btn = gr.Button("Translate")
    with gr.Row():
        error_output = gr.Textbox(label="Error", visible=True)
        transcription_output = gr.Textbox(label="Transcription")
        translation_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Translated Audio")
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, target_lang],
        outputs=[error_output, transcription_output, translation_output, audio_output],
    )

# Launch the app
demo.launch()