Athspi's picture
Update app.py
ef2c8e0 verified
raw
history blame
5.46 kB
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import numpy as np
import soundfile as sf
from faster_whisper import WhisperModel
from gtts import gTTS, lang
from kokoro import KPipeline
# Configure Gemini API (use environment variable for Hugging Face Spaces)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail fast at import time: every translation request needs this key,
    # so a missing secret should stop the Space before the UI comes up.
    raise ValueError("GEMINI_API_KEY environment variable not set. Please set it in the Hugging Face Spaces Secrets.")
genai.configure(api_key=GEMINI_API_KEY)
# Initialize the faster-whisper model with fallback compute type
model_size = "Systran/faster-whisper-large-v3"
try:
    # First attempt: let CTranslate2 pick the device and use float16
    # (typically GPU). WhisperModel raises ValueError when the chosen
    # device does not support the requested compute type.
    whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
except ValueError:
    # Fallback for CPU-only hosts: int8 quantization keeps memory low.
    print("Float16 not supported, falling back to int8 on CPU")
    whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
# Language codes for Kokoro TTS
# Maps the human-readable language names offered in the dropdown to the
# one-letter lang_code values that KPipeline(lang_code=...) expects.
# Languages not listed here fall back to gTTS in text_to_speech().
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Japanese": "j",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p"
}
# Function to transcribe audio using faster-whisper
def transcribe_audio(audio_file):
    """Transcribe ``audio_file`` with the module-level faster-whisper model.

    Returns a ``(text, language, error)`` triple: on success ``error`` is
    ``None``; on failure ``text`` and ``language`` are ``None`` and
    ``error`` carries a human-readable message.
    """
    try:
        # transcribe() yields segments lazily, so the join must stay inside
        # the try — decoding errors surface during iteration.
        segments, info = whisper_model.transcribe(audio_file, beam_size=5)
        text = " ".join(seg.text for seg in segments)
        return text, info.language, None
    except Exception as exc:
        return None, None, f"Transcription error: {str(exc)}"
# Function to translate text using Gemini API with a magic prompt
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` via the Gemini API.

    Returns ``(translated_text, error)``; exactly one element is ``None``.
    The prompt instructs the model to emit only the translation itself.
    """
    try:
        gemini = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            f"Translate the following text to {target_language} and return "
            "only the translated text with no additional explanation or "
            f"commentary:\n\n{text}"
        )
        reply = gemini.generate_content(prompt)
        return reply.text.strip(), None
    except Exception as exc:
        return None, f"Translation error: {str(exc)}"
# Function to convert text to speech using Kokoro or gTTS based on language
def text_to_speech(text, language):
    """Synthesize ``text`` to speech and return ``(audio_path, error)``.

    Uses Kokoro TTS when ``language`` is in KOKORO_LANGUAGES, otherwise
    falls back to gTTS. Exactly one of the returned values is ``None``;
    on success ``audio_path`` points at a temp .wav (Kokoro) or .mp3 (gTTS).
    """
    try:
        if language in KOKORO_LANGUAGES:
            lang_code = KOKORO_LANGUAGES[language]
            pipeline = KPipeline(lang_code=lang_code)
            # NOTE(review): "af_heart" is an American-English voice used for
            # every Kokoro language here — confirm this is intended.
            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
            # BUG FIX: the previous code kept only the FIRST generated
            # segment and discarded the rest, so multi-line input produced
            # truncated audio. Collect every segment and join them.
            chunks = [np.asarray(audio) for _, _, audio in generator]
            if not chunks:
                raise ValueError("No audio generated by Kokoro")
            audio_data = chunks[0] if len(chunks) == 1 else np.concatenate(chunks)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
                sf.write(fp.name, audio_data, 24000)  # Kokoro outputs 24 kHz audio
                return fp.name, None
        else:
            # Fallback to gTTS: map the human-readable language name to a
            # gTTS code, defaulting to English when no match is found.
            lang_map = lang.tts_langs()
            tts_lang = next(
                (code for code, name in lang_map.items()
                 if name.lower() == language.lower()),
                "en",
            )
            tts = gTTS(text=text, lang=tts_lang, slow=False)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
                tts.save(fp.name)
                return fp.name, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"
# Main function to process audio input and return outputs
def process_audio(audio_file, target_language):
    """Run the full pipeline: transcribe -> translate -> synthesize.

    Returns ``(error, transcription, translated_text, audio_path)``.
    Stages that completed before a failure are still filled in, so the UI
    can show partial results alongside the error message.
    """
    if audio_file is None:
        return "Please upload an audio file or record audio.", None, None, None

    transcription, _detected, err = transcribe_audio(audio_file)
    if err:
        return err, None, None, None

    translation, err = translate_text(transcription, target_language)
    if err:
        return err, transcription, None, None

    speech_path, err = text_to_speech(translation, target_language)
    if err:
        return err, transcription, translation, None

    return None, transcription, translation, speech_path
# Gradio interface
with gr.Blocks(title="AI Audio Translator") as demo:
    gr.Markdown("# AI Audio Translator")
    gr.Markdown("Upload an audio file or record via microphone, select a target language, and get the transcription, translation, and translated audio! Uses Kokoro TTS for supported languages, otherwise gTTS.")
    # Dropdown choices: every Kokoro language name plus every gTTS language
    # name (the inverted dict deduplicates gTTS names); set() removes
    # overlap between the two sources.
    supported_langs = list(set(list(KOKORO_LANGUAGES.keys()) + list({v: k for k, v in lang.tts_langs().items()}.keys())))
    with gr.Row():
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
        target_lang = gr.Dropdown(
            choices=sorted(supported_langs),
            value="Spanish",
            label="Target Language"
        )
    submit_btn = gr.Button("Translate")
    with gr.Row():
        # Output order mirrors process_audio's return tuple:
        # (error, transcription, translation, audio path).
        error_output = gr.Textbox(label="Error", visible=True)
        transcription_output = gr.Textbox(label="Transcription")
        translation_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Translated Audio")
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, target_lang],
        outputs=[error_output, transcription_output, translation_output, audio_output]
    )
# Launch the app
demo.launch()