# --- Hugging Face Spaces page header (extraction residue, commented out so the file parses) ---
# Spaces: Running · File size: 5,694 Bytes
# Commit hashes (git blame residue): dbe8a71 9dbf879 413a70d
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import numpy as np
import soundfile as sf
from faster_whisper import WhisperModel
from gtts import gTTS, lang
from kokoro import KPipeline
# Configure the Gemini API key. On Hugging Face Spaces this must be provided
# through the repository's Secrets; fail fast at import time when it is absent.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    raise ValueError("GEMINI_API_KEY environment variable not set. Please set it in the Hugging Face Spaces Secrets.")
# Load the faster-whisper model, preferring float16 and degrading to
# int8-on-CPU when half precision is not supported by the hardware.
model_size = "Systran/faster-whisper-large-v3"


def _load_whisper(repo_id):
    """Instantiate WhisperModel, falling back to int8/CPU on ValueError."""
    try:
        return WhisperModel(repo_id, device="auto", compute_type="float16")
    except ValueError:
        print("Float16 not supported, falling back to int8 on CPU")
        return WhisperModel(repo_id, device="cpu", compute_type="int8")


whisper_model = _load_whisper(model_size)
# Human-readable language name -> single-letter Kokoro TTS language code.
KOKORO_LANGUAGES = dict(
    [
        ("American English", "a"),
        ("British English", "b"),
        ("Japanese", "j"),
        ("Mandarin Chinese", "z"),
        ("Spanish", "e"),
        ("French", "f"),
        ("Hindi", "h"),
        ("Italian", "i"),
        ("Brazilian Portuguese", "p"),
    ]
)
# Speech-to-text via faster-whisper.
def transcribe_audio(audio_file):
    """Transcribe an audio file with faster-whisper.

    Returns a 3-tuple (transcription, detected_language, error). On
    success error is None; on failure the first two slots are None and
    error carries a human-readable message.
    """
    try:
        # NOTE: segments is a lazy generator, so iteration must stay inside
        # the try block — decoding errors surface while consuming it.
        segments, info = whisper_model.transcribe(audio_file, beam_size=5)
        pieces = [seg.text for seg in segments]
        return " ".join(pieces), info.language, None
    except Exception as exc:
        return None, None, f"Transcription error: {str(exc)}"
# Text translation via the Gemini API with a "translation only" prompt.
def translate_text(text, target_language):
    """Translate *text* into *target_language* using Gemini.

    Returns (translated_text, error): error is None on success,
    translated_text is None on failure.
    """
    try:
        gemini = genai.GenerativeModel("gemini-1.5-flash")
        # Prompt asks for the bare translation so no commentary needs stripping.
        magic_prompt = (
            f"Translate the following text to {target_language} and return "
            "only the translated text with no additional explanation or "
            f"commentary:\n\n{text}"
        )
        reply = gemini.generate_content(magic_prompt)
        return reply.text.strip(), None
    except Exception as exc:
        return None, f"Translation error: {str(exc)}"
# Function to convert text to speech using Kokoro or gTTS
def text_to_speech(text, language, tts_engine):
    """Synthesize speech for *text* and return a path to the audio file.

    Uses Kokoro when selected and the language is supported by it;
    otherwise falls back to gTTS. Returns (audio_path, error) where
    error is None on success and audio_path is None on failure.
    """
    try:
        if tts_engine == "Kokoro" and language in KOKORO_LANGUAGES:
            # Use Kokoro TTS
            lang_code = KOKORO_LANGUAGES[language]
            pipeline = KPipeline(lang_code=lang_code)
            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
            # FIX: previously only the first generated segment was kept
            # (loop broke immediately), truncating multi-line / long text.
            # Collect every segment and join them into one waveform.
            chunks = [np.asarray(audio) for _gs, _ps, audio in generator]
            if not chunks:
                raise ValueError("No audio generated by Kokoro")
            audio_data = chunks[0] if len(chunks) == 1 else np.concatenate(chunks)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
                # 24000 Hz is the sample rate of Kokoro's output segments
                sf.write(fp.name, audio_data, 24000)
            return fp.name, None
        else:
            # Fallback to gTTS: map the human-readable language name to a
            # gTTS language code, defaulting to English when no match exists.
            lang_map = lang.tts_langs()
            tts_lang = next((k for k, v in lang_map.items() if v.lower() == language.lower()), "en")
            tts = gTTS(text=text, lang=tts_lang, slow=False)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
                tts.save(fp.name)
            return fp.name, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"
# End-to-end pipeline used by the Gradio submit button.
def process_audio(audio_file, target_language, tts_engine):
    """Run transcribe -> translate -> synthesize for one input file.

    Returns (error_message, transcription, translated_text, audio_path).
    error_message is None on full success; the later slots are None for
    any stage that did not complete.
    """
    if audio_file is None:
        return "Please upload an audio file or record audio.", None, None, None

    transcription, detected_language, err = transcribe_audio(audio_file)
    if err:
        return err, None, None, None

    translated, err = translate_text(transcription, target_language)
    if err:
        return err, transcription, None, None

    speech_path, err = text_to_speech(translated, target_language, tts_engine)
    if err:
        return err, transcription, translated, None

    return None, transcription, translated, speech_path
# Gradio interface: two-column input row (audio + options), output row with
# error/transcription/translation textboxes and the synthesized audio player.
with gr.Blocks(title="AI Audio Translator") as demo:
    gr.Markdown("# AI Audio Translator")
    gr.Markdown("Upload an audio file or record via microphone, select a target language and TTS engine, and get the transcription, translation, and translated audio!")
    # Union of Kokoro-supported language names and gTTS language names
    # (gTTS's code->name mapping is inverted so its *names* become choices).
    supported_langs = list(set(list(KOKORO_LANGUAGES.keys()) + list({v: k for k, v in lang.tts_langs().items()}.keys())))
    with gr.Row():
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
        with gr.Column():
            target_lang = gr.Dropdown(
                choices=sorted(supported_langs),
                value="Spanish",
                label="Target Language"
            )
            tts_engine = gr.Radio(
                choices=["Kokoro", "gTTS"],
                value="gTTS",
                label="Text-to-Speech Engine"
            )
            submit_btn = gr.Button("Translate")
    with gr.Row():
        # error_output receives the first element of process_audio's tuple
        error_output = gr.Textbox(label="Error", visible=True)
        transcription_output = gr.Textbox(label="Transcription")
        translation_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Translated Audio")
    # Wire the button to the pipeline; output order matches process_audio's
    # (error, transcription, translation, audio_path) return tuple.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, target_lang, tts_engine],
        outputs=[error_output, transcription_output, translation_output, audio_output]
    )
# Launch the app
# FIX: removed the stray trailing "|" (table-extraction artifact) that made
# this line a syntax error.
demo.launch()