File size: 5,694 Bytes
dbe8a71
 
 
 
 
 
9dbf879
 
dbe8a71
413a70d
 
 
 
dbe8a71
 
413a70d
dbe8a71
413a70d
 
 
 
 
dbe8a71
9dbf879
 
 
 
 
 
 
 
 
 
 
 
 
dbe8a71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dbf879
 
dbe8a71
9dbf879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbe8a71
 
 
 
9dbf879
413a70d
 
 
dbe8a71
 
 
 
 
 
 
 
9dbf879
dbe8a71
 
 
 
 
 
 
 
9dbf879
dbe8a71
9dbf879
dbe8a71
 
 
9dbf879
 
 
 
 
 
 
 
 
 
 
dbe8a71
 
 
 
 
 
 
 
 
 
 
9dbf879
dbe8a71
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import numpy as np
import soundfile as sf
from faster_whisper import WhisperModel
from gtts import gTTS, lang
from kokoro import KPipeline

# Gemini API setup: the key comes from the environment (a Hugging Face Spaces
# secret); fail fast at import time if it is missing.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise ValueError(
        "GEMINI_API_KEY environment variable not set. Please set it in the Hugging Face Spaces Secrets."
    )
genai.configure(api_key=GEMINI_API_KEY)

# Speech-to-text model. Prefer float16 (GPU-friendly); if the backend rejects
# that compute type, degrade to int8 on CPU.
model_size = "Systran/faster-whisper-large-v3"


def _load_whisper(size):
    """Load a faster-whisper model, falling back to int8/CPU when float16 is unsupported."""
    try:
        return WhisperModel(size, device="auto", compute_type="float16")
    except ValueError:
        print("Float16 not supported, falling back to int8 on CPU")
        return WhisperModel(size, device="cpu", compute_type="int8")


whisper_model = _load_whisper(model_size)

# Language codes for Kokoro TTS
# Maps the human-readable language names shown in the UI dropdown to the
# single-letter lang_code values passed to kokoro.KPipeline. Languages not
# listed here fall back to gTTS in text_to_speech().
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Japanese": "j",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p"
}

# Speech-to-text step: run faster-whisper over an audio file.
def transcribe_audio(audio_file):
    """Transcribe an audio file with faster-whisper.

    Returns a (transcription, detected_language, error) triple: on success
    error is None; on failure transcription and detected_language are None
    and error carries a message.
    """
    try:
        # The join stays inside the try block: segments is evaluated lazily,
        # so decoding errors surface during iteration.
        segments, info = whisper_model.transcribe(audio_file, beam_size=5)
        text = " ".join(seg.text for seg in segments)
        return text, info.language, None
    except Exception as exc:
        return None, None, f"Transcription error: {str(exc)}"

# Translation step: ask Gemini for a bare translation (no commentary).
def translate_text(text, target_language):
    """Translate `text` into `target_language` via the Gemini API.

    Returns (translated_text, error); on failure translated_text is None
    and error carries a message.
    """
    try:
        gemini = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            f"Translate the following text to {target_language} and return only the "
            f"translated text with no additional explanation or commentary:\n\n{text}"
        )
        reply = gemini.generate_content(prompt)
        return reply.text.strip(), None
    except Exception as exc:
        return None, f"Translation error: {str(exc)}"

# Function to convert text to speech using Kokoro or gTTS
def text_to_speech(text, language, tts_engine):
    """Synthesize `text` into a temporary audio file.

    Uses Kokoro when `tts_engine` is "Kokoro" and `language` is in
    KOKORO_LANGUAGES; otherwise falls back to gTTS.

    Args:
        text: Text to speak.
        language: Human-readable language name (e.g. "Spanish").
        tts_engine: "Kokoro" or "gTTS".

    Returns:
        (path, error): path to a temp .wav (Kokoro) or .mp3 (gTTS) file with
        error None on success, or (None, message) on failure.
    """
    try:
        if tts_engine == "Kokoro" and language in KOKORO_LANGUAGES:
            # Use Kokoro TTS
            lang_code = KOKORO_LANGUAGES[language]
            pipeline = KPipeline(lang_code=lang_code)
            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
            # BUGFIX: previously only the first generated segment was kept,
            # truncating any input that Kokoro split on newlines. Collect and
            # concatenate every segment instead.
            segments = [np.asarray(audio) for _, _, audio in generator]
            if not segments:
                raise ValueError("No audio generated by Kokoro")
            audio_data = segments[0] if len(segments) == 1 else np.concatenate(segments)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
                sf.write(fp.name, audio_data, 24000)  # 24 kHz, as in the original write call
                return fp.name, None
        else:
            # Fallback to gTTS: map the display name to a gTTS language code,
            # defaulting to English when no match is found.
            lang_map = lang.tts_langs()
            tts_lang = next((k for k, v in lang_map.items() if v.lower() == language.lower()), "en")
            tts = gTTS(text=text, lang=tts_lang, slow=False)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
                tts.save(fp.name)
                return fp.name, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"

# Pipeline driver: transcribe -> translate -> synthesize, stopping at the
# first stage that reports an error.
def process_audio(audio_file, target_language, tts_engine):
    """Run the full pipeline for one input.

    Returns (error, transcription, translated_text, audio_path). On success
    error is None; on failure error carries a message and the later slots
    hold whatever stages completed (or None).
    """
    if audio_file is None:
        return "Please upload an audio file or record audio.", None, None, None

    transcription, _detected_language, err = transcribe_audio(audio_file)
    if err:
        return err, None, None, None

    translated, err = translate_text(transcription, target_language)
    if err:
        return err, transcription, None, None

    speech_path, err = text_to_speech(translated, target_language, tts_engine)
    if err:
        return err, transcription, translated, None

    return None, transcription, translated, speech_path

# Gradio interface: input audio + options on top, Translate button, then the
# error/transcription/translation/audio outputs in one row.
with gr.Blocks(title="AI Audio Translator") as demo:
    gr.Markdown("# AI Audio Translator")
    gr.Markdown("Upload an audio file or record via microphone, select a target language and TTS engine, and get the transcription, translation, and translated audio!")
    
    # Dropdown choices: union of Kokoro display names and gTTS language names
    # (inverting tts_langs() and taking the keys de-duplicates display names).
    supported_langs = list(set(list(KOKORO_LANGUAGES.keys()) + list({v: k for k, v in lang.tts_langs().items()}.keys())))
    
    with gr.Row():
        # type="filepath" so process_audio receives a path it can hand to whisper.
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
        with gr.Column():
            target_lang = gr.Dropdown(
                choices=sorted(supported_langs),
                value="Spanish",
                label="Target Language"
            )
            tts_engine = gr.Radio(
                choices=["Kokoro", "gTTS"],
                value="gTTS",
                label="Text-to-Speech Engine"
            )
    
    submit_btn = gr.Button("Translate")
    
    with gr.Row():
        # Output order mirrors process_audio's return tuple.
        error_output = gr.Textbox(label="Error", visible=True)
        transcription_output = gr.Textbox(label="Transcription")
        translation_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Translated Audio")
    
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, target_lang, tts_engine],
        outputs=[error_output, transcription_output, translation_output, audio_output]
    )

# Launch the app
demo.launch()