import os
import gradio as gr
from faster_whisper import WhisperModel
import google.generativeai as genai
from gtts import gTTS, lang
import tempfile
import numpy as np
import soundfile as sf
from kokoro import KPipeline

# Configure Gemini API (use environment variable for Hugging Face Spaces)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set. Please set it in the Hugging Face Spaces Secrets.")
genai.configure(api_key=GEMINI_API_KEY)

# Initialize the faster-whisper model with fallback compute type
model_size = "Systran/faster-whisper-large-v3"
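# WhisperModel accepts a Hugging Face repo ID here; ctranslate2 raises a
# ValueError when the requested compute type is unsupported on the target
# device (float16 generally needs a GPU), hence the int8/CPU fallback below.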
try:
    whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
except ValueError:
    print("Float16 not supported, falling back to int8 on CPU")
    whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")

# Language codes for Kokoro TTS
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Japanese": "j",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p"
}
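# The single-letter codes above are what kokoro.KPipeline expects as its
# lang_code argument (e.g. "a" selects the American English pipeline).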

# Function to transcribe audio using faster-whisper
def transcribe_audio(audio_file):
    try:
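        # beam_size=5 trades decoding speed for accuracy; the returned info
        # object includes Whisper's detected source language.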
        segments, info = whisper_model.transcribe(audio_file, beam_size=5)
        transcription = " ".join([segment.text for segment in segments])
        detected_language = info.language
        return transcription, detected_language, None
    except Exception as e:
        return None, None, f"Transcription error: {str(e)}"

# Function to translate text using the Gemini API; the prompt requests only the translated text
def translate_text(text, target_language):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = f"Translate the following text to {target_language} and return only the translated text with no additional explanation or commentary:\n\n{text}"
        response = model.generate_content(prompt)
        translated_text = response.text.strip()
        return translated_text, None
    except Exception as e:
        return None, f"Translation error: {str(e)}"

# Function to convert text to speech using Kokoro or gTTS based on language
def text_to_speech(text, language):
    try:
        # Check if the language is supported by Kokoro
        if language in KOKORO_LANGUAGES:
            # Use Kokoro TTS. Note: "af_heart" is an American English voice;
            # for non-English output a language-matched Kokoro voice may sound
            # more natural.
            lang_code = KOKORO_LANGUAGES[language]
            pipeline = KPipeline(lang_code=lang_code)
            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
            # Collect every generated chunk so multi-paragraph text is not
            # truncated to the first segment.
            audio_segments = [np.asarray(audio) for _, _, audio in generator]
            if not audio_segments:
                raise ValueError("No audio generated by Kokoro")
            audio_data = np.concatenate(audio_segments) if len(audio_segments) > 1 else audio_segments[0]
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
                sf.write(fp.name, audio_data, 24000)  # Kokoro outputs 24 kHz audio
                return fp.name, None
        else:
            # Fallback to gTTS
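            # gTTS's lang.tts_langs() maps ISO codes to display names, so we
            # reverse-match the selected name back to a code, defaulting to English.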
            lang_map = lang.tts_langs()
            tts_lang = next((k for k, v in lang_map.items() if v.lower() == language.lower()), "en")
            tts = gTTS(text=text, lang=tts_lang, slow=False)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
                tts.save(fp.name)
                return fp.name, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"

# Main function to process audio input and return outputs
def process_audio(audio_file, target_language):
    if audio_file is None:
        return "Please upload an audio file or record audio.", None, None, None
    
    transcription, detected_language, error = transcribe_audio(audio_file)
    if error:
        return error, None, None, None
    
    translated_text, error = translate_text(transcription, target_language)
    if error:
        return error, transcription, None, None
    
    audio_output, error = text_to_speech(translated_text, target_language)
    if error:
        return error, transcription, translated_text, None
    
    return None, transcription, translated_text, audio_output

# Gradio interface
with gr.Blocks(title="AI Audio Translator") as demo:
    gr.Markdown("# AI Audio Translator")
    gr.Markdown("Upload an audio file or record via microphone, select a target language, and get the transcription, translation, and translated audio! Uses Kokoro TTS for supported languages, otherwise gTTS.")
    
    supported_langs = list(set(KOKORO_LANGUAGES) | set(lang.tts_langs().values()))
    
    with gr.Row():
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
        target_lang = gr.Dropdown(
            choices=sorted(supported_langs),
            value="Spanish",
            label="Target Language"
        )
    
    submit_btn = gr.Button("Translate")
    
    with gr.Row():
        error_output = gr.Textbox(label="Error", visible=True)
        transcription_output = gr.Textbox(label="Transcription")
        translation_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Translated Audio")
    
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, target_lang],
        outputs=[error_output, transcription_output, translation_output, audio_output]
    )

# Launch the app
demo.launch()
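
# When running locally (outside Spaces), a temporary public link can be
# requested with demo.launch(share=True) instead.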