# Hugging Face Space: mgokg/voicebot (status when captured: sleeping)
# File size: 1,333 bytes
# Revisions shown in the page's blame gutter:
#   3e9fe0d, 51a7232, 216a1d9, e218c4e, 7e33221, 0d59c05, 30b0477

import gradio as gr
import torch
from transformers import pipeline


# Load the models once, at start-up.
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Whisper speech-recognition pipeline used by the chatbot handler below.
speech_to_text = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=device,
)
# Text-to-speech is currently disabled:
# text_to_speech = pipeline("text-to-speech", model="facebook/seamless-streaming", device=device)

def audio_to_audio_chatbot(audio):
    """Transcribe an audio clip and return the recognised text.

    Args:
        audio: The value delivered by the audio input (the interface in this
            file configures it as a file path), or ``None`` when no audio
            has been provided.

    Returns:
        str: The transcription produced by the module-level
        ``speech_to_text`` pipeline, or a German hint asking the user to
        upload audio when ``audio`` is ``None``.
    """
    if audio is None:
        # Return a single string, not a tuple: the handler feeds exactly one
        # text output, so a tuple would be rendered as its repr.
        return "Bitte eine Audio-Datei hochladen."

    # 1. Speech-to-text.
    text = speech_to_text(audio)["text"]

    # NOTE(review): the text-to-text (echo bot) and text-to-speech stages
    # that used to follow an early ``return`` were unreachable and have been
    # removed; the transcription is the only result until a TTS pipeline is
    # enabled and an audio output is wired into the interface.
    return text

if __name__ == "__main__":
    # Create the widgets first, then wire them to the handler.
    audio_input = gr.Audio(type="filepath")  # handler receives a file path
    text_output = gr.Textbox()
    # Previously considered output layout: [gr.Audio(), gr.Textbox()]

    # NOTE(review): title and description announce an audio reply, but only a
    # text box is wired as output at the moment.
    iface = gr.Interface(
        fn=audio_to_audio_chatbot,
        inputs=audio_input,
        outputs=text_output,
        title="Audio-zu-Audio-Chatbot (Streaming)",
        description="Spreche in das Mikrofon und der Bot antwortet mit einer Audio-Ausgabe.",
        # live=True re-runs the handler automatically whenever the input
        # changes, so no submit button is needed.
        live=True,
    )
    iface.launch()