Spaces:
Running
Running
File size: 3,223 Bytes
5f13ae1 86b8225 5f13ae1 6606fbd 86b8225 6606fbd 86b8225 6606fbd 86b8225 5f13ae1 86b8225 5f13ae1 9711e69 86b8225 5f13ae1 6606fbd 86b8225 5f13ae1 86b8225 5e903c1 5f13ae1 5e903c1 5f13ae1 5e903c1 9711e69 5e903c1 9711e69 5e903c1 9711e69 86b8225 5f13ae1 86b8225 5f13ae1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import gradio as gr
import whisper
import time
from main import conversation_with_voice
# Load Whisper model
# The "base" checkpoint trades accuracy for load time; it converts the
# user's recorded speech to text for the voice tab (loaded once at startup).
model = whisper.load_model("base")
# Description displayed at the top of the UI
# NOTE: this is user-facing Markdown rendered by gr.Markdown — keep verbatim.
description = """
Proof Of Concept
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
**Why each tool was added:**
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
**Example questions you can ask:**
- What are the healthiest oils to cook with?
- How much water should I drink daily?
- What are good snacks for weight loss?
Created by Kara Granados
"""
def voice_to_voice(audio_file):
    """Transcribe a recorded question with Whisper and get the AI reply.

    Parameters:
        audio_file: filepath to the recorded audio (Gradio `Audio` with
            type="filepath"), or None when nothing was recorded.

    Returns:
        (response_text, audio_path) — audio_path is None when no audio
        was received or the backend reported an error.
    """
    if audio_file is None:
        return "No audio received", None

    started = time.time()
    # Speech -> text via Whisper, then text -> (reply text, reply audio).
    transcript = model.transcribe(audio_file)["text"]
    reply = conversation_with_voice(transcript)
    elapsed = time.time() - started
    print(f"Total processing time (voice input): {elapsed:.2f} seconds")

    if "error" in reply:
        return reply.get("error"), None
    return reply["text_response"], reply["audio_path"]
def text_to_voice(text_input):
    """Send a typed question to the AI and return its spoken reply.

    Parameters:
        text_input: the user's question from the Gradio textbox; may be
            None or whitespace-only.

    Returns:
        (response_text, audio_path) — audio_path is None when the input
        is blank or the backend reported an error.
    """
    # Gradio can deliver None (e.g. a cleared textbox); guard before
    # calling .strip() to avoid an AttributeError.
    if not text_input or not text_input.strip():
        return "Please enter a question.", None
    start_time = time.time()
    response = conversation_with_voice(text_input)
    end_time = time.time()
    print(f"Total processing time (text input): {end_time - start_time:.2f} seconds")
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
# Build the two-tab Gradio UI: one tab for spoken input, one for typed input.
with gr.Blocks(title="Sesame AI POC") as demo:
    gr.Markdown("# Sesame AI POC")
    gr.Markdown(description)

    with gr.Tab("Speak to Sesame"):
        spoken_question = gr.Audio(type="filepath", label="Speak your question")
        spoken_reply_text = gr.Textbox(label="AI Response")
        spoken_reply_audio = gr.Audio(label="Sesame AI Voice")
        gr.Button("Submit Voice").click(
            fn=voice_to_voice,
            inputs=spoken_question,
            outputs=[spoken_reply_text, spoken_reply_audio],
        )

    with gr.Tab("Type to Sesame"):
        typed_question = gr.Textbox(
            label="Enter your question",
            placeholder="E.g. What are healthy oils to cook with?",
        )
        typed_reply_text = gr.Textbox(label="AI Response")
        typed_reply_audio = gr.Audio(label="Sesame AI Voice")
        gr.Button("Submit Text").click(
            fn=text_to_voice,
            inputs=typed_question,
            outputs=[typed_reply_text, typed_reply_audio],
        )

    gr.Markdown("""
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
""")

demo.launch()
|