# Sesame-AI-POC / app.py
import gradio as gr
import whisper
import time
from main import conversation_with_voice
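# conversation_with_voice lives in main.py (not shown here). Judging from how the
# handlers below consume it, it is expected to return a dict containing
# "text_response" and "audio_path" on success, or an "error" key on failure.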
# Load Whisper model
model = whisper.load_model("base")
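# "base" is one of the smaller Whisper checkpoints, which keeps load time and
# transcription latency low on free-tier hardware; larger checkpoints such as
# "small" or "medium" would improve accuracy at the cost of speed.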
# Description displayed at the top of the UI
description = """
Proof Of Concept
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
**Why each tool was added:**
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
**Example questions you can ask:**
- What are the healthiest oils to cook with?
- How much water should I drink daily?
- What are good snacks for weight loss?
Created by Kara Granados
"""
def voice_to_voice(audio_file):
    """Transcribe the recorded question with Whisper, then run the conversation pipeline."""
    if audio_file is None:
        return "No audio received", None

    start_time = time.time()

    # Speech-to-text with Whisper
    result = model.transcribe(audio_file)
    user_input = result["text"]

    # LLaMA 3 response + Sesame AI speech
    response = conversation_with_voice(user_input)

    end_time = time.time()
    print(f"Total processing time (voice input): {end_time - start_time:.2f} seconds")

    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
def text_to_voice(text_input):
    """Send a typed question straight to the conversation pipeline (no transcription step)."""
    if not text_input.strip():
        return "Please enter a question.", None

    start_time = time.time()
    response = conversation_with_voice(text_input)
    end_time = time.time()
    print(f"Total processing time (text input): {end_time - start_time:.2f} seconds")

    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
with gr.Blocks(title="Sesame AI POC") as demo:
    gr.Markdown("# Sesame AI POC")
    gr.Markdown(description)

    with gr.Tab("Speak to Sesame"):
        mic_input = gr.Audio(type="filepath", label="Speak your question")
        mic_output_text = gr.Textbox(label="AI Response")
        mic_output_audio = gr.Audio(label="Sesame AI Voice")
        mic_button = gr.Button("Submit Voice")
        mic_button.click(fn=voice_to_voice, inputs=mic_input, outputs=[mic_output_text, mic_output_audio])

    with gr.Tab("Type to Sesame"):
        text_input = gr.Textbox(label="Enter your question", placeholder="E.g. What are healthy oils to cook with?")
        text_output_text = gr.Textbox(label="AI Response")
        text_output_audio = gr.Audio(label="Sesame AI Voice")
        text_button = gr.Button("Submit Text")
        text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])
gr.Markdown("""
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
""")
demo.launch()
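# Note: on Hugging Face Spaces the default launch() settings are sufficient; when
# running locally, demo.launch(share=True) would additionally create a temporary
# public link for sharing the demo.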