# Gradio demo app: voice/text conversation POC (Whisper STT + LLaMA 3 + Sesame AI TTS).
import time

import gradio as gr
import whisper

from main import conversation_with_voice

# Load the Whisper speech-to-text model once at startup.
# "base" is the small multilingual checkpoint — a reasonable speed/accuracy
# trade-off for a free-tier demo; swap for "small"/"medium" if accuracy matters more.
model = whisper.load_model("base")
# Description displayed at the top of the UI | |
description = """ | |
Proof Of Concept | |
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities. | |
**Why each tool was added:** | |
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech. | |
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions. | |
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech. | |
**Example questions you can ask:** | |
- What are the healthiest oils to cook with? | |
- How much water should I drink daily? | |
- What are good snacks for weight loss? | |
Created by Kara Granados | |
""" | |
def voice_to_voice(audio_file):
    """Transcribe a spoken question with Whisper, then answer it with voice.

    Args:
        audio_file: Filesystem path to the recorded audio clip, or ``None``
            when the Gradio microphone component produced no recording.

    Returns:
        Tuple of ``(response_text, audio_path)``. ``audio_path`` is ``None``
        when no audio was received or the pipeline reported an error.
    """
    if audio_file is None:
        return "No audio received", None
    start_time = time.time()
    # Whisper speech-to-text: Sesame AI only accepts text input, so spoken
    # questions must be transcribed first.
    result = model.transcribe(audio_file)
    user_input = result["text"]
    response = conversation_with_voice(user_input)
    end_time = time.time()
    print(f"Total processing time (voice input): {end_time - start_time:.2f} seconds")
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
def text_to_voice(text_input):
    """Answer a typed question and return the reply as text plus speech.

    Args:
        text_input: The user's question as a string.

    Returns:
        Tuple of ``(response_text, audio_path)``. ``audio_path`` is ``None``
        when the input was blank or the pipeline reported an error.
    """
    if not text_input.strip():
        return "Please enter a question.", None
    start_time = time.time()
    response = conversation_with_voice(text_input)
    end_time = time.time()
    print(f"Total processing time (text input): {end_time - start_time:.2f} seconds")
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
# Build the two-tab UI: one tab for spoken questions, one for typed questions.
with gr.Blocks(title="Sesame AI POC") as demo:
    gr.Markdown("# Sesame AI POC")
    gr.Markdown(description)

    with gr.Tab("Speak to Sesame"):
        # type="filepath" hands voice_to_voice a path on disk for Whisper.
        mic_input = gr.Audio(type="filepath", label="Speak your question")
        mic_output_text = gr.Textbox(label="AI Response")
        mic_output_audio = gr.Audio(label="Sesame AI Voice")
        mic_button = gr.Button("Submit Voice")
        mic_button.click(
            fn=voice_to_voice,
            inputs=mic_input,
            outputs=[mic_output_text, mic_output_audio],
        )

    with gr.Tab("Type to Sesame"):
        text_input = gr.Textbox(
            label="Enter your question",
            placeholder="E.g. What are healthy oils to cook with?",
        )
        text_output_text = gr.Textbox(label="AI Response")
        text_output_audio = gr.Audio(label="Sesame AI Voice")
        text_button = gr.Button("Submit Text")
        text_button.click(
            fn=text_to_voice,
            inputs=text_input,
            outputs=[text_output_text, text_output_audio],
        )

    gr.Markdown("""
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.

**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
""")

# Launch only when executed as a script (e.g. `python app.py` on Spaces),
# so importing this module for tests does not start a server.
if __name__ == "__main__":
    demo.launch()