Spaces:
Running
Running
File size: 3,223 Bytes
5f13ae1 86b8225 5f13ae1 6606fbd 86b8225 6606fbd 86b8225 6606fbd 86b8225 5f13ae1 86b8225 5f13ae1 9711e69 86b8225 5f13ae1 6606fbd 86b8225 5f13ae1 86b8225 5e903c1 5f13ae1 5e903c1 5f13ae1 5e903c1 9711e69 5e903c1 9711e69 5e903c1 9711e69 86b8225 5f13ae1 86b8225 5f13ae1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import gradio as gr
import whisper
import time
from main import conversation_with_voice
# Load Whisper model
# The "base" checkpoint trades accuracy for load time; it converts the
# user's recorded speech to text for the voice tab (loaded once at startup).
model = whisper.load_model("base")
# Description displayed at the top of the UI
# NOTE: this is user-facing Markdown rendered by gr.Markdown — keep verbatim.
description = """
Proof Of Concept
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
**Why each tool was added:**
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
**Example questions you can ask:**
- What are the healthiest oils to cook with?
- How much water should I drink daily?
- What are good snacks for weight loss?
Created by Kara Granados
"""
def voice_to_voice(audio_file):
    """Transcribe a recorded question with Whisper and get the AI reply.

    Parameters:
        audio_file: filepath to the recorded audio (Gradio `Audio` with
            type="filepath"), or None when nothing was recorded.

    Returns:
        (response_text, audio_path) — audio_path is None when no audio
        was received or the backend reported an error.
    """
    if audio_file is None:
        return "No audio received", None

    started = time.time()
    # Speech -> text via Whisper, then text -> (reply text, reply audio).
    transcript = model.transcribe(audio_file)["text"]
    reply = conversation_with_voice(transcript)
    elapsed = time.time() - started
    print(f"Total processing time (voice input): {elapsed:.2f} seconds")

    if "error" in reply:
        return reply.get("error"), None
    return reply["text_response"], reply["audio_path"]
def text_to_voice(text_input):
    """Send a typed question to the AI and return its spoken reply.

    Parameters:
        text_input: the user's question from the Gradio textbox; may be
            None or whitespace-only.

    Returns:
        (response_text, audio_path) — audio_path is None when the input
        is blank or the backend reported an error.
    """
    # Gradio can deliver None (e.g. a cleared textbox); guard before
    # calling .strip() to avoid an AttributeError.
    if not text_input or not text_input.strip():
        return "Please enter a question.", None
    start_time = time.time()
    response = conversation_with_voice(text_input)
    end_time = time.time()
    print(f"Total processing time (text input): {end_time - start_time:.2f} seconds")
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
# Build the two-tab Gradio UI: one tab for spoken input, one for typed input.
with gr.Blocks(title="Sesame AI POC") as demo:
    gr.Markdown("# Sesame AI POC")
    gr.Markdown(description)

    with gr.Tab("Speak to Sesame"):
        spoken_question = gr.Audio(type="filepath", label="Speak your question")
        spoken_reply_text = gr.Textbox(label="AI Response")
        spoken_reply_audio = gr.Audio(label="Sesame AI Voice")
        gr.Button("Submit Voice").click(
            fn=voice_to_voice,
            inputs=spoken_question,
            outputs=[spoken_reply_text, spoken_reply_audio],
        )

    with gr.Tab("Type to Sesame"):
        typed_question = gr.Textbox(
            label="Enter your question",
            placeholder="E.g. What are healthy oils to cook with?",
        )
        typed_reply_text = gr.Textbox(label="AI Response")
        typed_reply_audio = gr.Audio(label="Sesame AI Voice")
        gr.Button("Submit Text").click(
            fn=text_to_voice,
            inputs=typed_question,
            outputs=[typed_reply_text, typed_reply_audio],
        )

    gr.Markdown("""
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
""")

demo.launch()
|