Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -1,21 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
|
|
|
| 2 |
from fastapi import FastAPI
|
| 3 |
from fastapi.responses import RedirectResponse
|
| 4 |
-
from fastrtc import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from gradio.utils import get_space
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
stream = Stream(
|
| 15 |
-
handler=ReplyOnPause(
|
| 16 |
modality="audio",
|
| 17 |
mode="send-receive",
|
| 18 |
-
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
| 19 |
concurrency_limit=20 if get_space() else None,
|
| 20 |
)
|
| 21 |
|
|
@@ -35,9 +80,9 @@ if __name__ == "__main__":
|
|
| 35 |
import os
|
| 36 |
|
| 37 |
if (mode := os.getenv("MODE")) == "UI":
|
| 38 |
-
stream.ui.launch(server_port=7860
|
| 39 |
elif mode == "PHONE":
|
| 40 |
-
stream.fastphone(
|
| 41 |
else:
|
| 42 |
import uvicorn
|
| 43 |
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
import numpy as np
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
from fastapi import FastAPI
|
| 7 |
from fastapi.responses import RedirectResponse
|
| 8 |
+
from fastrtc import (
|
| 9 |
+
ReplyOnPause,
|
| 10 |
+
Stream,
|
| 11 |
+
get_stt_model,
|
| 12 |
+
get_tts_model,
|
| 13 |
+
)
|
| 14 |
from gradio.utils import get_space
|
| 15 |
+
from numpy.typing import NDArray
|
| 16 |
+
from openai import OpenAI
|
| 17 |
+
|
| 18 |
+
# Pull SAMBANOVA_API_KEY (and any other secrets) from a local .env file.
load_dotenv()

# SambaNova serves an OpenAI-compatible API, so the stock OpenAI client works.
sambanova_client = OpenAI(
    api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
)

# FastRTC's bundled speech-to-text / text-to-speech models.
stt_model = get_stt_model()
tts_model = get_tts_model()

# Mutable conversation state shared across turns; `echo` appends to it.
chat_history = [
    {
        "role": "system",
        "content": (
            # Trailing space is required: these adjacent literals are
            # concatenated, and without it the prompt read
            # "...conversation.Please keep...".
            "You are a helpful assistant having a spoken conversation. "
            "Please keep your answers short and concise."
        ),
    }
]
|
| 36 |
|
| 37 |
+
|
| 38 |
+
def echo(audio: tuple[int, NDArray[np.int16]]):
    """Handle one spoken turn: transcribe, query the LLM, stream speech back.

    Appends both the user utterance and the assistant reply to the shared
    ``chat_history``, and yields TTS audio chunks as they are produced.
    """
    transcript = stt_model.stt(audio)
    print("prompt", transcript)
    chat_history.append({"role": "user", "content": transcript})

    llm_started = time.time()
    completion = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=chat_history,
        max_tokens=200,
    )
    print("time taken inference", time.time() - llm_started)

    reply = completion.choices[0].message.content
    chat_history.append({"role": "assistant", "content": reply})

    tts_started = time.time()
    # Stream synthesized audio to the caller chunk by chunk.
    yield from tts_model.stream_tts_sync(reply)
    print("time taken tts", time.time() - tts_started)
|
| 57 |
|
| 58 |
|
| 59 |
# WebRTC audio stream: ReplyOnPause waits for the speaker to pause, then
# invokes `echo` to generate the response audio.
stream = Stream(
    handler=ReplyOnPause(echo),
    modality="audio",
    mode="send-receive",
    rtc_configuration=None,  # get_twilio_turn_credentials() if get_space() else None,
    # Limit simultaneous sessions when running on Hugging Face Spaces.
    concurrency_limit=20 if get_space() else None,
)
|
| 66 |
|
|
|
|
| 80 |
import os
|
| 81 |
|
| 82 |
if (mode := os.getenv("MODE")) == "UI":
|
| 83 |
+
stream.ui.launch(server_port=7860)
|
| 84 |
elif mode == "PHONE":
|
| 85 |
+
stream.fastphone(port=7860)
|
| 86 |
else:
|
| 87 |
import uvicorn
|
| 88 |
|
demo.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
|
| 6 |
+
# SambaNova exposes an OpenAI-compatible endpoint; reuse the OpenAI client.
sambanova_client = OpenAI(
    api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
)

# FastRTC's default speech-to-text / text-to-speech models.
stt_model = get_stt_model()
tts_model = get_tts_model()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def echo(audio):
    """Transcribe one utterance, fetch a single-turn LLM reply, and speak it."""
    user_text = stt_model.stt(audio)
    completion = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=[{"role": "user", "content": user_text}],
        max_tokens=200,
    )
    reply_text = completion.choices[0].message.content
    # Yield synthesized audio chunks as the TTS model produces them.
    yield from tts_model.stream_tts_sync(reply_text)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Minimal wiring: ReplyOnPause handles voice-activity detection / turn taking.
stream = Stream(ReplyOnPause(echo), modality="audio", mode="send-receive")

# Launch the built-in Gradio test UI (blocks until the server is stopped).
stream.ui.launch()
|
script.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Hi, I'm Freddy and I want to give a tour of FastRTC - the real-time communication library for Python.
|
| 2 |
+
|
| 3 |
+
Let's start with the basics - echoing audio.
|
| 4 |
+
|
| 5 |
+
In FastRTC, you can wrap any iterator with `ReplyOnPause` and pass it to the `Stream` class.
|
| 6 |
+
|
| 7 |
+
This will create a WebRTC-powered web server that handles voice detection and turn taking - you just worry about the logic for generating the response.
|
| 8 |
+
|
| 9 |
+
Each stream comes with a built-in WebRTC-powered Gradio UI that you can use for testing.
|
| 10 |
+
|
| 11 |
+
Simply call `ui.launch()`. Let's see it in action.
|
| 12 |
+
|
| 13 |
+
We can level up our application by having an LLM generate the response.
|
| 14 |
+
|
| 15 |
+
We'll import the SambaNova API as well as some FastRTC utils for doing speech-to-text and text-to-speech and then pipe them all together.
|
| 16 |
+
|
| 17 |
+
Importantly, you can use any LLM, speech-to-text, or text-to-speech model. Even an audio-to-audio model.
|
| 18 |
+
Bring the tools you love and we'll just handle the real-time communication.
|
| 19 |
+
|
| 20 |
+
You can also call into the stream for FREE if you have a Hugging Face Token.
|
| 21 |
+
|
| 22 |
+
Finally, deployment is really easy too. You can stick with Gradio or mount the stream in a FastAPI app and build any application you want. By the way, video is supported too!
|
| 23 |
+
|
| 24 |
+
Thanks for watching! Please visit fastrtc.org to see the cookbook for all the demos shown here as well as the docs.
|
| 25 |
+
|
| 26 |
+
|