Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
@@ -1,21 +1,66 @@
|
|
|
|
|
|
|
|
1 |
import numpy as np
|
|
|
2 |
from fastapi import FastAPI
|
3 |
from fastapi.responses import RedirectResponse
|
4 |
-
from fastrtc import
|
|
|
|
|
|
|
|
|
|
|
5 |
from gradio.utils import get_space
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
stream = Stream(
|
15 |
-
handler=ReplyOnPause(
|
16 |
modality="audio",
|
17 |
mode="send-receive",
|
18 |
-
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
19 |
concurrency_limit=20 if get_space() else None,
|
20 |
)
|
21 |
|
@@ -35,9 +80,9 @@ if __name__ == "__main__":
|
|
35 |
import os
|
36 |
|
37 |
if (mode := os.getenv("MODE")) == "UI":
|
38 |
-
stream.ui.launch(server_port=7860
|
39 |
elif mode == "PHONE":
|
40 |
-
stream.fastphone(
|
41 |
else:
|
42 |
import uvicorn
|
43 |
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
|
4 |
import numpy as np
|
5 |
+
from dotenv import load_dotenv
|
6 |
from fastapi import FastAPI
|
7 |
from fastapi.responses import RedirectResponse
|
8 |
+
from fastrtc import (
|
9 |
+
ReplyOnPause,
|
10 |
+
Stream,
|
11 |
+
get_stt_model,
|
12 |
+
get_tts_model,
|
13 |
+
)
|
14 |
from gradio.utils import get_space
|
15 |
+
from numpy.typing import NDArray
|
16 |
+
from openai import OpenAI
|
17 |
+
|
18 |
+
# Load SAMBANOVA_API_KEY (and any other secrets) from a local .env file.
load_dotenv()

# SambaNova exposes an OpenAI-compatible REST API, so the stock OpenAI
# client works once pointed at their base URL.
sambanova_client = OpenAI(
    api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
)

# Local speech-to-text / text-to-speech models bundled with fastrtc.
stt_model = get_stt_model()
tts_model = get_tts_model()

# Shared conversation state seeded with the system prompt.
# NOTE(review): module-level history is shared by every connected caller;
# fine for a single-user demo, not safe under concurrency — confirm intent.
chat_history = [
    {
        "role": "system",
        "content": (
            # Trailing space added: the two adjacent literals previously
            # concatenated into "...conversation.Please keep...".
            "You are a helpful assistant having a spoken conversation. "
            "Please keep your answers short and concise."
        ),
    }
]
|
36 |
|
37 |
+
|
38 |
+
def echo(audio: tuple[int, NDArray[np.int16]]):
    """Voice-chat handler: transcribe the caller, query the LLM, speak the reply.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by fastrtc.

    Yields:
        Audio chunks produced by the TTS model for the assistant's answer.
    """
    prompt = stt_model.stt(audio)
    print("prompt", prompt)
    chat_history.append({"role": "user", "content": prompt})

    # perf_counter is monotonic, so the reported durations cannot jump or go
    # negative if the system wall clock is adjusted mid-call (time.time can).
    start_time = time.perf_counter()
    response = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=chat_history,
        max_tokens=200,
    )
    print("time taken inference", time.perf_counter() - start_time)

    # Distinct name for the assistant's text; the original reused ``prompt``
    # for two unrelated values, which obscured the flow.
    reply = response.choices[0].message.content
    chat_history.append({"role": "assistant", "content": reply})

    start_time = time.perf_counter()
    for audio_chunk in tts_model.stream_tts_sync(reply):
        yield audio_chunk
    print("time taken tts", time.perf_counter() - start_time)
|
57 |
|
58 |
|
59 |
# Cap concurrent sessions only when hosted on a Hugging Face Space;
# locally there is no limit.
max_concurrency = 20 if get_space() else None

stream = Stream(
    handler=ReplyOnPause(echo),
    modality="audio",
    mode="send-receive",
    # TURN credentials temporarily disabled; original expression kept for reference.
    rtc_configuration=None,  # get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=max_concurrency,
)
|
66 |
|
|
|
80 |
import os
|
81 |
|
82 |
if (mode := os.getenv("MODE")) == "UI":
|
83 |
+
stream.ui.launch(server_port=7860)
|
84 |
elif mode == "PHONE":
|
85 |
+
stream.fastphone(port=7860)
|
86 |
else:
|
87 |
import uvicorn
|
88 |
|
demo.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
|
4 |
+
from openai import OpenAI
|
5 |
+
|
6 |
+
# SambaNova's endpoint speaks the OpenAI wire protocol, so the standard
# client works once pointed at their base URL.
sambanova_client = OpenAI(
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# fastrtc-bundled local models for transcription and speech synthesis.
stt_model = get_stt_model()
tts_model = get_tts_model()
|
12 |
+
|
13 |
+
|
14 |
+
def echo(audio):
    """Transcribe the caller's speech, ask the LLM, and speak the answer back."""
    text = stt_model.stt(audio)
    completion = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=[{"role": "user", "content": text}],
        max_tokens=200,
    )
    reply = completion.choices[0].message.content
    # stream_tts_sync yields audio incrementally, so playback can begin
    # before the whole reply has been synthesized.
    yield from tts_model.stream_tts_sync(reply)
|
24 |
+
|
25 |
+
|
26 |
+
# Wrapping the handler in ReplyOnPause lets fastrtc manage voice-activity
# detection and turn-taking; ui.launch() serves the built-in Gradio test UI.
stream = Stream(handler=ReplyOnPause(echo), modality="audio", mode="send-receive")

stream.ui.launch()
|
script.md
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hi, I'm Freddy and I want to give a tour of FastRTC - the real-time communication library for Python.
|
2 |
+
|
3 |
+
Let's start with the basics - echoing audio.
|
4 |
+
|
5 |
+
In FastRTC, you can wrap any iterator with `ReplyOnPause` and pass it to the `Stream` class.
|
6 |
+
|
7 |
+
This will create a WebRTC-powered web server that handles voice detection and turn taking - you just worry about the logic for generating the response.
|
8 |
+
|
9 |
+
Each stream comes with a built-in WebRTC-powered Gradio UI that you can use for testing.
|
10 |
+
|
11 |
+
Simply call `ui.launch()`. Let's see it in action.
|
12 |
+
|
13 |
+
We can level up our application by having an LLM generate the response.
|
14 |
+
|
15 |
+
We'll import the SambaNova API as well as some FastRTC utils for doing speech-to-text and text-to-speech and then pipe them all together.
|
16 |
+
|
17 |
+
Importantly, you can use any LLM, speech-to-text, or text-to-speech model. Even an audio-to-audio model.
|
18 |
+
Bring the tools you love and we'll just handle the real-time communication.
|
19 |
+
|
20 |
+
You can also call into the stream for FREE if you have a Hugging Face Token.
|
21 |
+
|
22 |
+
Finally, deployment is really easy too. You can stick with Gradio or mount the stream in a FastAPI app and build any application you want. By the way, video is supported too!
|
23 |
+
|
24 |
+
Thanks for watching! Please visit fastrtc.org to see the cookbook for all the demos shown here as well as the docs.
|
25 |
+
|
26 |
+
|