Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -1,21 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
|
|
|
| 2 |
from fastapi import FastAPI
|
| 3 |
from fastapi.responses import RedirectResponse
|
| 4 |
-
from fastrtc import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from gradio.utils import get_space
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
stream = Stream(
|
| 15 |
-
handler=ReplyOnPause(
|
| 16 |
modality="audio",
|
| 17 |
mode="send-receive",
|
| 18 |
-
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
|
| 19 |
concurrency_limit=20 if get_space() else None,
|
| 20 |
)
|
| 21 |
|
|
@@ -35,9 +80,9 @@ if __name__ == "__main__":
|
|
| 35 |
import os
|
| 36 |
|
| 37 |
if (mode := os.getenv("MODE")) == "UI":
|
| 38 |
-
stream.ui.launch(server_port=7860
|
| 39 |
elif mode == "PHONE":
|
| 40 |
-
stream.fastphone(
|
| 41 |
else:
|
| 42 |
import uvicorn
|
| 43 |
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
import numpy as np
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
from fastapi import FastAPI
|
| 7 |
from fastapi.responses import RedirectResponse
|
| 8 |
+
from fastrtc import (
|
| 9 |
+
ReplyOnPause,
|
| 10 |
+
Stream,
|
| 11 |
+
get_stt_model,
|
| 12 |
+
get_tts_model,
|
| 13 |
+
)
|
| 14 |
from gradio.utils import get_space
|
| 15 |
+
from numpy.typing import NDArray
|
| 16 |
+
from openai import OpenAI
|
| 17 |
+
|
| 18 |
+
# Pull SAMBANOVA_API_KEY (and any other secrets) from a local .env file.
load_dotenv()

# SambaNova serves an OpenAI-compatible API, so the stock OpenAI client works.
sambanova_client = OpenAI(
    api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
)

# FastRTC's bundled speech-to-text / text-to-speech models.
stt_model = get_stt_model()
tts_model = get_tts_model()

# Mutable conversation state shared across turns; `echo` appends to it.
chat_history = [
    {
        "role": "system",
        "content": (
            # Trailing space is required: these adjacent literals are
            # concatenated, and without it the prompt read
            # "...conversation.Please keep...".
            "You are a helpful assistant having a spoken conversation. "
            "Please keep your answers short and concise."
        ),
    }
]
|
| 36 |
|
| 37 |
+
|
| 38 |
+
def echo(audio: tuple[int, NDArray[np.int16]]):
    """Handle one spoken turn: transcribe, query the LLM, stream speech back.

    Appends both the user utterance and the assistant reply to the shared
    ``chat_history``, and yields TTS audio chunks as they are produced.
    """
    transcript = stt_model.stt(audio)
    print("prompt", transcript)
    chat_history.append({"role": "user", "content": transcript})

    llm_started = time.time()
    completion = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=chat_history,
        max_tokens=200,
    )
    print("time taken inference", time.time() - llm_started)

    reply = completion.choices[0].message.content
    chat_history.append({"role": "assistant", "content": reply})

    tts_started = time.time()
    # Stream synthesized audio to the caller chunk by chunk.
    yield from tts_model.stream_tts_sync(reply)
    print("time taken tts", time.time() - tts_started)
|
| 57 |
|
| 58 |
|
| 59 |
# WebRTC audio stream: ReplyOnPause waits for the speaker to pause, then
# invokes `echo` to generate the response audio.
stream = Stream(
    handler=ReplyOnPause(echo),
    modality="audio",
    mode="send-receive",
    rtc_configuration=None,  # get_twilio_turn_credentials() if get_space() else None,
    # Limit simultaneous sessions when running on Hugging Face Spaces.
    concurrency_limit=20 if get_space() else None,
)
|
| 66 |
|
|
|
|
| 80 |
import os
|
| 81 |
|
| 82 |
if (mode := os.getenv("MODE")) == "UI":
|
| 83 |
+
stream.ui.launch(server_port=7860)
|
| 84 |
elif mode == "PHONE":
|
| 85 |
+
stream.fastphone(port=7860)
|
| 86 |
else:
|
| 87 |
import uvicorn
|
| 88 |
|
demo.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
|
| 6 |
+
# SambaNova exposes an OpenAI-compatible endpoint; reuse the OpenAI client.
sambanova_client = OpenAI(
    api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
)

# FastRTC's default speech-to-text / text-to-speech models.
stt_model = get_stt_model()
tts_model = get_tts_model()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def echo(audio):
    """Transcribe one utterance, fetch a single-turn LLM reply, and speak it."""
    user_text = stt_model.stt(audio)
    completion = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=[{"role": "user", "content": user_text}],
        max_tokens=200,
    )
    reply_text = completion.choices[0].message.content
    # Yield synthesized audio chunks as the TTS model produces them.
    yield from tts_model.stream_tts_sync(reply_text)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Minimal wiring: ReplyOnPause handles voice-activity detection / turn taking.
stream = Stream(ReplyOnPause(echo), modality="audio", mode="send-receive")

# Launch the built-in Gradio test UI (blocks until the server is stopped).
stream.ui.launch()
|
script.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Hi, I'm Freddy and I want to give a tour of FastRTC - the real-time communication library for Python.
|
| 2 |
+
|
| 3 |
+
Let's start with the basics - echoing audio.
|
| 4 |
+
|
| 5 |
+
In FastRTC, you can wrap any iterator with `ReplyOnPause` and pass it to the `Stream` class.
|
| 6 |
+
|
| 7 |
+
This will create a WebRTC-powered web server that handles voice detection and turn taking - you just worry about the logic for generating the response.
|
| 8 |
+
|
| 9 |
+
Each stream comes with a built-in WebRTC-powered Gradio UI that you can use for testing.
|
| 10 |
+
|
| 11 |
+
Simply call `ui.launch()`. Let's see it in action.
|
| 12 |
+
|
| 13 |
+
We can level up our application by having an LLM generate the response.
|
| 14 |
+
|
| 15 |
+
We'll import the SambaNova API as well as some FastRTC utils for doing speech-to-text and text-to-speech and then pipe them all together.
|
| 16 |
+
|
| 17 |
+
Importantly, you can use any LLM, speech-to-text, or text-to-speech model. Even an audio-to-audio model.
|
| 18 |
+
Bring the tools you love and we'll just handle the real-time communication.
|
| 19 |
+
|
| 20 |
+
You can also call into the stream for FREE if you have a Hugging Face Token.
|
| 21 |
+
|
| 22 |
+
Finally, deployment is really easy too. You can stick with Gradio or mount the stream in a FastAPI app and build any application you want. By the way, video is supported too!
|
| 23 |
+
|
| 24 |
+
Thanks for watching! Please visit fastrtc.org to see the cookbook for all the demos shown here as well as the docs.
|
| 25 |
+
|
| 26 |
+
|