freddyaboulton HF Staff commited on
Commit
06820f1
·
verified ·
1 Parent(s): 5498fb6

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. app.py +54 -9
  2. demo.py +28 -0
  3. script.md +26 -0
app.py CHANGED
@@ -1,21 +1,66 @@
 
 
 
1
  import numpy as np
 
2
  from fastapi import FastAPI
3
  from fastapi.responses import RedirectResponse
4
- from fastrtc import ReplyOnPause, Stream, get_twilio_turn_credentials
 
 
 
 
 
5
  from gradio.utils import get_space
 
 
 
 
 
 
 
 
 
 
 
6
 
 
 
 
 
 
 
 
 
 
7
 
8
- def detection(audio: tuple[int, np.ndarray]):
9
- # Implement any iterator that yields audio
10
- # See "LLM Voice Chat" for a more complete example
11
- yield audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
 
14
  stream = Stream(
15
- handler=ReplyOnPause(detection),
16
  modality="audio",
17
  mode="send-receive",
18
- rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
19
  concurrency_limit=20 if get_space() else None,
20
  )
21
 
@@ -35,9 +80,9 @@ if __name__ == "__main__":
35
  import os
36
 
37
  if (mode := os.getenv("MODE")) == "UI":
38
- stream.ui.launch(server_port=7860, server_name="0.0.0.0")
39
  elif mode == "PHONE":
40
- stream.fastphone(host="0.0.0.0", port=7860)
41
  else:
42
  import uvicorn
43
 
 
1
+ import os
2
+ import time
3
+
4
  import numpy as np
5
+ from dotenv import load_dotenv
6
  from fastapi import FastAPI
7
  from fastapi.responses import RedirectResponse
8
+ from fastrtc import (
9
+ ReplyOnPause,
10
+ Stream,
11
+ get_stt_model,
12
+ get_tts_model,
13
+ )
14
  from gradio.utils import get_space
15
+ from numpy.typing import NDArray
16
+ from openai import OpenAI
17
+
18
# Load environment variables (e.g. SAMBANOVA_API_KEY) from a local .env file.
load_dotenv()

# SambaNova exposes an OpenAI-compatible API, so the stock OpenAI client
# works when pointed at the SambaNova base URL.
sambanova_client = OpenAI(
    api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
)

# Local speech-to-text / text-to-speech models bundled with fastrtc.
stt_model = get_stt_model()
tts_model = get_tts_model()

# Conversation state sent to the LLM on every turn.
# NOTE(review): this list is module-level, so it is shared by ALL concurrent
# callers of the stream (concurrency_limit allows up to 20) — confirm whether
# per-session history is intended.
chat_history = [
    {
        "role": "system",
        # Fix: the two adjacent string literals previously concatenated to
        # "...spoken conversation.Please keep..." with no separating space.
        "content": (
            "You are a helpful assistant having a spoken conversation. "
            "Please keep your answers short and concise."
        ),
    }
]
36
 
37
+
38
def echo(audio: tuple[int, NDArray[np.int16]]):
    """Voice-chat handler: transcribe, query the LLM, and stream back TTS audio.

    Parameters
    ----------
    audio:
        ``(sample_rate, samples)`` tuple delivered by fastrtc when the caller
        pauses speaking.

    Yields
    ------
    Audio chunks produced by the TTS model for the LLM's reply.
    """
    prompt = stt_model.stt(audio)
    print("prompt", prompt)
    chat_history.append({"role": "user", "content": prompt})

    # perf_counter is a monotonic clock, so elapsed-time measurements are not
    # skewed by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    response = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=chat_history,
        max_tokens=200,
    )
    print("time taken inference", time.perf_counter() - start_time)

    # Use a distinct name for the assistant reply instead of reusing `prompt`.
    reply = response.choices[0].message.content
    chat_history.append({"role": "assistant", "content": reply})

    start_time = time.perf_counter()
    for audio_chunk in tts_model.stream_tts_sync(reply):
        yield audio_chunk
    print("time taken tts", time.perf_counter() - start_time)
57
 
58
 
59
# Audio send/receive stream: ReplyOnPause invokes `echo` each time the caller
# stops speaking.
stream = Stream(
    handler=ReplyOnPause(echo),
    modality="audio",
    mode="send-receive",
    # TURN credentials deliberately disabled here; the commented expression
    # shows how to re-enable Twilio TURN when running on a Space.
    rtc_configuration=None,  # get_twilio_turn_credentials() if get_space() else None,
    # Cap concurrent sessions only when hosted on a Hugging Face Space.
    concurrency_limit=20 if get_space() else None,
)
66
 
 
80
  import os
81
 
82
  if (mode := os.getenv("MODE")) == "UI":
83
+ stream.ui.launch(server_port=7860)
84
  elif mode == "PHONE":
85
+ stream.fastphone(port=7860)
86
  else:
87
  import uvicorn
88
 
demo.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
4
+ from openai import OpenAI
5
+
6
# SambaNova serves an OpenAI-compatible API; reuse the standard OpenAI client
# by pointing it at the SambaNova base URL.
sambanova_client = OpenAI(
    api_key=os.getenv("SAMBANOVA_API_KEY"), base_url="https://api.sambanova.ai/v1"
)

# Local speech-to-text / text-to-speech models provided by fastrtc.
stt_model = get_stt_model()
tts_model = get_tts_model()
12
+
13
+
14
def echo(audio):
    """Transcribe the caller's audio, ask the LLM, and stream TTS audio back."""
    user_text = stt_model.stt(audio)
    completion = sambanova_client.chat.completions.create(
        model="Meta-Llama-3.2-3B-Instruct",
        messages=[{"role": "user", "content": user_text}],
        max_tokens=200,
    )
    reply = completion.choices[0].message.content
    yield from tts_model.stream_tts_sync(reply)
24
+
25
+
26
# Wire the pause-triggered handler into an audio send/receive stream and
# serve the built-in Gradio testing UI.
stream = Stream(ReplyOnPause(echo), modality="audio", mode="send-receive")

stream.ui.launch()
script.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hi, I'm Freddy and I want to give a tour of FastRTC - the real-time communication library for Python.
2
+
3
+ Let's start with the basics - echoing audio.
4
+
5
+ In FastRTC, you can wrap any iterator with `ReplyOnPause` and pass it to the `Stream` class.
6
+
7
+ This will create a WebRTC-powered web server that handles voice detection and turn taking - you just worry about the logic for generating the response.
8
+
9
+ Each stream comes with a built-in WebRTC-powered Gradio UI that you can use for testing.
10
+
11
+ Simply call `ui.launch()`. Let's see it in action.
12
+
13
+ We can level up our application by having an LLM generate the response.
14
+
15
+ We'll import the SambaNova API as well as some FastRTC utils for doing speech-to-text and text-to-speech and then pipe them all together.
16
+
17
+ Importantly, you can use any LLM, speech-to-text, or text-to-speech model. Even an audio-to-audio model.
18
+ Bring the tools you love and we'll just handle the real-time communication.
19
+
20
+ You can also call into the stream for FREE if you have a Hugging Face Token.
21
+
22
+ Finally, deployment is really easy too. You can stick with Gradio or mount the stream in a FastAPI app and build any application you want. By the way, video is supported too!
23
+
24
+ Thanks for watching! Please visit fastrtc.org to see the cookbook for all the demos shown here as well as the docs.
25
+
26
+