Update app.py
app.py CHANGED

@@ -1,8 +1,7 @@
-import os
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
 import asyncio
 import base64
 import json
+import os
 import pathlib
 from typing import AsyncGenerator, Literal, List
 
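Note: the removed CUDA_VISIBLE_DEVICES line and the explicit device="cpu" added in the Whisper hunk below aim at the same goal. One caveat worth recording: the env-var approach only works when it runs before the first CUDA-aware import. A minimal sketch, assuming PyTorch as the backend (not part of this commit):

```python
# CUDA_VISIBLE_DEVICES is read when CUDA initializes, so it must be set
# before torch is first imported for it to hide the GPUs.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # must precede the torch import

import torch

assert not torch.cuda.is_available()  # no GPU is visible to this process
```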

@@ -10,11 +9,11 @@ import numpy as np
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.responses import HTMLResponse
-from fastrtc import AsyncStreamHandler, Stream,
+from fastrtc import AsyncStreamHandler, Stream, wait_for_item
 from pydantic import BaseModel
 import uvicorn
 
-# --- Import get_space to detect Hugging Face Spaces ---
+# --- Import get_space to detect Hugging Face Spaces (optional) ---
 from gradio.utils import get_space
 
 # --- Document processing and RAG libraries ---
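The newly imported wait_for_item is a small fastrtc helper that awaits an asyncio.Queue with a timeout and returns None if nothing arrives. It is presumably consumed in the handler's emit(), which is outside this diff; a sketch of the usage pattern from fastrtc's examples:

```python
# Assumed usage in an AsyncStreamHandler (the actual emit() is not shown
# in this diff):
async def emit(self):
    # Next queued output chunk, or None if the queue stays empty briefly.
    return await wait_for_item(self.output_queue)
```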

@@ -116,7 +115,7 @@ def generate_answer(query: str) -> str:
 # 2. Speech-to-Text and Text-to-Speech Functions
 # ====================================================
 
-#
+# Force Whisper to load on CPU explicitly
 stt_model = whisper.load_model("base", device="cpu")
 
 def speech_to_text(audio_array: np.ndarray, sample_rate: int = 16000) -> str:
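The body of speech_to_text sits outside this hunk. A minimal sketch of what such a wrapper typically does, assuming the buffered audio is 16 kHz int16 PCM (Whisper consumes mono float32 in [-1.0, 1.0]):

```python
# Hypothetical sketch only; the real body is not shown in this diff.
def speech_to_text(audio_array: np.ndarray, sample_rate: int = 16000) -> str:
    audio = audio_array.astype(np.float32) / 32768.0  # int16 PCM -> float32
    result = stt_model.transcribe(audio)              # expects 16 kHz audio
    return result["text"].strip()
```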

@@ -158,7 +157,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
         self.last_input_time = asyncio.get_event_loop().time()
 
     def copy(self) -> "RAGVoiceHandler":
-        # Return a new instance with the same configuration
         return RAGVoiceHandler(
             expected_layout="mono",
             output_sample_rate=self.output_sample_rate,

@@ -166,7 +164,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
         )
 
     async def stream(self) -> AsyncGenerator[bytes, None]:
-        # Continuously check for new audio; if a short silence occurs (timeout), process the buffered utterance.
         while not self.quit.is_set():
             try:
                 audio_data = await asyncio.wait_for(self.input_queue.get(), timeout=0.5)

@@ -174,7 +171,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
                 self.last_input_time = asyncio.get_event_loop().time()
             except asyncio.TimeoutError:
                 if self.input_buffer:
-                    # Process the buffered utterance
                     audio_array = np.frombuffer(self.input_buffer, dtype=np.int16)
                     self.input_buffer = bytearray()
                     query_text = speech_to_text(audio_array, sample_rate=self.input_sample_rate)
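These two hunks only strip comments, but the loop they touch is the heart of the handler: frames drain from input_queue into a buffer, and a 0.5 s gap with no new frames is treated as the end of an utterance. The pattern in isolation, as a standalone sketch (names are illustrative, mirroring the handler's attributes):

```python
import asyncio

async def segment_utterances(queue: asyncio.Queue, on_utterance) -> None:
    buffer = bytearray()
    while True:
        try:
            chunk = await asyncio.wait_for(queue.get(), timeout=0.5)
            buffer.extend(chunk)           # still mid-utterance
        except asyncio.TimeoutError:
            if buffer:                     # 0.5 s of silence: utterance ended
                on_utterance(bytes(buffer))
                buffer = bytearray()
```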

@@ -187,7 +183,6 @@ class RAGVoiceHandler(AsyncStreamHandler):
             await asyncio.sleep(0.1)
 
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        # Each received frame is added as bytes to the input queue
         sample_rate, audio_array = frame
         audio_bytes = audio_array.tobytes()
         await self.input_queue.put(audio_bytes)
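One implicit contract here: receive() enqueues raw bytes, and stream() decodes them with np.frombuffer(..., dtype=np.int16), so every incoming frame must already be int16 PCM. A hypothetical hardening, assuming float frames arrive in [-1.0, 1.0] (not part of this commit):

```python
# Defensive variant of receive(): normalize at the boundary so the
# frombuffer() call in stream() cannot misdecode the payload.
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
    sample_rate, audio_array = frame
    if audio_array.dtype != np.int16:
        audio_array = (audio_array * 32767.0).astype(np.int16)
    await self.input_queue.put(audio_array.tobytes())
```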

@@ -202,13 +197,10 @@ class RAGVoiceHandler(AsyncStreamHandler):
 # 4. Voice Streaming Setup & FastAPI Endpoints
 # ====================================================
 
-#
-
-
-else:
-    rtc_config = get_twilio_turn_credentials()
+# For ZeroGPU spaces, supply a dummy RTC configuration.
+# (This avoids calling get_twilio_turn_credentials() which depends on NVML.)
+rtc_config = {"iceServers": [{"urls": "stun:stun.l.google.com:19302"}]}
 
-# Create a Stream instance using our RAGVoiceHandler.
 stream = Stream(
     modality="audio",
     mode="send-receive",
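The replacement config is STUN-only: clients can discover their public address, but nothing relays media, so calls may fail when both peers sit behind symmetric NAT (the Twilio TURN credentials previously covered that case). If relaying is ever needed again, the dict extends in the standard WebRTC shape; the server and credentials below are placeholders, not real values:

```python
# Sketch: STUN plus a TURN relay (turn.example.com / user / pass are
# hypothetical placeholders).
rtc_config = {
    "iceServers": [
        {"urls": "stun:stun.l.google.com:19302"},
        {
            "urls": "turn:turn.example.com:3478",
            "username": "user",
            "credential": "pass",
        },
    ]
}
```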

@@ -218,7 +210,6 @@ stream = Stream(
     time_limit=90,
 )
 
-# Define a simple input hook (if needed by the client to initialize the call)
 class InputData(BaseModel):
     webrtc_id: str
 

@@ -230,13 +221,10 @@ async def input_hook(body: InputData):
     stream.set_input(body.webrtc_id)
     return {"status": "ok"}
 
-# Endpoint to handle WebRTC offer from the client (for voice calls)
 @app.post("/webrtc/offer")
 async def webrtc_offer(offer: dict):
-    # This uses fastrtc's built-in handling of the offer to set up the connection.
     return await stream.handle_offer(offer)
 
-# Serve your existing HTML file (which contains your voice UI)
 @app.get("/")
 async def index():
     index_path = current_dir / "index.html"
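Worth noting: fastrtc can also register its signalling routes itself, so the hand-written /webrtc/offer endpoint could likely be replaced with the mount helper from fastrtc's docs:

```python
# Alternative wiring: let fastrtc mount its built-in signalling endpoints
# on the FastAPI app instead of defining /webrtc/offer by hand.
stream.mount(app)
```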

@@ -250,14 +238,12 @@ async def index():
 if __name__ == "__main__":
     mode = os.getenv("MODE", "PHONE")
     if mode == "UI":
-        # Optionally launch a text-based Gradio interface for testing the RAG backend
         import gradio as gr
         def gradio_chat(user_input):
             return generate_answer(user_input)
         iface = gr.Interface(fn=gradio_chat, inputs="text", outputs="text", title="Customer Support Chatbot")
         iface.launch(server_port=7860)
     elif mode == "PHONE":
-        # Run the FastAPI app so that callers can use the voice functionality.
         uvicorn.run(app, host="0.0.0.0", port=7860)
     else:
         uvicorn.run(app, host="0.0.0.0", port=7860)
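Since the PHONE branch and the fallback start the identical server, the dispatch could be collapsed; a small simplification sketch, not part of this commit:

```python
# Equivalent entry point: "PHONE" and any unknown MODE behave the same.
if mode == "UI":
    iface.launch(server_port=7860)
else:
    uvicorn.run(app, host="0.0.0.0", port=7860)
```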