Spaces:
Sleeping
Sleeping
File size: 3,724 Bytes
d439419 442d49c e10a51a d439419 004ead9 d439419 b74ae51 004ead9 442d49c 004ead9 6b189d0 004ead9 f383782 004ead9 4a071d5 004ead9 d439419 004ead9 6b189d0 004ead9 d439419 004ead9 4a071d5 004ead9 d439419 4a071d5 004ead9 d439419 004ead9 e10a51a 004ead9 e10a51a 6b189d0 004ead9 e10a51a 004ead9 d439419 6b189d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import gradio as gr
import os
import uuid
import threading
from openai import OpenAI
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
# Read the OpenAI API key from the environment and refuse to start without it.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable must be set")

# Module-level OpenAI client, built once with the key read above.
client = OpenAI(api_key=OPENAI_API_KEY)
# Session state: a single id is generated when the module is imported and is
# used as the key into the shared ``connections`` mapping.
session_id = str(uuid.uuid4())

if session_id not in connections:
    # Register a websocket client for this id ...
    connections[session_id] = WebSocketClient(
        WEBSOCKET_URI,
        WEBSOCKET_HEADERS,
        session_id,
    )
    # ... and run it on a daemon thread so it never blocks interpreter exit.
    threading.Thread(
        target=connections[session_id].run,
        daemon=True,
    ).start()
# Functions for Document Assistant
def process_user_input(message, history):
    """Send ``message`` to the OpenAI assistant and record the reply.

    A fresh thread is created for every call, the user message is posted to
    it, a run is started for the assistant named by the ``ASSISTANT_ID``
    environment variable, and the run is polled until it finishes.

    Args:
        message: Text entered by the user. Falsy values are rejected.
        history: List of ``(user_message, assistant_reply)`` tuples. ``None``
            is treated as an empty list; an existing list is extended in
            place.

    Returns:
        A ``(text, history)`` tuple. ``text`` is ``""`` on success,
        ``"Please enter a message."`` when ``message`` is empty, or an error
        description prefixed with a cross mark when anything goes wrong.
        Exceptions are never propagated to the caller.
    """
    import time  # local import so the block does not depend on file-level edits

    poll_interval = 0.5   # seconds to wait between status checks
    run_timeout = 120.0   # give up on a run after this many seconds
    # Statuses from which this handler can never reach "completed" by itself.
    # "requires_action" is included because no tool outputs are submitted here.
    unsuccessful = {"failed", "cancelled", "expired", "incomplete", "requires_action"}

    if history is None:
        history = []
    if not message:
        return "Please enter a message.", history

    try:
        assistant_id = os.environ.get("ASSISTANT_ID")
        if not assistant_id:
            raise ValueError("ASSISTANT_ID environment variable must be set")

        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=message,
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=assistant_id,
        )

        deadline = time.monotonic() + run_timeout
        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id,
            )
            if run_status.status == "completed":
                break
            if run_status.status in unsuccessful:
                # Without this check a failed run would be polled forever.
                raise RuntimeError(
                    f"Assistant run ended with status '{run_status.status}'"
                )
            if time.monotonic() >= deadline:
                raise TimeoutError("Timed out waiting for the assistant run")
            # Sleep between polls instead of busy-waiting on the API.
            time.sleep(poll_interval)

        messages = client.beta.threads.messages.list(thread_id=thread.id)
        assistant_reply = next(
            (
                m.content[0].text.value
                for m in reversed(messages.data)
                if m.role == "assistant"
            ),
            "No response.",
        )
        history.append((message, assistant_reply))
        return "", history
    except Exception as e:
        # Top-level UI boundary: report the failure as text rather than raising.
        return f"\u274c Error: {str(e)}", history
# Functions for Realtime Voice Transcription
def send_audio_chunk_realtime(mic_chunk):
    """Forward one microphone chunk to the session's client and return its transcript.

    Args:
        mic_chunk: ``None`` or a two-item value that is unpacked and passed
            on to ``enqueue_audio_chunk`` (presumably ``(sample_rate,
            samples)`` from the numpy-typed audio input; confirm).

    Returns:
        The client's current ``transcript``, or a placeholder message when no
        client is registered for ``session_id``.
    """
    if session_id in connections:
        ws_client = connections[session_id]
        if mic_chunk is not None:
            rate, samples = mic_chunk
            ws_client.enqueue_audio_chunk(rate, samples)
        return ws_client.transcript
    return "Initializing voice session..."
def clear_transcript():
    """Reset the session client's transcript and return an empty string.

    The empty string is returned whether or not a client is registered for
    ``session_id``.
    """
    if session_id not in connections:
        return ""
    connections[session_id].transcript = ""
    return ""
# Gradio UI Components
# Emoji outside the Basic Multilingual Plane are written as single
# \UXXXXXXXX escapes. A Python str literal does not merge a UTF-16 surrogate
# pair such as "\ud83d\udcd8" into one character; it produces two lone
# surrogates, which cannot be encoded as UTF-8.
doc_image = gr.Image(
    label="\U0001F4D8 Extracted Document Image",
    show_label=True,
    elem_id="docimg",
    height=480,
    width=340,
)
chatbot = gr.Chatbot(
    label="\U0001F9E0 Document Assistant",
    elem_id="chatbox",
    bubble_full_width=False,
    height=480,
)
prompt = gr.Textbox(
    placeholder="Ask about the document...",
    label="Ask about the document",
)
send_btn = gr.Button("Send")

# Voice Section
audio_in = gr.Audio(label="\U0001F3B5 Audio", type="numpy", streaming=True)
live_transcript = gr.Textbox(label="Live Transcript", lines=6)
clear_btn = gr.Button("Clear Transcript")
# Page layout and event wiring.
# Emoji use \UXXXXXXXX escapes: surrogate-pair escapes ("\ud83e\uddd0") would
# yield lone surrogates in a Python str rather than the intended character.
# NOTE(review): indentation was lost in the copy this was restored from, so
# the nesting below is reconstructed - confirm whether the prompt row belongs
# under the chat column or at page level.
with gr.Blocks(theme=gr.themes.Base(), css="""
#docimg img { object-fit: contain !important; }
#chatbox { border-radius: 10px; }
.gr-box { border-radius: 12px; }
""") as demo:
    gr.Markdown("# \U0001F9D0 Document AI + \U0001F3A7 Voice Assistant")

    # Document image next to the chat panel.
    with gr.Row():
        with gr.Column(scale=1):
            doc_image.render()
        with gr.Column(scale=2):
            chatbot.render()

    # Prompt box and send button.
    with gr.Row():
        prompt.render()
        send_btn.render()
    # The handler's two return values update the prompt box and the chat log.
    send_btn.click(
        fn=process_user_input,
        inputs=[prompt, chatbot],
        outputs=[prompt, chatbot],
    )

    # Collapsed voice-input section.
    with gr.Accordion("\U0001F399\ufe0f Or Use Voice Instead", open=False):
        live_transcript.render()
        with gr.Row():
            audio_in.render()
            clear_btn.render()
        # Each streamed audio chunk refreshes the live transcript box.
        audio_in.stream(
            fn=send_audio_chunk_realtime,
            inputs=audio_in,
            outputs=live_transcript,
        )
        clear_btn.click(fn=clear_transcript, outputs=live_transcript)

demo.launch()
|