# Gradio app: Document AI assistant (OpenAI Assistants API) + realtime voice transcription.
import os
import threading
import uuid

import gradio as gr
from openai import OpenAI

from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS

# Load the OpenAI API key from the environment; fail fast if it is missing so
# the app does not start in a half-working state.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable must be set")

client = OpenAI(api_key=OPENAI_API_KEY)

# One transcription session per process: register a WebSocket client under a
# fresh UUID and run it on a daemon thread so it exits with the main process.
session_id = str(uuid.uuid4())
if session_id not in connections:
    connections[session_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, session_id)
    threading.Thread(target=connections[session_id].run, daemon=True).start()
# Functions for Document Assistant
def process_user_input(message, history):
    """Send *message* to the OpenAI Assistant and append its reply to *history*.

    Args:
        message: User's question about the document (str).
        history: Gradio chatbot history, a list of (user, assistant) tuples.

    Returns:
        (textbox_value, history): the textbox value is "" on success so the
        input is cleared, or an error string on failure; history is updated
        in place with the new exchange.
    """
    import time  # local import: only needed for the polling back-off below

    if not message:
        return "Please enter a message.", history
    try:
        # Each user turn gets its own fresh thread (no cross-turn context).
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=message,
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=os.environ.get("ASSISTANT_ID"),
        )
        # Poll until the run reaches a terminal state. Sleep between polls so
        # we don't hammer the API, and bail out on failure states instead of
        # spinning forever (the original only ever exited on "completed").
        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id,
            )
            if run_status.status == "completed":
                break
            if run_status.status in ("failed", "cancelled", "expired", "requires_action"):
                return f"\u274c Run ended with status: {run_status.status}", history
            time.sleep(0.5)
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        # messages.data is newest-first by default; reversed() walks oldest-first.
        assistant_reply = next(
            (m.content[0].text.value for m in reversed(messages.data) if m.role == "assistant"),
            "No response.",
        )
        history.append((message, assistant_reply))
        return "", history
    except Exception as e:
        return f"\u274c Error: {str(e)}", history
# Functions for Realtime Voice Transcription
def send_audio_chunk_realtime(mic_chunk):
    """Forward one streamed microphone chunk and return the transcript so far.

    Args:
        mic_chunk: Gradio streaming-audio value — a (sample_rate, ndarray)
            tuple, or None between chunks.

    Returns:
        The session's accumulated transcript text (str).
    """
    if session_id not in connections:
        return "Initializing voice session..."
    if mic_chunk is not None:
        sr, y = mic_chunk
        connections[session_id].enqueue_audio_chunk(sr, y)
    return connections[session_id].transcript
def clear_transcript():
    """Reset this session's accumulated transcript and clear the textbox.

    Returns:
        "" — new value for the live-transcript textbox.
    """
    if session_id in connections:
        connections[session_id].transcript = ""
    return ""
# Gradio UI Components (instantiated here, rendered inside the layout below)
doc_image = gr.Image(label="📘 Extracted Document Image", show_label=True, elem_id="docimg", height=480, width=340)
chatbot = gr.Chatbot(label="🧠 Document Assistant", elem_id="chatbox", bubble_full_width=False, height=480)
prompt = gr.Textbox(placeholder="Ask about the document...", label="Ask about the document")
send_btn = gr.Button("Send")

# Voice Section
audio_in = gr.Audio(label="🎵 Audio", type="numpy", streaming=True)
live_transcript = gr.Textbox(label="Live Transcript", lines=6)
clear_btn = gr.Button("Clear Transcript")

with gr.Blocks(theme=gr.themes.Base(), css="""
#docimg img { object-fit: contain !important; }
#chatbox { border-radius: 10px; }
.gr-box { border-radius: 12px; }
""") as demo:
    gr.Markdown("# 🧐 Document AI + 🎧 Voice Assistant")

    # Left column: document image; right column: assistant chat.
    with gr.Row():
        with gr.Column(scale=1):
            doc_image.render()
        with gr.Column(scale=2):
            chatbot.render()

    with gr.Row():
        prompt.render()
        send_btn.render()

    # Handler returns (textbox_value, history), clearing the prompt on success.
    send_btn.click(fn=process_user_input, inputs=[prompt, chatbot], outputs=[prompt, chatbot])

    with gr.Accordion("🎙️ Or Use Voice Instead", open=False):
        live_transcript.render()
        with gr.Row():
            audio_in.render()
            clear_btn.render()
        # Stream mic chunks to the realtime transcriber; the rolling transcript
        # is pushed back into the textbox on every chunk.
        audio_in.stream(fn=send_audio_chunk_realtime, inputs=audio_in, outputs=live_transcript)
        clear_btn.click(fn=clear_transcript, outputs=live_transcript)

demo.launch()