IAMTFRMZA's picture
Update app.py
4a071d5 verified
raw
history blame
3.72 kB
import os
import threading
import time
import uuid

import gradio as gr
from openai import OpenAI

from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
# ---- OpenAI client ---------------------------------------------------------
# Fail fast at startup if the key is missing, rather than erroring on the
# first chat request.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable must be set")

client = OpenAI(api_key=OPENAI_API_KEY)

# ---- Session state ---------------------------------------------------------
# One realtime-transcription websocket client per process, keyed by a random
# session id. The client runs in a daemon thread so it never blocks shutdown.
# NOTE(review): session_id is a fresh uuid4, so the `in connections` guard is
# always False here; kept for safety if this module is ever re-executed.
session_id = str(uuid.uuid4())
if session_id not in connections:
    connections[session_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, session_id)
    threading.Thread(target=connections[session_id].run, daemon=True).start()
# Functions for Document Assistant
def process_user_input(message, history):
    """Ask the OpenAI Assistant about the document and append the reply to the chat.

    Parameters
    ----------
    message : str
        The user's question (from the prompt textbox).
    history : list[tuple[str, str]]
        Gradio chat history as (user, assistant) pairs; appended to in place.

    Returns
    -------
    tuple[str, list]
        ("", updated history) on success — the empty string clears the input
        box — or (error text, history) on failure.
    """
    if not message:
        return "Please enter a message.", history
    try:
        # A fresh thread per question: no conversational context carries over.
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=message,
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=os.environ.get("ASSISTANT_ID"),
        )
        # Poll until the run reaches a terminal state. The original loop had
        # no sleep (hammering the API) and only recognised "completed", so a
        # failed/cancelled/expired run made it spin forever.
        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id,
            )
            if run_status.status == "completed":
                break
            if run_status.status in ("failed", "cancelled", "expired"):
                return f"\u274c Run {run_status.status}.", history
            time.sleep(0.5)
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        # The list is newest-first; reversed() scans oldest-first. With one
        # run per thread there is exactly one assistant message either way.
        assistant_reply = next(
            (m.content[0].text.value for m in reversed(messages.data) if m.role == "assistant"),
            "No response.",
        )
        history.append((message, assistant_reply))
        return "", history
    except Exception as e:
        return f"\u274c Error: {str(e)}", history
# Functions for Realtime Voice Transcription
def send_audio_chunk_realtime(mic_chunk):
    """Forward one streamed microphone chunk to the realtime transcriber.

    ``mic_chunk`` is Gradio's streaming-audio payload — a ``(sample_rate,
    samples)`` tuple, or ``None`` between chunks. Returns the transcript
    accumulated so far for this session.
    """
    session = connections.get(session_id)
    if session is None:
        # Websocket client thread has not registered itself yet.
        return "Initializing voice session..."
    if mic_chunk is not None:
        sample_rate, samples = mic_chunk
        session.enqueue_audio_chunk(sample_rate, samples)
    return session.transcript
def clear_transcript():
    """Reset this session's accumulated transcript and blank the textbox."""
    session = connections.get(session_id)
    if session is not None:
        session.transcript = ""
    return ""
# Gradio UI Components
doc_image = gr.Image(label="\ud83d\udcd8 Extracted Document Image", show_label=True, elem_id="docimg", height=480, width=340)
chatbot = gr.Chatbot(label="\ud83e\udde0 Document Assistant", elem_id="chatbox", bubble_full_width=False, height=480)
prompt = gr.Textbox(placeholder="Ask about the document...", label="Ask about the document")
send_btn = gr.Button("Send")
# Voice Section
audio_in = gr.Audio(label="\ud83c\udfb5 Audio", type="numpy", streaming=True)
live_transcript = gr.Textbox(label="Live Transcript", lines=6)
clear_btn = gr.Button("Clear Transcript")
# ---- Layout ----------------------------------------------------------------
# Two-column document view with a chat panel, plus a collapsible voice
# section. Indentation reconstructed: the pasted source had lost all block
# structure, which made the file unrunnable.
with gr.Blocks(theme=gr.themes.Base(), css="""
#docimg img { object-fit: contain !important; }
#chatbox { border-radius: 10px; }
.gr-box { border-radius: 12px; }
""") as demo:
    gr.Markdown("# 🧐 Document AI + 🎧 Voice Assistant")
    with gr.Row():
        with gr.Column(scale=1):
            doc_image.render()
        with gr.Column(scale=2):
            chatbot.render()
            with gr.Row():
                prompt.render()
                send_btn.render()
            # Returning ("", history) clears the prompt box on send.
            send_btn.click(fn=process_user_input, inputs=[prompt, chatbot], outputs=[prompt, chatbot])
    with gr.Accordion("🎙️ Or Use Voice Instead", open=False):
        live_transcript.render()
        with gr.Row():
            audio_in.render()
            clear_btn.render()
        # Each streamed chunk re-renders the full transcript so far.
        audio_in.stream(fn=send_audio_chunk_realtime, inputs=audio_in, outputs=live_transcript)
        clear_btn.click(fn=clear_transcript, outputs=live_transcript)

demo.launch()