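"""Gradio app pairing an OpenAI Assistants document chat with realtime voice transcription.

The document assistant answers questions about an extracted document image via the
Assistants API; the voice section streams microphone audio to a websocket-based
realtime transcriber (see realtime_transcriber.py).
"""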
import gradio as gr
import os
import threading
import time
import uuid

from openai import OpenAI
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
# Load the OpenAI API key
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable must be set")

client = OpenAI(api_key=OPENAI_API_KEY)
# Session state: one realtime transcription websocket per app process
session_id = str(uuid.uuid4())
if session_id not in connections:
    connections[session_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, session_id)
    # The websocket client runs on a background daemon thread
    threading.Thread(target=connections[session_id].run, daemon=True).start()
# Functions for the Document Assistant
def process_user_input(message, history):
    """Send the user's message to the assistant and append its reply to the chat history."""
    if not message:
        return "Please enter a message.", history
    try:
        # Each question gets a fresh thread on the configured assistant
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=message
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=os.environ.get("ASSISTANT_ID")
        )
        # Poll until the run finishes; bail out on terminal failure states
        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id
            )
            if run_status.status == "completed":
                break
            if run_status.status in ("failed", "cancelled", "expired"):
                return f"❌ Run {run_status.status}.", history
            time.sleep(0.5)
        # Messages come back newest-first, so the first assistant message is the reply
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        assistant_reply = next(
            (m.content[0].text.value for m in messages.data if m.role == "assistant"),
            "No response."
        )
        history.append((message, assistant_reply))
        return "", history
    except Exception as e:
        return f"❌ Error: {e}", history

# Functions for Realtime Voice Transcription
def send_audio_chunk_realtime(mic_chunk):
    """Forward a streamed microphone chunk to the websocket client and return the running transcript."""
    if session_id not in connections:
        return "Initializing voice session..."
    if mic_chunk is not None:
        sr, y = mic_chunk  # Gradio streams audio as (sample_rate, numpy array)
        connections[session_id].enqueue_audio_chunk(sr, y)
    return connections[session_id].transcript


def clear_transcript():
    """Reset the live transcript for the current session."""
    if session_id in connections:
        connections[session_id].transcript = ""
    return ""

# Gradio UI Components
doc_image = gr.Image(label="📘 Extracted Document Image", show_label=True, elem_id="docimg", height=500, width=360)
chatbot = gr.Chatbot(label="🧠 Document Assistant", elem_id="chatbox", bubble_full_width=False)
prompt = gr.Textbox(placeholder="Ask about the document...", label="Ask about the document")
send_btn = gr.Button("Send")
# Voice Section
audio_in = gr.Audio(label="🎵 Audio", type="numpy", streaming=True)
live_transcript = gr.Textbox(label="Live Transcript", lines=6)
clear_btn = gr.Button("Clear Transcript")
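
# Components are instantiated above and placed into the layout below via .render(),
# which lets them be defined once and positioned inside the Blocks context.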
with gr.Blocks(theme=gr.themes.Base(), css="""
#docimg img { object-fit: contain !important; }
#chatbox { height: 500px; }
.gr-box { border-radius: 12px; }
""") as demo:
    gr.Markdown("# 🧠 Document AI + 🎙️ Voice Assistant")

    with gr.Row():
        with gr.Column(scale=1):
            doc_image.render()
        with gr.Column(scale=2):
            chatbot.render()
            with gr.Row():
                prompt.render()
                send_btn.render()

    send_btn.click(fn=process_user_input, inputs=[prompt, chatbot], outputs=[prompt, chatbot])

    with gr.Accordion("🎙️ Or Use Voice Instead", open=False):
        live_transcript.render()
        with gr.Row():
            audio_in.render()
            clear_btn.render()

    audio_in.stream(fn=send_audio_chunk_realtime, inputs=audio_in, outputs=live_transcript)
    clear_btn.click(fn=clear_transcript, outputs=live_transcript)

if __name__ == "__main__":
    demo.launch()