import gradio as gr
import os
import uuid
import threading
import time
from openai import OpenAI
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS

# Load OpenAI credentials; both variables must be set before launch
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ASSISTANT_ID = os.environ.get("ASSISTANT_ID")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable must be set")
if not ASSISTANT_ID:
    raise ValueError("ASSISTANT_ID environment variable must be set")
client = OpenAI(api_key=OPENAI_API_KEY)

# Session state: one realtime transcription websocket per app process, run on
# a daemon thread so it shuts down together with the main process
session_id = str(uuid.uuid4())
if session_id not in connections:
    connections[session_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, session_id)
    threading.Thread(target=connections[session_id].run, daemon=True).start()

# Functions for Document Assistant
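# Each user message gets its own Assistants API thread: create the thread,
# attach the message, start a run against the configured assistant, poll the
# run to completion, then read the reply back out of the thread. These calls
# go through the beta namespace (client.beta.threads...) exposed by the
# openai-python 1.x SDK.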
def process_user_input(message, history):
    if not message:
        return "Please enter a message.", history
    try:
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=message
        )
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=ASSISTANT_ID
        )
        # Poll until the run reaches a terminal state, sleeping between
        # requests to avoid a busy loop. (Newer openai-python releases also
        # offer runs.create_and_poll as a one-call alternative.)
        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id
            )
            if run_status.status == "completed":
                break
            if run_status.status in ("failed", "cancelled", "expired"):
                return f"❌ Run ended with status: {run_status.status}", history
            time.sleep(0.5)
        # messages.list returns newest-first, so the first assistant message
        # in the list is the reply to the run that just completed
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        assistant_reply = next((m.content[0].text.value for m in messages.data if m.role == "assistant"), "No response.")
        history.append((message, assistant_reply))
        return "", history
    except Exception as e:
        return f"\u274c Error: {str(e)}", history

# Functions for Realtime Voice Transcription
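# Gradio delivers streaming microphone audio as (sample_rate, numpy_array)
# chunks; each chunk is forwarded to the session's realtime websocket, and
# the transcript accumulated on the client object is echoed back into the
# Live Transcript textbox.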
def send_audio_chunk_realtime(mic_chunk):
    if session_id not in connections:
        return "Initializing voice session..."
    if mic_chunk is not None:
        sr, y = mic_chunk
        connections[session_id].enqueue_audio_chunk(sr, y)
    return connections[session_id].transcript

def clear_transcript():
    if session_id in connections:
        connections[session_id].transcript = ""
    return ""

# Gradio UI Components
doc_image = gr.Image(label="📘 Extracted Document Image", show_label=True, elem_id="docimg", height=480, width=340)
chatbot = gr.Chatbot(label="🧠 Document Assistant", elem_id="chatbox", bubble_full_width=False, height=480)
prompt = gr.Textbox(placeholder="Ask about the document...", label="Ask about the document")
send_btn = gr.Button("Send")
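# Components are declared up front and placed into the layout later via
# .render(), so one definition can be referenced from both the layout and
# the event wiring.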

# Voice section: streaming audio input needs a microphone source
# (`sources=["microphone"]` in Gradio 4.x; older 3.x used `source="microphone"`)
audio_in = gr.Audio(label="🎵 Audio", sources=["microphone"], type="numpy", streaming=True)
live_transcript = gr.Textbox(label="Live Transcript", lines=6)
clear_btn = gr.Button("Clear Transcript")

with gr.Blocks(theme=gr.themes.Base(), css="""
    #docimg img { object-fit: contain !important; }
    #chatbox { border-radius: 10px; }
    .gr-box { border-radius: 12px; }
""") as demo:

    gr.Markdown("# 🧐 Document AI + 🎧 Voice Assistant")
    with gr.Row():
        with gr.Column(scale=1):
            doc_image.render()
        with gr.Column(scale=2):
            chatbot.render()

    with gr.Row():
        prompt.render()
        send_btn.render()

    send_btn.click(fn=process_user_input, inputs=[prompt, chatbot], outputs=[prompt, chatbot])

    with gr.Accordion("🎙️ Or Use Voice Instead", open=False):
        live_transcript.render()
        with gr.Row():
            audio_in.render()
            clear_btn.render()
        audio_in.stream(fn=send_audio_chunk_realtime, inputs=audio_in, outputs=live_transcript)
        clear_btn.click(fn=clear_transcript, outputs=live_transcript)

if __name__ == "__main__":
    demo.launch()