File size: 4,625 Bytes
d439419
 
 
 
 
 
 
 
 
b74ae51
d439419
 
 
 
 
 
 
 
 
 
 
 
eb04d10
d439419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b74ae51
d439419
 
 
 
 
b74ae51
 
 
 
 
 
 
 
d439419
eb04d10
 
 
 
 
 
f383782
eb04d10
d439419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb04d10
d439419
 
 
 
 
 
 
b74ae51
 
eb04d10
b74ae51
eb04d10
 
 
 
 
 
 
 
 
 
 
 
d439419
 
 
 
b74ae51
d439419
b74ae51
d439419
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import gradio as gr
import os
import json
import uuid
import threading
import time
import re

from openai import OpenAI
from dotenv import load_dotenv
from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS

# ------------------ Load API Key ------------------
# Let python-dotenv populate the environment before the settings are read.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ASSISTANT_ID = os.environ.get("ASSISTANT_ID")

# Fail fast at import time: nothing below can work without both settings.
if not (OPENAI_API_KEY and ASSISTANT_ID):
    raise ValueError("Missing OPENAI_API_KEY or ASSISTANT_ID in environment variables")

# Shared OpenAI client used by every chat helper in this module.
client = OpenAI(api_key=OPENAI_API_KEY)

# ------------------ Chat Threading ------------------
# Maps a UI session id to the id of the assistant thread backing it.
session_threads = dict()

def reset_session():
    """Start a new chat session and return its identifier.

    A random UUID string is generated as the session key, a new assistant
    thread is created through the module-level ``client``, and the thread id
    is recorded in ``session_threads`` under that key.

    Returns:
        The newly generated session id (a UUID4 string).
    """
    new_session = str(uuid.uuid4())
    session_threads[new_session] = client.beta.threads.create().id
    return new_session

def process_chat(message, history, session_id):
    """Send ``message`` to the session's assistant thread and return the reply.

    Args:
        message: User text to post to the thread.
        history: Chat history from the UI. Accepted for interface
            compatibility; it is not used here.
        session_id: Key into ``session_threads``. If no thread is registered
            for it yet, one is created and stored.

    Returns:
        The text of the newest assistant message on the thread, or a
        warning string when the run ends without completing or no text
        reply can be found.
    """
    thread_id = session_threads.get(session_id)
    if not thread_id:
        thread_id = client.beta.threads.create().id
        session_threads[session_id] = thread_id

    client.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=message
    )

    run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=ASSISTANT_ID
    )

    # Statuses from which the run will never reach "completed" on its own.
    # "requires_action" is included because this function never submits tool
    # outputs, so waiting on it would only end in expiry.
    dead_end_statuses = {
        "failed",
        "cancelled",
        "expired",
        "incomplete",
        "requires_action",
    }

    while True:
        run_status = client.beta.threads.runs.retrieve(
            thread_id=thread_id,
            run_id=run.id
        )
        if run_status.status == "completed":
            break
        if run_status.status in dead_end_statuses:
            # Without this exit the loop would poll forever on a dead run.
            return f"⚠️ Assistant run ended with status: {run_status.status}"
        time.sleep(1)

    # Ask for newest-first explicitly and walk in that order, so the first
    # assistant message seen is the reply to this turn rather than the
    # oldest reply in the thread.
    messages = client.beta.threads.messages.list(thread_id=thread_id, order="desc")
    for msg in messages.data:
        if msg.role != "assistant":
            continue
        # A message may carry non-text parts (e.g. images); take the first
        # text part instead of assuming it sits at index 0.
        for part in msg.content:
            if getattr(part, "type", None) == "text":
                return part.text.value

    return "⚠️ Assistant did not respond."

def extract_image_url(text):
    """Find the first surgical-pathology-manual PNG link inside ``text``.

    Only ``https://raw.githubusercontent.com`` URLs under the
    ``AndrewLORTech/surgical-pathology-manual`` ``main`` branch that end in
    ``.png`` are recognised; the path may contain word characters, hyphens
    and slashes.

    Args:
        text: String to scan.

    Returns:
        The matched URL, or ``None`` when no such link is present.
    """
    pattern = r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png'
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(0)

def chat_handler(message, history, session_id):
    """Run one chat turn and update the chatbot history.

    Args:
        message: Text the user submitted.
        history: Current chatbot history as a list of
            ``(user_message, assistant_message)`` pairs; ``None`` is treated
            as an empty history.
        session_id: Session key forwarded to ``process_chat``.

    Returns:
        A ``(history, image_url)`` tuple: the history with this turn
        appended, and the image URL extracted from the reply (``None`` if
        the reply contains none).
    """
    if history is None:
        history = []

    response = process_chat(message, history, session_id)

    # The tuple-style Chatbot expects one (user, assistant) pair per turn.
    # Appending ("user", message) / ("assistant", response) would render the
    # role names themselves as chat bubbles.
    history.append((message, response))

    image_url = extract_image_url(response)
    return history, image_url

# ------------------ Transcription ------------------
def create_websocket_client():
    """Register a new ``WebSocketClient`` and start it on a background thread.

    The client is built from ``WEBSOCKET_URI`` and ``WEBSOCKET_HEADERS``,
    stored in ``connections`` under a freshly generated UUID string, and its
    ``run`` method is launched on a daemon thread.

    Returns:
        The id under which the new client was registered.
    """
    new_id = str(uuid.uuid4())
    connections[new_id] = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, new_id)

    # Daemon thread so a running client never blocks interpreter shutdown.
    runner = connections[new_id].run
    worker = threading.Thread(target=runner, daemon=True)
    worker.start()
    return new_id

def clear_transcript(client_id):
    """Blank the stored transcript for ``client_id`` and return an empty string.

    Unknown client ids are ignored; the return value is ``""`` either way,
    which the UI uses to empty the transcript box.
    """
    if client_id not in connections:
        return ""
    connections[client_id].transcript = ""
    return ""

def send_audio_chunk(audio, client_id):
    """Queue one streamed audio chunk and return the transcript so far.

    Args:
        audio: Two-item sequence unpacked as ``(sample_rate, samples)``.
        client_id: Key of the target connection in ``connections``.

    Returns:
        The connection's current ``transcript``, or the placeholder
        ``"Initializing connection..."`` when no connection is registered
        for ``client_id`` yet.
    """
    if client_id in connections:
        sample_rate, samples = audio
        conn = connections[client_id]
        conn.enqueue_audio_chunk(sample_rate, samples)
        return conn.transcript
    return "Initializing connection..."

# ------------------ Gradio App ------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Document AI + 🎙️ Voice Assistant")

    # Pass the function itself rather than its result: Gradio calls a
    # callable State value on every page load, so each browser session gets
    # its own assistant thread instead of all visitors sharing the single
    # thread that would otherwise be created once at import time.
    session_id = gr.State(value=reset_session)
    # Filled in on page load with the id of this session's websocket client.
    client_id = gr.State()

    with gr.Row():
        with gr.Column(scale=1):
            image_display = gr.Image(label="📑 Extracted Document Image", show_label=True, height=400)

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="💬 Document Assistant", height=400)
            message_input = gr.Textbox(label="Ask about the document", placeholder="e.g. What does clause 3.2 mean?")
            send_button = gr.Button("Send")

    # Send message logic
    def user_send(msg, history, session_id):
        """Forward a submitted message to ``chat_handler`` and return its outputs."""
        return chat_handler(msg, history, session_id)

    # Clicking the button and pressing Enter in the textbox do the same thing.
    send_button.click(user_send, inputs=[message_input, chatbot, session_id], outputs=[chatbot, image_display])
    message_input.submit(user_send, inputs=[message_input, chatbot, session_id], outputs=[chatbot, image_display])

    # ------------------ Voice Section ------------------
    gr.Markdown("## 🎙️ Realtime Voice Transcription")

    with gr.Row():
        transcript_box = gr.Textbox(label="Live Transcript", lines=7, interactive=False, autoscroll=True)

    with gr.Row():
        mic_input = gr.Audio(streaming=True)
        clear_button = gr.Button("Clear Transcript")

    # Each streamed microphone chunk is sent on and the transcript box refreshed.
    mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
    clear_button.click(fn=clear_transcript, inputs=[client_id], outputs=transcript_box)
    # Create this session's websocket client when the page loads.
    demo.load(fn=create_websocket_client, outputs=client_id)

demo.launch()