IAMTFRMZA committed on
Commit
f4c7018
·
verified ·
1 Parent(s): 2615e33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -96
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # Final version with Perplexity-style UI, voice toggle, rich formatting, full-width input and working mic
2
  import gradio as gr
3
  import os, time, re, json, base64, asyncio, threading, uuid, io
4
  import numpy as np
@@ -18,6 +17,7 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
18
  WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
19
  connections = {}
20
 
 
21
  class WebSocketClient:
22
  def __init__(self, uri, headers, client_id):
23
  self.uri = uri
@@ -30,12 +30,12 @@ class WebSocketClient:
30
 
31
  async def connect(self):
32
  try:
33
- self.websocket = await connect(self.uri, extra_headers=self.headers)
34
  with open("openai_transcription_settings.json", "r") as f:
35
  await self.websocket.send(f.read())
36
  await asyncio.gather(self.receive_messages(), self.send_audio_chunks())
37
  except Exception as e:
38
- print(f"WebSocket failed: {e}")
39
 
40
  def run(self):
41
  asyncio.set_event_loop(self.loop)
@@ -70,6 +70,7 @@ class WebSocketClient:
70
  if data["type"] == "conversation.item.input_audio_transcription.delta":
71
  self.transcript += data["delta"]
72
 
 
73
  def create_ws():
74
  cid = str(uuid.uuid4())
75
  client = WebSocketClient(WS_URI, HEADERS, cid)
@@ -84,125 +85,140 @@ def send_audio(chunk, cid):
84
  connections[cid].enqueue_audio_chunk(sr, arr)
85
  return connections[cid].transcript.strip()
86
 
87
- def clear_transcript(cid):
88
  if cid in connections:
89
  connections[cid].transcript = ""
90
  return ""
91
 
92
- def format_response(content, prompt):
93
- md = f"""### ❓ {prompt}\n\n**🧠 In summary:**\n\n{content}\n"""
94
- images = re.findall(r'https://raw\.githubusercontent\.com/[^\s)]+\.png', content)
95
- if images:
96
- md += "\n\n**📎 Sources:**\n" + "\n".join([f"![]({url})" for url in images])
97
- return md
98
 
99
- def handle_chat(prompt, thread_id):
 
100
  if not OPENAI_API_KEY or not ASSISTANT_ID:
101
- return "❌ Missing credentials", thread_id
 
102
  try:
103
  if thread_id is None:
104
  thread = client.beta.threads.create()
105
  thread_id = thread.id
106
- client.beta.threads.messages.create(thread_id=thread_id, role="user", content=prompt)
 
107
  run = client.beta.threads.runs.create(thread_id=thread_id, assistant_id=ASSISTANT_ID)
 
108
  while True:
109
  status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
110
  if status.status == "completed":
111
  break
112
  time.sleep(1)
 
113
  msgs = client.beta.threads.messages.list(thread_id=thread_id)
114
  for msg in reversed(msgs.data):
115
  if msg.role == "assistant":
116
- return format_response(msg.content[0].text.value, prompt), thread_id
117
- return "⚠️ No reply", thread_id
 
 
 
 
 
 
 
 
 
 
118
  except Exception as e:
119
- return f"❌ {e}", thread_id
120
 
121
- def feed_transcript(transcript, thread_id, cid):
 
122
  if not transcript.strip():
123
- return gr.update(), thread_id
124
  if cid in connections:
125
  connections[cid].transcript = ""
126
- return handle_chat(transcript, thread_id)
127
-
128
- with gr.Blocks(css="""
129
- body {
130
- background-color: #0f0f0f;
131
- color: white;
132
- font-family: 'Inter', sans-serif;
133
- }
134
- .markdown-container {
135
- margin-top: 16px;
136
- padding: 18px;
137
- background: #1a1a1a;
138
- border-radius: 12px;
139
- font-size: 16px;
140
- box-shadow: 0 2px 6px #000;
141
- }
142
- .input-bar {
143
- position: fixed;
144
- bottom: 16px;
145
- left: 0;
146
- right: 0;
147
- max-width: 1000px;
148
- margin: auto;
149
- display: flex;
150
- padding: 12px;
151
- gap: 10px;
152
- background: #1f1f1f;
153
- border-radius: 16px;
154
- }
155
- #user_input {
156
- flex: 1;
157
- padding: 12px;
158
- font-size: 16px;
159
- background: #292929;
160
- border: none;
161
- color: white;
162
- border-radius: 10px;
163
- }
164
- .btn {
165
- background: #4f46e5;
166
- color: white;
167
- border: none;
168
- border-radius: 10px;
169
- padding: 10px 14px;
170
- font-size: 18px;
171
- }
172
- .voice-area {
173
- background: #222;
174
- padding: 14px;
175
- margin-top: 20px;
176
- border-radius: 12px;
177
- display: flex;
178
- flex-direction: column;
179
- gap: 12px;
180
- }
181
- """) as app:
182
  thread_state = gr.State()
 
183
  client_id = gr.State()
184
- voice_visible = gr.State(False)
185
-
186
- gr.Markdown("<h1 style='text-align:center;'>How can I help you today?</h1>")
187
- response = gr.Markdown(elem_classes="markdown-container")
188
-
189
- with gr.Row(elem_classes="input-bar"):
190
- prompt = gr.Textbox(placeholder="Ask a question...", elem_id="user_input", show_label=False)
191
- send = gr.Button("➤", elem_id="send", elem_classes="btn")
192
- mic = gr.Button("🎙", elem_id="mic", elem_classes="btn")
193
-
194
- with gr.Column(visible=False, elem_classes="voice-area") as voice_box:
195
- voice = gr.Audio(label="Tap to Speak", streaming=True, type="numpy")
196
- transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
197
- send_voice = gr.Button("Send Voice")
198
- clear = gr.Button("Clear Transcript")
199
-
200
- send.click(handle_chat, [prompt, thread_state], [response, thread_state])
201
- mic.click(lambda x: not x, voice_visible, voice_visible)
202
- voice_visible.change(fn=None, inputs=voice_visible, outputs=voice_box, show_progress=False)
203
- voice.stream(send_audio, [voice, client_id], transcript, stream_every=0.5)
204
- send_voice.click(feed_transcript, [transcript, thread_state, client_id], [response, thread_state])
205
- clear.click(clear_transcript, [client_id], transcript)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  app.load(fn=create_ws, outputs=[client_id])
207
 
208
  app.launch()
 
 
1
  import gradio as gr
2
  import os, time, re, json, base64, asyncio, threading, uuid, io
3
  import numpy as np
 
17
  WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
18
  connections = {}
19
 
20
+ # WebSocket Client
21
  class WebSocketClient:
22
  def __init__(self, uri, headers, client_id):
23
  self.uri = uri
 
30
 
31
  async def connect(self):
32
  try:
33
+ self.websocket = await connect(self.uri, additional_headers=self.headers)
34
  with open("openai_transcription_settings.json", "r") as f:
35
  await self.websocket.send(f.read())
36
  await asyncio.gather(self.receive_messages(), self.send_audio_chunks())
37
  except Exception as e:
38
+ print(f"🔴 WebSocket Connection Failed: {e}")
39
 
40
  def run(self):
41
  asyncio.set_event_loop(self.loop)
 
70
  if data["type"] == "conversation.item.input_audio_transcription.delta":
71
  self.transcript += data["delta"]
72
 
73
+ # WebSocket Connection Manager
74
  def create_ws():
75
  cid = str(uuid.uuid4())
76
  client = WebSocketClient(WS_URI, HEADERS, cid)
 
85
  connections[cid].enqueue_audio_chunk(sr, arr)
86
  return connections[cid].transcript.strip()
87
 
88
+ def clear_transcript_only(cid):
89
  if cid in connections:
90
  connections[cid].transcript = ""
91
  return ""
92
 
93
+ def clear_chat_only():
94
+ return [], None, None
 
 
 
 
95
 
96
+ # Assistant chat handler
97
+ def handle_chat(user_input, history, thread_id, image_url):
98
  if not OPENAI_API_KEY or not ASSISTANT_ID:
99
+ return "❌ Missing secrets!", history, thread_id, image_url
100
+
101
  try:
102
  if thread_id is None:
103
  thread = client.beta.threads.create()
104
  thread_id = thread.id
105
+
106
+ client.beta.threads.messages.create(thread_id=thread_id, role="user", content=user_input)
107
  run = client.beta.threads.runs.create(thread_id=thread_id, assistant_id=ASSISTANT_ID)
108
+
109
  while True:
110
  status = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
111
  if status.status == "completed":
112
  break
113
  time.sleep(1)
114
+
115
  msgs = client.beta.threads.messages.list(thread_id=thread_id)
116
  for msg in reversed(msgs.data):
117
  if msg.role == "assistant":
118
+ content = msg.content[0].text.value
119
+ history.append((user_input, content))
120
+ match = re.search(
121
+ r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
122
+ content
123
+ )
124
+ if match:
125
+ image_url = match.group(0)
126
+ break
127
+
128
+ return "", history, thread_id, image_url
129
+
130
  except Exception as e:
131
+ return f"❌ {e}", history, thread_id, image_url
132
 
133
+ # Feed transcript as assistant input
134
+ def feed_transcript(transcript, history, thread_id, image_url, cid):
135
  if not transcript.strip():
136
+ return gr.update(), history, thread_id, image_url
137
  if cid in connections:
138
  connections[cid].transcript = ""
139
+ return handle_chat(transcript, history, thread_id, image_url)
140
+
141
+ # Fallback for image display
142
+ def update_image_display(image_url):
143
+ if image_url and isinstance(image_url, str) and image_url.startswith("http"):
144
+ return image_url
145
+ return None
146
+
147
+ # ============ Gradio UI ============
148
+
149
+ with gr.Blocks(theme=gr.themes.Soft()) as app:
150
+ gr.Markdown("# 📄 Document AI Assistant")
151
+
152
+ gr.HTML("""
153
+ <style>
154
+ .big-btn {
155
+ font-size: 18px !important;
156
+ padding: 14px 28px !important;
157
+ border-radius: 8px !important;
158
+ width: 100% !important;
159
+ margin-top: 10px;
160
+ }
161
+ .voice-area {
162
+ padding-top: 12px;
163
+ border-top: 1px solid #444;
164
+ margin-top: 12px;
165
+ }
166
+ </style>
167
+ """)
168
+
169
+ chat_state = gr.State([])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  thread_state = gr.State()
171
+ image_state = gr.State()
172
  client_id = gr.State()
173
+
174
+ with gr.Row(equal_height=True):
175
+ with gr.Column(scale=0.8): # thinner image column
176
+ image_display = gr.Image(label="🖼️ Document", type="filepath", show_download_button=False)
177
+
178
+ with gr.Column(elem_classes="voice-area"):
179
+ gr.Markdown("### 🎙️ Voice Input")
180
+ voice_input = gr.Audio(label="Tap to Record", streaming=True, type="numpy", show_label=True)
181
+ voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
182
+
183
+ with gr.Row():
184
+ voice_send_btn = gr.Button("🟢 Send Voice to Assistant", elem_classes="big-btn")
185
+ clear_transcript_btn = gr.Button("🧹 Clear Transcript", elem_classes="big-btn")
186
+
187
+ with gr.Column(scale=2): # wider chat column
188
+ chat = gr.Chatbot(label="💬 Chat", height=460)
189
+
190
+ with gr.Row():
191
+ user_prompt = gr.Textbox(placeholder="Ask your question...", show_label=False, scale=8)
192
+ send_btn = gr.Button("Send", variant="primary", scale=2)
193
+
194
+ with gr.Row():
195
+ clear_chat_btn = gr.Button("🗑️ Clear Chat", elem_classes="big-btn")
196
+
197
+ # Bindings
198
+ send_btn.click(fn=handle_chat,
199
+ inputs=[user_prompt, chat_state, thread_state, image_state],
200
+ outputs=[user_prompt, chat, thread_state, image_state])
201
+
202
+ voice_input.stream(fn=send_audio,
203
+ inputs=[voice_input, client_id],
204
+ outputs=voice_transcript,
205
+ stream_every=0.5)
206
+
207
+ voice_send_btn.click(fn=feed_transcript,
208
+ inputs=[voice_transcript, chat_state, thread_state, image_state, client_id],
209
+ outputs=[user_prompt, chat, thread_state, image_state])
210
+
211
+ clear_transcript_btn.click(fn=clear_transcript_only,
212
+ inputs=[client_id],
213
+ outputs=voice_transcript)
214
+
215
+ clear_chat_btn.click(fn=clear_chat_only,
216
+ outputs=[chat, thread_state, image_state])
217
+
218
+ image_state.change(fn=update_image_display,
219
+ inputs=image_state,
220
+ outputs=image_display)
221
+
222
  app.load(fn=create_ws, outputs=[client_id])
223
 
224
  app.launch()