seawolf2357 commited on
Commit
c509d9d
·
verified ·
1 Parent(s): a8e8360

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +996 -72
app.py CHANGED
@@ -6,8 +6,8 @@ import os
6
  import numpy as np
7
  import openai
8
  from dotenv import load_dotenv
9
- from fastapi import FastAPI, Request
10
- from fastapi.responses import HTMLResponse, StreamingResponse
11
  from fastrtc import (
12
  AdditionalOutputs,
13
  AsyncStreamHandler,
@@ -23,10 +23,35 @@ import gradio as gr
23
  import io
24
  from scipy import signal
25
  import wave
 
 
 
 
 
 
 
 
 
26
 
27
  load_dotenv()
28
 
29
  SAMPLE_RATE = 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  # Supported languages for OpenAI Realtime API
32
  SUPPORTED_LANGUAGES = {
@@ -61,14 +86,14 @@ SUPPORTED_LANGUAGES = {
61
  "ms": "Bahasa Melayu (Malay)"
62
  }
63
 
64
- # HTML content embedded as a string
65
  HTML_CONTENT = """<!DOCTYPE html>
66
  <html lang="ko">
67
 
68
  <head>
69
  <meta charset="UTF-8">
70
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
71
- <title>Mouth of 'MOUSE'</title>
72
  <style>
73
  :root {
74
  --primary-color: #6f42c1;
@@ -110,6 +135,43 @@ HTML_CONTENT = """<!DOCTYPE html>
110
  flex-shrink: 0;
111
  background-color: var(--card-bg);
112
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  .main-content {
114
  display: flex;
115
  gap: 20px;
@@ -219,7 +281,7 @@ HTML_CONTENT = """<!DOCTYPE html>
219
  .text-input-section {
220
  margin-top: 15px;
221
  }
222
- input[type="text"], textarea {
223
  width: 100%;
224
  background-color: var(--dark-bg);
225
  color: var(--text-color);
@@ -318,11 +380,91 @@ HTML_CONTENT = """<!DOCTYPE html>
318
  flex-shrink: 0;
319
  padding-top: 20px;
320
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  /* Responsive design */
322
  @media (max-width: 1024px) {
323
  .sidebar {
324
  width: 300px;
325
  }
 
 
 
326
  }
327
  @media (max-width: 768px) {
328
  .main-content {
@@ -335,6 +477,9 @@ HTML_CONTENT = """<!DOCTYPE html>
335
  .chat-section {
336
  height: 400px;
337
  }
 
 
 
338
  }
339
  button {
340
  background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
@@ -432,6 +577,10 @@ HTML_CONTENT = """<!DOCTYPE html>
432
  background-color: #ff9800;
433
  color: white;
434
  }
 
 
 
 
435
  .status-indicator {
436
  display: inline-flex;
437
  align-items: center;
@@ -508,7 +657,7 @@ HTML_CONTENT = """<!DOCTYPE html>
508
  <div class="mouse-ears mouse-ear-right"></div>
509
  <div class="mouse-face"></div>
510
  </div>
511
- <h1>MOUSE 음성 챗</h1>
512
  </div>
513
  <div class="status-indicator">
514
  <div id="status-dot" class="status-dot disconnected"></div>
@@ -516,81 +665,218 @@ HTML_CONTENT = """<!DOCTYPE html>
516
  </div>
517
  </div>
518
 
519
- <div class="main-content">
520
- <div class="sidebar">
521
- <div class="settings-section">
522
- <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
523
- <div class="settings-grid">
524
- <div class="setting-item">
525
- <span class="setting-label">웹 검색</span>
526
- <div id="search-toggle" class="toggle-switch">
527
- <div class="toggle-slider"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  </div>
529
  </div>
530
- <div class="setting-item">
531
- <span class="setting-label">다국어 번역 채팅</span>
532
- <select id="language-select">
533
- <option value="">비활성화</option>
534
- <option value="ko">한국어 (Korean)</option>
535
- <option value="en">English</option>
536
- <option value="es">Español (Spanish)</option>
537
- <option value="fr">Français (French)</option>
538
- <option value="de">Deutsch (German)</option>
539
- <option value="it">Italiano (Italian)</option>
540
- <option value="pt">Português (Portuguese)</option>
541
- <option value="ru">Русский (Russian)</option>
542
- <option value="ja">日本語 (Japanese)</option>
543
- <option value="zh">中文 (Chinese)</option>
544
- <option value="ar">العربية (Arabic)</option>
545
- <option value="hi">हिन्दी (Hindi)</option>
546
- <option value="nl">Nederlands (Dutch)</option>
547
- <option value="pl">Polski (Polish)</option>
548
- <option value="tr">Türkçe (Turkish)</option>
549
- <option value="vi">Tiếng Việt (Vietnamese)</option>
550
- <option value="th">ไทย (Thai)</option>
551
- <option value="id">Bahasa Indonesia</option>
552
- <option value="sv">Svenska (Swedish)</option>
553
- <option value="da">Dansk (Danish)</option>
554
- <option value="no">Norsk (Norwegian)</option>
555
- <option value="fi">Suomi (Finnish)</option>
556
- <option value="he">עברית (Hebrew)</option>
557
- <option value="uk">Українська (Ukrainian)</option>
558
- <option value="cs">Čeština (Czech)</option>
559
- <option value="el">Ελληνικά (Greek)</option>
560
- <option value="ro">Română (Romanian)</option>
561
- <option value="hu">Magyar (Hungarian)</option>
562
- <option value="ms">Bahasa Melayu (Malay)</option>
563
- </select>
564
  </div>
565
  </div>
566
- <div class="text-input-section">
567
- <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
568
- <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
569
  </div>
570
  </div>
571
 
572
- <div class="controls">
573
- <button id="start-button">대화 시작</button>
 
 
 
 
 
 
 
 
 
574
  </div>
575
  </div>
576
-
577
- <div class="chat-section">
578
- <div class="chat-container">
579
- <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
580
- <div class="chat-messages" id="chat-messages"></div>
581
- <div class="text-input-section" style="margin-top: 10px;">
582
- <div style="display: flex; gap: 10px;">
583
- <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
584
- <button id="send-button" style="display: none;">전송</button>
585
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  </div>
587
  </div>
588
  </div>
589
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  </div>
591
  <audio id="audio-output"></audio>
592
 
593
  <script>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
594
  let peerConnection;
595
  let webrtc_id;
596
  let webSearchEnabled = false;
@@ -612,6 +898,13 @@ HTML_CONTENT = """<!DOCTYPE html>
612
  let dataChannel = null;
613
  let isVoiceActive = false;
614
 
 
 
 
 
 
 
 
615
  // Web search toggle functionality
616
  searchToggle.addEventListener('click', () => {
617
  webSearchEnabled = !webSearchEnabled;
@@ -708,6 +1001,7 @@ HTML_CONTENT = """<!DOCTYPE html>
708
  isVoiceActive = false;
709
  }
710
  }
 
711
  function updateButtonState() {
712
  const button = document.getElementById('start-button');
713
  if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
@@ -737,6 +1031,7 @@ HTML_CONTENT = """<!DOCTYPE html>
737
  updateStatus('disconnected');
738
  }
739
  }
 
740
  function setupAudioVisualization(stream) {
741
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
742
  analyser = audioContext.createAnalyser();
@@ -771,6 +1066,7 @@ HTML_CONTENT = """<!DOCTYPE html>
771
 
772
  updateAudioLevel();
773
  }
 
774
  function showError(message) {
775
  const toast = document.getElementById('error-toast');
776
  toast.textContent = message;
@@ -780,6 +1076,17 @@ HTML_CONTENT = """<!DOCTYPE html>
780
  toast.style.display = 'none';
781
  }, 5000);
782
  }
 
 
 
 
 
 
 
 
 
 
 
783
  async function setupWebRTC() {
784
  const config = __RTC_CONFIGURATION__;
785
  peerConnection = new RTCPeerConnection(config);
@@ -845,14 +1152,6 @@ HTML_CONTENT = """<!DOCTYPE html>
845
  });
846
  webrtc_id = Math.random().toString(36).substring(7);
847
 
848
- // Log current settings before sending
849
- console.log('Sending offer with settings:', {
850
- webrtc_id: webrtc_id,
851
- web_search_enabled: webSearchEnabled,
852
- target_language: selectedLanguage,
853
- system_prompt: systemPrompt
854
- });
855
-
856
  const response = await fetch('/webrtc/offer', {
857
  method: 'POST',
858
  headers: { 'Content-Type': 'application/json' },
@@ -897,6 +1196,7 @@ HTML_CONTENT = """<!DOCTYPE html>
897
  stop();
898
  }
899
  }
 
900
  function addMessage(role, content) {
901
  const messageDiv = document.createElement('div');
902
  messageDiv.classList.add('message', role);
@@ -986,6 +1286,7 @@ HTML_CONTENT = """<!DOCTYPE html>
986
  webrtc_id = null;
987
  }
988
  }
 
989
  startButton.addEventListener('click', () => {
990
  console.log('clicked');
991
  console.log(peerConnection, peerConnection?.connectionState);
@@ -997,6 +1298,320 @@ HTML_CONTENT = """<!DOCTYPE html>
997
  }
998
  });
999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000
  // Initialize on page load
1001
  window.addEventListener('DOMContentLoaded', () => {
1002
  sendButton.style.display = 'block';
@@ -1006,6 +1621,108 @@ HTML_CONTENT = """<!DOCTYPE html>
1006
 
1007
  </html>"""
1008
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
 
1010
  class BraveSearchClient:
1011
  """Brave Search API client"""
@@ -1056,6 +1773,9 @@ print(f"Search client initialized: {search_client is not None}, API key present:
1056
  # Store connection settings
1057
  connection_settings = {}
1058
 
 
 
 
1059
  # Initialize OpenAI client for text chat
1060
  client = openai.AsyncOpenAI()
1061
 
@@ -1715,6 +2435,210 @@ async def outputs(webrtc_id: str):
1715
  return StreamingResponse(output_stream(), media_type="text/event-stream")
1716
 
1717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1718
  @app.get("/")
1719
  async def index():
1720
  """Serve the HTML page"""
 
6
  import numpy as np
7
  import openai
8
  from dotenv import load_dotenv
9
+ from fastapi import FastAPI, Request, UploadFile, File, Form
10
+ from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
11
  from fastrtc import (
12
  AdditionalOutputs,
13
  AsyncStreamHandler,
 
23
  import io
24
  from scipy import signal
25
  import wave
26
+ import torch
27
+ from transformers import pipeline
28
+ import tempfile
29
+ import subprocess
30
+ import pdfplumber
31
+ import scipy.signal as sps
32
+ from datetime import datetime
33
+ from zoneinfo import ZoneInfo
34
+ import concurrent.futures
35
 
36
  load_dotenv()
37
 
38
  SAMPLE_RATE = 24000
39
+ WHISPER_SAMPLE_RATE = 16000
40
+ SEOUL_TZ = ZoneInfo("Asia/Seoul")
41
+
42
+ # Whisper model settings
43
+ WHISPER_MODEL_NAME = "openai/whisper-large-v3-turbo"
44
+ WHISPER_BATCH_SIZE = 8
45
+
46
+ # Real-time segmentation parameters
47
+ MIN_SEG_SEC = 10
48
+ MAX_SEG_SEC = 15
49
+ SILENCE_SEC = 0.6
50
+ SILENCE_THRESH = 1e-4
51
+
52
+ # CPU-side pool for Whisper tasks
53
+ whisper_executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
54
+ whisper_futures_queue: list[concurrent.futures.Future] = []
55
 
56
  # Supported languages for OpenAI Realtime API
57
  SUPPORTED_LANGUAGES = {
 
86
  "ms": "Bahasa Melayu (Malay)"
87
  }
88
 
89
+ # HTML content embedded as a string (extended with new tabs)
90
  HTML_CONTENT = """<!DOCTYPE html>
91
  <html lang="ko">
92
 
93
  <head>
94
  <meta charset="UTF-8">
95
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
96
+ <title>Mouth of 'MOUSE' - Extended</title>
97
  <style>
98
  :root {
99
  --primary-color: #6f42c1;
 
135
  flex-shrink: 0;
136
  background-color: var(--card-bg);
137
  }
138
+ .tabs-container {
139
+ display: flex;
140
+ gap: 10px;
141
+ margin-bottom: 20px;
142
+ border-bottom: 2px solid var(--border-color);
143
+ padding-bottom: 10px;
144
+ overflow-x: auto;
145
+ scrollbar-width: thin;
146
+ scrollbar-color: var(--primary-color) var(--card-bg);
147
+ }
148
+ .tab-button {
149
+ padding: 10px 20px;
150
+ background-color: var(--card-bg);
151
+ color: var(--text-color);
152
+ border: 1px solid var(--border-color);
153
+ border-radius: 8px 8px 0 0;
154
+ cursor: pointer;
155
+ transition: all 0.3s;
156
+ white-space: nowrap;
157
+ font-size: 14px;
158
+ }
159
+ .tab-button:hover {
160
+ background-color: var(--secondary-color);
161
+ }
162
+ .tab-button.active {
163
+ background-color: var(--primary-color);
164
+ border-bottom: 2px solid var(--primary-color);
165
+ }
166
+ .tab-content {
167
+ display: none;
168
+ flex-grow: 1;
169
+ overflow: hidden;
170
+ }
171
+ .tab-content.active {
172
+ display: flex;
173
+ flex-direction: column;
174
+ }
175
  .main-content {
176
  display: flex;
177
  gap: 20px;
 
281
  .text-input-section {
282
  margin-top: 15px;
283
  }
284
+ input[type="text"], input[type="file"], textarea {
285
  width: 100%;
286
  background-color: var(--dark-bg);
287
  color: var(--text-color);
 
380
  flex-shrink: 0;
381
  padding-top: 20px;
382
  }
383
+ /* Whisper Tab Styles */
384
+ .whisper-container {
385
+ padding: 20px;
386
+ background-color: var(--card-bg);
387
+ border-radius: 12px;
388
+ border: 1px solid var(--border-color);
389
+ margin: 20px auto;
390
+ max-width: 800px;
391
+ width: 100%;
392
+ }
393
+ .whisper-input-section {
394
+ margin-bottom: 20px;
395
+ }
396
+ .whisper-output-section {
397
+ display: grid;
398
+ grid-template-columns: 1fr 1fr;
399
+ gap: 20px;
400
+ margin-top: 20px;
401
+ }
402
+ .whisper-output {
403
+ background-color: var(--dark-bg);
404
+ padding: 15px;
405
+ border-radius: 8px;
406
+ border: 1px solid var(--border-color);
407
+ max-height: 300px;
408
+ overflow-y: auto;
409
+ }
410
+ .whisper-output h4 {
411
+ margin-top: 0;
412
+ color: var(--primary-color);
413
+ }
414
+ .file-upload-area {
415
+ border: 2px dashed var(--border-color);
416
+ border-radius: 8px;
417
+ padding: 30px;
418
+ text-align: center;
419
+ cursor: pointer;
420
+ transition: all 0.3s;
421
+ }
422
+ .file-upload-area:hover {
423
+ border-color: var(--primary-color);
424
+ background-color: rgba(111, 66, 193, 0.1);
425
+ }
426
+ .file-upload-area.drag-over {
427
+ border-color: var(--primary-color);
428
+ background-color: rgba(111, 66, 193, 0.2);
429
+ }
430
+ .realtime-output {
431
+ background-color: var(--dark-bg);
432
+ padding: 20px;
433
+ border-radius: 8px;
434
+ margin-top: 20px;
435
+ min-height: 200px;
436
+ max-height: 400px;
437
+ overflow-y: auto;
438
+ }
439
+ .recording-indicator {
440
+ display: inline-flex;
441
+ align-items: center;
442
+ gap: 10px;
443
+ padding: 10px 20px;
444
+ background-color: #f44336;
445
+ color: white;
446
+ border-radius: 20px;
447
+ animation: pulse 1.5s infinite;
448
+ }
449
+ .recording-dot {
450
+ width: 10px;
451
+ height: 10px;
452
+ background-color: white;
453
+ border-radius: 50%;
454
+ animation: blink 1s infinite;
455
+ }
456
+ @keyframes blink {
457
+ 0%, 50% { opacity: 1; }
458
+ 51%, 100% { opacity: 0; }
459
+ }
460
  /* Responsive design */
461
  @media (max-width: 1024px) {
462
  .sidebar {
463
  width: 300px;
464
  }
465
+ .whisper-output-section {
466
+ grid-template-columns: 1fr;
467
+ }
468
  }
469
  @media (max-width: 768px) {
470
  .main-content {
 
477
  .chat-section {
478
  height: 400px;
479
  }
480
+ .tabs-container {
481
+ flex-wrap: wrap;
482
+ }
483
  }
484
  button {
485
  background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
 
577
  background-color: #ff9800;
578
  color: white;
579
  }
580
+ .toast.success {
581
+ background-color: #4caf50;
582
+ color: white;
583
+ }
584
  .status-indicator {
585
  display: inline-flex;
586
  align-items: center;
 
657
  <div class="mouse-ears mouse-ear-right"></div>
658
  <div class="mouse-face"></div>
659
  </div>
660
+ <h1>MOUSE Extended</h1>
661
  </div>
662
  <div class="status-indicator">
663
  <div id="status-dot" class="status-dot disconnected"></div>
 
665
  </div>
666
  </div>
667
 
668
+ <div class="tabs-container">
669
+ <button class="tab-button active" onclick="switchTab('voice-chat')">음성 채팅</button>
670
+ <button class="tab-button" onclick="switchTab('mic-whisper')">마이크 전사</button>
671
+ <button class="tab-button" onclick="switchTab('audio-whisper')">오디오 파일</button>
672
+ <button class="tab-button" onclick="switchTab('video-whisper')">비디오 파일</button>
673
+ <button class="tab-button" onclick="switchTab('pdf-whisper')">PDF 번역</button>
674
+ <button class="tab-button" onclick="switchTab('realtime-whisper')">실시간 통역</button>
675
+ </div>
676
+
677
+ <!-- Voice Chat Tab (Original) -->
678
+ <div id="voice-chat" class="tab-content active">
679
+ <div class="main-content">
680
+ <div class="sidebar">
681
+ <div class="settings-section">
682
+ <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
683
+ <div class="settings-grid">
684
+ <div class="setting-item">
685
+ <span class="setting-label">웹 검색</span>
686
+ <div id="search-toggle" class="toggle-switch">
687
+ <div class="toggle-slider"></div>
688
+ </div>
689
+ </div>
690
+ <div class="setting-item">
691
+ <span class="setting-label">다국어 번역 채팅</span>
692
+ <select id="language-select">
693
+ <option value="">비활성화</option>
694
+ <option value="ko">한국어 (Korean)</option>
695
+ <option value="en">English</option>
696
+ <option value="es">Español (Spanish)</option>
697
+ <option value="fr">Français (French)</option>
698
+ <option value="de">Deutsch (German)</option>
699
+ <option value="it">Italiano (Italian)</option>
700
+ <option value="pt">Português (Portuguese)</option>
701
+ <option value="ru">Русский (Russian)</option>
702
+ <option value="ja">日本語 (Japanese)</option>
703
+ <option value="zh">中文 (Chinese)</option>
704
+ <option value="ar">العربية (Arabic)</option>
705
+ <option value="hi">हिन्दी (Hindi)</option>
706
+ <option value="nl">Nederlands (Dutch)</option>
707
+ <option value="pl">Polski (Polish)</option>
708
+ <option value="tr">Türkçe (Turkish)</option>
709
+ <option value="vi">Tiếng Việt (Vietnamese)</option>
710
+ <option value="th">ไทย (Thai)</option>
711
+ <option value="id">Bahasa Indonesia</option>
712
+ <option value="sv">Svenska (Swedish)</option>
713
+ <option value="da">Dansk (Danish)</option>
714
+ <option value="no">Norsk (Norwegian)</option>
715
+ <option value="fi">Suomi (Finnish)</option>
716
+ <option value="he">עברית (Hebrew)</option>
717
+ <option value="uk">Українська (Ukrainian)</option>
718
+ <option value="cs">Čeština (Czech)</option>
719
+ <option value="el">Ελληνικά (Greek)</option>
720
+ <option value="ro">Română (Romanian)</option>
721
+ <option value="hu">Magyar (Hungarian)</option>
722
+ <option value="ms">Bahasa Melayu (Malay)</option>
723
+ </select>
724
  </div>
725
  </div>
726
+ <div class="text-input-section">
727
+ <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
728
+ <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
729
  </div>
730
  </div>
731
+
732
+ <div class="controls">
733
+ <button id="start-button">대화 시작</button>
734
  </div>
735
  </div>
736
 
737
+ <div class="chat-section">
738
+ <div class="chat-container">
739
+ <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
740
+ <div class="chat-messages" id="chat-messages"></div>
741
+ <div class="text-input-section" style="margin-top: 10px;">
742
+ <div style="display: flex; gap: 10px;">
743
+ <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
744
+ <button id="send-button" style="display: none;">전송</button>
745
+ </div>
746
+ </div>
747
+ </div>
748
  </div>
749
  </div>
750
+ </div>
751
+
752
+ <!-- Microphone Whisper Tab -->
753
+ <div id="mic-whisper" class="tab-content">
754
+ <div class="whisper-container">
755
+ <h3>마이크 녹음 전사 및 4개 언어 번역</h3>
756
+ <div class="whisper-input-section">
757
+ <button id="mic-record-btn" onclick="toggleMicRecording()">녹음 시작</button>
758
+ <div id="mic-status" style="margin-top: 10px;"></div>
759
+ </div>
760
+ <div class="whisper-output-section">
761
+ <div class="whisper-output">
762
+ <h4>원문</h4>
763
+ <div id="mic-original"></div>
764
+ </div>
765
+ <div class="whisper-output">
766
+ <h4>번역 (EN/ZH/TH/RU)</h4>
767
+ <div id="mic-translation"></div>
768
+ </div>
769
+ </div>
770
+ </div>
771
+ </div>
772
+
773
+ <!-- Audio File Whisper Tab -->
774
+ <div id="audio-whisper" class="tab-content">
775
+ <div class="whisper-container">
776
+ <h3>오디오 파일 → 전사 및 4개 언어 번역</h3>
777
+ <div class="whisper-input-section">
778
+ <div class="file-upload-area" id="audio-upload-area">
779
+ <p>오디오 파일을 드래그하거나 클릭하여 업로드</p>
780
+ <input type="file" id="audio-file-input" accept="audio/*" style="display: none;">
781
+ </div>
782
+ </div>
783
+ <div class="whisper-output-section">
784
+ <div class="whisper-output">
785
+ <h4>원문</h4>
786
+ <div id="audio-original"></div>
787
+ </div>
788
+ <div class="whisper-output">
789
+ <h4>번역 (EN/ZH/TH/RU)</h4>
790
+ <div id="audio-translation"></div>
791
  </div>
792
  </div>
793
  </div>
794
  </div>
795
+
796
+ <!-- Video File Whisper Tab -->
797
+ <div id="video-whisper" class="tab-content">
798
+ <div class="whisper-container">
799
+ <h3>비디오 파일 → 오디오 추출 → 전사 및 4개 언어 번역</h3>
800
+ <div class="whisper-input-section">
801
+ <div class="file-upload-area" id="video-upload-area">
802
+ <p>비디오 파일을 드래그하거나 클릭하여 업로드</p>
803
+ <input type="file" id="video-file-input" accept="video/*" style="display: none;">
804
+ </div>
805
+ </div>
806
+ <div class="whisper-output-section">
807
+ <div class="whisper-output">
808
+ <h4>원문</h4>
809
+ <div id="video-original"></div>
810
+ </div>
811
+ <div class="whisper-output">
812
+ <h4>번역 (EN/ZH/TH/RU)</h4>
813
+ <div id="video-translation"></div>
814
+ </div>
815
+ </div>
816
+ </div>
817
+ </div>
818
+
819
+ <!-- PDF Whisper Tab -->
820
+ <div id="pdf-whisper" class="tab-content">
821
+ <div class="whisper-container">
822
+ <h3>PDF 파일 → 텍스트 추출 → 4개 언어 번역</h3>
823
+ <div class="whisper-input-section">
824
+ <div class="file-upload-area" id="pdf-upload-area">
825
+ <p>PDF 파일을 드래그하거나 클릭하여 업로드</p>
826
+ <input type="file" id="pdf-file-input" accept=".pdf" style="display: none;">
827
+ </div>
828
+ <div class="setting-item" style="margin-top: 15px;">
829
+ <span class="setting-label">최대 페이지:</span>
830
+ <input type="number" id="pdf-max-pages" value="10" min="1" max="50" style="width: 80px;">
831
+ </div>
832
+ </div>
833
+ <div class="whisper-output-section">
834
+ <div class="whisper-output">
835
+ <h4>추출된 텍스트</h4>
836
+ <div id="pdf-original"></div>
837
+ </div>
838
+ <div class="whisper-output">
839
+ <h4>번역 (EN/ZH/TH/RU)</h4>
840
+ <div id="pdf-translation"></div>
841
+ </div>
842
+ </div>
843
+ </div>
844
+ </div>
845
+
846
+ <!-- Realtime Whisper Tab -->
847
+ <div id="realtime-whisper" class="tab-content">
848
+ <div class="whisper-container">
849
+ <h3>실시간 통역 (Korean → EN/ZH/TH/RU)</h3>
850
+ <p>10-15초 문장 단위로 자동 전환 — 최신 내용이 위에 표시됩니다.</p>
851
+ <div class="whisper-input-section">
852
+ <button id="realtime-start-btn" onclick="toggleRealtimeRecording()">실시간 통역 시작</button>
853
+ <div id="realtime-status" style="margin-top: 10px;"></div>
854
+ </div>
855
+ <div class="realtime-output" id="realtime-output"></div>
856
+ </div>
857
+ </div>
858
  </div>
859
  <audio id="audio-output"></audio>
860
 
861
  <script>
862
+ // Tab switching functionality
863
+ function switchTab(tabName) {
864
+ // Hide all tabs
865
+ const tabs = document.querySelectorAll('.tab-content');
866
+ tabs.forEach(tab => tab.classList.remove('active'));
867
+
868
+ // Remove active class from all buttons
869
+ const buttons = document.querySelectorAll('.tab-button');
870
+ buttons.forEach(btn => btn.classList.remove('active'));
871
+
872
+ // Show selected tab
873
+ document.getElementById(tabName).classList.add('active');
874
+
875
+ // Add active class to clicked button
876
+ event.target.classList.add('active');
877
+ }
878
+
879
+ // Original voice chat variables and functions
880
  let peerConnection;
881
  let webrtc_id;
882
  let webSearchEnabled = false;
 
898
  let dataChannel = null;
899
  let isVoiceActive = false;
900
 
901
+ // Whisper-related variables
902
+ let micRecorder = null;
903
+ let isRecording = false;
904
+ let realtimeRecorder = null;
905
+ let isRealtimeRecording = false;
906
+ let realtimeStream = null;
907
+
908
  // Web search toggle functionality
909
  searchToggle.addEventListener('click', () => {
910
  webSearchEnabled = !webSearchEnabled;
 
1001
  isVoiceActive = false;
1002
  }
1003
  }
1004
+
1005
  function updateButtonState() {
1006
  const button = document.getElementById('start-button');
1007
  if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
 
1031
  updateStatus('disconnected');
1032
  }
1033
  }
1034
+
1035
  function setupAudioVisualization(stream) {
1036
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
1037
  analyser = audioContext.createAnalyser();
 
1066
 
1067
  updateAudioLevel();
1068
  }
1069
+
1070
  function showError(message) {
1071
  const toast = document.getElementById('error-toast');
1072
  toast.textContent = message;
 
1076
  toast.style.display = 'none';
1077
  }, 5000);
1078
  }
1079
+
1080
+ function showSuccess(message) {
1081
+ const toast = document.getElementById('error-toast');
1082
+ toast.textContent = message;
1083
+ toast.className = 'toast success';
1084
+ toast.style.display = 'block';
1085
+ setTimeout(() => {
1086
+ toast.style.display = 'none';
1087
+ }, 3000);
1088
+ }
1089
+
1090
  async function setupWebRTC() {
1091
  const config = __RTC_CONFIGURATION__;
1092
  peerConnection = new RTCPeerConnection(config);
 
1152
  });
1153
  webrtc_id = Math.random().toString(36).substring(7);
1154
 
 
 
 
 
 
 
 
 
1155
  const response = await fetch('/webrtc/offer', {
1156
  method: 'POST',
1157
  headers: { 'Content-Type': 'application/json' },
 
1196
  stop();
1197
  }
1198
  }
1199
+
1200
  function addMessage(role, content) {
1201
  const messageDiv = document.createElement('div');
1202
  messageDiv.classList.add('message', role);
 
1286
  webrtc_id = null;
1287
  }
1288
  }
1289
+
1290
  startButton.addEventListener('click', () => {
1291
  console.log('clicked');
1292
  console.log(peerConnection, peerConnection?.connectionState);
 
1298
  }
1299
  });
1300
 
1301
+ // Whisper Tab Functions
1302
+
1303
+ // Microphone recording
1304
+ async function toggleMicRecording() {
1305
+ const btn = document.getElementById('mic-record-btn');
1306
+ const status = document.getElementById('mic-status');
1307
+
1308
+ if (!isRecording) {
1309
+ try {
1310
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
1311
+ micRecorder = new MediaRecorder(stream);
1312
+ const chunks = [];
1313
+
1314
+ micRecorder.ondataavailable = (e) => chunks.push(e.data);
1315
+ micRecorder.onstop = async () => {
1316
+ const blob = new Blob(chunks, { type: 'audio/webm' });
1317
+ await processAudioBlob(blob, 'mic');
1318
+ stream.getTracks().forEach(track => track.stop());
1319
+ };
1320
+
1321
+ micRecorder.start();
1322
+ isRecording = true;
1323
+ btn.textContent = '녹음 중지';
1324
+ status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>녹음 중...</div>';
1325
+ } catch (err) {
1326
+ showError('마이크 접근 권한이 필요합니다.');
1327
+ }
1328
+ } else {
1329
+ micRecorder.stop();
1330
+ isRecording = false;
1331
+ btn.textContent = '녹음 시작';
1332
+ status.textContent = '처리 중...';
1333
+ }
1334
+ }
1335
+
1336
+ // File upload handlers
1337
+ document.getElementById('audio-upload-area').addEventListener('click', () => {
1338
+ document.getElementById('audio-file-input').click();
1339
+ });
1340
+
1341
+ document.getElementById('video-upload-area').addEventListener('click', () => {
1342
+ document.getElementById('video-file-input').click();
1343
+ });
1344
+
1345
+ document.getElementById('pdf-upload-area').addEventListener('click', () => {
1346
+ document.getElementById('pdf-file-input').click();
1347
+ });
1348
+
1349
+ // Drag and drop handlers
1350
+ ['audio', 'video', 'pdf'].forEach(type => {
1351
+ const area = document.getElementById(`${type}-upload-area`);
1352
+
1353
+ area.addEventListener('dragover', (e) => {
1354
+ e.preventDefault();
1355
+ area.classList.add('drag-over');
1356
+ });
1357
+
1358
+ area.addEventListener('dragleave', () => {
1359
+ area.classList.remove('drag-over');
1360
+ });
1361
+
1362
+ area.addEventListener('drop', (e) => {
1363
+ e.preventDefault();
1364
+ area.classList.remove('drag-over');
1365
+ const file = e.dataTransfer.files[0];
1366
+ if (file) {
1367
+ if (type === 'audio') processAudioFile(file);
1368
+ else if (type === 'video') processVideoFile(file);
1369
+ else if (type === 'pdf') processPDFFile(file);
1370
+ }
1371
+ });
1372
+ });
1373
+
1374
+ // File input change handlers
1375
+ document.getElementById('audio-file-input').addEventListener('change', (e) => {
1376
+ if (e.target.files[0]) processAudioFile(e.target.files[0]);
1377
+ });
1378
+
1379
+ document.getElementById('video-file-input').addEventListener('change', (e) => {
1380
+ if (e.target.files[0]) processVideoFile(e.target.files[0]);
1381
+ });
1382
+
1383
+ document.getElementById('pdf-file-input').addEventListener('change', (e) => {
1384
+ if (e.target.files[0]) processPDFFile(e.target.files[0]);
1385
+ });
1386
+
1387
+ // Process audio blob (for microphone recording)
1388
+ async function processAudioBlob(blob, type) {
1389
+ const formData = new FormData();
1390
+ formData.append('audio', blob, 'recording.webm');
1391
+
1392
+ try {
1393
+ const response = await fetch('/whisper/transcribe', {
1394
+ method: 'POST',
1395
+ body: formData
1396
+ });
1397
+
1398
+ const result = await response.json();
1399
+ if (result.error) {
1400
+ showError(result.error);
1401
+ } else {
1402
+ document.getElementById(`${type}-original`).textContent = result.text;
1403
+ document.getElementById(`${type}-translation`).innerHTML = result.translation.replace(/\n/g, '<br>');
1404
+ document.getElementById(`${type}-status`).textContent = '';
1405
+ }
1406
+ } catch (error) {
1407
+ showError('처리 중 오류가 발생했습니다.');
1408
+ document.getElementById(`${type}-status`).textContent = '';
1409
+ }
1410
+ }
1411
+
1412
+ // Process audio file
1413
+ async function processAudioFile(file) {
1414
+ const formData = new FormData();
1415
+ formData.append('audio', file);
1416
+
1417
+ try {
1418
+ showSuccess('오디오 파일 처리 중...');
1419
+ const response = await fetch('/whisper/audio', {
1420
+ method: 'POST',
1421
+ body: formData
1422
+ });
1423
+
1424
+ const result = await response.json();
1425
+ if (result.error) {
1426
+ showError(result.error);
1427
+ } else {
1428
+ document.getElementById('audio-original').textContent = result.text;
1429
+ document.getElementById('audio-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1430
+ }
1431
+ } catch (error) {
1432
+ showError('오디오 파일 처리 중 오류가 발생했습니다.');
1433
+ }
1434
+ }
1435
+
1436
+ // Process video file
1437
+ async function processVideoFile(file) {
1438
+ const formData = new FormData();
1439
+ formData.append('video', file);
1440
+
1441
+ try {
1442
+ showSuccess('비디오 파일 처리 중... (시간이 걸릴 수 있습니다)');
1443
+ const response = await fetch('/whisper/video', {
1444
+ method: 'POST',
1445
+ body: formData
1446
+ });
1447
+
1448
+ const result = await response.json();
1449
+ if (result.error) {
1450
+ showError(result.error);
1451
+ } else {
1452
+ document.getElementById('video-original').textContent = result.text;
1453
+ document.getElementById('video-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1454
+ }
1455
+ } catch (error) {
1456
+ showError('비디오 파일 처리 중 오류가 발생했습니다.');
1457
+ }
1458
+ }
1459
+
1460
+ // Process PDF file
1461
+ async function processPDFFile(file) {
1462
+ const formData = new FormData();
1463
+ formData.append('pdf', file);
1464
+ formData.append('max_pages', document.getElementById('pdf-max-pages').value);
1465
+
1466
+ try {
1467
+ showSuccess('PDF 파일 처리 중...');
1468
+ const response = await fetch('/whisper/pdf', {
1469
+ method: 'POST',
1470
+ body: formData
1471
+ });
1472
+
1473
+ const result = await response.json();
1474
+ if (result.error) {
1475
+ showError(result.error);
1476
+ } else {
1477
+ document.getElementById('pdf-original').textContent = result.text;
1478
+ document.getElementById('pdf-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1479
+ }
1480
+ } catch (error) {
1481
+ showError('PDF 파일 처리 중 오류가 발생했습니다.');
1482
+ }
1483
+ }
1484
+
1485
+ // Realtime recording
1486
+ let realtimeEventSource = null;
1487
+
1488
+ async function toggleRealtimeRecording() {
1489
+ const btn = document.getElementById('realtime-start-btn');
1490
+ const status = document.getElementById('realtime-status');
1491
+ const output = document.getElementById('realtime-output');
1492
+
1493
+ if (!isRealtimeRecording) {
1494
+ try {
1495
+ const response = await fetch('/whisper/realtime/start', {
1496
+ method: 'POST'
1497
+ });
1498
+
1499
+ const result = await response.json();
1500
+ if (result.session_id) {
1501
+ // Start listening for results
1502
+ realtimeEventSource = new EventSource(`/whisper/realtime/stream?session_id=${result.session_id}`);
1503
+
1504
+ realtimeEventSource.onmessage = (event) => {
1505
+ const data = JSON.parse(event.data);
1506
+ if (data.timestamp) {
1507
+ const segment = document.createElement('div');
1508
+ segment.style.borderBottom = '1px solid #333';
1509
+ segment.style.paddingBottom = '15px';
1510
+ segment.style.marginBottom = '15px';
1511
+ segment.innerHTML = `
1512
+ <strong>[${data.timestamp}]</strong><br>
1513
+ <strong>[KO]</strong> ${data.text}<br>
1514
+ ${data.translation.replace(/\n/g, '<br>')}
1515
+ `;
1516
+ output.insertBefore(segment, output.firstChild);
1517
+ }
1518
+ };
1519
+
1520
+ realtimeEventSource.onerror = () => {
1521
+ stopRealtimeRecording();
1522
+ showError('실시간 통역 연결이 끊어졌습니다.');
1523
+ };
1524
+
1525
+ // Start recording
1526
+ realtimeStream = await navigator.mediaDevices.getUserMedia({ audio: true });
1527
+
1528
+ isRealtimeRecording = true;
1529
+ btn.textContent = '통역 중지';
1530
+ status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>실시간 통역 중...</div>';
1531
+
1532
+ // Send audio data periodically
1533
+ startRealtimeAudioCapture(result.session_id);
1534
+ }
1535
+ } catch (err) {
1536
+ showError('실시간 통역을 시작할 수 없습니다.');
1537
+ }
1538
+ } else {
1539
+ stopRealtimeRecording();
1540
+ }
1541
+ }
1542
+
1543
+ function stopRealtimeRecording() {
1544
+ if (realtimeEventSource) {
1545
+ realtimeEventSource.close();
1546
+ realtimeEventSource = null;
1547
+ }
1548
+
1549
+ if (realtimeStream) {
1550
+ realtimeStream.getTracks().forEach(track => track.stop());
1551
+ realtimeStream = null;
1552
+ }
1553
+
1554
+ isRealtimeRecording = false;
1555
+ document.getElementById('realtime-start-btn').textContent = '실시간 통역 시작';
1556
+ document.getElementById('realtime-status').textContent = '';
1557
+ }
1558
+
1559
+ async function startRealtimeAudioCapture(sessionId) {
1560
+ const audioContext = new AudioContext({ sampleRate: 16000 });
1561
+ const source = audioContext.createMediaStreamSource(realtimeStream);
1562
+ const processor = audioContext.createScriptProcessor(4096, 1, 1);
1563
+
1564
+ let audioBuffer = [];
1565
+ let silenceFrames = 0;
1566
+ const SILENCE_THRESHOLD = 0.01;
1567
+ const MIN_BUFFER_SIZE = 16000 * 2; // 2 seconds minimum
1568
+ const MAX_BUFFER_SIZE = 16000 * 15; // 15 seconds maximum
1569
+
1570
+ processor.onaudioprocess = async (e) => {
1571
+ if (!isRealtimeRecording) return;
1572
+
1573
+ const inputData = e.inputBuffer.getChannelData(0);
1574
+ audioBuffer.push(...inputData);
1575
+
1576
+ // Simple voice activity detection
1577
+ const rms = Math.sqrt(inputData.reduce((sum, val) => sum + val * val, 0) / inputData.length);
1578
+
1579
+ if (rms < SILENCE_THRESHOLD) {
1580
+ silenceFrames++;
1581
+ } else {
1582
+ silenceFrames = 0;
1583
+ }
1584
+
1585
+ // Send audio when we have enough silence or max buffer reached
1586
+ if ((silenceFrames > 20 && audioBuffer.length > MIN_BUFFER_SIZE) ||
1587
+ audioBuffer.length > MAX_BUFFER_SIZE) {
1588
+
1589
+ const audioData = new Float32Array(audioBuffer);
1590
+ audioBuffer = [];
1591
+ silenceFrames = 0;
1592
+
1593
+ // Convert to 16-bit PCM
1594
+ const pcmData = new Int16Array(audioData.length);
1595
+ for (let i = 0; i < audioData.length; i++) {
1596
+ pcmData[i] = Math.max(-32768, Math.min(32767, audioData[i] * 32768));
1597
+ }
1598
+
1599
+ // Send to server
1600
+ const formData = new FormData();
1601
+ formData.append('audio', new Blob([pcmData.buffer], { type: 'audio/pcm' }));
1602
+ formData.append('session_id', sessionId);
1603
+
1604
+ fetch('/whisper/realtime/process', {
1605
+ method: 'POST',
1606
+ body: formData
1607
+ }).catch(err => console.error('Error sending audio:', err));
1608
+ }
1609
+ };
1610
+
1611
+ source.connect(processor);
1612
+ processor.connect(audioContext.destination);
1613
+ }
1614
+
1615
  // Initialize on page load
1616
  window.addEventListener('DOMContentLoaded', () => {
1617
  sendButton.style.display = 'block';
 
1621
 
1622
  </html>"""
1623
 
1624
# Whisper model loader
def _get_whisper_pipe():
    """Return the process-wide Whisper ASR pipeline, creating it on first use.

    The pipeline is memoized as an attribute on the function itself so every
    caller shares one (expensive) model instance.
    """
    cached = getattr(_get_whisper_pipe, "pipe", None)
    if cached is None:
        # Prefer the first CUDA device when available, else run on CPU.
        target_device = 0 if torch.cuda.is_available() else "cpu"
        cached = pipeline(
            task="automatic-speech-recognition",
            model=WHISPER_MODEL_NAME,
            chunk_length_s=30,
            device=target_device,
        )
        _get_whisper_pipe.pipe = cached
    return cached
1636
+
1637
# Audio helpers for Whisper
def _ensure_16k_whisper(y: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
    """Return *y* resampled to Whisper's 16 kHz rate (no-op when already 16 kHz)."""
    if sr == WHISPER_SAMPLE_RATE:
        return y.astype(np.float32), WHISPER_SAMPLE_RATE
    # Polyphase resampling with the smallest integer up/down factors.
    common = np.gcd(sr, WHISPER_SAMPLE_RATE)
    resampled = sps.resample_poly(y, WHISPER_SAMPLE_RATE // common, sr // common)
    return resampled.astype(np.float32), WHISPER_SAMPLE_RATE
1645
+
1646
def _should_flush_whisper(buffer: np.ndarray, sr: int) -> bool:
    """Decide whether the rolling audio buffer is ready to be transcribed.

    Flush once the segment is at least MIN_SEG_SEC long and either the trailing
    SILENCE_SEC window is quiet (speaker paused) or MAX_SEG_SEC is reached.
    """
    duration = len(buffer) / sr
    if duration < MIN_SEG_SEC:
        return False
    if duration >= MAX_SEG_SEC:
        return True
    tail = buffer[-int(SILENCE_SEC * sr):]
    # An empty tail counts as non-silent (rms sentinel 1.0), as before.
    rms = np.sqrt(np.mean(tail ** 2)) if len(tail) else 1.0
    return rms < SILENCE_THRESH
1656
+
1657
+ # Translation helper
1658
+ def _translate_text_4langs(text: str) -> str:
1659
+ """Translate text to 4 languages using OpenAI"""
1660
+ try:
1661
+ client = openai.OpenAI()
1662
+ prompt = (
1663
+ "Translate the following text into English (EN), Chinese (ZH), Thai (TH) and Russian (RU).\n"
1664
+ "Return ONLY the translations in this format (one per line):\n"
1665
+ "EN: <english>\nZH: <chinese>\nTH: <thai>\nRU: <russian>\n\n"
1666
+ f"Text: {text}"
1667
+ )
1668
+
1669
+ response = client.chat.completions.create(
1670
+ model="gpt-4o-mini",
1671
+ messages=[
1672
+ {"role": "system", "content": "You are a professional translator."},
1673
+ {"role": "user", "content": prompt}
1674
+ ],
1675
+ temperature=0.7,
1676
+ max_tokens=512
1677
+ )
1678
+
1679
+ return response.choices[0].message.content.strip()
1680
+ except Exception as e:
1681
+ print(f"Translation error: {e}")
1682
+ return f"Translation error: {str(e)}"
1683
+
1684
+ # ffmpeg check
1685
+ def _check_ffmpeg() -> bool:
1686
+ try:
1687
+ subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
1688
+ return True
1689
+ except Exception:
1690
+ return False
1691
+
1692
+ _HAS_FFMPEG = _check_ffmpeg()
1693
+
1694
+ def extract_audio_from_video(video_path: str) -> str:
1695
+ """Extract audio from video file"""
1696
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
1697
+ tmp.close()
1698
+
1699
+ if _HAS_FFMPEG:
1700
+ cmd = [
1701
+ "ffmpeg", "-i", video_path, "-vn",
1702
+ "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", tmp.name
1703
+ ]
1704
+ result = subprocess.run(cmd, capture_output=True)
1705
+ if result.returncode != 0:
1706
+ os.unlink(tmp.name)
1707
+ raise RuntimeError("ffmpeg error extracting audio")
1708
+ return tmp.name
1709
+ else:
1710
+ raise RuntimeError("ffmpeg is required for video processing")
1711
+
1712
# GPU workers for Whisper
def gpu_transcribe_whisper(audio_path: str) -> str:
    """Run Whisper ASR over the audio file at *audio_path*; return the stripped text."""
    asr = _get_whisper_pipe()
    output = asr(
        audio_path,
        batch_size=WHISPER_BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return output["text"].strip()
1718
+
1719
def gpu_asr_translate_whisper(audio: np.ndarray, sr: int) -> str:
    """Transcribe one realtime audio segment and format a timestamped HTML line.

    Output layout: "[timestamp]<br>[KO] <transcript><br><translations><br>----<br>"
    (consumed by the realtime SSE endpoint).
    """
    asr = _get_whisper_pipe()
    korean = asr({"array": audio, "sampling_rate": sr}, batch_size=WHISPER_BATCH_SIZE)["text"].strip()
    translated = _translate_text_4langs(korean).replace("\n", "<br>")
    stamp = datetime.now(SEOUL_TZ).strftime("%Y-%m-%d %H:%M:%S")
    divider = "-" * 40
    return f"[{stamp}]<br>[KO] {korean}<br>{translated}<br>{divider}<br>"
1726
 
1727
  class BraveSearchClient:
1728
  """Brave Search API client"""
 
1773
# Store connection settings
# NOTE(review): presumably keyed by a per-client connection id set elsewhere
# in this file — confirm against the connection handlers.
connection_settings = {}

# Store realtime sessions
# session_id (random hex) -> {"buffer": list, "queue": asyncio.Queue,
# "active": bool}. Entries are created by /whisper/realtime/start and torn
# down when the /whisper/realtime/stream SSE generator finishes.
realtime_sessions = {}

# Initialize OpenAI client for text chat
client = openai.AsyncOpenAI()
1781
 
 
2435
  return StreamingResponse(output_stream(), media_type="text/event-stream")
2436
 
2437
 
2438
# Whisper endpoints
@app.post("/whisper/transcribe")
async def whisper_transcribe(audio: UploadFile = File(...)):
    """Transcribe an uploaded microphone recording (webm) and translate it.

    Returns {"text": ..., "translation": ...} on success, {"error": ...} on failure.
    """
    tmp_path = None
    try:
        # Persist the upload so the Whisper pipeline can read it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as tmp:
            tmp.write(await audio.read())
            tmp_path = tmp.name

        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(
            whisper_executor, gpu_transcribe_whisper, tmp_path
        )
        # Fix: translation is blocking network I/O — keep it off the event loop.
        translation = await loop.run_in_executor(None, _translate_text_4langs, text)

        return {"text": text, "translation": translation}

    except Exception as e:
        print(f"Whisper transcribe error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: always remove the temp file, even when transcription fails
        # (the original leaked it on any exception).
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
2465
+
2466
+
2467
@app.post("/whisper/audio")
async def whisper_audio(audio: UploadFile = File(...)):
    """Transcribe and translate an uploaded audio file of any supported format."""
    tmp_path = None
    try:
        # Keep the original extension so the decoder picks the right format;
        # fix: tolerate a missing filename (Path(None) raises TypeError).
        suffix = Path(audio.filename or "").suffix
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await audio.read())
            tmp_path = tmp.name

        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(
            whisper_executor, gpu_transcribe_whisper, tmp_path
        )
        # Blocking OpenAI call — run it off the event loop.
        translation = await loop.run_in_executor(None, _translate_text_4langs, text)

        return {"text": text, "translation": translation}

    except Exception as e:
        print(f"Whisper audio error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: clean the temp file up even on failure (original leaked it).
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
2493
+
2494
+
2495
@app.post("/whisper/video")
async def whisper_video(video: UploadFile = File(...)):
    """Extract the audio track of an uploaded video, then transcribe and translate it."""
    tmp_path = None
    audio_path = None
    try:
        # Fix: tolerate a missing filename (Path(None) raises TypeError).
        suffix = Path(video.filename or "").suffix
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await video.read())
            tmp_path = tmp.name

        loop = asyncio.get_event_loop()
        # ffmpeg extraction is heavy; keep it off the event loop.
        audio_path = await loop.run_in_executor(
            None, extract_audio_from_video, tmp_path
        )
        text = await loop.run_in_executor(
            whisper_executor, gpu_transcribe_whisper, audio_path
        )
        translation = await loop.run_in_executor(None, _translate_text_4langs, text)

        return {"text": text, "translation": translation}

    except Exception as e:
        print(f"Whisper video error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: remove both temp files even on failure (original leaked them
        # whenever extraction or transcription raised).
        for path in (tmp_path, audio_path):
            if path and os.path.exists(path):
                os.unlink(path)
2527
+
2528
+
2529
@app.post("/whisper/pdf")
async def whisper_pdf(pdf: UploadFile = File(...), max_pages: int = Form(10)):
    """Extract text from the first *max_pages* pages of an uploaded PDF and translate it.

    Returns {"text": <joined pages>, "translation": <per-page translations>}.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(await pdf.read())
            tmp_path = tmp.name

        # Collect per-page text, skipping pages with nothing extractable.
        extracted = []
        with pdfplumber.open(tmp_path) as pdf_doc:
            for idx, pg in enumerate(pdf_doc.pages[:max_pages], start=1):
                txt = pg.extract_text() or ""
                if txt.strip():
                    extracted.append(f"[Page {idx}]\n{txt}")

        full_text = "\n\n".join(extracted)

        # Translate page bodies while keeping the "[Page N]" header verbatim.
        translated = []
        for page_text in extracted:
            header, _, body = page_text.partition("\n")
            translated.append(header + "\n" + _translate_text_4langs(body))

        return {"text": full_text, "translation": "\n\n".join(translated)}

    except Exception as e:
        print(f"Whisper PDF error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: remove the temp PDF even when extraction/translation fails
        # (the original leaked it on any exception).
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
2564
+
2565
+
2566
@app.post("/whisper/realtime/start")
async def whisper_realtime_start():
    """Allocate a new realtime-transcription session.

    Registers an empty audio buffer, a result queue and an active flag under a
    random 128-bit hex id, then returns that id to the client.
    """
    sid = os.urandom(16).hex()
    session = {"buffer": [], "queue": asyncio.Queue(), "active": True}
    realtime_sessions[sid] = session
    return {"session_id": sid}
2576
+
2577
+
2578
@app.post("/whisper/realtime/process")
async def whisper_realtime_process(
    audio: UploadFile = File(...),
    session_id: str = Form(...)
):
    """Process realtime audio chunk.

    Accepts a raw PCM chunk from the browser, transcribes/translates it on the
    Whisper executor, and queues the parsed segment for the session's SSE stream.
    """
    if session_id not in realtime_sessions:
        return {"error": "Invalid session"}

    try:
        # Read audio data
        # The browser sends raw 16-bit PCM (see the Int16Array capture code);
        # normalise to float32 in [-1, 1) for the Whisper pipeline.
        content = await audio.read()
        audio_array = np.frombuffer(content, dtype=np.int16).astype(np.float32) / 32768.0

        # Process in executor
        result = await asyncio.get_event_loop().run_in_executor(
            whisper_executor, gpu_asr_translate_whisper, audio_array, WHISPER_SAMPLE_RATE
        )

        # Parse result
        # NOTE(review): this parsing is tightly coupled to the exact
        # "[ts]<br>[KO] text<br>translations...<br>----<br>" layout produced by
        # gpu_asr_translate_whisper — changing either side breaks the other.
        lines = result.split('<br>')
        timestamp = lines[0].strip('[]') if lines else ""
        text = lines[1].replace('[KO]', '').strip() if len(lines) > 1 else ""
        # Drops the trailing divider and the empty final element.
        translation = '<br>'.join(lines[2:-2]) if len(lines) > 3 else ""

        # Queue result
        # Hand the parsed segment to the SSE generator for this session.
        await realtime_sessions[session_id]["queue"].put({
            "timestamp": timestamp,
            "text": text,
            "translation": translation
        })

        return {"status": "ok"}

    except Exception as e:
        print(f"Realtime process error: {e}")
        return {"error": str(e)}
2615
+
2616
+
2617
@app.get("/whisper/realtime/stream")
async def whisper_realtime_stream(session_id: str):
    """Stream queued transcription segments for a session as server-sent events."""
    if session_id not in realtime_sessions:
        return JSONResponse({"error": "Invalid session"}, status_code=404)

    async def stream_results():
        session = realtime_sessions[session_id]
        try:
            while session["active"]:
                try:
                    item = await asyncio.wait_for(session["queue"].get(), timeout=1.0)
                except asyncio.TimeoutError:
                    # Heartbeat so proxies and the browser keep the SSE open.
                    yield f"data: {json.dumps({'keepalive': True})}\n\n"
                else:
                    yield f"data: {json.dumps(item)}\n\n"
        except Exception as e:
            print(f"Stream error: {e}")
        finally:
            # Tear the session down when the client disconnects or the loop ends.
            realtime_sessions.pop(session_id, None)

    return StreamingResponse(stream_results(), media_type="text/event-stream")
2640
+
2641
+
2642
  @app.get("/")
2643
  async def index():
2644
  """Serve the HTML page"""