seawolf2357 committed on
Commit
6415ee8
·
verified ·
1 Parent(s): 00cb0fa

Update app-backup4.py

Browse files
Files changed (1) hide show
  1. app-backup4.py +131 -1085
app-backup4.py CHANGED
@@ -6,8 +6,8 @@ import os
6
  import numpy as np
7
  import openai
8
  from dotenv import load_dotenv
9
- from fastapi import FastAPI, Request, UploadFile, File, Form
10
- from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
11
  from fastrtc import (
12
  AdditionalOutputs,
13
  AsyncStreamHandler,
@@ -23,35 +23,10 @@ import gradio as gr
23
  import io
24
  from scipy import signal
25
  import wave
26
- import torch
27
- from transformers import pipeline
28
- import tempfile
29
- import subprocess
30
- import pdfplumber
31
- import scipy.signal as sps
32
- from datetime import datetime
33
- from zoneinfo import ZoneInfo
34
- import concurrent.futures
35
 
36
  load_dotenv()
37
 
38
  SAMPLE_RATE = 24000
39
- WHISPER_SAMPLE_RATE = 16000
40
- SEOUL_TZ = ZoneInfo("Asia/Seoul")
41
-
42
- # Whisper model settings
43
- WHISPER_MODEL_NAME = "openai/whisper-large-v3-turbo"
44
- WHISPER_BATCH_SIZE = 8
45
-
46
- # Real-time segmentation parameters
47
- MIN_SEG_SEC = 10
48
- MAX_SEG_SEC = 15
49
- SILENCE_SEC = 0.6
50
- SILENCE_THRESH = 1e-4
51
-
52
- # CPU-side pool for Whisper tasks
53
- whisper_executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
54
- whisper_futures_queue: list[concurrent.futures.Future] = []
55
 
56
  # Supported languages for OpenAI Realtime API
57
  SUPPORTED_LANGUAGES = {
@@ -86,14 +61,14 @@ SUPPORTED_LANGUAGES = {
86
  "ms": "Bahasa Melayu (Malay)"
87
  }
88
 
89
- # HTML content embedded as a string (extended with new tabs)
90
  HTML_CONTENT = """<!DOCTYPE html>
91
  <html lang="ko">
92
 
93
  <head>
94
  <meta charset="UTF-8">
95
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
96
- <title>Mouth of 'MOUSE' - Extended</title>
97
  <style>
98
  :root {
99
  --primary-color: #6f42c1;
@@ -135,43 +110,6 @@ HTML_CONTENT = """<!DOCTYPE html>
135
  flex-shrink: 0;
136
  background-color: var(--card-bg);
137
  }
138
- .tabs-container {
139
- display: flex;
140
- gap: 10px;
141
- margin-bottom: 20px;
142
- border-bottom: 2px solid var(--border-color);
143
- padding-bottom: 10px;
144
- overflow-x: auto;
145
- scrollbar-width: thin;
146
- scrollbar-color: var(--primary-color) var(--card-bg);
147
- }
148
- .tab-button {
149
- padding: 10px 20px;
150
- background-color: var(--card-bg);
151
- color: var(--text-color);
152
- border: 1px solid var(--border-color);
153
- border-radius: 8px 8px 0 0;
154
- cursor: pointer;
155
- transition: all 0.3s;
156
- white-space: nowrap;
157
- font-size: 14px;
158
- }
159
- .tab-button:hover {
160
- background-color: var(--secondary-color);
161
- }
162
- .tab-button.active {
163
- background-color: var(--primary-color);
164
- border-bottom: 2px solid var(--primary-color);
165
- }
166
- .tab-content {
167
- display: none;
168
- flex-grow: 1;
169
- overflow: hidden;
170
- flex-direction: column;
171
- }
172
- .tab-content.active {
173
- display: flex;
174
- }
175
  .main-content {
176
  display: flex;
177
  gap: 20px;
@@ -281,7 +219,7 @@ HTML_CONTENT = """<!DOCTYPE html>
281
  .text-input-section {
282
  margin-top: 15px;
283
  }
284
- input[type="text"], input[type="file"], textarea {
285
  width: 100%;
286
  background-color: var(--dark-bg);
287
  color: var(--text-color);
@@ -380,91 +318,11 @@ HTML_CONTENT = """<!DOCTYPE html>
380
  flex-shrink: 0;
381
  padding-top: 20px;
382
  }
383
- /* Whisper Tab Styles */
384
- .whisper-container {
385
- padding: 20px;
386
- background-color: var(--card-bg);
387
- border-radius: 12px;
388
- border: 1px solid var(--border-color);
389
- margin: 20px auto;
390
- max-width: 800px;
391
- width: 100%;
392
- }
393
- .whisper-input-section {
394
- margin-bottom: 20px;
395
- }
396
- .whisper-output-section {
397
- display: grid;
398
- grid-template-columns: 1fr 1fr;
399
- gap: 20px;
400
- margin-top: 20px;
401
- }
402
- .whisper-output {
403
- background-color: var(--dark-bg);
404
- padding: 15px;
405
- border-radius: 8px;
406
- border: 1px solid var(--border-color);
407
- max-height: 300px;
408
- overflow-y: auto;
409
- }
410
- .whisper-output h4 {
411
- margin-top: 0;
412
- color: var(--primary-color);
413
- }
414
- .file-upload-area {
415
- border: 2px dashed var(--border-color);
416
- border-radius: 8px;
417
- padding: 30px;
418
- text-align: center;
419
- cursor: pointer;
420
- transition: all 0.3s;
421
- }
422
- .file-upload-area:hover {
423
- border-color: var(--primary-color);
424
- background-color: rgba(111, 66, 193, 0.1);
425
- }
426
- .file-upload-area.drag-over {
427
- border-color: var(--primary-color);
428
- background-color: rgba(111, 66, 193, 0.2);
429
- }
430
- .realtime-output {
431
- background-color: var(--dark-bg);
432
- padding: 20px;
433
- border-radius: 8px;
434
- margin-top: 20px;
435
- min-height: 200px;
436
- max-height: 400px;
437
- overflow-y: auto;
438
- }
439
- .recording-indicator {
440
- display: inline-flex;
441
- align-items: center;
442
- gap: 10px;
443
- padding: 10px 20px;
444
- background-color: #f44336;
445
- color: white;
446
- border-radius: 20px;
447
- animation: pulse 1.5s infinite;
448
- }
449
- .recording-dot {
450
- width: 10px;
451
- height: 10px;
452
- background-color: white;
453
- border-radius: 50%;
454
- animation: blink 1s infinite;
455
- }
456
- @keyframes blink {
457
- 0%, 50% { opacity: 1; }
458
- 51%, 100% { opacity: 0; }
459
- }
460
  /* Responsive design */
461
  @media (max-width: 1024px) {
462
  .sidebar {
463
  width: 300px;
464
  }
465
- .whisper-output-section {
466
- grid-template-columns: 1fr;
467
- }
468
  }
469
  @media (max-width: 768px) {
470
  .main-content {
@@ -477,9 +335,6 @@ HTML_CONTENT = """<!DOCTYPE html>
477
  .chat-section {
478
  height: 400px;
479
  }
480
- .tabs-container {
481
- flex-wrap: wrap;
482
- }
483
  }
484
  button {
485
  background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
@@ -577,10 +432,6 @@ HTML_CONTENT = """<!DOCTYPE html>
577
  background-color: #ff9800;
578
  color: white;
579
  }
580
- .toast.success {
581
- background-color: #4caf50;
582
- color: white;
583
- }
584
  .status-indicator {
585
  display: inline-flex;
586
  align-items: center;
@@ -657,7 +508,7 @@ HTML_CONTENT = """<!DOCTYPE html>
657
  <div class="mouse-ears mouse-ear-right"></div>
658
  <div class="mouse-face"></div>
659
  </div>
660
- <h1>MOUSE Extended</h1>
661
  </div>
662
  <div class="status-indicator">
663
  <div id="status-dot" class="status-dot disconnected"></div>
@@ -665,252 +516,131 @@ HTML_CONTENT = """<!DOCTYPE html>
665
  </div>
666
  </div>
667
 
668
- <div class="tabs-container">
669
- <button class="tab-button active" onclick="switchTab('voice-chat')">음성 채팅</button>
670
- <button class="tab-button" onclick="switchTab('mic-whisper')">마이크 전사</button>
671
- <button class="tab-button" onclick="switchTab('audio-whisper')">오디오 파일</button>
672
- <button class="tab-button" onclick="switchTab('video-whisper')">비디오 파일</button>
673
- <button class="tab-button" onclick="switchTab('pdf-whisper')">PDF 번역</button>
674
- <button class="tab-button" onclick="switchTab('realtime-whisper')">실시간 통역</button>
675
- </div>
676
-
677
- <!-- Voice Chat Tab (Original) -->
678
- <div id="voice-chat" class="tab-content active">
679
- <div class="main-content">
680
- <div class="sidebar">
681
- <div class="settings-section">
682
- <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
683
- <div class="settings-grid">
684
- <div class="setting-item">
685
- <span class="setting-label">웹 검색</span>
686
- <div id="search-toggle" class="toggle-switch">
687
- <div class="toggle-slider"></div>
688
- </div>
689
- </div>
690
- <div class="setting-item">
691
- <span class="setting-label">다국어 번역 채팅</span>
692
- <select id="language-select">
693
- <option value="">비활성화</option>
694
- <option value="ko">한국어 (Korean)</option>
695
- <option value="en">English</option>
696
- <option value="es">Español (Spanish)</option>
697
- <option value="fr">Français (French)</option>
698
- <option value="de">Deutsch (German)</option>
699
- <option value="it">Italiano (Italian)</option>
700
- <option value="pt">Português (Portuguese)</option>
701
- <option value="ru">Русский (Russian)</option>
702
- <option value="ja">日本語 (Japanese)</option>
703
- <option value="zh">中文 (Chinese)</option>
704
- <option value="ar">العربية (Arabic)</option>
705
- <option value="hi">हिन्दी (Hindi)</option>
706
- <option value="nl">Nederlands (Dutch)</option>
707
- <option value="pl">Polski (Polish)</option>
708
- <option value="tr">Türkçe (Turkish)</option>
709
- <option value="vi">Tiếng Việt (Vietnamese)</option>
710
- <option value="th">ไทย (Thai)</option>
711
- <option value="id">Bahasa Indonesia</option>
712
- <option value="sv">Svenska (Swedish)</option>
713
- <option value="da">Dansk (Danish)</option>
714
- <option value="no">Norsk (Norwegian)</option>
715
- <option value="fi">Suomi (Finnish)</option>
716
- <option value="he">עברית (Hebrew)</option>
717
- <option value="uk">Українська (Ukrainian)</option>
718
- <option value="cs">Čeština (Czech)</option>
719
- <option value="el">Ελληνικά (Greek)</option>
720
- <option value="ro">Română (Romanian)</option>
721
- <option value="hu">Magyar (Hungarian)</option>
722
- <option value="ms">Bahasa Melayu (Malay)</option>
723
- </select>
724
  </div>
725
  </div>
726
- <div class="text-input-section">
727
- <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
728
- <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
729
  </div>
730
  </div>
731
-
732
- <div class="controls">
733
- <button id="start-button">대화 시작</button>
734
  </div>
735
  </div>
736
 
737
- <div class="chat-section">
738
- <div class="chat-container">
739
- <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
740
- <div class="chat-messages" id="chat-messages"></div>
741
- <div class="text-input-section" style="margin-top: 10px;">
742
- <div style="display: flex; gap: 10px;">
743
- <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
744
- <button id="send-button" style="display: none;">전송</button>
745
- </div>
746
- </div>
747
- </div>
748
- </div>
749
- </div>
750
- </div>
751
-
752
- <!-- Microphone Whisper Tab -->
753
- <div id="mic-whisper" class="tab-content">
754
- <div class="whisper-container">
755
- <h3>마이크 녹음 → 전사 및 4개 언어 번역</h3>
756
- <div class="whisper-input-section">
757
- <button id="mic-record-btn" onclick="toggleMicRecording()">녹음 시작</button>
758
- <div id="mic-status" style="margin-top: 10px;"></div>
759
- </div>
760
- <div class="whisper-output-section">
761
- <div class="whisper-output">
762
- <h4>원문</h4>
763
- <div id="mic-original"></div>
764
- </div>
765
- <div class="whisper-output">
766
- <h4>번역 (EN/ZH/TH/RU)</h4>
767
- <div id="mic-translation"></div>
768
- </div>
769
  </div>
770
  </div>
771
- </div>
772
-
773
- <!-- Audio File Whisper Tab -->
774
- <div id="audio-whisper" class="tab-content">
775
- <div class="whisper-container">
776
- <h3>오디오 파일 전사 및 4개 언어 번역</h3>
777
- <div class="whisper-input-section">
778
- <div class="file-upload-area" id="audio-upload-area">
779
- <p>오디오 파일을 드래그하거나 클릭하여 업로드</p>
780
- <input type="file" id="audio-file-input" accept="audio/*" style="display: none;">
781
- </div>
782
- </div>
783
- <div class="whisper-output-section">
784
- <div class="whisper-output">
785
- <h4>원문</h4>
786
- <div id="audio-original"></div>
787
- </div>
788
- <div class="whisper-output">
789
- <h4>번역 (EN/ZH/TH/RU)</h4>
790
- <div id="audio-translation"></div>
791
- </div>
792
- </div>
793
- </div>
794
- </div>
795
-
796
- <!-- Video File Whisper Tab -->
797
- <div id="video-whisper" class="tab-content">
798
- <div class="whisper-container">
799
- <h3>비디오 파일 → 오디오 추출 → 전사 및 4개 언어 번역</h3>
800
- <div class="whisper-input-section">
801
- <div class="file-upload-area" id="video-upload-area">
802
- <p>비디오 파일을 드래그하거나 클릭하여 업로드</p>
803
- <input type="file" id="video-file-input" accept="video/*" style="display: none;">
804
- </div>
805
- </div>
806
- <div class="whisper-output-section">
807
- <div class="whisper-output">
808
- <h4>원문</h4>
809
- <div id="video-original"></div>
810
- </div>
811
- <div class="whisper-output">
812
- <h4>번역 (EN/ZH/TH/RU)</h4>
813
- <div id="video-translation"></div>
814
- </div>
815
- </div>
816
- </div>
817
- </div>
818
-
819
- <!-- PDF Whisper Tab -->
820
- <div id="pdf-whisper" class="tab-content">
821
- <div class="whisper-container">
822
- <h3>PDF 파일 → 텍스트 추출 → 4개 언어 번역</h3>
823
- <div class="whisper-input-section">
824
- <div class="file-upload-area" id="pdf-upload-area">
825
- <p>PDF 파일을 드래그하거나 클릭하여 업로드</p>
826
- <input type="file" id="pdf-file-input" accept=".pdf" style="display: none;">
827
- </div>
828
- <div class="setting-item" style="margin-top: 15px;">
829
- <span class="setting-label">최대 페이지:</span>
830
- <input type="number" id="pdf-max-pages" value="10" min="1" max="50" style="width: 80px;">
831
- </div>
832
- </div>
833
- <div class="whisper-output-section">
834
- <div class="whisper-output">
835
- <h4>추출된 텍스트</h4>
836
- <div id="pdf-original"></div>
837
- </div>
838
- <div class="whisper-output">
839
- <h4>번역 (EN/ZH/TH/RU)</h4>
840
- <div id="pdf-translation"></div>
841
  </div>
842
  </div>
843
  </div>
844
  </div>
845
-
846
- <!-- Realtime Whisper Tab -->
847
- <div id="realtime-whisper" class="tab-content">
848
- <div class="whisper-container">
849
- <h3>실시간 통역 (Korean → EN/ZH/TH/RU)</h3>
850
- <p>10-15초 문장 단위로 자동 전환 — 최신 내용이 위에 표시됩니다.</p>
851
- <div class="whisper-input-section">
852
- <button id="realtime-start-btn" onclick="toggleRealtimeRecording()">실시간 통역 시작</button>
853
- <div id="realtime-status" style="margin-top: 10px;"></div>
854
- </div>
855
- <div class="realtime-output" id="realtime-output"></div>
856
- </div>
857
- </div>
858
  </div>
859
  <audio id="audio-output"></audio>
860
 
861
  <script>
862
- // Tab switching functionality - 맨 앞에 배치
863
- function switchTab(tabName) {
864
- console.log('Switching to tab:', tabName);
865
-
866
- // Hide all tabs
867
- document.querySelectorAll('.tab-content').forEach(tab => {
868
- tab.style.display = 'none';
869
- tab.classList.remove('active');
870
- });
871
-
872
- console.log('All initialized!');
873
-
874
- // Remove active from all buttons
875
- document.querySelectorAll('.tab-button').forEach(btn => {
876
- btn.classList.remove('active');
877
- });
878
-
879
- // Show selected tab
880
- const selectedTab = document.getElementById(tabName);
881
- if (selectedTab) {
882
- selectedTab.style.display = 'flex';
883
- selectedTab.classList.add('active');
884
- }
885
-
886
- // Mark button as active
887
- event.target.classList.add('active');
888
- }
889
-
890
- // Global variables
891
- let peerConnection = null;
892
- let webrtc_id = null;
893
  let webSearchEnabled = false;
894
  let selectedLanguage = "";
895
  let systemPrompt = "You are a helpful assistant. Respond in a friendly and professional manner.";
 
 
 
 
 
 
 
 
 
 
896
  let audioLevel = 0;
897
- let animationFrame = null;
898
- let audioContext = null;
899
- let analyser = null;
900
- let audioSource = null;
901
  let dataChannel = null;
902
  let isVoiceActive = false;
903
 
904
- // Whisper variables
905
- let micRecorder = null;
906
- let isRecording = false;
907
- let realtimeRecorder = null;
908
- let isRealtimeRecording = false;
909
- let realtimeStream = null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
910
 
911
  async function sendTextMessage() {
912
- const textInput = document.getElementById('text-input');
913
- const chatMessages = document.getElementById('chat-messages');
914
  const message = textInput.value.trim();
915
  if (!message) return;
916
 
@@ -964,25 +694,20 @@ HTML_CONTENT = """<!DOCTYPE html>
964
  }
965
 
966
  function updateStatus(state) {
967
- const statusDot = document.getElementById('status-dot');
968
- const statusText = document.getElementById('status-text');
969
- const sendButton = document.getElementById('send-button');
970
-
971
  statusDot.className = 'status-dot ' + state;
972
  if (state === 'connected') {
973
  statusText.textContent = '연결됨';
974
- if (sendButton) sendButton.style.display = 'block';
975
  isVoiceActive = true;
976
  } else if (state === 'connecting') {
977
  statusText.textContent = '연결 중...';
978
- if (sendButton) sendButton.style.display = 'none';
979
  } else {
980
  statusText.textContent = '연결 대기 중';
981
- if (sendButton) sendButton.style.display = 'block'; // Show send button even when disconnected for text chat
982
  isVoiceActive = false;
983
  }
984
  }
985
-
986
  function updateButtonState() {
987
  const button = document.getElementById('start-button');
988
  if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
@@ -1012,7 +737,6 @@ HTML_CONTENT = """<!DOCTYPE html>
1012
  updateStatus('disconnected');
1013
  }
1014
  }
1015
-
1016
  function setupAudioVisualization(stream) {
1017
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
1018
  analyser = audioContext.createAnalyser();
@@ -1047,7 +771,6 @@ HTML_CONTENT = """<!DOCTYPE html>
1047
 
1048
  updateAudioLevel();
1049
  }
1050
-
1051
  function showError(message) {
1052
  const toast = document.getElementById('error-toast');
1053
  toast.textContent = message;
@@ -1057,20 +780,8 @@ HTML_CONTENT = """<!DOCTYPE html>
1057
  toast.style.display = 'none';
1058
  }, 5000);
1059
  }
1060
-
1061
- function showSuccess(message) {
1062
- const toast = document.getElementById('error-toast');
1063
- toast.textContent = message;
1064
- toast.className = 'toast success';
1065
- toast.style.display = 'block';
1066
- setTimeout(() => {
1067
- toast.style.display = 'none';
1068
- }, 3000);
1069
- }
1070
-
1071
  async function setupWebRTC() {
1072
- const audioOutput = document.getElementById('audio-output');
1073
- const config = typeof __RTC_CONFIGURATION__ !== 'undefined' ? __RTC_CONFIGURATION__ : {iceServers: [{urls: 'stun:stun.l.google.com:19302'}]};
1074
  peerConnection = new RTCPeerConnection(config);
1075
  const timeoutId = setTimeout(() => {
1076
  const toast = document.getElementById('error-toast');
@@ -1134,6 +845,14 @@ HTML_CONTENT = """<!DOCTYPE html>
1134
  });
1135
  webrtc_id = Math.random().toString(36).substring(7);
1136
 
 
 
 
 
 
 
 
 
1137
  const response = await fetch('/webrtc/offer', {
1138
  method: 'POST',
1139
  headers: { 'Content-Type': 'application/json' },
@@ -1178,9 +897,7 @@ HTML_CONTENT = """<!DOCTYPE html>
1178
  stop();
1179
  }
1180
  }
1181
-
1182
  function addMessage(role, content) {
1183
- const chatMessages = document.getElementById('chat-messages');
1184
  const messageDiv = document.createElement('div');
1185
  messageDiv.classList.add('message', role);
1186
 
@@ -1269,490 +986,26 @@ HTML_CONTENT = """<!DOCTYPE html>
1269
  webrtc_id = null;
1270
  }
1271
  }
1272
-
1273
- // Whisper Tab Functions
1274
-
1275
- // Microphone recording
1276
- async function toggleMicRecording() {
1277
- const btn = document.getElementById('mic-record-btn');
1278
- const status = document.getElementById('mic-status');
1279
-
1280
- if (!isRecording) {
1281
- try {
1282
- const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
1283
- micRecorder = new MediaRecorder(stream);
1284
- const chunks = [];
1285
-
1286
- micRecorder.ondataavailable = (e) => chunks.push(e.data);
1287
- micRecorder.onstop = async () => {
1288
- const blob = new Blob(chunks, { type: 'audio/webm' });
1289
- await processAudioBlob(blob, 'mic');
1290
- stream.getTracks().forEach(track => track.stop());
1291
- };
1292
-
1293
- micRecorder.start();
1294
- isRecording = true;
1295
- btn.textContent = '녹음 중지';
1296
- status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>녹음 중...</div>';
1297
- } catch (err) {
1298
- showError('마이크 접근 권한이 필요합니다.');
1299
- }
1300
  } else {
1301
- micRecorder.stop();
1302
- isRecording = false;
1303
- btn.textContent = '녹음 시작';
1304
- status.textContent = '처리 중...';
1305
- }
1306
- }
1307
-
1308
- // Process audio blob (for microphone recording)
1309
- async function processAudioBlob(blob, type) {
1310
- const formData = new FormData();
1311
- formData.append('audio', blob, 'recording.webm');
1312
-
1313
- try {
1314
- const response = await fetch('/whisper/transcribe', {
1315
- method: 'POST',
1316
- body: formData
1317
- });
1318
-
1319
- const result = await response.json();
1320
- if (result.error) {
1321
- showError(result.error);
1322
- } else {
1323
- document.getElementById(`${type}-original`).textContent = result.text;
1324
- document.getElementById(`${type}-translation`).innerHTML = result.translation.replace(/\n/g, '<br>');
1325
- document.getElementById(`${type}-status`).textContent = '';
1326
- }
1327
- } catch (error) {
1328
- showError('처리 중 오류가 발생했습니다.');
1329
- document.getElementById(`${type}-status`).textContent = '';
1330
- }
1331
- }
1332
-
1333
- // Process audio file
1334
- async function processAudioFile(file) {
1335
- const formData = new FormData();
1336
- formData.append('audio', file);
1337
-
1338
- try {
1339
- showSuccess('오디오 파일 처리 중...');
1340
- const response = await fetch('/whisper/audio', {
1341
- method: 'POST',
1342
- body: formData
1343
- });
1344
-
1345
- const result = await response.json();
1346
- if (result.error) {
1347
- showError(result.error);
1348
- } else {
1349
- document.getElementById('audio-original').textContent = result.text;
1350
- document.getElementById('audio-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1351
- }
1352
- } catch (error) {
1353
- showError('오디오 파일 처리 중 오류가 발생했습니다.');
1354
- }
1355
- }
1356
-
1357
- // Process video file
1358
- async function processVideoFile(file) {
1359
- const formData = new FormData();
1360
- formData.append('video', file);
1361
-
1362
- try {
1363
- showSuccess('비디오 파일 처리 중... (시간이 걸릴 수 있습니다)');
1364
- const response = await fetch('/whisper/video', {
1365
- method: 'POST',
1366
- body: formData
1367
- });
1368
-
1369
- const result = await response.json();
1370
- if (result.error) {
1371
- showError(result.error);
1372
- } else {
1373
- document.getElementById('video-original').textContent = result.text;
1374
- document.getElementById('video-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1375
- }
1376
- } catch (error) {
1377
- showError('비디오 파일 처리 중 오류가 발생했습니다.');
1378
- }
1379
- }
1380
-
1381
- // Process PDF file
1382
- async function processPDFFile(file) {
1383
- const formData = new FormData();
1384
- formData.append('pdf', file);
1385
- formData.append('max_pages', document.getElementById('pdf-max-pages').value);
1386
-
1387
- try {
1388
- showSuccess('PDF 파일 처리 중...');
1389
- const response = await fetch('/whisper/pdf', {
1390
- method: 'POST',
1391
- body: formData
1392
- });
1393
-
1394
- const result = await response.json();
1395
- if (result.error) {
1396
- showError(result.error);
1397
- } else {
1398
- document.getElementById('pdf-original').textContent = result.text;
1399
- document.getElementById('pdf-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1400
- }
1401
- } catch (error) {
1402
- showError('PDF 파일 처리 중 오류가 발생했습니다.');
1403
- }
1404
- }
1405
-
1406
- // Realtime recording
1407
- let realtimeEventSource = null;
1408
-
1409
- async function toggleRealtimeRecording() {
1410
- const btn = document.getElementById('realtime-start-btn');
1411
- const status = document.getElementById('realtime-status');
1412
- const output = document.getElementById('realtime-output');
1413
-
1414
- if (!isRealtimeRecording) {
1415
- try {
1416
- const response = await fetch('/whisper/realtime/start', {
1417
- method: 'POST'
1418
- });
1419
-
1420
- const result = await response.json();
1421
- if (result.session_id) {
1422
- // Start listening for results
1423
- realtimeEventSource = new EventSource(`/whisper/realtime/stream?session_id=${result.session_id}`);
1424
-
1425
- realtimeEventSource.onmessage = (event) => {
1426
- const data = JSON.parse(event.data);
1427
- if (data.timestamp) {
1428
- const segment = document.createElement('div');
1429
- segment.style.borderBottom = '1px solid #333';
1430
- segment.style.paddingBottom = '15px';
1431
- segment.style.marginBottom = '15px';
1432
- segment.innerHTML = `
1433
- <strong>[${data.timestamp}]</strong><br>
1434
- <strong>[KO]</strong> ${data.text}<br>
1435
- ${data.translation.replace(/\n/g, '<br>')}
1436
- `;
1437
- output.insertBefore(segment, output.firstChild);
1438
- }
1439
- };
1440
-
1441
- realtimeEventSource.onerror = () => {
1442
- stopRealtimeRecording();
1443
- showError('실시간 통역 연결이 끊어졌습니다.');
1444
- };
1445
-
1446
- // Start recording
1447
- realtimeStream = await navigator.mediaDevices.getUserMedia({ audio: true });
1448
-
1449
- isRealtimeRecording = true;
1450
- btn.textContent = '통역 중지';
1451
- status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>실시간 통역 중...</div>';
1452
-
1453
- // Send audio data periodically
1454
- startRealtimeAudioCapture(result.session_id);
1455
- }
1456
- } catch (err) {
1457
- showError('실시간 통역을 시작할 수 없습니다.');
1458
- }
1459
- } else {
1460
- stopRealtimeRecording();
1461
- }
1462
- }
1463
-
1464
- function stopRealtimeRecording() {
1465
- if (realtimeEventSource) {
1466
- realtimeEventSource.close();
1467
- realtimeEventSource = null;
1468
- }
1469
-
1470
- if (realtimeStream) {
1471
- realtimeStream.getTracks().forEach(track => track.stop());
1472
- realtimeStream = null;
1473
  }
1474
-
1475
- isRealtimeRecording = false;
1476
- document.getElementById('realtime-start-btn').textContent = '실시간 통역 시작';
1477
- document.getElementById('realtime-status').textContent = '';
1478
- }
1479
 
1480
- async function startRealtimeAudioCapture(sessionId) {
1481
- const audioContext = new AudioContext({ sampleRate: 16000 });
1482
- const source = audioContext.createMediaStreamSource(realtimeStream);
1483
- const processor = audioContext.createScriptProcessor(4096, 1, 1);
1484
-
1485
- let audioBuffer = [];
1486
- let silenceFrames = 0;
1487
- const SILENCE_THRESHOLD = 0.01;
1488
- const MIN_BUFFER_SIZE = 16000 * 2; // 2 seconds minimum
1489
- const MAX_BUFFER_SIZE = 16000 * 15; // 15 seconds maximum
1490
-
1491
- processor.onaudioprocess = async (e) => {
1492
- if (!isRealtimeRecording) return;
1493
-
1494
- const inputData = e.inputBuffer.getChannelData(0);
1495
- audioBuffer.push(...inputData);
1496
-
1497
- // Simple voice activity detection
1498
- const rms = Math.sqrt(inputData.reduce((sum, val) => sum + val * val, 0) / inputData.length);
1499
-
1500
- if (rms < SILENCE_THRESHOLD) {
1501
- silenceFrames++;
1502
- } else {
1503
- silenceFrames = 0;
1504
- }
1505
-
1506
- // Send audio when we have enough silence or max buffer reached
1507
- if ((silenceFrames > 20 && audioBuffer.length > MIN_BUFFER_SIZE) ||
1508
- audioBuffer.length > MAX_BUFFER_SIZE) {
1509
-
1510
- const audioData = new Float32Array(audioBuffer);
1511
- audioBuffer = [];
1512
- silenceFrames = 0;
1513
-
1514
- // Convert to 16-bit PCM
1515
- const pcmData = new Int16Array(audioData.length);
1516
- for (let i = 0; i < audioData.length; i++) {
1517
- pcmData[i] = Math.max(-32768, Math.min(32767, audioData[i] * 32768));
1518
- }
1519
-
1520
- // Send to server
1521
- const formData = new FormData();
1522
- formData.append('audio', new Blob([pcmData.buffer], { type: 'audio/pcm' }));
1523
- formData.append('session_id', sessionId);
1524
-
1525
- fetch('/whisper/realtime/process', {
1526
- method: 'POST',
1527
- body: formData
1528
- }).catch(err => console.error('Error sending audio:', err));
1529
- }
1530
- };
1531
-
1532
- source.connect(processor);
1533
- processor.connect(audioContext.destination);
1534
- }
1535
-
1536
- // Simple initialization
1537
- window.onload = function() {
1538
- console.log('Page loaded!');
1539
-
1540
- // Web search toggle
1541
- document.getElementById('search-toggle').onclick = function() {
1542
- webSearchEnabled = !webSearchEnabled;
1543
- this.classList.toggle('active', webSearchEnabled);
1544
- console.log('Web search:', webSearchEnabled);
1545
- };
1546
-
1547
- // Language select
1548
- document.getElementById('language-select').onchange = function() {
1549
- selectedLanguage = this.value;
1550
- console.log('Language:', selectedLanguage);
1551
- };
1552
-
1553
- // System prompt
1554
- document.getElementById('system-prompt').oninput = function() {
1555
- systemPrompt = this.value || "You are a helpful assistant. Respond in a friendly and professional manner.";
1556
- };
1557
-
1558
- // Text input enter key
1559
- document.getElementById('text-input').onkeypress = function(e) {
1560
- if (e.key === 'Enter' && !e.shiftKey) {
1561
- e.preventDefault();
1562
- sendTextMessage();
1563
- }
1564
- };
1565
-
1566
- // Send button
1567
- document.getElementById('send-button').onclick = sendTextMessage;
1568
- document.getElementById('send-button').style.display = 'block';
1569
-
1570
- // Start button
1571
- document.getElementById('start-button').onclick = function() {
1572
- if (!peerConnection || peerConnection.connectionState !== 'connected') {
1573
- setupWebRTC();
1574
- } else {
1575
- stop();
1576
- }
1577
- };
1578
-
1579
- // File upload areas
1580
- const audioUploadArea = document.getElementById('audio-upload-area');
1581
- if (audioUploadArea) {
1582
- audioUploadArea.onclick = function() {
1583
- document.getElementById('audio-file-input').click();
1584
- };
1585
- }
1586
-
1587
- const videoUploadArea = document.getElementById('video-upload-area');
1588
- if (videoUploadArea) {
1589
- videoUploadArea.onclick = function() {
1590
- document.getElementById('video-file-input').click();
1591
- };
1592
- }
1593
-
1594
- const pdfUploadArea = document.getElementById('pdf-upload-area');
1595
- if (pdfUploadArea) {
1596
- pdfUploadArea.onclick = function() {
1597
- document.getElementById('pdf-file-input').click();
1598
- };
1599
- }
1600
-
1601
- // File input handlers
1602
- const audioFileInput = document.getElementById('audio-file-input');
1603
- if (audioFileInput) {
1604
- audioFileInput.onchange = function(e) {
1605
- if (e.target.files[0]) processAudioFile(e.target.files[0]);
1606
- };
1607
- }
1608
-
1609
- const videoFileInput = document.getElementById('video-file-input');
1610
- if (videoFileInput) {
1611
- videoFileInput.onchange = function(e) {
1612
- if (e.target.files[0]) processVideoFile(e.target.files[0]);
1613
- };
1614
- }
1615
-
1616
- const pdfFileInput = document.getElementById('pdf-file-input');
1617
- if (pdfFileInput) {
1618
- pdfFileInput.onchange = function(e) {
1619
- if (e.target.files[0]) processPDFFile(e.target.files[0]);
1620
- };
1621
- }
1622
-
1623
- // Drag and drop handlers
1624
- ['audio', 'video', 'pdf'].forEach(type => {
1625
- const area = document.getElementById(`${type}-upload-area`);
1626
- if (area) {
1627
- area.ondragover = function(e) {
1628
- e.preventDefault();
1629
- area.classList.add('drag-over');
1630
- };
1631
-
1632
- area.ondragleave = function() {
1633
- area.classList.remove('drag-over');
1634
- };
1635
-
1636
- area.ondrop = function(e) {
1637
- e.preventDefault();
1638
- area.classList.remove('drag-over');
1639
- const file = e.dataTransfer.files[0];
1640
- if (file) {
1641
- if (type === 'audio') processAudioFile(file);
1642
- else if (type === 'video') processVideoFile(file);
1643
- else if (type === 'pdf') processPDFFile(file);
1644
- }
1645
- };
1646
- }
1647
- });
1648
- };
1649
  </script>
1650
  </body>
1651
 
1652
  </html>"""
1653
 
1654
- # Whisper model loader
1655
- def _get_whisper_pipe():
1656
- """Lazy load Whisper pipeline"""
1657
- if not hasattr(_get_whisper_pipe, "pipe"):
1658
- device = 0 if torch.cuda.is_available() else "cpu"
1659
- _get_whisper_pipe.pipe = pipeline(
1660
- task="automatic-speech-recognition",
1661
- model=WHISPER_MODEL_NAME,
1662
- chunk_length_s=30,
1663
- device=device,
1664
- )
1665
- return _get_whisper_pipe.pipe
1666
-
1667
- # Audio helpers for Whisper
1668
- def _ensure_16k_whisper(y: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
1669
- """Resample audio to 16kHz for Whisper"""
1670
- if sr == WHISPER_SAMPLE_RATE:
1671
- return y.astype(np.float32), WHISPER_SAMPLE_RATE
1672
- g = np.gcd(sr, WHISPER_SAMPLE_RATE)
1673
- y = sps.resample_poly(y, WHISPER_SAMPLE_RATE // g, sr // g).astype(np.float32)
1674
- return y, WHISPER_SAMPLE_RATE
1675
-
1676
- def _should_flush_whisper(buffer: np.ndarray, sr: int) -> bool:
1677
- """Check if audio buffer should be flushed for processing"""
1678
- dur = len(buffer) / sr
1679
- if dur < MIN_SEG_SEC:
1680
- return False
1681
- tail_len = int(SILENCE_SEC * sr)
1682
- tail = buffer[-tail_len:]
1683
- rms = np.sqrt(np.mean(tail ** 2)) if len(tail) else 1.0
1684
- end_of_sentence = rms < SILENCE_THRESH
1685
- return end_of_sentence or dur >= MAX_SEG_SEC
1686
-
1687
- # Translation helper
1688
- def _translate_text_4langs(text: str) -> str:
1689
- """Translate text to 4 languages using OpenAI"""
1690
- try:
1691
- client = openai.OpenAI()
1692
- prompt = (
1693
- "Translate the following text into English (EN), Chinese (ZH), Thai (TH) and Russian (RU).\n"
1694
- "Return ONLY the translations in this format (one per line):\n"
1695
- "EN: <english>\nZH: <chinese>\nTH: <thai>\nRU: <russian>\n\n"
1696
- f"Text: {text}"
1697
- )
1698
-
1699
- response = client.chat.completions.create(
1700
- model="gpt-4o-mini",
1701
- messages=[
1702
- {"role": "system", "content": "You are a professional translator."},
1703
- {"role": "user", "content": prompt}
1704
- ],
1705
- temperature=0.7,
1706
- max_tokens=512
1707
- )
1708
-
1709
- return response.choices[0].message.content.strip()
1710
- except Exception as e:
1711
- print(f"Translation error: {e}")
1712
- return f"Translation error: {str(e)}"
1713
-
1714
- # ffmpeg check
1715
- def _check_ffmpeg() -> bool:
1716
- try:
1717
- subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
1718
- return True
1719
- except Exception:
1720
- return False
1721
-
1722
- _HAS_FFMPEG = _check_ffmpeg()
1723
-
1724
- def extract_audio_from_video(video_path: str) -> str:
1725
- """Extract audio from video file"""
1726
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
1727
- tmp.close()
1728
-
1729
- if _HAS_FFMPEG:
1730
- cmd = [
1731
- "ffmpeg", "-i", video_path, "-vn",
1732
- "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", tmp.name
1733
- ]
1734
- result = subprocess.run(cmd, capture_output=True)
1735
- if result.returncode != 0:
1736
- os.unlink(tmp.name)
1737
- raise RuntimeError("ffmpeg error extracting audio")
1738
- return tmp.name
1739
- else:
1740
- raise RuntimeError("ffmpeg is required for video processing")
1741
-
1742
- # GPU workers for Whisper
1743
- def gpu_transcribe_whisper(audio_path: str) -> str:
1744
- """Transcribe audio using Whisper on GPU"""
1745
- pipe = _get_whisper_pipe()
1746
- result = pipe(audio_path, batch_size=WHISPER_BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)
1747
- return result["text"].strip()
1748
-
1749
- def gpu_asr_translate_whisper(audio: np.ndarray, sr: int) -> str:
1750
- """Transcribe and translate audio for realtime"""
1751
- pipe = _get_whisper_pipe()
1752
- ko = pipe({"array": audio, "sampling_rate": sr}, batch_size=WHISPER_BATCH_SIZE)["text"].strip()
1753
- trans = _translate_text_4langs(ko).replace("\n", "<br>")
1754
- ts = datetime.now(SEOUL_TZ).strftime("%Y-%m-%d %H:%M:%S")
1755
- return f"[{ts}]<br>[KO] {ko}<br>{trans}<br>{'-'*40}<br>"
1756
 
1757
  class BraveSearchClient:
1758
  """Brave Search API client"""
@@ -1803,9 +1056,6 @@ print(f"Search client initialized: {search_client is not None}, API key present:
1803
  # Store connection settings
1804
  connection_settings = {}
1805
 
1806
- # Store realtime sessions
1807
- realtime_sessions = {}
1808
-
1809
  # Initialize OpenAI client for text chat
1810
  client = openai.AsyncOpenAI()
1811
 
@@ -1890,7 +1140,7 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
1890
 
1891
  # Call GPT-4o-mini
1892
  response = await client.chat.completions.create(
1893
- model="gpt-4o-mini",
1894
  messages=messages,
1895
  temperature=0.7,
1896
  max_tokens=2000
@@ -1906,7 +1156,7 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
1906
  # Try again with stronger prompt
1907
  messages[-1] = {"role": "user", "content": f"ONLY {SUPPORTED_LANGUAGES.get(target_language, target_language)}, NO KOREAN: {message}"}
1908
  retry_response = await client.chat.completions.create(
1909
- model="gpt-4o-mini",
1910
  messages=messages,
1911
  temperature=0.3,
1912
  max_tokens=2000
@@ -2465,210 +1715,6 @@ async def outputs(webrtc_id: str):
2465
  return StreamingResponse(output_stream(), media_type="text/event-stream")
2466
 
2467
 
2468
- # Whisper endpoints
2469
- @app.post("/whisper/transcribe")
2470
- async def whisper_transcribe(audio: UploadFile = File(...)):
2471
- """Transcribe audio using Whisper"""
2472
- try:
2473
- # Save uploaded file temporarily
2474
- with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as tmp:
2475
- content = await audio.read()
2476
- tmp.write(content)
2477
- tmp_path = tmp.name
2478
-
2479
- # Transcribe
2480
- text = await asyncio.get_event_loop().run_in_executor(
2481
- whisper_executor, gpu_transcribe_whisper, tmp_path
2482
- )
2483
-
2484
- # Translate
2485
- translation = _translate_text_4langs(text)
2486
-
2487
- # Clean up
2488
- os.unlink(tmp_path)
2489
-
2490
- return {"text": text, "translation": translation}
2491
-
2492
- except Exception as e:
2493
- print(f"Whisper transcribe error: {e}")
2494
- return {"error": str(e)}
2495
-
2496
-
2497
- @app.post("/whisper/audio")
2498
- async def whisper_audio(audio: UploadFile = File(...)):
2499
- """Process audio file"""
2500
- try:
2501
- # Save uploaded file temporarily
2502
- with tempfile.NamedTemporaryFile(delete=False, suffix=Path(audio.filename).suffix) as tmp:
2503
- content = await audio.read()
2504
- tmp.write(content)
2505
- tmp_path = tmp.name
2506
-
2507
- # Transcribe
2508
- text = await asyncio.get_event_loop().run_in_executor(
2509
- whisper_executor, gpu_transcribe_whisper, tmp_path
2510
- )
2511
-
2512
- # Translate
2513
- translation = _translate_text_4langs(text)
2514
-
2515
- # Clean up
2516
- os.unlink(tmp_path)
2517
-
2518
- return {"text": text, "translation": translation}
2519
-
2520
- except Exception as e:
2521
- print(f"Whisper audio error: {e}")
2522
- return {"error": str(e)}
2523
-
2524
-
2525
- @app.post("/whisper/video")
2526
- async def whisper_video(video: UploadFile = File(...)):
2527
- """Process video file"""
2528
- try:
2529
- # Save uploaded file temporarily
2530
- with tempfile.NamedTemporaryFile(delete=False, suffix=Path(video.filename).suffix) as tmp:
2531
- content = await video.read()
2532
- tmp.write(content)
2533
- tmp_path = tmp.name
2534
-
2535
- # Extract audio
2536
- audio_path = await asyncio.get_event_loop().run_in_executor(
2537
- None, extract_audio_from_video, tmp_path
2538
- )
2539
-
2540
- # Transcribe
2541
- text = await asyncio.get_event_loop().run_in_executor(
2542
- whisper_executor, gpu_transcribe_whisper, audio_path
2543
- )
2544
-
2545
- # Translate
2546
- translation = _translate_text_4langs(text)
2547
-
2548
- # Clean up
2549
- os.unlink(tmp_path)
2550
- os.unlink(audio_path)
2551
-
2552
- return {"text": text, "translation": translation}
2553
-
2554
- except Exception as e:
2555
- print(f"Whisper video error: {e}")
2556
- return {"error": str(e)}
2557
-
2558
-
2559
- @app.post("/whisper/pdf")
2560
- async def whisper_pdf(pdf: UploadFile = File(...), max_pages: int = Form(10)):
2561
- """Process PDF file"""
2562
- try:
2563
- # Save uploaded file temporarily
2564
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
2565
- content = await pdf.read()
2566
- tmp.write(content)
2567
- tmp_path = tmp.name
2568
-
2569
- # Extract text
2570
- extracted = []
2571
- with pdfplumber.open(tmp_path) as pdf_doc:
2572
- pages = pdf_doc.pages[:max_pages]
2573
- for idx, pg in enumerate(pages, start=1):
2574
- txt = pg.extract_text() or ""
2575
- if txt.strip():
2576
- extracted.append(f"[Page {idx}]\n{txt}")
2577
-
2578
- full_text = "\n\n".join(extracted)
2579
-
2580
- # Translate each page
2581
- translated = []
2582
- for page_text in extracted:
2583
- trans = _translate_text_4langs(page_text.split('\n', 1)[1]) # Skip page header
2584
- translated.append(page_text.split('\n')[0] + "\n" + trans)
2585
-
2586
- # Clean up
2587
- os.unlink(tmp_path)
2588
-
2589
- return {"text": full_text, "translation": "\n\n".join(translated)}
2590
-
2591
- except Exception as e:
2592
- print(f"Whisper PDF error: {e}")
2593
- return {"error": str(e)}
2594
-
2595
-
2596
- @app.post("/whisper/realtime/start")
2597
- async def whisper_realtime_start():
2598
- """Start realtime transcription session"""
2599
- session_id = os.urandom(16).hex()
2600
- realtime_sessions[session_id] = {
2601
- "buffer": [],
2602
- "queue": asyncio.Queue(),
2603
- "active": True
2604
- }
2605
- return {"session_id": session_id}
2606
-
2607
-
2608
- @app.post("/whisper/realtime/process")
2609
- async def whisper_realtime_process(
2610
- audio: UploadFile = File(...),
2611
- session_id: str = Form(...)
2612
- ):
2613
- """Process realtime audio chunk"""
2614
- if session_id not in realtime_sessions:
2615
- return {"error": "Invalid session"}
2616
-
2617
- try:
2618
- # Read audio data
2619
- content = await audio.read()
2620
- audio_array = np.frombuffer(content, dtype=np.int16).astype(np.float32) / 32768.0
2621
-
2622
- # Process in executor
2623
- result = await asyncio.get_event_loop().run_in_executor(
2624
- whisper_executor, gpu_asr_translate_whisper, audio_array, WHISPER_SAMPLE_RATE
2625
- )
2626
-
2627
- # Parse result
2628
- lines = result.split('<br>')
2629
- timestamp = lines[0].strip('[]') if lines else ""
2630
- text = lines[1].replace('[KO]', '').strip() if len(lines) > 1 else ""
2631
- translation = '<br>'.join(lines[2:-2]) if len(lines) > 3 else ""
2632
-
2633
- # Queue result
2634
- await realtime_sessions[session_id]["queue"].put({
2635
- "timestamp": timestamp,
2636
- "text": text,
2637
- "translation": translation
2638
- })
2639
-
2640
- return {"status": "ok"}
2641
-
2642
- except Exception as e:
2643
- print(f"Realtime process error: {e}")
2644
- return {"error": str(e)}
2645
-
2646
-
2647
- @app.get("/whisper/realtime/stream")
2648
- async def whisper_realtime_stream(session_id: str):
2649
- """Stream realtime results"""
2650
- if session_id not in realtime_sessions:
2651
- return JSONResponse({"error": "Invalid session"}, status_code=404)
2652
-
2653
- async def stream_results():
2654
- session = realtime_sessions[session_id]
2655
- try:
2656
- while session["active"]:
2657
- try:
2658
- result = await asyncio.wait_for(session["queue"].get(), timeout=1.0)
2659
- yield f"data: {json.dumps(result)}\n\n"
2660
- except asyncio.TimeoutError:
2661
- yield f"data: {json.dumps({'keepalive': True})}\n\n"
2662
- except Exception as e:
2663
- print(f"Stream error: {e}")
2664
- finally:
2665
- # Cleanup session
2666
- if session_id in realtime_sessions:
2667
- del realtime_sessions[session_id]
2668
-
2669
- return StreamingResponse(stream_results(), media_type="text/event-stream")
2670
-
2671
-
2672
  @app.get("/")
2673
  async def index():
2674
  """Serve the HTML page"""
 
6
  import numpy as np
7
  import openai
8
  from dotenv import load_dotenv
9
+ from fastapi import FastAPI, Request
10
+ from fastapi.responses import HTMLResponse, StreamingResponse
11
  from fastrtc import (
12
  AdditionalOutputs,
13
  AsyncStreamHandler,
 
23
  import io
24
  from scipy import signal
25
  import wave
 
 
 
 
 
 
 
 
 
26
 
27
  load_dotenv()
28
 
29
  SAMPLE_RATE = 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  # Supported languages for OpenAI Realtime API
32
  SUPPORTED_LANGUAGES = {
 
61
  "ms": "Bahasa Melayu (Malay)"
62
  }
63
 
64
+ # HTML content embedded as a string
65
  HTML_CONTENT = """<!DOCTYPE html>
66
  <html lang="ko">
67
 
68
  <head>
69
  <meta charset="UTF-8">
70
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
71
+ <title>Mouth of 'MOUSE'</title>
72
  <style>
73
  :root {
74
  --primary-color: #6f42c1;
 
110
  flex-shrink: 0;
111
  background-color: var(--card-bg);
112
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  .main-content {
114
  display: flex;
115
  gap: 20px;
 
219
  .text-input-section {
220
  margin-top: 15px;
221
  }
222
+ input[type="text"], textarea {
223
  width: 100%;
224
  background-color: var(--dark-bg);
225
  color: var(--text-color);
 
318
  flex-shrink: 0;
319
  padding-top: 20px;
320
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  /* Responsive design */
322
  @media (max-width: 1024px) {
323
  .sidebar {
324
  width: 300px;
325
  }
 
 
 
326
  }
327
  @media (max-width: 768px) {
328
  .main-content {
 
335
  .chat-section {
336
  height: 400px;
337
  }
 
 
 
338
  }
339
  button {
340
  background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
 
432
  background-color: #ff9800;
433
  color: white;
434
  }
 
 
 
 
435
  .status-indicator {
436
  display: inline-flex;
437
  align-items: center;
 
508
  <div class="mouse-ears mouse-ear-right"></div>
509
  <div class="mouse-face"></div>
510
  </div>
511
+ <h1>MOUSE 음성 챗</h1>
512
  </div>
513
  <div class="status-indicator">
514
  <div id="status-dot" class="status-dot disconnected"></div>
 
516
  </div>
517
  </div>
518
 
519
+ <div class="main-content">
520
+ <div class="sidebar">
521
+ <div class="settings-section">
522
+ <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
523
+ <div class="settings-grid">
524
+ <div class="setting-item">
525
+ <span class="setting-label">웹 검색</span>
526
+ <div id="search-toggle" class="toggle-switch">
527
+ <div class="toggle-slider"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  </div>
529
  </div>
530
+ <div class="setting-item">
531
+ <span class="setting-label">다국어 번역 채팅</span>
532
+ <select id="language-select">
533
+ <option value="">비활성화</option>
534
+ <option value="ko">한국어 (Korean)</option>
535
+ <option value="en">English</option>
536
+ <option value="es">Español (Spanish)</option>
537
+ <option value="fr">Français (French)</option>
538
+ <option value="de">Deutsch (German)</option>
539
+ <option value="it">Italiano (Italian)</option>
540
+ <option value="pt">Português (Portuguese)</option>
541
+ <option value="ru">Русский (Russian)</option>
542
+ <option value="ja">日本語 (Japanese)</option>
543
+ <option value="zh">中文 (Chinese)</option>
544
+ <option value="ar">العربية (Arabic)</option>
545
+ <option value="hi">हिन्दी (Hindi)</option>
546
+ <option value="nl">Nederlands (Dutch)</option>
547
+ <option value="pl">Polski (Polish)</option>
548
+ <option value="tr">Türkçe (Turkish)</option>
549
+ <option value="vi">Tiếng Việt (Vietnamese)</option>
550
+ <option value="th">ไทย (Thai)</option>
551
+ <option value="id">Bahasa Indonesia</option>
552
+ <option value="sv">Svenska (Swedish)</option>
553
+ <option value="da">Dansk (Danish)</option>
554
+ <option value="no">Norsk (Norwegian)</option>
555
+ <option value="fi">Suomi (Finnish)</option>
556
+ <option value="he">עברית (Hebrew)</option>
557
+ <option value="uk">Українська (Ukrainian)</option>
558
+ <option value="cs">Čeština (Czech)</option>
559
+ <option value="el">Ελληνικά (Greek)</option>
560
+ <option value="ro">Română (Romanian)</option>
561
+ <option value="hu">Magyar (Hungarian)</option>
562
+ <option value="ms">Bahasa Melayu (Malay)</option>
563
+ </select>
564
  </div>
565
  </div>
566
+ <div class="text-input-section">
567
+ <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
568
+ <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
569
  </div>
570
  </div>
571
 
572
+ <div class="controls">
573
+ <button id="start-button">대화 시작</button>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  </div>
575
  </div>
576
+
577
+ <div class="chat-section">
578
+ <div class="chat-container">
579
+ <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
580
+ <div class="chat-messages" id="chat-messages"></div>
581
+ <div class="text-input-section" style="margin-top: 10px;">
582
+ <div style="display: flex; gap: 10px;">
583
+ <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
584
+ <button id="send-button" style="display: none;">전송</button>
585
+ </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  </div>
587
  </div>
588
  </div>
589
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  </div>
591
  <audio id="audio-output"></audio>
592
 
593
  <script>
594
+ let peerConnection;
595
+ let webrtc_id;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  let webSearchEnabled = false;
597
  let selectedLanguage = "";
598
  let systemPrompt = "You are a helpful assistant. Respond in a friendly and professional manner.";
599
+ const audioOutput = document.getElementById('audio-output');
600
+ const startButton = document.getElementById('start-button');
601
+ const sendButton = document.getElementById('send-button');
602
+ const chatMessages = document.getElementById('chat-messages');
603
+ const statusDot = document.getElementById('status-dot');
604
+ const statusText = document.getElementById('status-text');
605
+ const searchToggle = document.getElementById('search-toggle');
606
+ const languageSelect = document.getElementById('language-select');
607
+ const systemPromptInput = document.getElementById('system-prompt');
608
+ const textInput = document.getElementById('text-input');
609
  let audioLevel = 0;
610
+ let animationFrame;
611
+ let audioContext, analyser, audioSource;
 
 
612
  let dataChannel = null;
613
  let isVoiceActive = false;
614
 
615
+ // Web search toggle functionality
616
+ searchToggle.addEventListener('click', () => {
617
+ webSearchEnabled = !webSearchEnabled;
618
+ searchToggle.classList.toggle('active', webSearchEnabled);
619
+ console.log('Web search enabled:', webSearchEnabled);
620
+ });
621
+
622
+ // Language selection
623
+ languageSelect.addEventListener('change', () => {
624
+ selectedLanguage = languageSelect.value;
625
+ console.log('Selected language:', selectedLanguage);
626
+ });
627
+
628
+ // System prompt update
629
+ systemPromptInput.addEventListener('input', () => {
630
+ systemPrompt = systemPromptInput.value || "You are a helpful assistant. Respond in a friendly and professional manner.";
631
+ });
632
+
633
+ // Text input handling
634
+ textInput.addEventListener('keypress', (e) => {
635
+ if (e.key === 'Enter' && !e.shiftKey) {
636
+ e.preventDefault();
637
+ sendTextMessage();
638
+ }
639
+ });
640
+
641
+ sendButton.addEventListener('click', sendTextMessage);
642
 
643
  async function sendTextMessage() {
 
 
644
  const message = textInput.value.trim();
645
  if (!message) return;
646
 
 
694
  }
695
 
696
  function updateStatus(state) {
 
 
 
 
697
  statusDot.className = 'status-dot ' + state;
698
  if (state === 'connected') {
699
  statusText.textContent = '연결됨';
700
+ sendButton.style.display = 'block';
701
  isVoiceActive = true;
702
  } else if (state === 'connecting') {
703
  statusText.textContent = '연결 중...';
704
+ sendButton.style.display = 'none';
705
  } else {
706
  statusText.textContent = '연결 대기 중';
707
+ sendButton.style.display = 'block'; // Show send button even when disconnected for text chat
708
  isVoiceActive = false;
709
  }
710
  }
 
711
  function updateButtonState() {
712
  const button = document.getElementById('start-button');
713
  if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
 
737
  updateStatus('disconnected');
738
  }
739
  }
 
740
  function setupAudioVisualization(stream) {
741
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
742
  analyser = audioContext.createAnalyser();
 
771
 
772
  updateAudioLevel();
773
  }
 
774
  function showError(message) {
775
  const toast = document.getElementById('error-toast');
776
  toast.textContent = message;
 
780
  toast.style.display = 'none';
781
  }, 5000);
782
  }
 
 
 
 
 
 
 
 
 
 
 
783
  async function setupWebRTC() {
784
+ const config = __RTC_CONFIGURATION__;
 
785
  peerConnection = new RTCPeerConnection(config);
786
  const timeoutId = setTimeout(() => {
787
  const toast = document.getElementById('error-toast');
 
845
  });
846
  webrtc_id = Math.random().toString(36).substring(7);
847
 
848
+ // Log current settings before sending
849
+ console.log('Sending offer with settings:', {
850
+ webrtc_id: webrtc_id,
851
+ web_search_enabled: webSearchEnabled,
852
+ target_language: selectedLanguage,
853
+ system_prompt: systemPrompt
854
+ });
855
+
856
  const response = await fetch('/webrtc/offer', {
857
  method: 'POST',
858
  headers: { 'Content-Type': 'application/json' },
 
897
  stop();
898
  }
899
  }
 
900
  function addMessage(role, content) {
 
901
  const messageDiv = document.createElement('div');
902
  messageDiv.classList.add('message', role);
903
 
 
986
  webrtc_id = null;
987
  }
988
  }
989
+ startButton.addEventListener('click', () => {
990
+ console.log('clicked');
991
+ console.log(peerConnection, peerConnection?.connectionState);
992
+ if (!peerConnection || peerConnection.connectionState !== 'connected') {
993
+ setupWebRTC();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
994
  } else {
995
+ console.log('stopping');
996
+ stop();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
997
  }
998
+ });
 
 
 
 
999
 
1000
+ // Initialize on page load
1001
+ window.addEventListener('DOMContentLoaded', () => {
1002
+ sendButton.style.display = 'block';
1003
+ });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1004
  </script>
1005
  </body>
1006
 
1007
  </html>"""
1008
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
 
1010
  class BraveSearchClient:
1011
  """Brave Search API client"""
 
1056
  # Store connection settings
1057
  connection_settings = {}
1058
 
 
 
 
1059
  # Initialize OpenAI client for text chat
1060
  client = openai.AsyncOpenAI()
1061
 
 
1140
 
1141
  # Call GPT-4o-mini
1142
  response = await client.chat.completions.create(
1143
+ model="gpt-4.1-mini",
1144
  messages=messages,
1145
  temperature=0.7,
1146
  max_tokens=2000
 
1156
  # Try again with stronger prompt
1157
  messages[-1] = {"role": "user", "content": f"ONLY {SUPPORTED_LANGUAGES.get(target_language, target_language)}, NO KOREAN: {message}"}
1158
  retry_response = await client.chat.completions.create(
1159
+ model="gpt-4.1-mini",
1160
  messages=messages,
1161
  temperature=0.3,
1162
  max_tokens=2000
 
1715
  return StreamingResponse(output_stream(), media_type="text/event-stream")
1716
 
1717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1718
  @app.get("/")
1719
  async def index():
1720
  """Serve the HTML page"""