seawolf2357 commited on
Commit
c509d9d
·
verified ·
1 Parent(s): a8e8360

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +996 -72
app.py CHANGED
@@ -6,8 +6,8 @@ import os
6
  import numpy as np
7
  import openai
8
  from dotenv import load_dotenv
9
- from fastapi import FastAPI, Request
10
- from fastapi.responses import HTMLResponse, StreamingResponse
11
  from fastrtc import (
12
  AdditionalOutputs,
13
  AsyncStreamHandler,
@@ -23,10 +23,35 @@ import gradio as gr
23
  import io
24
  from scipy import signal
25
  import wave
 
 
 
 
 
 
 
 
 
26
 
27
  load_dotenv()
28
 
29
  SAMPLE_RATE = 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  # Supported languages for OpenAI Realtime API
32
  SUPPORTED_LANGUAGES = {
@@ -61,14 +86,14 @@ SUPPORTED_LANGUAGES = {
61
  "ms": "Bahasa Melayu (Malay)"
62
  }
63
 
64
- # HTML content embedded as a string
65
  HTML_CONTENT = """<!DOCTYPE html>
66
  <html lang="ko">
67
 
68
  <head>
69
  <meta charset="UTF-8">
70
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
71
- <title>Mouth of 'MOUSE'</title>
72
  <style>
73
  :root {
74
  --primary-color: #6f42c1;
@@ -110,6 +135,43 @@ HTML_CONTENT = """<!DOCTYPE html>
110
  flex-shrink: 0;
111
  background-color: var(--card-bg);
112
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  .main-content {
114
  display: flex;
115
  gap: 20px;
@@ -219,7 +281,7 @@ HTML_CONTENT = """<!DOCTYPE html>
219
  .text-input-section {
220
  margin-top: 15px;
221
  }
222
- input[type="text"], textarea {
223
  width: 100%;
224
  background-color: var(--dark-bg);
225
  color: var(--text-color);
@@ -318,11 +380,91 @@ HTML_CONTENT = """<!DOCTYPE html>
318
  flex-shrink: 0;
319
  padding-top: 20px;
320
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  /* Responsive design */
322
  @media (max-width: 1024px) {
323
  .sidebar {
324
  width: 300px;
325
  }
 
 
 
326
  }
327
  @media (max-width: 768px) {
328
  .main-content {
@@ -335,6 +477,9 @@ HTML_CONTENT = """<!DOCTYPE html>
335
  .chat-section {
336
  height: 400px;
337
  }
 
 
 
338
  }
339
  button {
340
  background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
@@ -432,6 +577,10 @@ HTML_CONTENT = """<!DOCTYPE html>
432
  background-color: #ff9800;
433
  color: white;
434
  }
 
 
 
 
435
  .status-indicator {
436
  display: inline-flex;
437
  align-items: center;
@@ -508,7 +657,7 @@ HTML_CONTENT = """<!DOCTYPE html>
508
  <div class="mouse-ears mouse-ear-right"></div>
509
  <div class="mouse-face"></div>
510
  </div>
511
- <h1>MOUSE 음성 챗</h1>
512
  </div>
513
  <div class="status-indicator">
514
  <div id="status-dot" class="status-dot disconnected"></div>
@@ -516,81 +665,218 @@ HTML_CONTENT = """<!DOCTYPE html>
516
  </div>
517
  </div>
518
 
519
- <div class="main-content">
520
- <div class="sidebar">
521
- <div class="settings-section">
522
- <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
523
- <div class="settings-grid">
524
- <div class="setting-item">
525
- <span class="setting-label">웹 검색</span>
526
- <div id="search-toggle" class="toggle-switch">
527
- <div class="toggle-slider"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  </div>
529
  </div>
530
- <div class="setting-item">
531
- <span class="setting-label">다국어 번역 채팅</span>
532
- <select id="language-select">
533
- <option value="">비활성화</option>
534
- <option value="ko">한국어 (Korean)</option>
535
- <option value="en">English</option>
536
- <option value="es">Español (Spanish)</option>
537
- <option value="fr">Français (French)</option>
538
- <option value="de">Deutsch (German)</option>
539
- <option value="it">Italiano (Italian)</option>
540
- <option value="pt">Português (Portuguese)</option>
541
- <option value="ru">Русский (Russian)</option>
542
- <option value="ja">日本語 (Japanese)</option>
543
- <option value="zh">中文 (Chinese)</option>
544
- <option value="ar">العربية (Arabic)</option>
545
- <option value="hi">हिन्दी (Hindi)</option>
546
- <option value="nl">Nederlands (Dutch)</option>
547
- <option value="pl">Polski (Polish)</option>
548
- <option value="tr">Türkçe (Turkish)</option>
549
- <option value="vi">Tiếng Việt (Vietnamese)</option>
550
- <option value="th">ไทย (Thai)</option>
551
- <option value="id">Bahasa Indonesia</option>
552
- <option value="sv">Svenska (Swedish)</option>
553
- <option value="da">Dansk (Danish)</option>
554
- <option value="no">Norsk (Norwegian)</option>
555
- <option value="fi">Suomi (Finnish)</option>
556
- <option value="he">עברית (Hebrew)</option>
557
- <option value="uk">Українська (Ukrainian)</option>
558
- <option value="cs">Čeština (Czech)</option>
559
- <option value="el">Ελληνικά (Greek)</option>
560
- <option value="ro">Română (Romanian)</option>
561
- <option value="hu">Magyar (Hungarian)</option>
562
- <option value="ms">Bahasa Melayu (Malay)</option>
563
- </select>
564
  </div>
565
  </div>
566
- <div class="text-input-section">
567
- <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
568
- <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
569
  </div>
570
  </div>
571
 
572
- <div class="controls">
573
- <button id="start-button">대화 시작</button>
 
 
 
 
 
 
 
 
 
574
  </div>
575
  </div>
576
-
577
- <div class="chat-section">
578
- <div class="chat-container">
579
- <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
580
- <div class="chat-messages" id="chat-messages"></div>
581
- <div class="text-input-section" style="margin-top: 10px;">
582
- <div style="display: flex; gap: 10px;">
583
- <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
584
- <button id="send-button" style="display: none;">전송</button>
585
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  </div>
587
  </div>
588
  </div>
589
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
  </div>
591
  <audio id="audio-output"></audio>
592
 
593
  <script>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
594
  let peerConnection;
595
  let webrtc_id;
596
  let webSearchEnabled = false;
@@ -612,6 +898,13 @@ HTML_CONTENT = """<!DOCTYPE html>
612
  let dataChannel = null;
613
  let isVoiceActive = false;
614
 
 
 
 
 
 
 
 
615
  // Web search toggle functionality
616
  searchToggle.addEventListener('click', () => {
617
  webSearchEnabled = !webSearchEnabled;
@@ -708,6 +1001,7 @@ HTML_CONTENT = """<!DOCTYPE html>
708
  isVoiceActive = false;
709
  }
710
  }
 
711
  function updateButtonState() {
712
  const button = document.getElementById('start-button');
713
  if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
@@ -737,6 +1031,7 @@ HTML_CONTENT = """<!DOCTYPE html>
737
  updateStatus('disconnected');
738
  }
739
  }
 
740
  function setupAudioVisualization(stream) {
741
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
742
  analyser = audioContext.createAnalyser();
@@ -771,6 +1066,7 @@ HTML_CONTENT = """<!DOCTYPE html>
771
 
772
  updateAudioLevel();
773
  }
 
774
  function showError(message) {
775
  const toast = document.getElementById('error-toast');
776
  toast.textContent = message;
@@ -780,6 +1076,17 @@ HTML_CONTENT = """<!DOCTYPE html>
780
  toast.style.display = 'none';
781
  }, 5000);
782
  }
 
 
 
 
 
 
 
 
 
 
 
783
  async function setupWebRTC() {
784
  const config = __RTC_CONFIGURATION__;
785
  peerConnection = new RTCPeerConnection(config);
@@ -845,14 +1152,6 @@ HTML_CONTENT = """<!DOCTYPE html>
845
  });
846
  webrtc_id = Math.random().toString(36).substring(7);
847
 
848
- // Log current settings before sending
849
- console.log('Sending offer with settings:', {
850
- webrtc_id: webrtc_id,
851
- web_search_enabled: webSearchEnabled,
852
- target_language: selectedLanguage,
853
- system_prompt: systemPrompt
854
- });
855
-
856
  const response = await fetch('/webrtc/offer', {
857
  method: 'POST',
858
  headers: { 'Content-Type': 'application/json' },
@@ -897,6 +1196,7 @@ HTML_CONTENT = """<!DOCTYPE html>
897
  stop();
898
  }
899
  }
 
900
  function addMessage(role, content) {
901
  const messageDiv = document.createElement('div');
902
  messageDiv.classList.add('message', role);
@@ -986,6 +1286,7 @@ HTML_CONTENT = """<!DOCTYPE html>
986
  webrtc_id = null;
987
  }
988
  }
 
989
  startButton.addEventListener('click', () => {
990
  console.log('clicked');
991
  console.log(peerConnection, peerConnection?.connectionState);
@@ -997,6 +1298,320 @@ HTML_CONTENT = """<!DOCTYPE html>
997
  }
998
  });
999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000
  // Initialize on page load
1001
  window.addEventListener('DOMContentLoaded', () => {
1002
  sendButton.style.display = 'block';
@@ -1006,6 +1621,108 @@ HTML_CONTENT = """<!DOCTYPE html>
1006
 
1007
  </html>"""
1008
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
 
1010
  class BraveSearchClient:
1011
  """Brave Search API client"""
@@ -1056,6 +1773,9 @@ print(f"Search client initialized: {search_client is not None}, API key present:
1056
  # Store connection settings
1057
  connection_settings = {}
1058
 
 
 
 
1059
  # Initialize OpenAI client for text chat
1060
  client = openai.AsyncOpenAI()
1061
 
@@ -1715,6 +2435,210 @@ async def outputs(webrtc_id: str):
1715
  return StreamingResponse(output_stream(), media_type="text/event-stream")
1716
 
1717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1718
  @app.get("/")
1719
  async def index():
1720
  """Serve the HTML page"""
 
6
  import numpy as np
7
  import openai
8
  from dotenv import load_dotenv
9
+ from fastapi import FastAPI, Request, UploadFile, File, Form
10
+ from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
11
  from fastrtc import (
12
  AdditionalOutputs,
13
  AsyncStreamHandler,
 
23
  import io
24
  from scipy import signal
25
  import wave
26
+ import torch
27
+ from transformers import pipeline
28
+ import tempfile
29
+ import subprocess
30
+ import pdfplumber
31
+ import scipy.signal as sps
32
+ from datetime import datetime
33
+ from zoneinfo import ZoneInfo
34
+ import concurrent.futures
35
 
36
  load_dotenv()
37
 
38
  SAMPLE_RATE = 24000
39
+ WHISPER_SAMPLE_RATE = 16000
40
+ SEOUL_TZ = ZoneInfo("Asia/Seoul")
41
+
42
+ # Whisper model settings
43
+ WHISPER_MODEL_NAME = "openai/whisper-large-v3-turbo"
44
+ WHISPER_BATCH_SIZE = 8
45
+
46
+ # Real-time segmentation parameters
47
+ MIN_SEG_SEC = 10
48
+ MAX_SEG_SEC = 15
49
+ SILENCE_SEC = 0.6
50
+ SILENCE_THRESH = 1e-4
51
+
52
+ # CPU-side pool for Whisper tasks
53
+ whisper_executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
54
+ whisper_futures_queue: list[concurrent.futures.Future] = []
55
 
56
  # Supported languages for OpenAI Realtime API
57
  SUPPORTED_LANGUAGES = {
 
86
  "ms": "Bahasa Melayu (Malay)"
87
  }
88
 
89
+ # HTML content embedded as a string (extended with new tabs)
90
  HTML_CONTENT = """<!DOCTYPE html>
91
  <html lang="ko">
92
 
93
  <head>
94
  <meta charset="UTF-8">
95
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
96
+ <title>Mouth of 'MOUSE' - Extended</title>
97
  <style>
98
  :root {
99
  --primary-color: #6f42c1;
 
135
  flex-shrink: 0;
136
  background-color: var(--card-bg);
137
  }
138
+ .tabs-container {
139
+ display: flex;
140
+ gap: 10px;
141
+ margin-bottom: 20px;
142
+ border-bottom: 2px solid var(--border-color);
143
+ padding-bottom: 10px;
144
+ overflow-x: auto;
145
+ scrollbar-width: thin;
146
+ scrollbar-color: var(--primary-color) var(--card-bg);
147
+ }
148
+ .tab-button {
149
+ padding: 10px 20px;
150
+ background-color: var(--card-bg);
151
+ color: var(--text-color);
152
+ border: 1px solid var(--border-color);
153
+ border-radius: 8px 8px 0 0;
154
+ cursor: pointer;
155
+ transition: all 0.3s;
156
+ white-space: nowrap;
157
+ font-size: 14px;
158
+ }
159
+ .tab-button:hover {
160
+ background-color: var(--secondary-color);
161
+ }
162
+ .tab-button.active {
163
+ background-color: var(--primary-color);
164
+ border-bottom: 2px solid var(--primary-color);
165
+ }
166
+ .tab-content {
167
+ display: none;
168
+ flex-grow: 1;
169
+ overflow: hidden;
170
+ }
171
+ .tab-content.active {
172
+ display: flex;
173
+ flex-direction: column;
174
+ }
175
  .main-content {
176
  display: flex;
177
  gap: 20px;
 
281
  .text-input-section {
282
  margin-top: 15px;
283
  }
284
+ input[type="text"], input[type="file"], textarea {
285
  width: 100%;
286
  background-color: var(--dark-bg);
287
  color: var(--text-color);
 
380
  flex-shrink: 0;
381
  padding-top: 20px;
382
  }
383
+ /* Whisper Tab Styles */
384
+ .whisper-container {
385
+ padding: 20px;
386
+ background-color: var(--card-bg);
387
+ border-radius: 12px;
388
+ border: 1px solid var(--border-color);
389
+ margin: 20px auto;
390
+ max-width: 800px;
391
+ width: 100%;
392
+ }
393
+ .whisper-input-section {
394
+ margin-bottom: 20px;
395
+ }
396
+ .whisper-output-section {
397
+ display: grid;
398
+ grid-template-columns: 1fr 1fr;
399
+ gap: 20px;
400
+ margin-top: 20px;
401
+ }
402
+ .whisper-output {
403
+ background-color: var(--dark-bg);
404
+ padding: 15px;
405
+ border-radius: 8px;
406
+ border: 1px solid var(--border-color);
407
+ max-height: 300px;
408
+ overflow-y: auto;
409
+ }
410
+ .whisper-output h4 {
411
+ margin-top: 0;
412
+ color: var(--primary-color);
413
+ }
414
+ .file-upload-area {
415
+ border: 2px dashed var(--border-color);
416
+ border-radius: 8px;
417
+ padding: 30px;
418
+ text-align: center;
419
+ cursor: pointer;
420
+ transition: all 0.3s;
421
+ }
422
+ .file-upload-area:hover {
423
+ border-color: var(--primary-color);
424
+ background-color: rgba(111, 66, 193, 0.1);
425
+ }
426
+ .file-upload-area.drag-over {
427
+ border-color: var(--primary-color);
428
+ background-color: rgba(111, 66, 193, 0.2);
429
+ }
430
+ .realtime-output {
431
+ background-color: var(--dark-bg);
432
+ padding: 20px;
433
+ border-radius: 8px;
434
+ margin-top: 20px;
435
+ min-height: 200px;
436
+ max-height: 400px;
437
+ overflow-y: auto;
438
+ }
439
+ .recording-indicator {
440
+ display: inline-flex;
441
+ align-items: center;
442
+ gap: 10px;
443
+ padding: 10px 20px;
444
+ background-color: #f44336;
445
+ color: white;
446
+ border-radius: 20px;
447
+ animation: pulse 1.5s infinite;
448
+ }
449
+ .recording-dot {
450
+ width: 10px;
451
+ height: 10px;
452
+ background-color: white;
453
+ border-radius: 50%;
454
+ animation: blink 1s infinite;
455
+ }
456
+ @keyframes blink {
457
+ 0%, 50% { opacity: 1; }
458
+ 51%, 100% { opacity: 0; }
459
+ }
460
  /* Responsive design */
461
  @media (max-width: 1024px) {
462
  .sidebar {
463
  width: 300px;
464
  }
465
+ .whisper-output-section {
466
+ grid-template-columns: 1fr;
467
+ }
468
  }
469
  @media (max-width: 768px) {
470
  .main-content {
 
477
  .chat-section {
478
  height: 400px;
479
  }
480
+ .tabs-container {
481
+ flex-wrap: wrap;
482
+ }
483
  }
484
  button {
485
  background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
 
577
  background-color: #ff9800;
578
  color: white;
579
  }
580
+ .toast.success {
581
+ background-color: #4caf50;
582
+ color: white;
583
+ }
584
  .status-indicator {
585
  display: inline-flex;
586
  align-items: center;
 
657
  <div class="mouse-ears mouse-ear-right"></div>
658
  <div class="mouse-face"></div>
659
  </div>
660
+ <h1>MOUSE Extended</h1>
661
  </div>
662
  <div class="status-indicator">
663
  <div id="status-dot" class="status-dot disconnected"></div>
 
665
  </div>
666
  </div>
667
 
668
+ <div class="tabs-container">
669
+ <button class="tab-button active" onclick="switchTab('voice-chat')">음성 채팅</button>
670
+ <button class="tab-button" onclick="switchTab('mic-whisper')">마이크 전사</button>
671
+ <button class="tab-button" onclick="switchTab('audio-whisper')">오디오 파일</button>
672
+ <button class="tab-button" onclick="switchTab('video-whisper')">비디오 파일</button>
673
+ <button class="tab-button" onclick="switchTab('pdf-whisper')">PDF 번역</button>
674
+ <button class="tab-button" onclick="switchTab('realtime-whisper')">실시간 통역</button>
675
+ </div>
676
+
677
+ <!-- Voice Chat Tab (Original) -->
678
+ <div id="voice-chat" class="tab-content active">
679
+ <div class="main-content">
680
+ <div class="sidebar">
681
+ <div class="settings-section">
682
+ <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
683
+ <div class="settings-grid">
684
+ <div class="setting-item">
685
+ <span class="setting-label">웹 검색</span>
686
+ <div id="search-toggle" class="toggle-switch">
687
+ <div class="toggle-slider"></div>
688
+ </div>
689
+ </div>
690
+ <div class="setting-item">
691
+ <span class="setting-label">다국어 번역 채팅</span>
692
+ <select id="language-select">
693
+ <option value="">비활성화</option>
694
+ <option value="ko">한국어 (Korean)</option>
695
+ <option value="en">English</option>
696
+ <option value="es">Español (Spanish)</option>
697
+ <option value="fr">Français (French)</option>
698
+ <option value="de">Deutsch (German)</option>
699
+ <option value="it">Italiano (Italian)</option>
700
+ <option value="pt">Português (Portuguese)</option>
701
+ <option value="ru">Русский (Russian)</option>
702
+ <option value="ja">日本語 (Japanese)</option>
703
+ <option value="zh">中文 (Chinese)</option>
704
+ <option value="ar">العربية (Arabic)</option>
705
+ <option value="hi">हिन्दी (Hindi)</option>
706
+ <option value="nl">Nederlands (Dutch)</option>
707
+ <option value="pl">Polski (Polish)</option>
708
+ <option value="tr">Türkçe (Turkish)</option>
709
+ <option value="vi">Tiếng Việt (Vietnamese)</option>
710
+ <option value="th">ไทย (Thai)</option>
711
+ <option value="id">Bahasa Indonesia</option>
712
+ <option value="sv">Svenska (Swedish)</option>
713
+ <option value="da">Dansk (Danish)</option>
714
+ <option value="no">Norsk (Norwegian)</option>
715
+ <option value="fi">Suomi (Finnish)</option>
716
+ <option value="he">עברית (Hebrew)</option>
717
+ <option value="uk">Українська (Ukrainian)</option>
718
+ <option value="cs">Čeština (Czech)</option>
719
+ <option value="el">Ελληνικά (Greek)</option>
720
+ <option value="ro">Română (Romanian)</option>
721
+ <option value="hu">Magyar (Hungarian)</option>
722
+ <option value="ms">Bahasa Melayu (Malay)</option>
723
+ </select>
724
  </div>
725
  </div>
726
+ <div class="text-input-section">
727
+ <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
728
+ <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
729
  </div>
730
  </div>
731
+
732
+ <div class="controls">
733
+ <button id="start-button">대화 시작</button>
734
  </div>
735
  </div>
736
 
737
+ <div class="chat-section">
738
+ <div class="chat-container">
739
+ <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
740
+ <div class="chat-messages" id="chat-messages"></div>
741
+ <div class="text-input-section" style="margin-top: 10px;">
742
+ <div style="display: flex; gap: 10px;">
743
+ <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
744
+ <button id="send-button" style="display: none;">전송</button>
745
+ </div>
746
+ </div>
747
+ </div>
748
  </div>
749
  </div>
750
+ </div>
751
+
752
+ <!-- Microphone Whisper Tab -->
753
+ <div id="mic-whisper" class="tab-content">
754
+ <div class="whisper-container">
755
+ <h3>마이크 녹음 전사 및 4개 언어 번역</h3>
756
+ <div class="whisper-input-section">
757
+ <button id="mic-record-btn" onclick="toggleMicRecording()">녹음 시작</button>
758
+ <div id="mic-status" style="margin-top: 10px;"></div>
759
+ </div>
760
+ <div class="whisper-output-section">
761
+ <div class="whisper-output">
762
+ <h4>원문</h4>
763
+ <div id="mic-original"></div>
764
+ </div>
765
+ <div class="whisper-output">
766
+ <h4>번역 (EN/ZH/TH/RU)</h4>
767
+ <div id="mic-translation"></div>
768
+ </div>
769
+ </div>
770
+ </div>
771
+ </div>
772
+
773
+ <!-- Audio File Whisper Tab -->
774
+ <div id="audio-whisper" class="tab-content">
775
+ <div class="whisper-container">
776
+ <h3>오디오 파일 → 전사 및 4개 언어 번역</h3>
777
+ <div class="whisper-input-section">
778
+ <div class="file-upload-area" id="audio-upload-area">
779
+ <p>오디오 파일을 드래그하거나 클릭하여 업로드</p>
780
+ <input type="file" id="audio-file-input" accept="audio/*" style="display: none;">
781
+ </div>
782
+ </div>
783
+ <div class="whisper-output-section">
784
+ <div class="whisper-output">
785
+ <h4>원문</h4>
786
+ <div id="audio-original"></div>
787
+ </div>
788
+ <div class="whisper-output">
789
+ <h4>번역 (EN/ZH/TH/RU)</h4>
790
+ <div id="audio-translation"></div>
791
  </div>
792
  </div>
793
  </div>
794
  </div>
795
+
796
+ <!-- Video File Whisper Tab -->
797
+ <div id="video-whisper" class="tab-content">
798
+ <div class="whisper-container">
799
+ <h3>비디오 파일 → 오디오 추출 → 전사 및 4개 언어 번역</h3>
800
+ <div class="whisper-input-section">
801
+ <div class="file-upload-area" id="video-upload-area">
802
+ <p>비디오 파일을 드래그하거나 클릭하여 업로드</p>
803
+ <input type="file" id="video-file-input" accept="video/*" style="display: none;">
804
+ </div>
805
+ </div>
806
+ <div class="whisper-output-section">
807
+ <div class="whisper-output">
808
+ <h4>원문</h4>
809
+ <div id="video-original"></div>
810
+ </div>
811
+ <div class="whisper-output">
812
+ <h4>번역 (EN/ZH/TH/RU)</h4>
813
+ <div id="video-translation"></div>
814
+ </div>
815
+ </div>
816
+ </div>
817
+ </div>
818
+
819
+ <!-- PDF Whisper Tab -->
820
+ <div id="pdf-whisper" class="tab-content">
821
+ <div class="whisper-container">
822
+ <h3>PDF 파일 → 텍스트 추출 → 4개 언어 번역</h3>
823
+ <div class="whisper-input-section">
824
+ <div class="file-upload-area" id="pdf-upload-area">
825
+ <p>PDF 파일을 드래그하거나 클릭하여 업로드</p>
826
+ <input type="file" id="pdf-file-input" accept=".pdf" style="display: none;">
827
+ </div>
828
+ <div class="setting-item" style="margin-top: 15px;">
829
+ <span class="setting-label">최대 페이지:</span>
830
+ <input type="number" id="pdf-max-pages" value="10" min="1" max="50" style="width: 80px;">
831
+ </div>
832
+ </div>
833
+ <div class="whisper-output-section">
834
+ <div class="whisper-output">
835
+ <h4>추출된 텍스트</h4>
836
+ <div id="pdf-original"></div>
837
+ </div>
838
+ <div class="whisper-output">
839
+ <h4>번역 (EN/ZH/TH/RU)</h4>
840
+ <div id="pdf-translation"></div>
841
+ </div>
842
+ </div>
843
+ </div>
844
+ </div>
845
+
846
+ <!-- Realtime Whisper Tab -->
847
+ <div id="realtime-whisper" class="tab-content">
848
+ <div class="whisper-container">
849
+ <h3>실시간 통역 (Korean → EN/ZH/TH/RU)</h3>
850
+ <p>10-15초 문장 단위로 자동 전환 — 최신 내용이 위에 표시됩니다.</p>
851
+ <div class="whisper-input-section">
852
+ <button id="realtime-start-btn" onclick="toggleRealtimeRecording()">실시간 통역 시작</button>
853
+ <div id="realtime-status" style="margin-top: 10px;"></div>
854
+ </div>
855
+ <div class="realtime-output" id="realtime-output"></div>
856
+ </div>
857
+ </div>
858
  </div>
859
  <audio id="audio-output"></audio>
860
 
861
  <script>
862
+ // Tab switching functionality
863
+ function switchTab(tabName) {
864
+ // Hide all tabs
865
+ const tabs = document.querySelectorAll('.tab-content');
866
+ tabs.forEach(tab => tab.classList.remove('active'));
867
+
868
+ // Remove active class from all buttons
869
+ const buttons = document.querySelectorAll('.tab-button');
870
+ buttons.forEach(btn => btn.classList.remove('active'));
871
+
872
+ // Show selected tab
873
+ document.getElementById(tabName).classList.add('active');
874
+
875
+ // Add active class to clicked button
876
+ event.target.classList.add('active');
877
+ }
878
+
879
+ // Original voice chat variables and functions
880
  let peerConnection;
881
  let webrtc_id;
882
  let webSearchEnabled = false;
 
898
  let dataChannel = null;
899
  let isVoiceActive = false;
900
 
901
+ // Whisper-related variables
902
+ let micRecorder = null;
903
+ let isRecording = false;
904
+ let realtimeRecorder = null;
905
+ let isRealtimeRecording = false;
906
+ let realtimeStream = null;
907
+
908
  // Web search toggle functionality
909
  searchToggle.addEventListener('click', () => {
910
  webSearchEnabled = !webSearchEnabled;
 
1001
  isVoiceActive = false;
1002
  }
1003
  }
1004
+
1005
  function updateButtonState() {
1006
  const button = document.getElementById('start-button');
1007
  if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
 
1031
  updateStatus('disconnected');
1032
  }
1033
  }
1034
+
1035
  function setupAudioVisualization(stream) {
1036
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
1037
  analyser = audioContext.createAnalyser();
 
1066
 
1067
  updateAudioLevel();
1068
  }
1069
+
1070
  function showError(message) {
1071
  const toast = document.getElementById('error-toast');
1072
  toast.textContent = message;
 
1076
  toast.style.display = 'none';
1077
  }, 5000);
1078
  }
1079
+
1080
+ function showSuccess(message) {
1081
+ const toast = document.getElementById('error-toast');
1082
+ toast.textContent = message;
1083
+ toast.className = 'toast success';
1084
+ toast.style.display = 'block';
1085
+ setTimeout(() => {
1086
+ toast.style.display = 'none';
1087
+ }, 3000);
1088
+ }
1089
+
1090
  async function setupWebRTC() {
1091
  const config = __RTC_CONFIGURATION__;
1092
  peerConnection = new RTCPeerConnection(config);
 
1152
  });
1153
  webrtc_id = Math.random().toString(36).substring(7);
1154
 
 
 
 
 
 
 
 
 
1155
  const response = await fetch('/webrtc/offer', {
1156
  method: 'POST',
1157
  headers: { 'Content-Type': 'application/json' },
 
1196
  stop();
1197
  }
1198
  }
1199
+
1200
  function addMessage(role, content) {
1201
  const messageDiv = document.createElement('div');
1202
  messageDiv.classList.add('message', role);
 
1286
  webrtc_id = null;
1287
  }
1288
  }
1289
+
1290
  startButton.addEventListener('click', () => {
1291
  console.log('clicked');
1292
  console.log(peerConnection, peerConnection?.connectionState);
 
1298
  }
1299
  });
1300
 
1301
+ // Whisper Tab Functions
1302
+
1303
+ // Microphone recording
1304
+ async function toggleMicRecording() {
1305
+ const btn = document.getElementById('mic-record-btn');
1306
+ const status = document.getElementById('mic-status');
1307
+
1308
+ if (!isRecording) {
1309
+ try {
1310
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
1311
+ micRecorder = new MediaRecorder(stream);
1312
+ const chunks = [];
1313
+
1314
+ micRecorder.ondataavailable = (e) => chunks.push(e.data);
1315
+ micRecorder.onstop = async () => {
1316
+ const blob = new Blob(chunks, { type: 'audio/webm' });
1317
+ await processAudioBlob(blob, 'mic');
1318
+ stream.getTracks().forEach(track => track.stop());
1319
+ };
1320
+
1321
+ micRecorder.start();
1322
+ isRecording = true;
1323
+ btn.textContent = '녹음 중지';
1324
+ status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>녹음 중...</div>';
1325
+ } catch (err) {
1326
+ showError('마이크 접근 권한이 필요합니다.');
1327
+ }
1328
+ } else {
1329
+ micRecorder.stop();
1330
+ isRecording = false;
1331
+ btn.textContent = '녹음 시작';
1332
+ status.textContent = '처리 중...';
1333
+ }
1334
+ }
1335
+
1336
+ // File upload handlers
1337
+ document.getElementById('audio-upload-area').addEventListener('click', () => {
1338
+ document.getElementById('audio-file-input').click();
1339
+ });
1340
+
1341
+ document.getElementById('video-upload-area').addEventListener('click', () => {
1342
+ document.getElementById('video-file-input').click();
1343
+ });
1344
+
1345
+ document.getElementById('pdf-upload-area').addEventListener('click', () => {
1346
+ document.getElementById('pdf-file-input').click();
1347
+ });
1348
+
1349
+ // Drag and drop handlers
1350
+ ['audio', 'video', 'pdf'].forEach(type => {
1351
+ const area = document.getElementById(`${type}-upload-area`);
1352
+
1353
+ area.addEventListener('dragover', (e) => {
1354
+ e.preventDefault();
1355
+ area.classList.add('drag-over');
1356
+ });
1357
+
1358
+ area.addEventListener('dragleave', () => {
1359
+ area.classList.remove('drag-over');
1360
+ });
1361
+
1362
+ area.addEventListener('drop', (e) => {
1363
+ e.preventDefault();
1364
+ area.classList.remove('drag-over');
1365
+ const file = e.dataTransfer.files[0];
1366
+ if (file) {
1367
+ if (type === 'audio') processAudioFile(file);
1368
+ else if (type === 'video') processVideoFile(file);
1369
+ else if (type === 'pdf') processPDFFile(file);
1370
+ }
1371
+ });
1372
+ });
1373
+
1374
+ // File input change handlers
1375
+ document.getElementById('audio-file-input').addEventListener('change', (e) => {
1376
+ if (e.target.files[0]) processAudioFile(e.target.files[0]);
1377
+ });
1378
+
1379
+ document.getElementById('video-file-input').addEventListener('change', (e) => {
1380
+ if (e.target.files[0]) processVideoFile(e.target.files[0]);
1381
+ });
1382
+
1383
+ document.getElementById('pdf-file-input').addEventListener('change', (e) => {
1384
+ if (e.target.files[0]) processPDFFile(e.target.files[0]);
1385
+ });
1386
+
1387
+ // Process audio blob (for microphone recording)
1388
+ async function processAudioBlob(blob, type) {
1389
+ const formData = new FormData();
1390
+ formData.append('audio', blob, 'recording.webm');
1391
+
1392
+ try {
1393
+ const response = await fetch('/whisper/transcribe', {
1394
+ method: 'POST',
1395
+ body: formData
1396
+ });
1397
+
1398
+ const result = await response.json();
1399
+ if (result.error) {
1400
+ showError(result.error);
1401
+ } else {
1402
+ document.getElementById(`${type}-original`).textContent = result.text;
1403
+ document.getElementById(`${type}-translation`).innerHTML = result.translation.replace(/\n/g, '<br>');
1404
+ document.getElementById(`${type}-status`).textContent = '';
1405
+ }
1406
+ } catch (error) {
1407
+ showError('처리 중 오류가 발생했습니다.');
1408
+ document.getElementById(`${type}-status`).textContent = '';
1409
+ }
1410
+ }
1411
+
1412
+ // Process audio file
1413
+ async function processAudioFile(file) {
1414
+ const formData = new FormData();
1415
+ formData.append('audio', file);
1416
+
1417
+ try {
1418
+ showSuccess('오디오 파일 처리 중...');
1419
+ const response = await fetch('/whisper/audio', {
1420
+ method: 'POST',
1421
+ body: formData
1422
+ });
1423
+
1424
+ const result = await response.json();
1425
+ if (result.error) {
1426
+ showError(result.error);
1427
+ } else {
1428
+ document.getElementById('audio-original').textContent = result.text;
1429
+ document.getElementById('audio-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1430
+ }
1431
+ } catch (error) {
1432
+ showError('오디오 파일 처리 중 오류가 발생했습니다.');
1433
+ }
1434
+ }
1435
+
1436
+ // Process video file
1437
+ async function processVideoFile(file) {
1438
+ const formData = new FormData();
1439
+ formData.append('video', file);
1440
+
1441
+ try {
1442
+ showSuccess('비디오 파일 처리 중... (시간이 걸릴 수 있습니다)');
1443
+ const response = await fetch('/whisper/video', {
1444
+ method: 'POST',
1445
+ body: formData
1446
+ });
1447
+
1448
+ const result = await response.json();
1449
+ if (result.error) {
1450
+ showError(result.error);
1451
+ } else {
1452
+ document.getElementById('video-original').textContent = result.text;
1453
+ document.getElementById('video-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1454
+ }
1455
+ } catch (error) {
1456
+ showError('비디오 파일 처리 중 오류가 발생했습니다.');
1457
+ }
1458
+ }
1459
+
1460
+ // Process PDF file
1461
+ async function processPDFFile(file) {
1462
+ const formData = new FormData();
1463
+ formData.append('pdf', file);
1464
+ formData.append('max_pages', document.getElementById('pdf-max-pages').value);
1465
+
1466
+ try {
1467
+ showSuccess('PDF 파일 처리 중...');
1468
+ const response = await fetch('/whisper/pdf', {
1469
+ method: 'POST',
1470
+ body: formData
1471
+ });
1472
+
1473
+ const result = await response.json();
1474
+ if (result.error) {
1475
+ showError(result.error);
1476
+ } else {
1477
+ document.getElementById('pdf-original').textContent = result.text;
1478
+ document.getElementById('pdf-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
1479
+ }
1480
+ } catch (error) {
1481
+ showError('PDF 파일 처리 중 오류가 발생했습니다.');
1482
+ }
1483
+ }
1484
+
1485
+ // Realtime recording
1486
+ let realtimeEventSource = null;
1487
+
1488
+ async function toggleRealtimeRecording() {
1489
+ const btn = document.getElementById('realtime-start-btn');
1490
+ const status = document.getElementById('realtime-status');
1491
+ const output = document.getElementById('realtime-output');
1492
+
1493
+ if (!isRealtimeRecording) {
1494
+ try {
1495
+ const response = await fetch('/whisper/realtime/start', {
1496
+ method: 'POST'
1497
+ });
1498
+
1499
+ const result = await response.json();
1500
+ if (result.session_id) {
1501
+ // Start listening for results
1502
+ realtimeEventSource = new EventSource(`/whisper/realtime/stream?session_id=${result.session_id}`);
1503
+
1504
+ realtimeEventSource.onmessage = (event) => {
1505
+ const data = JSON.parse(event.data);
1506
+ if (data.timestamp) {
1507
+ const segment = document.createElement('div');
1508
+ segment.style.borderBottom = '1px solid #333';
1509
+ segment.style.paddingBottom = '15px';
1510
+ segment.style.marginBottom = '15px';
1511
+ segment.innerHTML = `
1512
+ <strong>[${data.timestamp}]</strong><br>
1513
+ <strong>[KO]</strong> ${data.text}<br>
1514
+ ${data.translation.replace(/\n/g, '<br>')}
1515
+ `;
1516
+ output.insertBefore(segment, output.firstChild);
1517
+ }
1518
+ };
1519
+
1520
+ realtimeEventSource.onerror = () => {
1521
+ stopRealtimeRecording();
1522
+ showError('실시간 통역 연결이 끊어졌습니다.');
1523
+ };
1524
+
1525
+ // Start recording
1526
+ realtimeStream = await navigator.mediaDevices.getUserMedia({ audio: true });
1527
+
1528
+ isRealtimeRecording = true;
1529
+ btn.textContent = '통역 중지';
1530
+ status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>실시간 통역 중...</div>';
1531
+
1532
+ // Send audio data periodically
1533
+ startRealtimeAudioCapture(result.session_id);
1534
+ }
1535
+ } catch (err) {
1536
+ showError('실시간 통역을 시작할 수 없습니다.');
1537
+ }
1538
+ } else {
1539
+ stopRealtimeRecording();
1540
+ }
1541
+ }
1542
+
1543
+ function stopRealtimeRecording() {
1544
+ if (realtimeEventSource) {
1545
+ realtimeEventSource.close();
1546
+ realtimeEventSource = null;
1547
+ }
1548
+
1549
+ if (realtimeStream) {
1550
+ realtimeStream.getTracks().forEach(track => track.stop());
1551
+ realtimeStream = null;
1552
+ }
1553
+
1554
+ isRealtimeRecording = false;
1555
+ document.getElementById('realtime-start-btn').textContent = '실시간 통역 시작';
1556
+ document.getElementById('realtime-status').textContent = '';
1557
+ }
1558
+
1559
+ async function startRealtimeAudioCapture(sessionId) {
1560
+ const audioContext = new AudioContext({ sampleRate: 16000 });
1561
+ const source = audioContext.createMediaStreamSource(realtimeStream);
1562
+ const processor = audioContext.createScriptProcessor(4096, 1, 1);
1563
+
1564
+ let audioBuffer = [];
1565
+ let silenceFrames = 0;
1566
+ const SILENCE_THRESHOLD = 0.01;
1567
+ const MIN_BUFFER_SIZE = 16000 * 2; // 2 seconds minimum
1568
+ const MAX_BUFFER_SIZE = 16000 * 15; // 15 seconds maximum
1569
+
1570
+ processor.onaudioprocess = async (e) => {
1571
+ if (!isRealtimeRecording) return;
1572
+
1573
+ const inputData = e.inputBuffer.getChannelData(0);
1574
+ audioBuffer.push(...inputData);
1575
+
1576
+ // Simple voice activity detection
1577
+ const rms = Math.sqrt(inputData.reduce((sum, val) => sum + val * val, 0) / inputData.length);
1578
+
1579
+ if (rms < SILENCE_THRESHOLD) {
1580
+ silenceFrames++;
1581
+ } else {
1582
+ silenceFrames = 0;
1583
+ }
1584
+
1585
+ // Send audio when we have enough silence or max buffer reached
1586
+ if ((silenceFrames > 20 && audioBuffer.length > MIN_BUFFER_SIZE) ||
1587
+ audioBuffer.length > MAX_BUFFER_SIZE) {
1588
+
1589
+ const audioData = new Float32Array(audioBuffer);
1590
+ audioBuffer = [];
1591
+ silenceFrames = 0;
1592
+
1593
+ // Convert to 16-bit PCM
1594
+ const pcmData = new Int16Array(audioData.length);
1595
+ for (let i = 0; i < audioData.length; i++) {
1596
+ pcmData[i] = Math.max(-32768, Math.min(32767, audioData[i] * 32768));
1597
+ }
1598
+
1599
+ // Send to server
1600
+ const formData = new FormData();
1601
+ formData.append('audio', new Blob([pcmData.buffer], { type: 'audio/pcm' }));
1602
+ formData.append('session_id', sessionId);
1603
+
1604
+ fetch('/whisper/realtime/process', {
1605
+ method: 'POST',
1606
+ body: formData
1607
+ }).catch(err => console.error('Error sending audio:', err));
1608
+ }
1609
+ };
1610
+
1611
+ source.connect(processor);
1612
+ processor.connect(audioContext.destination);
1613
+ }
1614
+
1615
  // Initialize on page load
1616
  window.addEventListener('DOMContentLoaded', () => {
1617
  sendButton.style.display = 'block';
 
1621
 
1622
  </html>"""
1623
 
1624
# Whisper model loader
def _get_whisper_pipe():
    """Return the process-wide Whisper ASR pipeline, creating it on first use.

    The pipeline is memoized as an attribute on the function itself so every
    caller shares one (expensive) model instance.
    """
    cached = getattr(_get_whisper_pipe, "pipe", None)
    if cached is None:
        # Prefer the first CUDA device when available, else run on CPU.
        target_device = 0 if torch.cuda.is_available() else "cpu"
        cached = pipeline(
            task="automatic-speech-recognition",
            model=WHISPER_MODEL_NAME,
            chunk_length_s=30,
            device=target_device,
        )
        _get_whisper_pipe.pipe = cached
    return cached
1636
+
1637
# Audio helpers for Whisper
def _ensure_16k_whisper(y: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
    """Return *y* resampled to Whisper's 16 kHz rate (no-op when already 16 kHz)."""
    if sr == WHISPER_SAMPLE_RATE:
        return y.astype(np.float32), WHISPER_SAMPLE_RATE
    # Polyphase resampling with the smallest integer up/down factors.
    common = np.gcd(sr, WHISPER_SAMPLE_RATE)
    resampled = sps.resample_poly(y, WHISPER_SAMPLE_RATE // common, sr // common)
    return resampled.astype(np.float32), WHISPER_SAMPLE_RATE
1645
+
1646
def _should_flush_whisper(buffer: np.ndarray, sr: int) -> bool:
    """Decide whether the rolling audio buffer is ready to be transcribed.

    Flush once the segment is at least MIN_SEG_SEC long and either the trailing
    SILENCE_SEC window is quiet (speaker paused) or MAX_SEG_SEC is reached.
    """
    duration = len(buffer) / sr
    if duration < MIN_SEG_SEC:
        return False
    if duration >= MAX_SEG_SEC:
        return True
    tail = buffer[-int(SILENCE_SEC * sr):]
    # An empty tail counts as non-silent (rms sentinel 1.0), as before.
    rms = np.sqrt(np.mean(tail ** 2)) if len(tail) else 1.0
    return rms < SILENCE_THRESH
1656
+
1657
+ # Translation helper
1658
+ def _translate_text_4langs(text: str) -> str:
1659
+ """Translate text to 4 languages using OpenAI"""
1660
+ try:
1661
+ client = openai.OpenAI()
1662
+ prompt = (
1663
+ "Translate the following text into English (EN), Chinese (ZH), Thai (TH) and Russian (RU).\n"
1664
+ "Return ONLY the translations in this format (one per line):\n"
1665
+ "EN: <english>\nZH: <chinese>\nTH: <thai>\nRU: <russian>\n\n"
1666
+ f"Text: {text}"
1667
+ )
1668
+
1669
+ response = client.chat.completions.create(
1670
+ model="gpt-4o-mini",
1671
+ messages=[
1672
+ {"role": "system", "content": "You are a professional translator."},
1673
+ {"role": "user", "content": prompt}
1674
+ ],
1675
+ temperature=0.7,
1676
+ max_tokens=512
1677
+ )
1678
+
1679
+ return response.choices[0].message.content.strip()
1680
+ except Exception as e:
1681
+ print(f"Translation error: {e}")
1682
+ return f"Translation error: {str(e)}"
1683
+
1684
+ # ffmpeg check
1685
+ def _check_ffmpeg() -> bool:
1686
+ try:
1687
+ subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
1688
+ return True
1689
+ except Exception:
1690
+ return False
1691
+
1692
+ _HAS_FFMPEG = _check_ffmpeg()
1693
+
1694
+ def extract_audio_from_video(video_path: str) -> str:
1695
+ """Extract audio from video file"""
1696
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
1697
+ tmp.close()
1698
+
1699
+ if _HAS_FFMPEG:
1700
+ cmd = [
1701
+ "ffmpeg", "-i", video_path, "-vn",
1702
+ "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", tmp.name
1703
+ ]
1704
+ result = subprocess.run(cmd, capture_output=True)
1705
+ if result.returncode != 0:
1706
+ os.unlink(tmp.name)
1707
+ raise RuntimeError("ffmpeg error extracting audio")
1708
+ return tmp.name
1709
+ else:
1710
+ raise RuntimeError("ffmpeg is required for video processing")
1711
+
1712
# GPU workers for Whisper
def gpu_transcribe_whisper(audio_path: str) -> str:
    """Run Whisper ASR over the audio file at *audio_path*; return the stripped text."""
    asr = _get_whisper_pipe()
    output = asr(
        audio_path,
        batch_size=WHISPER_BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return output["text"].strip()
1718
+
1719
def gpu_asr_translate_whisper(audio: np.ndarray, sr: int) -> str:
    """Transcribe one realtime audio segment and format a timestamped HTML line.

    Output layout: "[timestamp]<br>[KO] <transcript><br><translations><br>----<br>"
    (consumed by the realtime SSE endpoint).
    """
    asr = _get_whisper_pipe()
    korean = asr({"array": audio, "sampling_rate": sr}, batch_size=WHISPER_BATCH_SIZE)["text"].strip()
    translated = _translate_text_4langs(korean).replace("\n", "<br>")
    stamp = datetime.now(SEOUL_TZ).strftime("%Y-%m-%d %H:%M:%S")
    divider = "-" * 40
    return f"[{stamp}]<br>[KO] {korean}<br>{translated}<br>{divider}<br>"
1726
 
1727
  class BraveSearchClient:
1728
  """Brave Search API client"""
 
1773
# Store connection settings
# NOTE(review): presumably keyed by a per-client connection id set elsewhere
# in this file — confirm against the connection handlers.
connection_settings = {}

# Store realtime sessions
# session_id (random hex) -> {"buffer": list, "queue": asyncio.Queue,
# "active": bool}. Entries are created by /whisper/realtime/start and torn
# down when the /whisper/realtime/stream SSE generator finishes.
realtime_sessions = {}

# Initialize OpenAI client for text chat
client = openai.AsyncOpenAI()
1781
 
 
2435
  return StreamingResponse(output_stream(), media_type="text/event-stream")
2436
 
2437
 
2438
# Whisper endpoints
@app.post("/whisper/transcribe")
async def whisper_transcribe(audio: UploadFile = File(...)):
    """Transcribe an uploaded microphone recording (webm) and translate it.

    Returns {"text": ..., "translation": ...} on success, {"error": ...} on failure.
    """
    tmp_path = None
    try:
        # Persist the upload so the Whisper pipeline can read it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as tmp:
            tmp.write(await audio.read())
            tmp_path = tmp.name

        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(
            whisper_executor, gpu_transcribe_whisper, tmp_path
        )
        # Fix: translation is blocking network I/O — keep it off the event loop.
        translation = await loop.run_in_executor(None, _translate_text_4langs, text)

        return {"text": text, "translation": translation}

    except Exception as e:
        print(f"Whisper transcribe error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: always remove the temp file, even when transcription fails
        # (the original leaked it on any exception).
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
2465
+
2466
+
2467
@app.post("/whisper/audio")
async def whisper_audio(audio: UploadFile = File(...)):
    """Transcribe and translate an uploaded audio file of any supported format."""
    tmp_path = None
    try:
        # Keep the original extension so the decoder picks the right format;
        # fix: tolerate a missing filename (Path(None) raises TypeError).
        suffix = Path(audio.filename or "").suffix
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await audio.read())
            tmp_path = tmp.name

        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(
            whisper_executor, gpu_transcribe_whisper, tmp_path
        )
        # Blocking OpenAI call — run it off the event loop.
        translation = await loop.run_in_executor(None, _translate_text_4langs, text)

        return {"text": text, "translation": translation}

    except Exception as e:
        print(f"Whisper audio error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: clean the temp file up even on failure (original leaked it).
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
2493
+
2494
+
2495
@app.post("/whisper/video")
async def whisper_video(video: UploadFile = File(...)):
    """Extract the audio track of an uploaded video, then transcribe and translate it."""
    tmp_path = None
    audio_path = None
    try:
        # Fix: tolerate a missing filename (Path(None) raises TypeError).
        suffix = Path(video.filename or "").suffix
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await video.read())
            tmp_path = tmp.name

        loop = asyncio.get_event_loop()
        # ffmpeg extraction is heavy; keep it off the event loop.
        audio_path = await loop.run_in_executor(
            None, extract_audio_from_video, tmp_path
        )
        text = await loop.run_in_executor(
            whisper_executor, gpu_transcribe_whisper, audio_path
        )
        translation = await loop.run_in_executor(None, _translate_text_4langs, text)

        return {"text": text, "translation": translation}

    except Exception as e:
        print(f"Whisper video error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: remove both temp files even on failure (original leaked them
        # whenever extraction or transcription raised).
        for path in (tmp_path, audio_path):
            if path and os.path.exists(path):
                os.unlink(path)
2527
+
2528
+
2529
@app.post("/whisper/pdf")
async def whisper_pdf(pdf: UploadFile = File(...), max_pages: int = Form(10)):
    """Extract text from the first *max_pages* pages of an uploaded PDF and translate it.

    Returns {"text": <joined pages>, "translation": <per-page translations>}.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(await pdf.read())
            tmp_path = tmp.name

        # Collect per-page text, skipping pages with nothing extractable.
        extracted = []
        with pdfplumber.open(tmp_path) as pdf_doc:
            for idx, pg in enumerate(pdf_doc.pages[:max_pages], start=1):
                txt = pg.extract_text() or ""
                if txt.strip():
                    extracted.append(f"[Page {idx}]\n{txt}")

        full_text = "\n\n".join(extracted)

        # Translate page bodies while keeping the "[Page N]" header verbatim.
        translated = []
        for page_text in extracted:
            header, _, body = page_text.partition("\n")
            translated.append(header + "\n" + _translate_text_4langs(body))

        return {"text": full_text, "translation": "\n\n".join(translated)}

    except Exception as e:
        print(f"Whisper PDF error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: remove the temp PDF even when extraction/translation fails
        # (the original leaked it on any exception).
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
2564
+
2565
+
2566
@app.post("/whisper/realtime/start")
async def whisper_realtime_start():
    """Allocate a new realtime-transcription session.

    Registers an empty audio buffer, a result queue and an active flag under a
    random 128-bit hex id, then returns that id to the client.
    """
    sid = os.urandom(16).hex()
    session = {"buffer": [], "queue": asyncio.Queue(), "active": True}
    realtime_sessions[sid] = session
    return {"session_id": sid}
2576
+
2577
+
2578
@app.post("/whisper/realtime/process")
async def whisper_realtime_process(
    audio: UploadFile = File(...),
    session_id: str = Form(...)
):
    """Process realtime audio chunk.

    Accepts a raw PCM chunk from the browser, transcribes/translates it on the
    Whisper executor, and queues the parsed segment for the session's SSE stream.
    """
    if session_id not in realtime_sessions:
        return {"error": "Invalid session"}

    try:
        # Read audio data
        # The browser sends raw 16-bit PCM (see the Int16Array capture code);
        # normalise to float32 in [-1, 1) for the Whisper pipeline.
        content = await audio.read()
        audio_array = np.frombuffer(content, dtype=np.int16).astype(np.float32) / 32768.0

        # Process in executor
        result = await asyncio.get_event_loop().run_in_executor(
            whisper_executor, gpu_asr_translate_whisper, audio_array, WHISPER_SAMPLE_RATE
        )

        # Parse result
        # NOTE(review): this parsing is tightly coupled to the exact
        # "[ts]<br>[KO] text<br>translations...<br>----<br>" layout produced by
        # gpu_asr_translate_whisper — changing either side breaks the other.
        lines = result.split('<br>')
        timestamp = lines[0].strip('[]') if lines else ""
        text = lines[1].replace('[KO]', '').strip() if len(lines) > 1 else ""
        # Drops the trailing divider and the empty final element.
        translation = '<br>'.join(lines[2:-2]) if len(lines) > 3 else ""

        # Queue result
        # Hand the parsed segment to the SSE generator for this session.
        await realtime_sessions[session_id]["queue"].put({
            "timestamp": timestamp,
            "text": text,
            "translation": translation
        })

        return {"status": "ok"}

    except Exception as e:
        print(f"Realtime process error: {e}")
        return {"error": str(e)}
2615
+
2616
+
2617
@app.get("/whisper/realtime/stream")
async def whisper_realtime_stream(session_id: str):
    """Stream queued transcription segments for a session as server-sent events."""
    if session_id not in realtime_sessions:
        return JSONResponse({"error": "Invalid session"}, status_code=404)

    async def stream_results():
        session = realtime_sessions[session_id]
        try:
            while session["active"]:
                try:
                    item = await asyncio.wait_for(session["queue"].get(), timeout=1.0)
                except asyncio.TimeoutError:
                    # Heartbeat so proxies and the browser keep the SSE open.
                    yield f"data: {json.dumps({'keepalive': True})}\n\n"
                else:
                    yield f"data: {json.dumps(item)}\n\n"
        except Exception as e:
            print(f"Stream error: {e}")
        finally:
            # Tear the session down when the client disconnects or the loop ends.
            realtime_sessions.pop(session_id, None)

    return StreamingResponse(stream_results(), media_type="text/event-stream")
2640
+
2641
+
2642
  @app.get("/")
2643
  async def index():
2644
  """Serve the HTML page"""