Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -6,8 +6,8 @@ import os
|
|
6 |
import numpy as np
|
7 |
import openai
|
8 |
from dotenv import load_dotenv
|
9 |
-
from fastapi import FastAPI, Request
|
10 |
-
from fastapi.responses import HTMLResponse, StreamingResponse
|
11 |
from fastrtc import (
|
12 |
AdditionalOutputs,
|
13 |
AsyncStreamHandler,
|
@@ -23,10 +23,35 @@ import gradio as gr
|
|
23 |
import io
|
24 |
from scipy import signal
|
25 |
import wave
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
load_dotenv()
|
28 |
|
29 |
SAMPLE_RATE = 24000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
# Supported languages for OpenAI Realtime API
|
32 |
SUPPORTED_LANGUAGES = {
|
@@ -61,14 +86,14 @@ SUPPORTED_LANGUAGES = {
|
|
61 |
"ms": "Bahasa Melayu (Malay)"
|
62 |
}
|
63 |
|
64 |
-
# HTML content embedded as a string
|
65 |
HTML_CONTENT = """<!DOCTYPE html>
|
66 |
<html lang="ko">
|
67 |
|
68 |
<head>
|
69 |
<meta charset="UTF-8">
|
70 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
71 |
-
<title>Mouth of 'MOUSE'</title>
|
72 |
<style>
|
73 |
:root {
|
74 |
--primary-color: #6f42c1;
|
@@ -110,6 +135,43 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
110 |
flex-shrink: 0;
|
111 |
background-color: var(--card-bg);
|
112 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
.main-content {
|
114 |
display: flex;
|
115 |
gap: 20px;
|
@@ -219,7 +281,7 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
219 |
.text-input-section {
|
220 |
margin-top: 15px;
|
221 |
}
|
222 |
-
input[type="text"], textarea {
|
223 |
width: 100%;
|
224 |
background-color: var(--dark-bg);
|
225 |
color: var(--text-color);
|
@@ -318,11 +380,91 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
318 |
flex-shrink: 0;
|
319 |
padding-top: 20px;
|
320 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
/* Responsive design */
|
322 |
@media (max-width: 1024px) {
|
323 |
.sidebar {
|
324 |
width: 300px;
|
325 |
}
|
|
|
|
|
|
|
326 |
}
|
327 |
@media (max-width: 768px) {
|
328 |
.main-content {
|
@@ -335,6 +477,9 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
335 |
.chat-section {
|
336 |
height: 400px;
|
337 |
}
|
|
|
|
|
|
|
338 |
}
|
339 |
button {
|
340 |
background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
|
@@ -432,6 +577,10 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
432 |
background-color: #ff9800;
|
433 |
color: white;
|
434 |
}
|
|
|
|
|
|
|
|
|
435 |
.status-indicator {
|
436 |
display: inline-flex;
|
437 |
align-items: center;
|
@@ -508,7 +657,7 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
508 |
<div class="mouse-ears mouse-ear-right"></div>
|
509 |
<div class="mouse-face"></div>
|
510 |
</div>
|
511 |
-
<h1>MOUSE
|
512 |
</div>
|
513 |
<div class="status-indicator">
|
514 |
<div id="status-dot" class="status-dot disconnected"></div>
|
@@ -516,81 +665,218 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
516 |
</div>
|
517 |
</div>
|
518 |
|
519 |
-
<div class="
|
520 |
-
<
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
</div>
|
529 |
</div>
|
530 |
-
<div class="
|
531 |
-
<
|
532 |
-
<
|
533 |
-
<option value="">비활성화</option>
|
534 |
-
<option value="ko">한국어 (Korean)</option>
|
535 |
-
<option value="en">English</option>
|
536 |
-
<option value="es">Español (Spanish)</option>
|
537 |
-
<option value="fr">Français (French)</option>
|
538 |
-
<option value="de">Deutsch (German)</option>
|
539 |
-
<option value="it">Italiano (Italian)</option>
|
540 |
-
<option value="pt">Português (Portuguese)</option>
|
541 |
-
<option value="ru">Русский (Russian)</option>
|
542 |
-
<option value="ja">日本語 (Japanese)</option>
|
543 |
-
<option value="zh">中文 (Chinese)</option>
|
544 |
-
<option value="ar">العربية (Arabic)</option>
|
545 |
-
<option value="hi">हिन्दी (Hindi)</option>
|
546 |
-
<option value="nl">Nederlands (Dutch)</option>
|
547 |
-
<option value="pl">Polski (Polish)</option>
|
548 |
-
<option value="tr">Türkçe (Turkish)</option>
|
549 |
-
<option value="vi">Tiếng Việt (Vietnamese)</option>
|
550 |
-
<option value="th">ไทย (Thai)</option>
|
551 |
-
<option value="id">Bahasa Indonesia</option>
|
552 |
-
<option value="sv">Svenska (Swedish)</option>
|
553 |
-
<option value="da">Dansk (Danish)</option>
|
554 |
-
<option value="no">Norsk (Norwegian)</option>
|
555 |
-
<option value="fi">Suomi (Finnish)</option>
|
556 |
-
<option value="he">עברית (Hebrew)</option>
|
557 |
-
<option value="uk">Українська (Ukrainian)</option>
|
558 |
-
<option value="cs">Čeština (Czech)</option>
|
559 |
-
<option value="el">Ελληνικά (Greek)</option>
|
560 |
-
<option value="ro">Română (Romanian)</option>
|
561 |
-
<option value="hu">Magyar (Hungarian)</option>
|
562 |
-
<option value="ms">Bahasa Melayu (Malay)</option>
|
563 |
-
</select>
|
564 |
</div>
|
565 |
</div>
|
566 |
-
|
567 |
-
|
568 |
-
<
|
569 |
</div>
|
570 |
</div>
|
571 |
|
572 |
-
<div class="
|
573 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
574 |
</div>
|
575 |
</div>
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
586 |
</div>
|
587 |
</div>
|
588 |
</div>
|
589 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
590 |
</div>
|
591 |
<audio id="audio-output"></audio>
|
592 |
|
593 |
<script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
594 |
let peerConnection;
|
595 |
let webrtc_id;
|
596 |
let webSearchEnabled = false;
|
@@ -612,6 +898,13 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
612 |
let dataChannel = null;
|
613 |
let isVoiceActive = false;
|
614 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
615 |
// Web search toggle functionality
|
616 |
searchToggle.addEventListener('click', () => {
|
617 |
webSearchEnabled = !webSearchEnabled;
|
@@ -708,6 +1001,7 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
708 |
isVoiceActive = false;
|
709 |
}
|
710 |
}
|
|
|
711 |
function updateButtonState() {
|
712 |
const button = document.getElementById('start-button');
|
713 |
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
@@ -737,6 +1031,7 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
737 |
updateStatus('disconnected');
|
738 |
}
|
739 |
}
|
|
|
740 |
function setupAudioVisualization(stream) {
|
741 |
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
742 |
analyser = audioContext.createAnalyser();
|
@@ -771,6 +1066,7 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
771 |
|
772 |
updateAudioLevel();
|
773 |
}
|
|
|
774 |
function showError(message) {
|
775 |
const toast = document.getElementById('error-toast');
|
776 |
toast.textContent = message;
|
@@ -780,6 +1076,17 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
780 |
toast.style.display = 'none';
|
781 |
}, 5000);
|
782 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
783 |
async function setupWebRTC() {
|
784 |
const config = __RTC_CONFIGURATION__;
|
785 |
peerConnection = new RTCPeerConnection(config);
|
@@ -845,14 +1152,6 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
845 |
});
|
846 |
webrtc_id = Math.random().toString(36).substring(7);
|
847 |
|
848 |
-
// Log current settings before sending
|
849 |
-
console.log('Sending offer with settings:', {
|
850 |
-
webrtc_id: webrtc_id,
|
851 |
-
web_search_enabled: webSearchEnabled,
|
852 |
-
target_language: selectedLanguage,
|
853 |
-
system_prompt: systemPrompt
|
854 |
-
});
|
855 |
-
|
856 |
const response = await fetch('/webrtc/offer', {
|
857 |
method: 'POST',
|
858 |
headers: { 'Content-Type': 'application/json' },
|
@@ -897,6 +1196,7 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
897 |
stop();
|
898 |
}
|
899 |
}
|
|
|
900 |
function addMessage(role, content) {
|
901 |
const messageDiv = document.createElement('div');
|
902 |
messageDiv.classList.add('message', role);
|
@@ -986,6 +1286,7 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
986 |
webrtc_id = null;
|
987 |
}
|
988 |
}
|
|
|
989 |
startButton.addEventListener('click', () => {
|
990 |
console.log('clicked');
|
991 |
console.log(peerConnection, peerConnection?.connectionState);
|
@@ -997,6 +1298,320 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
997 |
}
|
998 |
});
|
999 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1000 |
// Initialize on page load
|
1001 |
window.addEventListener('DOMContentLoaded', () => {
|
1002 |
sendButton.style.display = 'block';
|
@@ -1006,6 +1621,108 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
1006 |
|
1007 |
</html>"""
|
1008 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1009 |
|
1010 |
class BraveSearchClient:
|
1011 |
"""Brave Search API client"""
|
@@ -1056,6 +1773,9 @@ print(f"Search client initialized: {search_client is not None}, API key present:
|
|
1056 |
# Store connection settings
|
1057 |
connection_settings = {}
|
1058 |
|
|
|
|
|
|
|
1059 |
# Initialize OpenAI client for text chat
|
1060 |
client = openai.AsyncOpenAI()
|
1061 |
|
@@ -1715,6 +2435,210 @@ async def outputs(webrtc_id: str):
|
|
1715 |
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
1716 |
|
1717 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1718 |
@app.get("/")
|
1719 |
async def index():
|
1720 |
"""Serve the HTML page"""
|
|
|
6 |
import numpy as np
|
7 |
import openai
|
8 |
from dotenv import load_dotenv
|
9 |
+
from fastapi import FastAPI, Request, UploadFile, File, Form
|
10 |
+
from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
|
11 |
from fastrtc import (
|
12 |
AdditionalOutputs,
|
13 |
AsyncStreamHandler,
|
|
|
23 |
import io
|
24 |
from scipy import signal
|
25 |
import wave
|
26 |
+
import torch
|
27 |
+
from transformers import pipeline
|
28 |
+
import tempfile
|
29 |
+
import subprocess
|
30 |
+
import pdfplumber
|
31 |
+
import scipy.signal as sps
|
32 |
+
from datetime import datetime
|
33 |
+
from zoneinfo import ZoneInfo
|
34 |
+
import concurrent.futures
|
35 |
|
36 |
load_dotenv()
|
37 |
|
38 |
SAMPLE_RATE = 24000
|
39 |
+
WHISPER_SAMPLE_RATE = 16000
|
40 |
+
SEOUL_TZ = ZoneInfo("Asia/Seoul")
|
41 |
+
|
42 |
+
# Whisper model settings
|
43 |
+
WHISPER_MODEL_NAME = "openai/whisper-large-v3-turbo"
|
44 |
+
WHISPER_BATCH_SIZE = 8
|
45 |
+
|
46 |
+
# Real-time segmentation parameters
|
47 |
+
MIN_SEG_SEC = 10
|
48 |
+
MAX_SEG_SEC = 15
|
49 |
+
SILENCE_SEC = 0.6
|
50 |
+
SILENCE_THRESH = 1e-4
|
51 |
+
|
52 |
+
# CPU-side pool for Whisper tasks
|
53 |
+
whisper_executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
|
54 |
+
whisper_futures_queue: list[concurrent.futures.Future] = []
|
55 |
|
56 |
# Supported languages for OpenAI Realtime API
|
57 |
SUPPORTED_LANGUAGES = {
|
|
|
86 |
"ms": "Bahasa Melayu (Malay)"
|
87 |
}
|
88 |
|
89 |
+
# HTML content embedded as a string (extended with new tabs)
|
90 |
HTML_CONTENT = """<!DOCTYPE html>
|
91 |
<html lang="ko">
|
92 |
|
93 |
<head>
|
94 |
<meta charset="UTF-8">
|
95 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
96 |
+
<title>Mouth of 'MOUSE' - Extended</title>
|
97 |
<style>
|
98 |
:root {
|
99 |
--primary-color: #6f42c1;
|
|
|
135 |
flex-shrink: 0;
|
136 |
background-color: var(--card-bg);
|
137 |
}
|
138 |
+
.tabs-container {
|
139 |
+
display: flex;
|
140 |
+
gap: 10px;
|
141 |
+
margin-bottom: 20px;
|
142 |
+
border-bottom: 2px solid var(--border-color);
|
143 |
+
padding-bottom: 10px;
|
144 |
+
overflow-x: auto;
|
145 |
+
scrollbar-width: thin;
|
146 |
+
scrollbar-color: var(--primary-color) var(--card-bg);
|
147 |
+
}
|
148 |
+
.tab-button {
|
149 |
+
padding: 10px 20px;
|
150 |
+
background-color: var(--card-bg);
|
151 |
+
color: var(--text-color);
|
152 |
+
border: 1px solid var(--border-color);
|
153 |
+
border-radius: 8px 8px 0 0;
|
154 |
+
cursor: pointer;
|
155 |
+
transition: all 0.3s;
|
156 |
+
white-space: nowrap;
|
157 |
+
font-size: 14px;
|
158 |
+
}
|
159 |
+
.tab-button:hover {
|
160 |
+
background-color: var(--secondary-color);
|
161 |
+
}
|
162 |
+
.tab-button.active {
|
163 |
+
background-color: var(--primary-color);
|
164 |
+
border-bottom: 2px solid var(--primary-color);
|
165 |
+
}
|
166 |
+
.tab-content {
|
167 |
+
display: none;
|
168 |
+
flex-grow: 1;
|
169 |
+
overflow: hidden;
|
170 |
+
}
|
171 |
+
.tab-content.active {
|
172 |
+
display: flex;
|
173 |
+
flex-direction: column;
|
174 |
+
}
|
175 |
.main-content {
|
176 |
display: flex;
|
177 |
gap: 20px;
|
|
|
281 |
.text-input-section {
|
282 |
margin-top: 15px;
|
283 |
}
|
284 |
+
input[type="text"], input[type="file"], textarea {
|
285 |
width: 100%;
|
286 |
background-color: var(--dark-bg);
|
287 |
color: var(--text-color);
|
|
|
380 |
flex-shrink: 0;
|
381 |
padding-top: 20px;
|
382 |
}
|
383 |
+
/* Whisper Tab Styles */
|
384 |
+
.whisper-container {
|
385 |
+
padding: 20px;
|
386 |
+
background-color: var(--card-bg);
|
387 |
+
border-radius: 12px;
|
388 |
+
border: 1px solid var(--border-color);
|
389 |
+
margin: 20px auto;
|
390 |
+
max-width: 800px;
|
391 |
+
width: 100%;
|
392 |
+
}
|
393 |
+
.whisper-input-section {
|
394 |
+
margin-bottom: 20px;
|
395 |
+
}
|
396 |
+
.whisper-output-section {
|
397 |
+
display: grid;
|
398 |
+
grid-template-columns: 1fr 1fr;
|
399 |
+
gap: 20px;
|
400 |
+
margin-top: 20px;
|
401 |
+
}
|
402 |
+
.whisper-output {
|
403 |
+
background-color: var(--dark-bg);
|
404 |
+
padding: 15px;
|
405 |
+
border-radius: 8px;
|
406 |
+
border: 1px solid var(--border-color);
|
407 |
+
max-height: 300px;
|
408 |
+
overflow-y: auto;
|
409 |
+
}
|
410 |
+
.whisper-output h4 {
|
411 |
+
margin-top: 0;
|
412 |
+
color: var(--primary-color);
|
413 |
+
}
|
414 |
+
.file-upload-area {
|
415 |
+
border: 2px dashed var(--border-color);
|
416 |
+
border-radius: 8px;
|
417 |
+
padding: 30px;
|
418 |
+
text-align: center;
|
419 |
+
cursor: pointer;
|
420 |
+
transition: all 0.3s;
|
421 |
+
}
|
422 |
+
.file-upload-area:hover {
|
423 |
+
border-color: var(--primary-color);
|
424 |
+
background-color: rgba(111, 66, 193, 0.1);
|
425 |
+
}
|
426 |
+
.file-upload-area.drag-over {
|
427 |
+
border-color: var(--primary-color);
|
428 |
+
background-color: rgba(111, 66, 193, 0.2);
|
429 |
+
}
|
430 |
+
.realtime-output {
|
431 |
+
background-color: var(--dark-bg);
|
432 |
+
padding: 20px;
|
433 |
+
border-radius: 8px;
|
434 |
+
margin-top: 20px;
|
435 |
+
min-height: 200px;
|
436 |
+
max-height: 400px;
|
437 |
+
overflow-y: auto;
|
438 |
+
}
|
439 |
+
.recording-indicator {
|
440 |
+
display: inline-flex;
|
441 |
+
align-items: center;
|
442 |
+
gap: 10px;
|
443 |
+
padding: 10px 20px;
|
444 |
+
background-color: #f44336;
|
445 |
+
color: white;
|
446 |
+
border-radius: 20px;
|
447 |
+
animation: pulse 1.5s infinite;
|
448 |
+
}
|
449 |
+
.recording-dot {
|
450 |
+
width: 10px;
|
451 |
+
height: 10px;
|
452 |
+
background-color: white;
|
453 |
+
border-radius: 50%;
|
454 |
+
animation: blink 1s infinite;
|
455 |
+
}
|
456 |
+
@keyframes blink {
|
457 |
+
0%, 50% { opacity: 1; }
|
458 |
+
51%, 100% { opacity: 0; }
|
459 |
+
}
|
460 |
/* Responsive design */
|
461 |
@media (max-width: 1024px) {
|
462 |
.sidebar {
|
463 |
width: 300px;
|
464 |
}
|
465 |
+
.whisper-output-section {
|
466 |
+
grid-template-columns: 1fr;
|
467 |
+
}
|
468 |
}
|
469 |
@media (max-width: 768px) {
|
470 |
.main-content {
|
|
|
477 |
.chat-section {
|
478 |
height: 400px;
|
479 |
}
|
480 |
+
.tabs-container {
|
481 |
+
flex-wrap: wrap;
|
482 |
+
}
|
483 |
}
|
484 |
button {
|
485 |
background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
|
|
|
577 |
background-color: #ff9800;
|
578 |
color: white;
|
579 |
}
|
580 |
+
.toast.success {
|
581 |
+
background-color: #4caf50;
|
582 |
+
color: white;
|
583 |
+
}
|
584 |
.status-indicator {
|
585 |
display: inline-flex;
|
586 |
align-items: center;
|
|
|
657 |
<div class="mouse-ears mouse-ear-right"></div>
|
658 |
<div class="mouse-face"></div>
|
659 |
</div>
|
660 |
+
<h1>MOUSE Extended</h1>
|
661 |
</div>
|
662 |
<div class="status-indicator">
|
663 |
<div id="status-dot" class="status-dot disconnected"></div>
|
|
|
665 |
</div>
|
666 |
</div>
|
667 |
|
668 |
+
<div class="tabs-container">
|
669 |
+
<button class="tab-button active" onclick="switchTab('voice-chat')">음성 채팅</button>
|
670 |
+
<button class="tab-button" onclick="switchTab('mic-whisper')">마이크 전사</button>
|
671 |
+
<button class="tab-button" onclick="switchTab('audio-whisper')">오디오 파일</button>
|
672 |
+
<button class="tab-button" onclick="switchTab('video-whisper')">비디오 파일</button>
|
673 |
+
<button class="tab-button" onclick="switchTab('pdf-whisper')">PDF 번역</button>
|
674 |
+
<button class="tab-button" onclick="switchTab('realtime-whisper')">실시간 통역</button>
|
675 |
+
</div>
|
676 |
+
|
677 |
+
<!-- Voice Chat Tab (Original) -->
|
678 |
+
<div id="voice-chat" class="tab-content active">
|
679 |
+
<div class="main-content">
|
680 |
+
<div class="sidebar">
|
681 |
+
<div class="settings-section">
|
682 |
+
<h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
|
683 |
+
<div class="settings-grid">
|
684 |
+
<div class="setting-item">
|
685 |
+
<span class="setting-label">웹 검색</span>
|
686 |
+
<div id="search-toggle" class="toggle-switch">
|
687 |
+
<div class="toggle-slider"></div>
|
688 |
+
</div>
|
689 |
+
</div>
|
690 |
+
<div class="setting-item">
|
691 |
+
<span class="setting-label">다국어 번역 채팅</span>
|
692 |
+
<select id="language-select">
|
693 |
+
<option value="">비활성화</option>
|
694 |
+
<option value="ko">한국어 (Korean)</option>
|
695 |
+
<option value="en">English</option>
|
696 |
+
<option value="es">Español (Spanish)</option>
|
697 |
+
<option value="fr">Français (French)</option>
|
698 |
+
<option value="de">Deutsch (German)</option>
|
699 |
+
<option value="it">Italiano (Italian)</option>
|
700 |
+
<option value="pt">Português (Portuguese)</option>
|
701 |
+
<option value="ru">Русский (Russian)</option>
|
702 |
+
<option value="ja">日本語 (Japanese)</option>
|
703 |
+
<option value="zh">中文 (Chinese)</option>
|
704 |
+
<option value="ar">العربية (Arabic)</option>
|
705 |
+
<option value="hi">हिन्दी (Hindi)</option>
|
706 |
+
<option value="nl">Nederlands (Dutch)</option>
|
707 |
+
<option value="pl">Polski (Polish)</option>
|
708 |
+
<option value="tr">Türkçe (Turkish)</option>
|
709 |
+
<option value="vi">Tiếng Việt (Vietnamese)</option>
|
710 |
+
<option value="th">ไทย (Thai)</option>
|
711 |
+
<option value="id">Bahasa Indonesia</option>
|
712 |
+
<option value="sv">Svenska (Swedish)</option>
|
713 |
+
<option value="da">Dansk (Danish)</option>
|
714 |
+
<option value="no">Norsk (Norwegian)</option>
|
715 |
+
<option value="fi">Suomi (Finnish)</option>
|
716 |
+
<option value="he">עברית (Hebrew)</option>
|
717 |
+
<option value="uk">Українська (Ukrainian)</option>
|
718 |
+
<option value="cs">Čeština (Czech)</option>
|
719 |
+
<option value="el">Ελληνικά (Greek)</option>
|
720 |
+
<option value="ro">Română (Romanian)</option>
|
721 |
+
<option value="hu">Magyar (Hungarian)</option>
|
722 |
+
<option value="ms">Bahasa Melayu (Malay)</option>
|
723 |
+
</select>
|
724 |
</div>
|
725 |
</div>
|
726 |
+
<div class="text-input-section">
|
727 |
+
<label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
|
728 |
+
<textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
729 |
</div>
|
730 |
</div>
|
731 |
+
|
732 |
+
<div class="controls">
|
733 |
+
<button id="start-button">대화 시작</button>
|
734 |
</div>
|
735 |
</div>
|
736 |
|
737 |
+
<div class="chat-section">
|
738 |
+
<div class="chat-container">
|
739 |
+
<h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
|
740 |
+
<div class="chat-messages" id="chat-messages"></div>
|
741 |
+
<div class="text-input-section" style="margin-top: 10px;">
|
742 |
+
<div style="display: flex; gap: 10px;">
|
743 |
+
<input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
|
744 |
+
<button id="send-button" style="display: none;">전송</button>
|
745 |
+
</div>
|
746 |
+
</div>
|
747 |
+
</div>
|
748 |
</div>
|
749 |
</div>
|
750 |
+
</div>
|
751 |
+
|
752 |
+
<!-- Microphone Whisper Tab -->
|
753 |
+
<div id="mic-whisper" class="tab-content">
|
754 |
+
<div class="whisper-container">
|
755 |
+
<h3>마이크 녹음 → 전사 및 4개 언어 번역</h3>
|
756 |
+
<div class="whisper-input-section">
|
757 |
+
<button id="mic-record-btn" onclick="toggleMicRecording()">녹음 시작</button>
|
758 |
+
<div id="mic-status" style="margin-top: 10px;"></div>
|
759 |
+
</div>
|
760 |
+
<div class="whisper-output-section">
|
761 |
+
<div class="whisper-output">
|
762 |
+
<h4>원문</h4>
|
763 |
+
<div id="mic-original"></div>
|
764 |
+
</div>
|
765 |
+
<div class="whisper-output">
|
766 |
+
<h4>번역 (EN/ZH/TH/RU)</h4>
|
767 |
+
<div id="mic-translation"></div>
|
768 |
+
</div>
|
769 |
+
</div>
|
770 |
+
</div>
|
771 |
+
</div>
|
772 |
+
|
773 |
+
<!-- Audio File Whisper Tab -->
|
774 |
+
<div id="audio-whisper" class="tab-content">
|
775 |
+
<div class="whisper-container">
|
776 |
+
<h3>오디오 파일 → 전사 및 4개 언어 번역</h3>
|
777 |
+
<div class="whisper-input-section">
|
778 |
+
<div class="file-upload-area" id="audio-upload-area">
|
779 |
+
<p>오디오 파일을 드래그하거나 클릭하여 업로드</p>
|
780 |
+
<input type="file" id="audio-file-input" accept="audio/*" style="display: none;">
|
781 |
+
</div>
|
782 |
+
</div>
|
783 |
+
<div class="whisper-output-section">
|
784 |
+
<div class="whisper-output">
|
785 |
+
<h4>원문</h4>
|
786 |
+
<div id="audio-original"></div>
|
787 |
+
</div>
|
788 |
+
<div class="whisper-output">
|
789 |
+
<h4>번역 (EN/ZH/TH/RU)</h4>
|
790 |
+
<div id="audio-translation"></div>
|
791 |
</div>
|
792 |
</div>
|
793 |
</div>
|
794 |
</div>
|
795 |
+
|
796 |
+
<!-- Video File Whisper Tab -->
|
797 |
+
<div id="video-whisper" class="tab-content">
|
798 |
+
<div class="whisper-container">
|
799 |
+
<h3>비디오 파일 → 오디오 추출 → 전사 및 4개 언어 번역</h3>
|
800 |
+
<div class="whisper-input-section">
|
801 |
+
<div class="file-upload-area" id="video-upload-area">
|
802 |
+
<p>비디오 파일을 드래그하거나 클릭하여 업로드</p>
|
803 |
+
<input type="file" id="video-file-input" accept="video/*" style="display: none;">
|
804 |
+
</div>
|
805 |
+
</div>
|
806 |
+
<div class="whisper-output-section">
|
807 |
+
<div class="whisper-output">
|
808 |
+
<h4>원문</h4>
|
809 |
+
<div id="video-original"></div>
|
810 |
+
</div>
|
811 |
+
<div class="whisper-output">
|
812 |
+
<h4>번역 (EN/ZH/TH/RU)</h4>
|
813 |
+
<div id="video-translation"></div>
|
814 |
+
</div>
|
815 |
+
</div>
|
816 |
+
</div>
|
817 |
+
</div>
|
818 |
+
|
819 |
+
<!-- PDF Whisper Tab -->
|
820 |
+
<div id="pdf-whisper" class="tab-content">
|
821 |
+
<div class="whisper-container">
|
822 |
+
<h3>PDF 파일 → 텍스트 추출 → 4개 언어 번역</h3>
|
823 |
+
<div class="whisper-input-section">
|
824 |
+
<div class="file-upload-area" id="pdf-upload-area">
|
825 |
+
<p>PDF 파일을 드래그하거나 클릭하여 업로드</p>
|
826 |
+
<input type="file" id="pdf-file-input" accept=".pdf" style="display: none;">
|
827 |
+
</div>
|
828 |
+
<div class="setting-item" style="margin-top: 15px;">
|
829 |
+
<span class="setting-label">최대 페이지:</span>
|
830 |
+
<input type="number" id="pdf-max-pages" value="10" min="1" max="50" style="width: 80px;">
|
831 |
+
</div>
|
832 |
+
</div>
|
833 |
+
<div class="whisper-output-section">
|
834 |
+
<div class="whisper-output">
|
835 |
+
<h4>추출된 텍스트</h4>
|
836 |
+
<div id="pdf-original"></div>
|
837 |
+
</div>
|
838 |
+
<div class="whisper-output">
|
839 |
+
<h4>번역 (EN/ZH/TH/RU)</h4>
|
840 |
+
<div id="pdf-translation"></div>
|
841 |
+
</div>
|
842 |
+
</div>
|
843 |
+
</div>
|
844 |
+
</div>
|
845 |
+
|
846 |
+
<!-- Realtime Whisper Tab -->
|
847 |
+
<div id="realtime-whisper" class="tab-content">
|
848 |
+
<div class="whisper-container">
|
849 |
+
<h3>실시간 통역 (Korean → EN/ZH/TH/RU)</h3>
|
850 |
+
<p>10-15초 문장 단위로 자동 전환 — 최신 내용이 위에 표시됩니다.</p>
|
851 |
+
<div class="whisper-input-section">
|
852 |
+
<button id="realtime-start-btn" onclick="toggleRealtimeRecording()">실시간 통역 시작</button>
|
853 |
+
<div id="realtime-status" style="margin-top: 10px;"></div>
|
854 |
+
</div>
|
855 |
+
<div class="realtime-output" id="realtime-output"></div>
|
856 |
+
</div>
|
857 |
+
</div>
|
858 |
</div>
|
859 |
<audio id="audio-output"></audio>
|
860 |
|
861 |
<script>
|
862 |
+
// Tab switching functionality
|
863 |
+
function switchTab(tabName) {
|
864 |
+
// Hide all tabs
|
865 |
+
const tabs = document.querySelectorAll('.tab-content');
|
866 |
+
tabs.forEach(tab => tab.classList.remove('active'));
|
867 |
+
|
868 |
+
// Remove active class from all buttons
|
869 |
+
const buttons = document.querySelectorAll('.tab-button');
|
870 |
+
buttons.forEach(btn => btn.classList.remove('active'));
|
871 |
+
|
872 |
+
// Show selected tab
|
873 |
+
document.getElementById(tabName).classList.add('active');
|
874 |
+
|
875 |
+
// Add active class to clicked button
|
876 |
+
event.target.classList.add('active');
|
877 |
+
}
|
878 |
+
|
879 |
+
// Original voice chat variables and functions
|
880 |
let peerConnection;
|
881 |
let webrtc_id;
|
882 |
let webSearchEnabled = false;
|
|
|
898 |
let dataChannel = null;
|
899 |
let isVoiceActive = false;
|
900 |
|
901 |
+
// Whisper-related variables
|
902 |
+
let micRecorder = null;
|
903 |
+
let isRecording = false;
|
904 |
+
let realtimeRecorder = null;
|
905 |
+
let isRealtimeRecording = false;
|
906 |
+
let realtimeStream = null;
|
907 |
+
|
908 |
// Web search toggle functionality
|
909 |
searchToggle.addEventListener('click', () => {
|
910 |
webSearchEnabled = !webSearchEnabled;
|
|
|
1001 |
isVoiceActive = false;
|
1002 |
}
|
1003 |
}
|
1004 |
+
|
1005 |
function updateButtonState() {
|
1006 |
const button = document.getElementById('start-button');
|
1007 |
if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
|
|
|
1031 |
updateStatus('disconnected');
|
1032 |
}
|
1033 |
}
|
1034 |
+
|
1035 |
function setupAudioVisualization(stream) {
|
1036 |
audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
1037 |
analyser = audioContext.createAnalyser();
|
|
|
1066 |
|
1067 |
updateAudioLevel();
|
1068 |
}
|
1069 |
+
|
1070 |
function showError(message) {
|
1071 |
const toast = document.getElementById('error-toast');
|
1072 |
toast.textContent = message;
|
|
|
1076 |
toast.style.display = 'none';
|
1077 |
}, 5000);
|
1078 |
}
|
1079 |
+
|
1080 |
+
function showSuccess(message) {
|
1081 |
+
const toast = document.getElementById('error-toast');
|
1082 |
+
toast.textContent = message;
|
1083 |
+
toast.className = 'toast success';
|
1084 |
+
toast.style.display = 'block';
|
1085 |
+
setTimeout(() => {
|
1086 |
+
toast.style.display = 'none';
|
1087 |
+
}, 3000);
|
1088 |
+
}
|
1089 |
+
|
1090 |
async function setupWebRTC() {
|
1091 |
const config = __RTC_CONFIGURATION__;
|
1092 |
peerConnection = new RTCPeerConnection(config);
|
|
|
1152 |
});
|
1153 |
webrtc_id = Math.random().toString(36).substring(7);
|
1154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1155 |
const response = await fetch('/webrtc/offer', {
|
1156 |
method: 'POST',
|
1157 |
headers: { 'Content-Type': 'application/json' },
|
|
|
1196 |
stop();
|
1197 |
}
|
1198 |
}
|
1199 |
+
|
1200 |
function addMessage(role, content) {
|
1201 |
const messageDiv = document.createElement('div');
|
1202 |
messageDiv.classList.add('message', role);
|
|
|
1286 |
webrtc_id = null;
|
1287 |
}
|
1288 |
}
|
1289 |
+
|
1290 |
startButton.addEventListener('click', () => {
|
1291 |
console.log('clicked');
|
1292 |
console.log(peerConnection, peerConnection?.connectionState);
|
|
|
1298 |
}
|
1299 |
});
|
1300 |
|
1301 |
+
// Whisper Tab Functions
|
1302 |
+
|
1303 |
+
// Microphone recording
|
1304 |
+
async function toggleMicRecording() {
|
1305 |
+
const btn = document.getElementById('mic-record-btn');
|
1306 |
+
const status = document.getElementById('mic-status');
|
1307 |
+
|
1308 |
+
if (!isRecording) {
|
1309 |
+
try {
|
1310 |
+
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
1311 |
+
micRecorder = new MediaRecorder(stream);
|
1312 |
+
const chunks = [];
|
1313 |
+
|
1314 |
+
micRecorder.ondataavailable = (e) => chunks.push(e.data);
|
1315 |
+
micRecorder.onstop = async () => {
|
1316 |
+
const blob = new Blob(chunks, { type: 'audio/webm' });
|
1317 |
+
await processAudioBlob(blob, 'mic');
|
1318 |
+
stream.getTracks().forEach(track => track.stop());
|
1319 |
+
};
|
1320 |
+
|
1321 |
+
micRecorder.start();
|
1322 |
+
isRecording = true;
|
1323 |
+
btn.textContent = '녹음 중지';
|
1324 |
+
status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>녹음 중...</div>';
|
1325 |
+
} catch (err) {
|
1326 |
+
showError('마이크 접근 권한이 필요합니다.');
|
1327 |
+
}
|
1328 |
+
} else {
|
1329 |
+
micRecorder.stop();
|
1330 |
+
isRecording = false;
|
1331 |
+
btn.textContent = '녹음 시작';
|
1332 |
+
status.textContent = '처리 중...';
|
1333 |
+
}
|
1334 |
+
}
|
1335 |
+
|
1336 |
+
// ── Upload wiring: click-to-browse, drag-and-drop, and <input> change
// handlers for the audio / video / pdf upload areas. ─────────────────
['audio', 'video', 'pdf'].forEach(kind => {
    const dropArea = document.getElementById(`${kind}-upload-area`);
    const fileInput = document.getElementById(`${kind}-file-input`);

    // Route a chosen file to the matching processor; ignore empty drops.
    const handleFile = (file) => {
        if (!file) return;
        if (kind === 'audio') processAudioFile(file);
        else if (kind === 'video') processVideoFile(file);
        else processPDFFile(file);
    };

    // Clicking the upload area opens the hidden file picker.
    dropArea.addEventListener('click', () => fileInput.click());

    // Drag-and-drop with visual feedback while hovering.
    dropArea.addEventListener('dragover', (evt) => {
        evt.preventDefault();
        dropArea.classList.add('drag-over');
    });
    dropArea.addEventListener('dragleave', () => dropArea.classList.remove('drag-over'));
    dropArea.addEventListener('drop', (evt) => {
        evt.preventDefault();
        dropArea.classList.remove('drag-over');
        handleFile(evt.dataTransfer.files[0]);
    });

    // Direct selection through the hidden <input type="file">.
    fileInput.addEventListener('change', (evt) => handleFile(evt.target.files[0]));
});
|
1386 |
+
|
1387 |
+
// Upload a recorded blob to /whisper/transcribe and render the result.
// `type` is the DOM id prefix (e.g. 'mic') of the output elements.
async function processAudioBlob(blob, type) {
    const payload = new FormData();
    payload.append('audio', blob, 'recording.webm');

    try {
        const resp = await fetch('/whisper/transcribe', { method: 'POST', body: payload });
        const result = await resp.json();

        if (result.error) {
            showError(result.error);
            return;
        }
        document.getElementById(`${type}-original`).textContent = result.text;
        document.getElementById(`${type}-translation`).innerHTML = result.translation.replace(/\n/g, '<br>');
        document.getElementById(`${type}-status`).textContent = '';
    } catch (error) {
        showError('처리 중 오류가 발생했습니다.');
        document.getElementById(`${type}-status`).textContent = '';
    }
}
|
1411 |
+
|
1412 |
+
// Send an uploaded audio file to /whisper/audio for transcription and
// translation, then render both results.
async function processAudioFile(file) {
    const payload = new FormData();
    payload.append('audio', file);

    try {
        showSuccess('오디오 파일 처리 중...');
        const resp = await fetch('/whisper/audio', { method: 'POST', body: payload });
        const result = await resp.json();

        if (result.error) {
            showError(result.error);
            return;
        }
        document.getElementById('audio-original').textContent = result.text;
        document.getElementById('audio-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
    } catch (error) {
        showError('오디오 파일 처리 중 오류가 발생했습니다.');
    }
}
|
1435 |
+
|
1436 |
+
// Send an uploaded video file to /whisper/video; the server extracts the
// audio track, transcribes and translates it.
async function processVideoFile(file) {
    const payload = new FormData();
    payload.append('video', file);

    try {
        showSuccess('비디오 파일 처리 중... (시간이 걸릴 수 있습니다)');
        const resp = await fetch('/whisper/video', { method: 'POST', body: payload });
        const result = await resp.json();

        if (result.error) {
            showError(result.error);
            return;
        }
        document.getElementById('video-original').textContent = result.text;
        document.getElementById('video-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
    } catch (error) {
        showError('비디오 파일 처리 중 오류가 발생했습니다.');
    }
}
|
1459 |
+
|
1460 |
+
// Send an uploaded PDF to /whisper/pdf, limited to the page count chosen
// in the #pdf-max-pages input; render extracted text and translation.
async function processPDFFile(file) {
    const payload = new FormData();
    payload.append('pdf', file);
    payload.append('max_pages', document.getElementById('pdf-max-pages').value);

    try {
        showSuccess('PDF 파일 처리 중...');
        const resp = await fetch('/whisper/pdf', { method: 'POST', body: payload });
        const result = await resp.json();

        if (result.error) {
            showError(result.error);
            return;
        }
        document.getElementById('pdf-original').textContent = result.text;
        document.getElementById('pdf-translation').innerHTML = result.translation.replace(/\n/g, '<br>');
    } catch (error) {
        showError('PDF 파일 처리 중 오류가 발생했습니다.');
    }
}
|
1484 |
+
|
1485 |
+
// Realtime recording
// SSE connection delivering transcription results for the active session.
let realtimeEventSource = null;

// Toggle a realtime interpretation session: ask the server for a session
// id, subscribe to its SSE result stream, open the microphone, and start
// streaming captured audio to the backend.
async function toggleRealtimeRecording() {
    const btn = document.getElementById('realtime-start-btn');
    const status = document.getElementById('realtime-status');
    const output = document.getElementById('realtime-output');

    if (!isRealtimeRecording) {
        try {
            const response = await fetch('/whisper/realtime/start', {
                method: 'POST'
            });

            const result = await response.json();
            if (result.session_id) {
                // Start listening for results
                realtimeEventSource = new EventSource(`/whisper/realtime/stream?session_id=${result.session_id}`);

                realtimeEventSource.onmessage = (event) => {
                    const data = JSON.parse(event.data);
                    // Keepalive frames carry no timestamp and are skipped.
                    if (data.timestamp) {
                        const segment = document.createElement('div');
                        segment.style.borderBottom = '1px solid #333';
                        segment.style.paddingBottom = '15px';
                        segment.style.marginBottom = '15px';
                        segment.innerHTML = `
                            <strong>[${data.timestamp}]</strong><br>
                            <strong>[KO]</strong> ${data.text}<br>
                            ${data.translation.replace(/\n/g, '<br>')}
                        `;
                        // Newest segment goes on top of the output pane.
                        output.insertBefore(segment, output.firstChild);
                    }
                };

                realtimeEventSource.onerror = () => {
                    stopRealtimeRecording();
                    showError('실시간 통역 연결이 끊어졌습니다.');
                };

                // Start recording
                realtimeStream = await navigator.mediaDevices.getUserMedia({ audio: true });

                isRealtimeRecording = true;
                btn.textContent = '통역 중지';
                status.innerHTML = '<div class="recording-indicator"><div class="recording-dot"></div>실시간 통역 중...</div>';

                // Send audio data periodically
                startRealtimeAudioCapture(result.session_id);
            }
        } catch (err) {
            showError('실시간 통역을 시작할 수 없습니다.');
        }
    } else {
        stopRealtimeRecording();
    }
}
|
1542 |
+
|
1543 |
+
// Tear down the realtime session: close the SSE stream, release the
// microphone tracks, and reset the UI to its idle state.
function stopRealtimeRecording() {
    if (realtimeEventSource) {
        realtimeEventSource.close();
        realtimeEventSource = null;
    }

    if (realtimeStream) {
        realtimeStream.getTracks().forEach(t => t.stop());
        realtimeStream = null;
    }

    isRealtimeRecording = false;
    document.getElementById('realtime-start-btn').textContent = '실시간 통역 시작';
    document.getElementById('realtime-status').textContent = '';
}
|
1558 |
+
|
1559 |
+
// Capture microphone audio at 16 kHz and stream voiced segments to the
// server for the given realtime session. Segments are flushed after a
// short run of silence (simple RMS-based voice-activity detection) or
// when a hard maximum buffer length is reached.
// NOTE(review): ScriptProcessorNode is deprecated in the Web Audio API —
// consider migrating to AudioWorklet.
async function startRealtimeAudioCapture(sessionId) {
    const audioContext = new AudioContext({ sampleRate: 16000 });
    const source = audioContext.createMediaStreamSource(realtimeStream);
    const processor = audioContext.createScriptProcessor(4096, 1, 1);

    let audioBuffer = [];
    let silenceFrames = 0;
    const SILENCE_THRESHOLD = 0.01;     // RMS below this counts as silence
    const MIN_BUFFER_SIZE = 16000 * 2;  // 2 seconds minimum
    const MAX_BUFFER_SIZE = 16000 * 15; // 15 seconds maximum

    processor.onaudioprocess = async (e) => {
        if (!isRealtimeRecording) return;

        const inputData = e.inputBuffer.getChannelData(0);
        audioBuffer.push(...inputData);

        // Simple voice activity detection
        const rms = Math.sqrt(inputData.reduce((sum, val) => sum + val * val, 0) / inputData.length);

        if (rms < SILENCE_THRESHOLD) {
            silenceFrames++;
        } else {
            silenceFrames = 0;
        }

        // Send audio when we have enough silence or max buffer reached
        if ((silenceFrames > 20 && audioBuffer.length > MIN_BUFFER_SIZE) ||
            audioBuffer.length > MAX_BUFFER_SIZE) {

            const audioData = new Float32Array(audioBuffer);
            audioBuffer = [];
            silenceFrames = 0;

            // Convert to 16-bit PCM (clamped to the int16 range) — the
            // server parses the payload as raw little-endian int16.
            const pcmData = new Int16Array(audioData.length);
            for (let i = 0; i < audioData.length; i++) {
                pcmData[i] = Math.max(-32768, Math.min(32767, audioData[i] * 32768));
            }

            // Send to server; fire-and-forget, results come back over SSE.
            const formData = new FormData();
            formData.append('audio', new Blob([pcmData.buffer], { type: 'audio/pcm' }));
            formData.append('session_id', sessionId);

            fetch('/whisper/realtime/process', {
                method: 'POST',
                body: formData
            }).catch(err => console.error('Error sending audio:', err));
        }
    };

    source.connect(processor);
    processor.connect(audioContext.destination);
}
|
1614 |
+
|
1615 |
// Initialize on page load
|
1616 |
window.addEventListener('DOMContentLoaded', () => {
|
1617 |
sendButton.style.display = 'block';
|
|
|
1621 |
|
1622 |
</html>"""
|
1623 |
|
1624 |
+
# Whisper model loader
def _get_whisper_pipe():
    """Return the process-wide Whisper ASR pipeline, creating it on first use.

    The pipeline is cached as a function attribute so the expensive model
    load happens at most once per process.
    """
    cached = getattr(_get_whisper_pipe, "pipe", None)
    if cached is None:
        # Prefer the first CUDA device when available, otherwise run on CPU.
        target_device = 0 if torch.cuda.is_available() else "cpu"
        cached = pipeline(
            task="automatic-speech-recognition",
            model=WHISPER_MODEL_NAME,
            chunk_length_s=30,
            device=target_device,
        )
        _get_whisper_pipe.pipe = cached
    return cached
|
1636 |
+
|
1637 |
+
# Audio helpers for Whisper
def _ensure_16k_whisper(y: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
    """Return *y* resampled to Whisper's 16 kHz sample rate as float32.

    When the input already has the target rate this is just a float32 cast.
    Polyphase resampling keeps the up/down factors small by dividing out
    the GCD of the two rates.
    """
    if sr != WHISPER_SAMPLE_RATE:
        factor = np.gcd(sr, WHISPER_SAMPLE_RATE)
        y = sps.resample_poly(y, WHISPER_SAMPLE_RATE // factor, sr // factor)
    return y.astype(np.float32), WHISPER_SAMPLE_RATE
|
1645 |
+
|
1646 |
+
def _should_flush_whisper(buffer: np.ndarray, sr: int) -> bool:
    """Decide whether the accumulated audio should be sent to Whisper.

    Flush when the segment is at least MIN_SEG_SEC long AND either the
    trailing SILENCE_SEC window is quiet (the speaker paused) or the hard
    MAX_SEG_SEC ceiling has been reached.
    """
    seconds = len(buffer) / sr
    if seconds < MIN_SEG_SEC:
        # Too short to transcribe meaningfully yet.
        return False
    if seconds >= MAX_SEG_SEC:
        return True
    # RMS of the trailing window as a crude voice-activity check; an empty
    # tail counts as loud (1.0) so it never triggers a flush by itself.
    tail = buffer[-int(SILENCE_SEC * sr):]
    tail_rms = np.sqrt(np.mean(tail ** 2)) if len(tail) else 1.0
    return tail_rms < SILENCE_THRESH
|
1656 |
+
|
1657 |
+
# Translation helper
def _translate_text_4langs(text: str) -> str:
    """Translate *text* into English, Chinese, Thai and Russian via OpenAI.

    Returns the model's "EN:/ZH:/TH:/RU:" formatted lines. Never raises:
    failures are reported as an inline "Translation error: ..." string so
    callers can display them directly.
    """
    prompt = (
        "Translate the following text into English (EN), Chinese (ZH), Thai (TH) and Russian (RU).\n"
        "Return ONLY the translations in this format (one per line):\n"
        "EN: <english>\nZH: <chinese>\nTH: <thai>\nRU: <russian>\n\n"
        f"Text: {text}"
    )
    try:
        completion = openai.OpenAI().chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.7,
            max_tokens=512,
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Translation error: {e}")
        return f"Translation error: {str(e)}"
|
1683 |
+
|
1684 |
+
# ffmpeg check
|
1685 |
+
def _check_ffmpeg() -> bool:
|
1686 |
+
try:
|
1687 |
+
subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
|
1688 |
+
return True
|
1689 |
+
except Exception:
|
1690 |
+
return False
|
1691 |
+
|
1692 |
+
_HAS_FFMPEG = _check_ffmpeg()
|
1693 |
+
|
1694 |
+
def extract_audio_from_video(video_path: str) -> str:
    """Extract the audio track of *video_path* to a mono 16 kHz WAV file.

    Returns the path of a temporary WAV file that the caller must delete.

    Raises:
        RuntimeError: if ffmpeg is unavailable or fails to extract audio.
    """
    # Fix: check for ffmpeg BEFORE creating the temp file — previously a
    # zero-byte temp WAV leaked whenever ffmpeg was missing.
    if not _HAS_FFMPEG:
        raise RuntimeError("ffmpeg is required for video processing")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()

    cmd = [
        "ffmpeg", "-i", video_path, "-vn",
        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", tmp.name,
    ]
    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        # Don't leave a partial output file behind on failure.
        os.unlink(tmp.name)
        raise RuntimeError("ffmpeg error extracting audio")
    return tmp.name
|
1711 |
+
|
1712 |
+
# GPU workers for Whisper
def gpu_transcribe_whisper(audio_path: str) -> str:
    """Run Whisper ASR over the audio file at *audio_path* and return the text."""
    asr = _get_whisper_pipe()
    output = asr(
        audio_path,
        batch_size=WHISPER_BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return output["text"].strip()
|
1718 |
+
|
1719 |
+
def gpu_asr_translate_whisper(audio: np.ndarray, sr: int) -> str:
    """Transcribe a realtime audio chunk and append its 4-language translation.

    Returns an HTML fragment joined with <br> tags: a Seoul-time timestamp,
    the transcript (labelled [KO]), the translations, and a separator line.
    """
    asr = _get_whisper_pipe()
    transcript = asr({"array": audio, "sampling_rate": sr}, batch_size=WHISPER_BATCH_SIZE)["text"].strip()
    translated = _translate_text_4langs(transcript).replace("\n", "<br>")
    stamp = datetime.now(SEOUL_TZ).strftime("%Y-%m-%d %H:%M:%S")
    return f"[{stamp}]<br>[KO] {transcript}<br>{translated}<br>{'-'*40}<br>"
|
1726 |
|
1727 |
class BraveSearchClient:
|
1728 |
"""Brave Search API client"""
|
|
|
1773 |
# Store connection settings
|
1774 |
connection_settings = {}
|
1775 |
|
1776 |
+
# Store realtime sessions: session_id -> {"buffer": list, "queue": asyncio.Queue,
# "active": bool}, as populated by /whisper/realtime/start.
realtime_sessions = {}
|
1778 |
+
|
1779 |
# Initialize OpenAI client for text chat
|
1780 |
client = openai.AsyncOpenAI()
|
1781 |
|
|
|
2435 |
return StreamingResponse(output_stream(), media_type="text/event-stream")
|
2436 |
|
2437 |
|
2438 |
+
# Whisper endpoints
@app.post("/whisper/transcribe")
async def whisper_transcribe(audio: UploadFile = File(...)):
    """Transcribe an uploaded (webm) recording and translate it.

    Returns {"text": ..., "translation": ...} on success, {"error": ...}
    on failure.
    """
    tmp_path = None
    try:
        # Persist the upload so the Whisper pipeline can read it from disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as tmp:
            tmp.write(await audio.read())
            tmp_path = tmp.name

        # Heavy ASR work runs in the dedicated executor so the event loop
        # stays responsive.
        text = await asyncio.get_event_loop().run_in_executor(
            whisper_executor, gpu_transcribe_whisper, tmp_path
        )

        translation = _translate_text_4langs(text)
        return {"text": text, "translation": translation}

    except Exception as e:
        print(f"Whisper transcribe error: {e}")
        return {"error": str(e)}
    finally:
        # Fix: the temp file previously leaked when transcription raised.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
2465 |
+
|
2466 |
+
|
2467 |
+
@app.post("/whisper/audio")
|
2468 |
+
async def whisper_audio(audio: UploadFile = File(...)):
|
2469 |
+
"""Process audio file"""
|
2470 |
+
try:
|
2471 |
+
# Save uploaded file temporarily
|
2472 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(audio.filename).suffix) as tmp:
|
2473 |
+
content = await audio.read()
|
2474 |
+
tmp.write(content)
|
2475 |
+
tmp_path = tmp.name
|
2476 |
+
|
2477 |
+
# Transcribe
|
2478 |
+
text = await asyncio.get_event_loop().run_in_executor(
|
2479 |
+
whisper_executor, gpu_transcribe_whisper, tmp_path
|
2480 |
+
)
|
2481 |
+
|
2482 |
+
# Translate
|
2483 |
+
translation = _translate_text_4langs(text)
|
2484 |
+
|
2485 |
+
# Clean up
|
2486 |
+
os.unlink(tmp_path)
|
2487 |
+
|
2488 |
+
return {"text": text, "translation": translation}
|
2489 |
+
|
2490 |
+
except Exception as e:
|
2491 |
+
print(f"Whisper audio error: {e}")
|
2492 |
+
return {"error": str(e)}
|
2493 |
+
|
2494 |
+
|
2495 |
+
@app.post("/whisper/video")
|
2496 |
+
async def whisper_video(video: UploadFile = File(...)):
|
2497 |
+
"""Process video file"""
|
2498 |
+
try:
|
2499 |
+
# Save uploaded file temporarily
|
2500 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(video.filename).suffix) as tmp:
|
2501 |
+
content = await video.read()
|
2502 |
+
tmp.write(content)
|
2503 |
+
tmp_path = tmp.name
|
2504 |
+
|
2505 |
+
# Extract audio
|
2506 |
+
audio_path = await asyncio.get_event_loop().run_in_executor(
|
2507 |
+
None, extract_audio_from_video, tmp_path
|
2508 |
+
)
|
2509 |
+
|
2510 |
+
# Transcribe
|
2511 |
+
text = await asyncio.get_event_loop().run_in_executor(
|
2512 |
+
whisper_executor, gpu_transcribe_whisper, audio_path
|
2513 |
+
)
|
2514 |
+
|
2515 |
+
# Translate
|
2516 |
+
translation = _translate_text_4langs(text)
|
2517 |
+
|
2518 |
+
# Clean up
|
2519 |
+
os.unlink(tmp_path)
|
2520 |
+
os.unlink(audio_path)
|
2521 |
+
|
2522 |
+
return {"text": text, "translation": translation}
|
2523 |
+
|
2524 |
+
except Exception as e:
|
2525 |
+
print(f"Whisper video error: {e}")
|
2526 |
+
return {"error": str(e)}
|
2527 |
+
|
2528 |
+
|
2529 |
+
@app.post("/whisper/pdf")
|
2530 |
+
async def whisper_pdf(pdf: UploadFile = File(...), max_pages: int = Form(10)):
|
2531 |
+
"""Process PDF file"""
|
2532 |
+
try:
|
2533 |
+
# Save uploaded file temporarily
|
2534 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
2535 |
+
content = await pdf.read()
|
2536 |
+
tmp.write(content)
|
2537 |
+
tmp_path = tmp.name
|
2538 |
+
|
2539 |
+
# Extract text
|
2540 |
+
extracted = []
|
2541 |
+
with pdfplumber.open(tmp_path) as pdf_doc:
|
2542 |
+
pages = pdf_doc.pages[:max_pages]
|
2543 |
+
for idx, pg in enumerate(pages, start=1):
|
2544 |
+
txt = pg.extract_text() or ""
|
2545 |
+
if txt.strip():
|
2546 |
+
extracted.append(f"[Page {idx}]\n{txt}")
|
2547 |
+
|
2548 |
+
full_text = "\n\n".join(extracted)
|
2549 |
+
|
2550 |
+
# Translate each page
|
2551 |
+
translated = []
|
2552 |
+
for page_text in extracted:
|
2553 |
+
trans = _translate_text_4langs(page_text.split('\n', 1)[1]) # Skip page header
|
2554 |
+
translated.append(page_text.split('\n')[0] + "\n" + trans)
|
2555 |
+
|
2556 |
+
# Clean up
|
2557 |
+
os.unlink(tmp_path)
|
2558 |
+
|
2559 |
+
return {"text": full_text, "translation": "\n\n".join(translated)}
|
2560 |
+
|
2561 |
+
except Exception as e:
|
2562 |
+
print(f"Whisper PDF error: {e}")
|
2563 |
+
return {"error": str(e)}
|
2564 |
+
|
2565 |
+
|
2566 |
+
@app.post("/whisper/realtime/start")
|
2567 |
+
async def whisper_realtime_start():
|
2568 |
+
"""Start realtime transcription session"""
|
2569 |
+
session_id = os.urandom(16).hex()
|
2570 |
+
realtime_sessions[session_id] = {
|
2571 |
+
"buffer": [],
|
2572 |
+
"queue": asyncio.Queue(),
|
2573 |
+
"active": True
|
2574 |
+
}
|
2575 |
+
return {"session_id": session_id}
|
2576 |
+
|
2577 |
+
|
2578 |
+
@app.post("/whisper/realtime/process")
|
2579 |
+
async def whisper_realtime_process(
|
2580 |
+
audio: UploadFile = File(...),
|
2581 |
+
session_id: str = Form(...)
|
2582 |
+
):
|
2583 |
+
"""Process realtime audio chunk"""
|
2584 |
+
if session_id not in realtime_sessions:
|
2585 |
+
return {"error": "Invalid session"}
|
2586 |
+
|
2587 |
+
try:
|
2588 |
+
# Read audio data
|
2589 |
+
content = await audio.read()
|
2590 |
+
audio_array = np.frombuffer(content, dtype=np.int16).astype(np.float32) / 32768.0
|
2591 |
+
|
2592 |
+
# Process in executor
|
2593 |
+
result = await asyncio.get_event_loop().run_in_executor(
|
2594 |
+
whisper_executor, gpu_asr_translate_whisper, audio_array, WHISPER_SAMPLE_RATE
|
2595 |
+
)
|
2596 |
+
|
2597 |
+
# Parse result
|
2598 |
+
lines = result.split('<br>')
|
2599 |
+
timestamp = lines[0].strip('[]') if lines else ""
|
2600 |
+
text = lines[1].replace('[KO]', '').strip() if len(lines) > 1 else ""
|
2601 |
+
translation = '<br>'.join(lines[2:-2]) if len(lines) > 3 else ""
|
2602 |
+
|
2603 |
+
# Queue result
|
2604 |
+
await realtime_sessions[session_id]["queue"].put({
|
2605 |
+
"timestamp": timestamp,
|
2606 |
+
"text": text,
|
2607 |
+
"translation": translation
|
2608 |
+
})
|
2609 |
+
|
2610 |
+
return {"status": "ok"}
|
2611 |
+
|
2612 |
+
except Exception as e:
|
2613 |
+
print(f"Realtime process error: {e}")
|
2614 |
+
return {"error": str(e)}
|
2615 |
+
|
2616 |
+
|
2617 |
+
@app.get("/whisper/realtime/stream")
|
2618 |
+
async def whisper_realtime_stream(session_id: str):
|
2619 |
+
"""Stream realtime results"""
|
2620 |
+
if session_id not in realtime_sessions:
|
2621 |
+
return JSONResponse({"error": "Invalid session"}, status_code=404)
|
2622 |
+
|
2623 |
+
async def stream_results():
|
2624 |
+
session = realtime_sessions[session_id]
|
2625 |
+
try:
|
2626 |
+
while session["active"]:
|
2627 |
+
try:
|
2628 |
+
result = await asyncio.wait_for(session["queue"].get(), timeout=1.0)
|
2629 |
+
yield f"data: {json.dumps(result)}\n\n"
|
2630 |
+
except asyncio.TimeoutError:
|
2631 |
+
yield f"data: {json.dumps({'keepalive': True})}\n\n"
|
2632 |
+
except Exception as e:
|
2633 |
+
print(f"Stream error: {e}")
|
2634 |
+
finally:
|
2635 |
+
# Cleanup session
|
2636 |
+
if session_id in realtime_sessions:
|
2637 |
+
del realtime_sessions[session_id]
|
2638 |
+
|
2639 |
+
return StreamingResponse(stream_results(), media_type="text/event-stream")
|
2640 |
+
|
2641 |
+
|
2642 |
@app.get("/")
|
2643 |
async def index():
|
2644 |
"""Serve the HTML page"""
|