Spaces:
Running
Running
Upload 51 files
Browse files
src/components/MultiSourceCaptioningView.tsx
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
import
|
2 |
import { useVLMContext } from "../context/useVLMContext";
|
3 |
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
4 |
|
@@ -82,55 +82,12 @@ export default function MultiSourceCaptioningView() {
|
|
82 |
const [inferenceStatus, setInferenceStatus] = useState<string>("");
|
83 |
|
84 |
const videoRef = useRef<HTMLVideoElement | null>(null);
|
85 |
-
const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
|
86 |
const canvasRef = useRef<HTMLCanvasElement | null>(null);
|
87 |
const imageRef = useRef<HTMLImageElement | null>(null);
|
88 |
const webcamStreamRef = useRef<MediaStream | null>(null);
|
89 |
const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
|
90 |
|
91 |
-
// Persistent boxes state: {boxes: [...], timestamp: number}
|
92 |
-
const [persistentBoxes, setPersistentBoxes] = useState<{boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]>([]);
|
93 |
-
const BOX_PERSIST_MS = 2000; // 2 seconds
|
94 |
-
|
95 |
-
// Helper: Add new boxes with timestamp
|
96 |
-
const addBoxesWithTimestamp = (boxes: {label: string, bbox_2d: number[]}[]) => {
|
97 |
-
if (!boxes || boxes.length === 0) return;
|
98 |
-
setPersistentBoxes((prev: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]) => [
|
99 |
-
...prev.filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => Date.now() - entry.timestamp < BOX_PERSIST_MS),
|
100 |
-
{ boxes, timestamp: Date.now() }
|
101 |
-
]);
|
102 |
-
};
|
103 |
-
|
104 |
-
// Helper: Get all boxes from last 2 seconds
|
105 |
-
const getCurrentBoxes = () => {
|
106 |
-
const now = Date.now();
|
107 |
-
return persistentBoxes
|
108 |
-
.filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => now - entry.timestamp < BOX_PERSIST_MS)
|
109 |
-
.flatMap((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => entry.boxes);
|
110 |
-
};
|
111 |
-
|
112 |
-
// Synchronize overlay video with main video
|
113 |
-
useEffect(() => {
|
114 |
-
const main = videoRef.current;
|
115 |
-
const overlay = overlayVideoRef.current;
|
116 |
-
if (!main || !overlay) return;
|
117 |
-
// Sync play/pause
|
118 |
-
main.addEventListener('play', () => overlay.play());
|
119 |
-
main.addEventListener('pause', () => overlay.pause());
|
120 |
-
// Sync seeking
|
121 |
-
const syncTime = () => { if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) overlay.currentTime = main.currentTime; };
|
122 |
-
main.addEventListener('seeked', syncTime);
|
123 |
-
main.addEventListener('timeupdate', syncTime);
|
124 |
-
// Clean up
|
125 |
-
return () => {
|
126 |
-
main.removeEventListener('play', () => overlay.play());
|
127 |
-
main.removeEventListener('pause', () => overlay.pause());
|
128 |
-
main.removeEventListener('seeked', syncTime);
|
129 |
-
main.removeEventListener('timeupdate', syncTime);
|
130 |
-
};
|
131 |
-
}, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
|
132 |
-
|
133 |
-
// Update: processVideoFrame now adds boxes to persistentBoxes
|
134 |
const processVideoFrame = async () => {
|
135 |
if (!videoRef.current || !canvasRef.current) return;
|
136 |
const video = videoRef.current;
|
@@ -142,13 +99,12 @@ export default function MultiSourceCaptioningView() {
|
|
142 |
if (!ctx) return;
|
143 |
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
144 |
await runInference(video, prompt, (output: string) => {
|
145 |
-
setDebugOutput(output);
|
146 |
let boxes = extractJsonFromMarkdown(output) || [];
|
147 |
if (boxes.length === 0 && Array.isArray(output)) {
|
148 |
boxes = parseFlatBoxArray(output);
|
149 |
}
|
150 |
boxes = normalizeBoxes(boxes);
|
151 |
-
// Restore debug logging
|
152 |
console.log("Model output:", output);
|
153 |
console.log("Boxes after normalization:", boxes);
|
154 |
console.log("Canvas size:", canvas.width, canvas.height);
|
@@ -158,35 +114,13 @@ export default function MultiSourceCaptioningView() {
|
|
158 |
}
|
159 |
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
160 |
if (Array.isArray(boxes) && boxes.length > 0) {
|
161 |
-
addBoxesWithTimestamp(boxes); // <-- Add to persistent state
|
162 |
-
}
|
163 |
-
});
|
164 |
-
};
|
165 |
-
|
166 |
-
// Draw persistent boxes on every frame
|
167 |
-
useEffect(() => {
|
168 |
-
const draw = () => {
|
169 |
-
// Use overlay video for dimensions if available, else fallback to main video
|
170 |
-
const video = overlayVideoRef.current || videoRef.current;
|
171 |
-
if (!video || !canvasRef.current) return;
|
172 |
-
if (video.videoWidth === 0) return;
|
173 |
-
const canvas = canvasRef.current;
|
174 |
-
canvas.width = video.videoWidth;
|
175 |
-
canvas.height = video.videoHeight;
|
176 |
-
const ctx = canvas.getContext("2d");
|
177 |
-
if (!ctx) return;
|
178 |
-
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
179 |
-
const boxes = getCurrentBoxes();
|
180 |
-
if (boxes.length > 0) {
|
181 |
const scaleX = canvas.width / video.videoWidth;
|
182 |
const scaleY = canvas.height / video.videoHeight;
|
183 |
-
|
|
|
184 |
}
|
185 |
-
};
|
186 |
-
|
187 |
-
const interval = setInterval(draw, 100);
|
188 |
-
return () => clearInterval(interval);
|
189 |
-
}, [persistentBoxes, videoRef, overlayVideoRef, canvasRef]);
|
190 |
|
191 |
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
|
192 |
const file = e.target.files?.[0] || null;
|
@@ -442,7 +376,7 @@ export default function MultiSourceCaptioningView() {
|
|
442 |
controls
|
443 |
autoPlay
|
444 |
loop
|
445 |
-
className="w-full rounded-lg shadow-lg mb-2
|
446 |
style={{ background: "#222" }}
|
447 |
/>
|
448 |
<video
|
@@ -452,8 +386,8 @@ export default function MultiSourceCaptioningView() {
|
|
452 |
autoPlay
|
453 |
loop
|
454 |
muted
|
455 |
-
className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0
|
456 |
-
style={{ background: "#222" }}
|
457 |
/>
|
458 |
<canvas
|
459 |
ref={canvasRef}
|
@@ -461,7 +395,7 @@ export default function MultiSourceCaptioningView() {
|
|
461 |
style={{ zIndex: 20, pointerEvents: "none" }}
|
462 |
/>
|
463 |
<button
|
464 |
-
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold
|
465 |
onClick={handleToggleUrlProcessing}
|
466 |
>
|
467 |
{urlProcessing ? "Stop Processing" : "Start Processing"}
|
@@ -475,6 +409,11 @@ export default function MultiSourceCaptioningView() {
|
|
475 |
>
|
476 |
Test Draw Box
|
477 |
</button>
|
|
|
|
|
|
|
|
|
|
|
478 |
</div>
|
479 |
)}
|
480 |
{mode === "File" && (
|
@@ -529,7 +468,7 @@ export default function MultiSourceCaptioningView() {
|
|
529 |
controls
|
530 |
autoPlay
|
531 |
loop
|
532 |
-
className="w-full rounded-lg shadow-lg mb-2
|
533 |
style={{ background: "#222" }}
|
534 |
/>
|
535 |
<video
|
@@ -539,8 +478,8 @@ export default function MultiSourceCaptioningView() {
|
|
539 |
autoPlay
|
540 |
loop
|
541 |
muted
|
542 |
-
className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0
|
543 |
-
style={{ background: "#222" }}
|
544 |
/>
|
545 |
<canvas
|
546 |
ref={canvasRef}
|
@@ -548,7 +487,7 @@ export default function MultiSourceCaptioningView() {
|
|
548 |
style={{ zIndex: 20, pointerEvents: "none" }}
|
549 |
/>
|
550 |
<button
|
551 |
-
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold
|
552 |
onClick={handleToggleVideoProcessing}
|
553 |
>
|
554 |
{videoProcessing ? "Stop Processing" : "Start Processing"}
|
@@ -564,7 +503,7 @@ export default function MultiSourceCaptioningView() {
|
|
564 |
controls
|
565 |
autoPlay
|
566 |
loop
|
567 |
-
className="w-full rounded-lg shadow-lg mb-2
|
568 |
style={{ background: "#222" }}
|
569 |
/>
|
570 |
<video
|
@@ -574,8 +513,8 @@ export default function MultiSourceCaptioningView() {
|
|
574 |
autoPlay
|
575 |
loop
|
576 |
muted
|
577 |
-
className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0
|
578 |
-
style={{ background: "#222" }}
|
579 |
/>
|
580 |
<canvas
|
581 |
ref={canvasRef}
|
@@ -583,7 +522,7 @@ export default function MultiSourceCaptioningView() {
|
|
583 |
style={{ zIndex: 20, pointerEvents: "none" }}
|
584 |
/>
|
585 |
<button
|
586 |
-
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold
|
587 |
onClick={handleToggleExampleProcessing}
|
588 |
>
|
589 |
{exampleProcessing ? "Stop Processing" : "Start Processing"}
|
@@ -598,14 +537,13 @@ export default function MultiSourceCaptioningView() {
|
|
598 |
>
|
599 |
Test Draw Box
|
600 |
</button>
|
|
|
|
|
|
|
|
|
|
|
601 |
</div>
|
602 |
)}
|
603 |
-
{/* Always show Raw Model Output at the bottom */}
|
604 |
-
<div className="mt-2 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
|
605 |
-
<div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
|
606 |
-
<div>Raw Model Output:</div>
|
607 |
-
<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
|
608 |
-
</div>
|
609 |
</div>
|
610 |
</div>
|
611 |
</div>
|
|
|
1 |
+
import { useState, useRef, useEffect } from "react";
|
2 |
import { useVLMContext } from "../context/useVLMContext";
|
3 |
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
4 |
|
|
|
82 |
const [inferenceStatus, setInferenceStatus] = useState<string>("");
|
83 |
|
84 |
const videoRef = useRef<HTMLVideoElement | null>(null);
|
85 |
+
const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
|
86 |
const canvasRef = useRef<HTMLCanvasElement | null>(null);
|
87 |
const imageRef = useRef<HTMLImageElement | null>(null);
|
88 |
const webcamStreamRef = useRef<MediaStream | null>(null);
|
89 |
const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
|
90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
const processVideoFrame = async () => {
|
92 |
if (!videoRef.current || !canvasRef.current) return;
|
93 |
const video = videoRef.current;
|
|
|
99 |
if (!ctx) return;
|
100 |
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
101 |
await runInference(video, prompt, (output: string) => {
|
102 |
+
setDebugOutput(output); // <-- Ensure Raw Model Output is updated
|
103 |
let boxes = extractJsonFromMarkdown(output) || [];
|
104 |
if (boxes.length === 0 && Array.isArray(output)) {
|
105 |
boxes = parseFlatBoxArray(output);
|
106 |
}
|
107 |
boxes = normalizeBoxes(boxes);
|
|
|
108 |
console.log("Model output:", output);
|
109 |
console.log("Boxes after normalization:", boxes);
|
110 |
console.log("Canvas size:", canvas.width, canvas.height);
|
|
|
114 |
}
|
115 |
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
116 |
if (Array.isArray(boxes) && boxes.length > 0) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
const scaleX = canvas.width / video.videoWidth;
|
118 |
const scaleY = canvas.height / video.videoHeight;
|
119 |
+
ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
|
120 |
+
drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
|
121 |
}
|
122 |
+
});
|
123 |
+
};
|
|
|
|
|
|
|
124 |
|
125 |
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
|
126 |
const file = e.target.files?.[0] || null;
|
|
|
376 |
controls
|
377 |
autoPlay
|
378 |
loop
|
379 |
+
className="w-full rounded-lg shadow-lg mb-2"
|
380 |
style={{ background: "#222" }}
|
381 |
/>
|
382 |
<video
|
|
|
386 |
autoPlay
|
387 |
loop
|
388 |
muted
|
389 |
+
className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 opacity-60 pointer-events-none"
|
390 |
+
style={{ background: "#222", zIndex: 10 }}
|
391 |
/>
|
392 |
<canvas
|
393 |
ref={canvasRef}
|
|
|
395 |
style={{ zIndex: 20, pointerEvents: "none" }}
|
396 |
/>
|
397 |
<button
|
398 |
+
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
399 |
onClick={handleToggleUrlProcessing}
|
400 |
>
|
401 |
{urlProcessing ? "Stop Processing" : "Start Processing"}
|
|
|
409 |
>
|
410 |
Test Draw Box
|
411 |
</button>
|
412 |
+
<div className="mt-2 p-2 bg-gray-800 rounded text-xs">
|
413 |
+
<div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
|
414 |
+
<div>Raw Model Output:</div>
|
415 |
+
<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
|
416 |
+
</div>
|
417 |
</div>
|
418 |
)}
|
419 |
{mode === "File" && (
|
|
|
468 |
controls
|
469 |
autoPlay
|
470 |
loop
|
471 |
+
className="w-full rounded-lg shadow-lg mb-2"
|
472 |
style={{ background: "#222" }}
|
473 |
/>
|
474 |
<video
|
|
|
478 |
autoPlay
|
479 |
loop
|
480 |
muted
|
481 |
+
className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 opacity-60 pointer-events-none"
|
482 |
+
style={{ background: "#222", zIndex: 10 }}
|
483 |
/>
|
484 |
<canvas
|
485 |
ref={canvasRef}
|
|
|
487 |
style={{ zIndex: 20, pointerEvents: "none" }}
|
488 |
/>
|
489 |
<button
|
490 |
+
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
491 |
onClick={handleToggleVideoProcessing}
|
492 |
>
|
493 |
{videoProcessing ? "Stop Processing" : "Start Processing"}
|
|
|
503 |
controls
|
504 |
autoPlay
|
505 |
loop
|
506 |
+
className="w-full rounded-lg shadow-lg mb-2"
|
507 |
style={{ background: "#222" }}
|
508 |
/>
|
509 |
<video
|
|
|
513 |
autoPlay
|
514 |
loop
|
515 |
muted
|
516 |
+
className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 opacity-60 pointer-events-none"
|
517 |
+
style={{ background: "#222", zIndex: 10 }}
|
518 |
/>
|
519 |
<canvas
|
520 |
ref={canvasRef}
|
|
|
522 |
style={{ zIndex: 20, pointerEvents: "none" }}
|
523 |
/>
|
524 |
<button
|
525 |
+
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
526 |
onClick={handleToggleExampleProcessing}
|
527 |
>
|
528 |
{exampleProcessing ? "Stop Processing" : "Start Processing"}
|
|
|
537 |
>
|
538 |
Test Draw Box
|
539 |
</button>
|
540 |
+
<div className="mt-2 p-2 bg-gray-800 rounded text-xs">
|
541 |
+
<div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
|
542 |
+
<div>Raw Model Output:</div>
|
543 |
+
<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
|
544 |
+
</div>
|
545 |
</div>
|
546 |
)}
|
|
|
|
|
|
|
|
|
|
|
|
|
547 |
</div>
|
548 |
</div>
|
549 |
</div>
|