Spaces:
Running
Running
Upload 51 files
Browse files
src/components/MultiSourceCaptioningView.tsx
CHANGED
@@ -1,717 +1,721 @@
|
|
1 |
-
import * as React from "react";
|
2 |
-
import { useState, useRef, useEffect } from "react";
|
3 |
-
import { useVLMContext } from "../context/useVLMContext";
|
4 |
-
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
5 |
-
|
6 |
-
const MODES = ["File"] as const;
|
7 |
-
type Mode = typeof MODES[number];
|
8 |
-
|
9 |
-
const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
|
10 |
-
const EXAMPLE_PROMPT = "Describe the video";
|
11 |
-
|
12 |
-
function isImageFile(file: File) {
|
13 |
-
return file.type.startsWith("image/");
|
14 |
-
}
|
15 |
-
function isVideoFile(file: File) {
|
16 |
-
return file.type.startsWith("video/");
|
17 |
-
}
|
18 |
-
|
19 |
-
function denormalizeBox(box: number[], width: number, height: number) {
|
20 |
-
// If all values are between 0 and 1, treat as normalized
|
21 |
-
if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
|
22 |
-
return [
|
23 |
-
box[0] * width,
|
24 |
-
box[1] * height,
|
25 |
-
box[2] * width,
|
26 |
-
box[3] * height
|
27 |
-
];
|
28 |
-
}
|
29 |
-
return box;
|
30 |
-
}
|
31 |
-
|
32 |
-
// Add this robust fallback parser near the top
|
33 |
-
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
|
34 |
-
// Try to parse as JSON first
|
35 |
-
try {
|
36 |
-
const parsed = JSON.parse(output);
|
37 |
-
if (Array.isArray(parsed)) {
|
38 |
-
const result: { label: string, bbox_2d: number[] }[] = [];
|
39 |
-
for (const obj of parsed) {
|
40 |
-
if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
|
41 |
-
if (Array.isArray(obj.bbox_2d[0])) {
|
42 |
-
for (const arr of obj.bbox_2d) {
|
43 |
-
if (Array.isArray(arr) && arr.length === 4) {
|
44 |
-
result.push({ label: obj.label, bbox_2d: arr });
|
45 |
-
}
|
46 |
-
}
|
47 |
-
} else if (obj.bbox_2d.length === 4) {
|
48 |
-
result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
|
49 |
-
}
|
50 |
-
}
|
51 |
-
}
|
52 |
-
if (result.length > 0) return result;
|
53 |
-
}
|
54 |
-
} catch (e) {}
|
55 |
-
// Fallback: extract all [x1, y1, x2, y2] arrays from the string
|
56 |
-
const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
|
57 |
-
const boxes: { label: string, bbox_2d: number[] }[] = [];
|
58 |
-
let match;
|
59 |
-
while ((match = boxRegex.exec(output)) !== null) {
|
60 |
-
const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
|
61 |
-
boxes.push({ label: '', bbox_2d: arr });
|
62 |
-
}
|
63 |
-
return boxes;
|
64 |
-
}
|
65 |
-
|
66 |
-
// NOTE: You must install onnxruntime-web:
|
67 |
-
// npm install onnxruntime-web
|
68 |
-
// @ts-ignore
|
69 |
-
import * as ort from 'onnxruntime-web';
|
70 |
-
// If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
|
71 |
-
|
72 |
-
// Set your YOLOv8 ONNX model URL here:
|
73 |
-
const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE
|
74 |
-
|
75 |
-
// Add these constants to match the YOLOv8 input size
|
76 |
-
const YOLOV8_INPUT_WIDTH = 640;
|
77 |
-
const YOLOV8_INPUT_HEIGHT = 480;
|
78 |
-
|
79 |
-
// 1. Load the ONNX model once
|
80 |
-
let yoloSession: ort.InferenceSession | null = null;
|
81 |
-
// Add a busy flag to prevent concurrent YOLOv8 inferences
|
82 |
-
let isYoloBusy = false;
|
83 |
-
async function loadYoloModel() {
|
84 |
-
if (!yoloSession) {
|
85 |
-
yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
|
86 |
-
}
|
87 |
-
return yoloSession;
|
88 |
-
}
|
89 |
-
|
90 |
-
// COCO class names for YOLOv8
|
91 |
-
const YOLO_CLASSES: string[] = [
|
92 |
-
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
|
93 |
-
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
|
94 |
-
"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
|
95 |
-
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
|
96 |
-
"wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
|
97 |
-
"broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
|
98 |
-
"dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
|
99 |
-
"toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
|
100 |
-
];
|
101 |
-
|
102 |
-
// Preprocess video frame to YOLOv8 input tensor [1,3,640,640]
|
103 |
-
function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
|
104 |
-
const width = 640;
|
105 |
-
const height = 480;
|
106 |
-
const canvas = document.createElement('canvas');
|
107 |
-
canvas.width = width;
|
108 |
-
canvas.height = height;
|
109 |
-
const ctx = canvas.getContext('2d');
|
110 |
-
if (!ctx) throw new Error('Could not get 2D context');
|
111 |
-
ctx.drawImage(video, 0, 0, width, height);
|
112 |
-
const imageData = ctx.getImageData(0, 0, width, height);
|
113 |
-
const { data } = imageData;
|
114 |
-
// Convert to Float32Array [1,3,480,640], normalize to [0,1]
|
115 |
-
const floatData = new Float32Array(1 * 3 * height * width);
|
116 |
-
for (let i = 0; i < width * height; i++) {
|
117 |
-
floatData[i] = data[i * 4] / 255; // R
|
118 |
-
floatData[i + width * height] = data[i * 4 + 1] / 255; // G
|
119 |
-
floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
|
120 |
-
}
|
121 |
-
return new ort.Tensor('float32', floatData, [1, 3, height, width]);
|
122 |
-
}
|
123 |
-
|
124 |
-
// Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
|
125 |
-
function postprocessYoloOutput(output: ort.Tensor) {
|
126 |
-
// output.dims: [1, num_detections, 6]
|
127 |
-
const data = output.data;
|
128 |
-
const numDetections = output.dims[1];
|
129 |
-
const results = [];
|
130 |
-
for (let i = 0; i < numDetections; i++) {
|
131 |
-
const offset = i * 6;
|
132 |
-
const x1 = data[offset];
|
133 |
-
const y1 = data[offset + 1];
|
134 |
-
const x2 = data[offset + 2];
|
135 |
-
const y2 = data[offset + 3];
|
136 |
-
const score = data[offset + 4];
|
137 |
-
const classId = data[offset + 5];
|
138 |
-
if (score < 0.2) continue; // adjust threshold as needed
|
139 |
-
results.push({
|
140 |
-
bbox: [x1, y1, x2, y2],
|
141 |
-
label: YOLO_CLASSES[classId] || `class_${classId}`,
|
142 |
-
score
|
143 |
-
});
|
144 |
-
}
|
145 |
-
return results;
|
146 |
-
}
|
147 |
-
|
148 |
-
// Helper type guard for annotation
|
149 |
-
function hasAnnotation(obj: any): obj is { annotation: string } {
|
150 |
-
return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
|
151 |
-
}
|
152 |
-
|
153 |
-
export default function MultiSourceCaptioningView() {
|
154 |
-
const [mode, setMode] = useState<Mode>("File");
|
155 |
-
const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
|
156 |
-
const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
|
157 |
-
const [processing, setProcessing] = useState(false);
|
158 |
-
const [error, setError] = useState<string | null>(null);
|
159 |
-
const [uploadedFile, setUploadedFile] = useState<File | null>(null);
|
160 |
-
const [uploadedUrl, setUploadedUrl] = useState<string>("");
|
161 |
-
const [videoProcessing, setVideoProcessing] = useState(false);
|
162 |
-
const [imageProcessed, setImageProcessed] = useState(false);
|
163 |
-
const [exampleProcessing, setExampleProcessing] = useState(false);
|
164 |
-
const [debugOutput, setDebugOutput] = useState<string>("");
|
165 |
-
const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
|
166 |
-
const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
|
167 |
-
const [inferenceStatus, setInferenceStatus] = useState<string>("");
|
168 |
-
const [showProcessingVideo, setShowProcessingVideo] = useState(false);
|
169 |
-
|
170 |
-
const videoRef = useRef<HTMLVideoElement | null>(null);
|
171 |
-
const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
|
172 |
-
const processingVideoRef = useRef<HTMLVideoElement | null>(null);
|
173 |
-
const canvasRef = useRef<HTMLCanvasElement | null>(null);
|
174 |
-
const imageRef = useRef<HTMLImageElement | null>(null);
|
175 |
-
const boxHistoryRef = useRef<any[]>([]);
|
176 |
-
// Add a ref to store the latest YOLOv8 results (with optional FastVLM annotation)
|
177 |
-
const lastYoloBoxesRef = React.useRef<any[]>([]);
|
178 |
-
const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
|
179 |
-
|
180 |
-
// Remove videoProcessingRef and exampleProcessingRef
|
181 |
-
// Add a single processingLoopRef
|
182 |
-
const processingLoopRef = React.useRef(false);
|
183 |
-
|
184 |
-
const processVideoLoop = async () => {
|
185 |
-
if (!processingLoopRef.current) return;
|
186 |
-
if (isYoloBusy) {
|
187 |
-
// Optionally log: "Inference already running, skipping frame"
|
188 |
-
requestAnimationFrame(processVideoLoop);
|
189 |
-
return;
|
190 |
-
}
|
191 |
-
await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
|
192 |
-
// Schedule the next frame as soon as possible
|
193 |
-
requestAnimationFrame(processVideoLoop);
|
194 |
-
};
|
195 |
-
const processExampleLoop = async () => {
|
196 |
-
while (processingLoopRef.current) {
|
197 |
-
await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
|
198 |
-
await new Promise(res => setTimeout(res, 1000));
|
199 |
-
}
|
200 |
-
};
|
201 |
-
|
202 |
-
// Set your YOLOv8 ONNX backend API endpoint here:
|
203 |
-
// const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE
|
204 |
-
|
205 |
-
// Add this useEffect for overlay video synchronization
|
206 |
-
useEffect(() => {
|
207 |
-
const main = videoRef.current;
|
208 |
-
const overlay = overlayVideoRef.current;
|
209 |
-
if (!main || !overlay) return;
|
210 |
-
// Sync play/pause
|
211 |
-
const onPlay = () => { if (overlay.paused) overlay.play(); };
|
212 |
-
const onPause = () => { if (!overlay.paused) overlay.pause(); };
|
213 |
-
// Sync seeking and time
|
214 |
-
const onSeekOrTime = () => {
|
215 |
-
if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
|
216 |
-
overlay.currentTime = main.currentTime;
|
217 |
-
}
|
218 |
-
};
|
219 |
-
main.addEventListener('play', onPlay);
|
220 |
-
main.addEventListener('pause', onPause);
|
221 |
-
main.addEventListener('seeked', onSeekOrTime);
|
222 |
-
main.addEventListener('timeupdate', onSeekOrTime);
|
223 |
-
// Clean up
|
224 |
-
return () => {
|
225 |
-
main.removeEventListener('play', onPlay);
|
226 |
-
main.removeEventListener('pause', onPause);
|
227 |
-
main.removeEventListener('seeked', onSeekOrTime);
|
228 |
-
main.removeEventListener('timeupdate', onSeekOrTime);
|
229 |
-
};
|
230 |
-
}, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
|
231 |
-
|
232 |
-
useEffect(() => {
|
233 |
-
if ((mode === "File") && processingVideoRef.current) {
|
234 |
-
processingVideoRef.current.play().catch(() => {});
|
235 |
-
}
|
236 |
-
}, [mode, videoUrl, uploadedUrl]);
|
237 |
-
|
238 |
-
// Remove old prompt-based box extraction logic and only use the above for video frames.
|
239 |
-
|
240 |
-
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
|
241 |
-
const file = e.target.files?.[0] || null;
|
242 |
-
setUploadedFile(file);
|
243 |
-
setUploadedUrl(file ? URL.createObjectURL(file) : "");
|
244 |
-
setError(null);
|
245 |
-
setImageProcessed(false);
|
246 |
-
setVideoProcessing(false);
|
247 |
-
setExampleProcessing(false);
|
248 |
-
};
|
249 |
-
|
250 |
-
// Webcam mode: process frames with setInterval
|
251 |
-
useEffect(() => {
|
252 |
-
if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
|
253 |
-
processVideoLoop();
|
254 |
-
}, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
|
255 |
-
|
256 |
-
// Example video mode: process frames with setInterval
|
257 |
-
useEffect(() => {
|
258 |
-
if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
|
259 |
-
processExampleLoop();
|
260 |
-
}, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
|
261 |
-
|
262 |
-
// File mode: process uploaded image (only on button click)
|
263 |
-
const handleProcessImage = async () => {
|
264 |
-
if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
|
265 |
-
const img = imageRef.current;
|
266 |
-
const canvas = canvasRef.current;
|
267 |
-
canvas.width = img.naturalWidth;
|
268 |
-
canvas.height = img.naturalHeight;
|
269 |
-
setCanvasDims({w:canvas.width,h:canvas.height});
|
270 |
-
setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
|
271 |
-
const ctx = canvas.getContext("2d");
|
272 |
-
if (!ctx) return;
|
273 |
-
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
274 |
-
setProcessing(true);
|
275 |
-
setError(null);
|
276 |
-
setInferenceStatus("Running inference...");
|
277 |
-
await runInference(img, prompt, (output: string) => {
|
278 |
-
setDebugOutput(output);
|
279 |
-
setInferenceStatus("Inference complete.");
|
280 |
-
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
281 |
-
let boxes = extractAllBoundingBoxes(output);
|
282 |
-
console.log("Model output:", output);
|
283 |
-
console.log("Boxes after normalization:", boxes);
|
284 |
-
console.log("Canvas size:", canvas.width, canvas.height);
|
285 |
-
if (boxes.length > 0) {
|
286 |
-
const [x1, y1, x2, y2] = boxes[0].bbox_2d;
|
287 |
-
console.log("First box coords:", x1, y1, x2, y2);
|
288 |
-
}
|
289 |
-
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
290 |
-
if (Array.isArray(boxes) && boxes.length > 0) {
|
291 |
-
const scaleX = canvas.width / img.naturalWidth;
|
292 |
-
const scaleY = canvas.height / img.naturalHeight;
|
293 |
-
drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
|
294 |
-
}
|
295 |
-
setImageProcessed(true);
|
296 |
-
});
|
297 |
-
setProcessing(false);
|
298 |
-
};
|
299 |
-
|
300 |
-
// File mode: process uploaded video frames (start/stop)
|
301 |
-
const handleToggleVideoProcessing = () => {
|
302 |
-
setVideoProcessing((prev: boolean) => {
|
303 |
-
const next = !prev;
|
304 |
-
// Always stop all loops before starting
|
305 |
-
processingLoopRef.current = false;
|
306 |
-
setTimeout(() => {
|
307 |
-
if (next) {
|
308 |
-
processingLoopRef.current = true;
|
309 |
-
processVideoLoop();
|
310 |
-
}
|
311 |
-
}, 50);
|
312 |
-
return next;
|
313 |
-
});
|
314 |
-
};
|
315 |
-
|
316 |
-
// Handle start/stop for example video processing
|
317 |
-
const handleToggleExampleProcessing = () => {
|
318 |
-
setExampleProcessing((prev: boolean) => {
|
319 |
-
const next = !prev;
|
320 |
-
// Always stop all loops before starting
|
321 |
-
processingLoopRef.current = false;
|
322 |
-
setTimeout(() => {
|
323 |
-
if (next) {
|
324 |
-
processingLoopRef.current = true;
|
325 |
-
processVideoLoop();
|
326 |
-
}
|
327 |
-
}, 50);
|
328 |
-
return next;
|
329 |
-
});
|
330 |
-
};
|
331 |
-
|
332 |
-
// Test draw box function
|
333 |
-
const handleTestDrawBox = () => {
|
334 |
-
if (!canvasRef.current) return;
|
335 |
-
const canvas = canvasRef.current;
|
336 |
-
const ctx = canvas.getContext("2d");
|
337 |
-
if (!ctx) return;
|
338 |
-
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
339 |
-
ctx.strokeStyle = "#FF00FF";
|
340 |
-
ctx.lineWidth = 4;
|
341 |
-
ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
|
342 |
-
ctx.font = "20px Arial";
|
343 |
-
ctx.fillStyle = "#FF00FF";
|
344 |
-
ctx.fillText("Test Box", 50, 35);
|
345 |
-
};
|
346 |
-
|
347 |
-
useEffect(() => {
|
348 |
-
const draw = () => {
|
349 |
-
const overlayVideo = overlayVideoRef.current;
|
350 |
-
const canvas = canvasRef.current;
|
351 |
-
if (!overlayVideo || !canvas) return;
|
352 |
-
const displayWidth = overlayVideo.clientWidth;
|
353 |
-
const displayHeight = overlayVideo.clientHeight;
|
354 |
-
canvas.width = displayWidth;
|
355 |
-
canvas.height = displayHeight;
|
356 |
-
const ctx = canvas.getContext("2d");
|
357 |
-
if (!ctx) return;
|
358 |
-
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
359 |
-
const now = Date.now();
|
360 |
-
const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
|
361 |
-
if (boxHistory.length > 0) {
|
362 |
-
// Fix: Draw all boxes, even if bbox_2d is an array of arrays
|
363 |
-
const denormalizedBoxes: any[] = [];
|
364 |
-
for (const b of boxHistory) {
|
365 |
-
if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
|
366 |
-
// Multiple boxes per label
|
367 |
-
for (const arr of b.bbox_2d) {
|
368 |
-
if (Array.isArray(arr) && arr.length === 4) {
|
369 |
-
denormalizedBoxes.push({
|
370 |
-
...b,
|
371 |
-
bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
|
372 |
-
});
|
373 |
-
}
|
374 |
-
}
|
375 |
-
} else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
|
376 |
-
// Single box
|
377 |
-
denormalizedBoxes.push({
|
378 |
-
...b,
|
379 |
-
bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
|
380 |
-
});
|
381 |
-
}
|
382 |
-
}
|
383 |
-
drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
|
384 |
-
}
|
385 |
-
};
|
386 |
-
draw();
|
387 |
-
const interval = setInterval(draw, 100);
|
388 |
-
// Redraw on window resize
|
389 |
-
const handleResize = () => draw();
|
390 |
-
window.addEventListener('resize', handleResize);
|
391 |
-
return () => {
|
392 |
-
clearInterval(interval);
|
393 |
-
window.removeEventListener('resize', handleResize);
|
394 |
-
};
|
395 |
-
}, [overlayVideoRef, canvasRef]);
|
396 |
-
|
397 |
-
// Drawing loop: draws the latest YOLOv8 boxes every frame
|
398 |
-
React.useEffect(() => {
|
399 |
-
let running = true;
|
400 |
-
function drawLoop() {
|
401 |
-
if (!running) return;
|
402 |
-
const overlayVideo = overlayVideoRef.current;
|
403 |
-
const canvas = canvasRef.current;
|
404 |
-
const processingVideo = processingVideoRef.current;
|
405 |
-
if (canvas && overlayVideo && processingVideo) {
|
406 |
-
// Set canvas size to match the visible video
|
407 |
-
canvas.width = overlayVideo.clientWidth;
|
408 |
-
canvas.height = overlayVideo.clientHeight;
|
409 |
-
const ctx = canvas.getContext('2d');
|
410 |
-
if (ctx) {
|
411 |
-
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
412 |
-
// Draw all YOLOv8 boxes from last detection
|
413 |
-
const yoloBoxes = lastYoloBoxesRef.current;
|
414 |
-
yoloBoxes.forEach((obj: any) => {
|
415 |
-
// Scale from YOLOv8 input size to canvas size
|
416 |
-
const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
|
417 |
-
const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
|
418 |
-
const [x1, y1, x2, y2] = obj.bbox;
|
419 |
-
const drawX = x1 * scaleX;
|
420 |
-
const drawY = y1 * scaleY;
|
421 |
-
const drawW = (x2 - x1) * scaleX;
|
422 |
-
const drawH = (y2 - y1) * scaleY;
|
423 |
-
ctx.strokeStyle = '#00FFFF';
|
424 |
-
ctx.lineWidth = 5;
|
425 |
-
ctx.strokeRect(drawX, drawY, drawW, drawH);
|
426 |
-
ctx.font = 'bold 22px Arial';
|
427 |
-
// Draw YOLOv8 label and confidence
|
428 |
-
const yoloLabel = obj.label || '';
|
429 |
-
const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
|
430 |
-
const yoloText = `${yoloLabel}${yoloScore}`;
|
431 |
-
ctx.fillStyle = 'rgba(0,0,0,0.7)';
|
432 |
-
const yoloTextWidth = ctx.measureText(yoloText).width + 8;
|
433 |
-
ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
|
434 |
-
ctx.fillStyle = '#00FFFF';
|
435 |
-
ctx.fillText(yoloText, drawX, drawY - 4);
|
436 |
-
// Draw FastVLM annotation below the box if available
|
437 |
-
if (hasAnnotation(obj)) {
|
438 |
-
ctx.font = 'bold 18px Arial';
|
439 |
-
ctx.fillStyle = 'rgba(0,0,0,0.7)';
|
440 |
-
const annTextWidth = ctx.measureText(obj.annotation).width + 8;
|
441 |
-
ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
|
442 |
-
ctx.fillStyle = '#00FFFF';
|
443 |
-
ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
|
444 |
-
}
|
445 |
-
});
|
446 |
-
}
|
447 |
-
}
|
448 |
-
requestAnimationFrame(drawLoop);
|
449 |
-
}
|
450 |
-
drawLoop();
|
451 |
-
return () => { running = false; };
|
452 |
-
}, [overlayVideoRef, canvasRef, processingVideoRef]);
|
453 |
-
|
454 |
-
// YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
|
455 |
-
const yoloDetectionLoop = async () => {
|
456 |
-
if (!processingLoopRef.current) return;
|
457 |
-
if (isYoloBusy) {
|
458 |
-
requestAnimationFrame(yoloDetectionLoop);
|
459 |
-
return;
|
460 |
-
}
|
461 |
-
isYoloBusy = true;
|
462 |
-
try {
|
463 |
-
const processingVideo = processingVideoRef.current;
|
464 |
-
if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
|
465 |
-
isYoloBusy = false;
|
466 |
-
requestAnimationFrame(yoloDetectionLoop);
|
467 |
-
return;
|
468 |
-
}
|
469 |
-
// Run YOLOv8 detection
|
470 |
-
const session = await loadYoloModel();
|
471 |
-
const inputTensor = preprocessFrameToTensor(processingVideo);
|
472 |
-
const feeds: Record<string, ort.Tensor> = {};
|
473 |
-
feeds[session.inputNames[0]] = inputTensor;
|
474 |
-
const results = await session.run(feeds);
|
475 |
-
const output = results[session.outputNames[0]];
|
476 |
-
const detections = postprocessYoloOutput(output);
|
477 |
-
lastYoloBoxesRef.current = detections;
|
478 |
-
// Run FastVLM on the full frame (wait for YOLOv8 to finish)
|
479 |
-
await runInference(processingVideo, prompt, (output: string) => {
|
480 |
-
setDebugOutput(output);
|
481 |
-
});
|
482 |
-
} catch (err) {
|
483 |
-
console.error('YOLOv8+FastVLM error:', err);
|
484 |
-
} finally {
|
485 |
-
isYoloBusy = false;
|
486 |
-
requestAnimationFrame(yoloDetectionLoop);
|
487 |
-
}
|
488 |
-
};
|
489 |
-
|
490 |
-
// Add this effect after the processing loop and toggle handlers
|
491 |
-
useEffect(() => {
|
492 |
-
// Stop processing loop on video source change or processing toggle
|
493 |
-
processingLoopRef.current = false;
|
494 |
-
// Start processing loop for the correct video after refs update
|
495 |
-
setTimeout(() => {
|
496 |
-
if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
|
497 |
-
processingLoopRef.current = true;
|
498 |
-
yoloDetectionLoop();
|
499 |
-
} else if (exampleProcessing && !uploadedFile) {
|
500 |
-
processingLoopRef.current = true;
|
501 |
-
yoloDetectionLoop();
|
502 |
-
}
|
503 |
-
}, 100);
|
504 |
-
// eslint-disable-next-line
|
505 |
-
}, [uploadedFile, videoProcessing, exampleProcessing]);
|
506 |
-
|
507 |
-
return (
|
508 |
-
<div className="absolute inset-0 text-white">
|
509 |
-
<div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
|
510 |
-
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
|
511 |
-
</div>
|
512 |
-
<div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
|
513 |
-
<div className="flex flex-col items-center justify-center h-full w-full">
|
514 |
-
{/* Mode Selector */}
|
515 |
-
<div className="mb-6">
|
516 |
-
<div className="flex space-x-4">
|
517 |
-
{MODES.map((m) => (
|
518 |
-
<button
|
519 |
-
key={m}
|
520 |
-
className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
|
521 |
-
mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
|
522 |
-
}`}
|
523 |
-
onClick={() => setMode(m)}
|
524 |
-
>
|
525 |
-
{m}
|
526 |
-
</button>
|
527 |
-
))}
|
528 |
-
</div>
|
529 |
-
</div>
|
530 |
-
|
531 |
-
{/* Mode Content */}
|
532 |
-
<div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
|
533 |
-
{mode === "File" && (
|
534 |
-
<div className="w-full text-center flex flex-col items-center">
|
535 |
-
<div className="mb-4 w-full max-w-xl">
|
536 |
-
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
|
537 |
-
<textarea
|
538 |
-
className="w-full p-2 rounded-lg text-black"
|
539 |
-
rows={3}
|
540 |
-
value={prompt}
|
541 |
-
onChange={(e) => setPrompt(e.target.value)}
|
542 |
-
/>
|
543 |
-
</div>
|
544 |
-
<div className="mb-4 w-full max-w-xl">
|
545 |
-
<input
|
546 |
-
type="file"
|
547 |
-
accept="image/*,video/*"
|
548 |
-
onChange={handleFileChange}
|
549 |
-
className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
|
550 |
-
/>
|
551 |
-
</div>
|
552 |
-
{/* Add toggle button above video area */}
|
553 |
-
<div className="mb-2 w-full max-w-xl flex justify-end">
|
554 |
-
<button
|
555 |
-
className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
|
556 |
-
onClick={() => setShowProcessingVideo(v => !v)}
|
557 |
-
type="button"
|
558 |
-
>
|
559 |
-
{showProcessingVideo ? 'Hide' : 'Show'} Processed Video
|
560 |
-
</button>
|
561 |
-
</div>
|
562 |
-
{/* Show uploaded image */}
|
563 |
-
{uploadedFile && isImageFile(uploadedFile) && (
|
564 |
-
<div className="relative w-full max-w-xl">
|
565 |
-
<img
|
566 |
-
ref={imageRef}
|
567 |
-
src={uploadedUrl}
|
568 |
-
alt="Uploaded"
|
569 |
-
className="w-full rounded-lg shadow-lg mb-2"
|
570 |
-
style={{ background: "#222" }}
|
571 |
-
/>
|
572 |
-
<canvas
|
573 |
-
ref={canvasRef}
|
574 |
-
className="absolute top-0 left-0 w-full h-full pointer-events-none"
|
575 |
-
style={{ zIndex: 10, pointerEvents: "none" }}
|
576 |
-
/>
|
577 |
-
<button
|
578 |
-
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
|
579 |
-
onClick={handleProcessImage}
|
580 |
-
disabled={processing}
|
581 |
-
>
|
582 |
-
{processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
|
583 |
-
</button>
|
584 |
-
</div>
|
585 |
-
)}
|
586 |
-
{/* Show uploaded video */}
|
587 |
-
{uploadedFile && isVideoFile(uploadedFile) && (
|
588 |
-
<div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
|
589 |
-
{/* Visible overlay video for user */}
|
590 |
-
<video
|
591 |
-
ref={overlayVideoRef}
|
592 |
-
src={uploadedUrl}
|
593 |
-
controls
|
594 |
-
autoPlay
|
595 |
-
loop
|
596 |
-
muted
|
597 |
-
playsInline
|
598 |
-
className="w-full rounded-lg shadow-lg mb-2"
|
599 |
-
style={{ background: "#222", display: "block" }}
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
canvasRef.current.
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
canvasRef.current.
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
|
623 |
-
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
|
630 |
-
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
{
|
645 |
-
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
>
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
|
|
|
|
|
|
|
|
717 |
}
|
|
|
1 |
+
import * as React from "react";
|
2 |
+
import { useState, useRef, useEffect } from "react";
|
3 |
+
import { useVLMContext } from "../context/useVLMContext";
|
4 |
+
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
5 |
+
|
6 |
+
const MODES = ["File"] as const;
|
7 |
+
type Mode = typeof MODES[number];
|
8 |
+
|
9 |
+
const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
|
10 |
+
const EXAMPLE_PROMPT = "Describe the video";
|
11 |
+
|
12 |
+
function isImageFile(file: File) {
|
13 |
+
return file.type.startsWith("image/");
|
14 |
+
}
|
15 |
+
function isVideoFile(file: File) {
|
16 |
+
return file.type.startsWith("video/");
|
17 |
+
}
|
18 |
+
|
19 |
+
function denormalizeBox(box: number[], width: number, height: number) {
|
20 |
+
// If all values are between 0 and 1, treat as normalized
|
21 |
+
if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
|
22 |
+
return [
|
23 |
+
box[0] * width,
|
24 |
+
box[1] * height,
|
25 |
+
box[2] * width,
|
26 |
+
box[3] * height
|
27 |
+
];
|
28 |
+
}
|
29 |
+
return box;
|
30 |
+
}
|
31 |
+
|
32 |
+
// Add this robust fallback parser near the top
|
33 |
+
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
|
34 |
+
// Try to parse as JSON first
|
35 |
+
try {
|
36 |
+
const parsed = JSON.parse(output);
|
37 |
+
if (Array.isArray(parsed)) {
|
38 |
+
const result: { label: string, bbox_2d: number[] }[] = [];
|
39 |
+
for (const obj of parsed) {
|
40 |
+
if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
|
41 |
+
if (Array.isArray(obj.bbox_2d[0])) {
|
42 |
+
for (const arr of obj.bbox_2d) {
|
43 |
+
if (Array.isArray(arr) && arr.length === 4) {
|
44 |
+
result.push({ label: obj.label, bbox_2d: arr });
|
45 |
+
}
|
46 |
+
}
|
47 |
+
} else if (obj.bbox_2d.length === 4) {
|
48 |
+
result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
if (result.length > 0) return result;
|
53 |
+
}
|
54 |
+
} catch (e) {}
|
55 |
+
// Fallback: extract all [x1, y1, x2, y2] arrays from the string
|
56 |
+
const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
|
57 |
+
const boxes: { label: string, bbox_2d: number[] }[] = [];
|
58 |
+
let match;
|
59 |
+
while ((match = boxRegex.exec(output)) !== null) {
|
60 |
+
const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
|
61 |
+
boxes.push({ label: '', bbox_2d: arr });
|
62 |
+
}
|
63 |
+
return boxes;
|
64 |
+
}
|
65 |
+
|
66 |
+
// NOTE: You must install onnxruntime-web:
|
67 |
+
// npm install onnxruntime-web
|
68 |
+
// @ts-ignore
|
69 |
+
import * as ort from 'onnxruntime-web';
|
70 |
+
// If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
|
71 |
+
|
72 |
+
// Set your YOLOv8 ONNX model URL here:
|
73 |
+
const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE
|
74 |
+
|
75 |
+
// Add these constants to match the YOLOv8 input size
|
76 |
+
const YOLOV8_INPUT_WIDTH = 640;
|
77 |
+
const YOLOV8_INPUT_HEIGHT = 480;
|
78 |
+
|
79 |
+
// 1. Load the ONNX model once
|
80 |
+
let yoloSession: ort.InferenceSession | null = null;
|
81 |
+
// Add a busy flag to prevent concurrent YOLOv8 inferences
|
82 |
+
let isYoloBusy = false;
|
83 |
+
async function loadYoloModel() {
|
84 |
+
if (!yoloSession) {
|
85 |
+
yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
|
86 |
+
}
|
87 |
+
return yoloSession;
|
88 |
+
}
|
89 |
+
|
90 |
+
// COCO class names for YOLOv8. The array index IS the class id emitted by
// the model, so the order of this list must not be changed.
const YOLO_CLASSES: string[] = [
  "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
  "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
  "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
];
|
101 |
+
|
102 |
+
// Preprocess video frame to YOLOv8 input tensor [1,3,640,640]
|
103 |
+
function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
|
104 |
+
const width = 640;
|
105 |
+
const height = 480;
|
106 |
+
const canvas = document.createElement('canvas');
|
107 |
+
canvas.width = width;
|
108 |
+
canvas.height = height;
|
109 |
+
const ctx = canvas.getContext('2d');
|
110 |
+
if (!ctx) throw new Error('Could not get 2D context');
|
111 |
+
ctx.drawImage(video, 0, 0, width, height);
|
112 |
+
const imageData = ctx.getImageData(0, 0, width, height);
|
113 |
+
const { data } = imageData;
|
114 |
+
// Convert to Float32Array [1,3,480,640], normalize to [0,1]
|
115 |
+
const floatData = new Float32Array(1 * 3 * height * width);
|
116 |
+
for (let i = 0; i < width * height; i++) {
|
117 |
+
floatData[i] = data[i * 4] / 255; // R
|
118 |
+
floatData[i + width * height] = data[i * 4 + 1] / 255; // G
|
119 |
+
floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
|
120 |
+
}
|
121 |
+
return new ort.Tensor('float32', floatData, [1, 3, height, width]);
|
122 |
+
}
|
123 |
+
|
124 |
+
// Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
|
125 |
+
function postprocessYoloOutput(output: ort.Tensor) {
|
126 |
+
// output.dims: [1, num_detections, 6]
|
127 |
+
const data = output.data;
|
128 |
+
const numDetections = output.dims[1];
|
129 |
+
const results = [];
|
130 |
+
for (let i = 0; i < numDetections; i++) {
|
131 |
+
const offset = i * 6;
|
132 |
+
const x1 = data[offset];
|
133 |
+
const y1 = data[offset + 1];
|
134 |
+
const x2 = data[offset + 2];
|
135 |
+
const y2 = data[offset + 3];
|
136 |
+
const score = data[offset + 4];
|
137 |
+
const classId = data[offset + 5];
|
138 |
+
if (score < 0.2) continue; // adjust threshold as needed
|
139 |
+
results.push({
|
140 |
+
bbox: [x1, y1, x2, y2],
|
141 |
+
label: YOLO_CLASSES[classId] || `class_${classId}`,
|
142 |
+
score
|
143 |
+
});
|
144 |
+
}
|
145 |
+
return results;
|
146 |
+
}
|
147 |
+
|
148 |
+
// Helper type guard for annotation
|
149 |
+
function hasAnnotation(obj: any): obj is { annotation: string } {
|
150 |
+
return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
|
151 |
+
}
|
152 |
+
|
153 |
+
export default function MultiSourceCaptioningView() {
  // UI mode (currently only "File" exists).
  const [mode, setMode] = useState<Mode>("File");
  const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  // Prompt forwarded to FastVLM for full-frame captioning.
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  // Uploaded media and its blob: URL (empty string when nothing uploaded).
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
  // Start/stop flags for the uploaded-video and example-video loops.
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  // Raw FastVLM output shown in the debug panel.
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  // Whether the hidden processing <video> is revealed below the overlay.
  const [showProcessingVideo, setShowProcessingVideo] = useState(false);

  // NOTE(review): videoRef is never attached to any element in this
  // component's JSX (only overlayVideoRef / processingVideoRef are), so
  // effects that read videoRef.current always see null — confirm intent.
  const videoRef = useRef<HTMLVideoElement | null>(null);
  // Visible video the user watches; the canvas overlay sits on top of it.
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  // Hidden (optionally shown) video used as the inference frame source.
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  // History of prompt-derived boxes with timestamps (2s TTL in the drawer).
  // NOTE(review): nothing in this component ever writes to boxHistoryRef.
  const boxHistoryRef = useRef<any[]>([]);
  // Latest YOLOv8 detections (optionally carrying a FastVLM `annotation`).
  const lastYoloBoxesRef = React.useRef<any[]>([]);
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Single gate for all processing loops: set false to stop them.
  const processingLoopRef = React.useRef(false);
|
183 |
+
|
184 |
+
const processVideoLoop = async () => {
|
185 |
+
if (!processingLoopRef.current) return;
|
186 |
+
if (isYoloBusy) {
|
187 |
+
// Optionally log: "Inference already running, skipping frame"
|
188 |
+
requestAnimationFrame(processVideoLoop);
|
189 |
+
return;
|
190 |
+
}
|
191 |
+
await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
|
192 |
+
// Schedule the next frame as soon as possible
|
193 |
+
requestAnimationFrame(processVideoLoop);
|
194 |
+
};
|
195 |
+
const processExampleLoop = async () => {
|
196 |
+
while (processingLoopRef.current) {
|
197 |
+
await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
|
198 |
+
await new Promise(res => setTimeout(res, 1000));
|
199 |
+
}
|
200 |
+
};
|
201 |
+
|
202 |
+
  // Set your YOLOv8 ONNX backend API endpoint here:
  // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE

  // Keep the overlay video's playback state and clock in lock-step with a
  // "main" video element.
  // NOTE(review): videoRef is never attached to an element in the JSX, so
  // `main` is always null and this effect currently returns early — it is
  // dead code unless videoRef gets wired up.
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    // Sync play/pause
    const onPlay = () => { if (overlay.paused) overlay.play(); };
    const onPause = () => { if (!overlay.paused) overlay.pause(); };
    // Sync seeking and time (only correct drift beyond 50ms to avoid
    // fighting the overlay's own playback).
    const onSeekOrTime = () => {
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
        overlay.currentTime = main.currentTime;
      }
    };
    main.addEventListener('play', onPlay);
    main.addEventListener('pause', onPause);
    main.addEventListener('seeked', onSeekOrTime);
    main.addEventListener('timeupdate', onSeekOrTime);
    // Clean up
    return () => {
      main.removeEventListener('play', onPlay);
      main.removeEventListener('pause', onPause);
      main.removeEventListener('seeked', onSeekOrTime);
      main.removeEventListener('timeupdate', onSeekOrTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
|
231 |
+
|
232 |
+
useEffect(() => {
|
233 |
+
if ((mode === "File") && processingVideoRef.current) {
|
234 |
+
processingVideoRef.current.play().catch(() => {});
|
235 |
+
}
|
236 |
+
}, [mode, videoUrl, uploadedUrl]);
|
237 |
+
|
238 |
+
// Remove old prompt-based box extraction logic and only use the above for video frames.
|
239 |
+
|
240 |
+
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
|
241 |
+
const file = e.target.files?.[0] || null;
|
242 |
+
setUploadedFile(file);
|
243 |
+
setUploadedUrl(file ? URL.createObjectURL(file) : "");
|
244 |
+
setError(null);
|
245 |
+
setImageProcessed(false);
|
246 |
+
setVideoProcessing(false);
|
247 |
+
setExampleProcessing(false);
|
248 |
+
};
|
249 |
+
|
250 |
+
  // File mode: once the model is loaded and an uploaded VIDEO is flagged
  // for processing, start the detection loop. (Previous comment said
  // "Webcam mode … setInterval" — there is no webcam mode and no interval.)
  // NOTE(review): this does not set processingLoopRef.current = true itself;
  // it relies on the toggle handler / restart effect having done so,
  // otherwise processVideoLoop returns immediately.
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    processVideoLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
|
255 |
+
|
256 |
+
  // Example-video mode: when no file is uploaded and processing is enabled,
  // start the detection loop on the sample video.
  // NOTE(review): as above, processingLoopRef.current must already be true
  // for processExampleLoop to do anything.
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    processExampleLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
|
261 |
+
|
262 |
+
  // File mode: run FastVLM on the uploaded still image (triggered only by
  // the "Process Image" button) and draw any bounding boxes it returns
  // onto the canvas overlaying the image.
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    // Match the canvas to the image's intrinsic size so box coordinates in
    // image pixels map 1:1 onto the canvas.
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({w:canvas.width,h:canvas.height});
    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      // Redraw the image first so stale boxes from a previous run are gone.
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      let boxes = extractAllBoundingBoxes(output);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // Both ratios are 1 here (canvas was sized to naturalWidth/Height
        // above); kept for robustness if the sizing ever changes.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };
|
299 |
+
|
300 |
+
// File mode: process uploaded video frames (start/stop)
|
301 |
+
const handleToggleVideoProcessing = () => {
|
302 |
+
setVideoProcessing((prev: boolean) => {
|
303 |
+
const next = !prev;
|
304 |
+
// Always stop all loops before starting
|
305 |
+
processingLoopRef.current = false;
|
306 |
+
setTimeout(() => {
|
307 |
+
if (next) {
|
308 |
+
processingLoopRef.current = true;
|
309 |
+
processVideoLoop();
|
310 |
+
}
|
311 |
+
}, 50);
|
312 |
+
return next;
|
313 |
+
});
|
314 |
+
};
|
315 |
+
|
316 |
+
// Handle start/stop for example video processing
|
317 |
+
const handleToggleExampleProcessing = () => {
|
318 |
+
setExampleProcessing((prev: boolean) => {
|
319 |
+
const next = !prev;
|
320 |
+
// Always stop all loops before starting
|
321 |
+
processingLoopRef.current = false;
|
322 |
+
setTimeout(() => {
|
323 |
+
if (next) {
|
324 |
+
processingLoopRef.current = true;
|
325 |
+
processVideoLoop();
|
326 |
+
}
|
327 |
+
}, 50);
|
328 |
+
return next;
|
329 |
+
});
|
330 |
+
};
|
331 |
+
|
332 |
+
// Test draw box function
|
333 |
+
const handleTestDrawBox = () => {
|
334 |
+
if (!canvasRef.current) return;
|
335 |
+
const canvas = canvasRef.current;
|
336 |
+
const ctx = canvas.getContext("2d");
|
337 |
+
if (!ctx) return;
|
338 |
+
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
339 |
+
ctx.strokeStyle = "#FF00FF";
|
340 |
+
ctx.lineWidth = 4;
|
341 |
+
ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
|
342 |
+
ctx.font = "20px Arial";
|
343 |
+
ctx.fillStyle = "#FF00FF";
|
344 |
+
ctx.fillText("Test Box", 50, 35);
|
345 |
+
};
|
346 |
+
|
347 |
+
  // Interval-based drawer for prompt-derived boxes kept in boxHistoryRef
  // (entries older than 2s are dropped). Boxes in [0,1] are denormalized to
  // the overlay's display size.
  // NOTE(review): boxHistoryRef is never written anywhere in this component,
  // so this drawer currently renders nothing; it also resizes/clears the
  // same canvas as the rAF drawLoop below — the two drawers fight if both
  // ever have content.
  useEffect(() => {
    const draw = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      const displayWidth = overlayVideo.clientWidth;
      const displayHeight = overlayVideo.clientHeight;
      canvas.width = displayWidth;
      canvas.height = displayHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const now = Date.now();
      // Keep only boxes younger than 2 seconds.
      const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
      if (boxHistory.length > 0) {
        // Draw all boxes, even if bbox_2d is an array of arrays.
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes per label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({
                  ...b,
                  bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
                });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({
              ...b,
              bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
            });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    // Redraw on window resize
    const handleResize = () => draw();
    window.addEventListener('resize', handleResize);
    return () => {
      clearInterval(interval);
      window.removeEventListener('resize', handleResize);
    };
  }, [overlayVideoRef, canvasRef]);
|
396 |
+
|
397 |
+
  // requestAnimationFrame drawing loop: every frame, repaints the latest
  // YOLOv8 detections (from lastYoloBoxesRef) onto the overlay canvas,
  // scaling from model-input coordinates (640x480) to the displayed size.
  React.useEffect(() => {
    let running = true; // cleared on unmount to break the rAF chain
    function drawLoop() {
      if (!running) return;
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      const processingVideo = processingVideoRef.current;
      if (canvas && overlayVideo && processingVideo) {
        // Set canvas size to match the visible video (also clears it).
        canvas.width = overlayVideo.clientWidth;
        canvas.height = overlayVideo.clientHeight;
        const ctx = canvas.getContext('2d');
        if (ctx) {
          ctx.clearRect(0, 0, canvas.width, canvas.height);
          // Draw all YOLOv8 boxes from the last detection pass.
          const yoloBoxes = lastYoloBoxesRef.current;
          yoloBoxes.forEach((obj: any) => {
            // Scale from YOLOv8 input size to canvas size.
            const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
            const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
            const [x1, y1, x2, y2] = obj.bbox;
            const drawX = x1 * scaleX;
            const drawY = y1 * scaleY;
            const drawW = (x2 - x1) * scaleX;
            const drawH = (y2 - y1) * scaleY;
            ctx.strokeStyle = '#00FFFF';
            ctx.lineWidth = 5;
            ctx.strokeRect(drawX, drawY, drawW, drawH);
            ctx.font = 'bold 22px Arial';
            // Label + confidence on a dark background above the box.
            const yoloLabel = obj.label || '';
            const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
            const yoloText = `${yoloLabel}${yoloScore}`;
            ctx.fillStyle = 'rgba(0,0,0,0.7)';
            const yoloTextWidth = ctx.measureText(yoloText).width + 8;
            ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
            ctx.fillStyle = '#00FFFF';
            ctx.fillText(yoloText, drawX, drawY - 4);
            // FastVLM annotation (when attached) below the box.
            if (hasAnnotation(obj)) {
              ctx.font = 'bold 18px Arial';
              ctx.fillStyle = 'rgba(0,0,0,0.7)';
              const annTextWidth = ctx.measureText(obj.annotation).width + 8;
              ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
              ctx.fillStyle = '#00FFFF';
              ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
            }
          });
        }
      }
      requestAnimationFrame(drawLoop);
    }
    drawLoop();
    return () => { running = false; };
  }, [overlayVideoRef, canvasRef, processingVideoRef]);
|
453 |
+
|
454 |
+
// YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
|
455 |
+
const yoloDetectionLoop = async () => {
|
456 |
+
if (!processingLoopRef.current) return;
|
457 |
+
if (isYoloBusy) {
|
458 |
+
requestAnimationFrame(yoloDetectionLoop);
|
459 |
+
return;
|
460 |
+
}
|
461 |
+
isYoloBusy = true;
|
462 |
+
try {
|
463 |
+
const processingVideo = processingVideoRef.current;
|
464 |
+
if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
|
465 |
+
isYoloBusy = false;
|
466 |
+
requestAnimationFrame(yoloDetectionLoop);
|
467 |
+
return;
|
468 |
+
}
|
469 |
+
// Run YOLOv8 detection
|
470 |
+
const session = await loadYoloModel();
|
471 |
+
const inputTensor = preprocessFrameToTensor(processingVideo);
|
472 |
+
const feeds: Record<string, ort.Tensor> = {};
|
473 |
+
feeds[session.inputNames[0]] = inputTensor;
|
474 |
+
const results = await session.run(feeds);
|
475 |
+
const output = results[session.outputNames[0]];
|
476 |
+
const detections = postprocessYoloOutput(output);
|
477 |
+
lastYoloBoxesRef.current = detections;
|
478 |
+
// Run FastVLM on the full frame (wait for YOLOv8 to finish)
|
479 |
+
await runInference(processingVideo, prompt, (output: string) => {
|
480 |
+
setDebugOutput(output);
|
481 |
+
});
|
482 |
+
} catch (err) {
|
483 |
+
console.error('YOLOv8+FastVLM error:', err);
|
484 |
+
} finally {
|
485 |
+
isYoloBusy = false;
|
486 |
+
requestAnimationFrame(yoloDetectionLoop);
|
487 |
+
}
|
488 |
+
};
|
489 |
+
|
490 |
+
  // Restart effect: whenever the video source or either processing flag
  // changes, stop the current loop, wait 100ms for refs to settle, then
  // start the loop appropriate for the active source.
  useEffect(() => {
    // Stop processing loop on video source change or processing toggle
    processingLoopRef.current = false;
    // Start processing loop for the correct video after refs update
    setTimeout(() => {
      if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      } else if (exampleProcessing && !uploadedFile) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      }
    }, 100);
    // eslint-disable-next-line
  }, [uploadedFile, videoProcessing, exampleProcessing]);
|
506 |
+
|
507 |
+
  return (
    <div className="absolute inset-0 text-white">
      {/* Model load status banner (fixed to top of viewport) */}
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector (MODES currently contains only "File") */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Toggle button for revealing the hidden processing video */}
              <div className="mb-2 w-full max-w-xl flex justify-end">
                <button
                  className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
                  onClick={() => setShowProcessingVideo(v => !v)}
                  type="button"
                >
                  {showProcessingVideo ? 'Hide' : 'Show'} Processed Video
                </button>
              </div>
              {/* Uploaded IMAGE branch. NOTE: the three branches below are
                  mutually exclusive, so canvasRef / overlayVideoRef are only
                  ever attached to one element at a time. */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Uploaded VIDEO branch */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for the user */}
                  <video
                    ref={overlayVideoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                    onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
                      if (canvasRef.current) {
                        canvasRef.current.width = e.currentTarget.clientWidth;
                        canvasRef.current.height = e.currentTarget.clientHeight;
                      }
                    }}
                    onResize={() => {
                      if (canvasRef.current && overlayVideoRef.current) {
                        canvasRef.current.width = overlayVideoRef.current.clientWidth;
                        canvasRef.current.height = overlayVideoRef.current.clientHeight;
                      }
                    }}
                  />
                  {/* Canvas overlay painted by the rAF draw loop */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden (or optionally shown) processing video: the frame
                      source for YOLOv8/FastVLM */}
                  <video
                    ref={processingVideoRef}
                    src={uploadedUrl}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* EXAMPLE video branch (no file uploaded) */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for the user */}
                  <video
                    ref={overlayVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                  />
                  {/* Canvas overlay painted by the rAF draw loop */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden (or optionally shown) processing video */}
                  <video
                    ref={processingVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: dimensions + raw FastVLM output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}