Quazim0t0 committed on
Commit
b7c497f
·
verified ·
1 Parent(s): 67f9568

Upload 51 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,717 +1,721 @@
1
- import * as React from "react";
2
- import { useState, useRef, useEffect } from "react";
3
- import { useVLMContext } from "../context/useVLMContext";
4
- import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
5
-
6
- const MODES = ["File"] as const;
7
- type Mode = typeof MODES[number];
8
-
9
- const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
10
- const EXAMPLE_PROMPT = "Describe the video";
11
-
12
- function isImageFile(file: File) {
13
- return file.type.startsWith("image/");
14
- }
15
- function isVideoFile(file: File) {
16
- return file.type.startsWith("video/");
17
- }
18
-
19
- function denormalizeBox(box: number[], width: number, height: number) {
20
- // If all values are between 0 and 1, treat as normalized
21
- if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
22
- return [
23
- box[0] * width,
24
- box[1] * height,
25
- box[2] * width,
26
- box[3] * height
27
- ];
28
- }
29
- return box;
30
- }
31
-
32
- // Add this robust fallback parser near the top
33
- function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
34
- // Try to parse as JSON first
35
- try {
36
- const parsed = JSON.parse(output);
37
- if (Array.isArray(parsed)) {
38
- const result: { label: string, bbox_2d: number[] }[] = [];
39
- for (const obj of parsed) {
40
- if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
41
- if (Array.isArray(obj.bbox_2d[0])) {
42
- for (const arr of obj.bbox_2d) {
43
- if (Array.isArray(arr) && arr.length === 4) {
44
- result.push({ label: obj.label, bbox_2d: arr });
45
- }
46
- }
47
- } else if (obj.bbox_2d.length === 4) {
48
- result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
49
- }
50
- }
51
- }
52
- if (result.length > 0) return result;
53
- }
54
- } catch (e) {}
55
- // Fallback: extract all [x1, y1, x2, y2] arrays from the string
56
- const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
57
- const boxes: { label: string, bbox_2d: number[] }[] = [];
58
- let match;
59
- while ((match = boxRegex.exec(output)) !== null) {
60
- const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
61
- boxes.push({ label: '', bbox_2d: arr });
62
- }
63
- return boxes;
64
- }
65
-
66
- // NOTE: You must install onnxruntime-web:
67
- // npm install onnxruntime-web
68
- // @ts-ignore
69
- import * as ort from 'onnxruntime-web';
70
- // If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
71
-
72
- // Set your YOLOv8 ONNX model URL here:
73
- const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE
74
-
75
- // Add these constants to match the YOLOv8 input size
76
- const YOLOV8_INPUT_WIDTH = 640;
77
- const YOLOV8_INPUT_HEIGHT = 480;
78
-
79
- // 1. Load the ONNX model once
80
- let yoloSession: ort.InferenceSession | null = null;
81
- // Add a busy flag to prevent concurrent YOLOv8 inferences
82
- let isYoloBusy = false;
83
- async function loadYoloModel() {
84
- if (!yoloSession) {
85
- yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
86
- }
87
- return yoloSession;
88
- }
89
-
90
- // COCO class names for YOLOv8
91
- const YOLO_CLASSES: string[] = [
92
- "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
93
- "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
94
- "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
95
- "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
96
- "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
97
- "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
98
- "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
99
- "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
100
- ];
101
-
102
- // Preprocess video frame to YOLOv8 input tensor [1,3,640,640]
103
- function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
104
- const width = 640;
105
- const height = 480;
106
- const canvas = document.createElement('canvas');
107
- canvas.width = width;
108
- canvas.height = height;
109
- const ctx = canvas.getContext('2d');
110
- if (!ctx) throw new Error('Could not get 2D context');
111
- ctx.drawImage(video, 0, 0, width, height);
112
- const imageData = ctx.getImageData(0, 0, width, height);
113
- const { data } = imageData;
114
- // Convert to Float32Array [1,3,480,640], normalize to [0,1]
115
- const floatData = new Float32Array(1 * 3 * height * width);
116
- for (let i = 0; i < width * height; i++) {
117
- floatData[i] = data[i * 4] / 255; // R
118
- floatData[i + width * height] = data[i * 4 + 1] / 255; // G
119
- floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
120
- }
121
- return new ort.Tensor('float32', floatData, [1, 3, height, width]);
122
- }
123
-
124
- // Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
125
- function postprocessYoloOutput(output: ort.Tensor) {
126
- // output.dims: [1, num_detections, 6]
127
- const data = output.data;
128
- const numDetections = output.dims[1];
129
- const results = [];
130
- for (let i = 0; i < numDetections; i++) {
131
- const offset = i * 6;
132
- const x1 = data[offset];
133
- const y1 = data[offset + 1];
134
- const x2 = data[offset + 2];
135
- const y2 = data[offset + 3];
136
- const score = data[offset + 4];
137
- const classId = data[offset + 5];
138
- if (score < 0.2) continue; // adjust threshold as needed
139
- results.push({
140
- bbox: [x1, y1, x2, y2],
141
- label: YOLO_CLASSES[classId] || `class_${classId}`,
142
- score
143
- });
144
- }
145
- return results;
146
- }
147
-
148
- // Helper type guard for annotation
149
- function hasAnnotation(obj: any): obj is { annotation: string } {
150
- return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
151
- }
152
-
153
- export default function MultiSourceCaptioningView() {
154
- const [mode, setMode] = useState<Mode>("File");
155
- const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
156
- const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
157
- const [processing, setProcessing] = useState(false);
158
- const [error, setError] = useState<string | null>(null);
159
- const [uploadedFile, setUploadedFile] = useState<File | null>(null);
160
- const [uploadedUrl, setUploadedUrl] = useState<string>("");
161
- const [videoProcessing, setVideoProcessing] = useState(false);
162
- const [imageProcessed, setImageProcessed] = useState(false);
163
- const [exampleProcessing, setExampleProcessing] = useState(false);
164
- const [debugOutput, setDebugOutput] = useState<string>("");
165
- const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
166
- const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
167
- const [inferenceStatus, setInferenceStatus] = useState<string>("");
168
- const [showProcessingVideo, setShowProcessingVideo] = useState(false);
169
-
170
- const videoRef = useRef<HTMLVideoElement | null>(null);
171
- const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
172
- const processingVideoRef = useRef<HTMLVideoElement | null>(null);
173
- const canvasRef = useRef<HTMLCanvasElement | null>(null);
174
- const imageRef = useRef<HTMLImageElement | null>(null);
175
- const boxHistoryRef = useRef<any[]>([]);
176
- // Add a ref to store the latest YOLOv8 results (with optional FastVLM annotation)
177
- const lastYoloBoxesRef = React.useRef<any[]>([]);
178
- const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
179
-
180
- // Remove videoProcessingRef and exampleProcessingRef
181
- // Add a single processingLoopRef
182
- const processingLoopRef = React.useRef(false);
183
-
184
- const processVideoLoop = async () => {
185
- if (!processingLoopRef.current) return;
186
- if (isYoloBusy) {
187
- // Optionally log: "Inference already running, skipping frame"
188
- requestAnimationFrame(processVideoLoop);
189
- return;
190
- }
191
- await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
192
- // Schedule the next frame as soon as possible
193
- requestAnimationFrame(processVideoLoop);
194
- };
195
- const processExampleLoop = async () => {
196
- while (processingLoopRef.current) {
197
- await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
198
- await new Promise(res => setTimeout(res, 1000));
199
- }
200
- };
201
-
202
- // Set your YOLOv8 ONNX backend API endpoint here:
203
- // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE
204
-
205
- // Add this useEffect for overlay video synchronization
206
- useEffect(() => {
207
- const main = videoRef.current;
208
- const overlay = overlayVideoRef.current;
209
- if (!main || !overlay) return;
210
- // Sync play/pause
211
- const onPlay = () => { if (overlay.paused) overlay.play(); };
212
- const onPause = () => { if (!overlay.paused) overlay.pause(); };
213
- // Sync seeking and time
214
- const onSeekOrTime = () => {
215
- if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
216
- overlay.currentTime = main.currentTime;
217
- }
218
- };
219
- main.addEventListener('play', onPlay);
220
- main.addEventListener('pause', onPause);
221
- main.addEventListener('seeked', onSeekOrTime);
222
- main.addEventListener('timeupdate', onSeekOrTime);
223
- // Clean up
224
- return () => {
225
- main.removeEventListener('play', onPlay);
226
- main.removeEventListener('pause', onPause);
227
- main.removeEventListener('seeked', onSeekOrTime);
228
- main.removeEventListener('timeupdate', onSeekOrTime);
229
- };
230
- }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
231
-
232
- useEffect(() => {
233
- if ((mode === "File") && processingVideoRef.current) {
234
- processingVideoRef.current.play().catch(() => {});
235
- }
236
- }, [mode, videoUrl, uploadedUrl]);
237
-
238
- // Remove old prompt-based box extraction logic and only use the above for video frames.
239
-
240
- const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
241
- const file = e.target.files?.[0] || null;
242
- setUploadedFile(file);
243
- setUploadedUrl(file ? URL.createObjectURL(file) : "");
244
- setError(null);
245
- setImageProcessed(false);
246
- setVideoProcessing(false);
247
- setExampleProcessing(false);
248
- };
249
-
250
- // Webcam mode: process frames with setInterval
251
- useEffect(() => {
252
- if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
253
- processVideoLoop();
254
- }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
255
-
256
- // Example video mode: process frames with setInterval
257
- useEffect(() => {
258
- if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
259
- processExampleLoop();
260
- }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
261
-
262
- // File mode: process uploaded image (only on button click)
263
- const handleProcessImage = async () => {
264
- if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
265
- const img = imageRef.current;
266
- const canvas = canvasRef.current;
267
- canvas.width = img.naturalWidth;
268
- canvas.height = img.naturalHeight;
269
- setCanvasDims({w:canvas.width,h:canvas.height});
270
- setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
271
- const ctx = canvas.getContext("2d");
272
- if (!ctx) return;
273
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
274
- setProcessing(true);
275
- setError(null);
276
- setInferenceStatus("Running inference...");
277
- await runInference(img, prompt, (output: string) => {
278
- setDebugOutput(output);
279
- setInferenceStatus("Inference complete.");
280
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
281
- let boxes = extractAllBoundingBoxes(output);
282
- console.log("Model output:", output);
283
- console.log("Boxes after normalization:", boxes);
284
- console.log("Canvas size:", canvas.width, canvas.height);
285
- if (boxes.length > 0) {
286
- const [x1, y1, x2, y2] = boxes[0].bbox_2d;
287
- console.log("First box coords:", x1, y1, x2, y2);
288
- }
289
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
290
- if (Array.isArray(boxes) && boxes.length > 0) {
291
- const scaleX = canvas.width / img.naturalWidth;
292
- const scaleY = canvas.height / img.naturalHeight;
293
- drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
294
- }
295
- setImageProcessed(true);
296
- });
297
- setProcessing(false);
298
- };
299
-
300
- // File mode: process uploaded video frames (start/stop)
301
- const handleToggleVideoProcessing = () => {
302
- setVideoProcessing((prev: boolean) => {
303
- const next = !prev;
304
- // Always stop all loops before starting
305
- processingLoopRef.current = false;
306
- setTimeout(() => {
307
- if (next) {
308
- processingLoopRef.current = true;
309
- processVideoLoop();
310
- }
311
- }, 50);
312
- return next;
313
- });
314
- };
315
-
316
- // Handle start/stop for example video processing
317
- const handleToggleExampleProcessing = () => {
318
- setExampleProcessing((prev: boolean) => {
319
- const next = !prev;
320
- // Always stop all loops before starting
321
- processingLoopRef.current = false;
322
- setTimeout(() => {
323
- if (next) {
324
- processingLoopRef.current = true;
325
- processVideoLoop();
326
- }
327
- }, 50);
328
- return next;
329
- });
330
- };
331
-
332
- // Test draw box function
333
- const handleTestDrawBox = () => {
334
- if (!canvasRef.current) return;
335
- const canvas = canvasRef.current;
336
- const ctx = canvas.getContext("2d");
337
- if (!ctx) return;
338
- ctx.clearRect(0, 0, canvas.width, canvas.height);
339
- ctx.strokeStyle = "#FF00FF";
340
- ctx.lineWidth = 4;
341
- ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
342
- ctx.font = "20px Arial";
343
- ctx.fillStyle = "#FF00FF";
344
- ctx.fillText("Test Box", 50, 35);
345
- };
346
-
347
- useEffect(() => {
348
- const draw = () => {
349
- const overlayVideo = overlayVideoRef.current;
350
- const canvas = canvasRef.current;
351
- if (!overlayVideo || !canvas) return;
352
- const displayWidth = overlayVideo.clientWidth;
353
- const displayHeight = overlayVideo.clientHeight;
354
- canvas.width = displayWidth;
355
- canvas.height = displayHeight;
356
- const ctx = canvas.getContext("2d");
357
- if (!ctx) return;
358
- ctx.clearRect(0, 0, canvas.width, canvas.height);
359
- const now = Date.now();
360
- const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
361
- if (boxHistory.length > 0) {
362
- // Fix: Draw all boxes, even if bbox_2d is an array of arrays
363
- const denormalizedBoxes: any[] = [];
364
- for (const b of boxHistory) {
365
- if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
366
- // Multiple boxes per label
367
- for (const arr of b.bbox_2d) {
368
- if (Array.isArray(arr) && arr.length === 4) {
369
- denormalizedBoxes.push({
370
- ...b,
371
- bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
372
- });
373
- }
374
- }
375
- } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
376
- // Single box
377
- denormalizedBoxes.push({
378
- ...b,
379
- bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
380
- });
381
- }
382
- }
383
- drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
384
- }
385
- };
386
- draw();
387
- const interval = setInterval(draw, 100);
388
- // Redraw on window resize
389
- const handleResize = () => draw();
390
- window.addEventListener('resize', handleResize);
391
- return () => {
392
- clearInterval(interval);
393
- window.removeEventListener('resize', handleResize);
394
- };
395
- }, [overlayVideoRef, canvasRef]);
396
-
397
- // Drawing loop: draws the latest YOLOv8 boxes every frame
398
- React.useEffect(() => {
399
- let running = true;
400
- function drawLoop() {
401
- if (!running) return;
402
- const overlayVideo = overlayVideoRef.current;
403
- const canvas = canvasRef.current;
404
- const processingVideo = processingVideoRef.current;
405
- if (canvas && overlayVideo && processingVideo) {
406
- // Set canvas size to match the visible video
407
- canvas.width = overlayVideo.clientWidth;
408
- canvas.height = overlayVideo.clientHeight;
409
- const ctx = canvas.getContext('2d');
410
- if (ctx) {
411
- ctx.clearRect(0, 0, canvas.width, canvas.height);
412
- // Draw all YOLOv8 boxes from last detection
413
- const yoloBoxes = lastYoloBoxesRef.current;
414
- yoloBoxes.forEach((obj: any) => {
415
- // Scale from YOLOv8 input size to canvas size
416
- const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
417
- const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
418
- const [x1, y1, x2, y2] = obj.bbox;
419
- const drawX = x1 * scaleX;
420
- const drawY = y1 * scaleY;
421
- const drawW = (x2 - x1) * scaleX;
422
- const drawH = (y2 - y1) * scaleY;
423
- ctx.strokeStyle = '#00FFFF';
424
- ctx.lineWidth = 5;
425
- ctx.strokeRect(drawX, drawY, drawW, drawH);
426
- ctx.font = 'bold 22px Arial';
427
- // Draw YOLOv8 label and confidence
428
- const yoloLabel = obj.label || '';
429
- const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
430
- const yoloText = `${yoloLabel}${yoloScore}`;
431
- ctx.fillStyle = 'rgba(0,0,0,0.7)';
432
- const yoloTextWidth = ctx.measureText(yoloText).width + 8;
433
- ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
434
- ctx.fillStyle = '#00FFFF';
435
- ctx.fillText(yoloText, drawX, drawY - 4);
436
- // Draw FastVLM annotation below the box if available
437
- if (hasAnnotation(obj)) {
438
- ctx.font = 'bold 18px Arial';
439
- ctx.fillStyle = 'rgba(0,0,0,0.7)';
440
- const annTextWidth = ctx.measureText(obj.annotation).width + 8;
441
- ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
442
- ctx.fillStyle = '#00FFFF';
443
- ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
444
- }
445
- });
446
- }
447
- }
448
- requestAnimationFrame(drawLoop);
449
- }
450
- drawLoop();
451
- return () => { running = false; };
452
- }, [overlayVideoRef, canvasRef, processingVideoRef]);
453
-
454
- // YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
455
- const yoloDetectionLoop = async () => {
456
- if (!processingLoopRef.current) return;
457
- if (isYoloBusy) {
458
- requestAnimationFrame(yoloDetectionLoop);
459
- return;
460
- }
461
- isYoloBusy = true;
462
- try {
463
- const processingVideo = processingVideoRef.current;
464
- if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
465
- isYoloBusy = false;
466
- requestAnimationFrame(yoloDetectionLoop);
467
- return;
468
- }
469
- // Run YOLOv8 detection
470
- const session = await loadYoloModel();
471
- const inputTensor = preprocessFrameToTensor(processingVideo);
472
- const feeds: Record<string, ort.Tensor> = {};
473
- feeds[session.inputNames[0]] = inputTensor;
474
- const results = await session.run(feeds);
475
- const output = results[session.outputNames[0]];
476
- const detections = postprocessYoloOutput(output);
477
- lastYoloBoxesRef.current = detections;
478
- // Run FastVLM on the full frame (wait for YOLOv8 to finish)
479
- await runInference(processingVideo, prompt, (output: string) => {
480
- setDebugOutput(output);
481
- });
482
- } catch (err) {
483
- console.error('YOLOv8+FastVLM error:', err);
484
- } finally {
485
- isYoloBusy = false;
486
- requestAnimationFrame(yoloDetectionLoop);
487
- }
488
- };
489
-
490
- // Add this effect after the processing loop and toggle handlers
491
- useEffect(() => {
492
- // Stop processing loop on video source change or processing toggle
493
- processingLoopRef.current = false;
494
- // Start processing loop for the correct video after refs update
495
- setTimeout(() => {
496
- if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
497
- processingLoopRef.current = true;
498
- yoloDetectionLoop();
499
- } else if (exampleProcessing && !uploadedFile) {
500
- processingLoopRef.current = true;
501
- yoloDetectionLoop();
502
- }
503
- }, 100);
504
- // eslint-disable-next-line
505
- }, [uploadedFile, videoProcessing, exampleProcessing]);
506
-
507
- return (
508
- <div className="absolute inset-0 text-white">
509
- <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
510
- {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
511
- </div>
512
- <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
513
- <div className="flex flex-col items-center justify-center h-full w-full">
514
- {/* Mode Selector */}
515
- <div className="mb-6">
516
- <div className="flex space-x-4">
517
- {MODES.map((m) => (
518
- <button
519
- key={m}
520
- className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
521
- mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
522
- }`}
523
- onClick={() => setMode(m)}
524
- >
525
- {m}
526
- </button>
527
- ))}
528
- </div>
529
- </div>
530
-
531
- {/* Mode Content */}
532
- <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
533
- {mode === "File" && (
534
- <div className="w-full text-center flex flex-col items-center">
535
- <div className="mb-4 w-full max-w-xl">
536
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
537
- <textarea
538
- className="w-full p-2 rounded-lg text-black"
539
- rows={3}
540
- value={prompt}
541
- onChange={(e) => setPrompt(e.target.value)}
542
- />
543
- </div>
544
- <div className="mb-4 w-full max-w-xl">
545
- <input
546
- type="file"
547
- accept="image/*,video/*"
548
- onChange={handleFileChange}
549
- className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
550
- />
551
- </div>
552
- {/* Add toggle button above video area */}
553
- <div className="mb-2 w-full max-w-xl flex justify-end">
554
- <button
555
- className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
556
- onClick={() => setShowProcessingVideo(v => !v)}
557
- type="button"
558
- >
559
- {showProcessingVideo ? 'Hide' : 'Show'} Processed Video
560
- </button>
561
- </div>
562
- {/* Show uploaded image */}
563
- {uploadedFile && isImageFile(uploadedFile) && (
564
- <div className="relative w-full max-w-xl">
565
- <img
566
- ref={imageRef}
567
- src={uploadedUrl}
568
- alt="Uploaded"
569
- className="w-full rounded-lg shadow-lg mb-2"
570
- style={{ background: "#222" }}
571
- />
572
- <canvas
573
- ref={canvasRef}
574
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
575
- style={{ zIndex: 10, pointerEvents: "none" }}
576
- />
577
- <button
578
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
579
- onClick={handleProcessImage}
580
- disabled={processing}
581
- >
582
- {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
583
- </button>
584
- </div>
585
- )}
586
- {/* Show uploaded video */}
587
- {uploadedFile && isVideoFile(uploadedFile) && (
588
- <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
589
- {/* Visible overlay video for user */}
590
- <video
591
- ref={overlayVideoRef}
592
- src={uploadedUrl}
593
- controls
594
- autoPlay
595
- loop
596
- muted
597
- playsInline
598
- className="w-full rounded-lg shadow-lg mb-2"
599
- style={{ background: "#222", display: "block" }}
600
- onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
601
- if (canvasRef.current) {
602
- canvasRef.current.width = e.currentTarget.clientWidth;
603
- canvasRef.current.height = e.currentTarget.clientHeight;
604
- }
605
- }}
606
- onResize={() => {
607
- if (canvasRef.current && overlayVideoRef.current) {
608
- canvasRef.current.width = overlayVideoRef.current.clientWidth;
609
- canvasRef.current.height = overlayVideoRef.current.clientHeight;
610
- }
611
- }}
612
- />
613
- {/* Canvas overlay */}
614
- <canvas
615
- ref={canvasRef}
616
- style={{
617
- position: "absolute",
618
- top: 0,
619
- left: 0,
620
- width: "100%",
621
- height: "100%",
622
- zIndex: 100,
623
- pointerEvents: "none",
624
- display: "block"
625
- }}
626
- width={overlayVideoRef.current?.clientWidth || 640}
627
- height={overlayVideoRef.current?.clientHeight || 480}
628
- />
629
- {/* Hidden or visible processing video for FastVLM/canvas */}
630
- <video
631
- ref={processingVideoRef}
632
- src={uploadedUrl}
633
- autoPlay
634
- loop
635
- muted
636
- playsInline
637
- style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
638
- onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
639
- />
640
- <button
641
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
642
- onClick={handleToggleVideoProcessing}
643
- >
644
- {videoProcessing ? "Stop Processing" : "Start Processing"}
645
- </button>
646
- </div>
647
- )}
648
- {/* Show example video if no file uploaded */}
649
- {!uploadedFile && (
650
- <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
651
- {/* Visible overlay video for user */}
652
- <video
653
- ref={overlayVideoRef}
654
- src={EXAMPLE_VIDEO_URL}
655
- controls
656
- autoPlay
657
- loop
658
- muted
659
- playsInline
660
- className="w-full rounded-lg shadow-lg mb-2"
661
- style={{ background: "#222", display: "block" }}
662
- />
663
- {/* Canvas overlay */}
664
- <canvas
665
- ref={canvasRef}
666
- style={{
667
- position: "absolute",
668
- top: 0,
669
- left: 0,
670
- width: "100%",
671
- height: "100%",
672
- zIndex: 100,
673
- pointerEvents: "none",
674
- display: "block"
675
- }}
676
- width={overlayVideoRef.current?.clientWidth || 640}
677
- height={overlayVideoRef.current?.clientHeight || 480}
678
- />
679
- {/* Hidden or visible processing video for FastVLM/canvas */}
680
- <video
681
- ref={processingVideoRef}
682
- src={EXAMPLE_VIDEO_URL}
683
- autoPlay
684
- loop
685
- muted
686
- playsInline
687
- style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
688
- onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
689
- />
690
- <button
691
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
692
- onClick={handleToggleExampleProcessing}
693
- >
694
- {exampleProcessing ? "Stop Processing" : "Start Processing"}
695
- </button>
696
- </div>
697
- )}
698
- {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
699
- {error && <div className="text-red-400 mt-2">Error: {error}</div>}
700
- <button
701
- className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
702
- onClick={handleTestDrawBox}
703
- >
704
- Test Draw Box
705
- </button>
706
- <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
707
- <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
708
- <div>Raw Model Output:</div>
709
- <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
710
- </div>
711
- </div>
712
- )}
713
- </div>
714
- </div>
715
- </div>
716
- );
 
 
 
 
717
  }
 
1
+ import * as React from "react";
2
+ import { useState, useRef, useEffect } from "react";
3
+ import { useVLMContext } from "../context/useVLMContext";
4
+ import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
5
+
6
+ const MODES = ["File"] as const;
7
+ type Mode = typeof MODES[number];
8
+
9
+ const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
10
+ const EXAMPLE_PROMPT = "Describe the video";
11
+
12
+ function isImageFile(file: File) {
13
+ return file.type.startsWith("image/");
14
+ }
15
+ function isVideoFile(file: File) {
16
+ return file.type.startsWith("video/");
17
+ }
18
+
19
+ function denormalizeBox(box: number[], width: number, height: number) {
20
+ // If all values are between 0 and 1, treat as normalized
21
+ if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
22
+ return [
23
+ box[0] * width,
24
+ box[1] * height,
25
+ box[2] * width,
26
+ box[3] * height
27
+ ];
28
+ }
29
+ return box;
30
+ }
31
+
32
+ // Add this robust fallback parser near the top
33
+ function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
34
+ // Try to parse as JSON first
35
+ try {
36
+ const parsed = JSON.parse(output);
37
+ if (Array.isArray(parsed)) {
38
+ const result: { label: string, bbox_2d: number[] }[] = [];
39
+ for (const obj of parsed) {
40
+ if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
41
+ if (Array.isArray(obj.bbox_2d[0])) {
42
+ for (const arr of obj.bbox_2d) {
43
+ if (Array.isArray(arr) && arr.length === 4) {
44
+ result.push({ label: obj.label, bbox_2d: arr });
45
+ }
46
+ }
47
+ } else if (obj.bbox_2d.length === 4) {
48
+ result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
49
+ }
50
+ }
51
+ }
52
+ if (result.length > 0) return result;
53
+ }
54
+ } catch (e) {}
55
+ // Fallback: extract all [x1, y1, x2, y2] arrays from the string
56
+ const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
57
+ const boxes: { label: string, bbox_2d: number[] }[] = [];
58
+ let match;
59
+ while ((match = boxRegex.exec(output)) !== null) {
60
+ const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
61
+ boxes.push({ label: '', bbox_2d: arr });
62
+ }
63
+ return boxes;
64
+ }
65
+
66
+ // NOTE: You must install onnxruntime-web:
67
+ // npm install onnxruntime-web
68
+ // @ts-ignore
69
+ import * as ort from 'onnxruntime-web';
70
+ // If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
71
+
72
+ // Set your YOLOv8 ONNX model URL here:
73
+ const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE
74
+
75
+ // Add these constants to match the YOLOv8 input size
76
+ const YOLOV8_INPUT_WIDTH = 640;
77
+ const YOLOV8_INPUT_HEIGHT = 480;
78
+
79
+ // 1. Load the ONNX model once
80
+ let yoloSession: ort.InferenceSession | null = null;
81
+ // Add a busy flag to prevent concurrent YOLOv8 inferences
82
+ let isYoloBusy = false;
83
+ async function loadYoloModel() {
84
+ if (!yoloSession) {
85
+ yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
86
+ }
87
+ return yoloSession;
88
+ }
89
+
90
// COCO class names for YOLOv8 (80 classes). The array index is the class id
// emitted by the model, so the order must not be changed.
const YOLO_CLASSES: string[] = [
  "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
  "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
  "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
];
101
+
102
+ // Preprocess video frame to YOLOv8 input tensor [1,3,640,640]
103
+ function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
104
+ const width = 640;
105
+ const height = 480;
106
+ const canvas = document.createElement('canvas');
107
+ canvas.width = width;
108
+ canvas.height = height;
109
+ const ctx = canvas.getContext('2d');
110
+ if (!ctx) throw new Error('Could not get 2D context');
111
+ ctx.drawImage(video, 0, 0, width, height);
112
+ const imageData = ctx.getImageData(0, 0, width, height);
113
+ const { data } = imageData;
114
+ // Convert to Float32Array [1,3,480,640], normalize to [0,1]
115
+ const floatData = new Float32Array(1 * 3 * height * width);
116
+ for (let i = 0; i < width * height; i++) {
117
+ floatData[i] = data[i * 4] / 255; // R
118
+ floatData[i + width * height] = data[i * 4 + 1] / 255; // G
119
+ floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
120
+ }
121
+ return new ort.Tensor('float32', floatData, [1, 3, height, width]);
122
+ }
123
+
124
+ // Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
125
+ function postprocessYoloOutput(output: ort.Tensor) {
126
+ // output.dims: [1, num_detections, 6]
127
+ const data = output.data;
128
+ const numDetections = output.dims[1];
129
+ const results = [];
130
+ for (let i = 0; i < numDetections; i++) {
131
+ const offset = i * 6;
132
+ const x1 = data[offset];
133
+ const y1 = data[offset + 1];
134
+ const x2 = data[offset + 2];
135
+ const y2 = data[offset + 3];
136
+ const score = data[offset + 4];
137
+ const classId = data[offset + 5];
138
+ if (score < 0.2) continue; // adjust threshold as needed
139
+ results.push({
140
+ bbox: [x1, y1, x2, y2],
141
+ label: YOLO_CLASSES[classId] || `class_${classId}`,
142
+ score
143
+ });
144
+ }
145
+ return results;
146
+ }
147
+
148
+ // Helper type guard for annotation
149
+ function hasAnnotation(obj: any): obj is { annotation: string } {
150
+ return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
151
+ }
152
+
153
/**
 * Multi-source captioning view.
 *
 * Plays a video (an uploaded file, an uploaded image, or the bundled example
 * video), runs YOLOv8 object detection plus FastVLM inference against a
 * hidden "processing" copy of the media, and draws the resulting boxes and
 * labels onto a canvas overlaid on the visible player.
 */
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  // Read-only: no setter is ever destructured, so the example URL is fixed.
  const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [showProcessingVideo, setShowProcessingVideo] = useState(false);

  // NOTE(review): videoRef is registered in the sync effect below but never
  // attached to any element in this component's JSX — confirm whether the
  // play/seek sync effect can ever fire.
  const videoRef = useRef<HTMLVideoElement | null>(null);
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  // NOTE(review): boxHistoryRef is only ever read/filtered here, never
  // written in this component — presumably populated elsewhere; verify.
  const boxHistoryRef = useRef<any[]>([]);
  // Add a ref to store the latest YOLOv8 results (with optional FastVLM annotation)
  const lastYoloBoxesRef = React.useRef<any[]>([]);
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Remove videoProcessingRef and exampleProcessingRef
  // Add a single processingLoopRef
  // Acts as a run/stop latch shared by all processing loops.
  const processingLoopRef = React.useRef(false);

  // Drives per-frame detection while the latch is set; skips frames while a
  // YOLOv8 inference is already running.
  const processVideoLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      // Optionally log: "Inference already running, skipping frame"
      requestAnimationFrame(processVideoLoop);
      return;
    }
    await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
    // Schedule the next frame as soon as possible
    requestAnimationFrame(processVideoLoop);
  };
  // Example-video variant: throttled to roughly one detection per second.
  const processExampleLoop = async () => {
    while (processingLoopRef.current) {
      await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
      await new Promise(res => setTimeout(res, 1000));
    }
  };

  // Set your YOLOv8 ONNX backend API endpoint here:
  // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE

  // Add this useEffect for overlay video synchronization
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    // Sync play/pause
    const onPlay = () => { if (overlay.paused) overlay.play(); };
    const onPause = () => { if (!overlay.paused) overlay.pause(); };
    // Sync seeking and time
    const onSeekOrTime = () => {
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
        overlay.currentTime = main.currentTime;
      }
    };
    main.addEventListener('play', onPlay);
    main.addEventListener('pause', onPause);
    main.addEventListener('seeked', onSeekOrTime);
    main.addEventListener('timeupdate', onSeekOrTime);
    // Clean up
    return () => {
      main.removeEventListener('play', onPlay);
      main.removeEventListener('pause', onPause);
      main.removeEventListener('seeked', onSeekOrTime);
      main.removeEventListener('timeupdate', onSeekOrTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);

  // Kick the hidden processing video whenever the source or mode changes;
  // the empty catch swallows autoplay-policy rejections on purpose.
  useEffect(() => {
    if ((mode === "File") && processingVideoRef.current) {
      processingVideoRef.current.play().catch(() => {});
    }
  }, [mode, videoUrl, uploadedUrl]);

  // Remove old prompt-based box extraction logic and only use the above for video frames.

  // Reset all processing state when the user picks a new file.
  // NOTE(review): the previous object URL is never revoked
  // (URL.revokeObjectURL), so repeated uploads leak blob URLs — confirm.
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };

  // Webcam mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    processVideoLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    processExampleLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);

  // File mode: process uploaded image (only on button click).
  // Draws the image to the canvas, runs FastVLM, extracts bounding boxes from
  // the raw model output, and redraws the image plus boxes in the callback.
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({w:canvas.width,h:canvas.height});
    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      let boxes = extractAllBoundingBoxes(output);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // Canvas is sized to naturalWidth/Height above, so these scales are
        // currently 1:1; kept for when the canvas is sized differently.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };

  // File mode: process uploaded video frames (start/stop)
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      // Small delay lets any in-flight loop iteration observe the latch
      // before it is re-armed.
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Handle start/stop for example video processing
  // NOTE(review): this starts processVideoLoop, not processExampleLoop, so
  // the example video is processed at full frame rate rather than the 1 s
  // throttle — confirm that is intended.
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Test draw box function — draws a fixed magenta rectangle so the user can
  // verify the canvas overlay is visible and positioned correctly.
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };

  // Interval-driven overlay renderer: every 100 ms redraws boxes from
  // boxHistoryRef that are younger than 2 s, denormalizing [0,1] coords to
  // the displayed video size.
  // NOTE(review): this effect and the rAF drawLoop effect below both clear
  // and repaint the same canvasRef each tick — they will fight over the
  // overlay; confirm which one is authoritative.
  useEffect(() => {
    const draw = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      const displayWidth = overlayVideo.clientWidth;
      const displayHeight = overlayVideo.clientHeight;
      canvas.width = displayWidth;
      canvas.height = displayHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const now = Date.now();
      const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
      if (boxHistory.length > 0) {
        // Fix: Draw all boxes, even if bbox_2d is an array of arrays
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes per label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({
                  ...b,
                  bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
                });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({
              ...b,
              bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
            });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    // Redraw on window resize
    const handleResize = () => draw();
    window.addEventListener('resize', handleResize);
    return () => {
      clearInterval(interval);
      window.removeEventListener('resize', handleResize);
    };
  }, [overlayVideoRef, canvasRef]);

  // Drawing loop: draws the latest YOLOv8 boxes every frame
  React.useEffect(() => {
    let running = true;
    function drawLoop() {
      if (!running) return;
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      const processingVideo = processingVideoRef.current;
      if (canvas && overlayVideo && processingVideo) {
        // Set canvas size to match the visible video
        canvas.width = overlayVideo.clientWidth;
        canvas.height = overlayVideo.clientHeight;
        const ctx = canvas.getContext('2d');
        if (ctx) {
          ctx.clearRect(0, 0, canvas.width, canvas.height);
          // Draw all YOLOv8 boxes from last detection
          const yoloBoxes = lastYoloBoxesRef.current;
          yoloBoxes.forEach((obj: any) => {
            // Scale from YOLOv8 input size to canvas size
            const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
            const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
            const [x1, y1, x2, y2] = obj.bbox;
            const drawX = x1 * scaleX;
            const drawY = y1 * scaleY;
            const drawW = (x2 - x1) * scaleX;
            const drawH = (y2 - y1) * scaleY;
            ctx.strokeStyle = '#00FFFF';
            ctx.lineWidth = 5;
            ctx.strokeRect(drawX, drawY, drawW, drawH);
            ctx.font = 'bold 22px Arial';
            // Draw YOLOv8 label and confidence
            const yoloLabel = obj.label || '';
            const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
            const yoloText = `${yoloLabel}${yoloScore}`;
            ctx.fillStyle = 'rgba(0,0,0,0.7)';
            const yoloTextWidth = ctx.measureText(yoloText).width + 8;
            ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
            ctx.fillStyle = '#00FFFF';
            ctx.fillText(yoloText, drawX, drawY - 4);
            // Draw FastVLM annotation below the box if available
            if (hasAnnotation(obj)) {
              ctx.font = 'bold 18px Arial';
              ctx.fillStyle = 'rgba(0,0,0,0.7)';
              const annTextWidth = ctx.measureText(obj.annotation).width + 8;
              ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
              ctx.fillStyle = '#00FFFF';
              ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
            }
          });
        }
      }
      requestAnimationFrame(drawLoop);
    }
    drawLoop();
    return () => { running = false; };
  }, [overlayVideoRef, canvasRef, processingVideoRef]);

  // YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
  const yoloDetectionLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      requestAnimationFrame(yoloDetectionLoop);
      return;
    }
    isYoloBusy = true;
    try {
      const processingVideo = processingVideoRef.current;
      if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
        isYoloBusy = false;
        requestAnimationFrame(yoloDetectionLoop);
        return;
      }
      // Run YOLOv8 detection
      const session = await loadYoloModel();
      const inputTensor = preprocessFrameToTensor(processingVideo);
      const feeds: Record<string, ort.Tensor> = {};
      feeds[session.inputNames[0]] = inputTensor;
      const results = await session.run(feeds);
      const output = results[session.outputNames[0]];
      const detections = postprocessYoloOutput(output);
      lastYoloBoxesRef.current = detections;
      // Run FastVLM on the full frame (wait for YOLOv8 to finish)
      await runInference(processingVideo, prompt, (output: string) => {
        setDebugOutput(output);
      });
    } catch (err) {
      console.error('YOLOv8+FastVLM error:', err);
    } finally {
      isYoloBusy = false;
      // Re-arms itself every pass; the processingLoopRef check at the top is
      // what eventually stops the chain.
      requestAnimationFrame(yoloDetectionLoop);
    }
  };

  // Add this effect after the processing loop and toggle handlers
  useEffect(() => {
    // Stop processing loop on video source change or processing toggle
    processingLoopRef.current = false;
    // Start processing loop for the correct video after refs update
    setTimeout(() => {
      if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      } else if (exampleProcessing && !uploadedFile) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      }
    }, 100);
    // eslint-disable-next-line
  }, [uploadedFile, videoProcessing, exampleProcessing]);

  return (
    <div className="absolute inset-0 text-white">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Add toggle button above video area */}
              <div className="mb-2 w-full max-w-xl flex justify-end">
                <button
                  className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
                  onClick={() => setShowProcessingVideo(v => !v)}
                  type="button"
                >
                  {showProcessingVideo ? 'Hide' : 'Show'} Processed Video
                </button>
              </div>
              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                    onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
                      if (canvasRef.current) {
                        canvasRef.current.width = e.currentTarget.clientWidth;
                        canvasRef.current.height = e.currentTarget.clientHeight;
                      }
                    }}
                    onResize={() => {
                      if (canvasRef.current && overlayVideoRef.current) {
                        canvasRef.current.width = overlayVideoRef.current.clientWidth;
                        canvasRef.current.height = overlayVideoRef.current.clientHeight;
                      }
                    }}
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={uploadedUrl}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* Show example video if no file uploaded */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: raw dimensions and the last raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}