Quazim0t0 committed on
Commit
d5e70e6
·
verified ·
1 Parent(s): c53d679

Upload 51 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,520 +1,609 @@
1
- import { useState, useRef, useEffect } from "react";
2
- import { useVLMContext } from "../context/useVLMContext";
3
- import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
-
5
- const MODES = ["Webcam", "URL", "File"] as const;
6
- type Mode = typeof MODES[number];
7
-
8
- const EXAMPLE_VIDEO_URL = "https://huggingface.co/spaces/Quazim0t0/test/resolve/main/videos/1.mp4";
9
- const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values.DO NOT USE THESE EXACT VALUES, THIS IS JUST AN Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
-
11
- function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
12
- if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
13
- const label = arr[0];
14
- return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
15
- }
16
- return [];
17
- }
18
-
19
- function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
20
- if (!raw) return [];
21
- let boxes = [];
22
- if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
23
- boxes = raw.image;
24
- } else if (Array.isArray(raw)) {
25
- boxes = raw;
26
- } else if (typeof raw === "object" && raw !== null) {
27
- boxes = [raw];
28
- }
29
- return boxes
30
- .map((obj: any) => {
31
- if (!obj || !obj.bbox_2d) return null;
32
- let bbox = obj.bbox_2d;
33
- // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
34
- if (
35
- Array.isArray(bbox) &&
36
- bbox.length === 2 &&
37
- Array.isArray(bbox[0]) &&
38
- Array.isArray(bbox[1]) &&
39
- bbox[0].length === 2 &&
40
- bbox[1].length === 2
41
- ) {
42
- bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
43
- }
44
- // If bbox_2d is [x1, y1, x2, y2], use as-is
45
- if (
46
- Array.isArray(bbox) &&
47
- bbox.length === 4 &&
48
- bbox.every((v: any) => typeof v === "number")
49
- ) {
50
- return { ...obj, bbox_2d: bbox };
51
- }
52
- // Otherwise, skip
53
- return null;
54
- })
55
- .filter((obj: any) => obj);
56
- }
57
-
58
- function isImageFile(file: File) {
59
- return file.type.startsWith("image/");
60
- }
61
- function isVideoFile(file: File) {
62
- return file.type.startsWith("video/");
63
- }
64
-
65
// Three-mode (Webcam / URL / File) captioning view: frames or images are sent
// to the VLM via runInference and detected boxes are drawn on an overlay canvas.
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL); // URL actually loaded into the player
  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL); // URL typed into the text box
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [webcamActive, setWebcamActive] = useState(false);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>(""); // object URL for the uploaded file
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [urlProcessing, setUrlProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>(""); // raw model output shown in the debug panel
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");

  // The same video/canvas refs are shared by all three modes; only one
  // <video>/<canvas> pair is mounted at a time.
  const videoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  const webcamStreamRef = useRef<MediaStream | null>(null);
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
89
-
90
  // Grab the current video frame, run VLM inference on it, and draw any
  // detected bounding boxes onto the overlay canvas.
  const processVideoFrame = async () => {
    if (!videoRef.current || !canvasRef.current) return;
    const video = videoRef.current;
    const canvas = canvasRef.current;
    // Skip frames we cannot capture (not playing, ended, or no decoded data yet).
    if (video.paused || video.ended || video.videoWidth === 0) return;
    // Match the canvas backing store to the intrinsic video resolution.
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    await runInference(video, prompt, (output: string) => {
      setDebugOutput(output); // <-- Ensure Raw Model Output is updated
      // Try markdown-fenced JSON first, then the flat ["label", bbox, ...] shape.
      let boxes = extractJsonFromMarkdown(output) || [];
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // Canvas was sized to the video above, so both scales are 1 here;
        // kept in case the sizing logic changes.
        const scaleX = canvas.width / video.videoWidth;
        const scaleY = canvas.height / video.videoHeight;
        ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
        drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
      }
    });
  };
123
-
124
  // Store the chosen file, expose it via an object URL for the <img>/<video>
  // elements, and reset all per-file processing state so the new file starts fresh.
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    // NOTE(review): the previous object URL is never revoked — possible leak
    // if many files are selected in one session; consider URL.revokeObjectURL.
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };
133
-
134
  // Webcam setup and teardown (unchanged)
  // Acquires the camera stream when entering Webcam mode and stops all
  // tracks when leaving the mode or unmounting.
  useEffect(() => {
    if (mode !== "Webcam") {
      // Leaving Webcam mode: release the camera immediately.
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
      return;
    }
    const setupWebcam = async () => {
      try {
        setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
      } catch (e) {
        // getUserMedia rejects on denied permission / missing device.
        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
        setWebcamActive(false);
      }
    };
    setupWebcam();
    // Cleanup mirrors the mode-switch branch above.
    return () => {
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
    };
  }, [mode]);
167
-
168
  // Webcam mode: process frames with setInterval
  // One inference pass per second while the webcam is active.
  useEffect(() => {
    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, webcamActive]);

  // URL mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "URL" || !isLoaded || !urlProcessing) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, urlProcessing]);

  // File video mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video mode: process frames with setInterval
  // Runs only in File mode when no file is uploaded (the bundled example plays).
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
215
-
216
  // File mode: process uploaded image (only on button click)
  // Draws the image to the canvas at natural size, runs inference once, and
  // overlays any detected boxes.
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    // Size the canvas to the image's intrinsic resolution.
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({w:canvas.width,h:canvas.height});
    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      // Redraw the image so stale boxes from a previous run are cleared.
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      // Try markdown-fenced JSON first, then the flat ["label", bbox, ...] shape.
      let boxes = extractJsonFromMarkdown(output) || [];
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // Canvas matches the image's natural size, so both scales are 1 here.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };
257
-
258
  // File mode: process uploaded video frames (start/stop)
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev) => !prev);
  };

  // Handle start/stop for example video processing
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev) => !prev);
  };

  // Handle start/stop for URL video processing
  const handleToggleUrlProcessing = () => {
    setUrlProcessing((prev) => !prev);
  };

  // Test draw box function
  // Draws a fixed magenta rectangle directly on the canvas so the user can
  // verify that the overlay is visible and correctly positioned.
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };
287
-
288
  return (
    <div className="absolute inset-0 text-white">
      {/* Fixed banner reporting model load state */}
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "Webcam" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              {/* Canvas is absolutely positioned over the video as the box overlay */}
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  autoPlay
                  muted
                  playsInline
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                <canvas
                  ref={canvasRef}
                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
                  style={{ zIndex: 10, pointerEvents: "none" }}
                />
              </div>
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
            </div>
          )}
          {mode === "URL" && (
            <div className="w-full text-center flex flex-col items-center">
              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
              <div className="flex w-full max-w-xl mb-4">
                <input
                  type="text"
                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
                  value={inputUrl}
                  onChange={(e) => setInputUrl(e.target.value)}
                  placeholder="Paste video URL here"
                />
                <button
                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
                  onClick={() => setVideoUrl(inputUrl)}
                >
                  Load
                </button>
              </div>
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  src={videoUrl}
                  controls
                  autoPlay
                  loop
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                <canvas
                  ref={canvasRef}
                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
                  style={{ zIndex: 10, pointerEvents: "none" }}
                />
                <button
                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                  onClick={handleToggleUrlProcessing}
                >
                  {urlProcessing ? "Stop Processing" : "Start Processing"}
                </button>
              </div>
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: dimensions and raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <video
                    ref={videoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* Show example video if no file uploaded */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl">
                  <video
                    ref={videoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: dimensions and raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}
 
1
+ import React, { useState, useRef, useEffect } from "react";
2
+ import { useVLMContext } from "../context/useVLMContext";
3
+ import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
+
5
+ const MODES = ["Webcam", "URL", "File"] as const;
6
+ type Mode = typeof MODES[number];
7
+
8
+ const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
9
+ const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
+
11
+ function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
12
+ if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
13
+ const label = arr[0];
14
+ return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
15
+ }
16
+ return [];
17
+ }
18
+
19
+ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
20
+ if (!raw) return [];
21
+ let boxes = [];
22
+ if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
23
+ boxes = raw.image;
24
+ } else if (Array.isArray(raw)) {
25
+ boxes = raw;
26
+ } else if (typeof raw === "object" && raw !== null) {
27
+ boxes = [raw];
28
+ }
29
+ return boxes
30
+ .map((obj: any) => {
31
+ if (!obj || !obj.bbox_2d) return null;
32
+ let bbox = obj.bbox_2d;
33
+ // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
34
+ if (
35
+ Array.isArray(bbox) &&
36
+ bbox.length === 2 &&
37
+ Array.isArray(bbox[0]) &&
38
+ Array.isArray(bbox[1]) &&
39
+ bbox[0].length === 2 &&
40
+ bbox[1].length === 2
41
+ ) {
42
+ bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
43
+ }
44
+ // If bbox_2d is [x1, y1, x2, y2], use as-is
45
+ if (
46
+ Array.isArray(bbox) &&
47
+ bbox.length === 4 &&
48
+ bbox.every((v: any) => typeof v === "number")
49
+ ) {
50
+ return { ...obj, bbox_2d: bbox };
51
+ }
52
+ // Otherwise, skip
53
+ return null;
54
+ })
55
+ .filter((obj: any) => obj);
56
+ }
57
+
58
+ function isImageFile(file: File) {
59
+ return file.type.startsWith("image/");
60
+ }
61
+ function isVideoFile(file: File) {
62
+ return file.type.startsWith("video/");
63
+ }
64
+
65
// Three-mode (Webcam / URL / File) captioning view. In this revision detected
// boxes persist for a short window (see BOX_PERSIST_MS below) instead of being
// redrawn only when inference completes.
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL); // URL actually loaded into the player
  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL); // URL typed into the text box
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [webcamActive, setWebcamActive] = useState(false);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>(""); // object URL for the uploaded file
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [urlProcessing, setUrlProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>(""); // raw model output shown in the debug panel
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");

  const videoRef = useRef<HTMLVideoElement | null>(null);
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null); // NEW: overlay video kept in sync with the main one
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  const webcamStreamRef = useRef<MediaStream | null>(null);
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
90
+
91
  // Persistent boxes state: {boxes: [...], timestamp: number}
  // Each entry couples one inference result with its arrival time so boxes
  // stay on screen between (slow) inference passes instead of flickering.
  const [persistentBoxes, setPersistentBoxes] = useState<{boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]>([]);
  const BOX_PERSIST_MS = 2000; // 2 seconds

  // Helper: Add new boxes with timestamp. Expired entries are pruned in the
  // same functional update, so the list stays bounded.
  const addBoxesWithTimestamp = (boxes: {label: string, bbox_2d: number[]}[]) => {
    if (!boxes || boxes.length === 0) return;
    setPersistentBoxes((prev: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]) => [
      ...prev.filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => Date.now() - entry.timestamp < BOX_PERSIST_MS),
      { boxes, timestamp: Date.now() }
    ]);
  };
103
+
104
  // Helper: Get all boxes from last 2 seconds (flattened across entries),
  // i.e. every box that has not yet aged past BOX_PERSIST_MS.
  const getCurrentBoxes = () => {
    const now = Date.now();
    return persistentBoxes
      .filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => now - entry.timestamp < BOX_PERSIST_MS)
      .flatMap((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => entry.boxes);
  };
111
+
112
+ // Synchronize overlay video with main video
113
+ useEffect(() => {
114
+ const main = videoRef.current;
115
+ const overlay = overlayVideoRef.current;
116
+ if (!main || !overlay) return;
117
+ // Sync play/pause
118
+ const syncPlay = () => { if (main.paused !== overlay.paused) main.paused ? overlay.pause() : overlay.play(); };
119
+ main.addEventListener('play', () => overlay.play());
120
+ main.addEventListener('pause', () => overlay.pause());
121
+ // Sync seeking
122
+ const syncTime = () => { if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) overlay.currentTime = main.currentTime; };
123
+ main.addEventListener('seeked', syncTime);
124
+ main.addEventListener('timeupdate', syncTime);
125
+ // Clean up
126
+ return () => {
127
+ main.removeEventListener('play', () => overlay.play());
128
+ main.removeEventListener('pause', () => overlay.pause());
129
+ main.removeEventListener('seeked', syncTime);
130
+ main.removeEventListener('timeupdate', syncTime);
131
+ };
132
+ }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
133
+
134
  // Update: processVideoFrame now adds boxes to persistentBoxes.
  // Captures the current frame, runs inference, and stores (rather than
  // immediately draws) any detected boxes; a separate effect renders them.
  const processVideoFrame = async () => {
    if (!videoRef.current || !canvasRef.current) return;
    const video = videoRef.current;
    const canvas = canvasRef.current;
    // Skip frames we cannot capture (not playing, ended, or no decoded data yet).
    if (video.paused || video.ended || video.videoWidth === 0) return;
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    await runInference(video, prompt, (output: string) => {
      setDebugOutput(output);
      // Try markdown-fenced JSON first, then the flat ["label", bbox, ...] shape.
      let boxes = extractJsonFromMarkdown(output) || [];
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        addBoxesWithTimestamp(boxes); // <-- Add to persistent state
      }
    });
  };
158
+
159
  // Draw persistent boxes on every frame.
  // Redraws at 10 Hz so entries older than BOX_PERSIST_MS visibly expire
  // even when no new inference result arrives.
  useEffect(() => {
    const draw = () => {
      if (!videoRef.current || !canvasRef.current) return;
      const video = videoRef.current;
      const canvas = canvasRef.current;
      if (video.videoWidth === 0) return;
      // Keep the canvas matched to the intrinsic video resolution.
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const boxes = getCurrentBoxes();
      if (boxes.length > 0) {
        // Canvas matches the video size, so both scales are 1 here; kept in
        // case the sizing logic changes.
        const scaleX = canvas.width / video.videoWidth;
        const scaleY = canvas.height / video.videoHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    return () => clearInterval(interval);
  }, [persistentBoxes, videoRef, canvasRef]);
182
+
183
  // Store the chosen file, expose it via an object URL for the <img>/<video>
  // elements, and reset all per-file processing state so the new file starts fresh.
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    // NOTE(review): the previous object URL is never revoked — possible leak
    // if many files are selected in one session; consider URL.revokeObjectURL.
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };
192
+
193
  // Webcam setup and teardown (unchanged)
  // Acquires the camera stream when entering Webcam mode and stops all
  // tracks when leaving the mode or unmounting.
  useEffect(() => {
    if (mode !== "Webcam") {
      // Leaving Webcam mode: release the camera immediately.
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
      return;
    }
    const setupWebcam = async () => {
      try {
        setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
      } catch (e) {
        // getUserMedia rejects on denied permission / missing device.
        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
        setWebcamActive(false);
      }
    };
    setupWebcam();
    // Cleanup mirrors the mode-switch branch above.
    return () => {
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
    };
  }, [mode]);
226
+
227
  // Webcam mode: process frames with setInterval
  // One inference pass per second while the webcam is active.
  useEffect(() => {
    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, webcamActive]);

  // URL mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "URL" || !isLoaded || !urlProcessing) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, urlProcessing]);

  // File video mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video mode: process frames with setInterval
  // Runs only in File mode when no file is uploaded (the bundled example plays).
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => {
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
274
+
275
  // File mode: process uploaded image (only on button click)
  // Draws the image to the canvas at natural size, runs inference once, and
  // overlays any detected boxes.
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    // Size the canvas to the image's intrinsic resolution.
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({w:canvas.width,h:canvas.height});
    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      // Redraw the image so stale boxes from a previous run are cleared.
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      // Try markdown-fenced JSON first, then the flat ["label", bbox, ...] shape.
      let boxes = extractJsonFromMarkdown(output) || [];
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // Canvas matches the image's natural size, so both scales are 1 here.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };
316
+
317
+ // File mode: process uploaded video frames (start/stop)
318
+ const handleToggleVideoProcessing = () => {
319
+ setVideoProcessing((prev) => !prev);
320
+ };
321
+
322
+ // Handle start/stop for example video processing
323
+ const handleToggleExampleProcessing = () => {
324
+ setExampleProcessing((prev) => !prev);
325
+ };
326
+
327
+ // Handle start/stop for URL video processing
328
+ const handleToggleUrlProcessing = () => {
329
+ setUrlProcessing((prev) => !prev);
330
+ };
331
+
332
+ // Test draw box function
333
+ const handleTestDrawBox = () => {
334
+ if (!canvasRef.current) return;
335
+ const canvas = canvasRef.current;
336
+ const ctx = canvas.getContext("2d");
337
+ if (!ctx) return;
338
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
339
+ ctx.strokeStyle = "#FF00FF";
340
+ ctx.lineWidth = 4;
341
+ ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
342
+ ctx.font = "20px Arial";
343
+ ctx.fillStyle = "#FF00FF";
344
+ ctx.fillText("Test Box", 50, 35);
345
+ };
346
+
347
  // Render: model-status banner, mode selector, and the per-mode capture UI.
  // Each video mode stacks: base <video> (z-0), dimmed overlay <video> (z-10),
  // and the detection <canvas> (z-20) with pointer events disabled.
  return (
    <div className="absolute inset-0 text-white">
      {/* Fixed banner reporting model load state */}
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "Webcam" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  autoPlay
                  muted
                  playsInline
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                {/* Transparent overlay where bounding boxes are drawn */}
                <canvas
                  ref={canvasRef}
                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
                  style={{ zIndex: 10, pointerEvents: "none" }}
                />
              </div>
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
            </div>
          )}
          {mode === "URL" && (
            <div className="w-full text-center flex flex-col items-center">
              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
              <div className="flex w-full max-w-xl mb-4">
                <input
                  type="text"
                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
                  value={inputUrl}
                  onChange={(e) => setInputUrl(e.target.value)}
                  placeholder="Paste video URL here"
                />
                <button
                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
                  onClick={() => setVideoUrl(inputUrl)}
                >
                  Load
                </button>
              </div>
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="relative w-full max-w-xl">
                {/* Base playback video (z-0) */}
                <video
                  ref={videoRef}
                  src={videoUrl}
                  controls
                  autoPlay
                  loop
                  className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
                  style={{ background: "#222" }}
                />
                {/* Muted, dimmed duplicate stacked above the base video (z-10) */}
                <video
                  ref={overlayVideoRef}
                  src={videoUrl}
                  controls={false}
                  autoPlay
                  loop
                  muted
                  className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
                  style={{ background: "#222" }}
                />
                <canvas
                  ref={canvasRef}
                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
                  style={{ zIndex: 20, pointerEvents: "none" }}
                />
                <button
                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
                  onClick={handleToggleUrlProcessing}
                >
                  {urlProcessing ? "Stop Processing" : "Start Processing"}
                </button>
              </div>
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: canvas/video dimensions and raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <video
                    ref={videoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
                    style={{ background: "#222" }}
                  />
                  <video
                    ref={overlayVideoRef}
                    src={uploadedUrl}
                    controls={false}
                    autoPlay
                    loop
                    muted
                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 20, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* Show example video if no file uploaded */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl">
                  <video
                    ref={videoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
                    style={{ background: "#222" }}
                  />
                  <video
                    ref={overlayVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls={false}
                    autoPlay
                    loop
                    muted
                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 20, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: canvas/video dimensions and raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
609
  }