Quazim0t0 committed on
Commit
f83c163
·
verified ·
1 Parent(s): 6f9187a

Update src/components/MultiSourceCaptioningView.tsx

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,520 +1,520 @@
1
- import { useState, useRef, useEffect } from "react";
2
- import { useVLMContext } from "../context/useVLMContext";
3
- import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
-
5
- const MODES = ["Webcam", "URL", "File"] as const;
6
- type Mode = typeof MODES[number];
7
-
8
- const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
9
- const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
-
11
- function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
12
- if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
13
- const label = arr[0];
14
- return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
15
- }
16
- return [];
17
- }
18
-
19
- function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
20
- if (!raw) return [];
21
- let boxes = [];
22
- if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
23
- boxes = raw.image;
24
- } else if (Array.isArray(raw)) {
25
- boxes = raw;
26
- } else if (typeof raw === "object" && raw !== null) {
27
- boxes = [raw];
28
- }
29
- return boxes
30
- .map((obj: any) => {
31
- if (!obj || !obj.bbox_2d) return null;
32
- let bbox = obj.bbox_2d;
33
- // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
34
- if (
35
- Array.isArray(bbox) &&
36
- bbox.length === 2 &&
37
- Array.isArray(bbox[0]) &&
38
- Array.isArray(bbox[1]) &&
39
- bbox[0].length === 2 &&
40
- bbox[1].length === 2
41
- ) {
42
- bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
43
- }
44
- // If bbox_2d is [x1, y1, x2, y2], use as-is
45
- if (
46
- Array.isArray(bbox) &&
47
- bbox.length === 4 &&
48
- bbox.every((v: any) => typeof v === "number")
49
- ) {
50
- return { ...obj, bbox_2d: bbox };
51
- }
52
- // Otherwise, skip
53
- return null;
54
- })
55
- .filter((obj: any) => obj);
56
- }
57
-
58
- function isImageFile(file: File) {
59
- return file.type.startsWith("image/");
60
- }
61
- function isVideoFile(file: File) {
62
- return file.type.startsWith("video/");
63
- }
64
-
65
- export default function MultiSourceCaptioningView() {
66
- const [mode, setMode] = useState<Mode>("File");
67
- const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
68
- const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
69
- const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
70
- const [processing, setProcessing] = useState(false);
71
- const [error, setError] = useState<string | null>(null);
72
- const [webcamActive, setWebcamActive] = useState(false);
73
- const [uploadedFile, setUploadedFile] = useState<File | null>(null);
74
- const [uploadedUrl, setUploadedUrl] = useState<string>("");
75
- const [videoProcessing, setVideoProcessing] = useState(false);
76
- const [imageProcessed, setImageProcessed] = useState(false);
77
- const [exampleProcessing, setExampleProcessing] = useState(false);
78
- const [urlProcessing, setUrlProcessing] = useState(false);
79
- const [debugOutput, setDebugOutput] = useState<string>("");
80
- const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
81
- const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
82
- const [inferenceStatus, setInferenceStatus] = useState<string>("");
83
-
84
- const videoRef = useRef<HTMLVideoElement | null>(null);
85
- const canvasRef = useRef<HTMLCanvasElement | null>(null);
86
- const imageRef = useRef<HTMLImageElement | null>(null);
87
- const webcamStreamRef = useRef<MediaStream | null>(null);
88
- const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
89
-
90
- const processVideoFrame = async () => {
91
- if (!videoRef.current || !canvasRef.current) return;
92
- const video = videoRef.current;
93
- const canvas = canvasRef.current;
94
- if (video.paused || video.ended || video.videoWidth === 0) return;
95
- canvas.width = video.videoWidth;
96
- canvas.height = video.videoHeight;
97
- const ctx = canvas.getContext("2d");
98
- if (!ctx) return;
99
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
100
- await runInference(video, prompt, (output: string) => {
101
- setDebugOutput(output); // <-- Ensure Raw Model Output is updated
102
- let boxes = extractJsonFromMarkdown(output) || [];
103
- if (boxes.length === 0 && Array.isArray(output)) {
104
- boxes = parseFlatBoxArray(output);
105
- }
106
- boxes = normalizeBoxes(boxes);
107
- console.log("Model output:", output);
108
- console.log("Boxes after normalization:", boxes);
109
- console.log("Canvas size:", canvas.width, canvas.height);
110
- if (boxes.length > 0) {
111
- const [x1, y1, x2, y2] = boxes[0].bbox_2d;
112
- console.log("First box coords:", x1, y1, x2, y2);
113
- }
114
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
115
- if (Array.isArray(boxes) && boxes.length > 0) {
116
- const scaleX = canvas.width / video.videoWidth;
117
- const scaleY = canvas.height / video.videoHeight;
118
- ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
119
- drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
120
- }
121
- });
122
- };
123
-
124
- const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
125
- const file = e.target.files?.[0] || null;
126
- setUploadedFile(file);
127
- setUploadedUrl(file ? URL.createObjectURL(file) : "");
128
- setError(null);
129
- setImageProcessed(false);
130
- setVideoProcessing(false);
131
- setExampleProcessing(false);
132
- };
133
-
134
- // Webcam setup and teardown (unchanged)
135
- useEffect(() => {
136
- if (mode !== "Webcam") {
137
- if (webcamStreamRef.current) {
138
- webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
139
- webcamStreamRef.current = null;
140
- }
141
- setWebcamActive(false);
142
- return;
143
- }
144
- const setupWebcam = async () => {
145
- try {
146
- setError(null);
147
- const stream = await navigator.mediaDevices.getUserMedia({ video: true });
148
- webcamStreamRef.current = stream;
149
- if (videoRef.current) {
150
- videoRef.current.srcObject = stream;
151
- setWebcamActive(true);
152
- }
153
- } catch (e) {
154
- setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
155
- setWebcamActive(false);
156
- }
157
- };
158
- setupWebcam();
159
- return () => {
160
- if (webcamStreamRef.current) {
161
- webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
162
- webcamStreamRef.current = null;
163
- }
164
- setWebcamActive(false);
165
- };
166
- }, [mode]);
167
-
168
- // Webcam mode: process frames with setInterval
169
- useEffect(() => {
170
- if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
171
- let interval: ReturnType<typeof setInterval> | null = null;
172
- interval = setInterval(() => {
173
- processVideoFrame();
174
- }, 1000);
175
- return () => {
176
- if (interval) clearInterval(interval);
177
- };
178
- }, [mode, isLoaded, prompt, runInference, webcamActive]);
179
-
180
- // URL mode: process frames with setInterval
181
- useEffect(() => {
182
- if (mode !== "URL" || !isLoaded || !urlProcessing) return;
183
- let interval: ReturnType<typeof setInterval> | null = null;
184
- interval = setInterval(() => {
185
- processVideoFrame();
186
- }, 1000);
187
- return () => {
188
- if (interval) clearInterval(interval);
189
- };
190
- }, [mode, isLoaded, prompt, runInference, urlProcessing]);
191
-
192
- // File video mode: process frames with setInterval
193
- useEffect(() => {
194
- if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
195
- let interval: ReturnType<typeof setInterval> | null = null;
196
- interval = setInterval(() => {
197
- processVideoFrame();
198
- }, 1000);
199
- return () => {
200
- if (interval) clearInterval(interval);
201
- };
202
- }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
203
-
204
- // Example video mode: process frames with setInterval
205
- useEffect(() => {
206
- if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
207
- let interval: ReturnType<typeof setInterval> | null = null;
208
- interval = setInterval(() => {
209
- processVideoFrame();
210
- }, 1000);
211
- return () => {
212
- if (interval) clearInterval(interval);
213
- };
214
- }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
215
-
216
- // File mode: process uploaded image (only on button click)
217
- const handleProcessImage = async () => {
218
- if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
219
- const img = imageRef.current;
220
- const canvas = canvasRef.current;
221
- canvas.width = img.naturalWidth;
222
- canvas.height = img.naturalHeight;
223
- setCanvasDims({w:canvas.width,h:canvas.height});
224
- setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
225
- const ctx = canvas.getContext("2d");
226
- if (!ctx) return;
227
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
228
- setProcessing(true);
229
- setError(null);
230
- setInferenceStatus("Running inference...");
231
- await runInference(img, prompt, (output: string) => {
232
- setDebugOutput(output);
233
- setInferenceStatus("Inference complete.");
234
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
235
- let boxes = extractJsonFromMarkdown(output) || [];
236
- if (boxes.length === 0 && Array.isArray(output)) {
237
- boxes = parseFlatBoxArray(output);
238
- }
239
- boxes = normalizeBoxes(boxes);
240
- console.log("Model output:", output);
241
- console.log("Boxes after normalization:", boxes);
242
- console.log("Canvas size:", canvas.width, canvas.height);
243
- if (boxes.length > 0) {
244
- const [x1, y1, x2, y2] = boxes[0].bbox_2d;
245
- console.log("First box coords:", x1, y1, x2, y2);
246
- }
247
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
248
- if (Array.isArray(boxes) && boxes.length > 0) {
249
- const scaleX = canvas.width / img.naturalWidth;
250
- const scaleY = canvas.height / img.naturalHeight;
251
- drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
252
- }
253
- setImageProcessed(true);
254
- });
255
- setProcessing(false);
256
- };
257
-
258
- // File mode: process uploaded video frames (start/stop)
259
- const handleToggleVideoProcessing = () => {
260
- setVideoProcessing((prev) => !prev);
261
- };
262
-
263
- // Handle start/stop for example video processing
264
- const handleToggleExampleProcessing = () => {
265
- setExampleProcessing((prev) => !prev);
266
- };
267
-
268
- // Handle start/stop for URL video processing
269
- const handleToggleUrlProcessing = () => {
270
- setUrlProcessing((prev) => !prev);
271
- };
272
-
273
- // Test draw box function
274
- const handleTestDrawBox = () => {
275
- if (!canvasRef.current) return;
276
- const canvas = canvasRef.current;
277
- const ctx = canvas.getContext("2d");
278
- if (!ctx) return;
279
- ctx.clearRect(0, 0, canvas.width, canvas.height);
280
- ctx.strokeStyle = "#FF00FF";
281
- ctx.lineWidth = 4;
282
- ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
283
- ctx.font = "20px Arial";
284
- ctx.fillStyle = "#FF00FF";
285
- ctx.fillText("Test Box", 50, 35);
286
- };
287
-
288
- return (
289
- <div className="absolute inset-0 text-white">
290
- <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
291
- {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
292
- </div>
293
- <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
294
- <div className="flex flex-col items-center justify-center h-full w-full">
295
- {/* Mode Selector */}
296
- <div className="mb-6">
297
- <div className="flex space-x-4">
298
- {MODES.map((m) => (
299
- <button
300
- key={m}
301
- className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
302
- mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
303
- }`}
304
- onClick={() => setMode(m)}
305
- >
306
- {m}
307
- </button>
308
- ))}
309
- </div>
310
- </div>
311
-
312
- {/* Mode Content */}
313
- <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
314
- {mode === "Webcam" && (
315
- <div className="w-full text-center flex flex-col items-center">
316
- <div className="mb-4 w-full max-w-xl">
317
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
318
- <textarea
319
- className="w-full p-2 rounded-lg text-black"
320
- rows={3}
321
- value={prompt}
322
- onChange={(e) => setPrompt(e.target.value)}
323
- />
324
- </div>
325
- <div className="relative w-full max-w-xl">
326
- <video
327
- ref={videoRef}
328
- autoPlay
329
- muted
330
- playsInline
331
- className="w-full rounded-lg shadow-lg mb-2"
332
- style={{ background: "#222" }}
333
- />
334
- <canvas
335
- ref={canvasRef}
336
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
337
- style={{ zIndex: 10, pointerEvents: "none" }}
338
- />
339
- </div>
340
- {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
341
- {error && <div className="text-red-400 mt-2">Error: {error}</div>}
342
- </div>
343
- )}
344
- {mode === "URL" && (
345
- <div className="w-full text-center flex flex-col items-center">
346
- <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
347
- <div className="flex w-full max-w-xl mb-4">
348
- <input
349
- type="text"
350
- className="flex-1 px-4 py-2 rounded-l-lg text-black"
351
- value={inputUrl}
352
- onChange={(e) => setInputUrl(e.target.value)}
353
- placeholder="Paste video URL here"
354
- />
355
- <button
356
- className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
357
- onClick={() => setVideoUrl(inputUrl)}
358
- >
359
- Load
360
- </button>
361
- </div>
362
- <div className="mb-4 w-full max-w-xl">
363
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
364
- <textarea
365
- className="w-full p-2 rounded-lg text-black"
366
- rows={3}
367
- value={prompt}
368
- onChange={(e) => setPrompt(e.target.value)}
369
- />
370
- </div>
371
- <div className="relative w-full max-w-xl">
372
- <video
373
- ref={videoRef}
374
- src={videoUrl}
375
- controls
376
- autoPlay
377
- loop
378
- className="w-full rounded-lg shadow-lg mb-2"
379
- style={{ background: "#222" }}
380
- />
381
- <canvas
382
- ref={canvasRef}
383
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
384
- style={{ zIndex: 10, pointerEvents: "none" }}
385
- />
386
- <button
387
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
388
- onClick={handleToggleUrlProcessing}
389
- >
390
- {urlProcessing ? "Stop Processing" : "Start Processing"}
391
- </button>
392
- </div>
393
- {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
394
- {error && <div className="text-red-400 mt-2">Error: {error}</div>}
395
- <button
396
- className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
397
- onClick={handleTestDrawBox}
398
- >
399
- Test Draw Box
400
- </button>
401
- <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
402
- <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
403
- <div>Raw Model Output:</div>
404
- <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
405
- </div>
406
- </div>
407
- )}
408
- {mode === "File" && (
409
- <div className="w-full text-center flex flex-col items-center">
410
- <div className="mb-4 w-full max-w-xl">
411
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
412
- <textarea
413
- className="w-full p-2 rounded-lg text-black"
414
- rows={3}
415
- value={prompt}
416
- onChange={(e) => setPrompt(e.target.value)}
417
- />
418
- </div>
419
- <div className="mb-4 w-full max-w-xl">
420
- <input
421
- type="file"
422
- accept="image/*,video/*"
423
- onChange={handleFileChange}
424
- className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
425
- />
426
- </div>
427
- {/* Show uploaded image */}
428
- {uploadedFile && isImageFile(uploadedFile) && (
429
- <div className="relative w-full max-w-xl">
430
- <img
431
- ref={imageRef}
432
- src={uploadedUrl}
433
- alt="Uploaded"
434
- className="w-full rounded-lg shadow-lg mb-2"
435
- style={{ background: "#222" }}
436
- />
437
- <canvas
438
- ref={canvasRef}
439
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
440
- style={{ zIndex: 10, pointerEvents: "none" }}
441
- />
442
- <button
443
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
444
- onClick={handleProcessImage}
445
- disabled={processing}
446
- >
447
- {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
448
- </button>
449
- </div>
450
- )}
451
- {/* Show uploaded video */}
452
- {uploadedFile && isVideoFile(uploadedFile) && (
453
- <div className="relative w-full max-w-xl">
454
- <video
455
- ref={videoRef}
456
- src={uploadedUrl}
457
- controls
458
- autoPlay
459
- loop
460
- className="w-full rounded-lg shadow-lg mb-2"
461
- style={{ background: "#222" }}
462
- />
463
- <canvas
464
- ref={canvasRef}
465
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
466
- style={{ zIndex: 10, pointerEvents: "none" }}
467
- />
468
- <button
469
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
470
- onClick={handleToggleVideoProcessing}
471
- >
472
- {videoProcessing ? "Stop Processing" : "Start Processing"}
473
- </button>
474
- </div>
475
- )}
476
- {/* Show example video if no file uploaded */}
477
- {!uploadedFile && (
478
- <div className="relative w-full max-w-xl">
479
- <video
480
- ref={videoRef}
481
- src={EXAMPLE_VIDEO_URL}
482
- controls
483
- autoPlay
484
- loop
485
- className="w-full rounded-lg shadow-lg mb-2"
486
- style={{ background: "#222" }}
487
- />
488
- <canvas
489
- ref={canvasRef}
490
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
491
- style={{ zIndex: 10, pointerEvents: "none" }}
492
- />
493
- <button
494
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
495
- onClick={handleToggleExampleProcessing}
496
- >
497
- {exampleProcessing ? "Stop Processing" : "Start Processing"}
498
- </button>
499
- </div>
500
- )}
501
- {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
502
- {error && <div className="text-red-400 mt-2">Error: {error}</div>}
503
- <button
504
- className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
505
- onClick={handleTestDrawBox}
506
- >
507
- Test Draw Box
508
- </button>
509
- <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
510
- <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
511
- <div>Raw Model Output:</div>
512
- <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
513
- </div>
514
- </div>
515
- )}
516
- </div>
517
- </div>
518
- </div>
519
- );
520
  }
 
1
+ import { useState, useRef, useEffect } from "react";
2
+ import { useVLMContext } from "../context/useVLMContext";
3
+ import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
+
5
+ const MODES = ["Webcam", "URL", "File"] as const;
6
+ type Mode = typeof MODES[number];
7
+
8
+ const EXAMPLE_VIDEO_URL = "https://huggingface.co/spaces/Quazim0t0/test/resolve/main/videos/1.mp4";
9
+ const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
+
11
+ function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
12
+ if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
13
+ const label = arr[0];
14
+ return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
15
+ }
16
+ return [];
17
+ }
18
+
19
+ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
20
+ if (!raw) return [];
21
+ let boxes = [];
22
+ if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
23
+ boxes = raw.image;
24
+ } else if (Array.isArray(raw)) {
25
+ boxes = raw;
26
+ } else if (typeof raw === "object" && raw !== null) {
27
+ boxes = [raw];
28
+ }
29
+ return boxes
30
+ .map((obj: any) => {
31
+ if (!obj || !obj.bbox_2d) return null;
32
+ let bbox = obj.bbox_2d;
33
+ // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
34
+ if (
35
+ Array.isArray(bbox) &&
36
+ bbox.length === 2 &&
37
+ Array.isArray(bbox[0]) &&
38
+ Array.isArray(bbox[1]) &&
39
+ bbox[0].length === 2 &&
40
+ bbox[1].length === 2
41
+ ) {
42
+ bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
43
+ }
44
+ // If bbox_2d is [x1, y1, x2, y2], use as-is
45
+ if (
46
+ Array.isArray(bbox) &&
47
+ bbox.length === 4 &&
48
+ bbox.every((v: any) => typeof v === "number")
49
+ ) {
50
+ return { ...obj, bbox_2d: bbox };
51
+ }
52
+ // Otherwise, skip
53
+ return null;
54
+ })
55
+ .filter((obj: any) => obj);
56
+ }
57
+
58
+ function isImageFile(file: File) {
59
+ return file.type.startsWith("image/");
60
+ }
61
+ function isVideoFile(file: File) {
62
+ return file.type.startsWith("video/");
63
+ }
64
+
65
+ export default function MultiSourceCaptioningView() {
66
+ const [mode, setMode] = useState<Mode>("File");
67
+ const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
68
+ const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
69
+ const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
70
+ const [processing, setProcessing] = useState(false);
71
+ const [error, setError] = useState<string | null>(null);
72
+ const [webcamActive, setWebcamActive] = useState(false);
73
+ const [uploadedFile, setUploadedFile] = useState<File | null>(null);
74
+ const [uploadedUrl, setUploadedUrl] = useState<string>("");
75
+ const [videoProcessing, setVideoProcessing] = useState(false);
76
+ const [imageProcessed, setImageProcessed] = useState(false);
77
+ const [exampleProcessing, setExampleProcessing] = useState(false);
78
+ const [urlProcessing, setUrlProcessing] = useState(false);
79
+ const [debugOutput, setDebugOutput] = useState<string>("");
80
+ const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
81
+ const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
82
+ const [inferenceStatus, setInferenceStatus] = useState<string>("");
83
+
84
+ const videoRef = useRef<HTMLVideoElement | null>(null);
85
+ const canvasRef = useRef<HTMLCanvasElement | null>(null);
86
+ const imageRef = useRef<HTMLImageElement | null>(null);
87
+ const webcamStreamRef = useRef<MediaStream | null>(null);
88
+ const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
89
+
90
+ const processVideoFrame = async () => {
91
+ if (!videoRef.current || !canvasRef.current) return;
92
+ const video = videoRef.current;
93
+ const canvas = canvasRef.current;
94
+ if (video.paused || video.ended || video.videoWidth === 0) return;
95
+ canvas.width = video.videoWidth;
96
+ canvas.height = video.videoHeight;
97
+ const ctx = canvas.getContext("2d");
98
+ if (!ctx) return;
99
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
100
+ await runInference(video, prompt, (output: string) => {
101
+ setDebugOutput(output); // <-- Ensure Raw Model Output is updated
102
+ let boxes = extractJsonFromMarkdown(output) || [];
103
+ if (boxes.length === 0 && Array.isArray(output)) {
104
+ boxes = parseFlatBoxArray(output);
105
+ }
106
+ boxes = normalizeBoxes(boxes);
107
+ console.log("Model output:", output);
108
+ console.log("Boxes after normalization:", boxes);
109
+ console.log("Canvas size:", canvas.width, canvas.height);
110
+ if (boxes.length > 0) {
111
+ const [x1, y1, x2, y2] = boxes[0].bbox_2d;
112
+ console.log("First box coords:", x1, y1, x2, y2);
113
+ }
114
+ if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
115
+ if (Array.isArray(boxes) && boxes.length > 0) {
116
+ const scaleX = canvas.width / video.videoWidth;
117
+ const scaleY = canvas.height / video.videoHeight;
118
+ ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
119
+ drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
120
+ }
121
+ });
122
+ };
123
+
124
+ const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
125
+ const file = e.target.files?.[0] || null;
126
+ setUploadedFile(file);
127
+ setUploadedUrl(file ? URL.createObjectURL(file) : "");
128
+ setError(null);
129
+ setImageProcessed(false);
130
+ setVideoProcessing(false);
131
+ setExampleProcessing(false);
132
+ };
133
+
134
+ // Webcam setup and teardown (unchanged)
135
+ useEffect(() => {
136
+ if (mode !== "Webcam") {
137
+ if (webcamStreamRef.current) {
138
+ webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
139
+ webcamStreamRef.current = null;
140
+ }
141
+ setWebcamActive(false);
142
+ return;
143
+ }
144
+ const setupWebcam = async () => {
145
+ try {
146
+ setError(null);
147
+ const stream = await navigator.mediaDevices.getUserMedia({ video: true });
148
+ webcamStreamRef.current = stream;
149
+ if (videoRef.current) {
150
+ videoRef.current.srcObject = stream;
151
+ setWebcamActive(true);
152
+ }
153
+ } catch (e) {
154
+ setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
155
+ setWebcamActive(false);
156
+ }
157
+ };
158
+ setupWebcam();
159
+ return () => {
160
+ if (webcamStreamRef.current) {
161
+ webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
162
+ webcamStreamRef.current = null;
163
+ }
164
+ setWebcamActive(false);
165
+ };
166
+ }, [mode]);
167
+
168
+ // Webcam mode: process frames with setInterval
169
+ useEffect(() => {
170
+ if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
171
+ let interval: ReturnType<typeof setInterval> | null = null;
172
+ interval = setInterval(() => {
173
+ processVideoFrame();
174
+ }, 1000);
175
+ return () => {
176
+ if (interval) clearInterval(interval);
177
+ };
178
+ }, [mode, isLoaded, prompt, runInference, webcamActive]);
179
+
180
+ // URL mode: process frames with setInterval
181
+ useEffect(() => {
182
+ if (mode !== "URL" || !isLoaded || !urlProcessing) return;
183
+ let interval: ReturnType<typeof setInterval> | null = null;
184
+ interval = setInterval(() => {
185
+ processVideoFrame();
186
+ }, 1000);
187
+ return () => {
188
+ if (interval) clearInterval(interval);
189
+ };
190
+ }, [mode, isLoaded, prompt, runInference, urlProcessing]);
191
+
192
+ // File video mode: process frames with setInterval
193
+ useEffect(() => {
194
+ if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
195
+ let interval: ReturnType<typeof setInterval> | null = null;
196
+ interval = setInterval(() => {
197
+ processVideoFrame();
198
+ }, 1000);
199
+ return () => {
200
+ if (interval) clearInterval(interval);
201
+ };
202
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
203
+
204
+ // Example video mode: process frames with setInterval
205
+ useEffect(() => {
206
+ if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
207
+ let interval: ReturnType<typeof setInterval> | null = null;
208
+ interval = setInterval(() => {
209
+ processVideoFrame();
210
+ }, 1000);
211
+ return () => {
212
+ if (interval) clearInterval(interval);
213
+ };
214
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
215
+
216
+ // File mode: process uploaded image (only on button click)
217
+ const handleProcessImage = async () => {
218
+ if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
219
+ const img = imageRef.current;
220
+ const canvas = canvasRef.current;
221
+ canvas.width = img.naturalWidth;
222
+ canvas.height = img.naturalHeight;
223
+ setCanvasDims({w:canvas.width,h:canvas.height});
224
+ setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
225
+ const ctx = canvas.getContext("2d");
226
+ if (!ctx) return;
227
+ ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
228
+ setProcessing(true);
229
+ setError(null);
230
+ setInferenceStatus("Running inference...");
231
+ await runInference(img, prompt, (output: string) => {
232
+ setDebugOutput(output);
233
+ setInferenceStatus("Inference complete.");
234
+ ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
235
+ let boxes = extractJsonFromMarkdown(output) || [];
236
+ if (boxes.length === 0 && Array.isArray(output)) {
237
+ boxes = parseFlatBoxArray(output);
238
+ }
239
+ boxes = normalizeBoxes(boxes);
240
+ console.log("Model output:", output);
241
+ console.log("Boxes after normalization:", boxes);
242
+ console.log("Canvas size:", canvas.width, canvas.height);
243
+ if (boxes.length > 0) {
244
+ const [x1, y1, x2, y2] = boxes[0].bbox_2d;
245
+ console.log("First box coords:", x1, y1, x2, y2);
246
+ }
247
+ if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
248
+ if (Array.isArray(boxes) && boxes.length > 0) {
249
+ const scaleX = canvas.width / img.naturalWidth;
250
+ const scaleY = canvas.height / img.naturalHeight;
251
+ drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
252
+ }
253
+ setImageProcessed(true);
254
+ });
255
+ setProcessing(false);
256
+ };
257
+
258
+ // File mode: process uploaded video frames (start/stop)
259
+ const handleToggleVideoProcessing = () => {
260
+ setVideoProcessing((prev) => !prev);
261
+ };
262
+
263
+ // Handle start/stop for example video processing
264
+ const handleToggleExampleProcessing = () => {
265
+ setExampleProcessing((prev) => !prev);
266
+ };
267
+
268
+ // Handle start/stop for URL video processing
269
+ const handleToggleUrlProcessing = () => {
270
+ setUrlProcessing((prev) => !prev);
271
+ };
272
+
273
+ // Test draw box function
274
+ const handleTestDrawBox = () => {
275
+ if (!canvasRef.current) return;
276
+ const canvas = canvasRef.current;
277
+ const ctx = canvas.getContext("2d");
278
+ if (!ctx) return;
279
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
280
+ ctx.strokeStyle = "#FF00FF";
281
+ ctx.lineWidth = 4;
282
+ ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
283
+ ctx.font = "20px Arial";
284
+ ctx.fillStyle = "#FF00FF";
285
+ ctx.fillText("Test Box", 50, 35);
286
+ };
287
+
288
  // Render: fixed model-status banner, inference status line, mode selector,
  // then one panel per input mode (Webcam / URL / File).
  // NOTE(review): videoRef and canvasRef are attached in several branches
  // below; only one branch renders at a time (gated on `mode` and
  // `uploadedFile`), so each ref resolves to a single mounted node — confirm
  // the File-mode sub-branches (image / video / example) stay mutually
  // exclusive if this JSX is restructured.
  return (
    <div className="absolute inset-0 text-white">
      {/* Model load status banner (always on top, z-50) */}
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      {/* Per-frame inference status (e.g. "No boxes detected...") */}
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "Webcam" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  autoPlay
                  muted
                  playsInline
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                {/* Bounding-box overlay; pointer-events disabled so the video stays interactive */}
                <canvas
                  ref={canvasRef}
                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
                  style={{ zIndex: 10, pointerEvents: "none" }}
                />
              </div>
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
            </div>
          )}
          {mode === "URL" && (
            <div className="w-full text-center flex flex-col items-center">
              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
              {/* URL is staged in inputUrl and only applied to the player on "Load" */}
              <div className="flex w-full max-w-xl mb-4">
                <input
                  type="text"
                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
                  value={inputUrl}
                  onChange={(e) => setInputUrl(e.target.value)}
                  placeholder="Paste video URL here"
                />
                <button
                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
                  onClick={() => setVideoUrl(inputUrl)}
                >
                  Load
                </button>
              </div>
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  src={videoUrl}
                  controls
                  autoPlay
                  loop
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                <canvas
                  ref={canvasRef}
                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
                  style={{ zIndex: 10, pointerEvents: "none" }}
                />
                <button
                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                  onClick={handleToggleUrlProcessing}
                >
                  {urlProcessing ? "Stop Processing" : "Start Processing"}
                </button>
              </div>
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              {/* Debug: draw a static box to verify the canvas overlay */}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug readout: canvas/video dimensions and raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  {/* Single-shot inference; disabled while a frame is in flight */}
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <video
                    ref={videoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* Show example video if no file uploaded */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl">
                  <video
                    ref={videoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              {/* Debug: draw a static box to verify the canvas overlay */}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug readout: canvas/video dimensions and raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
520
  }