Quazim0t0 committed on
Commit
da86406
·
verified ·
1 Parent(s): 98c6726

Update src/components/MultiSourceCaptioningView.tsx

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,564 +1,533 @@
1
- import React, { useState, useRef, useEffect, useCallback } from "react";
2
- import { useVLMContext } from "../context/useVLMContext";
3
- import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
-
5
- const MODES = ["Webcam", "URL", "File"] as const;
6
- type Mode = typeof MODES[number];
7
-
8
- const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
9
- const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
-
11
- // Helper functions (remain the same)
12
- function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
13
- if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
14
- const label = arr[0];
15
- return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
16
- }
17
- return [];
18
- }
19
-
20
- function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
21
- if (!raw) return [];
22
- let boxes = [];
23
- if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
24
- boxes = raw.image;
25
- } else if (Array.isArray(raw)) {
26
- boxes = raw;
27
- } else if (typeof raw === "object" && raw !== null) {
28
- boxes = [raw];
29
- }
30
- return boxes
31
- .map((obj: any) => {
32
- if (!obj || !obj.bbox_2d) return null;
33
- let bbox = obj.bbox_2d;
34
- if (
35
- Array.isArray(bbox) &&
36
- bbox.length === 2 &&
37
- Array.isArray(bbox[0]) &&
38
- Array.isArray(bbox[1]) &&
39
- bbox[0].length === 2 &&
40
- bbox[1].length === 2
41
- ) {
42
- bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
43
- }
44
- if (
45
- Array.isArray(bbox) &&
46
- bbox.length === 4 &&
47
- bbox.every((v: any) => typeof v === "number")
48
- ) {
49
- return { ...obj, bbox_2d: bbox };
50
- }
51
- return null;
52
- })
53
- .filter((obj: any) => obj);
54
- }
55
-
56
- function isImageFile(file: File) {
57
- return file.type.startsWith("image/");
58
- }
59
- function isVideoFile(file: File) {
60
- return file.type.startsWith("video/");
61
- }
62
-
63
- export default function MultiSourceCaptioningView() {
64
- const [mode, setMode] = useState<Mode>("File");
65
- const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
66
- const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
67
- const [processingState, setProcessingState] = useState(false); // General processing indicator
68
- const [error, setError] = useState<string | null>(null);
69
- const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream
70
- const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
71
- const [inferenceStatus, setInferenceStatus] = useState<string>("");
72
- const [debugOutput, setDebugOutput] = useState<string>("");
73
-
74
- // Refs for the two video elements and the canvas
75
- const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
76
- const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
77
- const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
78
- const imageRef = useRef<HTMLImageElement>(null); // For image file processing
79
-
80
- const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
81
-
82
- // --- Drawing Loop for the Visible Display ---
83
- // This loop runs constantly to draw the latest boxes on the display video
84
- const drawDisplayCanvas = useCallback(() => {
85
- const displayVideo = displayVideoRef.current;
86
- const canvas = canvasRef.current;
87
- const ctx = canvas?.getContext('2d');
88
-
89
- if (!displayVideo || !canvas || !ctx) {
90
- return;
91
- }
92
-
93
- // Adjust canvas size to match the display video's dimensions
94
- if (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight) {
95
- canvas.width = displayVideo.videoWidth;
96
- canvas.height = displayVideo.videoHeight;
97
- }
98
-
99
- // Clear the canvas each frame
100
- ctx.clearRect(0, 0, canvas.width, canvas.height);
101
-
102
- // Draw the latest bounding boxes
103
- const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
104
- const scaleY = canvas.height / (displayVideo.videoHeight || 1);
105
- drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
106
-
107
- // Only request next frame if video is playing to avoid unnecessary redraws when paused/ended
108
- if (!displayVideo.paused && !displayVideo.ended) {
109
- requestAnimationFrame(drawDisplayCanvas);
110
- }
111
- }, [latestBoxes]); // Re-create if latestBoxes changes
112
-
113
- // Effect to start the display drawing loop when the display video is ready
114
- useEffect(() => {
115
- const displayVideo = displayVideoRef.current;
116
- if (displayVideo) {
117
- const handleVideoReady = () => {
118
- // Start the requestAnimationFrame loop once the video has loaded metadata
119
- if (displayVideo.readyState >= 1) { // HAVE_METADATA
120
- requestAnimationFrame(drawDisplayCanvas);
121
- }
122
- };
123
- displayVideo.addEventListener('loadedmetadata', handleVideoReady);
124
- // Also check if video is already ready (e.g., on component re-mount)
125
- if (displayVideo.readyState >= 1) {
126
- requestAnimationFrame(drawDisplayCanvas);
127
- }
128
- return () => {
129
- displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
130
- };
131
- }
132
- }, [drawDisplayCanvas]);
133
-
134
- // --- FastVLM Processing Loop (from hidden video/image) ---
135
- // This interval loop controls when FastVLM processes a frame
136
- useEffect(() => {
137
- const vlmVideo = vlmVideoRef.current;
138
- const isVideoMode = (mode === "Webcam" || (mode === "URL" && vlmVideo?.src) || (mode === "File" && vlmVideo?.src && isVideoFile(uploadedFile || null)));
139
-
140
- if (!isLoaded || !vlmVideo || !isVideoMode) {
141
- // If not in a video mode or VLM/video not ready, ensure processing stops
142
- setProcessingState(false);
143
- return;
144
- }
145
-
146
- let interval: ReturnType<typeof setInterval> | null = null;
147
-
148
- const startVLMProcessing = () => {
149
- if (interval) clearInterval(interval); // Clear any old interval
150
-
151
- interval = setInterval(async () => {
152
- if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) {
153
- return; // Skip if video not ready, paused, ended, or already processing
154
- }
155
-
156
- setProcessingState(true);
157
- setInferenceStatus("Running inference...");
158
- setError(null); // Clear previous errors
159
-
160
- try {
161
- // Create a temporary offscreen canvas to get image data from the VLM video
162
- const tempCanvas = document.createElement('canvas');
163
- tempCanvas.width = vlmVideo.videoWidth;
164
- tempCanvas.height = vlmVideo.videoHeight;
165
- const tempCtx = tempCanvas.getContext('2d', { willReadFrequently: true });
166
-
167
- if (tempCtx && vlmVideo.readyState >= 2) { // HAVE_CURRENT_DATA
168
- tempCtx.drawImage(vlmVideo, 0, 0, tempCanvas.width, tempCanvas.height);
169
- const imageData = tempCtx.getImageData(0, 0, tempCanvas.width, tempCanvas.height);
170
-
171
- const modelOutput = await runInference(imageData, prompt); // Pass ImageData
172
- setDebugOutput(modelOutput); // Update raw model output
173
-
174
- let boxes = extractJsonFromMarkdown(modelOutput) || [];
175
- if (boxes.length === 0 && Array.isArray(modelOutput)) { // Fallback for direct array output
176
- // This condition `Array.isArray(modelOutput)` is unlikely if modelOutput is string,
177
- // so ensure `extractJsonFromMarkdown` is robust or `runInference` returns expected string
178
- }
179
- boxes = normalizeBoxes(boxes);
180
-
181
- setLatestBoxes(boxes); // Update state, triggers display canvas redraw
182
- setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
183
- } else {
184
- setInferenceStatus("Video not ready for processing.");
185
- }
186
- } catch (e) {
187
- setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
188
- setLatestBoxes([]);
189
- setInferenceStatus("Inference failed.");
190
- } finally {
191
- setProcessingState(false); // Processing finished
192
- }
193
- }, 200); // Inference interval (e.g., 5 frames per second)
194
- };
195
-
196
- const stopVLMProcessing = () => {
197
- if (interval) clearInterval(interval);
198
- interval = null;
199
- setProcessingState(false);
200
- setInferenceStatus("Stopped processing.");
201
- };
202
-
203
- // Start/stop processing based on video playback events
204
- vlmVideo.addEventListener('play', startVLMProcessing);
205
- vlmVideo.addEventListener('pause', stopVLMProcessing);
206
- vlmVideo.addEventListener('ended', stopVLMProcessing);
207
-
208
- // Initial check if video is already playing (e.g., after initial load/autoplay)
209
- if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
210
- startVLMProcessing();
211
- }
212
-
213
- // Cleanup function for useEffect
214
- return () => {
215
- stopVLMProcessing();
216
- vlmVideo.removeEventListener('play', startVLMProcessing);
217
- vlmVideo.removeEventListener('pause', stopVLMProcessing);
218
- vlmVideo.removeEventListener('ended', stopVLMProcessing);
219
- };
220
- }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Added uploadedFile for file mode re-trigger
221
-
222
- // --- Media Source Handling ---
223
-
224
- // Cleanup for media stream and object URLs
225
- const cleanupMediaSource = useCallback(() => {
226
- if (mediaStream) {
227
- mediaStream.getTracks().forEach(track => track.stop());
228
- setMediaStream(null);
229
- }
230
- // Revoke any created blob URLs (for file inputs)
231
- if (displayVideoRef.current?.src.startsWith('blob:')) {
232
- URL.revokeObjectURL(displayVideoRef.current.src);
233
- displayVideoRef.current.src = "";
234
- }
235
- if (vlmVideoRef.current?.src.startsWith('blob:')) {
236
- URL.revokeObjectURL(vlmVideoRef.current.src);
237
- vlmVideoRef.current.src = "";
238
- }
239
- setLatestBoxes([]); // Clear boxes when source changes
240
- setError(null);
241
- setInferenceStatus("");
242
- setDebugOutput("");
243
- }, [mediaStream]);
244
-
245
- // Handle changing the mode (Webcam, URL, File)
246
- useEffect(() => {
247
- cleanupMediaSource(); // Clean up previous source
248
-
249
- const displayVideo = displayVideoRef.current;
250
- const vlmVideo = vlmVideoRef.current;
251
-
252
- if (!displayVideo || !vlmVideo) return;
253
-
254
- // Reset srcObject/src to ensure fresh start
255
- displayVideo.srcObject = null;
256
- vlmVideo.srcObject = null;
257
- displayVideo.src = "";
258
- vlmVideo.src = "";
259
-
260
- setLatestBoxes([]); // Clear boxes on mode change
261
- setError(null);
262
- setInferenceStatus("");
263
- setDebugOutput("");
264
-
265
- // Special handling for initial file mode to load example video
266
- if (mode === "File" && !uploadedFile) {
267
- displayVideo.src = EXAMPLE_VIDEO_URL;
268
- vlmVideo.src = EXAMPLE_VIDEO_URL;
269
- displayVideo.load(); vlmVideo.load(); // Load the video
270
- displayVideo.play().catch(e => console.error("Error playing example display video:", e));
271
- vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
272
- }
273
- }, [mode, uploadedFile, cleanupMediaSource]); // Added uploadedFile to ensure re-trigger for file mode
274
-
275
- // Handle Webcam Input
276
- const handleWebcamInput = useCallback(async () => {
277
- cleanupMediaSource(); // Clean up any active stream
278
- try {
279
- const stream = await navigator.mediaDevices.getUserMedia({ video: true });
280
- setMediaStream(stream); // Store stream to manage it
281
-
282
- if (displayVideoRef.current && vlmVideoRef.current) {
283
- displayVideoRef.current.srcObject = stream;
284
- vlmVideoRef.current.srcObject = stream;
285
- // Programmatically play both videos
286
- displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
287
- vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
288
- }
289
- setMode("Webcam");
290
- } catch (e) {
291
- setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
292
- setMediaStream(null);
293
- setLatestBoxes([]);
294
- setInferenceStatus("Webcam access denied or failed.");
295
- }
296
- }, [cleanupMediaSource]);
297
-
298
- // Handle URL Input (when Load button is clicked)
299
- const handleLoadUrl = useCallback(() => {
300
- cleanupMediaSource(); // Clean up any active stream
301
-
302
- const url = currentUrlInput;
303
- if (!url) {
304
- setError("Please enter a valid URL.");
305
- return;
306
- }
307
-
308
- if (displayVideoRef.current && vlmVideoRef.current) {
309
- displayVideoRef.current.src = url;
310
- vlmVideoRef.current.src = url;
311
- displayVideoRef.current.load(); vlmVideoRef.current.load(); // Load the video
312
- displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
313
- vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
314
- setMode("URL");
315
- }
316
- }, [currentUrlInput, cleanupMediaSource]);
317
-
318
- // Handle File Input
319
- const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
320
- cleanupMediaSource(); // Clean up any active stream
321
-
322
- const file = e.target.files?.[0] || null;
323
- if (file) {
324
- const fileUrl = URL.createObjectURL(file); // Create blob URL for the file
325
- // Store the file to differentiate image/video and manage its URL
326
- setUploadedFile(file);
327
-
328
- if (isImageFile(file)) {
329
- // For images, we handle processing on a button click, not a continuous loop
330
- // The imageRef will display the image
331
- // The canvas will be used for processing and drawing
332
- setError(null);
333
- setMode("File");
334
- } else if (isVideoFile(file)) {
335
- if (displayVideoRef.current && vlmVideoRef.current) {
336
- displayVideoRef.current.src = fileUrl;
337
- vlmVideoRef.current.src = fileUrl;
338
- displayVideoRef.current.load(); vlmVideoRef.current.load();
339
- displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
340
- vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
341
- setMode("File");
342
- }
343
- } else {
344
- setError("Unsupported file type. Please upload an image or video.");
345
- setUploadedFile(null);
346
- if (fileUrl) URL.revokeObjectURL(fileUrl); // Clean up invalid file URL
347
- }
348
- } else {
349
- setUploadedFile(null); // Clear file if nothing selected
350
- // If no file selected, revert to example video if in File mode
351
- if (mode === "File") {
352
- if (displayVideoRef.current && vlmVideoRef.current) {
353
- displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
354
- vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
355
- displayVideoRef.current.load(); vlmVideoRef.current.load();
356
- displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
357
- vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
358
- }
359
- }
360
- }
361
- }, [cleanupMediaSource, mode]);
362
-
363
-
364
- // Handler for processing an uploaded image file (one-time inference)
365
- const handleProcessImage = async () => {
366
- if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
367
- setError("Image or model not ready for processing.");
368
- return;
369
- }
370
-
371
- const img = imageRef.current;
372
- const canvas = canvasRef.current;
373
- const ctx = canvas.getContext("2d");
374
- if (!ctx) return;
375
-
376
- // Ensure canvas dimensions match image for processing and display
377
- canvas.width = img.naturalWidth;
378
- canvas.height = img.naturalHeight;
379
-
380
- setProcessingState(true);
381
- setError(null);
382
- setInferenceStatus("Running image inference...");
383
-
384
- try {
385
- // Draw image to canvas to get ImageData for inference
386
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
387
- const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
388
-
389
- const modelOutput = await runInference(imageData, prompt);
390
- setDebugOutput(modelOutput);
391
- setInferenceStatus("Image inference complete.");
392
-
393
- // Clear canvas and redraw image before drawing boxes
394
- ctx.clearRect(0, 0, canvas.width, canvas.height);
395
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
396
-
397
- let boxes = extractJsonFromMarkdown(modelOutput) || [];
398
- boxes = normalizeBoxes(boxes);
399
- setLatestBoxes(boxes); // Update latestBoxes for display
400
-
401
- if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
402
- } catch (e) {
403
- setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
404
- setLatestBoxes([]);
405
- setInferenceStatus("Image inference failed.");
406
- } finally {
407
- setProcessingState(false);
408
- }
409
- };
410
-
411
- // --- Rendered UI ---
412
- return (
413
- <div className="absolute inset-0 text-white flex flex-col">
414
- <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
415
- {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
416
- </div>
417
- <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div> {/* Adjusted top margin */}
418
-
419
- <div className="flex flex-col items-center justify-center flex-1 w-full p-4"> {/* Added padding */}
420
- {/* Mode Selector */}
421
- <div className="mb-6 mt-4"> {/* Increased margin-top for selector */}
422
- <div className="flex space-x-4">
423
- {MODES.map((m) => (
424
- <button
425
- key={m}
426
- className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
427
- mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
428
- }`}
429
- onClick={() => setMode(m)}
430
- disabled={!isLoaded && m !== "File"} // Disable if model not loaded, except for initial file view
431
- >
432
- {m}
433
- </button>
434
- ))}
435
- </div>
436
- </div>
437
-
438
- {/* Dynamic Content Area */}
439
- <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
440
- {/* Prompt Input (Common to all modes) */}
441
- <div className="mb-4 w-full max-w-xl">
442
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
443
- <textarea
444
- className="w-full p-2 rounded-lg text-black"
445
- rows={3}
446
- value={prompt}
447
- onChange={(e) => setPrompt(e.target.value)}
448
- disabled={processingState}
449
- />
450
- </div>
451
-
452
- {/* Video/Image Display and Canvas Overlay */}
453
- <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
454
- {/* Conditional rendering for image vs video display */}
455
- {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
456
- <img
457
- ref={imageRef}
458
- src={URL.createObjectURL(uploadedFile)} // Use object URL for display
459
- alt="Uploaded"
460
- className="max-w-full max-h-full block object-contain"
461
- style={{ position: 'absolute' }}
462
- onLoad={() => {
463
- // This is important to ensure canvas matches image size for single image processing
464
- if (imageRef.current && canvasRef.current) {
465
- canvasRef.current.width = imageRef.current.naturalWidth;
466
- canvasRef.current.height = imageRef.current.naturalHeight;
467
- }
468
- }}
469
- />
470
- ) : (
471
- <video
472
- ref={displayVideoRef}
473
- autoPlay
474
- muted
475
- playsInline
476
- loop // Loop for URL and File videos
477
- className="max-w-full max-h-full block object-contain"
478
- style={{ position: 'absolute' }}
479
- />
480
- )}
481
- <canvas
482
- ref={canvasRef}
483
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
484
- style={{ zIndex: 10 }}
485
- />
486
- </div>
487
-
488
- {/* Controls specific to each mode */}
489
- <div className="mt-4 flex flex-col items-center gap-2">
490
- {mode === "Webcam" && (
491
- <button
492
- className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
493
- onClick={handleWebcamInput} // This button sets up/starts webcam
494
- disabled={processingState || !isLoaded}
495
- >
496
- {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
497
- </button>
498
- )}
499
-
500
- {mode === "URL" && (
501
- <>
502
- <div className="flex w-full max-w-xl">
503
- <input
504
- type="text"
505
- className="flex-1 px-4 py-2 rounded-l-lg text-black"
506
- value={currentUrlInput}
507
- onChange={(e) => setCurrentUrlInput(e.target.value)}
508
- placeholder="Paste video URL here"
509
- disabled={processingState}
510
- />
511
- <button
512
- className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
513
- onClick={handleLoadUrl}
514
- disabled={processingState || !isLoaded}
515
- >
516
- Load URL
517
- </button>
518
- </div>
519
- </>
520
- )}
521
-
522
- {mode === "File" && (
523
- <>
524
- <input
525
- type="file"
526
- accept="image/*,video/*"
527
- onChange={handleFileChange}
528
- className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
529
- disabled={processingState}
530
- />
531
- {uploadedFile && isImageFile(uploadedFile) && (
532
- <button
533
- className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
534
- onClick={handleProcessImage}
535
- disabled={processingState || !isLoaded}
536
- >
537
- {processingState ? "Processing Image..." : "Process Image"}
538
- </button>
539
- )}
540
- </>
541
- )}
542
- </div>
543
-
544
- {/* Error and Debug Output */}
545
- {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
546
- <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
547
- <div>Raw Model Output:</div>
548
- <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
549
- </div>
550
- </div>
551
- </div>
552
-
553
- {/* Hidden Video for VLM processing - this must be rendered always */}
554
- <video
555
- ref={vlmVideoRef}
556
- autoPlay
557
- muted
558
- playsInline
559
- loop // Loop for URL and File videos
560
- style={{ display: 'none' }} // Hidden from view
561
- />
562
- </div>
563
- );
564
  }
 
1
+ import React, { useState, useRef, useEffect, useCallback } from "react";
2
+ import { useVLMContext } from "../context/useVLMContext";
3
+ import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
+
5
+ const MODES = ["Webcam", "URL", "File"] as const;
6
+ type Mode = typeof MODES[number];
7
+
8
+ const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
9
+ const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
+
11
+ // Helper function: normalizeBoxes remains as it is used
12
/**
 * Converts loosely-structured model output into a flat list of drawable boxes.
 *
 * Accepted input shapes:
 *  - an object whose `image` property is an array of box objects,
 *  - a bare array of box objects,
 *  - a single box object.
 *
 * Each box must carry a `bbox_2d` that is either `[x1, y1, x2, y2]` or the
 * nested pair form `[[x1, y1], [x2, y2]]`; the nested form is flattened.
 * Entries whose coordinates are missing, have the wrong arity, or are not
 * finite numbers (strings, `NaN`, `Infinity`) are dropped rather than being
 * handed to the canvas drawing code.
 *
 * All other properties of a kept entry (including `label`) are passed through
 * unchanged; `label` itself is not validated here.
 *
 * @param raw - Parsed model output in any of the shapes above; anything else yields `[]`.
 * @returns The entries that have a usable four-number bounding box, in input order.
 */
function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
  // Primitives and null/undefined cannot contain boxes.
  if (!raw || typeof raw !== "object") return [];

  // Pick the list of candidates: wrapped under `image`, already a list, or a lone object.
  let candidates: unknown[];
  if (Array.isArray(raw.image)) {
    candidates = raw.image;
  } else if (Array.isArray(raw)) {
    candidates = raw;
  } else {
    candidates = [raw];
  }

  const normalized: { label: string, bbox_2d: number[] }[] = [];

  for (const candidate of candidates) {
    if (!candidate || typeof candidate !== "object") continue;

    let coords: unknown = (candidate as { bbox_2d?: unknown }).bbox_2d;

    // Flatten the corner-pair form [[x1, y1], [x2, y2]] into [x1, y1, x2, y2].
    if (
      Array.isArray(coords) &&
      coords.length === 2 &&
      Array.isArray(coords[0]) &&
      Array.isArray(coords[1]) &&
      coords[0].length === 2 &&
      coords[1].length === 2
    ) {
      coords = [coords[0][0], coords[0][1], coords[1][0], coords[1][1]];
    }

    // Number.isFinite is false for non-numbers as well as NaN/±Infinity,
    // so one check covers both "wrong type" and "unusable value".
    if (
      Array.isArray(coords) &&
      coords.length === 4 &&
      coords.every((value) => Number.isFinite(value))
    ) {
      normalized.push({ ...(candidate as { label: string }), bbox_2d: coords as number[] });
    }
  }

  return normalized;
}
47
+
48
/**
 * Reports whether `file` is an image, judged by its MIME type.
 *
 * Tolerates `null`/`undefined` (answering `false`) so it can be called
 * directly with a nullable "currently selected file" value without the
 * caller having to guard first.
 *
 * @param file - The candidate file, or `null`/`undefined` when none is selected.
 * @returns `true` only when a file is present and its `type` starts with "image/".
 */
function isImageFile(file: File | null | undefined): boolean {
  return file != null && file.type.startsWith("image/");
}
51
/**
 * Reports whether `file` is a video, judged by its MIME type.
 *
 * Accepts `null`/`undefined` and answers `false` for them: the component
 * calls this with `uploadedFile || null` while no upload is selected, and
 * dereferencing `file.type` in that case would throw a TypeError.
 *
 * @param file - The candidate file, or `null`/`undefined` when none is selected.
 * @returns `true` only when a file is present and its `type` starts with "video/".
 */
function isVideoFile(file: File | null | undefined): boolean {
  return file != null && file.type.startsWith("video/");
}
54
+
55
+ export default function MultiSourceCaptioningView() {
56
+ const [mode, setMode] = useState<Mode>("File");
57
+ const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
58
+ const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
59
+ const [processingState, setProcessingState] = useState(false); // General processing indicator
60
+ const [error, setError] = useState<string | null>(null);
61
+ const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream
62
+ const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
63
+ const [inferenceStatus, setInferenceStatus] = useState<string>("");
64
+ const [debugOutput, setDebugOutput] = useState<string>("");
65
+ const [uploadedFile, setUploadedFile] = useState<File | null>(null); // <<< ADDED THIS STATE
66
+
67
+ // Refs for the two video elements and the canvas
68
+ const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
69
+ const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
70
+ const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
71
+ const imageRef = useRef<HTMLImageElement>(null); // For image file processing
72
+
73
+ const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
74
+
75
+ // --- Drawing Loop for the Visible Display ---
76
+ // This loop runs constantly to draw the latest boxes on the display video
77
+ const drawDisplayCanvas = useCallback(() => {
78
+ const displayVideo = displayVideoRef.current;
79
+ const canvas = canvasRef.current;
80
+ const ctx = canvas?.getContext('2d');
81
+
82
+ if (!displayVideo || !canvas || !ctx) {
83
+ return;
84
+ }
85
+
86
+ // Adjust canvas size to match the display video's dimensions
87
+ // Only adjust if video has valid dimensions
88
+ if (displayVideo.videoWidth > 0 && displayVideo.videoHeight > 0 &&
89
+ (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)) {
90
+ canvas.width = displayVideo.videoWidth;
91
+ canvas.height = displayVideo.videoHeight;
92
+ }
93
+
94
+ // Clear the canvas each frame
95
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
96
+
97
+ // Draw the latest bounding boxes
98
+ const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
99
+ const scaleY = canvas.height / (displayVideo.videoHeight || 1);
100
+ drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
101
+
102
+ // Only request next frame if video is playing to avoid unnecessary redraws when paused/ended
103
+ if (!displayVideo.paused && !displayVideo.ended) {
104
+ requestAnimationFrame(drawDisplayCanvas);
105
+ }
106
+ }, [latestBoxes]); // Re-create if latestBoxes changes
107
+
108
+ // Effect to start the display drawing loop when the display video is ready
109
+ useEffect(() => {
110
+ const displayVideo = displayVideoRef.current;
111
+ if (displayVideo) {
112
+ const handleVideoReady = () => {
113
+ if (displayVideo.readyState >= 1) { // HAVE_METADATA
114
+ requestAnimationFrame(drawDisplayCanvas);
115
+ }
116
+ };
117
+ displayVideo.addEventListener('loadedmetadata', handleVideoReady);
118
+ displayVideo.addEventListener('play', handleVideoReady); // Also start on play
119
+ // Also check if video is already ready (e.g., on component re-mount or autoplay)
120
+ if (displayVideo.readyState >= 1) {
121
+ requestAnimationFrame(drawDisplayCanvas);
122
+ }
123
+ return () => {
124
+ displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
125
+ displayVideo.removeEventListener('play', handleVideoReady);
126
+ };
127
+ }
128
+ }, [drawDisplayCanvas]);
129
+
130
+ // --- FastVLM Processing Loop (from hidden video) ---
131
+ // This interval loop controls when FastVLM processes a frame
132
+ useEffect(() => {
133
+ const vlmVideo = vlmVideoRef.current;
134
+ // Determine if we are in a video-based mode that requires continuous processing
135
+ const isVideoModeActive = (
136
+ mode === "Webcam" ||
137
+ (mode === "URL" && !!vlmVideo?.src) || // Check if URL video is loaded
138
+ (mode === "File" && !!vlmVideo?.src && isVideoFile(uploadedFile || null)) // Check if file is video
139
+ );
140
+
141
+ if (!isLoaded || !vlmVideo || !isVideoModeActive) {
142
+ setProcessingState(false);
143
+ return;
144
+ }
145
+
146
+ let interval: ReturnType<typeof setInterval> | null = null;
147
+
148
+ const startVLMProcessing = () => {
149
+ if (interval) clearInterval(interval); // Clear any old interval
150
+
151
+ interval = setInterval(async () => {
152
+ if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) {
153
+ return; // Skip if video not ready, paused, ended, or already processing
154
+ }
155
+
156
+ setProcessingState(true);
157
+ setInferenceStatus("Running inference...");
158
+ setError(null);
159
+
160
+ try {
161
+ // Pass the HTMLVideoElement directly to runInference
162
+ const modelOutput = await runInference(vlmVideo, prompt); // <<< FIXED: Pass video element directly
163
+ setDebugOutput(modelOutput);
164
+
165
+ let boxes = extractJsonFromMarkdown(modelOutput) || [];
166
+ boxes = normalizeBoxes(boxes);
167
+
168
+ setLatestBoxes(boxes);
169
+ setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
170
+ } catch (e) {
171
+ setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
172
+ setLatestBoxes([]);
173
+ setInferenceStatus("Inference failed.");
174
+ } finally {
175
+ setProcessingState(false);
176
+ }
177
+ }, 200); // Inference interval (e.g., 5 frames per second)
178
+ };
179
+
180
+ const stopVLMProcessing = () => {
181
+ if (interval) clearInterval(interval);
182
+ interval = null;
183
+ setProcessingState(false);
184
+ setInferenceStatus("Stopped processing.");
185
+ };
186
+
187
+ vlmVideo.addEventListener('play', startVLMProcessing);
188
+ vlmVideo.addEventListener('pause', stopVLMProcessing);
189
+ vlmVideo.addEventListener('ended', stopVLMProcessing);
190
+ vlmVideo.addEventListener('loadeddata', startVLMProcessing); // Also start on loadeddata for better reliability
191
+
192
+ // Initial check if video is already playing or ready
193
+ if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
194
+ startVLMProcessing();
195
+ }
196
+
197
+ return () => {
198
+ stopVLMProcessing();
199
+ vlmVideo.removeEventListener('play', startVLMProcessing);
200
+ vlmVideo.removeEventListener('pause', stopVLMProcessing);
201
+ vlmVideo.removeEventListener('ended', stopVLMProcessing);
202
+ vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
203
+ };
204
+ }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Keep uploadedFile for re-trigger on file change
205
+
206
+ // --- Media Source Handling ---
207
+
208
/**
 * Tears down the currently active media source and resets per-source UI state.
 *
 * - Stops every track of the active `mediaStream` (if any) and clears it.
 * - Revokes and clears any `blob:` URL assigned to the display / VLM videos.
 * - Clears detected boxes, the error message, the status line, the raw model
 *   output and the uploaded file.
 *
 * The callback is re-created whenever `mediaStream` changes.
 */
const cleanupMediaSource = useCallback(() => {
  if (mediaStream) {
    for (const track of mediaStream.getTracks()) {
      track.stop();
    }
    setMediaStream(null);
  }

  // Display element first, then the hidden VLM element. Only object URLs are
  // revoked; plain http(s)/relative sources are left in place.
  for (const videoRef of [displayVideoRef, vlmVideoRef]) {
    const video = videoRef.current;
    if (video?.src.startsWith('blob:')) {
      URL.revokeObjectURL(video.src);
      video.src = "";
    }
  }

  setLatestBoxes([]);
  setError(null);
  setInferenceStatus("");
  setDebugOutput("");
  setUploadedFile(null); // Clear uploaded file on source change
}, [mediaStream]);
228
+
229
// Handle changing the mode (Webcam, URL, File).
//
// The effect has to re-run when `uploadedFile` or `cleanupMediaSource` change
// (both are read below), but the teardown must only happen when `mode` itself
// changed. `cleanupMediaSource` gets a new identity every time `mediaStream`
// changes, and `uploadedFile` changes on every upload; tearing down on those
// re-runs would stop a webcam stream right after it was started and discard a
// file right after it was picked. The ref remembers which mode was last
// handled so the two situations can be told apart.
const lastHandledModeRef = useRef<typeof mode | null>(null);

useEffect(() => {
  const modeChanged = lastHandledModeRef.current !== mode;
  lastHandledModeRef.current = mode;

  if (modeChanged) {
    cleanupMediaSource();
  }

  const displayVideo = displayVideoRef.current;
  const vlmVideo = vlmVideoRef.current;

  // The display <video> is not mounted while an uploaded image is shown.
  if (!displayVideo || !vlmVideo) return;

  if (modeChanged) {
    // Reset srcObject/src to ensure a fresh start for the new mode.
    displayVideo.srcObject = null;
    vlmVideo.srcObject = null;
    displayVideo.src = "";
    vlmVideo.src = "";
  }

  // "File" mode with no file selected falls back to the bundled example video.
  if (mode === "File" && !uploadedFile) {
    // `HTMLMediaElement.src` reads back as an absolute URL, so resolve the
    // example path the same way before comparing.
    const exampleUrl = new URL(EXAMPLE_VIDEO_URL, document.baseURI).href;
    if (!modeChanged && displayVideo.src === exampleUrl && vlmVideo.src === exampleUrl) {
      // Already showing the example (e.g. the file-input handler loaded it);
      // reloading would only restart playback.
      return;
    }

    displayVideo.src = EXAMPLE_VIDEO_URL;
    vlmVideo.src = EXAMPLE_VIDEO_URL;
    displayVideo.load();
    vlmVideo.load();
    displayVideo.play().catch(e => console.error("Error playing example display video:", e));
    vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
  }
}, [mode, uploadedFile, cleanupMediaSource]);
253
+
254
/**
 * Starts (or restarts) the webcam source.
 *
 * Tears down the current source, requests a video-only stream via
 * `getUserMedia`, stores it in state, attaches it to both the visible and the
 * hidden VLM video elements and switches the mode to "Webcam". Any failure is
 * reported through the error / status state instead of being thrown.
 */
const handleWebcamInput = useCallback(async () => {
  cleanupMediaSource();

  try {
    const webcamStream = await navigator.mediaDevices.getUserMedia({ video: true });
    setMediaStream(webcamStream);

    const displayVideo = displayVideoRef.current;
    const vlmVideo = vlmVideoRef.current;
    if (displayVideo && vlmVideo) {
      // Both elements share the same stream object.
      displayVideo.srcObject = webcamStream;
      vlmVideo.srcObject = webcamStream;
      displayVideo.play().catch(playError => console.error("Error playing display video:", playError));
      vlmVideo.play().catch(playError => console.error("Error playing VLM video:", playError));
    }

    setMode("Webcam");
  } catch (webcamError) {
    const reason = webcamError instanceof Error ? webcamError.message : String(webcamError);
    setError(`Could not access webcam: ${reason}`);
    setMediaStream(null);
    setLatestBoxes([]);
    setInferenceStatus("Webcam access denied or failed.");
  }
}, [cleanupMediaSource]);
275
+
276
/**
 * Loads the video URL typed into the URL input (when the Load button is clicked).
 *
 * The input is trimmed and validated *before* the current source is torn
 * down, so clicking "Load URL" with an empty or whitespace-only field only
 * shows an error and leaves whatever is currently playing untouched.
 * On success the URL is assigned to both video elements, playback is started
 * and the mode is set to "URL".
 */
const handleLoadUrl = useCallback(() => {
  const url = currentUrlInput.trim();
  if (!url) {
    setError("Please enter a valid URL.");
    return;
  }

  cleanupMediaSource();

  const displayVideo = displayVideoRef.current;
  const vlmVideo = vlmVideoRef.current;
  if (displayVideo && vlmVideo) {
    displayVideo.src = url;
    vlmVideo.src = url;
    displayVideo.load();
    vlmVideo.load();
    // play() rejects when autoplay is blocked or the source cannot be loaded;
    // log instead of surfacing an unhandled rejection.
    displayVideo.play().catch(e => console.error("Error playing display video:", e));
    vlmVideo.play().catch(e => console.error("Error playing VLM video:", e));
    setMode("URL");
  }
}, [currentUrlInput, cleanupMediaSource]);
295
+
296
/**
 * Handles a change of the file input.
 *
 * - No file selected: clears `uploadedFile` and, in "File" mode, falls back to
 *   the example video.
 * - Image file: only records the file and mode; the image itself is rendered
 *   and processed elsewhere (via `imageRef`).
 * - Video file: plays it in both the visible and the hidden VLM video element.
 * - Anything else: reports an error and clears `uploadedFile`.
 *
 * An object URL is created only for a video that is actually assigned to the
 * video elements, so no blob URL is allocated (and leaked) for images,
 * unsupported files, or when the video elements are not mounted.
 */
const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
  cleanupMediaSource();

  const file = e.target.files?.[0] ?? null;
  setUploadedFile(file);

  // Assigns a source to both video elements and starts playback.
  // `getSrc` is only invoked when both elements exist; returns whether the
  // source was assigned. `labelPrefix` only affects the log messages.
  const playOnBothVideos = (getSrc: () => string, labelPrefix: string): boolean => {
    const displayVideo = displayVideoRef.current;
    const vlmVideo = vlmVideoRef.current;
    if (!displayVideo || !vlmVideo) return false;

    const src = getSrc();
    displayVideo.src = src;
    vlmVideo.src = src;
    displayVideo.load();
    vlmVideo.load();
    displayVideo.play().catch(err => console.error(`Error playing ${labelPrefix}display video:`, err));
    vlmVideo.play().catch(err => console.error(`Error playing ${labelPrefix}VLM video:`, err));
    return true;
  };

  if (!file) {
    // If no file is selected, revert to the example video while in File mode.
    if (mode === "File") {
      playOnBothVideos(() => EXAMPLE_VIDEO_URL, "example ");
    }
    return;
  }

  if (isImageFile(file)) {
    // Image file: displayed through imageRef and processed on demand,
    // so no video assignment is needed here.
    setMode("File");
    return;
  }

  if (isVideoFile(file)) {
    // NOTE(review): when the display <video> is not mounted (an image is
    // currently shown) the video is not assigned here — confirm whether a
    // follow-up effect should pick it up.
    if (playOnBothVideos(() => URL.createObjectURL(file), "")) {
      setMode("File");
    }
    return;
  }

  setError("Unsupported file type. Please upload an image or video.");
  setUploadedFile(null); // Clear uploadedFile on error
}, [cleanupMediaSource, mode]);
338
+
339
+
340
/**
 * Runs a single inference pass over the uploaded image.
 *
 * Requires the model to be loaded, an image file to be selected and both the
 * <img> and the overlay <canvas> to be mounted; otherwise an error is shown.
 * The canvas is sized to the image's natural resolution, the model is run
 * with the current prompt, the image is redrawn onto the canvas and the
 * parsed boxes are published via `setLatestBoxes`.
 */
const handleProcessImage = async () => {
  const img = imageRef.current;
  const canvas = canvasRef.current;

  if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !img || !canvas) {
    setError("Image or model not ready for processing, or no image file selected.");
    return;
  }

  const ctx = canvas.getContext("2d");
  if (!ctx) return;

  // Match the canvas backing store to the image's intrinsic size.
  canvas.width = img.naturalWidth;
  canvas.height = img.naturalHeight;

  setProcessingState(true);
  setError(null);
  setInferenceStatus("Running image inference...");

  try {
    // The HTMLImageElement is passed directly to runInference.
    const modelOutput = await runInference(img, prompt);
    setDebugOutput(modelOutput);
    setInferenceStatus("Image inference complete.");

    // Redraw the image before any boxes are rendered on top of it.
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);

    const detectedBoxes = normalizeBoxes(extractJsonFromMarkdown(modelOutput) || []);
    setLatestBoxes(detectedBoxes);

    if (detectedBoxes.length === 0) {
      setInferenceStatus("Image inference complete. No boxes detected.");
    }
  } catch (inferenceError) {
    const reason = inferenceError instanceof Error ? inferenceError.message : String(inferenceError);
    setError("Image inference error: " + reason);
    setLatestBoxes([]);
    setInferenceStatus("Image inference failed.");
  } finally {
    setProcessingState(false);
  }
};
381
+
382
+ // --- Rendered UI ---
383
// Object URL used to preview the uploaded image. It is created once per
// uploaded file and revoked when the file changes or the component unmounts.
// Creating it inline in JSX would allocate a new blob URL on every render,
// which leaks the old ones and makes the <img> reload (re-firing onLoad and
// resetting — i.e. clearing — the overlay canvas) each time state changes.
const [imagePreviewUrl, setImagePreviewUrl] = useState<string | null>(null);
useEffect(() => {
  const objectUrl =
    uploadedFile && isImageFile(uploadedFile) ? URL.createObjectURL(uploadedFile) : null;
  setImagePreviewUrl(objectUrl);
  return () => {
    if (objectUrl) URL.revokeObjectURL(objectUrl);
  };
}, [uploadedFile]);

// --- Rendered UI ---
return (
  <div className="absolute inset-0 text-white flex flex-col">
    {/* Model status banner */}
    <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
      {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
    </div>
    <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>

    <div className="flex flex-col items-center justify-center flex-1 w-full p-4">
      {/* Mode Selector */}
      <div className="mb-6 mt-4">
        <div className="flex space-x-4">
          {MODES.map((m) => (
            <button
              key={m}
              className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
              }`}
              onClick={() => setMode(m)}
              disabled={!isLoaded && m !== "File"}
            >
              {m}
            </button>
          ))}
        </div>
      </div>

      {/* Dynamic Content Area */}
      <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
        {/* Prompt Input (Common to all modes) */}
        <div className="mb-4 w-full max-w-xl">
          <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
          <textarea
            className="w-full p-2 rounded-lg text-black"
            rows={3}
            value={prompt}
            onChange={(e) => setPrompt(e.target.value)}
            disabled={processingState}
          />
        </div>

        {/* Video/Image Display and Canvas Overlay */}
        <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
          {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
            <img
              ref={imageRef}
              src={imagePreviewUrl ?? undefined}
              alt="Uploaded"
              className="max-w-full max-h-full block object-contain"
              style={{ position: 'absolute' }}
              onLoad={() => {
                // Size the overlay canvas to the image's intrinsic resolution.
                if (imageRef.current && canvasRef.current) {
                  canvasRef.current.width = imageRef.current.naturalWidth;
                  canvasRef.current.height = imageRef.current.naturalHeight;
                }
              }}
            />
          ) : (
            <video
              ref={displayVideoRef}
              autoPlay
              muted
              playsInline
              loop
              className="max-w-full max-h-full block object-contain"
              style={{ position: 'absolute' }}
            />
          )}
          <canvas
            ref={canvasRef}
            className="absolute top-0 left-0 w-full h-full pointer-events-none"
            style={{ zIndex: 10 }}
          />
        </div>

        {/* Controls specific to each mode */}
        <div className="mt-4 flex flex-col items-center gap-2">
          {mode === "Webcam" && (
            <button
              className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
              onClick={handleWebcamInput}
              disabled={processingState || !isLoaded}
            >
              {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
            </button>
          )}

          {mode === "URL" && (
            <>
              <div className="flex w-full max-w-xl">
                <input
                  type="text"
                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
                  value={currentUrlInput}
                  onChange={(e) => setCurrentUrlInput(e.target.value)}
                  placeholder="Paste video URL here"
                  disabled={processingState}
                />
                <button
                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                  onClick={handleLoadUrl}
                  disabled={processingState || !isLoaded}
                >
                  Load URL
                </button>
              </div>
            </>
          )}

          {mode === "File" && (
            <>
              <input
                type="file"
                accept="image/*,video/*"
                onChange={handleFileChange}
                className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
                disabled={processingState}
              />
              {/* One-shot inference is only offered for uploaded images */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <button
                  className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                  onClick={handleProcessImage}
                  disabled={processingState || !isLoaded}
                >
                  {processingState ? "Processing Image..." : "Process Image"}
                </button>
              )}
            </>
          )}
        </div>

        {/* Error and Debug Output */}
        {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
        <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
          <div>Raw Model Output:</div>
          <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
        </div>
      </div>
    </div>

    {/* Hidden Video for VLM processing - this must be rendered always */}
    <video
      ref={vlmVideoRef}
      autoPlay
      muted
      playsInline
      loop
      style={{ display: 'none' }} // Hidden from view
    />
  </div>
);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  }