Update src/components/MultiSourceCaptioningView.tsx
src/components/MultiSourceCaptioningView.tsx
CHANGED
@@ -1,564 +1,533 @@
- (previous version, 564 lines, removed; superseded by the new version below)
import React, { useState, useRef, useEffect, useCallback } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

const MODES = ["Webcam", "URL", "File"] as const;
type Mode = typeof MODES[number];

const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";

// Helper function: normalizeBoxes is kept because it is still used
function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
  if (!raw) return [];
  let boxes = [];
  if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
    boxes = raw.image;
  } else if (Array.isArray(raw)) {
    boxes = raw;
  } else if (typeof raw === "object" && raw !== null) {
    boxes = [raw];
  }
  return boxes
    .map((obj: any) => {
      if (!obj || !obj.bbox_2d) return null;
      let bbox = obj.bbox_2d;
      // Accept nested pair form [[x1, y1], [x2, y2]] and flatten it
      if (
        Array.isArray(bbox) &&
        bbox.length === 2 &&
        Array.isArray(bbox[0]) &&
        Array.isArray(bbox[1]) &&
        bbox[0].length === 2 &&
        bbox[1].length === 2
      ) {
        bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
      }
      // Keep only well-formed [x1, y1, x2, y2] numeric boxes
      if (
        Array.isArray(bbox) &&
        bbox.length === 4 &&
        bbox.every((v: any) => typeof v === "number")
      ) {
        return { ...obj, bbox_2d: bbox };
      }
      return null;
    })
    .filter((obj: any) => obj);
}

function isImageFile(file: File) {
  return file.type.startsWith("image/");
}
function isVideoFile(file: File) {
  return file.type.startsWith("video/");
}

export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processingState, setProcessingState] = useState(false); // General processing indicator
  const [error, setError] = useState<string | null>(null);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream
  const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [uploadedFile, setUploadedFile] = useState<File | null>(null); // <<< ADDED THIS STATE

  // Refs for the two video elements and the canvas
  const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
  const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
  const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
  const imageRef = useRef<HTMLImageElement>(null); // For image file processing

  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // --- Drawing Loop for the Visible Display ---
  // This loop runs constantly to draw the latest boxes on the display video
  const drawDisplayCanvas = useCallback(() => {
    const displayVideo = displayVideoRef.current;
    const canvas = canvasRef.current;
    const ctx = canvas?.getContext('2d');

    if (!displayVideo || !canvas || !ctx) {
      return;
    }

    // Adjust canvas size to match the display video's dimensions,
    // but only if the video has valid dimensions
    if (displayVideo.videoWidth > 0 && displayVideo.videoHeight > 0 &&
        (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)) {
      canvas.width = displayVideo.videoWidth;
      canvas.height = displayVideo.videoHeight;
    }

    // Clear the canvas each frame
    ctx.clearRect(0, 0, canvas.width, canvas.height);

    // Draw the latest bounding boxes
    const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
    const scaleY = canvas.height / (displayVideo.videoHeight || 1);
    drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });

    // Only request the next frame if the video is playing, to avoid unnecessary redraws when paused/ended
    if (!displayVideo.paused && !displayVideo.ended) {
      requestAnimationFrame(drawDisplayCanvas);
    }
  }, [latestBoxes]); // Re-create if latestBoxes changes

  // Effect to start the display drawing loop when the display video is ready
  useEffect(() => {
    const displayVideo = displayVideoRef.current;
    if (displayVideo) {
      const handleVideoReady = () => {
        if (displayVideo.readyState >= 1) { // HAVE_METADATA
          requestAnimationFrame(drawDisplayCanvas);
        }
      };
      displayVideo.addEventListener('loadedmetadata', handleVideoReady);
      displayVideo.addEventListener('play', handleVideoReady); // Also start on play
      // Also check if the video is already ready (e.g., on component re-mount or autoplay)
      if (displayVideo.readyState >= 1) {
        requestAnimationFrame(drawDisplayCanvas);
      }
      return () => {
        displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
        displayVideo.removeEventListener('play', handleVideoReady);
      };
    }
  }, [drawDisplayCanvas]);

  // --- FastVLM Processing Loop (from hidden video) ---
  // This interval loop controls when FastVLM processes a frame
  useEffect(() => {
    const vlmVideo = vlmVideoRef.current;
    // Determine if we are in a video-based mode that requires continuous processing
    const isVideoModeActive = (
      mode === "Webcam" ||
      (mode === "URL" && !!vlmVideo?.src) || // Check if URL video is loaded
      (mode === "File" && !!vlmVideo?.src && !!uploadedFile && isVideoFile(uploadedFile)) // Check if file is video
    );

    if (!isLoaded || !vlmVideo || !isVideoModeActive) {
      setProcessingState(false);
      return;
    }

    let interval: ReturnType<typeof setInterval> | null = null;

    const startVLMProcessing = () => {
      if (interval) clearInterval(interval); // Clear any old interval

      interval = setInterval(async () => {
        if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) {
          return; // Skip if video not ready, paused, ended, or already processing
        }

        setProcessingState(true);
        setInferenceStatus("Running inference...");
        setError(null);

        try {
          // Pass the HTMLVideoElement directly to runInference
          const modelOutput = await runInference(vlmVideo, prompt); // <<< FIXED: Pass video element directly
          setDebugOutput(modelOutput);

          let boxes = extractJsonFromMarkdown(modelOutput) || [];
          boxes = normalizeBoxes(boxes);

          setLatestBoxes(boxes);
          setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
        } catch (e) {
          setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
          setLatestBoxes([]);
          setInferenceStatus("Inference failed.");
        } finally {
          setProcessingState(false);
        }
      }, 200); // Inference interval (i.e., up to 5 frames per second)
    };

    const stopVLMProcessing = () => {
      if (interval) clearInterval(interval);
      interval = null;
      setProcessingState(false);
      setInferenceStatus("Stopped processing.");
    };

    vlmVideo.addEventListener('play', startVLMProcessing);
    vlmVideo.addEventListener('pause', stopVLMProcessing);
    vlmVideo.addEventListener('ended', stopVLMProcessing);
    vlmVideo.addEventListener('loadeddata', startVLMProcessing); // Also start on loadeddata for better reliability

    // Initial check if the video is already playing or ready
    if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
      startVLMProcessing();
    }

    return () => {
      stopVLMProcessing();
      vlmVideo.removeEventListener('play', startVLMProcessing);
      vlmVideo.removeEventListener('pause', stopVLMProcessing);
      vlmVideo.removeEventListener('ended', stopVLMProcessing);
      vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
    };
  }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Keep uploadedFile for re-trigger on file change

  // --- Media Source Handling ---

  // Cleanup for media stream and object URLs
  const cleanupMediaSource = useCallback(() => {
    if (mediaStream) {
      mediaStream.getTracks().forEach(track => track.stop());
      setMediaStream(null);
    }
    if (displayVideoRef.current?.src.startsWith('blob:')) {
      URL.revokeObjectURL(displayVideoRef.current.src);
      displayVideoRef.current.src = "";
    }
    if (vlmVideoRef.current?.src.startsWith('blob:')) {
      URL.revokeObjectURL(vlmVideoRef.current.src);
      vlmVideoRef.current.src = "";
    }
    setLatestBoxes([]);
    setError(null);
    setInferenceStatus("");
    setDebugOutput("");
    setUploadedFile(null); // <<< ADDED: Clear uploaded file on source change
  }, [mediaStream]);

  // Handle changing the mode (Webcam, URL, File)
  useEffect(() => {
    cleanupMediaSource();

    const displayVideo = displayVideoRef.current;
    const vlmVideo = vlmVideoRef.current;

    if (!displayVideo || !vlmVideo) return;

    // Reset srcObject/src to ensure a fresh start
    displayVideo.srcObject = null;
    vlmVideo.srcObject = null;
    displayVideo.src = "";
    vlmVideo.src = "";

    // Special handling for initial "File" mode to load the example video if no file is selected
    if (mode === "File" && !uploadedFile) { // <<< FIXED: Check uploadedFile here
      displayVideo.src = EXAMPLE_VIDEO_URL;
      vlmVideo.src = EXAMPLE_VIDEO_URL;
      displayVideo.load(); vlmVideo.load();
      displayVideo.play().catch(e => console.error("Error playing example display video:", e));
      vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
    }
  }, [mode, uploadedFile, cleanupMediaSource]); // Added uploadedFile to ensure re-trigger for file mode

  // Handle Webcam Input
  const handleWebcamInput = useCallback(async () => {
    cleanupMediaSource();
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ video: true });
      setMediaStream(stream);

      if (displayVideoRef.current && vlmVideoRef.current) {
        displayVideoRef.current.srcObject = stream;
        vlmVideoRef.current.srcObject = stream;
        displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
        vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
      }
      setMode("Webcam");
    } catch (e) {
      setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
      setMediaStream(null);
      setLatestBoxes([]);
      setInferenceStatus("Webcam access denied or failed.");
    }
  }, [cleanupMediaSource]);

  // Handle URL Input (when Load button is clicked)
  const handleLoadUrl = useCallback(() => {
    cleanupMediaSource();

    const url = currentUrlInput;
    if (!url) {
      setError("Please enter a valid URL.");
      return;
    }

    if (displayVideoRef.current && vlmVideoRef.current) {
      displayVideoRef.current.src = url;
      vlmVideoRef.current.src = url;
      displayVideoRef.current.load(); vlmVideoRef.current.load();
      displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
      vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
      setMode("URL");
    }
  }, [currentUrlInput, cleanupMediaSource]);

  // Handle File Input
  const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
    cleanupMediaSource();

    const file = e.target.files?.[0] || null;
    setUploadedFile(file); // <<< FIXED: Set uploadedFile state here

    if (file) {
      const fileUrl = URL.createObjectURL(file);

      if (isImageFile(file)) {
        // Image file, will be handled by imageRef and the single-shot processing logic
        setMode("File"); // Ensure mode is "File"
        // No direct video assignment needed here, imageRef handles display
      } else if (isVideoFile(file)) {
        if (displayVideoRef.current && vlmVideoRef.current) {
          displayVideoRef.current.src = fileUrl;
          vlmVideoRef.current.src = fileUrl;
          displayVideoRef.current.load(); vlmVideoRef.current.load();
          displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
          vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
          setMode("File"); // Ensure mode is "File"
        }
      } else {
        setError("Unsupported file type. Please upload an image or video.");
        setUploadedFile(null); // <<< FIXED: Clear uploadedFile on error
        if (fileUrl) URL.revokeObjectURL(fileUrl);
      }
    } else {
      setUploadedFile(null); // <<< FIXED: Clear uploadedFile if no file selected
      // If no file is selected, revert to the example video if in File mode
      if (mode === "File") {
        if (displayVideoRef.current && vlmVideoRef.current) {
          displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
          vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
          displayVideoRef.current.load(); vlmVideoRef.current.load();
          displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
          vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
        }
      }
    }
  }, [cleanupMediaSource, mode]);

  // Handler for processing an uploaded image file (one-time inference)
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
      setError("Image or model not ready for processing, or no image file selected.");
      return;
    }

    const img = imageRef.current;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;

    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;

    setProcessingState(true);
    setError(null);
    setInferenceStatus("Running image inference...");

    try {
      // Pass the HTMLImageElement directly to runInference
      const modelOutput = await runInference(img, prompt); // <<< FIXED: Pass image element directly
      setDebugOutput(modelOutput);
      setInferenceStatus("Image inference complete.");

      ctx.clearRect(0, 0, canvas.width, canvas.height);
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height); // Redraw image

      let boxes = extractJsonFromMarkdown(modelOutput) || [];
      boxes = normalizeBoxes(boxes);
      setLatestBoxes(boxes);

      if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
    } catch (e) {
      setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
      setLatestBoxes([]);
      setInferenceStatus("Image inference failed.");
    } finally {
      setProcessingState(false);
    }
  };

  // --- Rendered UI ---
  return (
    <div className="absolute inset-0 text-white flex flex-col">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>

      <div className="flex flex-col items-center justify-center flex-1 w-full p-4">
        {/* Mode Selector */}
        <div className="mb-6 mt-4">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
                disabled={!isLoaded && m !== "File"}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Dynamic Content Area */}
        <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
          {/* Prompt Input (Common to all modes) */}
          <div className="mb-4 w-full max-w-xl">
            <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
            <textarea
              className="w-full p-2 rounded-lg text-black"
              rows={3}
              value={prompt}
              onChange={(e) => setPrompt(e.target.value)}
              disabled={processingState}
            />
          </div>

          {/* Video/Image Display and Canvas Overlay */}
          <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
            {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
              <img
                ref={imageRef}
                src={URL.createObjectURL(uploadedFile)}
                alt="Uploaded"
                className="max-w-full max-h-full block object-contain"
                style={{ position: 'absolute' }}
                onLoad={() => {
                  if (imageRef.current && canvasRef.current) {
                    canvasRef.current.width = imageRef.current.naturalWidth;
                    canvasRef.current.height = imageRef.current.naturalHeight;
                  }
                }}
              />
            ) : (
              <video
                ref={displayVideoRef}
                autoPlay
                muted
                playsInline
                loop
                className="max-w-full max-h-full block object-contain"
                style={{ position: 'absolute' }}
              />
            )}
            <canvas
              ref={canvasRef}
              className="absolute top-0 left-0 w-full h-full pointer-events-none"
              style={{ zIndex: 10 }}
            />
          </div>

          {/* Controls specific to each mode */}
          <div className="mt-4 flex flex-col items-center gap-2">
            {mode === "Webcam" && (
              <button
                className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                onClick={handleWebcamInput}
                disabled={processingState || !isLoaded}
              >
                {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
              </button>
            )}

            {mode === "URL" && (
              <>
                <div className="flex w-full max-w-xl">
                  <input
                    type="text"
                    className="flex-1 px-4 py-2 rounded-l-lg text-black"
                    value={currentUrlInput}
                    onChange={(e) => setCurrentUrlInput(e.target.value)}
                    placeholder="Paste video URL here"
                    disabled={processingState}
                  />
                  <button
                    className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                    onClick={handleLoadUrl}
                    disabled={processingState || !isLoaded}
                  >
                    Load URL
                  </button>
                </div>
              </>
            )}

            {mode === "File" && (
              <>
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
                  disabled={processingState}
                />
                {uploadedFile && isImageFile(uploadedFile) && ( // <<< FIXED: Check uploadedFile here
                  <button
                    className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                    onClick={handleProcessImage}
                    disabled={processingState || !isLoaded}
                  >
                    {processingState ? "Processing Image..." : "Process Image"}
                  </button>
                )}
              </>
            )}
          </div>

          {/* Error and Debug Output */}
          {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
          <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
            <div>Raw Model Output:</div>
            <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
          </div>
        </div>
      </div>

      {/* Hidden Video for VLM processing - this must be rendered always */}
      <video
        ref={vlmVideoRef}
        autoPlay
        muted
        playsInline
        loop
        style={{ display: 'none' }} // Hidden from view
      />
    </div>
  );
}