Upload 51 files
src/components/MultiSourceCaptioningView.tsx
CHANGED
@@ -1,520 +1,609 @@
-import { useState, useRef, useEffect } from "react";
-import { useVLMContext } from "../context/useVLMContext";
-import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
-
-const MODES = ["Webcam", "URL", "File"] as const;
-type Mode = typeof MODES[number];
-
-const EXAMPLE_VIDEO_URL = "
-const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values.
-
-function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
-  if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
-    const label = arr[0];
-    return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
-  }
-  return [];
-}
-
-function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
-  if (!raw) return [];
-  let boxes = [];
-  if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
-    boxes = raw.image;
-  } else if (Array.isArray(raw)) {
-    boxes = raw;
-  } else if (typeof raw === "object" && raw !== null) {
-    boxes = [raw];
-  }
-  return boxes
-    .map((obj: any) => {
-      if (!obj || !obj.bbox_2d) return null;
-      let bbox = obj.bbox_2d;
-      // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
-      if (
-        Array.isArray(bbox) &&
-        bbox.length === 2 &&
-        Array.isArray(bbox[0]) &&
-        Array.isArray(bbox[1]) &&
-        bbox[0].length === 2 &&
-        bbox[1].length === 2
-      ) {
-        bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
-      }
-      // If bbox_2d is [x1, y1, x2, y2], use as-is
-      if (
-        Array.isArray(bbox) &&
-        bbox.length === 4 &&
-        bbox.every((v: any) => typeof v === "number")
-      ) {
-        return { ...obj, bbox_2d: bbox };
-      }
-      // Otherwise, skip
-      return null;
-    })
-    .filter((obj: any) => obj);
-}
-
-function isImageFile(file: File) {
-  return file.type.startsWith("image/");
-}
-function isVideoFile(file: File) {
-  return file.type.startsWith("video/");
-}
-
-export default function MultiSourceCaptioningView() {
-  const [mode, setMode] = useState<Mode>("File");
-  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
-  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
-  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
-  const [processing, setProcessing] = useState(false);
-  const [error, setError] = useState<string | null>(null);
-  const [webcamActive, setWebcamActive] = useState(false);
-  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
-  const [uploadedUrl, setUploadedUrl] = useState<string>("");
-  const [videoProcessing, setVideoProcessing] = useState(false);
-  const [imageProcessed, setImageProcessed] = useState(false);
-  const [exampleProcessing, setExampleProcessing] = useState(false);
-  const [urlProcessing, setUrlProcessing] = useState(false);
-  const [debugOutput, setDebugOutput] = useState<string>("");
-  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
-  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
-  const [inferenceStatus, setInferenceStatus] = useState<string>("");
-
-  const videoRef = useRef<HTMLVideoElement | null>(null);
-[old lines 85-519: content lost in the page extraction; only stray fragments survived]
-}
+import React, { useState, useRef, useEffect } from "react";
+import { useVLMContext } from "../context/useVLMContext";
+import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
+
+const MODES = ["Webcam", "URL", "File"] as const;
+type Mode = typeof MODES[number];
+
+const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
+const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
+
+function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
+  if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
+    const label = arr[0];
+    return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
+  }
+  return [];
+}
+
+function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
+  if (!raw) return [];
+  let boxes = [];
+  if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
+    boxes = raw.image;
+  } else if (Array.isArray(raw)) {
+    boxes = raw;
+  } else if (typeof raw === "object" && raw !== null) {
+    boxes = [raw];
+  }
+  return boxes
+    .map((obj: any) => {
+      if (!obj || !obj.bbox_2d) return null;
+      let bbox = obj.bbox_2d;
+      // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
+      if (
+        Array.isArray(bbox) &&
+        bbox.length === 2 &&
+        Array.isArray(bbox[0]) &&
+        Array.isArray(bbox[1]) &&
+        bbox[0].length === 2 &&
+        bbox[1].length === 2
+      ) {
+        bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
+      }
+      // If bbox_2d is [x1, y1, x2, y2], use as-is
+      if (
+        Array.isArray(bbox) &&
+        bbox.length === 4 &&
+        bbox.every((v: any) => typeof v === "number")
+      ) {
+        return { ...obj, bbox_2d: bbox };
+      }
+      // Otherwise, skip
+      return null;
+    })
+    .filter((obj: any) => obj);
+}
+
+function isImageFile(file: File) {
+  return file.type.startsWith("image/");
+}
+function isVideoFile(file: File) {
+  return file.type.startsWith("video/");
+}
+
+export default function MultiSourceCaptioningView() {
+  const [mode, setMode] = useState<Mode>("File");
+  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
+  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
+  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
+  const [processing, setProcessing] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+  const [webcamActive, setWebcamActive] = useState(false);
+  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
+  const [uploadedUrl, setUploadedUrl] = useState<string>("");
+  const [videoProcessing, setVideoProcessing] = useState(false);
+  const [imageProcessed, setImageProcessed] = useState(false);
+  const [exampleProcessing, setExampleProcessing] = useState(false);
+  const [urlProcessing, setUrlProcessing] = useState(false);
+  const [debugOutput, setDebugOutput] = useState<string>("");
+  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
+  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
+  const [inferenceStatus, setInferenceStatus] = useState<string>("");
+
+  const videoRef = useRef<HTMLVideoElement | null>(null);
+  const overlayVideoRef = useRef<HTMLVideoElement | null>(null); // NEW: overlay video
+  const canvasRef = useRef<HTMLCanvasElement | null>(null);
+  const imageRef = useRef<HTMLImageElement | null>(null);
+  const webcamStreamRef = useRef<MediaStream | null>(null);
+  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
+
+  // Persistent boxes state: {boxes: [...], timestamp: number}
+  const [persistentBoxes, setPersistentBoxes] = useState<{boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]>([]);
+  const BOX_PERSIST_MS = 2000; // 2 seconds
+
+  // Helper: Add new boxes with timestamp
+  const addBoxesWithTimestamp = (boxes: {label: string, bbox_2d: number[]}[]) => {
+    if (!boxes || boxes.length === 0) return;
+    setPersistentBoxes((prev: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]) => [
+      ...prev.filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => Date.now() - entry.timestamp < BOX_PERSIST_MS),
+      { boxes, timestamp: Date.now() }
+    ]);
+  };
+
+  // Helper: Get all boxes from last 2 seconds
+  const getCurrentBoxes = () => {
+    const now = Date.now();
+    return persistentBoxes
+      .filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => now - entry.timestamp < BOX_PERSIST_MS)
+      .flatMap((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => entry.boxes);
+  };
+
+  // Synchronize overlay video with main video
+  useEffect(() => {
+    const main = videoRef.current;
+    const overlay = overlayVideoRef.current;
+    if (!main || !overlay) return;
+    // Sync play/pause with a single named handler so cleanup can actually remove it
+    const syncPlay = () => { if (main.paused !== overlay.paused) main.paused ? overlay.pause() : overlay.play(); };
+    main.addEventListener('play', syncPlay);
+    main.addEventListener('pause', syncPlay);
+    // Sync seeking
+    const syncTime = () => { if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) overlay.currentTime = main.currentTime; };
+    main.addEventListener('seeked', syncTime);
+    main.addEventListener('timeupdate', syncTime);
+    // Clean up
+    return () => {
+      main.removeEventListener('play', syncPlay);
+      main.removeEventListener('pause', syncPlay);
+      main.removeEventListener('seeked', syncTime);
+      main.removeEventListener('timeupdate', syncTime);
+    };
+  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
+
+  // Update: processVideoFrame now adds boxes to persistentBoxes
+  const processVideoFrame = async () => {
+    if (!videoRef.current || !canvasRef.current) return;
+    const video = videoRef.current;
+    const canvas = canvasRef.current;
+    if (video.paused || video.ended || video.videoWidth === 0) return;
+    canvas.width = video.videoWidth;
+    canvas.height = video.videoHeight;
+    const ctx = canvas.getContext("2d");
+    if (!ctx) return;
+    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
+    await runInference(video, prompt, (output: string) => {
+      setDebugOutput(output);
+      let boxes = extractJsonFromMarkdown(output) || [];
+      if (boxes.length === 0 && Array.isArray(output)) {
+        boxes = parseFlatBoxArray(output);
+      }
+      boxes = normalizeBoxes(boxes);
+      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+      if (Array.isArray(boxes) && boxes.length > 0) {
+        addBoxesWithTimestamp(boxes); // <-- Add to persistent state
+      }
+    });
+  };
+
+  // Draw persistent boxes on every frame
+  useEffect(() => {
+    const draw = () => {
+      if (!videoRef.current || !canvasRef.current) return;
+      const video = videoRef.current;
+      const canvas = canvasRef.current;
+      if (video.videoWidth === 0) return;
+      canvas.width = video.videoWidth;
+      canvas.height = video.videoHeight;
+      const ctx = canvas.getContext("2d");
+      if (!ctx) return;
+      ctx.clearRect(0, 0, canvas.width, canvas.height);
+      const boxes = getCurrentBoxes();
+      if (boxes.length > 0) {
+        const scaleX = canvas.width / video.videoWidth;
+        const scaleY = canvas.height / video.videoHeight;
+        drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+      }
+    };
+    draw();
+    const interval = setInterval(draw, 100);
+    return () => clearInterval(interval);
+  }, [persistentBoxes, videoRef, canvasRef]);
+
+  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    const file = e.target.files?.[0] || null;
+    setUploadedFile(file);
+    setUploadedUrl(file ? URL.createObjectURL(file) : "");
+    setError(null);
+    setImageProcessed(false);
+    setVideoProcessing(false);
+    setExampleProcessing(false);
+  };
+
+  // Webcam setup and teardown (unchanged)
+  useEffect(() => {
+    if (mode !== "Webcam") {
+      if (webcamStreamRef.current) {
+        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
+        webcamStreamRef.current = null;
+      }
+      setWebcamActive(false);
+      return;
+    }
+    const setupWebcam = async () => {
+      try {
+        setError(null);
+        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+        webcamStreamRef.current = stream;
+        if (videoRef.current) {
+          videoRef.current.srcObject = stream;
+          setWebcamActive(true);
+        }
+      } catch (e) {
+        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
+        setWebcamActive(false);
+      }
+    };
+    setupWebcam();
+    return () => {
+      if (webcamStreamRef.current) {
+        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
+        webcamStreamRef.current = null;
+      }
+      setWebcamActive(false);
+    };
+  }, [mode]);
+
+  // Webcam mode: process frames with setInterval
+  useEffect(() => {
+    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
+    let interval: ReturnType<typeof setInterval> | null = null;
+    interval = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    return () => {
+      if (interval) clearInterval(interval);
+    };
+  }, [mode, isLoaded, prompt, runInference, webcamActive]);
+
+  // URL mode: process frames with setInterval
+  useEffect(() => {
+    if (mode !== "URL" || !isLoaded || !urlProcessing) return;
+    let interval: ReturnType<typeof setInterval> | null = null;
+    interval = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    return () => {
+      if (interval) clearInterval(interval);
+    };
+  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
+
+  // File video mode: process frames with setInterval
+  useEffect(() => {
+    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
+    let interval: ReturnType<typeof setInterval> | null = null;
+    interval = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    return () => {
+      if (interval) clearInterval(interval);
+    };
+  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
+
+  // Example video mode: process frames with setInterval
+  useEffect(() => {
+    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
+    let interval: ReturnType<typeof setInterval> | null = null;
+    interval = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    return () => {
+      if (interval) clearInterval(interval);
+    };
+  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
+
+  // File mode: process uploaded image (only on button click)
+  const handleProcessImage = async () => {
+    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
+    const img = imageRef.current;
+    const canvas = canvasRef.current;
+    canvas.width = img.naturalWidth;
+    canvas.height = img.naturalHeight;
+    setCanvasDims({w:canvas.width,h:canvas.height});
+    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
+    const ctx = canvas.getContext("2d");
+    if (!ctx) return;
+    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+    setProcessing(true);
+    setError(null);
+    setInferenceStatus("Running inference...");
+    await runInference(img, prompt, (output: string) => {
+      setDebugOutput(output);
+      setInferenceStatus("Inference complete.");
+      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+      let boxes = extractJsonFromMarkdown(output) || [];
+      if (boxes.length === 0 && Array.isArray(output)) {
+        boxes = parseFlatBoxArray(output);
+      }
+      boxes = normalizeBoxes(boxes);
+      console.log("Model output:", output);
+      console.log("Boxes after normalization:", boxes);
+      console.log("Canvas size:", canvas.width, canvas.height);
+      if (boxes.length > 0) {
+        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
+        console.log("First box coords:", x1, y1, x2, y2);
+      }
+      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+      if (Array.isArray(boxes) && boxes.length > 0) {
+        const scaleX = canvas.width / img.naturalWidth;
+        const scaleY = canvas.height / img.naturalHeight;
+        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
+      }
+      setImageProcessed(true);
+    });
+    setProcessing(false);
+  };
+
+  // File mode: process uploaded video frames (start/stop)
+  const handleToggleVideoProcessing = () => {
+    setVideoProcessing((prev) => !prev);
+  };
+
+  // Handle start/stop for example video processing
+  const handleToggleExampleProcessing = () => {
+    setExampleProcessing((prev) => !prev);
+  };
+
+  // Handle start/stop for URL video processing
+  const handleToggleUrlProcessing = () => {
+    setUrlProcessing((prev) => !prev);
+  };
+
+  // Test draw box function
+  const handleTestDrawBox = () => {
+    if (!canvasRef.current) return;
+    const canvas = canvasRef.current;
+    const ctx = canvas.getContext("2d");
+    if (!ctx) return;
+    ctx.clearRect(0, 0, canvas.width, canvas.height);
+    ctx.strokeStyle = "#FF00FF";
+    ctx.lineWidth = 4;
+    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
+    ctx.font = "20px Arial";
+    ctx.fillStyle = "#FF00FF";
+    ctx.fillText("Test Box", 50, 35);
+  };
+
+  return (
+    <div className="absolute inset-0 text-white">
+      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
+        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
+      </div>
+      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
+      <div className="flex flex-col items-center justify-center h-full w-full">
+        {/* Mode Selector */}
+        <div className="mb-6">
+          <div className="flex space-x-4">
+            {MODES.map((m) => (
+              <button
+                key={m}
+                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
+                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
+                }`}
+                onClick={() => setMode(m)}
+              >
+                {m}
+              </button>
+            ))}
+          </div>
+        </div>
+
+        {/* Mode Content */}
+        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
+          {mode === "Webcam" && (
+            <div className="w-full text-center flex flex-col items-center">
+              <div className="mb-4 w-full max-w-xl">
+                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+                <textarea
+                  className="w-full p-2 rounded-lg text-black"
+                  rows={3}
+                  value={prompt}
+                  onChange={(e) => setPrompt(e.target.value)}
+                />
+              </div>
+              <div className="relative w-full max-w-xl">
+                <video
+                  ref={videoRef}
+                  autoPlay
+                  muted
+                  playsInline
+                  className="w-full rounded-lg shadow-lg mb-2"
+                  style={{ background: "#222" }}
+                />
+                <canvas
+                  ref={canvasRef}
+                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                  style={{ zIndex: 10, pointerEvents: "none" }}
+                />
+              </div>
+              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
+              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+            </div>
+          )}
+          {mode === "URL" && (
+            <div className="w-full text-center flex flex-col items-center">
+              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
+              <div className="flex w-full max-w-xl mb-4">
+                <input
+                  type="text"
+                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
+                  value={inputUrl}
+                  onChange={(e) => setInputUrl(e.target.value)}
+                  placeholder="Paste video URL here"
+                />
+                <button
+                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
+                  onClick={() => setVideoUrl(inputUrl)}
+                >
+                  Load
+                </button>
+              </div>
+              <div className="mb-4 w-full max-w-xl">
+                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+                <textarea
+                  className="w-full p-2 rounded-lg text-black"
+                  rows={3}
+                  value={prompt}
+                  onChange={(e) => setPrompt(e.target.value)}
+                />
+              </div>
+              <div className="relative w-full max-w-xl">
+                <video
+                  ref={videoRef}
+                  src={videoUrl}
+                  controls
+                  autoPlay
+                  loop
+                  className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
+                  style={{ background: "#222" }}
+                />
+                <video
+                  ref={overlayVideoRef}
+                  src={videoUrl}
+                  controls={false}
+                  autoPlay
+                  loop
+                  muted
+                  className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
+                  style={{ background: "#222" }}
+                />
+                <canvas
+                  ref={canvasRef}
+                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                  style={{ zIndex: 20, pointerEvents: "none" }}
+                />
+                <button
+                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
+                  onClick={handleToggleUrlProcessing}
+                >
+                  {urlProcessing ? "Stop Processing" : "Start Processing"}
+                </button>
+              </div>
+              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
+              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+              <button
+                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
+                onClick={handleTestDrawBox}
+              >
+                Test Draw Box
+              </button>
+              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
+                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
+                <div>Raw Model Output:</div>
+                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
+              </div>
+            </div>
+          )}
+          {mode === "File" && (
+            <div className="w-full text-center flex flex-col items-center">
+              <div className="mb-4 w-full max-w-xl">
+                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+                <textarea
+                  className="w-full p-2 rounded-lg text-black"
+                  rows={3}
+                  value={prompt}
+                  onChange={(e) => setPrompt(e.target.value)}
+                />
+              </div>
+              <div className="mb-4 w-full max-w-xl">
+                <input
+                  type="file"
+                  accept="image/*,video/*"
+                  onChange={handleFileChange}
+                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
+                />
+              </div>
+              {/* Show uploaded image */}
+              {uploadedFile && isImageFile(uploadedFile) && (
+                <div className="relative w-full max-w-xl">
+                  <img
+                    ref={imageRef}
+                    src={uploadedUrl}
+                    alt="Uploaded"
+                    className="w-full rounded-lg shadow-lg mb-2"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 10, pointerEvents: "none" }}
+                  />
+                  <button
+                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
+                    onClick={handleProcessImage}
+                    disabled={processing}
+                  >
+                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
+                  </button>
+                </div>
+              )}
+              {/* Show uploaded video */}
+              {uploadedFile && isVideoFile(uploadedFile) && (
+                <div className="relative w-full max-w-xl">
+                  <video
+                    ref={videoRef}
+                    src={uploadedUrl}
+                    controls
+                    autoPlay
+                    loop
+                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
+                    style={{ background: "#222" }}
+                  />
+                  <video
+                    ref={overlayVideoRef}
+                    src={uploadedUrl}
+                    controls={false}
+                    autoPlay
+                    loop
+                    muted
+                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 20, pointerEvents: "none" }}
+                  />
+                  <button
+                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
+                    onClick={handleToggleVideoProcessing}
+                  >
+                    {videoProcessing ? "Stop Processing" : "Start Processing"}
+                  </button>
+                </div>
+              )}
+              {/* Show example video if no file uploaded */}
+              {!uploadedFile && (
+                <div className="relative w-full max-w-xl">
+                  <video
+                    ref={videoRef}
+                    src={EXAMPLE_VIDEO_URL}
+                    controls
+                    autoPlay
+                    loop
+                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
+                    style={{ background: "#222" }}
+                  />
+                  <video
+                    ref={overlayVideoRef}
+                    src={EXAMPLE_VIDEO_URL}
+                    controls={false}
+                    autoPlay
+                    loop
+                    muted
+                    className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 20, pointerEvents: "none" }}
+                  />
+                  <button
+                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
+                    onClick={handleToggleExampleProcessing}
+                  >
+                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
+                  </button>
+                </div>
+              )}
+              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
+              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+              <button
+                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
+                onClick={handleTestDrawBox}
+              >
+                Test Draw Box
+              </button>
+              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
+                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
+                <div>Raw Model Output:</div>
+                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
+              </div>
+            </div>
+          )}
+        </div>
+      </div>
+    </div>
+  );
+}
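
The component's parsing chain (extractJsonFromMarkdown, then parseFlatBoxArray as a fallback, then normalizeBoxes) exists because the model's reply shape drifts between runs. For illustration, these are the three raw shapes the helpers above accept; the literals are made-up examples, not output captured from this Space:

// Made-up inputs showing the shapes the helpers above accept.
// 1. Object keyed by "image"; a corner-pair bbox is flattened by normalizeBoxes:
const rawImageKeyed = { image: [{ label: "person", bbox_2d: [[100, 50], [200, 300]] }] };
// -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]

// 2. Flat array handled by parseFlatBoxArray: one label followed by bbox arrays:
const rawFlat = ["person", [100, 50, 200, 300], [20, 30, 80, 200]];
// -> [{ label: "person", bbox_2d: [100, 50, 200, 300] },
//     { label: "person", bbox_2d: [20, 30, 80, 200] }]

// 3. A single bare object; normalizeBoxes wraps it into a one-element array:
const rawSingle = { label: "person", bbox_2d: [100, 50, 200, 300] };
// -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]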
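
The component imports extractJsonFromMarkdown and drawBoundingBoxesOnCanvas from ./BoxAnnotator, which this diff does not include. A minimal sketch of what that module would need to look like to satisfy the call sites above; only the names and call signatures are taken from the diff, the bodies are assumptions:

// Hypothetical sketch of ./BoxAnnotator, inferred from how it is called above.

// Pulls a JSON payload out of a model reply that may wrap it in a
// ```json ... ``` fence; returns [] when nothing parses.
export function extractJsonFromMarkdown(output: string): any {
  const fenced = output.match(/```(?:json)?\s*([\s\S]*?)```/);
  const text = fenced ? fenced[1] : output;
  try {
    return JSON.parse(text.trim());
  } catch {
    return [];
  }
}

export interface DrawBoxOptions {
  color?: string;
  lineWidth?: number;
  font?: string;
  scaleX?: number;
  scaleY?: number;
}

// Draws labelled rectangles; scaleX/scaleY map pixel-space box coordinates
// onto a canvas of a different resolution.
export function drawBoundingBoxesOnCanvas(
  ctx: CanvasRenderingContext2D,
  boxes: { label: string; bbox_2d: number[] }[],
  options: DrawBoxOptions = {}
): void {
  const { color = "#FF00FF", lineWidth = 2, font = "16px Arial", scaleX = 1, scaleY = 1 } = options;
  ctx.strokeStyle = color;
  ctx.fillStyle = color;
  ctx.lineWidth = lineWidth;
  ctx.font = font;
  for (const { label, bbox_2d: [x1, y1, x2, y2] } of boxes) {
    ctx.strokeRect(x1 * scaleX, y1 * scaleY, (x2 - x1) * scaleX, (y2 - y1) * scaleY);
    ctx.fillText(label, x1 * scaleX + 4, Math.max(12, y1 * scaleY - 6));
  }
}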
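
Likewise, useVLMContext is imported but not shown. Inferred from its use here, its value would need roughly this shape; this is hypothetical, and the real context may expose more:

// Hypothetical contract for useVLMContext, inferred from the call sites above.
export interface VLMContextValue {
  isLoaded: boolean;
  isLoading: boolean;
  error: string | null;
  // Runs the model on a drawable source and reports the raw output string
  // through the callback; resolves when inference finishes.
  runInference: (
    source: HTMLVideoElement | HTMLImageElement | HTMLCanvasElement,
    prompt: string,
    onOutput: (output: string) => void
  ) => Promise<void>;
}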