Spaces:

Quazim0t0
/

FastVLMBoxes

Running

App Files Files Community

Quazim0t0 committed 1 day ago

Commit

96b5215

verified ·

1 Parent(s): 20197c2

Upload 51 files

Browse files

Files changed (4) hide show

src/App.tsx +2 -6
src/components/BoxAnnotator.ts +3 -13
src/components/MultiSourceCaptioningView.tsx +520 -533
src/index.js +1 -1

src/App.tsx CHANGED Viewed

@@ -11,8 +11,7 @@ export default function App() {
       await loadModel();
       setStarted(true);
     } catch (e) {
-      // error is handled by context, could log here if needed
-      console.error("Failed to load model:", e);
     }
   };
@@ -28,9 +27,6 @@ export default function App() {
           {isLoading ? "Loading Model..." : "Load Model"}
         </button>
         {error && <div className="text-red-400 mt-2">Model error: {error}</div>}
-        <p className="text-sm text-gray-400 mt-2">
-          Model will download on first load. This may take a moment.
-        </p>
       </div>
     );
   }
@@ -41,4 +37,4 @@ export default function App() {
       <MultiSourceCaptioningView />
     </div>
   );
-}

       await loadModel();
       setStarted(true);
     } catch (e) {
+      // error is handled by context
     }
   };
           {isLoading ? "Loading Model..." : "Load Model"}
         </button>
         {error && <div className="text-red-400 mt-2">Model error: {error}</div>}
       </div>
     );
   }
       <MultiSourceCaptioningView />
     </div>
   );
+}

src/components/BoxAnnotator.ts CHANGED Viewed

@@ -16,7 +16,6 @@ export function extractJsonFromMarkdown(markdown: string): any[] | null {
     if (typeof parsed === "object" && parsed !== null) return [parsed]; // <-- Wrap object in array
     return null;
   } catch {
-    console.error("Failed to parse JSON from markdown:", jsonString);
     return null;
   }
 }
@@ -32,15 +31,7 @@ export function drawBoundingBoxesOnCanvas(
   boxes: { bbox_2d: number[]; label?: string }[],
   options?: { color?: string; lineWidth?: number; font?: string, scaleX?: number, scaleY?: number }
 ) {
-  if (!Array.isArray(boxes)) {
-    console.warn("drawBoundingBoxesOnCanvas: 'boxes' is not an array or is null/undefined.", boxes);
-    return;
-  }
-  if (boxes.length === 0) {
-    // console.log("drawBoundingBoxesOnCanvas: 'boxes' array is empty, nothing to draw.");
-    return;
-  }
   const color = options?.color || "#00FF00";
   const lineWidth = options?.lineWidth || 2;
   const font = options?.font || "16px Arial";
@@ -63,10 +54,9 @@ export function drawBoundingBoxesOnCanvas(
     ctx.rect(sx1, sy1, sx2 - sx1, sy2 - sy1);
     ctx.stroke();
     if (obj.label) {
-      // Adjust text position to ensure visibility, especially if near top edge
-      ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 16 ? sy1 + 16 : sy1 - 4);
     }
   });
   ctx.restore();
-}

     if (typeof parsed === "object" && parsed !== null) return [parsed]; // <-- Wrap object in array
     return null;
   } catch {
     return null;
   }
 }
   boxes: { bbox_2d: number[]; label?: string }[],
   options?: { color?: string; lineWidth?: number; font?: string, scaleX?: number, scaleY?: number }
 ) {
+  if (!Array.isArray(boxes)) return; // Prevent errors if boxes is undefined/null
   const color = options?.color || "#00FF00";
   const lineWidth = options?.lineWidth || 2;
   const font = options?.font || "16px Arial";
     ctx.rect(sx1, sy1, sx2 - sx1, sy2 - sy1);
     ctx.stroke();
     if (obj.label) {
+      ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 10 ? sy1 + 16 : sy1 - 4);
     }
   });
   ctx.restore();
+}

src/components/MultiSourceCaptioningView.tsx CHANGED Viewed

@@ -1,533 +1,520 @@
-import React, { useState, useRef, useEffect, useCallback } from "react";
-import { useVLMContext } from "../context/useVLMContext";
-import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
-const MODES = ["Webcam", "URL", "File"] as const;
-type Mode = typeof MODES[number];
-const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
-const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
-// Helper function: normalizeBoxes remains as it is used
-function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
-  if (!raw) return [];
-  let boxes = [];
-  if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
-    boxes = raw.image;
-  } else if (Array.isArray(raw)) {
-    boxes = raw;
-  } else if (typeof raw === "object" && raw !== null) {
-    boxes = [raw];
-  }
-  return boxes
-    .map((obj: any) => {
-      if (!obj || !obj.bbox_2d) return null;
-      let bbox = obj.bbox_2d;
-      if (
-        Array.isArray(bbox) &&
-        bbox.length === 2 &&
-        Array.isArray(bbox[0]) &&
-        Array.isArray(bbox[1]) &&
-        bbox[0].length === 2 &&
-        bbox[1].length === 2
-      ) {
-        bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
-      }
-      if (
-        Array.isArray(bbox) &&
-        bbox.length === 4 &&
-        bbox.every((v: any) => typeof v === "number")
-      ) {
-        return { ...obj, bbox_2d: bbox };
-      }
-      return null;
-    })
-    .filter((obj: any) => obj);
-}
-function isImageFile(file: File) {
-  return file.type.startsWith("image/");
-}
-function isVideoFile(file: File) {
-  return file.type.startsWith("video/");
-}
-export default function MultiSourceCaptioningView() {
-  const [mode, setMode] = useState<Mode>("File");
-  const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
-  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
-  const [processingState, setProcessingState] = useState(false); // General processing indicator
-  const [error, setError] = useState<string | null>(null);
-  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream
-  const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
-  const [inferenceStatus, setInferenceStatus] = useState<string>("");
-  const [debugOutput, setDebugOutput] = useState<string>("");
-  const [uploadedFile, setUploadedFile] = useState<File | null>(null); // <<< ADDED THIS STATE
-  // Refs for the two video elements and the canvas
-  const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
-  const vlmVideoRef = useRef<HTMLVideoElement>(null);    // The hidden video for VLM processing
-  const canvasRef = useRef<HTMLCanvasElement>(null);     // The canvas overlay for drawing boxes
-  const imageRef = useRef<HTMLImageElement>(null);       // For image file processing
-  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
-  // --- Drawing Loop for the Visible Display ---
-  // This loop runs constantly to draw the latest boxes on the display video
-  const drawDisplayCanvas = useCallback(() => {
-    const displayVideo = displayVideoRef.current;
-    const canvas = canvasRef.current;
-    const ctx = canvas?.getContext('2d');
-    if (!displayVideo || !canvas || !ctx) {
-      return;
-    }
-    // Adjust canvas size to match the display video's dimensions
-    // Only adjust if video has valid dimensions
-    if (displayVideo.videoWidth > 0 && displayVideo.videoHeight > 0 &&
-        (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)) {
-      canvas.width = displayVideo.videoWidth;
-      canvas.height = displayVideo.videoHeight;
-    }
-    // Clear the canvas each frame
-    ctx.clearRect(0, 0, canvas.width, canvas.height);
-    // Draw the latest bounding boxes
-    const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
-    const scaleY = canvas.height / (displayVideo.videoHeight || 1);
-    drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-    // Only request next frame if video is playing to avoid unnecessary redraws when paused/ended
-    if (!displayVideo.paused && !displayVideo.ended) {
-      requestAnimationFrame(drawDisplayCanvas);
-    }
-  }, [latestBoxes]); // Re-create if latestBoxes changes
-  // Effect to start the display drawing loop when the display video is ready
-  useEffect(() => {
-    const displayVideo = displayVideoRef.current;
-    if (displayVideo) {
-      const handleVideoReady = () => {
-        if (displayVideo.readyState >= 1) { // HAVE_METADATA
-          requestAnimationFrame(drawDisplayCanvas);
-        }
-      };
-      displayVideo.addEventListener('loadedmetadata', handleVideoReady);
-      displayVideo.addEventListener('play', handleVideoReady); // Also start on play
-      // Also check if video is already ready (e.g., on component re-mount or autoplay)
-      if (displayVideo.readyState >= 1) {
-        requestAnimationFrame(drawDisplayCanvas);
-      }
-      return () => {
-        displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
-        displayVideo.removeEventListener('play', handleVideoReady);
-      };
-    }
-  }, [drawDisplayCanvas]);
-  // --- FastVLM Processing Loop (from hidden video) ---
-  // This interval loop controls when FastVLM processes a frame
-  useEffect(() => {
-    const vlmVideo = vlmVideoRef.current;
-    // Determine if we are in a video-based mode that requires continuous processing
-    const isVideoModeActive = (
-      mode === "Webcam" ||
-      (mode === "URL" && !!vlmVideo?.src) || // Check if URL video is loaded
-      (mode === "File" && !!vlmVideo?.src && uploadedFile && isVideoFile(uploadedFile))
-    );
-    if (!isLoaded || !vlmVideo || !isVideoModeActive) {
-      setProcessingState(false);
-      return;
-    }
-    let interval: ReturnType<typeof setInterval> | null = null;
-    const startVLMProcessing = () => {
-      if (interval) clearInterval(interval); // Clear any old interval
-      interval = setInterval(async () => {
-        if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) {
-          return; // Skip if video not ready, paused, ended, or already processing
-        }
-        setProcessingState(true);
-        setInferenceStatus("Running inference...");
-        setError(null);
-        try {
-          // Pass the HTMLVideoElement directly to runInference
-          const modelOutput = await runInference(vlmVideo, prompt); // <<< FIXED: Pass video element directly
-          setDebugOutput(modelOutput);
-          let boxes = extractJsonFromMarkdown(modelOutput) || [];
-          boxes = normalizeBoxes(boxes);
-          setLatestBoxes(boxes);
-          setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
-        } catch (e) {
-          setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
-          setLatestBoxes([]);
-          setInferenceStatus("Inference failed.");
-        } finally {
-          setProcessingState(false);
-        }
-      }, 200); // Inference interval (e.g., 5 frames per second)
-    };
-    const stopVLMProcessing = () => {
-      if (interval) clearInterval(interval);
-      interval = null;
-      setProcessingState(false);
-      setInferenceStatus("Stopped processing.");
-    };
-    vlmVideo.addEventListener('play', startVLMProcessing);
-    vlmVideo.addEventListener('pause', stopVLMProcessing);
-    vlmVideo.addEventListener('ended', stopVLMProcessing);
-    vlmVideo.addEventListener('loadeddata', startVLMProcessing); // Also start on loadeddata for better reliability
-    // Initial check if video is already playing or ready
-    if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
-        startVLMProcessing();
-    }
-    return () => {
-      stopVLMProcessing();
-      vlmVideo.removeEventListener('play', startVLMProcessing);
-      vlmVideo.removeEventListener('pause', stopVLMProcessing);
-      vlmVideo.removeEventListener('ended', stopVLMProcessing);
-      vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
-    };
-  }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Keep uploadedFile for re-trigger on file change
-  // --- Media Source Handling ---
-  // Cleanup for media stream and object URLs
-  const cleanupMediaSource = useCallback(() => {
-    if (mediaStream) {
-      mediaStream.getTracks().forEach(track => track.stop());
-      setMediaStream(null);
-    }
-    if (displayVideoRef.current?.src.startsWith('blob:')) {
-      URL.revokeObjectURL(displayVideoRef.current.src);
-      displayVideoRef.current.src = "";
-    }
-    if (vlmVideoRef.current?.src.startsWith('blob:')) {
-      URL.revokeObjectURL(vlmVideoRef.current.src);
-      vlmVideoRef.current.src = "";
-    }
-    setLatestBoxes([]);
-    setError(null);
-    setInferenceStatus("");
-    setDebugOutput("");
-    setUploadedFile(null); // <<< ADDED: Clear uploaded file on source change
-  }, [mediaStream]);
-  // Handle changing the mode (Webcam, URL, File)
-  useEffect(() => {
-    cleanupMediaSource();
-    const displayVideo = displayVideoRef.current;
-    const vlmVideo = vlmVideoRef.current;
-    if (!displayVideo || !vlmVideo) return;
-    // Reset srcObject/src to ensure fresh start
-    displayVideo.srcObject = null;
-    vlmVideo.srcObject = null;
-    displayVideo.src = "";
-    vlmVideo.src = "";
-    // Special handling for initial "File" mode to load example video if no file is selected
-    if (mode === "File" && !uploadedFile) { // <<< FIXED: Check uploadedFile here
-        displayVideo.src = EXAMPLE_VIDEO_URL;
-        vlmVideo.src = EXAMPLE_VIDEO_URL;
-        displayVideo.load(); vlmVideo.load();
-        displayVideo.play().catch(e => console.error("Error playing example display video:", e));
-        vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
-    }
-  }, [mode, uploadedFile, cleanupMediaSource]); // Added uploadedFile to ensure re-trigger for file mode
-  // Handle Webcam Input
-  const handleWebcamInput = useCallback(async () => {
-    cleanupMediaSource();
-    try {
-      const stream = await navigator.mediaDevices.getUserMedia({ video: true });
-      setMediaStream(stream);
-      if (displayVideoRef.current && vlmVideoRef.current) {
-        displayVideoRef.current.srcObject = stream;
-        vlmVideoRef.current.srcObject = stream;
-        displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
-        vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
-      }
-      setMode("Webcam");
-    } catch (e) {
-      setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
-      setMediaStream(null);
-      setLatestBoxes([]);
-      setInferenceStatus("Webcam access denied or failed.");
-    }
-  }, [cleanupMediaSource]);
-  // Handle URL Input (when Load button is clicked)
-  const handleLoadUrl = useCallback(() => {
-    cleanupMediaSource();
-    const url = currentUrlInput;
-    if (!url) {
-      setError("Please enter a valid URL.");
-      return;
-    }
-    if (displayVideoRef.current && vlmVideoRef.current) {
-      displayVideoRef.current.src = url;
-      vlmVideoRef.current.src = url;
-      displayVideoRef.current.load(); vlmVideoRef.current.load();
-      displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
-      vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
-      setMode("URL");
-    }
-  }, [currentUrlInput, cleanupMediaSource]);
-  // Handle File Input
-  const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
-    cleanupMediaSource();
-    const file = e.target.files?.[0] || null;
-    setUploadedFile(file); // <<< FIXED: Set uploadedFile state here
-    if (file) {
-      const fileUrl = URL.createObjectURL(file);
-      if (isImageFile(file)) {
-        // Image file, will be handled by imageRef and single processing logic
-        setMode("File"); // Ensure mode is "File"
-        // No direct video assignment needed here, imageRef handles display
-      } else if (isVideoFile(file)) {
-        if (displayVideoRef.current && vlmVideoRef.current) {
-          displayVideoRef.current.src = fileUrl;
-          vlmVideoRef.current.src = fileUrl;
-          displayVideoRef.current.load(); vlmVideoRef.current.load();
-          displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
-          vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
-          setMode("File"); // Ensure mode is "File"
-        }
-      } else {
-        setError("Unsupported file type. Please upload an image or video.");
-        setUploadedFile(null); // <<< FIXED: Clear uploadedFile on error
-        if (fileUrl) URL.revokeObjectURL(fileUrl);
-      }
-    } else {
-      setUploadedFile(null); // <<< FIXED: Clear uploadedFile if no file selected
-      // If no file selected, revert to example video if in File mode
-      if (mode === "File") {
-        if (displayVideoRef.current && vlmVideoRef.current) {
-          displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
-          vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
-          displayVideoRef.current.load(); vlmVideoRef.current.load();
-          displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
-          vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
-        }
-      }
-    }
-  }, [cleanupMediaSource, mode]);
-  // Handler for processing an uploaded image file (one-time inference)
-  const handleProcessImage = async () => {
-    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
-      setError("Image or model not ready for processing, or no image file selected.");
-      return;
-    }
-    const img = imageRef.current;
-    const canvas = canvasRef.current;
-    const ctx = canvas.getContext("2d");
-    if (!ctx) return;
-    canvas.width = img.naturalWidth;
-    canvas.height = img.naturalHeight;
-    setProcessingState(true);
-    setError(null);
-    setInferenceStatus("Running image inference...");
-    try {
-      // Pass the HTMLImageElement directly to runInference
-      const modelOutput = await runInference(img, prompt); // <<< FIXED: Pass image element directly
-      setDebugOutput(modelOutput);
-      setInferenceStatus("Image inference complete.");
-      ctx.clearRect(0, 0, canvas.width, canvas.height);
-      ctx.drawImage(img, 0, 0, canvas.width, canvas.height); // Redraw image
-      let boxes = extractJsonFromMarkdown(modelOutput) || [];
-      boxes = normalizeBoxes(boxes);
-      setLatestBoxes(boxes);
-      if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
-    } catch (e) {
-      setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
-      setLatestBoxes([]);
-      setInferenceStatus("Image inference failed.");
-    } finally {
-      setProcessingState(false);
-    }
-  };
-  // --- Rendered UI ---
-  return (
-    <div className="absolute inset-0 text-white flex flex-col">
-      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
-        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
-      </div>
-      <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>
-      <div className="flex flex-col items-center justify-center flex-1 w-full p-4">
-        {/* Mode Selector */}
-        <div className="mb-6 mt-4">
-          <div className="flex space-x-4">
-            {MODES.map((m) => (
-              <button
-                key={m}
-                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
-                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
-                }`}
-                onClick={() => setMode(m)}
-                disabled={!isLoaded && m !== "File"}
-              >
-                {m}
-              </button>
-            ))}
-          </div>
-        </div>
-        {/* Dynamic Content Area */}
-        <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
-          {/* Prompt Input (Common to all modes) */}
-          <div className="mb-4 w-full max-w-xl">
-            <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-            <textarea
-              className="w-full p-2 rounded-lg text-black"
-              rows={3}
-              value={prompt}
-              onChange={(e) => setPrompt(e.target.value)}
-              disabled={processingState}
-            />
-          </div>
-          {/* Video/Image Display and Canvas Overlay */}
-          <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
-            {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
-              <img
-                ref={imageRef}
-                src={URL.createObjectURL(uploadedFile)}
-                alt="Uploaded"
-                className="max-w-full max-h-full block object-contain"
-                style={{ position: 'absolute' }}
-                onLoad={() => {
-                    if (imageRef.current && canvasRef.current) {
-                        canvasRef.current.width = imageRef.current.naturalWidth;
-                        canvasRef.current.height = imageRef.current.naturalHeight;
-                    }
-                }}
-              />
-            ) : (
-              <video
-                ref={displayVideoRef}
-                autoPlay
-                muted
-                playsInline
-                loop
-                className="max-w-full max-h-full block object-contain"
-                style={{ position: 'absolute' }}
-              />
-            )}
-            <canvas
-              ref={canvasRef}
-              className="absolute top-0 left-0 w-full h-full pointer-events-none"
-              style={{ zIndex: 10 }}
-            />
-          </div>
-          {/* Controls specific to each mode */}
-          <div className="mt-4 flex flex-col items-center gap-2">
-            {mode === "Webcam" && (
-              <button
-                className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
-                onClick={handleWebcamInput}
-                disabled={processingState || !isLoaded}
-              >
-                {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
-              </button>
-            )}
-            {mode === "URL" && (
-              <>
-                <div className="flex w-full max-w-xl">
-                  <input
-                    type="text"
-                    className="flex-1 px-4 py-2 rounded-l-lg text-black"
-                    value={currentUrlInput}
-                    onChange={(e) => setCurrentUrlInput(e.target.value)}
-                    placeholder="Paste video URL here"
-                    disabled={processingState}
-                  />
-                  <button
-                    className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
-                    onClick={handleLoadUrl}
-                    disabled={processingState || !isLoaded}
-                  >
-                    Load URL
-                  </button>
-                </div>
-              </>
-            )}
-            {mode === "File" && (
-              <>
-                <input
-                  type="file"
-                  accept="image/*,video/*"
-                  onChange={handleFileChange}
-                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
-                  disabled={processingState}
-                />
-                {uploadedFile && isImageFile(uploadedFile) && ( // <<< FIXED: Check uploadedFile here
-                  <button
-                    className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
-                    onClick={handleProcessImage}
-                    disabled={processingState || !isLoaded}
-                  >
-                    {processingState ? "Processing Image..." : "Process Image"}
-                  </button>
-                )}
-              </>
-            )}
-          </div>
-          {/* Error and Debug Output */}
-          {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
-          <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
-            <div>Raw Model Output:</div>
-            <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
-          </div>
-        </div>
-      </div>
-      {/* Hidden Video for VLM processing - this must be rendered always */}
-      <video
-        ref={vlmVideoRef}
-        autoPlay
-        muted
-        playsInline
-        loop
-        style={{ display: 'none' }} // Hidden from view
-      />
-    </div>
-  );
-}

+import { useState, useRef, useEffect } from "react";
+import { useVLMContext } from "../context/useVLMContext";
+import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
+const MODES = ["Webcam", "URL", "File"] as const; // Input sources the view distinguishes; `as const` keeps each entry as a string literal type.
+type Mode = typeof MODES[number]; // Union of the MODES entries: "Webcam" | "URL" | "File".
+const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Initial value of both the videoUrl and inputUrl state in the component below.
+const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]"; // Initial value of the prompt state; asks the model for the { label, bbox_2d } JSON shape that normalizeBoxes accepts.
+function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] { // Converts a flat ["label", bbox, bbox, ...] array into one { label, bbox_2d } object per bbox, all sharing the leading label.
+  if (typeof arr[0] === "string" && Array.isArray(arr[1])) { // Only the first two elements are shape-checked.
+    const label = arr[0];
+    return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox })); // NOTE(review): entries after arr[1] are not validated here, so a non-array entry is passed through as bbox_2d.
+  }
+  return []; // Any other input shape (including an empty array) yields no boxes.
+}
+function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] { // Coerces parsed model output into a list of entries whose bbox_2d is a flat four-number array; entries that do not fit are dropped.
+  if (!raw) return []; // null, undefined and other falsy inputs produce no boxes.
+  let boxes = []; // Stays empty when raw is neither an object nor an array (e.g. a string or number).
+  if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) { // Object carrying the list under an "image" key.
+    boxes = raw.image;
+  } else if (Array.isArray(raw)) { // Already a list of entries.
+    boxes = raw;
+  } else if (typeof raw === "object" && raw !== null) { // Single entry object: treat it as a one-element list.
+    boxes = [raw];
+  }
+  return boxes
+    .map((obj: any) => {
+      if (!obj || !obj.bbox_2d) return null; // Entries without bbox_2d become null and are removed by the filter below.
+      let bbox = obj.bbox_2d;
+      // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
+      if (
+        Array.isArray(bbox) &&
+        bbox.length === 2 &&
+        Array.isArray(bbox[0]) &&
+        Array.isArray(bbox[1]) &&
+        bbox[0].length === 2 &&
+        bbox[1].length === 2
+      ) {
+        bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
+      }
+      // If bbox_2d is [x1, y1, x2, y2], use as-is
+      if (
+        Array.isArray(bbox) &&
+        bbox.length === 4 &&
+        bbox.every((v: any) => typeof v === "number") // typeof check only: NaN and Infinity also pass.
+      ) {
+        return { ...obj, bbox_2d: bbox }; // Keeps the entry's other fields and swaps in the flat bbox. NOTE(review): label is never checked, so it may be absent despite the declared return type.
+      }
+      // Otherwise, skip
+      return null;
+    })
+    .filter((obj: any) => obj); // Remove the nulls produced for rejected entries.
+}
+function isImageFile(file: File) { // True when file.type (the File's MIME type string) begins with "image/".
+  return file.type.startsWith("image/"); // An empty type string yields false.
+}
+function isVideoFile(file: File) { // True when file.type (the File's MIME type string) begins with "video/".
+  return file.type.startsWith("video/"); // An empty type string yields false.
+}
+export default function MultiSourceCaptioningView() {
+  const [mode, setMode] = useState<Mode>("File");
+  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
+  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
+  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
+  const [processing, setProcessing] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+  const [webcamActive, setWebcamActive] = useState(false);
+  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
+  const [uploadedUrl, setUploadedUrl] = useState<string>("");
+  const [videoProcessing, setVideoProcessing] = useState(false);
+  const [imageProcessed, setImageProcessed] = useState(false);
+  const [exampleProcessing, setExampleProcessing] = useState(false);
+  const [urlProcessing, setUrlProcessing] = useState(false);
+  const [debugOutput, setDebugOutput] = useState<string>("");
+  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
+  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
+  const [inferenceStatus, setInferenceStatus] = useState<string>("");
+  const videoRef = useRef<HTMLVideoElement | null>(null);
+  const canvasRef = useRef<HTMLCanvasElement | null>(null);
+  const imageRef = useRef<HTMLImageElement | null>(null);
+  const webcamStreamRef = useRef<MediaStream | null>(null);
+  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
+  const processVideoFrame = async () => {
+    if (!videoRef.current || !canvasRef.current) return;
+    const video = videoRef.current;
+    const canvas = canvasRef.current;
+    if (video.paused || video.ended || video.videoWidth === 0) return;
+    canvas.width = video.videoWidth;
+    canvas.height = video.videoHeight;
+    const ctx = canvas.getContext("2d");
+    if (!ctx) return;
+    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
+    await runInference(video, prompt, (output: string) => {
+      setDebugOutput(output); // <-- Ensure Raw Model Output is updated
+      let boxes = extractJsonFromMarkdown(output) || [];
+      if (boxes.length === 0 && Array.isArray(output)) {
+        boxes = parseFlatBoxArray(output);
+      }
+      boxes = normalizeBoxes(boxes);
+      console.log("Model output:", output);
+      console.log("Boxes after normalization:", boxes);
+      console.log("Canvas size:", canvas.width, canvas.height);
+      if (boxes.length > 0) {
+        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
+        console.log("First box coords:", x1, y1, x2, y2);
+      }
+      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+      if (Array.isArray(boxes) && boxes.length > 0) {
+        const scaleX = canvas.width / video.videoWidth;
+        const scaleY = canvas.height / video.videoHeight;
+        ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
+        drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
+      }
+    });
+  };
+  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    const file = e.target.files?.[0] || null;
+    setUploadedFile(file);
+    setUploadedUrl(file ? URL.createObjectURL(file) : "");
+    setError(null);
+    setImageProcessed(false);
+    setVideoProcessing(false);
+    setExampleProcessing(false);
+  };
+  // Webcam setup and teardown: acquire the camera while mode is "Webcam" and
+  // release it when the mode changes or the effect is cleaned up.
+  useEffect(() => {
+    // Stops every track of the stream held in the ref (if any) and clears the ref.
+    const releaseStream = () => {
+      webcamStreamRef.current?.getTracks().forEach((track: MediaStreamTrack) => track.stop());
+      webcamStreamRef.current = null;
+    };
+    if (mode !== "Webcam") {
+      releaseStream();
+      setWebcamActive(false);
+      return;
+    }
+    // Set by the cleanup below. getUserMedia() can resolve after the effect has
+    // been torn down; without this flag that late stream would be stored and
+    // never stopped, leaving the camera on.
+    let cancelled = false;
+    const setupWebcam = async () => {
+      try {
+        setError(null);
+        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+        if (cancelled) {
+          // The effect was cleaned up while the request was pending: release
+          // the stream right away instead of attaching it.
+          stream.getTracks().forEach((track: MediaStreamTrack) => track.stop());
+          return;
+        }
+        webcamStreamRef.current = stream;
+        if (videoRef.current) {
+          videoRef.current.srcObject = stream;
+          setWebcamActive(true);
+        }
+      } catch (e) {
+        // Do not report a failure for a request nobody is waiting on any more.
+        if (cancelled) return;
+        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
+        setWebcamActive(false);
+      }
+    };
+    setupWebcam();
+    return () => {
+      cancelled = true;
+      releaseStream();
+      setWebcamActive(false);
+    };
+  }, [mode]);
+  // Webcam mode: while the model is loaded and the webcam is active, call
+  // processVideoFrame once per second.
+  useEffect(() => {
+    const shouldPoll = mode === "Webcam" && isLoaded && webcamActive;
+    if (!shouldPoll) return;
+    const timerId = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    // Stop polling when a dependency changes or the component unmounts.
+    return () => {
+      clearInterval(timerId);
+    };
+  }, [mode, isLoaded, prompt, runInference, webcamActive]);
+  // URL mode: while the model is loaded and URL processing is switched on, call
+  // processVideoFrame once per second.
+  useEffect(() => {
+    const shouldPoll = mode === "URL" && isLoaded && urlProcessing;
+    if (!shouldPoll) return;
+    const timerId = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    // Stop polling when a dependency changes or the component unmounts.
+    return () => {
+      clearInterval(timerId);
+    };
+  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
+  // File video mode: while the model is loaded, an uploaded video is present and
+  // video processing is switched on, call processVideoFrame once per second.
+  useEffect(() => {
+    const shouldPoll =
+      mode === "File" && isLoaded && uploadedFile && isVideoFile(uploadedFile) && videoProcessing;
+    if (!shouldPoll) return;
+    const timerId = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    // Stop polling when a dependency changes or the component unmounts.
+    return () => {
+      clearInterval(timerId);
+    };
+  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
+  // Example video mode (File mode with nothing uploaded): while the model is
+  // loaded and example processing is switched on, call processVideoFrame once
+  // per second.
+  useEffect(() => {
+    const shouldPoll = mode === "File" && !uploadedFile && isLoaded && exampleProcessing;
+    if (!shouldPoll) return;
+    const timerId = setInterval(() => {
+      processVideoFrame();
+    }, 1000);
+    // Stop polling when a dependency changes or the component unmounts.
+    return () => {
+      clearInterval(timerId);
+    };
+  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
+  /**
+   * File mode: runs detection on the uploaded image (only on button click).
+   *
+   * Sizes the overlay canvas to the image's natural resolution, draws the image
+   * onto it, runs inference with the current prompt, and draws any boxes parsed
+   * from the model output on top of the image. Does nothing unless the model is
+   * loaded, an image file is uploaded, and both the image and canvas refs are
+   * attached.
+   */
+  const handleProcessImage = async () => {
+    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
+    const img = imageRef.current;
+    const canvas = canvasRef.current;
+    canvas.width = img.naturalWidth;
+    canvas.height = img.naturalHeight;
+    setCanvasDims({w:canvas.width,h:canvas.height});
+    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
+    const ctx = canvas.getContext("2d");
+    if (!ctx) return;
+    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+    setProcessing(true);
+    setError(null);
+    setInferenceStatus("Running inference...");
+    try {
+      await runInference(img, prompt, (output: string) => {
+        setDebugOutput(output);
+        setInferenceStatus("Inference complete.");
+        // Repaint the image so boxes from a previous run are not kept.
+        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+        let boxes = extractJsonFromMarkdown(output) || [];
+        // NOTE(review): output is typed as a string, so this fallback only fires
+        // if the model layer hands back an array at runtime - confirm it is needed.
+        if (boxes.length === 0 && Array.isArray(output)) {
+          boxes = parseFlatBoxArray(output);
+        }
+        boxes = normalizeBoxes(boxes);
+        console.log("Model output:", output);
+        console.log("Boxes after normalization:", boxes);
+        console.log("Canvas size:", canvas.width, canvas.height);
+        if (boxes.length > 0) {
+          const [x1, y1, x2, y2] = boxes[0].bbox_2d;
+          console.log("First box coords:", x1, y1, x2, y2);
+        }
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          // The canvas was sized to the image's natural dimensions above, so
+          // these factors map image pixels onto canvas pixels.
+          const scaleX = canvas.width / img.naturalWidth;
+          const scaleY = canvas.height / img.naturalHeight;
+          drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
+        }
+        setImageProcessed(true);
+      });
+    } catch (e) {
+      // Surface the failure instead of leaving the status stuck on "Running inference...".
+      setError("Inference failed: " + (e instanceof Error ? e.message : String(e)));
+      setInferenceStatus("Inference failed.");
+    } finally {
+      // Always re-enable the button, even when inference rejects.
+      setProcessing(false);
+    }
+  };
+  // File mode: flips the videoProcessing flag to start or stop processing frames
+  // of the uploaded video.
+  const handleToggleVideoProcessing = () => {
+    setVideoProcessing((isRunning) => !isRunning);
+  };
+  // Flips the exampleProcessing flag to start or stop processing frames of the
+  // example video.
+  const handleToggleExampleProcessing = () => {
+    setExampleProcessing((isRunning) => !isRunning);
+  };
+  // Flips the urlProcessing flag to start or stop processing frames of the URL
+  // video.
+  const handleToggleUrlProcessing = () => {
+    setUrlProcessing((isRunning) => !isRunning);
+  };
+  // Debug helper: clears the overlay canvas and draws a magenta rectangle with a
+  // "Test Box" caption.
+  const handleTestDrawBox = () => {
+    const overlay = canvasRef.current;
+    const context = overlay?.getContext("2d");
+    if (!overlay || !context) return;
+    const { width, height } = overlay;
+    // The box spans a quarter of each canvas dimension, but at least 40 units.
+    const boxWidth = Math.max(40, width / 4);
+    const boxHeight = Math.max(40, height / 4);
+    context.clearRect(0, 0, width, height);
+    context.strokeStyle = "#FF00FF";
+    context.lineWidth = 4;
+    context.font = "20px Arial";
+    context.fillStyle = "#FF00FF";
+    context.strokeRect(40, 40, boxWidth, boxHeight);
+    context.fillText("Test Box", 50, 35);
+  };
+  // Render: a fixed model-status banner, the latest inference status line, the
+  // mode selector, and the controls for whichever mode is currently selected.
+  // The <video>/<img> and <canvas> elements of each mode share videoRef /
+  // imageRef / canvasRef; only the selected mode's elements are rendered.
+  return (
+    <div className="absolute inset-0 text-white">
+      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
+        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
+      </div>
+      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
+      <div className="flex flex-col items-center justify-center h-full w-full">
+        {/* Mode Selector */}
+        <div className="mb-6">
+          <div className="flex space-x-4">
+            {MODES.map((m) => (
+              // One button per entry of MODES; the selected mode gets the solid blue style.
+              <button
+                key={m}
+                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
+                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
+                }`}
+                onClick={() => setMode(m)}
+              >
+                {m}
+              </button>
+            ))}
+          </div>
+        </div>
+        {/* Mode Content */}
+        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
+          {mode === "Webcam" && (
+            // Webcam mode: prompt editor plus the <video> that the webcam effect
+            // attaches the camera stream to, with the overlay canvas on top.
+            <div className="w-full text-center flex flex-col items-center">
+              <div className="mb-4 w-full max-w-xl">
+                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+                <textarea
+                  className="w-full p-2 rounded-lg text-black"
+                  rows={3}
+                  value={prompt}
+                  onChange={(e) => setPrompt(e.target.value)}
+                />
+              </div>
+              <div className="relative w-full max-w-xl">
+                <video
+                  ref={videoRef}
+                  autoPlay
+                  muted
+                  playsInline
+                  className="w-full rounded-lg shadow-lg mb-2"
+                  style={{ background: "#222" }}
+                />
+                <canvas
+                  // Absolutely positioned over the relative wrapper; pointer
+                  // events are disabled so the overlay never intercepts clicks.
+                  ref={canvasRef}
+                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                  style={{ zIndex: 10, pointerEvents: "none" }}
+                />
+              </div>
+              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
+              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+            </div>
+          )}
+          {mode === "URL" && (
+            // URL mode: "Load" copies the typed URL into videoUrl, which is the
+            // <video> src. The Start/Stop button toggles urlProcessing. The
+            // "Test Draw Box" button and the panel below it (canvas/video
+            // dimensions, raw model output) are debugging aids.
+            <div className="w-full text-center flex flex-col items-center">
+              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
+              <div className="flex w-full max-w-xl mb-4">
+                <input
+                  type="text"
+                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
+                  value={inputUrl}
+                  onChange={(e) => setInputUrl(e.target.value)}
+                  placeholder="Paste video URL here"
+                />
+                <button
+                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
+                  onClick={() => setVideoUrl(inputUrl)}
+                >
+                  Load
+                </button>
+              </div>
+              <div className="mb-4 w-full max-w-xl">
+                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+                <textarea
+                  className="w-full p-2 rounded-lg text-black"
+                  rows={3}
+                  value={prompt}
+                  onChange={(e) => setPrompt(e.target.value)}
+                />
+              </div>
+              <div className="relative w-full max-w-xl">
+                <video
+                  ref={videoRef}
+                  src={videoUrl}
+                  controls
+                  autoPlay
+                  loop
+                  className="w-full rounded-lg shadow-lg mb-2"
+                  style={{ background: "#222" }}
+                />
+                <canvas
+                  ref={canvasRef}
+                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                  style={{ zIndex: 10, pointerEvents: "none" }}
+                />
+                <button
+                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
+                  onClick={handleToggleUrlProcessing}
+                >
+                  {urlProcessing ? "Stop Processing" : "Start Processing"}
+                </button>
+              </div>
+              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
+              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+              <button
+                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
+                onClick={handleTestDrawBox}
+              >
+                Test Draw Box
+              </button>
+              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
+                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
+                <div>Raw Model Output:</div>
+                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
+              </div>
+            </div>
+          )}
+          {mode === "File" && (
+            // File mode: shows the uploaded image or video depending on its
+            // type, or the example video when nothing has been uploaded. It has
+            // the same "Test Draw Box" button and debug panel as URL mode.
+            <div className="w-full text-center flex flex-col items-center">
+              <div className="mb-4 w-full max-w-xl">
+                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+                <textarea
+                  className="w-full p-2 rounded-lg text-black"
+                  rows={3}
+                  value={prompt}
+                  onChange={(e) => setPrompt(e.target.value)}
+                />
+              </div>
+              <div className="mb-4 w-full max-w-xl">
+                <input
+                  type="file"
+                  accept="image/*,video/*"
+                  onChange={handleFileChange}
+                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
+                />
+              </div>
+              {/* Show uploaded image */}
+              {uploadedFile && isImageFile(uploadedFile) && (
+                // Images are processed once per button click; the button is
+                // disabled while a run is in progress.
+                <div className="relative w-full max-w-xl">
+                  <img
+                    ref={imageRef}
+                    src={uploadedUrl}
+                    alt="Uploaded"
+                    className="w-full rounded-lg shadow-lg mb-2"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 10, pointerEvents: "none" }}
+                  />
+                  <button
+                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
+                    onClick={handleProcessImage}
+                    disabled={processing}
+                  >
+                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
+                  </button>
+                </div>
+              )}
+              {/* Show uploaded video */}
+              {uploadedFile && isVideoFile(uploadedFile) && (
+                // Uploaded videos are processed while videoProcessing is
+                // toggled on.
+                <div className="relative w-full max-w-xl">
+                  <video
+                    ref={videoRef}
+                    src={uploadedUrl}
+                    controls
+                    autoPlay
+                    loop
+                    className="w-full rounded-lg shadow-lg mb-2"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 10, pointerEvents: "none" }}
+                  />
+                  <button
+                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
+                    onClick={handleToggleVideoProcessing}
+                  >
+                    {videoProcessing ? "Stop Processing" : "Start Processing"}
+                  </button>
+                </div>
+              )}
+              {/* Show example video if no file uploaded */}
+              {!uploadedFile && (
+                // The example video is processed while exampleProcessing is
+                // toggled on.
+                <div className="relative w-full max-w-xl">
+                  <video
+                    ref={videoRef}
+                    src={EXAMPLE_VIDEO_URL}
+                    controls
+                    autoPlay
+                    loop
+                    className="w-full rounded-lg shadow-lg mb-2"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 10, pointerEvents: "none" }}
+                  />
+                  <button
+                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
+                    onClick={handleToggleExampleProcessing}
+                  >
+                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
+                  </button>
+                </div>
+              )}
+              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
+              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+              <button
+                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
+                onClick={handleTestDrawBox}
+              >
+                Test Draw Box
+              </button>
+              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
+                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
+                <div>Raw Model Output:</div>
+                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
+              </div>
+            </div>
+          )}
+        </div>
+      </div>
+    </div>
+  );
+}

src/index.js CHANGED Viewed

@@ -14,4 +14,4 @@ root.render(
 // If you want to start measuring performance in your app, pass a function
 // to log results (for example: reportWebVitals(console.log))
 // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
-reportWebVitals();

 // If you want to start measuring performance in your app, pass a function
 // to log results (for example: reportWebVitals(console.log))
 // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
+reportWebVitals();