// Page header captured with this file — Spaces: Quazim0t0 / FastVLMBoxes (status: Running).
// File size: 20,805 Bytes

import { useState, useRef, useEffect } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

/** Input modes offered by the mode selector; only file input is listed. */
const MODES = ["File"] as const;

/** Union of the literal values found in {@link MODES}. */
type Mode = (typeof MODES)[number];

/** Video shown while the user has not uploaded a file. */
const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";

/** Initial text of the detection prompt box; sent to the model verbatim. */
const EXAMPLE_PROMPT = "Detect each individual animated characters in the image. The characters are moving. For each character, output a JSON array of objects with fields. Each character should have its own ([x1, y1, x2, y2]) where coordinates are in pixel values. No coordinates should be the same. This should be used to draw a box using the points around the character. This is an example of two boxes, the format of this : [x1, y1, x2, y2], [x1, y1, x2, y2]";

/** Returns true when the file's MIME type begins with `image/`. */
function isImageFile(file: File) {
  const prefix = "image/";
  return file.type.slice(0, prefix.length) === prefix;
}
/** Returns true when the file's MIME type begins with `video/`. */
function isVideoFile(file: File) {
  const prefix = "video/";
  return file.type.slice(0, prefix.length) === prefix;
}

/**
 * Converts a box expressed as fractions of the frame into pixel coordinates.
 *
 * A box counts as normalized only when it has exactly four entries and each
 * one lies in [0, 1]. Any other input is handed back untouched (same array).
 *
 * @param box    Candidate `[x1, y1, x2, y2]` values.
 * @param width  Pixel width that x values are multiplied by.
 * @param height Pixel height that y values are multiplied by.
 * @returns A new pixel-space array for normalized input, otherwise `box` itself.
 */
function denormalizeBox(box: number[], width: number, height: number) {
  const insideUnitRange = (value: number) => value >= 0 && value <= 1;
  const looksNormalized = box.length === 4 && !box.some((value) => !insideUnitRange(value));
  if (!looksNormalized) {
    return box;
  }
  // Even positions hold x coordinates, odd positions hold y coordinates.
  return box.map((value, position) => value * (position % 2 === 0 ? width : height));
}

/**
 * Pulls every bounding box out of raw model text.
 *
 * Two strategies are tried in order:
 *  1. Structured: the text (or the contents of its first markdown code fence)
 *     is parsed as a JSON array of `{ label, bbox_2d }` objects. `bbox_2d` may
 *     be a single `[x1, y1, x2, y2]` or a list of such arrays; a list yields
 *     one result per inner box, all sharing the object's label.
 *  2. Fallback: every `[n, n, n, n]` group of non-negative decimal numbers
 *     found anywhere in the text becomes a box with an empty label.
 *
 * Boxes whose coordinates are not four finite numbers are discarded in both
 * strategies.
 *
 * @param output Raw text produced by the model.
 * @returns Flat list of labelled boxes; empty when nothing usable was found.
 */
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
  const isBox = (value: unknown): value is number[] =>
    Array.isArray(value) &&
    value.length === 4 &&
    value.every((n) => typeof n === "number" && Number.isFinite(n));

  // Models frequently wrap JSON in a ```json fence; parsing the fenced payload
  // keeps the labels instead of dropping to the label-less regex path.
  const fence = /```(?:json)?\s*([\s\S]*?)```/i.exec(output);
  const jsonCandidate = (fence?.[1] ?? output).trim();

  try {
    const parsed: unknown = JSON.parse(jsonCandidate);
    if (Array.isArray(parsed)) {
      const labelled: { label: string, bbox_2d: number[] }[] = [];
      for (const item of parsed as unknown[]) {
        if (typeof item !== "object" || item === null) continue;
        const { label, bbox_2d: raw } = item as { label?: unknown; bbox_2d?: unknown };
        const labelUsable = (typeof label === "string" || typeof label === "number") && Boolean(label);
        if (!labelUsable || !Array.isArray(raw)) continue;
        const name = String(label);
        // A nested first element means "several boxes for one label".
        const candidates: unknown[] = Array.isArray(raw[0]) ? raw : [raw];
        for (const candidate of candidates) {
          if (isBox(candidate)) labelled.push({ label: name, bbox_2d: candidate });
        }
      }
      if (labelled.length > 0) return labelled;
    }
  } catch {
    // Free-form model text is expected not to be valid JSON; use the regex fallback.
  }

  // Fallback: extract all [x1, y1, x2, y2] arrays from the original string.
  const boxPattern = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
  const boxes: { label: string, bbox_2d: number[] }[] = [];
  for (const match of output.matchAll(boxPattern)) {
    const coords = match.slice(1, 5).map((text) => parseFloat(text));
    // The character class also matches tokens such as "." that parse to NaN.
    if (isBox(coords)) boxes.push({ label: '', bbox_2d: coords });
  }
  return boxes;
}

/** A detected box plus the wall-clock time (ms since epoch) at which it was recorded. */
type TimedBox = { label: string; bbox_2d: number[] | number[][]; timestamp: number };

/** How long a detected box stays on screen after it was reported. */
const BOX_TTL_MS = 2000;
/** Delay between inference attempts while video processing is switched on. */
const FRAME_INTERVAL_MS = 1000;
/** Delay between repaints of the video overlay. */
const REDRAW_INTERVAL_MS = 100;

/** True when `bbox_2d` holds a list of boxes rather than one flat box. */
function isBoxList(value: number[] | number[][]): value is number[][] {
  return Array.isArray(value[0]);
}

/** Turns an unknown thrown value into text suitable for the error line. */
function describeError(reason: unknown): string {
  return reason instanceof Error ? reason.message : String(reason);
}

/**
 * Sizes `canvas` to the video's intrinsic resolution, clears it, and draws `boxes`.
 * Does nothing until the video reports non-zero dimensions.
 */
function paintVideoBoxes(
  canvas: HTMLCanvasElement,
  video: HTMLVideoElement,
  boxes: readonly TimedBox[],
): void {
  if (video.videoWidth === 0 || video.videoHeight === 0) return;
  // Assigning width/height resets the bitmap, so only do it on a real size change.
  if (canvas.width !== video.videoWidth) canvas.width = video.videoWidth;
  if (canvas.height !== video.videoHeight) canvas.height = video.videoHeight;
  const ctx = canvas.getContext("2d");
  if (!ctx) return;
  ctx.clearRect(0, 0, canvas.width, canvas.height);
  if (boxes.length === 0) return;

  const drawable = boxes.flatMap((entry) => {
    const groups = isBoxList(entry.bbox_2d) ? entry.bbox_2d : [entry.bbox_2d];
    return groups
      .filter((group) => Array.isArray(group) && group.length === 4)
      .map((group) => ({ ...entry, bbox_2d: denormalizeBox(group, canvas.width, canvas.height) }));
  });
  drawBoundingBoxesOnCanvas(ctx, drawable, {
    color: "#FF00FF",
    lineWidth: 4,
    font: "20px Arial",
    scaleX: canvas.width / video.videoWidth,
    scaleY: canvas.height / video.videoHeight,
  });
}

/**
 * File-based detection view.
 *
 * Lets the user edit the detection prompt and upload an image or a video.
 * With no upload, the example video is shown. Images are processed once per
 * button click. Videos are sampled every {@link FRAME_INTERVAL_MS} ms while
 * processing is switched on, and every reported box stays on the overlay for
 * {@link BOX_TTL_MS} ms.
 *
 * Each video source renders two elements: a visible player and a hidden one
 * that is handed to `runInference`.
 * NOTE(review): the two players are not synchronized with each other. An
 * earlier sync hook read a ref that was never attached to an element, so it
 * never ran; it has been removed rather than guessed at. Decide which player
 * should drive the other before re-adding it.
 */
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [canvasDims, setCanvasDims] = useState<{ w: number; h: number } | null>(null);
  const [videoDims, setVideoDims] = useState<{ w: number; h: number } | null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");

  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  const boxHistoryRef = useRef<TimedBox[]>([]);
  // Set while a video-frame inference is pending so interval ticks cannot pile up.
  const frameInFlightRef = useRef(false);
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Release the blob URL of the previous upload once it is replaced or the view unmounts.
  useEffect(() => {
    if (!uploadedUrl) return undefined;
    return () => URL.revokeObjectURL(uploadedUrl);
  }, [uploadedUrl]);

  // Start the hidden processing video whenever the source changes.
  useEffect(() => {
    if (mode === "File" && processingVideoRef.current) {
      // A rejected play() (e.g. autoplay policy) is not actionable here.
      processingVideoRef.current.play().catch(() => {});
    }
  }, [mode, uploadedUrl]);

  /**
   * Runs one inference pass on the hidden processing video and records the
   * boxes it reports. Skips the pass when the video is not playing, has no
   * dimensions yet, or when a previous pass is still pending.
   */
  const processVideoFrame = async (): Promise<void> => {
    const video = processingVideoRef.current;
    const canvas = canvasRef.current;
    if (!video || !canvas) return;
    if (video.paused || video.ended || video.videoWidth === 0) return;
    if (frameInFlightRef.current) return;

    frameInFlightRef.current = true;
    try {
      await runInference(video, prompt, (output: string) => {
        // Ignore results that arrive after the source element was swapped out.
        if (processingVideoRef.current !== video) return;
        setDebugOutput(output);
        const now = Date.now();
        const detected = extractAllBoundingBoxes(output);
        const stillFresh = boxHistoryRef.current.filter((b) => now - b.timestamp < BOX_TTL_MS);
        boxHistoryRef.current = [
          ...stillFresh,
          ...detected.map((box) => ({ ...box, timestamp: now })),
        ];
        paintVideoBoxes(canvas, video, boxHistoryRef.current);
      });
      setError(null);
    } catch (reason: unknown) {
      setError(describeError(reason));
    } finally {
      frameInFlightRef.current = false;
    }
  };

  /** Stores the chosen file and resets everything tied to the previous one. */
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] ?? null;
    // Old detections and old pixels must not be shown on top of the new media.
    boxHistoryRef.current = [];
    const canvas = canvasRef.current;
    if (canvas) {
      canvas.getContext("2d")?.clearRect(0, 0, canvas.width, canvas.height);
    }
    setUploadedFile(file);
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };

  // Sample frames on a timer while processing is on for the active video
  // source: the uploaded video if there is one, the example video otherwise.
  const videoSourceActive = uploadedFile
    ? isVideoFile(uploadedFile) && videoProcessing
    : exampleProcessing;
  const shouldProcessFrames = mode === "File" && isLoaded && videoSourceActive;
  useEffect(() => {
    if (!shouldProcessFrames) return undefined;
    const timer = setInterval(() => {
      void processVideoFrame();
    }, FRAME_INTERVAL_MS);
    return () => clearInterval(timer);
  }, [shouldProcessFrames, prompt, runInference]);

  /**
   * Runs inference on the uploaded image (button click only) and draws the
   * reported boxes over a copy of the image on the canvas.
   */
  const handleProcessImage = async (): Promise<void> => {
    const img = imageRef.current;
    const canvas = canvasRef.current;
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !img || !canvas) return;
    if (img.naturalWidth === 0 || img.naturalHeight === 0) {
      setError("Image has not finished loading.");
      return;
    }
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({ w: canvas.width, h: canvas.height });
    setVideoDims({ w: img.naturalWidth, h: img.naturalHeight });
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);

    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    try {
      await runInference(img, prompt, (output: string) => {
        setDebugOutput(output);
        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
        const boxes = extractAllBoundingBoxes(output);
        if (boxes.length === 0) {
          setInferenceStatus("No boxes detected or model output invalid.");
        } else {
          setInferenceStatus("Inference complete.");
          // Same normalized-coordinate handling as the video path.
          const drawable = boxes.map((box) => ({
            ...box,
            bbox_2d: denormalizeBox(box.bbox_2d, canvas.width, canvas.height),
          }));
          const scaleX = canvas.width / img.naturalWidth;
          const scaleY = canvas.height / img.naturalHeight;
          drawBoundingBoxesOnCanvas(ctx, drawable, { scaleX, scaleY });
        }
        setImageProcessed(true);
      });
    } catch (reason: unknown) {
      setError(describeError(reason));
      setInferenceStatus("");
    } finally {
      // Always re-enable the button, even when inference rejects.
      setProcessing(false);
    }
  };

  /** Starts or stops frame sampling for the uploaded video. */
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev) => !prev);
  };

  /** Starts or stops frame sampling for the example video. */
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev) => !prev);
  };

  /**
   * Draws a fixed magenta rectangle and caption on the canvas as a smoke test.
   * While a video is shown, the overlay repaint timer clears it again shortly after.
   */
  const handleTestDrawBox = () => {
    const canvas = canvasRef.current;
    if (!canvas) return;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40, canvas.width / 4), Math.max(40, canvas.height / 4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };

  // Repaint the video overlay on a timer so boxes disappear once they expire.
  // No-op while an image is shown because no overlay video is mounted then.
  useEffect(() => {
    const repaint = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      const now = Date.now();
      boxHistoryRef.current = boxHistoryRef.current.filter((b) => now - b.timestamp < BOX_TTL_MS);
      paintVideoBoxes(canvas, overlayVideo, boxHistoryRef.current);
    };
    repaint();
    const timer = setInterval(repaint, REDRAW_INTERVAL_MS);
    return () => clearInterval(timer);
  }, []);

  let modelStatus = "Model not loaded";
  if (isLoading) modelStatus = "Loading model...";
  else if (isLoaded) modelStatus = "Model loaded";
  else if (modelError) modelStatus = `Model error: ${modelError}`;

  /**
   * Visible player + hidden processing player + box overlay + start/stop button.
   * The canvas shares a positioned wrapper with the visible player only, so its
   * CSS box matches the video and boxes are not stretched by the button below.
   */
  const renderVideoStage = (src: string, running: boolean, onToggle: () => void) => (
    <div className="w-full max-w-xl">
      <div className="relative mb-2">
        {/* Visible overlay video for user */}
        <video
          ref={overlayVideoRef}
          src={src}
          controls
          autoPlay
          loop
          muted
          playsInline
          className="block w-full rounded-lg shadow-lg"
          style={{ background: "#222" }}
        />
        {/* Hidden processing video for FastVLM/canvas */}
        <video
          ref={processingVideoRef}
          src={src}
          autoPlay
          loop
          muted
          playsInline
          style={{ display: "none" }}
          onLoadedData={(e) => { e.currentTarget.play().catch(() => {}); }}
        />
        <canvas
          ref={canvasRef}
          className="absolute top-0 left-0 w-full h-full pointer-events-none"
          style={{ zIndex: 20, pointerEvents: "none" }}
        />
      </div>
      <button
        className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
        onClick={onToggle}
      >
        {running ? "Stop Processing" : "Start Processing"}
      </button>
    </div>
  );

  return (
    <div className="absolute inset-0 text-white">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {modelStatus}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label htmlFor="detection-prompt" className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  id="detection-prompt"
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>

              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="w-full max-w-xl">
                  {/* Wrapper holds only the image and its overlay so both share one box. */}
                  <div className="relative mb-2">
                    <img
                      ref={imageRef}
                      src={uploadedUrl}
                      alt="Uploaded"
                      className="block w-full rounded-lg shadow-lg"
                      style={{ background: "#222" }}
                    />
                    <canvas
                      ref={canvasRef}
                      className="absolute top-0 left-0 w-full h-full pointer-events-none"
                      style={{ zIndex: 10, pointerEvents: "none" }}
                    />
                  </div>
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={() => { void handleProcessImage(); }}
                    disabled={processing || !isLoaded}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}

              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) &&
                renderVideoStage(uploadedUrl, videoProcessing, handleToggleVideoProcessing)}

              {/* Show example video if no file uploaded */}
              {!uploadedFile &&
                renderVideoStage(EXAMPLE_VIDEO_URL, exampleProcessing, handleToggleExampleProcessing)}

              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}

              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>

              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}