// Spaces: Quazim0t0 / FastVLMBoxes — Running — File size: 21,714 Bytes

import React, { useState, useRef, useEffect, useCallback } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

/** Input sources the view can switch between; the mode selector renders one button per entry, in this order. */
const MODES = ["Webcam", "URL", "File"] as const;
/** One of the entries of `MODES`. */
type Mode = typeof MODES[number];

/** Example clip: initial value of the URL input and the video loaded in "File" mode while no file is selected. */
const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
/** Initial detection prompt; asks the model for a JSON array of `label` / `bbox_2d` objects in pixel coordinates. */
const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";

/**
 * Coerces loosely structured model output into a flat list of boxes.
 *
 * Accepted container shapes:
 *  - an object with an `image` array (`{ image: [...] }`),
 *  - a plain array of box objects,
 *  - a single box object.
 *
 * Accepted `bbox_2d` shapes per box:
 *  - `[x1, y1, x2, y2]`
 *  - `[[x1, y1], [x2, y2]]` (flattened to the form above)
 *
 * Entries without a usable `bbox_2d` are dropped. Coordinates must be finite
 * numbers: `NaN` and `±Infinity` are rejected so they never reach the canvas.
 * All other fields of an entry (including `label`) are passed through as-is
 * and are NOT validated.
 *
 * @param raw Parsed model output of unknown shape; any falsy value yields `[]`.
 * @returns The entries that carry a valid four-number `bbox_2d`.
 */
function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
  if (!raw) return [];

  // Resolve the container shape first; anything that is not an object or
  // array (e.g. a bare string) contributes no candidates.
  let candidates: any[] = [];
  if (typeof raw === "object" && Array.isArray(raw.image)) {
    candidates = raw.image;
  } else if (Array.isArray(raw)) {
    candidates = raw;
  } else if (typeof raw === "object") {
    candidates = [raw];
  }

  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  const isPointPair = (value: unknown): value is [unknown[], unknown[]] =>
    Array.isArray(value) &&
    value.length === 2 &&
    Array.isArray(value[0]) &&
    Array.isArray(value[1]) &&
    value[0].length === 2 &&
    value[1].length === 2;

  const normalized: { label: string, bbox_2d: number[] }[] = [];
  for (const candidate of candidates) {
    if (!candidate || !candidate.bbox_2d) continue;

    let coords: unknown = candidate.bbox_2d;
    if (isPointPair(coords)) {
      // [[x1, y1], [x2, y2]] -> [x1, y1, x2, y2]
      coords = [coords[0][0], coords[0][1], coords[1][0], coords[1][1]];
    }

    if (Array.isArray(coords) && coords.length === 4 && coords.every(isFiniteNumber)) {
      normalized.push({ ...candidate, bbox_2d: coords });
    }
  }
  return normalized;
}

/** Reports whether the file's MIME type is in the `image/` family. */
function isImageFile(file: File): boolean {
  const mimeType = file.type;
  return mimeType.indexOf("image/") === 0;
}
/** Reports whether the file's MIME type is in the `video/` family. */
function isVideoFile(file: File): boolean {
  const mimeType = file.type;
  return mimeType.indexOf("video/") === 0;
}

/** Stroke/label styling shared by every bounding-box draw call in this view. */
const BOX_STYLE = { color: "#FF00FF", lineWidth: 4, font: "20px Arial" };

/** Delay between inference attempts; a tick is skipped while the previous inference is still running. */
const INFERENCE_INTERVAL_MS = 200;

/**
 * Unloads whatever `video` is currently showing (URL, blob or MediaStream)
 * and revokes its object URL if it was a blob.
 */
function resetVideoElement(video: HTMLVideoElement | null): void {
  if (!video) return;
  if (video.src.startsWith("blob:")) {
    // Both video elements may share one blob URL; revoking twice is harmless.
    URL.revokeObjectURL(video.src);
  }
  video.srcObject = null;
  video.removeAttribute("src");
  // With no source left, load() resets the element to its empty state.
  video.load();
}

/**
 * Points `video` at a URL or a MediaStream and starts playback.
 * Playback failures (e.g. autoplay policy, aborted load) are logged, not thrown.
 */
function startPlayback(video: HTMLVideoElement, source: string | MediaStream, description: string): void {
  if (typeof source === "string") {
    video.src = source;
    video.load();
  } else {
    video.srcObject = source;
  }
  video.play().catch((e) => console.error(`Error playing ${description} video:`, e));
}

/**
 * Bounding-box detection view with three input sources: webcam, video URL,
 * or an uploaded image/video file.
 *
 * Two `<video>` elements are driven in parallel: a visible one for display and
 * a hidden one that is handed to `runInference`. For video sources, inference
 * runs on a timer while the hidden video is playing and the resulting boxes
 * are drawn on a canvas overlay. Uploaded images are processed once, on demand.
 * In "File" mode with no upload, the example video is shown without inference.
 */
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  // True only while a one-shot image inference is running; it disables the controls.
  // The streaming loop uses `isInferringRef` instead so the UI stays usable.
  const [processingState, setProcessingState] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // Drives the webcam button label
  const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // Boxes to draw on the overlay
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [imageUrl, setImageUrl] = useState<string | null>(null); // Object URL of the uploaded image

  const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
  const vlmVideoRef = useRef<HTMLVideoElement>(null);     // The hidden video passed to the model
  const canvasRef = useRef<HTMLCanvasElement>(null);      // Overlay the boxes are drawn on
  const imageRef = useRef<HTMLImageElement>(null);        // Uploaded image, when one is shown

  // Mutable mirrors of state that callbacks must read without being re-created.
  const mediaStreamRef = useRef<MediaStream | null>(null);
  const modeRef = useRef<Mode>(mode);
  const promptRef = useRef<string>(prompt);
  const isInferringRef = useRef(false); // Guards against overlapping streaming inferences

  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  const showImage = mode === "File" && uploadedFile !== null && isImageFile(uploadedFile);

  useEffect(() => {
    promptRef.current = prompt;
  }, [prompt]);

  // Create the image object URL once per uploaded file (not on every render)
  // and revoke it as soon as the file changes or the component unmounts.
  useEffect(() => {
    if (!uploadedFile || !isImageFile(uploadedFile)) {
      setImageUrl(null);
      return undefined;
    }
    const objectUrl = URL.createObjectURL(uploadedFile);
    setImageUrl(objectUrl);
    return () => URL.revokeObjectURL(objectUrl);
  }, [uploadedFile]);

  // --- Drawing loop for the visible video ---

  /** Sizes the overlay to the video's intrinsic resolution and paints the latest boxes. */
  const drawDisplayCanvas = useCallback(() => {
    const displayVideo = displayVideoRef.current;
    const canvas = canvasRef.current;
    const ctx = canvas?.getContext('2d');

    if (!displayVideo || !canvas || !ctx) {
      return;
    }

    // Only resize once the video reports valid dimensions (resizing also clears the canvas).
    if (displayVideo.videoWidth > 0 && displayVideo.videoHeight > 0 &&
        (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)) {
      canvas.width = displayVideo.videoWidth;
      canvas.height = displayVideo.videoHeight;
    }

    ctx.clearRect(0, 0, canvas.width, canvas.height);

    const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
    const scaleY = canvas.height / (displayVideo.videoHeight || 1);
    drawBoundingBoxesOnCanvas(ctx, latestBoxes, { ...BOX_STYLE, scaleX, scaleY });
  }, [latestBoxes]);

  // Runs exactly one animation-frame loop at a time. The loop is cancelled in
  // cleanup, so a new `latestBoxes` value replaces the loop instead of stacking
  // another one on top that keeps painting stale boxes.
  useEffect(() => {
    const displayVideo = displayVideoRef.current;
    if (!displayVideo || showImage) return undefined;

    let frameId: number | null = null;

    const renderFrame = () => {
      frameId = null;
      drawDisplayCanvas();
      // Keep looping only while the video is actually advancing.
      if (!displayVideo.paused && !displayVideo.ended) {
        frameId = requestAnimationFrame(renderFrame);
      }
    };

    const scheduleFrame = () => {
      if (frameId === null && displayVideo.readyState >= 1) { // HAVE_METADATA
        frameId = requestAnimationFrame(renderFrame);
      }
    };

    displayVideo.addEventListener('loadedmetadata', scheduleFrame);
    displayVideo.addEventListener('play', scheduleFrame);
    // Paint at least once right away so new boxes appear even on a paused video.
    scheduleFrame();

    return () => {
      displayVideo.removeEventListener('loadedmetadata', scheduleFrame);
      displayVideo.removeEventListener('play', scheduleFrame);
      if (frameId !== null) cancelAnimationFrame(frameId);
    };
  }, [drawDisplayCanvas, showImage]);

  // --- Streaming inference loop (reads frames from the hidden video) ---
  useEffect(() => {
    const vlmVideo = vlmVideoRef.current;
    // Continuous inference applies to webcam, URL and uploaded-video sources.
    // The example clip shown in "File" mode without an upload is not processed.
    const isVideoModeActive =
      mode === "Webcam" ||
      mode === "URL" ||
      (mode === "File" && uploadedFile !== null && isVideoFile(uploadedFile));

    if (!isLoaded || !vlmVideo || !isVideoModeActive) {
      return undefined;
    }

    let interval: ReturnType<typeof setInterval> | null = null;
    let cancelled = false; // Set on cleanup so late results from an old source are dropped

    const runTick = async () => {
      // Skip while the previous inference is in flight or no frame is available.
      if (isInferringRef.current || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0) {
        return;
      }

      isInferringRef.current = true;
      setInferenceStatus("Running inference...");
      setError(null);

      try {
        const modelOutput = await runInference(vlmVideo, promptRef.current);
        if (cancelled) return;
        setDebugOutput(modelOutput);

        const boxes = normalizeBoxes(extractJsonFromMarkdown(modelOutput) || []);
        setLatestBoxes(boxes);
        setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
      } catch (e) {
        if (cancelled) return;
        setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
        setLatestBoxes([]);
        setInferenceStatus("Inference failed.");
      } finally {
        // Always release the guard, even for a cancelled run, or the loop would stall.
        isInferringRef.current = false;
      }
    };

    const startVLMProcessing = () => {
      if (interval !== null) return;
      interval = setInterval(() => { void runTick(); }, INFERENCE_INTERVAL_MS);
    };

    const stopVLMProcessing = () => {
      if (interval !== null) {
        clearInterval(interval);
        interval = null;
      }
      setInferenceStatus("Stopped processing.");
    };

    vlmVideo.addEventListener('play', startVLMProcessing);
    vlmVideo.addEventListener('pause', stopVLMProcessing);
    vlmVideo.addEventListener('ended', stopVLMProcessing);
    vlmVideo.addEventListener('loadeddata', startVLMProcessing);

    // The video may already be playing when this effect (re)runs.
    if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
      startVLMProcessing();
    }

    return () => {
      cancelled = true;
      if (interval !== null) clearInterval(interval);
      vlmVideo.removeEventListener('play', startVLMProcessing);
      vlmVideo.removeEventListener('pause', stopVLMProcessing);
      vlmVideo.removeEventListener('ended', stopVLMProcessing);
      vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
    };
  }, [mode, isLoaded, runInference, uploadedFile]);

  // --- Media source handling ---

  /** Stops the webcam stream (if any) and unloads both video elements. Touches no React state. */
  const releaseMedia = useCallback(() => {
    mediaStreamRef.current?.getTracks().forEach((track) => track.stop());
    mediaStreamRef.current = null;
    resetVideoElement(displayVideoRef.current);
    resetVideoElement(vlmVideoRef.current);
  }, []);

  /**
   * Releases the current source and resets all per-source UI state.
   * Reads the stream from a ref, so its identity is stable and effects that
   * depend on it do not re-run (and tear the source down) when a stream starts.
   */
  const cleanupMediaSource = useCallback(() => {
    releaseMedia();
    const canvas = canvasRef.current;
    canvas?.getContext('2d')?.clearRect(0, 0, canvas.width, canvas.height);
    setMediaStream(null);
    setLatestBoxes([]);
    setError(null);
    setInferenceStatus("");
    setDebugOutput("");
    setUploadedFile(null);
  }, [releaseMedia]);

  /** Starts the same source on the display and the hidden video. Returns false if either element is missing. */
  const playOnBothVideos = useCallback((source: string | MediaStream, descriptionPrefix = ""): boolean => {
    const displayVideo = displayVideoRef.current;
    const vlmVideo = vlmVideoRef.current;
    if (!displayVideo || !vlmVideo) return false;
    startPlayback(displayVideo, source, `${descriptionPrefix}display`);
    startPlayback(vlmVideo, source, `${descriptionPrefix}VLM`);
    return true;
  }, []);

  // Switching tabs discards the previous source. "File" mode starts with the example video.
  // This deliberately does NOT depend on `uploadedFile` or the media stream:
  // re-running here after an upload or webcam start would wipe the new source.
  useEffect(() => {
    modeRef.current = mode;
    cleanupMediaSource();
    if (mode === "File") {
      playOnBothVideos(EXAMPLE_VIDEO_URL, "example ");
    }
  }, [mode, cleanupMediaSource, playOnBothVideos]);

  // Release the camera and any blob URLs when the component unmounts.
  useEffect(() => {
    // Capture the elements now; the refs are already cleared when cleanup runs on unmount.
    const displayVideo = displayVideoRef.current;
    const vlmVideo = vlmVideoRef.current;
    return () => {
      mediaStreamRef.current?.getTracks().forEach((track) => track.stop());
      mediaStreamRef.current = null;
      resetVideoElement(displayVideo);
      resetVideoElement(vlmVideo);
    };
  }, []);

  // The three handlers below are only reachable from their own tab, so none of
  // them needs to change `mode` (doing so would trigger the tab-switch cleanup).

  /** Requests camera access and streams it into both video elements. */
  const handleWebcamInput = useCallback(async () => {
    cleanupMediaSource();
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ video: true });

      // The user may have left the Webcam tab while the permission prompt was open.
      if (modeRef.current !== "Webcam") {
        stream.getTracks().forEach((track) => track.stop());
        return;
      }

      // Drop a stream from an overlapping request before adopting this one.
      releaseMedia();
      mediaStreamRef.current = stream;
      setMediaStream(stream);
      playOnBothVideos(stream);
    } catch (e) {
      setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
      setMediaStream(null);
      setLatestBoxes([]);
      setInferenceStatus("Webcam access denied or failed.");
    }
  }, [cleanupMediaSource, releaseMedia, playOnBothVideos]);

  /** Loads the URL typed into the input (when the Load button is clicked). */
  const handleLoadUrl = useCallback(() => {
    cleanupMediaSource();

    const url = currentUrlInput.trim();
    if (!url) {
      setError("Please enter a valid URL.");
      return;
    }

    playOnBothVideos(url);
  }, [currentUrlInput, cleanupMediaSource, playOnBothVideos]);

  /** Handles a file selection: images are shown for one-shot processing, videos start streaming. */
  const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
    cleanupMediaSource();

    const file = e.target.files?.[0] ?? null;

    if (file && isImageFile(file)) {
      // The <img> element and its object URL are derived from `uploadedFile`.
      setUploadedFile(file);
      return;
    }

    if (file && isVideoFile(file)) {
      const fileUrl = URL.createObjectURL(file);
      if (playOnBothVideos(fileUrl)) {
        setUploadedFile(file);
      } else {
        URL.revokeObjectURL(fileUrl);
      }
      return;
    }

    if (file) {
      setError("Unsupported file type. Please upload an image or video.");
    }
    // Nothing usable selected: fall back to the example video.
    playOnBothVideos(EXAMPLE_VIDEO_URL, "example ");
  }, [cleanupMediaSource, playOnBothVideos]);

  /** Runs a single inference on the uploaded image and draws the resulting boxes. */
  const handleProcessImage = async () => {
    const img = imageRef.current;
    const canvas = canvasRef.current;
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !img || !canvas || img.naturalWidth === 0) {
      setError("Image or model not ready for processing, or no image file selected.");
      return;
    }

    const ctx = canvas.getContext("2d");
    if (!ctx) return;

    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;

    setProcessingState(true);
    setError(null);
    setInferenceStatus("Running image inference...");

    try {
      const modelOutput = await runInference(img, prompt);
      setDebugOutput(modelOutput);
      setInferenceStatus("Image inference complete.");

      ctx.clearRect(0, 0, canvas.width, canvas.height);
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height); // Redraw image

      const boxes = normalizeBoxes(extractJsonFromMarkdown(modelOutput) || []);
      setLatestBoxes(boxes);
      // The canvas matches the image's natural size, so pixel coordinates map 1:1.
      drawBoundingBoxesOnCanvas(ctx, boxes, { ...BOX_STYLE, scaleX: 1, scaleY: 1 });

      if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
    } catch (e) {
      setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
      setLatestBoxes([]);
      setInferenceStatus("Image inference failed.");
    } finally {
      setProcessingState(false);
    }
  };

  // --- Rendered UI ---
  return (
    <div className="absolute inset-0 text-white flex flex-col">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>

      <div className="flex flex-col items-center justify-center flex-1 w-full p-4">
        {/* Mode Selector */}
        <div className="mb-6 mt-4">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
                disabled={!isLoaded && m !== "File"}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Dynamic Content Area */}
        <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
          {/* Prompt Input (Common to all modes) */}
          <div className="mb-4 w-full max-w-xl">
            <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
            <textarea
              className="w-full p-2 rounded-lg text-black"
              rows={3}
              value={prompt}
              onChange={(e) => setPrompt(e.target.value)}
              disabled={processingState}
            />
          </div>

          {/* Video/Image Display and Canvas Overlay */}
          <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
            {showImage && imageUrl && (
              <img
                ref={imageRef}
                src={imageUrl}
                alt="Uploaded"
                className="max-w-full max-h-full block object-contain"
                style={{ position: 'absolute' }}
                onLoad={() => {
                  if (imageRef.current && canvasRef.current) {
                    canvasRef.current.width = imageRef.current.naturalWidth;
                    canvasRef.current.height = imageRef.current.naturalHeight;
                  }
                }}
              />
            )}
            {/* Always mounted (only hidden while an image is shown) so the source handlers can rely on its ref. */}
            <video
              ref={displayVideoRef}
              autoPlay
              muted
              playsInline
              loop
              className="max-w-full max-h-full block object-contain"
              style={{ position: 'absolute', display: showImage ? 'none' : undefined }}
            />
            <canvas
              ref={canvasRef}
              className="absolute top-0 left-0 w-full h-full pointer-events-none"
              style={{ zIndex: 10 }}
            />
          </div>

          {/* Controls specific to each mode */}
          <div className="mt-4 flex flex-col items-center gap-2">
            {mode === "Webcam" && (
              <button
                className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                onClick={handleWebcamInput}
                disabled={processingState || !isLoaded}
              >
                {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
              </button>
            )}

            {mode === "URL" && (
              <>
                <div className="flex w-full max-w-xl">
                  <input
                    type="text"
                    className="flex-1 px-4 py-2 rounded-l-lg text-black"
                    value={currentUrlInput}
                    onChange={(e) => setCurrentUrlInput(e.target.value)}
                    placeholder="Paste video URL here"
                    disabled={processingState}
                  />
                  <button
                    className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                    onClick={handleLoadUrl}
                    disabled={processingState || !isLoaded}
                  >
                    Load URL
                  </button>
                </div>
              </>
            )}

            {mode === "File" && (
              <>
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
                  disabled={processingState}
                />
                {uploadedFile && isImageFile(uploadedFile) && (
                  <button
                    className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                    onClick={handleProcessImage}
                    disabled={processingState || !isLoaded}
                  >
                    {processingState ? "Processing Image..." : "Process Image"}
                  </button>
                )}
              </>
            )}
          </div>

          {/* Error and Debug Output */}
          {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
          <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
            <div>Raw Model Output:</div>
            <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
          </div>
        </div>
      </div>

      {/* Hidden Video for VLM processing - this must be rendered always */}
      <video
        ref={vlmVideoRef}
        autoPlay
        muted
        playsInline
        loop
        style={{ display: 'none' }} // Hidden from view
      />
    </div>
  );
}