// Source: Hugging Face Space "Quazim0t0/FastVLMBoxes" (status: Running)
// File size: 10,280 bytes · revision 1a9c884

import React, { useState, useRef, useEffect } from "react";
import { FASTVLM_BOXING_PROMPT } from "../constants";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

/** Detection prompt the prompt textarea starts with. */
const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";

/** Video URL that URL mode starts with, both in the input box and in the player. */
const EXAMPLE_VIDEO_URL =
  "https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";

/** Input sources offered by the mode selector, in display order. */
const MODES = ["Webcam", "URL", "File"] as const;

/** Union of the literal source names listed in `MODES`. */
type Mode = (typeof MODES)[number];

/** Delay between two frame-sampling attempts, in milliseconds. */
const FRAME_INTERVAL_MS = 1000;

/**
 * Object-detection view with three selectable sources: Webcam, URL and File.
 *
 * In Webcam and URL mode the current video frame is sampled once per
 * `FRAME_INTERVAL_MS`, sent to `runInference` together with the user's prompt,
 * and the boxes parsed from the reply are drawn on a canvas overlaying the
 * video. File mode only renders a placeholder message.
 *
 * @returns The rendered mode selector and the content of the active mode.
 */
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("URL");
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [webcamActive, setWebcamActive] = useState(false);

  const videoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const webcamStreamRef = useRef<MediaStream | null>(null);
  const { isLoaded, runInference } = useVLMContext();

  // Webcam setup and teardown
  useEffect(() => {
    // Stops every track of the stored stream (if any) and marks the webcam inactive.
    const releaseWebcam = () => {
      const active = webcamStreamRef.current;
      if (active) {
        active.getTracks().forEach((track) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
    };

    if (mode !== "Webcam") {
      releaseWebcam();
      return;
    }

    let cancelled = false;
    const setupWebcam = async () => {
      try {
        setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        if (cancelled) {
          // The effect was cleaned up while the permission prompt was pending:
          // nobody owns this stream, so stop it here or the camera stays on.
          stream.getTracks().forEach((track) => track.stop());
          return;
        }
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
      } catch (e) {
        // A failure that arrives after cleanup no longer concerns the current mode.
        if (cancelled) return;
        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
        setWebcamActive(false);
      }
    };
    void setupWebcam();

    return () => {
      cancelled = true;
      releaseWebcam();
    };
  }, [mode]);

  // Process frames for Webcam and URL mode
  useEffect(() => {
    // Webcam mode waits for the stream to be attached; URL mode only needs the model.
    const sourceReady = mode === "Webcam" ? webcamActive : mode === "URL";
    if (!isLoaded || !sourceReady) return;

    // Only the URL player can be paused or reach its end; skip sampling then.
    const requirePlayback = mode === "URL";
    let cancelled = false;
    let busy = false;

    const processFrame = async () => {
      const video = videoRef.current;
      const canvas = canvasRef.current;
      // `busy` keeps a slow inference from overlapping with the next tick.
      if (busy || !video || !canvas) return;
      if (video.videoWidth === 0) return;
      if (requirePlayback && (video.paused || video.ended)) return;

      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.drawImage(video, 0, 0, canvas.width, canvas.height);

      busy = true;
      try {
        setProcessing(true);
        setError(null);
        // NOTE(review): the real <video> element is passed instead of a
        // hand-built stand-in object, since the argument is treated as an
        // HTMLVideoElement — confirm the context implementation reads the
        // frame from the element itself.
        const result = await runInference(video, prompt);
        // Results that arrive after cleanup belong to an outdated mode/prompt.
        if (cancelled) return;
        // Redraw the frame, then the boxes on top of it
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        if (!cancelled) {
          setError(e instanceof Error ? e.message : String(e));
        }
      } finally {
        busy = false;
        setProcessing(false);
      }
    };

    const timer = window.setInterval(() => {
      void processFrame();
    }, FRAME_INTERVAL_MS);

    return () => {
      cancelled = true;
      window.clearInterval(timer);
    };
  }, [mode, isLoaded, prompt, runInference, webcamActive]);

  // Pieces shared by the Webcam and URL layouts.
  const promptEditor = (
    <div className="mb-4 w-full max-w-xl">
      <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
      <textarea
        className="w-full p-2 rounded-lg text-black"
        rows={3}
        value={prompt}
        onChange={(e) => setPrompt(e.target.value)}
      />
    </div>
  );

  const overlayCanvas = (
    <canvas
      ref={canvasRef}
      className="absolute top-0 left-0 w-full h-full pointer-events-none"
      style={{ zIndex: 10, pointerEvents: "none" }}
    />
  );

  const statusLines = (
    <>
      {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
      {error && <div className="text-red-400 mt-2">Error: {error}</div>}
    </>
  );

  return (
    <div className="absolute inset-0 text-white">
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "Webcam" && (
            <div className="w-full text-center flex flex-col items-center">
              {promptEditor}
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  autoPlay
                  muted
                  playsInline
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                {overlayCanvas}
              </div>
              {statusLines}
            </div>
          )}
          {mode === "URL" && (
            <div className="w-full text-center flex flex-col items-center">
              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
              <div className="flex w-full max-w-xl mb-4">
                <input
                  type="text"
                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
                  value={inputUrl}
                  onChange={(e) => setInputUrl(e.target.value)}
                  placeholder="Paste video URL here"
                />
                <button
                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
                  onClick={() => setVideoUrl(inputUrl)}
                >
                  Load
                </button>
              </div>
              {promptEditor}
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  src={videoUrl}
                  controls
                  autoPlay
                  loop
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                {overlayCanvas}
              </div>
              {statusLines}
            </div>
          )}
          {mode === "File" && (
            <div className="w-full text-center">
              <p className="mb-4">Upload a video or image file for detection (coming soon).</p>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}