// NOTE(review): this source arrived collapsed onto a few physical lines; it has been
// re-wrapped to conventional formatting with tokens unchanged. The pre-existing `//`
// comments below only behave correctly once statements are on separate lines.
import React, { useState, useRef, useEffect } from "react";
// NOTE(review): FASTVLM_BOXING_PROMPT is not referenced anywhere in this visible
// chunk — confirm it is used further down the file before removing.
import { FASTVLM_BOXING_PROMPT } from "../constants";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

// The three supported video sources. `as const` narrows to a readonly tuple of literals.
const MODES = ["Webcam", "URL", "File"] as const;
// Union of the literal mode names: "Webcam" | "URL" | "File".
// NOTE(review): `Mode` is never applied to the `mode` state below (`useState("URL")`
// infers plain `string`) — consider `useState<Mode>("URL")` in a follow-up change.
type Mode = typeof MODES[number];

// Demo video and prompt used to pre-populate the URL mode inputs.
const EXAMPLE_VIDEO_URL = "https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";
const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";

/**
 * Multi-source video captioning view.
 *
 * Lets the user pick a video source (webcam, URL, or file), then — once the VLM is
 * loaded — polls the playing video once per second: each tick draws the current frame
 * onto a canvas, runs VLM inference with the current prompt, parses the JSON bounding
 * boxes out of the (possibly markdown-fenced) result, and draws them over the frame.
 *
 * NOTE(review): the webcam-mode and URL-mode polling effects below are near-identical
 * copies of each other; a shared `processFrame` helper would remove the duplication.
 */
export default function MultiSourceCaptioningView() {
  // Active source tab. NOTE(review): inferred as `string`; see `Mode` note above.
  const [mode, setMode] = useState("URL");
  // URL currently loaded into the player vs. the text-field draft value.
  const [videoUrl, setVideoUrl] = useState(EXAMPLE_VIDEO_URL);
  const [inputUrl, setInputUrl] = useState(EXAMPLE_VIDEO_URL);
  // Prompt sent to the VLM on every processed frame.
  const [prompt, setPrompt] = useState(EXAMPLE_PROMPT);
  // True while an inference call is in flight for the current frame.
  const [processing, setProcessing] = useState(false);
  // Last error message, or null.
  // NOTE(review): under strict TS this infers type `null`, so the string `setError`
  // calls below cannot type-check — needs `useState<string | null>(null)`.
  const [error, setError] = useState(null);
  // True once getUserMedia succeeded and the stream is attached to the <video>.
  const [webcamActive, setWebcamActive] = useState(false);
  // NOTE(review): these refs need element type parameters under strict TS, e.g.
  // useRef<HTMLVideoElement | null>(null) / useRef<HTMLCanvasElement | null>(null)
  // / useRef<MediaStream | null>(null); as written they infer `null`.
  const videoRef = useRef(null);
  const canvasRef = useRef(null);
  // Holds the live MediaStream so its tracks can be stopped on teardown/mode switch.
  const webcamStreamRef = useRef(null);
  const { isLoaded, runInference } = useVLMContext();

  // Webcam setup and teardown
  // Acquires the camera when mode === "Webcam"; stops all tracks when leaving that
  // mode or on unmount.
  useEffect(() => {
    if (mode !== "Webcam") {
      // Leaving webcam mode: release the camera immediately (cleanup below also
      // does this, but cleanup only runs on the NEXT mode change or unmount).
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
      return;
    }
    // NOTE(review): `stopped` is set in the cleanup but never READ — setupWebcam
    // does not check it, so a slow getUserMedia can resolve after teardown and call
    // setWebcamActive(true) / attach srcObject against a dead effect. Guard the
    // post-await code with `if (stopped) ...` (and stop the fresh stream) to fix.
    let stopped = false;
    const setupWebcam = async () => {
      try {
        setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
      } catch (e) {
        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
        setWebcamActive(false);
      }
    };
    setupWebcam();
    return () => {
      stopped = true;
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
    };
  }, [mode]);

  // Process webcam frames
  // Once per second: draw the current webcam frame to the canvas, run inference,
  // redraw the frame, and overlay the parsed bounding boxes.
  useEffect(() => {
    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
    // NOTE(review): `NodeJS.Timeout` is the Node typing; in browser code prefer
    // `ReturnType<typeof setInterval>` so this compiles without @types/node.
    let interval: NodeJS.Timeout | null = null;
    let stopped = false;
    const processFrame = async () => {
      if (!videoRef.current || !canvasRef.current) return;
      const video = videoRef.current;
      const canvas = canvasRef.current;
      // Skip until the video has real dimensions (metadata loaded).
      if (video.videoWidth === 0) return;
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      try {
        setProcessing(true);
        setError(null);
        // Use FastVLM inference on the current frame
        // NOTE(review): `fakeVideo` is a plain object cast to HTMLVideoElement —
        // presumably runInference only reads videoWidth/videoHeight/getContext;
        // verify against its implementation, since the cast hides any mismatch.
        const fakeVideo = {
          videoWidth: canvas.width,
          videoHeight: canvas.height,
          // @ts-ignore
          getContext: () => ctx,
        } as HTMLVideoElement;
        const result = await runInference(fakeVideo, prompt);
        // Clear canvas and redraw frame
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        // Parse and draw boxes
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        setError(e instanceof Error ? e.message : String(e));
      } finally {
        setProcessing(false);
      }
    };
    // NOTE(review): fixed 1 s cadence with no overlap guard — if inference takes
    // longer than a second, ticks queue up concurrent processFrame calls.
    interval = setInterval(() => {
      if (!stopped) processFrame();
    }, 1000);
    return () => {
      stopped = true;
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference, webcamActive]);

  // Process video frames for URL mode
  // Same 1 s polling pipeline as the webcam effect, but only while the URL video is
  // actually playing (skips when paused/ended). NOTE(review): duplicated logic —
  // see component-level note.
  useEffect(() => {
    if (mode !== "URL" || !isLoaded) return;
    let interval: NodeJS.Timeout | null = null;
    let stopped = false;
    const processFrame = async () => {
      if (!videoRef.current || !canvasRef.current) return;
      const video = videoRef.current;
      const canvas = canvasRef.current;
      if (video.paused || video.ended || video.videoWidth === 0) return;
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      try {
        setProcessing(true);
        setError(null);
        // Use FastVLM inference on the current frame
        const fakeVideo = {
          videoWidth: canvas.width,
          videoHeight: canvas.height,
          // @ts-ignore
          getContext: () => ctx,
        } as HTMLVideoElement;
        const result = await runInference(fakeVideo, prompt);
        // Clear canvas and redraw frame
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        // Parse and draw boxes
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        setError(e instanceof Error ? e.message : String(e));
      } finally {
        setProcessing(false);
      }
    };
    interval = setInterval(() => {
      if (!stopped) processFrame();
    }, 1000);
    return () => {
      stopped = true;
      if (interval) clearInterval(interval);
    };
  }, [mode, isLoaded, prompt, runInference]);

  // NOTE(review): the JSX below is incomplete in this excerpt — the wrapping
  // container elements and the body of the MODES.map callback (the mode buttons)
  // are missing, and the markup continues past the visible end of the file.
  // Left byte-for-byte as found; do not treat this as the full render tree.
  return (
    {/* Mode Selector */}
    {MODES.map((m) => ( ))}
    {/* Mode Content */}
    {mode === "Webcam" && (