// FastVLMBoxes/src/components/MultiSourceCaptioningView.tsx
import React, { useState, useRef, useEffect } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
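// Working assumption about the data flow: extractJsonFromMarkdown pulls a JSON
// array out of the model's markdown reply and drawBoundingBoxesOnCanvas renders
// it. Neither helper is typed in this file, so below is a minimal sketch of the
// box shape this view is assumed to pass between them (the real definition
// lives in BoxAnnotator and may differ):
//
//   interface DetectedBox {
//     label?: string; // optional class name for the box
//     x: number;      // top-left corner, in canvas pixels
//     y: number;
//     width: number;
//     height: number;
//   }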
const MODES = ["Webcam", "URL", "File"] as const;
type Mode = typeof MODES[number];
const EXAMPLE_VIDEO_URL =
"https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";
const EXAMPLE_PROMPT = "Find as many objects as you can in the video and box them.";
export default function MultiSourceCaptioningView() {
const [mode, setMode] = useState<Mode>("URL");
const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
const [processing, setProcessing] = useState(false);
const [error, setError] = useState<string | null>(null);
const [webcamActive, setWebcamActive] = useState(false);
const videoRef = useRef<HTMLVideoElement | null>(null);
const canvasRef = useRef<HTMLCanvasElement | null>(null);
const webcamStreamRef = useRef<MediaStream | null>(null);
const { isLoaded, runInference } = useVLMContext();
// Webcam setup and teardown
useEffect(() => {
if (mode !== "Webcam") {
if (webcamStreamRef.current) {
webcamStreamRef.current.getTracks().forEach((track) => track.stop());
webcamStreamRef.current = null;
}
setWebcamActive(false);
return;
}
let stopped = false;
const setupWebcam = async () => {
try {
setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        if (stopped) {
          // Cleanup ran while the permission prompt was open; release the stream
          stream.getTracks().forEach((track) => track.stop());
          return;
        }
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
} catch (e) {
setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
setWebcamActive(false);
}
};
setupWebcam();
return () => {
stopped = true;
if (webcamStreamRef.current) {
webcamStreamRef.current.getTracks().forEach((track) => track.stop());
webcamStreamRef.current = null;
}
setWebcamActive(false);
};
}, [mode]);
// Process webcam frames
useEffect(() => {
if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    let stopped = false;
    let inFlight = false; // skip ticks while an inference call is still pending
    const processFrame = async () => {
      if (inFlight || !videoRef.current || !canvasRef.current) return;
const video = videoRef.current;
const canvas = canvasRef.current;
if (video.videoWidth === 0) return;
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
const ctx = canvas.getContext("2d");
if (!ctx) return;
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      try {
        inFlight = true;
        setProcessing(true);
        setError(null);
        // Run FastVLM inference on the current frame. runInference is handed a
        // canvas-backed stand-in for the video element; it is assumed to read
        // only the dimensions and the 2D context exposed here.
        const fakeVideo = {
          videoWidth: canvas.width,
          videoHeight: canvas.height,
          getContext: () => ctx,
        } as unknown as HTMLVideoElement;
        const result = await runInference(fakeVideo, prompt);
        if (stopped) return;
        // Redraw the frame, then overlay the parsed bounding boxes
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        setError(e instanceof Error ? e.message : String(e));
      } finally {
        inFlight = false;
        setProcessing(false);
      }
};
interval = setInterval(() => {
if (!stopped) processFrame();
}, 1000);
return () => {
stopped = true;
if (interval) clearInterval(interval);
};
}, [mode, isLoaded, prompt, runInference, webcamActive]);
// Process video frames for URL mode
useEffect(() => {
if (mode !== "URL" || !isLoaded) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    let stopped = false;
    let inFlight = false; // skip ticks while an inference call is still pending
    const processFrame = async () => {
      if (inFlight || !videoRef.current || !canvasRef.current) return;
const video = videoRef.current;
const canvas = canvasRef.current;
if (video.paused || video.ended || video.videoWidth === 0) return;
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
const ctx = canvas.getContext("2d");
if (!ctx) return;
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      try {
        inFlight = true;
        setProcessing(true);
        setError(null);
        // Run FastVLM inference on the current frame. runInference is handed a
        // canvas-backed stand-in for the video element; it is assumed to read
        // only the dimensions and the 2D context exposed here.
        const fakeVideo = {
          videoWidth: canvas.width,
          videoHeight: canvas.height,
          getContext: () => ctx,
        } as unknown as HTMLVideoElement;
        const result = await runInference(fakeVideo, prompt);
        if (stopped) return;
        // Redraw the frame, then overlay the parsed bounding boxes
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        setError(e instanceof Error ? e.message : String(e));
      } finally {
        inFlight = false;
        setProcessing(false);
      }
};
interval = setInterval(() => {
if (!stopped) processFrame();
}, 1000);
return () => {
stopped = true;
if (interval) clearInterval(interval);
};
}, [mode, isLoaded, prompt, runInference]);
return (
<div className="absolute inset-0 text-white">
<div className="flex flex-col items-center justify-center h-full w-full">
{/* Mode Selector */}
<div className="mb-6">
<div className="flex space-x-4">
{MODES.map((m) => (
<button
key={m}
className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
}`}
onClick={() => setMode(m)}
>
{m}
</button>
))}
</div>
</div>
{/* Mode Content */}
<div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
{mode === "Webcam" && (
<div className="w-full text-center flex flex-col items-center">
<div className="mb-4 w-full max-w-xl">
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
<textarea
className="w-full p-2 rounded-lg text-black"
rows={3}
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
/>
</div>
<div className="relative w-full max-w-xl">
<video
ref={videoRef}
autoPlay
muted
playsInline
className="w-full rounded-lg shadow-lg mb-2"
style={{ background: "#222" }}
/>
<canvas
ref={canvasRef}
className="absolute top-0 left-0 w-full h-full pointer-events-none"
              style={{ zIndex: 10 }}
/>
</div>
{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
{error && <div className="text-red-400 mt-2">Error: {error}</div>}
</div>
)}
{mode === "URL" && (
<div className="w-full text-center flex flex-col items-center">
<p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
<div className="flex w-full max-w-xl mb-4">
<input
type="text"
className="flex-1 px-4 py-2 rounded-l-lg text-black"
value={inputUrl}
onChange={(e) => setInputUrl(e.target.value)}
placeholder="Paste video URL here"
/>
<button
className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
onClick={() => setVideoUrl(inputUrl)}
>
Load
</button>
</div>
<div className="mb-4 w-full max-w-xl">
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
<textarea
className="w-full p-2 rounded-lg text-black"
rows={3}
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
/>
</div>
<div className="relative w-full max-w-xl">
<video
ref={videoRef}
src={videoUrl}
controls
autoPlay
loop
className="w-full rounded-lg shadow-lg mb-2"
style={{ background: "#222" }}
/>
<canvas
ref={canvasRef}
className="absolute top-0 left-0 w-full h-full pointer-events-none"
                style={{ zIndex: 10 }}
/>
</div>
{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
{error && <div className="text-red-400 mt-2">Error: {error}</div>}
</div>
)}
{mode === "File" && (
<div className="w-full text-center">
<p className="mb-4">Upload a video or image file for detection (coming soon).</p>
</div>
)}
</div>
</div>
</div>
);
}
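// Usage sketch (hypothetical wiring, not confirmed by this file): the view
// only needs a context that supplies { isLoaded, runInference } through
// useVLMContext, e.g.
//
//   import MultiSourceCaptioningView from "./components/MultiSourceCaptioningView";
//
//   export default function App() {
//     return (
//       <VLMProvider> {/* assumed provider name from ../context */}
//         <MultiSourceCaptioningView />
//       </VLMProvider>
//     );
//   }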