// src/components/MultiSourceCaptioningView.tsx
import React, { useState, useRef, useEffect, useCallback, useMemo } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
const MODES = ["Webcam", "URL", "File"] as const;
type Mode = typeof MODES[number];
const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
// Helper: coerce the model's raw JSON output into a flat list of { label, bbox_2d } boxes
function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
if (!raw) return [];
let boxes: any[] = [];
if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
boxes = raw.image;
} else if (Array.isArray(raw)) {
boxes = raw;
} else if (typeof raw === "object" && raw !== null) {
boxes = [raw];
}
return boxes
.map((obj: any) => {
if (!obj || !obj.bbox_2d) return null;
let bbox = obj.bbox_2d;
if (
Array.isArray(bbox) &&
bbox.length === 2 &&
Array.isArray(bbox[0]) &&
Array.isArray(bbox[1]) &&
bbox[0].length === 2 &&
bbox[1].length === 2
) {
bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
}
if (
Array.isArray(bbox) &&
bbox.length === 4 &&
bbox.every((v: any) => typeof v === "number")
) {
return { ...obj, bbox_2d: bbox };
}
return null;
})
.filter((obj: any) => obj);
}
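// Illustrative examples (not executed) of the shapes normalizeBoxes accepts:
//   normalizeBoxes([{ label: "person", bbox_2d: [100, 50, 200, 300] }])
//     -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]   // flat [x1, y1, x2, y2]
//   normalizeBoxes([{ label: "person", bbox_2d: [[100, 50], [200, 300]] }])
//     -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]   // nested pairs flattened
//   A bare object or a { image: [...] } wrapper is unwrapped the same way;
//   anything without a usable bbox_2d is dropped.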
function isImageFile(file: File) {
return file.type.startsWith("image/");
}
function isVideoFile(file: File) {
return file.type.startsWith("video/");
}
export default function MultiSourceCaptioningView() {
const [mode, setMode] = useState<Mode>("File");
const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
const [processingState, setProcessingState] = useState(false); // General processing indicator (drives the UI)
const [error, setError] = useState<string | null>(null);
const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream (UI state only)
const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
const [inferenceStatus, setInferenceStatus] = useState<string>("");
const [debugOutput, setDebugOutput] = useState<string>("");
const [uploadedFile, setUploadedFile] = useState<File | null>(null); // Currently selected file, if any
// Refs for the two video elements and the canvas
const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
const imageRef = useRef<HTMLImageElement>(null); // For image file processing
const mediaStreamRef = useRef<MediaStream | null>(null); // Mirrors mediaStream so cleanup keeps a stable identity
const isInferenceBusyRef = useRef(false); // Guards against overlapping inferences without re-creating the interval
const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
// Create one object URL per selected image (revoked on change/unmount) instead of
// creating a new URL on every render, which would leak blob URLs.
const uploadedImageUrl = useMemo(
() => (uploadedFile && isImageFile(uploadedFile) ? URL.createObjectURL(uploadedFile) : null),
[uploadedFile]
);
useEffect(() => {
return () => { if (uploadedImageUrl) URL.revokeObjectURL(uploadedImageUrl); };
}, [uploadedImageUrl]);
// --- Drawing Loop for the Visible Display ---
// Draws the latest boxes over the display video. The effect below owns the
// requestAnimationFrame loop so stale loops can be cancelled.
const drawDisplayCanvas = useCallback(() => {
const displayVideo = displayVideoRef.current;
const canvas = canvasRef.current;
const ctx = canvas?.getContext('2d');
if (!displayVideo || !canvas || !ctx) {
return;
}
// Match the canvas size to the video's intrinsic dimensions, once they are known
if (displayVideo.videoWidth > 0 && displayVideo.videoHeight > 0 &&
(canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)) {
canvas.width = displayVideo.videoWidth;
canvas.height = displayVideo.videoHeight;
}
// Clear the previous frame, then draw the latest bounding boxes
ctx.clearRect(0, 0, canvas.width, canvas.height);
const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
const scaleY = canvas.height / (displayVideo.videoHeight || 1);
drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
}, [latestBoxes]); // Re-created when latestBoxes changes
// Effect that owns the rAF loop: it cancels any pending frame request whenever
// drawDisplayCanvas changes or the component unmounts, so stale loops never pile up.
useEffect(() => {
const displayVideo = displayVideoRef.current;
if (!displayVideo) return;
let rafId = 0;
const loop = () => {
drawDisplayCanvas();
// Only schedule the next frame while the video is playing
if (!displayVideo.paused && !displayVideo.ended) {
rafId = requestAnimationFrame(loop);
}
};
const handleVideoReady = () => {
cancelAnimationFrame(rafId);
if (displayVideo.readyState >= 1) { // HAVE_METADATA
rafId = requestAnimationFrame(loop);
}
};
displayVideo.addEventListener('loadedmetadata', handleVideoReady);
displayVideo.addEventListener('play', handleVideoReady); // Also restart on play
// The video may already be ready (e.g., on component re-mount or autoplay)
handleVideoReady();
return () => {
cancelAnimationFrame(rafId);
displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
displayVideo.removeEventListener('play', handleVideoReady);
};
}, [drawDisplayCanvas]);
// --- FastVLM Processing Loop (from hidden video) ---
// This interval loop controls when FastVLM processes a frame
useEffect(() => {
const vlmVideo = vlmVideoRef.current;
// Determine if we are in a video-based mode that requires continuous processing
const isVideoModeActive = (
mode === "Webcam" ||
(mode === "URL" && !!vlmVideo?.src) || // Check if URL video is loaded
(mode === "File" && !!vlmVideo?.src && uploadedFile && isVideoFile(uploadedFile))
);
if (!isLoaded || !vlmVideo || !isVideoModeActive) {
setProcessingState(false);
return;
}
let interval: ReturnType<typeof setInterval> | null = null;
const startVLMProcessing = () => {
if (interval) clearInterval(interval); // Clear any old interval
interval = setInterval(async () => {
// Skip if the video is not ready, or a previous inference is still in flight
if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || isInferenceBusyRef.current) {
return;
}
isInferenceBusyRef.current = true;
setProcessingState(true);
setInferenceStatus("Running inference...");
setError(null);
try {
// Pass the HTMLVideoElement directly to runInference
const modelOutput = await runInference(vlmVideo, prompt);
setDebugOutput(modelOutput);
let boxes = extractJsonFromMarkdown(modelOutput) || [];
boxes = normalizeBoxes(boxes);
setLatestBoxes(boxes);
setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
} catch (e) {
setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
setLatestBoxes([]);
setInferenceStatus("Inference failed.");
} finally {
isInferenceBusyRef.current = false;
setProcessingState(false);
}
}, 200); // Poll every 200 ms, i.e. at most ~5 inferences per second
};
const stopVLMProcessing = () => {
if (interval) clearInterval(interval);
interval = null;
isInferenceBusyRef.current = false; // Reset so the next session can start cleanly
setProcessingState(false);
setInferenceStatus("Stopped processing.");
};
vlmVideo.addEventListener('play', startVLMProcessing);
vlmVideo.addEventListener('pause', stopVLMProcessing);
vlmVideo.addEventListener('ended', stopVLMProcessing);
vlmVideo.addEventListener('loadeddata', startVLMProcessing); // Also start on loadeddata for better reliability
// Initial check if video is already playing or ready
if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
startVLMProcessing();
}
return () => {
stopVLMProcessing();
vlmVideo.removeEventListener('play', startVLMProcessing);
vlmVideo.removeEventListener('pause', stopVLMProcessing);
vlmVideo.removeEventListener('ended', stopVLMProcessing);
vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
};
// processingState is intentionally not a dependency: the busy flag lives in
// isInferenceBusyRef, so inference-driven state updates do not tear down the interval.
}, [mode, isLoaded, prompt, runInference, uploadedFile]); // uploadedFile re-triggers setup on file change
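// Illustrative sketch of the parsing pipeline above (the exact reply format depends
// on the model; extractJsonFromMarkdown is assumed to strip an optional ```json fence
// before parsing):
//   modelOutput: "```json\n[{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]\n```"
//   extractJsonFromMarkdown(modelOutput) -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]
//   normalizeBoxes(...) -> same array, validated and flattened for drawing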
// --- Media Source Handling ---
// Cleanup for media stream and object URLs. Reads the stream from mediaStreamRef
// (not from state) so the callback identity stays stable and the effects that
// depend on it do not re-run every time the stream or file changes.
const cleanupMediaSource = useCallback(() => {
if (mediaStreamRef.current) {
mediaStreamRef.current.getTracks().forEach(track => track.stop());
mediaStreamRef.current = null;
setMediaStream(null);
}
if (displayVideoRef.current?.src.startsWith('blob:')) {
URL.revokeObjectURL(displayVideoRef.current.src);
displayVideoRef.current.src = "";
}
if (vlmVideoRef.current?.src.startsWith('blob:')) {
URL.revokeObjectURL(vlmVideoRef.current.src);
vlmVideoRef.current.src = "";
}
setLatestBoxes([]);
setError(null);
setInferenceStatus("");
setDebugOutput("");
}, []);
// Handle changing the mode (Webcam, URL, File)
useEffect(() => {
cleanupMediaSource();
setUploadedFile(null); // A mode change always discards the previously selected file
const displayVideo = displayVideoRef.current;
const vlmVideo = vlmVideoRef.current;
if (!displayVideo || !vlmVideo) return;
// Reset srcObject/src to ensure a fresh start
displayVideo.srcObject = null;
vlmVideo.srcObject = null;
displayVideo.src = "";
vlmVideo.src = "";
// In "File" mode, load the example video until the user selects a file
if (mode === "File") {
displayVideo.src = EXAMPLE_VIDEO_URL;
vlmVideo.src = EXAMPLE_VIDEO_URL;
displayVideo.load(); vlmVideo.load();
displayVideo.play().catch(e => console.error("Error playing example display video:", e));
vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
}
// uploadedFile is intentionally not a dependency: file selection is handled entirely
// in handleFileChange, and re-running this effect on file change would immediately
// revert the player to the example video.
}, [mode, cleanupMediaSource]);
// Handle Webcam Input
const handleWebcamInput = useCallback(async () => {
cleanupMediaSource();
try {
const stream = await navigator.mediaDevices.getUserMedia({ video: true });
mediaStreamRef.current = stream;
setMediaStream(stream);
if (displayVideoRef.current && vlmVideoRef.current) {
displayVideoRef.current.srcObject = stream;
vlmVideoRef.current.srcObject = stream;
displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
}
setMode("Webcam");
} catch (e) {
setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
mediaStreamRef.current = null;
setMediaStream(null);
setLatestBoxes([]);
setInferenceStatus("Webcam access denied or failed.");
}
}, [cleanupMediaSource]);
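// Optional variation (a sketch, not used above): requesting a lower capture
// resolution can reduce per-frame inference cost; the constraint values here
// are illustrative only.
//   navigator.mediaDevices.getUserMedia({
//     video: { width: { ideal: 640 }, height: { ideal: 480 } },
//   });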
// Handle URL Input (when Load button is clicked)
const handleLoadUrl = useCallback(() => {
cleanupMediaSource();
const url = currentUrlInput;
if (!url.trim()) {
setError("Please enter a video URL.");
return;
}
if (displayVideoRef.current && vlmVideoRef.current) {
displayVideoRef.current.src = url;
vlmVideoRef.current.src = url;
displayVideoRef.current.load(); vlmVideoRef.current.load();
displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
setMode("URL");
}
}, [currentUrlInput, cleanupMediaSource]);
// Handle File Input
const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
cleanupMediaSource();
const file = e.target.files?.[0] || null;
setUploadedFile(file);
if (file) {
if (isImageFile(file)) {
// Image file: displayed via imageRef (see the <img> element below); its object URL
// is created once by the memoized uploadedImageUrl, not here
setMode("File"); // Ensure mode is "File"
} else if (isVideoFile(file)) {
if (displayVideoRef.current && vlmVideoRef.current) {
const fileUrl = URL.createObjectURL(file); // Revoked later by cleanupMediaSource
displayVideoRef.current.src = fileUrl;
vlmVideoRef.current.src = fileUrl;
displayVideoRef.current.load(); vlmVideoRef.current.load();
displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
setMode("File"); // Ensure mode is "File"
}
} else {
setError("Unsupported file type. Please upload an image or video.");
setUploadedFile(null);
}
} else {
// No file selected: fall back to the example video in File mode
if (mode === "File") {
if (displayVideoRef.current && vlmVideoRef.current) {
displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
displayVideoRef.current.load(); vlmVideoRef.current.load();
displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
}
}
}
}, [cleanupMediaSource, mode]);
// Handler for processing an uploaded image file (one-time inference)
const handleProcessImage = async () => {
if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
setError("Image or model not ready for processing, or no image file selected.");
return;
}
const img = imageRef.current;
const canvas = canvasRef.current;
const ctx = canvas.getContext("2d");
if (!ctx) return;
canvas.width = img.naturalWidth;
canvas.height = img.naturalHeight;
setProcessingState(true);
setError(null);
setInferenceStatus("Running image inference...");
try {
// Pass the HTMLImageElement directly to runInference
const modelOutput = await runInference(img, prompt);
setDebugOutput(modelOutput);
setInferenceStatus("Image inference complete.");
ctx.clearRect(0, 0, canvas.width, canvas.height);
ctx.drawImage(img, 0, 0, canvas.width, canvas.height); // Redraw the image under the boxes
let boxes = extractJsonFromMarkdown(modelOutput) || [];
boxes = normalizeBoxes(boxes);
setLatestBoxes(boxes);
// Draw the boxes here as well: the rAF loop above only runs for video sources,
// so for still images nothing else would paint them onto the canvas
drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
} catch (e) {
setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
setLatestBoxes([]);
setInferenceStatus("Image inference failed.");
} finally {
setProcessingState(false);
}
};
// --- Rendered UI ---
return (
<div className="absolute inset-0 text-white flex flex-col">
<div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
</div>
<div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>
<div className="flex flex-col items-center justify-center flex-1 w-full p-4">
{/* Mode Selector */}
<div className="mb-6 mt-4">
<div className="flex space-x-4">
{MODES.map((m) => (
<button
key={m}
className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
}`}
onClick={() => setMode(m)}
disabled={!isLoaded && m !== "File"}
>
{m}
</button>
))}
</div>
</div>
{/* Dynamic Content Area */}
<div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
{/* Prompt Input (Common to all modes) */}
<div className="mb-4 w-full max-w-xl">
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
<textarea
className="w-full p-2 rounded-lg text-black"
rows={3}
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
disabled={processingState}
/>
</div>
{/* Video/Image Display and Canvas Overlay */}
<div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
{mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
<img
ref={imageRef}
src={URL.createObjectURL(uploadedFile)}
alt="Uploaded"
className="max-w-full max-h-full block object-contain"
style={{ position: 'absolute' }}
onLoad={() => {
if (imageRef.current && canvasRef.current) {
canvasRef.current.width = imageRef.current.naturalWidth;
canvasRef.current.height = imageRef.current.naturalHeight;
}
}}
/>
) : (
<video
ref={displayVideoRef}
autoPlay
muted
playsInline
loop
className="max-w-full max-h-full block object-contain"
style={{ position: 'absolute' }}
/>
)}
<canvas
ref={canvasRef}
className="absolute top-0 left-0 w-full h-full pointer-events-none"
style={{ zIndex: 10 }}
/>
</div>
{/* Controls specific to each mode */}
<div className="mt-4 flex flex-col items-center gap-2">
{mode === "Webcam" && (
<button
className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
onClick={handleWebcamInput}
disabled={processingState || !isLoaded}
>
{mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
</button>
)}
{mode === "URL" && (
<>
<div className="flex w-full max-w-xl">
<input
type="text"
className="flex-1 px-4 py-2 rounded-l-lg text-black"
value={currentUrlInput}
onChange={(e) => setCurrentUrlInput(e.target.value)}
placeholder="Paste video URL here"
disabled={processingState}
/>
<button
className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
onClick={handleLoadUrl}
disabled={processingState || !isLoaded}
>
Load URL
</button>
</div>
</>
)}
{mode === "File" && (
<>
<input
type="file"
accept="image/*,video/*"
onChange={handleFileChange}
className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
disabled={processingState}
/>
{uploadedFile && isImageFile(uploadedFile) && (
<button
className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
onClick={handleProcessImage}
disabled={processingState || !isLoaded}
>
{processingState ? "Processing Image..." : "Process Image"}
</button>
)}
</>
)}
</div>
{/* Error and Debug Output */}
{error && <div className="text-red-400 mt-2 text-center">{error}</div>}
<div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
<div>Raw Model Output:</div>
<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
</div>
</div>
</div>
{/* Hidden Video for VLM processing - this must be rendered always */}
<video
ref={vlmVideoRef}
autoPlay
muted
playsInline
loop
style={{ display: 'none' }} // Hidden from view
/>
</div>
);
}
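// Usage sketch (illustrative): this view expects to be rendered inside the VLM
// context provider from ../context, so that useVLMContext can supply runInference.
// The provider name below is hypothetical — use whatever ../context actually exports:
//
//   import MultiSourceCaptioningView from "./components/MultiSourceCaptioningView";
//   import { VLMProvider } from "../context/VLMContext"; // hypothetical import path
//
//   export default function App() {
//     return (
//       <VLMProvider>
//         <MultiSourceCaptioningView />
//       </VLMProvider>
//     );
//   }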