// src/components/MultiSourceCaptioningView.tsx
import React, { useState, useRef, useEffect, useCallback, useMemo } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
const MODES = ["Webcam", "URL", "File"] as const;
type Mode = typeof MODES[number];
const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
// Helper: coerce the model's raw JSON output into a flat list of { label, bbox_2d } boxes
function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
if (!raw) return [];
let boxes: any[] = [];
if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
boxes = raw.image;
} else if (Array.isArray(raw)) {
boxes = raw;
} else if (typeof raw === "object" && raw !== null) {
boxes = [raw];
}
return boxes
.map((obj: any) => {
if (!obj || !obj.bbox_2d) return null;
let bbox = obj.bbox_2d;
if (
Array.isArray(bbox) &&
bbox.length === 2 &&
Array.isArray(bbox[0]) &&
Array.isArray(bbox[1]) &&
bbox[0].length === 2 &&
bbox[1].length === 2
) {
bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
}
if (
Array.isArray(bbox) &&
bbox.length === 4 &&
bbox.every((v: any) => typeof v === "number")
) {
return { ...obj, bbox_2d: bbox };
}
return null;
})
.filter((obj: any) => obj);
}
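// Illustrative examples (not executed) of the shapes normalizeBoxes accepts:
//   normalizeBoxes([{ label: "person", bbox_2d: [100, 50, 200, 300] }])
//     -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]   // flat [x1, y1, x2, y2]
//   normalizeBoxes([{ label: "person", bbox_2d: [[100, 50], [200, 300]] }])
//     -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]   // nested pairs flattened
//   A bare object or a { image: [...] } wrapper is unwrapped the same way;
//   anything without a usable bbox_2d is dropped.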
function isImageFile(file: File) {
return file.type.startsWith("image/");
}
function isVideoFile(file: File) {
return file.type.startsWith("video/");
}
export default function MultiSourceCaptioningView() {
const [mode, setMode] = useState<Mode>("File");
const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
const [processingState, setProcessingState] = useState(false); // General processing indicator (drives the UI)
const [error, setError] = useState<string | null>(null);
const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream (UI state only)
const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
const [inferenceStatus, setInferenceStatus] = useState<string>("");
const [debugOutput, setDebugOutput] = useState<string>("");
const [uploadedFile, setUploadedFile] = useState<File | null>(null); // Currently selected file, if any
// Refs for the two video elements and the canvas
const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
const imageRef = useRef<HTMLImageElement>(null); // For image file processing
const mediaStreamRef = useRef<MediaStream | null>(null); // Mirrors mediaStream so cleanup keeps a stable identity
const isInferenceBusyRef = useRef(false); // Guards against overlapping inferences without re-creating the interval
const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
// Create one object URL per selected image (revoked on change/unmount) instead of
// creating a new URL on every render, which would leak blob URLs.
const uploadedImageUrl = useMemo(
() => (uploadedFile && isImageFile(uploadedFile) ? URL.createObjectURL(uploadedFile) : null),
[uploadedFile]
);
useEffect(() => {
return () => { if (uploadedImageUrl) URL.revokeObjectURL(uploadedImageUrl); };
}, [uploadedImageUrl]);
// --- Drawing Loop for the Visible Display ---
// Draws the latest boxes over the display video. The effect below owns the
// requestAnimationFrame loop so stale loops can be cancelled.
const drawDisplayCanvas = useCallback(() => {
const displayVideo = displayVideoRef.current;
const canvas = canvasRef.current;
const ctx = canvas?.getContext('2d');
if (!displayVideo || !canvas || !ctx) {
return;
}
// Match the canvas size to the video's intrinsic dimensions, once they are known
if (displayVideo.videoWidth > 0 && displayVideo.videoHeight > 0 &&
(canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)) {
canvas.width = displayVideo.videoWidth;
canvas.height = displayVideo.videoHeight;
}
// Clear the previous frame, then draw the latest bounding boxes
ctx.clearRect(0, 0, canvas.width, canvas.height);
const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
const scaleY = canvas.height / (displayVideo.videoHeight || 1);
drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
}, [latestBoxes]); // Re-created when latestBoxes changes
// Effect that owns the rAF loop: it cancels any pending frame request whenever
// drawDisplayCanvas changes or the component unmounts, so stale loops never pile up.
useEffect(() => {
const displayVideo = displayVideoRef.current;
if (!displayVideo) return;
let rafId = 0;
const loop = () => {
drawDisplayCanvas();
// Only schedule the next frame while the video is playing
if (!displayVideo.paused && !displayVideo.ended) {
rafId = requestAnimationFrame(loop);
}
};
const handleVideoReady = () => {
cancelAnimationFrame(rafId);
if (displayVideo.readyState >= 1) { // HAVE_METADATA
rafId = requestAnimationFrame(loop);
}
};
displayVideo.addEventListener('loadedmetadata', handleVideoReady);
displayVideo.addEventListener('play', handleVideoReady); // Also restart on play
// The video may already be ready (e.g., on component re-mount or autoplay)
handleVideoReady();
return () => {
cancelAnimationFrame(rafId);
displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
displayVideo.removeEventListener('play', handleVideoReady);
};
}, [drawDisplayCanvas]);
// --- FastVLM Processing Loop (from hidden video) ---
// This interval loop controls when FastVLM processes a frame
useEffect(() => {
const vlmVideo = vlmVideoRef.current;
// Determine if we are in a video-based mode that requires continuous processing
const isVideoModeActive = (
mode === "Webcam" ||
(mode === "URL" && !!vlmVideo?.src) || // Check if URL video is loaded
(mode === "File" && !!vlmVideo?.src && uploadedFile && isVideoFile(uploadedFile))
);
if (!isLoaded || !vlmVideo || !isVideoModeActive) {
setProcessingState(false);
return;
}
let interval: ReturnType<typeof setInterval> | null = null;
const startVLMProcessing = () => {
if (interval) clearInterval(interval); // Clear any old interval
interval = setInterval(async () => {
// Skip if the video is not ready, or a previous inference is still in flight
if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || isInferenceBusyRef.current) {
return;
}
isInferenceBusyRef.current = true;
setProcessingState(true);
setInferenceStatus("Running inference...");
setError(null);
try {
// Pass the HTMLVideoElement directly to runInference
const modelOutput = await runInference(vlmVideo, prompt);
setDebugOutput(modelOutput);
let boxes = extractJsonFromMarkdown(modelOutput) || [];
boxes = normalizeBoxes(boxes);
setLatestBoxes(boxes);
setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
} catch (e) {
setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
setLatestBoxes([]);
setInferenceStatus("Inference failed.");
} finally {
isInferenceBusyRef.current = false;
setProcessingState(false);
}
}, 200); // Poll every 200 ms, i.e. at most ~5 inferences per second
};
const stopVLMProcessing = () => {
if (interval) clearInterval(interval);
interval = null;
isInferenceBusyRef.current = false; // Reset so the next session can start cleanly
setProcessingState(false);
setInferenceStatus("Stopped processing.");
};
vlmVideo.addEventListener('play', startVLMProcessing);
vlmVideo.addEventListener('pause', stopVLMProcessing);
vlmVideo.addEventListener('ended', stopVLMProcessing);
vlmVideo.addEventListener('loadeddata', startVLMProcessing); // Also start on loadeddata for better reliability
// Initial check if video is already playing or ready
if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
startVLMProcessing();
}
return () => {
stopVLMProcessing();
vlmVideo.removeEventListener('play', startVLMProcessing);
vlmVideo.removeEventListener('pause', stopVLMProcessing);
vlmVideo.removeEventListener('ended', stopVLMProcessing);
vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
};
// processingState is intentionally not a dependency: the busy flag lives in
// isInferenceBusyRef, so inference-driven state updates do not tear down the interval.
}, [mode, isLoaded, prompt, runInference, uploadedFile]); // uploadedFile re-triggers setup on file change
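// Illustrative sketch of the parsing pipeline above (the exact reply format depends
// on the model; extractJsonFromMarkdown is assumed to strip an optional ```json fence
// before parsing):
//   modelOutput: "```json\n[{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]\n```"
//   extractJsonFromMarkdown(modelOutput) -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]
//   normalizeBoxes(...) -> same array, validated and flattened for drawing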
// --- Media Source Handling ---
// Cleanup for media stream and object URLs. Reads the stream from mediaStreamRef
// (not from state) so the callback identity stays stable and the effects that
// depend on it do not re-run every time the stream or file changes.
const cleanupMediaSource = useCallback(() => {
if (mediaStreamRef.current) {
mediaStreamRef.current.getTracks().forEach(track => track.stop());
mediaStreamRef.current = null;
setMediaStream(null);
}
if (displayVideoRef.current?.src.startsWith('blob:')) {
URL.revokeObjectURL(displayVideoRef.current.src);
displayVideoRef.current.src = "";
}
if (vlmVideoRef.current?.src.startsWith('blob:')) {
URL.revokeObjectURL(vlmVideoRef.current.src);
vlmVideoRef.current.src = "";
}
setLatestBoxes([]);
setError(null);
setInferenceStatus("");
setDebugOutput("");
}, []);
// Handle changing the mode (Webcam, URL, File)
useEffect(() => {
cleanupMediaSource();
setUploadedFile(null); // A mode change always discards the previously selected file
const displayVideo = displayVideoRef.current;
const vlmVideo = vlmVideoRef.current;
if (!displayVideo || !vlmVideo) return;
// Reset srcObject/src to ensure a fresh start
displayVideo.srcObject = null;
vlmVideo.srcObject = null;
displayVideo.src = "";
vlmVideo.src = "";
// In "File" mode, load the example video until the user selects a file
if (mode === "File") {
displayVideo.src = EXAMPLE_VIDEO_URL;
vlmVideo.src = EXAMPLE_VIDEO_URL;
displayVideo.load(); vlmVideo.load();
displayVideo.play().catch(e => console.error("Error playing example display video:", e));
vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
}
// uploadedFile is intentionally not a dependency: file selection is handled entirely
// in handleFileChange, and re-running this effect on file change would immediately
// revert the player to the example video.
}, [mode, cleanupMediaSource]);
// Handle Webcam Input
const handleWebcamInput = useCallback(async () => {
cleanupMediaSource();
try {
const stream = await navigator.mediaDevices.getUserMedia({ video: true });
mediaStreamRef.current = stream;
setMediaStream(stream);
if (displayVideoRef.current && vlmVideoRef.current) {
displayVideoRef.current.srcObject = stream;
vlmVideoRef.current.srcObject = stream;
displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
}
setMode("Webcam");
} catch (e) {
setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
mediaStreamRef.current = null;
setMediaStream(null);
setLatestBoxes([]);
setInferenceStatus("Webcam access denied or failed.");
}
}, [cleanupMediaSource]);
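// Optional variation (a sketch, not used above): requesting a lower capture
// resolution can reduce per-frame inference cost; the constraint values here
// are illustrative only.
//   navigator.mediaDevices.getUserMedia({
//     video: { width: { ideal: 640 }, height: { ideal: 480 } },
//   });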
// Handle URL Input (when Load button is clicked)
const handleLoadUrl = useCallback(() => {
cleanupMediaSource();
const url = currentUrlInput;
if (!url.trim()) {
setError("Please enter a video URL.");
return;
}
if (displayVideoRef.current && vlmVideoRef.current) {
displayVideoRef.current.src = url;
vlmVideoRef.current.src = url;
displayVideoRef.current.load(); vlmVideoRef.current.load();
displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
setMode("URL");
}
}, [currentUrlInput, cleanupMediaSource]);
// Handle File Input
const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
cleanupMediaSource();
const file = e.target.files?.[0] || null;
setUploadedFile(file);
if (file) {
if (isImageFile(file)) {
// Image file: displayed via imageRef (see the <img> element below); its object URL
// is created once by the memoized uploadedImageUrl, not here
setMode("File"); // Ensure mode is "File"
} else if (isVideoFile(file)) {
if (displayVideoRef.current && vlmVideoRef.current) {
const fileUrl = URL.createObjectURL(file); // Revoked later by cleanupMediaSource
displayVideoRef.current.src = fileUrl;
vlmVideoRef.current.src = fileUrl;
displayVideoRef.current.load(); vlmVideoRef.current.load();
displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
setMode("File"); // Ensure mode is "File"
}
} else {
setError("Unsupported file type. Please upload an image or video.");
setUploadedFile(null);
}
} else {
// No file selected: fall back to the example video in File mode
if (mode === "File") {
if (displayVideoRef.current && vlmVideoRef.current) {
displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
displayVideoRef.current.load(); vlmVideoRef.current.load();
displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
}
}
}
}, [cleanupMediaSource, mode]);
// Handler for processing an uploaded image file (one-time inference)
const handleProcessImage = async () => {
if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
setError("Image or model not ready for processing, or no image file selected.");
return;
}
const img = imageRef.current;
const canvas = canvasRef.current;
const ctx = canvas.getContext("2d");
if (!ctx) return;
canvas.width = img.naturalWidth;
canvas.height = img.naturalHeight;
setProcessingState(true);
setError(null);
setInferenceStatus("Running image inference...");
try {
// Pass the HTMLImageElement directly to runInference
const modelOutput = await runInference(img, prompt);
setDebugOutput(modelOutput);
setInferenceStatus("Image inference complete.");
ctx.clearRect(0, 0, canvas.width, canvas.height);
ctx.drawImage(img, 0, 0, canvas.width, canvas.height); // Redraw the image under the boxes
let boxes = extractJsonFromMarkdown(modelOutput) || [];
boxes = normalizeBoxes(boxes);
setLatestBoxes(boxes);
// Draw the boxes here as well: the rAF loop above only runs for video sources,
// so for still images nothing else would paint them onto the canvas
drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
} catch (e) {
setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
setLatestBoxes([]);
setInferenceStatus("Image inference failed.");
} finally {
setProcessingState(false);
}
};
// --- Rendered UI ---
return (
<div className="absolute inset-0 text-white flex flex-col">
<div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
</div>
<div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>
<div className="flex flex-col items-center justify-center flex-1 w-full p-4">
{/* Mode Selector */}
<div className="mb-6 mt-4">
<div className="flex space-x-4">
{MODES.map((m) => (
<button
key={m}
className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
}`}
onClick={() => setMode(m)}
disabled={!isLoaded && m !== "File"}
>
{m}
</button>
))}
</div>
</div>
{/* Dynamic Content Area */}
<div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
{/* Prompt Input (Common to all modes) */}
<div className="mb-4 w-full max-w-xl">
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
<textarea
className="w-full p-2 rounded-lg text-black"
rows={3}
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
disabled={processingState}
/>
</div>
{/* Video/Image Display and Canvas Overlay */}
<div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
{mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
<img
ref={imageRef}
src={URL.createObjectURL(uploadedFile)}
alt="Uploaded"
className="max-w-full max-h-full block object-contain"
style={{ position: 'absolute' }}
onLoad={() => {
if (imageRef.current && canvasRef.current) {
canvasRef.current.width = imageRef.current.naturalWidth;
canvasRef.current.height = imageRef.current.naturalHeight;
}
}}
/>
) : (
<video
ref={displayVideoRef}
autoPlay
muted
playsInline
loop
className="max-w-full max-h-full block object-contain"
style={{ position: 'absolute' }}
/>
)}
<canvas
ref={canvasRef}
className="absolute top-0 left-0 w-full h-full pointer-events-none"
style={{ zIndex: 10 }}
/>
</div>
{/* Controls specific to each mode */}
<div className="mt-4 flex flex-col items-center gap-2">
{mode === "Webcam" && (
<button
className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
onClick={handleWebcamInput}
disabled={processingState || !isLoaded}
>
{mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
</button>
)}
{mode === "URL" && (
<>
<div className="flex w-full max-w-xl">
<input
type="text"
className="flex-1 px-4 py-2 rounded-l-lg text-black"
value={currentUrlInput}
onChange={(e) => setCurrentUrlInput(e.target.value)}
placeholder="Paste video URL here"
disabled={processingState}
/>
<button
className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
onClick={handleLoadUrl}
disabled={processingState || !isLoaded}
>
Load URL
</button>
</div>
</>
)}
{mode === "File" && (
<>
<input
type="file"
accept="image/*,video/*"
onChange={handleFileChange}
className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
disabled={processingState}
/>
{uploadedFile && isImageFile(uploadedFile) && (
<button
className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
onClick={handleProcessImage}
disabled={processingState || !isLoaded}
>
{processingState ? "Processing Image..." : "Process Image"}
</button>
)}
</>
)}
</div>
{/* Error and Debug Output */}
{error && <div className="text-red-400 mt-2 text-center">{error}</div>}
<div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
<div>Raw Model Output:</div>
<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
</div>
</div>
</div>
{/* Hidden Video for VLM processing - this must be rendered always */}
<video
ref={vlmVideoRef}
autoPlay
muted
playsInline
loop
style={{ display: 'none' }} // Hidden from view
/>
</div>
);
}
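// Usage sketch (illustrative): this view expects to be rendered inside the VLM
// context provider from ../context, so that useVLMContext can supply runInference.
// The provider name below is hypothetical — use whatever ../context actually exports:
//
//   import MultiSourceCaptioningView from "./components/MultiSourceCaptioningView";
//   import { VLMProvider } from "../context/VLMContext"; // hypothetical import path
//
//   export default function App() {
//     return (
//       <VLMProvider>
//         <MultiSourceCaptioningView />
//       </VLMProvider>
//     );
//   }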