Spaces:

Quazim0t0
/

FastVLMBoxes

Running

App Files Files Community

FastVLMBoxes / src /components /MultiSourceCaptioningView.tsx

Quazim0t0

Upload 51 files

d5e70e6 verified 3 days ago

raw

history blame

26.1 kB

	import React, { useState, useRef, useEffect } from "react";
	import { useVLMContext } from "../context/useVLMContext";
	import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

	const MODES = ["Webcam", "URL", "File"] as const;
	type Mode = typeof MODES[number];

	const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
	const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";

	function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
	if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
	const label = arr[0];
	return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
	}
	return [];
	}

	function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
	if (!raw) return [];
	let boxes = [];
	if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
	boxes = raw.image;
	} else if (Array.isArray(raw)) {
	boxes = raw;
	} else if (typeof raw === "object" && raw !== null) {
	boxes = [raw];
	}
	return boxes
	.map((obj: any) => {
	if (!obj \|\| !obj.bbox_2d) return null;
	let bbox = obj.bbox_2d;
	// If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
	if (
	Array.isArray(bbox) &&
	bbox.length === 2 &&
	Array.isArray(bbox[0]) &&
	Array.isArray(bbox[1]) &&
	bbox[0].length === 2 &&
	bbox[1].length === 2
	) {
	bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
	}
	// If bbox_2d is [x1, y1, x2, y2], use as-is
	if (
	Array.isArray(bbox) &&
	bbox.length === 4 &&
	bbox.every((v: any) => typeof v === "number")
	) {
	return { ...obj, bbox_2d: bbox };
	}
	// Otherwise, skip
	return null;
	})
	.filter((obj: any) => obj);
	}

	function isImageFile(file: File) {
	return file.type.startsWith("image/");
	}
	function isVideoFile(file: File) {
	return file.type.startsWith("video/");
	}

	export default function MultiSourceCaptioningView() {
	const [mode, setMode] = useState<Mode>("File");
	const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
	const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
	const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
	const [processing, setProcessing] = useState(false);
	const [error, setError] = useState<string \| null>(null);
	const [webcamActive, setWebcamActive] = useState(false);
	const [uploadedFile, setUploadedFile] = useState<File \| null>(null);
	const [uploadedUrl, setUploadedUrl] = useState<string>("");
	const [videoProcessing, setVideoProcessing] = useState(false);
	const [imageProcessed, setImageProcessed] = useState(false);
	const [exampleProcessing, setExampleProcessing] = useState(false);
	const [urlProcessing, setUrlProcessing] = useState(false);
	const [debugOutput, setDebugOutput] = useState<string>("");
	const [canvasDims, setCanvasDims] = useState<{w:number,h:number}\|null>(null);
	const [videoDims, setVideoDims] = useState<{w:number,h:number}\|null>(null);
	const [inferenceStatus, setInferenceStatus] = useState<string>("");

	const videoRef = useRef<HTMLVideoElement \| null>(null);
	const overlayVideoRef = useRef<HTMLVideoElement \| null>(null); // NEW: overlay video
	const canvasRef = useRef<HTMLCanvasElement \| null>(null);
	const imageRef = useRef<HTMLImageElement \| null>(null);
	const webcamStreamRef = useRef<MediaStream \| null>(null);
	const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

	// Persistent boxes state: {boxes: [...], timestamp: number}
	const [persistentBoxes, setPersistentBoxes] = useState<{boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]>([]);
	const BOX_PERSIST_MS = 2000; // 2 seconds

	// Helper: Add new boxes with timestamp
	const addBoxesWithTimestamp = (boxes: {label: string, bbox_2d: number[]}[]) => {
	if (!boxes \|\| boxes.length === 0) return;
	setPersistentBoxes((prev: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]) => [
	...prev.filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => Date.now() - entry.timestamp < BOX_PERSIST_MS),
	{ boxes, timestamp: Date.now() }
	]);
	};

	// Helper: Get all boxes from last 2 seconds
	const getCurrentBoxes = () => {
	const now = Date.now();
	return persistentBoxes
	.filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => now - entry.timestamp < BOX_PERSIST_MS)
	.flatMap((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => entry.boxes);
	};

	// Synchronize overlay video with main video
	useEffect(() => {
	const main = videoRef.current;
	const overlay = overlayVideoRef.current;
	if (!main \|\| !overlay) return;
	// Sync play/pause
	const syncPlay = () => { if (main.paused !== overlay.paused) main.paused ? overlay.pause() : overlay.play(); };
	main.addEventListener('play', () => overlay.play());
	main.addEventListener('pause', () => overlay.pause());
	// Sync seeking
	const syncTime = () => { if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) overlay.currentTime = main.currentTime; };
	main.addEventListener('seeked', syncTime);
	main.addEventListener('timeupdate', syncTime);
	// Clean up
	return () => {
	main.removeEventListener('play', () => overlay.play());
	main.removeEventListener('pause', () => overlay.pause());
	main.removeEventListener('seeked', syncTime);
	main.removeEventListener('timeupdate', syncTime);
	};
	}, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);

	// Update: processVideoFrame now adds boxes to persistentBoxes
	const processVideoFrame = async () => {
	if (!videoRef.current \|\| !canvasRef.current) return;
	const video = videoRef.current;
	const canvas = canvasRef.current;
	if (video.paused \|\| video.ended \|\| video.videoWidth === 0) return;
	canvas.width = video.videoWidth;
	canvas.height = video.videoHeight;
	const ctx = canvas.getContext("2d");
	if (!ctx) return;
	ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
	await runInference(video, prompt, (output: string) => {
	setDebugOutput(output);
	let boxes = extractJsonFromMarkdown(output) \|\| [];
	if (boxes.length === 0 && Array.isArray(output)) {
	boxes = parseFlatBoxArray(output);
	}
	boxes = normalizeBoxes(boxes);
	if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
	if (Array.isArray(boxes) && boxes.length > 0) {
	addBoxesWithTimestamp(boxes); // <-- Add to persistent state
	}
	});
	};

	// Draw persistent boxes on every frame
	useEffect(() => {
	const draw = () => {
	if (!videoRef.current \|\| !canvasRef.current) return;
	const video = videoRef.current;
	const canvas = canvasRef.current;
	if (video.videoWidth === 0) return;
	canvas.width = video.videoWidth;
	canvas.height = video.videoHeight;
	const ctx = canvas.getContext("2d");
	if (!ctx) return;
	ctx.clearRect(0, 0, canvas.width, canvas.height);
	const boxes = getCurrentBoxes();
	if (boxes.length > 0) {
	const scaleX = canvas.width / video.videoWidth;
	const scaleY = canvas.height / video.videoHeight;
	drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
	}
	};
	draw();
	const interval = setInterval(draw, 100);
	return () => clearInterval(interval);
	}, [persistentBoxes, videoRef, canvasRef]);

	const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
	const file = e.target.files?.[0] \|\| null;
	setUploadedFile(file);
	setUploadedUrl(file ? URL.createObjectURL(file) : "");
	setError(null);
	setImageProcessed(false);
	setVideoProcessing(false);
	setExampleProcessing(false);
	};

	// Webcam setup and teardown (unchanged)
	useEffect(() => {
	if (mode !== "Webcam") {
	if (webcamStreamRef.current) {
	webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
	webcamStreamRef.current = null;
	}
	setWebcamActive(false);
	return;
	}
	const setupWebcam = async () => {
	try {
	setError(null);
	const stream = await navigator.mediaDevices.getUserMedia({ video: true });
	webcamStreamRef.current = stream;
	if (videoRef.current) {
	videoRef.current.srcObject = stream;
	setWebcamActive(true);
	}
	} catch (e) {
	setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
	setWebcamActive(false);
	}
	};
	setupWebcam();
	return () => {
	if (webcamStreamRef.current) {
	webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
	webcamStreamRef.current = null;
	}
	setWebcamActive(false);
	};
	}, [mode]);

	// Webcam mode: process frames with setInterval
	useEffect(() => {
	if (mode !== "Webcam" \|\| !isLoaded \|\| !webcamActive) return;
	let interval: ReturnType<typeof setInterval> \| null = null;
	interval = setInterval(() => {
	processVideoFrame();
	}, 1000);
	return () => {
	if (interval) clearInterval(interval);
	};
	}, [mode, isLoaded, prompt, runInference, webcamActive]);

	// URL mode: process frames with setInterval
	useEffect(() => {
	if (mode !== "URL" \|\| !isLoaded \|\| !urlProcessing) return;
	let interval: ReturnType<typeof setInterval> \| null = null;
	interval = setInterval(() => {
	processVideoFrame();
	}, 1000);
	return () => {
	if (interval) clearInterval(interval);
	};
	}, [mode, isLoaded, prompt, runInference, urlProcessing]);

	// File video mode: process frames with setInterval
	useEffect(() => {
	if (mode !== "File" \|\| !isLoaded \|\| !uploadedFile \|\| !isVideoFile(uploadedFile) \|\| !videoProcessing) return;
	let interval: ReturnType<typeof setInterval> \| null = null;
	interval = setInterval(() => {
	processVideoFrame();
	}, 1000);
	return () => {
	if (interval) clearInterval(interval);
	};
	}, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

	// Example video mode: process frames with setInterval
	useEffect(() => {
	if (mode !== "File" \|\| uploadedFile \|\| !isLoaded \|\| !exampleProcessing) return;
	let interval: ReturnType<typeof setInterval> \| null = null;
	interval = setInterval(() => {
	processVideoFrame();
	}, 1000);
	return () => {
	if (interval) clearInterval(interval);
	};
	}, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);

	// File mode: process uploaded image (only on button click)
	const handleProcessImage = async () => {
	if (!isLoaded \|\| !uploadedFile \|\| !isImageFile(uploadedFile) \|\| !imageRef.current \|\| !canvasRef.current) return;
	const img = imageRef.current;
	const canvas = canvasRef.current;
	canvas.width = img.naturalWidth;
	canvas.height = img.naturalHeight;
	setCanvasDims({w:canvas.width,h:canvas.height});
	setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
	const ctx = canvas.getContext("2d");
	if (!ctx) return;
	ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
	setProcessing(true);
	setError(null);
	setInferenceStatus("Running inference...");
	await runInference(img, prompt, (output: string) => {
	setDebugOutput(output);
	setInferenceStatus("Inference complete.");
	ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
	let boxes = extractJsonFromMarkdown(output) \|\| [];
	if (boxes.length === 0 && Array.isArray(output)) {
	boxes = parseFlatBoxArray(output);
	}
	boxes = normalizeBoxes(boxes);
	console.log("Model output:", output);
	console.log("Boxes after normalization:", boxes);
	console.log("Canvas size:", canvas.width, canvas.height);
	if (boxes.length > 0) {
	const [x1, y1, x2, y2] = boxes[0].bbox_2d;
	console.log("First box coords:", x1, y1, x2, y2);
	}
	if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
	if (Array.isArray(boxes) && boxes.length > 0) {
	const scaleX = canvas.width / img.naturalWidth;
	const scaleY = canvas.height / img.naturalHeight;
	drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
	}
	setImageProcessed(true);
	});
	setProcessing(false);
	};

	// File mode: process uploaded video frames (start/stop)
	const handleToggleVideoProcessing = () => {
	setVideoProcessing((prev) => !prev);
	};

	// Handle start/stop for example video processing
	const handleToggleExampleProcessing = () => {
	setExampleProcessing((prev) => !prev);
	};

	// Handle start/stop for URL video processing
	const handleToggleUrlProcessing = () => {
	setUrlProcessing((prev) => !prev);
	};

	// Test draw box function
	const handleTestDrawBox = () => {
	if (!canvasRef.current) return;
	const canvas = canvasRef.current;
	const ctx = canvas.getContext("2d");
	if (!ctx) return;
	ctx.clearRect(0, 0, canvas.width, canvas.height);
	ctx.strokeStyle = "#FF00FF";
	ctx.lineWidth = 4;
	ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
	ctx.font = "20px Arial";
	ctx.fillStyle = "#FF00FF";
	ctx.fillText("Test Box", 50, 35);
	};

	return (
	<div className="absolute inset-0 text-white">
	<div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
	{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
	</div>
	<div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
	<div className="flex flex-col items-center justify-center h-full w-full">
	{/* Mode Selector */}
	<div className="mb-6">
	<div className="flex space-x-4">
	{MODES.map((m) => (
	<button
	key={m}
	className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
	mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
	}`}
	onClick={() => setMode(m)}
	>
	{m}
	</button>
	))}
	</div>
	</div>

	{/* Mode Content */}
	<div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
	{mode === "Webcam" && (
	<div className="w-full text-center flex flex-col items-center">
	<div className="mb-4 w-full max-w-xl">
	<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
	<textarea
	className="w-full p-2 rounded-lg text-black"
	rows={3}
	value={prompt}
	onChange={(e) => setPrompt(e.target.value)}
	/>
	</div>
	<div className="relative w-full max-w-xl">
	<video
	ref={videoRef}
	autoPlay
	muted
	playsInline
	className="w-full rounded-lg shadow-lg mb-2"
	style={{ background: "#222" }}
	/>
	<canvas
	ref={canvasRef}
	className="absolute top-0 left-0 w-full h-full pointer-events-none"
	style={{ zIndex: 10, pointerEvents: "none" }}
	/>
	</div>
	{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
	{error && <div className="text-red-400 mt-2">Error: {error}</div>}
	</div>
	)}
	{mode === "URL" && (
	<div className="w-full text-center flex flex-col items-center">
	<p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
	<div className="flex w-full max-w-xl mb-4">
	<input
	type="text"
	className="flex-1 px-4 py-2 rounded-l-lg text-black"
	value={inputUrl}
	onChange={(e) => setInputUrl(e.target.value)}
	placeholder="Paste video URL here"
	/>
	<button
	className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
	onClick={() => setVideoUrl(inputUrl)}
	>
	Load
	</button>
	</div>
	<div className="mb-4 w-full max-w-xl">
	<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
	<textarea
	className="w-full p-2 rounded-lg text-black"
	rows={3}
	value={prompt}
	onChange={(e) => setPrompt(e.target.value)}
	/>
	</div>
	<div className="relative w-full max-w-xl">
	<video
	ref={videoRef}
	src={videoUrl}
	controls
	autoPlay
	loop
	className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
	style={{ background: "#222" }}
	/>
	<video
	ref={overlayVideoRef}
	src={videoUrl}
	controls={false}
	autoPlay
	loop
	muted
	className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
	style={{ background: "#222" }}
	/>
	<canvas
	ref={canvasRef}
	className="absolute top-0 left-0 w-full h-full pointer-events-none"
	style={{ zIndex: 20, pointerEvents: "none" }}
	/>
	<button
	className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
	onClick={handleToggleUrlProcessing}
	>
	{urlProcessing ? "Stop Processing" : "Start Processing"}
	</button>
	</div>
	{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
	{error && <div className="text-red-400 mt-2">Error: {error}</div>}
	<button
	className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
	onClick={handleTestDrawBox}
	>
	Test Draw Box
	</button>
	<div className="mt-2 p-2 bg-gray-800 rounded text-xs">
	<div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} \| Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
	<div>Raw Model Output:</div>
	<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
	</div>
	</div>
	)}
	{mode === "File" && (
	<div className="w-full text-center flex flex-col items-center">
	<div className="mb-4 w-full max-w-xl">
	<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
	<textarea
	className="w-full p-2 rounded-lg text-black"
	rows={3}
	value={prompt}
	onChange={(e) => setPrompt(e.target.value)}
	/>
	</div>
	<div className="mb-4 w-full max-w-xl">
	<input
	type="file"
	accept="image/,video/"
	onChange={handleFileChange}
	className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
	/>
	</div>
	{/* Show uploaded image */}
	{uploadedFile && isImageFile(uploadedFile) && (
	<div className="relative w-full max-w-xl">
	<img
	ref={imageRef}
	src={uploadedUrl}
	alt="Uploaded"
	className="w-full rounded-lg shadow-lg mb-2"
	style={{ background: "#222" }}
	/>
	<canvas
	ref={canvasRef}
	className="absolute top-0 left-0 w-full h-full pointer-events-none"
	style={{ zIndex: 10, pointerEvents: "none" }}
	/>
	<button
	className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
	onClick={handleProcessImage}
	disabled={processing}
	>
	{processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
	</button>
	</div>
	)}
	{/* Show uploaded video */}
	{uploadedFile && isVideoFile(uploadedFile) && (
	<div className="relative w-full max-w-xl">
	<video
	ref={videoRef}
	src={uploadedUrl}
	controls
	autoPlay
	loop
	className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
	style={{ background: "#222" }}
	/>
	<video
	ref={overlayVideoRef}
	src={uploadedUrl}
	controls={false}
	autoPlay
	loop
	muted
	className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
	style={{ background: "#222" }}
	/>
	<canvas
	ref={canvasRef}
	className="absolute top-0 left-0 w-full h-full pointer-events-none"
	style={{ zIndex: 20, pointerEvents: "none" }}
	/>
	<button
	className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
	onClick={handleToggleVideoProcessing}
	>
	{videoProcessing ? "Stop Processing" : "Start Processing"}
	</button>
	</div>
	)}
	{/* Show example video if no file uploaded */}
	{!uploadedFile && (
	<div className="relative w-full max-w-xl">
	<video
	ref={videoRef}
	src={EXAMPLE_VIDEO_URL}
	controls
	autoPlay
	loop
	className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-0"
	style={{ background: "#222" }}
	/>
	<video
	ref={overlayVideoRef}
	src={EXAMPLE_VIDEO_URL}
	controls={false}
	autoPlay
	loop
	muted
	className="w-full rounded-lg shadow-lg mb-2 absolute top-0 left-0 z-10 opacity-60 pointer-events-none"
	style={{ background: "#222" }}
	/>
	<canvas
	ref={canvasRef}
	className="absolute top-0 left-0 w-full h-full pointer-events-none"
	style={{ zIndex: 20, pointerEvents: "none" }}
	/>
	<button
	className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold relative z-30"
	onClick={handleToggleExampleProcessing}
	>
	{exampleProcessing ? "Stop Processing" : "Start Processing"}
	</button>
	</div>
	)}
	{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
	{error && <div className="text-red-400 mt-2">Error: {error}</div>}
	<button
	className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
	onClick={handleTestDrawBox}
	>
	Test Draw Box
	</button>
	<div className="mt-2 p-2 bg-gray-800 rounded text-xs">
	<div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} \| Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
	<div>Raw Model Output:</div>
	<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
	</div>
	</div>
	)}
	</div>
	</div>
	</div>
	);
	}