// FastVLMBoxes/src/components/MultiSourceCaptioningView.tsx
import React, { useState, useRef, useEffect } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
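// Working assumption about the data flow: extractJsonFromMarkdown pulls a JSON
// array out of the model's markdown reply and drawBoundingBoxesOnCanvas renders
// it. Neither helper is typed in this file, so below is a minimal sketch of the
// box shape this view is assumed to pass between them (the real definition
// lives in BoxAnnotator and may differ):
//
//   interface DetectedBox {
//     label?: string; // optional class name for the box
//     x: number;      // top-left corner, in canvas pixels
//     y: number;
//     width: number;
//     height: number;
//   }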
const MODES = ["Webcam", "URL", "File"] as const;
type Mode = typeof MODES[number];
const EXAMPLE_VIDEO_URL =
"https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";
const EXAMPLE_PROMPT = "Find as many objects as you can in the video and box them.";
export default function MultiSourceCaptioningView() {
const [mode, setMode] = useState<Mode>("URL");
const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
const [processing, setProcessing] = useState(false);
const [error, setError] = useState<string | null>(null);
const [webcamActive, setWebcamActive] = useState(false);
const videoRef = useRef<HTMLVideoElement | null>(null);
const canvasRef = useRef<HTMLCanvasElement | null>(null);
const webcamStreamRef = useRef<MediaStream | null>(null);
const { isLoaded, runInference } = useVLMContext();
// Webcam setup and teardown
useEffect(() => {
if (mode !== "Webcam") {
if (webcamStreamRef.current) {
webcamStreamRef.current.getTracks().forEach((track) => track.stop());
webcamStreamRef.current = null;
}
setWebcamActive(false);
return;
}
let stopped = false;
const setupWebcam = async () => {
try {
setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        if (stopped) {
          // Cleanup ran while the permission prompt was open; release the stream
          stream.getTracks().forEach((track) => track.stop());
          return;
        }
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
} catch (e) {
setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
setWebcamActive(false);
}
};
setupWebcam();
return () => {
stopped = true;
if (webcamStreamRef.current) {
webcamStreamRef.current.getTracks().forEach((track) => track.stop());
webcamStreamRef.current = null;
}
setWebcamActive(false);
};
}, [mode]);
// Process webcam frames
useEffect(() => {
if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    let stopped = false;
    let inFlight = false; // skip ticks while an inference call is still pending
    const processFrame = async () => {
      if (inFlight || !videoRef.current || !canvasRef.current) return;
const video = videoRef.current;
const canvas = canvasRef.current;
if (video.videoWidth === 0) return;
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
const ctx = canvas.getContext("2d");
if (!ctx) return;
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      try {
        inFlight = true;
        setProcessing(true);
        setError(null);
        // Run FastVLM inference on the current frame. runInference is handed a
        // canvas-backed stand-in for the video element; it is assumed to read
        // only the dimensions and the 2D context exposed here.
        const fakeVideo = {
          videoWidth: canvas.width,
          videoHeight: canvas.height,
          getContext: () => ctx,
        } as unknown as HTMLVideoElement;
        const result = await runInference(fakeVideo, prompt);
        if (stopped) return;
        // Redraw the frame, then overlay the parsed bounding boxes
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        setError(e instanceof Error ? e.message : String(e));
      } finally {
        inFlight = false;
        setProcessing(false);
      }
};
interval = setInterval(() => {
if (!stopped) processFrame();
}, 1000);
return () => {
stopped = true;
if (interval) clearInterval(interval);
};
}, [mode, isLoaded, prompt, runInference, webcamActive]);
// Process video frames for URL mode
useEffect(() => {
if (mode !== "URL" || !isLoaded) return;
    let interval: ReturnType<typeof setInterval> | null = null;
    let stopped = false;
    let inFlight = false; // skip ticks while an inference call is still pending
    const processFrame = async () => {
      if (inFlight || !videoRef.current || !canvasRef.current) return;
const video = videoRef.current;
const canvas = canvasRef.current;
if (video.paused || video.ended || video.videoWidth === 0) return;
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
const ctx = canvas.getContext("2d");
if (!ctx) return;
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      try {
        inFlight = true;
        setProcessing(true);
        setError(null);
        // Run FastVLM inference on the current frame. runInference is handed a
        // canvas-backed stand-in for the video element; it is assumed to read
        // only the dimensions and the 2D context exposed here.
        const fakeVideo = {
          videoWidth: canvas.width,
          videoHeight: canvas.height,
          getContext: () => ctx,
        } as unknown as HTMLVideoElement;
        const result = await runInference(fakeVideo, prompt);
        if (stopped) return;
        // Redraw the frame, then overlay the parsed bounding boxes
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        setError(e instanceof Error ? e.message : String(e));
      } finally {
        inFlight = false;
        setProcessing(false);
      }
};
interval = setInterval(() => {
if (!stopped) processFrame();
}, 1000);
return () => {
stopped = true;
if (interval) clearInterval(interval);
};
}, [mode, isLoaded, prompt, runInference]);
return (
<div className="absolute inset-0 text-white">
<div className="flex flex-col items-center justify-center h-full w-full">
{/* Mode Selector */}
<div className="mb-6">
<div className="flex space-x-4">
{MODES.map((m) => (
<button
key={m}
className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
}`}
onClick={() => setMode(m)}
>
{m}
</button>
))}
</div>
</div>
{/* Mode Content */}
<div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
{mode === "Webcam" && (
<div className="w-full text-center flex flex-col items-center">
<div className="mb-4 w-full max-w-xl">
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
<textarea
className="w-full p-2 rounded-lg text-black"
rows={3}
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
/>
</div>
<div className="relative w-full max-w-xl">
<video
ref={videoRef}
autoPlay
muted
playsInline
className="w-full rounded-lg shadow-lg mb-2"
style={{ background: "#222" }}
/>
<canvas
ref={canvasRef}
className="absolute top-0 left-0 w-full h-full pointer-events-none"
              style={{ zIndex: 10 }}
/>
</div>
{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
{error && <div className="text-red-400 mt-2">Error: {error}</div>}
</div>
)}
{mode === "URL" && (
<div className="w-full text-center flex flex-col items-center">
<p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
<div className="flex w-full max-w-xl mb-4">
<input
type="text"
className="flex-1 px-4 py-2 rounded-l-lg text-black"
value={inputUrl}
onChange={(e) => setInputUrl(e.target.value)}
placeholder="Paste video URL here"
/>
<button
className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
onClick={() => setVideoUrl(inputUrl)}
>
Load
</button>
</div>
<div className="mb-4 w-full max-w-xl">
<label className="block text-left mb-2 font-medium">Detection Prompt:</label>
<textarea
className="w-full p-2 rounded-lg text-black"
rows={3}
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
/>
</div>
<div className="relative w-full max-w-xl">
<video
ref={videoRef}
src={videoUrl}
controls
autoPlay
loop
className="w-full rounded-lg shadow-lg mb-2"
style={{ background: "#222" }}
/>
<canvas
ref={canvasRef}
className="absolute top-0 left-0 w-full h-full pointer-events-none"
                style={{ zIndex: 10 }}
/>
</div>
{processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
{error && <div className="text-red-400 mt-2">Error: {error}</div>}
</div>
)}
{mode === "File" && (
<div className="w-full text-center">
<p className="mb-4">Upload a video or image file for detection (coming soon).</p>
</div>
)}
</div>
</div>
</div>
);
}
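// Usage sketch (hypothetical wiring, not confirmed by this file): the view
// only needs a context that supplies { isLoaded, runInference } through
// useVLMContext, e.g.
//
//   import MultiSourceCaptioningView from "./components/MultiSourceCaptioningView";
//
//   export default function App() {
//     return (
//       <VLMProvider> {/* assumed provider name from ../context */}
//         <MultiSourceCaptioningView />
//       </VLMProvider>
//     );
//   }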