/*
 * FastVLMBoxes/src/components/MultiSourceCaptioningView.tsx
 * Uploaded by Quazim0t0 ("Upload 37 files", commit c514e33, verified), 20.2 kB.
 */
import { useState, useRef, useEffect } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
/** Input sources the view supports; one selector button is rendered per entry, in this order. */
const MODES = ["Webcam", "URL", "File"] as const;
/** Union of the literal mode names listed in MODES. */
type Mode = typeof MODES[number];
/** Video path used as the initial URL-mode source and as the File-mode video when nothing is uploaded. */
const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
/** Default detection prompt: asks for a JSON array of { label, bbox_2d } objects with pixel coordinates. */
const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
/**
 * Converts a "flat" detection array of the form
 * `[label, bbox, bbox, ..., label, bbox, ...]` into box objects.
 *
 * The input is only accepted when it starts with a label string immediately
 * followed by an array; anything else yields `[]`. After that, every string
 * switches the current label and every array is emitted as a box carrying the
 * current label. Other values (null, numbers, objects) are ignored instead of
 * being passed through as bogus bounding boxes.
 *
 * Box contents are not validated here, so nested `[[x1, y1], [x2, y2]]`
 * entries survive for a later normalization step.
 *
 * @param arr Flat array as produced by the model.
 * @returns One `{ label, bbox_2d }` object per array entry found.
 */
function parseFlatBoxArray(arr: readonly unknown[]): { label: string, bbox_2d: number[] }[] {
  const first = arr[0];
  if (typeof first !== "string" || !Array.isArray(arr[1])) {
    return [];
  }
  const boxes: { label: string; bbox_2d: number[] }[] = [];
  let label = first;
  for (const entry of arr.slice(1)) {
    if (typeof entry === "string") {
      // A new label applies to all boxes that follow it.
      label = entry;
    } else if (Array.isArray(entry)) {
      boxes.push({ label, bbox_2d: entry });
    }
  }
  return boxes;
}
/**
 * Coerces parsed model output into a list of box objects whose `bbox_2d` is
 * the flat form `[x1, y1, x2, y2]`.
 *
 * - An array is treated as a list of candidate boxes; a single non-null
 *   object is wrapped into a one-element list; anything else yields `[]`.
 * - Candidates that are not plain objects (null, primitives, nested arrays)
 *   are dropped rather than dereferenced, so malformed model output cannot
 *   throw here.
 * - A `bbox_2d` given as two corner pairs `[[x1, y1], [x2, y2]]` is flattened.
 *   Any other `bbox_2d` value is passed through unchanged; it is not
 *   validated against the declared `number[]` type.
 *
 * @param raw Parsed JSON value of unknown shape.
 * @returns Shallow copies of the accepted candidates with `bbox_2d` normalized.
 */
function normalizeBoxes(raw: unknown): { label: string, bbox_2d: number[] }[] {
  type Box = { label: string; bbox_2d: number[] };

  let candidates: unknown[] = [];
  if (Array.isArray(raw)) {
    candidates = raw;
  } else if (typeof raw === "object" && raw !== null) {
    candidates = [raw];
  }

  const boxes: Box[] = [];
  for (const entry of candidates) {
    if (typeof entry !== "object" || entry === null || Array.isArray(entry)) {
      continue;
    }
    const box = entry as Box;
    let bbox: unknown = box.bbox_2d;
    // Only flatten when both corner pairs are present; a lone pair is left as-is.
    if (Array.isArray(bbox) && Array.isArray(bbox[0]) && Array.isArray(bbox[1])) {
      bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
    }
    boxes.push({ ...box, bbox_2d: bbox as number[] });
  }
  return boxes;
}
/** Reports whether the file's MIME type begins with the "image/" prefix. */
function isImageFile(file: File) {
  const mimeType = file.type;
  return mimeType.indexOf("image/") === 0;
}
/** Reports whether the file's MIME type begins with the "video/" prefix. */
function isVideoFile(file: File) {
  const mimeType = file.type;
  return mimeType.indexOf("video/") === 0;
}
export default function MultiSourceCaptioningView() {
// Active input source; the view starts in "File" mode.
const [mode, setMode] = useState<Mode>("File");
// URL bound to the URL-mode <video>; only changes when the "Load" button is clicked.
const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
// Draft text of the URL input field (committed to videoUrl on "Load").
const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
// Prompt passed to runInference for every frame or image.
const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
// True while handleProcessImage is awaiting inference; the video frame loops never set it.
const [processing, setProcessing] = useState(false);
// Last user-facing error message, or null when there is none.
const [error, setError] = useState<string | null>(null);
// True once the webcam stream has been attached to the <video> element.
const [webcamActive, setWebcamActive] = useState(false);
// File chosen in File mode, or null (in which case the example video is shown).
const [uploadedFile, setUploadedFile] = useState<File | null>(null);
// Object URL created for uploadedFile; empty string when no file is selected.
const [uploadedUrl, setUploadedUrl] = useState<string>("");
// Start/stop flag for the uploaded-video frame loop.
const [videoProcessing, setVideoProcessing] = useState(false);
// True after the uploaded image has been run through inference at least once; reset on file change.
const [imageProcessed, setImageProcessed] = useState(false);
// Start/stop flag for the example-video frame loop.
const [exampleProcessing, setExampleProcessing] = useState(false);
// Start/stop flag for the URL-video frame loop.
const [urlProcessing, setUrlProcessing] = useState(false);
// Most recent raw model output, shown in the debug panel.
const [debugOutput, setDebugOutput] = useState<string>("");
// Debug-panel dimensions; both are written only by handleProcessImage, from the image's natural size.
const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
// Status line rendered below the model banner.
const [inferenceStatus, setInferenceStatus] = useState<string>("");
// DOM handles. Only one <video>/<canvas> is mounted at a time, so each ref is shared across modes.
const videoRef = useRef<HTMLVideoElement | null>(null);
const canvasRef = useRef<HTMLCanvasElement | null>(null);
const imageRef = useRef<HTMLImageElement | null>(null);
// Live webcam MediaStream, kept so its tracks can be stopped on teardown.
const webcamStreamRef = useRef<MediaStream | null>(null);
// Model state and inference entry point from the VLM context; `error` is aliased to
// modelError so it does not clash with the local error state above.
const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
// Set while a frame's inference is pending, so the 1 s interval loops cannot
// stack overlapping runs when inference takes longer than one tick.
const frameInFlightRef = useRef(false);
/**
 * Processes the current frame of the mounted <video>:
 * copies it onto the overlay canvas, runs inference on the video element with
 * the current prompt, and draws every box that can be parsed from the output.
 *
 * Does nothing when a previous frame is still being processed, when the video
 * or canvas is not mounted, or when the video is paused, ended, or has no
 * decoded dimensions yet. Inference failures are reported through `error`
 * instead of surfacing as unhandled promise rejections.
 */
const processVideoFrame = async () => {
  if (frameInFlightRef.current) return;
  const video = videoRef.current;
  const canvas = canvasRef.current;
  if (!video || !canvas) return;
  if (video.paused || video.ended || video.videoWidth === 0) return;
  // Match the canvas bitmap to the video so pixel-space boxes line up with the frame.
  canvas.width = video.videoWidth;
  canvas.height = video.videoHeight;
  const ctx = canvas.getContext("2d");
  if (!ctx) return;
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
  frameInFlightRef.current = true;
  try {
    await runInference(video, prompt, (output: string) => {
      setDebugOutput(output); // keep the "Raw Model Output" panel in sync
      let boxes = extractJsonFromMarkdown(output) || [];
      // Fallback for output that arrives as a flat [label, bbox, ...] array.
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      drawBoundingBoxesOnCanvas(ctx, boxes);
    });
  } catch (e) {
    setError("Inference failed: " + (e instanceof Error ? e.message : String(e)));
  } finally {
    frameInFlightRef.current = false;
  }
};
// Object URL currently handed to the <img>/<video>, tracked so it can be revoked.
// Object URLs keep the underlying File alive until they are revoked explicitly.
const objectUrlRef = useRef<string | null>(null);
// Release the last object URL when the component unmounts.
useEffect(() => {
  return () => {
    if (objectUrlRef.current) {
      URL.revokeObjectURL(objectUrlRef.current);
      objectUrlRef.current = null;
    }
  };
}, []);
/**
 * Handles a new selection in the file input: stores the file (or null when the
 * selection was cleared), swaps in a fresh object URL for it, and resets the
 * error message and the per-file processing flags.
 */
const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
  const file = e.target.files?.[0] ?? null;
  // Revoke the URL of the previously selected file before replacing it.
  if (objectUrlRef.current) {
    URL.revokeObjectURL(objectUrlRef.current);
  }
  const nextUrl = file ? URL.createObjectURL(file) : null;
  objectUrlRef.current = nextUrl;
  setUploadedFile(file);
  setUploadedUrl(nextUrl ?? "");
  setError(null);
  setImageProcessed(false);
  setVideoProcessing(false);
  setExampleProcessing(false);
};
// Webcam setup and teardown.
// Entering "Webcam" mode requests a camera stream and attaches it to the <video>;
// leaving the mode (or unmounting) stops every track of that stream.
useEffect(() => {
  const stopStream = () => {
    if (webcamStreamRef.current) {
      webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
      webcamStreamRef.current = null;
    }
  };
  if (mode !== "Webcam") {
    stopStream();
    setWebcamActive(false);
    return;
  }
  // Flipped by the cleanup. getUserMedia can resolve after the user has already
  // left Webcam mode; without this flag that late stream would be stored after
  // the cleanup ran and the camera would stay on.
  let cancelled = false;
  const setupWebcam = async () => {
    try {
      setError(null);
      const stream = await navigator.mediaDevices.getUserMedia({ video: true });
      if (cancelled) {
        stream.getTracks().forEach((track: MediaStreamTrack) => track.stop());
        return;
      }
      webcamStreamRef.current = stream;
      if (videoRef.current) {
        videoRef.current.srcObject = stream;
        setWebcamActive(true);
      }
    } catch (e) {
      if (cancelled) return;
      setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
      setWebcamActive(false);
    }
  };
  void setupWebcam();
  return () => {
    cancelled = true;
    stopStream();
    setWebcamActive(false);
  };
}, [mode]);
// Frame loop shared by every video source. It runs once per second while the
// model is loaded and the active source is switched on:
//   - Webcam: the stream is attached (webcamActive)
//   - URL:    urlProcessing
//   - File:   an uploaded video with videoProcessing, or, when nothing is
//             uploaded, the example video with exampleProcessing
const frameLoopActive =
  isLoaded &&
  ((mode === "Webcam" && webcamActive) ||
    (mode === "URL" && urlProcessing) ||
    (mode === "File" &&
      (uploadedFile ? isVideoFile(uploadedFile) && videoProcessing : exampleProcessing)));
useEffect(() => {
  if (!frameLoopActive) return;
  const interval = setInterval(() => {
    // processVideoFrame is async; catch here so a failed frame is reported
    // instead of becoming an unhandled promise rejection.
    processVideoFrame().catch((e: unknown) => {
      setError("Frame processing failed: " + (e instanceof Error ? e.message : String(e)));
    });
  }, 1000);
  return () => clearInterval(interval);
  // prompt and runInference are listed so the interval is recreated with a
  // processVideoFrame closure that sees their current values.
}, [frameLoopActive, prompt, runInference]);
// File mode: process uploaded image (only on button click).
/**
 * Runs inference on the uploaded image and draws the detected boxes over it.
 *
 * Does nothing unless the model is loaded, an image file is selected, and both
 * the <img> and the canvas are mounted. Reports an error if the image has not
 * finished loading. `processing` is always reset when the run ends, including
 * when inference throws.
 */
const handleProcessImage = async () => {
  if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile)) return;
  const img = imageRef.current;
  const canvas = canvasRef.current;
  if (!img || !canvas) return;
  // A not-yet-decoded image has naturalWidth 0, which would produce an empty canvas.
  if (!img.complete || img.naturalWidth === 0) {
    setError("Image has not finished loading yet.");
    return;
  }
  canvas.width = img.naturalWidth;
  canvas.height = img.naturalHeight;
  setCanvasDims({ w: canvas.width, h: canvas.height });
  setVideoDims({ w: img.naturalWidth, h: img.naturalHeight });
  const ctx = canvas.getContext("2d");
  if (!ctx) return;
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
  setProcessing(true);
  setError(null);
  setInferenceStatus("Running inference...");
  try {
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      // Repaint the image so boxes from an earlier callback or run are wiped.
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      let boxes = extractJsonFromMarkdown(output) || [];
      // Fallback for output that arrives as a flat [label, bbox, ...] array.
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      drawBoundingBoxesOnCanvas(ctx, boxes);
      setImageProcessed(true);
    });
  } catch (e) {
    setError("Inference failed: " + (e instanceof Error ? e.message : String(e)));
    setInferenceStatus("Inference failed.");
  } finally {
    setProcessing(false);
  }
};
// Start/stop switches for the three per-second frame loops. Each one flips its own flag.

// Uploaded video (File mode).
const handleToggleVideoProcessing = () => {
  setVideoProcessing((wasRunning) => !wasRunning);
};

// Example video (File mode, nothing uploaded).
const handleToggleExampleProcessing = () => {
  setExampleProcessing((wasRunning) => !wasRunning);
};

// URL video (URL mode).
const handleToggleUrlProcessing = () => {
  setUrlProcessing((wasRunning) => !wasRunning);
};
// Debug aid: wipes the overlay canvas and paints a labelled magenta rectangle,
// to confirm that the canvas is visible and positioned over the media.
const handleTestDrawBox = () => {
  const canvas = canvasRef.current;
  if (!canvas) return;
  const ctx = canvas.getContext("2d");
  if (!ctx) return;

  const magenta = "#FF00FF";
  // A quarter of the canvas in each direction, but never smaller than 40px.
  const boxWidth = Math.max(40, canvas.width / 4);
  const boxHeight = Math.max(40, canvas.height / 4);

  ctx.clearRect(0, 0, canvas.width, canvas.height);

  ctx.strokeStyle = magenta;
  ctx.lineWidth = 4;
  ctx.strokeRect(40, 40, boxWidth, boxHeight);

  ctx.font = "20px Arial";
  ctx.fillStyle = magenta;
  ctx.fillText("Test Box", 50, 35);
};
// ---- Render ---------------------------------------------------------------
// Layout note: the overlay canvas is absolutely positioned with w-full/h-full,
// so it takes the size of its nearest positioned ancestor. Each media element
// therefore sits in its own `relative` wrapper holding nothing but the media
// and the canvas. With the button (and the media's bottom margin) inside that
// wrapper, the canvas would be stretched taller than the media and the boxes
// would be drawn vertically off-target.

// Banner text for the model state: loading, loaded, failed, or not started.
const modelStatusText = isLoading
  ? "Loading model..."
  : isLoaded
    ? "Model loaded"
    : modelError
      ? `Model error: ${modelError}`
      : "Model not loaded";

// Prompt editor shared by all three modes.
const promptEditor = (
  <div className="mb-4 w-full max-w-xl">
    <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
    <textarea
      className="w-full p-2 rounded-lg text-black"
      rows={3}
      value={prompt}
      onChange={(e) => setPrompt(e.target.value)}
    />
  </div>
);

// Click-through canvas stacked on top of the media element; boxes are drawn here.
// Only one media stage is mounted at a time, so sharing canvasRef is safe.
const overlayCanvas = (
  <canvas
    ref={canvasRef}
    className="absolute top-0 left-0 w-full h-full pointer-events-none"
    style={{ zIndex: 10, pointerEvents: "none" }}
  />
);

// Looping, auto-playing <video> with controls plus its overlay (URL / uploaded / example video).
const renderVideoStage = (src: string) => (
  <div className="relative mb-2">
    <video
      ref={videoRef}
      src={src}
      controls
      autoPlay
      loop
      className="block w-full rounded-lg shadow-lg"
      style={{ background: "#222" }}
    />
    {overlayCanvas}
  </div>
);

// Start/stop button for one of the frame loops.
const renderToggleButton = (running: boolean, onToggle: () => void) => (
  <button
    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
    onClick={onToggle}
  >
    {running ? "Stop Processing" : "Start Processing"}
  </button>
);

// Progress and error lines shown below the media in every mode.
const statusMessages = (
  <>
    {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
    {error && <div className="text-red-400 mt-2">Error: {error}</div>}
  </>
);

// Debug tools shown in URL and File modes: canvas test button and raw model output.
const debugTools = (
  <>
    <button
      className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
      onClick={handleTestDrawBox}
    >
      Test Draw Box
    </button>
    <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
      <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
      <div>Raw Model Output:</div>
      <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
    </div>
  </>
);

return (
  <div className="absolute inset-0 text-white">
    <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
      {modelStatusText}
    </div>
    <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
    <div className="flex flex-col items-center justify-center h-full w-full">
      {/* Mode Selector */}
      <div className="mb-6">
        <div className="flex space-x-4">
          {MODES.map((m) => (
            <button
              key={m}
              className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
              }`}
              onClick={() => setMode(m)}
            >
              {m}
            </button>
          ))}
        </div>
      </div>
      {/* Mode Content */}
      <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
        {mode === "Webcam" && (
          <div className="w-full text-center flex flex-col items-center">
            {promptEditor}
            {/* The stream is attached through videoRef.srcObject, so there is no src attribute. */}
            <div className="relative w-full max-w-xl mb-2">
              <video
                ref={videoRef}
                autoPlay
                muted
                playsInline
                className="block w-full rounded-lg shadow-lg"
                style={{ background: "#222" }}
              />
              {overlayCanvas}
            </div>
            {statusMessages}
          </div>
        )}
        {mode === "URL" && (
          <div className="w-full text-center flex flex-col items-center">
            <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
            <div className="flex w-full max-w-xl mb-4">
              <input
                type="text"
                className="flex-1 px-4 py-2 rounded-l-lg text-black"
                value={inputUrl}
                onChange={(e) => setInputUrl(e.target.value)}
                placeholder="Paste video URL here"
              />
              <button
                className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
                onClick={() => setVideoUrl(inputUrl)}
              >
                Load
              </button>
            </div>
            {promptEditor}
            <div className="w-full max-w-xl">
              {renderVideoStage(videoUrl)}
              {renderToggleButton(urlProcessing, handleToggleUrlProcessing)}
            </div>
            {statusMessages}
            {debugTools}
          </div>
        )}
        {mode === "File" && (
          <div className="w-full text-center flex flex-col items-center">
            {promptEditor}
            <div className="mb-4 w-full max-w-xl">
              <input
                type="file"
                accept="image/*,video/*"
                onChange={handleFileChange}
                className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
              />
            </div>
            {/* Show uploaded image */}
            {uploadedFile && isImageFile(uploadedFile) && (
              <div className="w-full max-w-xl">
                <div className="relative mb-2">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="block w-full rounded-lg shadow-lg"
                    style={{ background: "#222" }}
                  />
                  {overlayCanvas}
                </div>
                {/* handleProcessImage is a no-op until the model is loaded, so the button is disabled until then. */}
                <button
                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                  onClick={handleProcessImage}
                  disabled={processing || !isLoaded}
                >
                  {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                </button>
              </div>
            )}
            {/* Show uploaded video */}
            {uploadedFile && isVideoFile(uploadedFile) && (
              <div className="w-full max-w-xl">
                {renderVideoStage(uploadedUrl)}
                {renderToggleButton(videoProcessing, handleToggleVideoProcessing)}
              </div>
            )}
            {/* Show example video if no file uploaded */}
            {!uploadedFile && (
              <div className="w-full max-w-xl">
                {renderVideoStage(EXAMPLE_VIDEO_URL)}
                {renderToggleButton(exampleProcessing, handleToggleExampleProcessing)}
              </div>
            )}
            {statusMessages}
            {debugTools}
          </div>
        )}
      </div>
    </div>
  </div>
);
}