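// MultiSourceCaptioningView: runs FastVLM object detection over a webcam
// stream, a video URL, or an uploaded image/video file, and overlays the
// returned bounding boxes on a canvas above the media element.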
import React, { useState, useRef, useEffect, useCallback, useMemo } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

const MODES = ["Webcam", "URL", "File"] as const;
type Mode = typeof MODES[number];

const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
const EXAMPLE_PROMPT =
  "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
// Normalizes the model's raw JSON output into a flat list of
// { label, bbox_2d: [x1, y1, x2, y2] } objects. Accepts a bare array, a single
// object, or an { image: [...] } wrapper, and flattens [[x1, y1], [x2, y2]]
// corner pairs into a 4-number array. Entries without a valid bbox are dropped.
function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
  if (!raw) return [];
  let boxes: any[] = [];
  if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
    boxes = raw.image;
  } else if (Array.isArray(raw)) {
    boxes = raw;
  } else if (typeof raw === "object" && raw !== null) {
    boxes = [raw];
  }
  return boxes
    .map((obj: any) => {
      if (!obj || !obj.bbox_2d) return null;
      let bbox = obj.bbox_2d;
      // Flatten [[x1, y1], [x2, y2]] into [x1, y1, x2, y2]
      if (
        Array.isArray(bbox) &&
        bbox.length === 2 &&
        Array.isArray(bbox[0]) &&
        Array.isArray(bbox[1]) &&
        bbox[0].length === 2 &&
        bbox[1].length === 2
      ) {
        bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
      }
      if (
        Array.isArray(bbox) &&
        bbox.length === 4 &&
        bbox.every((v: any) => typeof v === "number")
      ) {
        return { ...obj, bbox_2d: bbox };
      }
      return null;
    })
    .filter((obj: any) => obj);
}
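// Example (a sketch of the shapes this accepts, not actual model output):
//   normalizeBoxes({ image: [{ label: "person", bbox_2d: [[10, 20], [110, 220]] }] })
//   // -> [{ label: "person", bbox_2d: [10, 20, 110, 220] }]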
function isImageFile(file: File) {
  return file.type.startsWith("image/");
}

function isVideoFile(file: File) {
  return file.type.startsWith("video/");
}
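// Note: file.type is the browser-reported MIME type and can be an empty string
// for files with unrecognized extensions; such files are treated as unsupported.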
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processingState, setProcessingState] = useState(false); // General processing indicator (drives the UI)
  const [error, setError] = useState<string | null>(null);
  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // Webcam stream
  const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // Most recent boxes to draw
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);

  // Refs for the two video elements and the canvas
  const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
  const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video the VLM reads frames from
  const canvasRef = useRef<HTMLCanvasElement>(null); // Canvas overlay for drawing boxes
  const imageRef = useRef<HTMLImageElement>(null); // For image file processing
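  // Ref-based busy flag: gates overlapping inference calls inside the interval
  // without appearing in effect dependencies (a state flag there would tear the
  // interval down mid-inference on every run).
  const inferenceBusyRef = useRef(false);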
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
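  // Create the object URL for an uploaded image once per file rather than on
  // every render, and revoke the previous URL when the file changes so blob
  // URLs are not leaked.
  const uploadedImageUrl = useMemo(
    () => (uploadedFile && isImageFile(uploadedFile) ? URL.createObjectURL(uploadedFile) : null),
    [uploadedFile]
  );
  useEffect(() => {
    return () => {
      if (uploadedImageUrl) URL.revokeObjectURL(uploadedImageUrl);
    };
  }, [uploadedImageUrl]);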
  // --- Drawing loop for the visible display ---
  // Runs every animation frame while the video plays, drawing the latest boxes
  // over the display video.
  const drawDisplayCanvas = useCallback(() => {
    const displayVideo = displayVideoRef.current;
    const canvas = canvasRef.current;
    const ctx = canvas?.getContext('2d');
    if (!displayVideo || !canvas || !ctx) {
      return;
    }
    // Match the canvas size to the display video's dimensions (only once the
    // video reports valid dimensions)
    if (
      displayVideo.videoWidth > 0 &&
      displayVideo.videoHeight > 0 &&
      (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)
    ) {
      canvas.width = displayVideo.videoWidth;
      canvas.height = displayVideo.videoHeight;
    }
    // Clear the canvas each frame, then draw the latest bounding boxes
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
    const scaleY = canvas.height / (displayVideo.videoHeight || 1);
    drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
    // Only request the next frame while the video is playing, to avoid
    // unnecessary redraws when paused or ended
    if (!displayVideo.paused && !displayVideo.ended) {
      requestAnimationFrame(drawDisplayCanvas);
    }
  }, [latestBoxes]); // Re-created when latestBoxes changes
  // Start the display drawing loop once the display video is ready
  useEffect(() => {
    const displayVideo = displayVideoRef.current;
    if (displayVideo) {
      const handleVideoReady = () => {
        if (displayVideo.readyState >= 1) { // HAVE_METADATA
          requestAnimationFrame(drawDisplayCanvas);
        }
      };
      displayVideo.addEventListener('loadedmetadata', handleVideoReady);
      displayVideo.addEventListener('play', handleVideoReady); // Also restart the loop on play
      // The video may already be ready (e.g. on component re-mount or autoplay)
      if (displayVideo.readyState >= 1) {
        requestAnimationFrame(drawDisplayCanvas);
      }
      return () => {
        displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
        displayVideo.removeEventListener('play', handleVideoReady);
      };
    }
  }, [drawDisplayCanvas]);
  // --- FastVLM processing loop (reads frames from the hidden video) ---
  // An interval controls how often FastVLM processes a frame.
  useEffect(() => {
    const vlmVideo = vlmVideoRef.current;
    // Are we in a video-based mode that requires continuous processing?
    const isVideoModeActive =
      mode === "Webcam" ||
      (mode === "URL" && !!vlmVideo?.src) ||
      (mode === "File" && !!vlmVideo?.src && !!uploadedFile && isVideoFile(uploadedFile));
    if (!isLoaded || !vlmVideo || !isVideoModeActive) {
      setProcessingState(false);
      return;
    }
    let interval: ReturnType<typeof setInterval> | null = null;
    const startVLMProcessing = () => {
      if (interval) clearInterval(interval); // Clear any old interval
      interval = setInterval(async () => {
        // Skip if the video is not ready, paused, ended, or a previous
        // inference is still in flight
        if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || inferenceBusyRef.current) {
          return;
        }
        inferenceBusyRef.current = true;
        setProcessingState(true);
        setInferenceStatus("Running inference...");
        setError(null);
        try {
          // Pass the HTMLVideoElement directly to runInference
          const modelOutput = await runInference(vlmVideo, prompt);
          setDebugOutput(modelOutput);
          let boxes = extractJsonFromMarkdown(modelOutput) || [];
          boxes = normalizeBoxes(boxes);
          setLatestBoxes(boxes);
          setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
        } catch (e) {
          setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
          setLatestBoxes([]);
          setInferenceStatus("Inference failed.");
        } finally {
          inferenceBusyRef.current = false;
          setProcessingState(false);
        }
      }, 200); // Inference interval: at most ~5 frames per second
    };
    const stopVLMProcessing = () => {
      if (interval) clearInterval(interval);
      interval = null;
      setProcessingState(false);
      setInferenceStatus("Stopped processing.");
    };
    vlmVideo.addEventListener('play', startVLMProcessing);
    vlmVideo.addEventListener('pause', stopVLMProcessing);
    vlmVideo.addEventListener('ended', stopVLMProcessing);
    vlmVideo.addEventListener('loadeddata', startVLMProcessing); // Also start on loadeddata for reliability
    // The video may already be playing when the effect attaches
    if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
      startVLMProcessing();
    }
    return () => {
      stopVLMProcessing();
      vlmVideo.removeEventListener('play', startVLMProcessing);
      vlmVideo.removeEventListener('pause', stopVLMProcessing);
      vlmVideo.removeEventListener('ended', stopVLMProcessing);
      vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
    };
    // processingState is intentionally not a dependency: tracking it here would
    // tear the interval down on every inference; inferenceBusyRef gates re-entry.
  }, [mode, isLoaded, prompt, runInference, uploadedFile]);
  // --- Media source handling ---
  // Stops any webcam stream, revokes blob URLs, and resets per-source state
  const cleanupMediaSource = useCallback(() => {
    if (mediaStream) {
      mediaStream.getTracks().forEach((track) => track.stop());
      setMediaStream(null);
    }
    if (displayVideoRef.current?.src.startsWith('blob:')) {
      URL.revokeObjectURL(displayVideoRef.current.src);
      displayVideoRef.current.src = "";
    }
    if (vlmVideoRef.current?.src.startsWith('blob:')) {
      URL.revokeObjectURL(vlmVideoRef.current.src);
      vlmVideoRef.current.src = "";
    }
    setLatestBoxes([]);
    setError(null);
    setInferenceStatus("");
    setDebugOutput("");
    setUploadedFile(null);
  }, [mediaStream]);
  // Reset media sources when the mode changes (Webcam, URL, File)
  useEffect(() => {
    cleanupMediaSource();
    const displayVideo = displayVideoRef.current;
    const vlmVideo = vlmVideoRef.current;
    if (!displayVideo || !vlmVideo) return;
    // Reset srcObject/src to ensure a fresh start
    displayVideo.srcObject = null;
    vlmVideo.srcObject = null;
    displayVideo.src = "";
    vlmVideo.src = "";
    // In "File" mode with no file selected, fall back to the example video
    if (mode === "File" && !uploadedFile) {
      displayVideo.src = EXAMPLE_VIDEO_URL;
      vlmVideo.src = EXAMPLE_VIDEO_URL;
      displayVideo.load();
      vlmVideo.load();
      displayVideo.play().catch((e) => console.error("Error playing example display video:", e));
      vlmVideo.play().catch((e) => console.error("Error playing example VLM video:", e));
    }
    // Depend on `mode` only: the upload/URL/webcam handlers set their own
    // sources, and re-running this cleanup whenever uploadedFile or
    // cleanupMediaSource changes would clobber a source that was just loaded.
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [mode]);
  // Webcam input
  const handleWebcamInput = useCallback(async () => {
    cleanupMediaSource();
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ video: true });
      setMediaStream(stream);
      if (displayVideoRef.current && vlmVideoRef.current) {
        displayVideoRef.current.srcObject = stream;
        vlmVideoRef.current.srcObject = stream;
        displayVideoRef.current.play().catch((e) => console.error("Error playing display video:", e));
        vlmVideoRef.current.play().catch((e) => console.error("Error playing VLM video:", e));
      }
      setMode("Webcam");
    } catch (e) {
      setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
      setMediaStream(null);
      setLatestBoxes([]);
      setInferenceStatus("Webcam access denied or failed.");
    }
  }, [cleanupMediaSource]);
  // URL input (when the Load button is clicked)
  const handleLoadUrl = useCallback(() => {
    cleanupMediaSource();
    const url = currentUrlInput.trim();
    if (!url) {
      setError("Please enter a valid URL.");
      return;
    }
    if (displayVideoRef.current && vlmVideoRef.current) {
      displayVideoRef.current.src = url;
      vlmVideoRef.current.src = url;
      displayVideoRef.current.load();
      vlmVideoRef.current.load();
      displayVideoRef.current.play().catch((e) => console.error("Error playing display video:", e));
      vlmVideoRef.current.play().catch((e) => console.error("Error playing VLM video:", e));
      setMode("URL");
    }
  }, [currentUrlInput, cleanupMediaSource]);
  // File input
  const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
    cleanupMediaSource();
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    if (file) {
      if (isImageFile(file)) {
        // Image files are displayed via the memoized object URL and processed
        // on demand by handleProcessImage; no video assignment is needed
        setMode("File");
      } else if (isVideoFile(file)) {
        // Create the blob URL only for video files; cleanupMediaSource revokes it later
        const fileUrl = URL.createObjectURL(file);
        if (displayVideoRef.current && vlmVideoRef.current) {
          displayVideoRef.current.src = fileUrl;
          vlmVideoRef.current.src = fileUrl;
          displayVideoRef.current.load();
          vlmVideoRef.current.load();
          displayVideoRef.current.play().catch((err) => console.error("Error playing display video:", err));
          vlmVideoRef.current.play().catch((err) => console.error("Error playing VLM video:", err));
          setMode("File");
        }
      } else {
        setError("Unsupported file type. Please upload an image or video.");
        setUploadedFile(null);
      }
    } else {
      // No file selected: revert to the example video if in File mode
      if (mode === "File") {
        if (displayVideoRef.current && vlmVideoRef.current) {
          displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
          vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
          displayVideoRef.current.load();
          vlmVideoRef.current.load();
          displayVideoRef.current.play().catch((err) => console.error("Error playing example display video:", err));
          vlmVideoRef.current.play().catch((err) => console.error("Error playing example VLM video:", err));
        }
      }
    }
  }, [cleanupMediaSource, mode]);
  // Process an uploaded image file (one-time inference)
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
      setError("Image or model not ready for processing, or no image file selected.");
      return;
    }
    const img = imageRef.current;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setProcessingState(true);
    setError(null);
    setInferenceStatus("Running image inference...");
    try {
      // Pass the HTMLImageElement directly to runInference
      const modelOutput = await runInference(img, prompt);
      setDebugOutput(modelOutput);
      setInferenceStatus("Image inference complete.");
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height); // Redraw the image beneath the boxes
      let boxes = extractJsonFromMarkdown(modelOutput) || [];
      boxes = normalizeBoxes(boxes);
      setLatestBoxes(boxes);
      // The display drawing loop only runs for video, so draw the boxes here.
      // The canvas matches the image's natural size and the boxes are in image
      // pixel coordinates, so no scaling is needed.
      drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
      if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
    } catch (e) {
      setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
      setLatestBoxes([]);
      setInferenceStatus("Image inference failed.");
    } finally {
      setProcessingState(false);
    }
  };
  // --- Rendered UI ---
  return (
    <div className="absolute inset-0 text-white flex flex-col">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center flex-1 w-full p-4">
        {/* Mode selector */}
        <div className="mb-6 mt-4">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
                disabled={!isLoaded && m !== "File"}
              >
                {m}
              </button>
            ))}
          </div>
        </div>
        {/* Dynamic content area */}
        <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
          {/* Prompt input (common to all modes) */}
          <div className="mb-4 w-full max-w-xl">
            <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
            <textarea
              className="w-full p-2 rounded-lg text-black"
              rows={3}
              value={prompt}
              onChange={(e) => setPrompt(e.target.value)}
              disabled={processingState}
            />
          </div>
          {/* Video/image display and canvas overlay */}
          <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
            {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
              <img
                ref={imageRef}
                src={uploadedImageUrl ?? undefined}
                alt="Uploaded"
                className="max-w-full max-h-full block object-contain"
                style={{ position: 'absolute' }}
                onLoad={() => {
                  if (imageRef.current && canvasRef.current) {
                    canvasRef.current.width = imageRef.current.naturalWidth;
                    canvasRef.current.height = imageRef.current.naturalHeight;
                  }
                }}
              />
            ) : (
              <video
                ref={displayVideoRef}
                autoPlay
                muted
                playsInline
                loop
                className="max-w-full max-h-full block object-contain"
                style={{ position: 'absolute' }}
              />
            )}
            <canvas
              ref={canvasRef}
              className="absolute top-0 left-0 w-full h-full pointer-events-none"
              style={{ zIndex: 10 }}
            />
          </div>
          {/* Controls specific to each mode */}
          <div className="mt-4 flex flex-col items-center gap-2">
            {mode === "Webcam" && (
              <button
                className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                onClick={handleWebcamInput}
                disabled={processingState || !isLoaded}
              >
                {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
              </button>
            )}
            {mode === "URL" && (
              <div className="flex w-full max-w-xl">
                <input
                  type="text"
                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
                  value={currentUrlInput}
                  onChange={(e) => setCurrentUrlInput(e.target.value)}
                  placeholder="Paste video URL here"
                  disabled={processingState}
                />
                <button
                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                  onClick={handleLoadUrl}
                  disabled={processingState || !isLoaded}
                >
                  Load URL
                </button>
              </div>
            )}
            {mode === "File" && (
              <>
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
                  disabled={processingState}
                />
                {uploadedFile && isImageFile(uploadedFile) && (
                  <button
                    className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                    onClick={handleProcessImage}
                    disabled={processingState || !isLoaded}
                  >
                    {processingState ? "Processing Image..." : "Process Image"}
                  </button>
                )}
              </>
            )}
          </div>
          {/* Error and debug output */}
          {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
          <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
            <div>Raw Model Output:</div>
            <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
          </div>
        </div>
      </div>
      {/* Hidden video for VLM processing; must always be rendered so the
          processing loop has a frame source */}
      <video
        ref={vlmVideoRef}
        autoPlay
        muted
        playsInline
        loop
        style={{ display: 'none' }}
      />
    </div>
  );
}