import { useState, useRef, useEffect, type ChangeEvent } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

const MODES = ["File"] as const;
type Mode = typeof MODES[number];

const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
const EXAMPLE_PROMPT =
  "Detect each individual animated character in the image. The characters are moving. " +
  "For each character, output a JSON array of objects with label and bbox_2d fields. " +
  "Each character should have its own [x1, y1, x2, y2] box, where coordinates are in pixel values. " +
  "No two boxes should share the same coordinates. The box is drawn around the character using these points. " +
  "Example format for two boxes: [x1, y1, x2, y2], [x1, y1, x2, y2]";

function isImageFile(file: File) {
  return file.type.startsWith("image/");
}

function isVideoFile(file: File) {
  return file.type.startsWith("video/");
}

// If every coordinate lies in [0, 1], treat the box as normalized and scale it
// to pixel space; otherwise assume it is already in pixels.
function denormalizeBox(box: number[], width: number, height: number) {
  if (box.length === 4 && box.every((v) => v >= 0 && v <= 1)) {
    return [box[0] * width, box[1] * height, box[2] * width, box[3] * height];
  }
  return box;
}

// Robust fallback parser: try strict JSON first; if that fails or yields no
// usable boxes, extract every [x1, y1, x2, y2] array from the raw string.
function extractAllBoundingBoxes(output: string): { label: string; bbox_2d: number[] }[] {
  try {
    const parsed = JSON.parse(output);
    if (Array.isArray(parsed)) {
      const result: { label: string; bbox_2d: number[] }[] = [];
      for (const obj of parsed) {
        if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
          if (Array.isArray(obj.bbox_2d[0])) {
            // bbox_2d is an array of boxes sharing one label; flatten it
            for (const arr of obj.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                result.push({ label: obj.label, bbox_2d: arr });
              }
            }
          } else if (obj.bbox_2d.length === 4) {
            result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
          }
        }
      }
      if (result.length > 0) return result;
    }
  } catch {
    // Not valid JSON; fall through to the regex scan.
  }
  const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
  const boxes: { label: string; bbox_2d: number[] }[] = [];
  let match: RegExpExecArray | null;
  while ((match = boxRegex.exec(output)) !== null) {
    boxes.push({
      label: "",
      bbox_2d: [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])],
    });
  }
  return boxes;
}
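// Illustrative example (values assumed, not from the model): given output like
//   '[{"label": "cat", "bbox_2d": [0.1, 0.2, 0.5, 0.6]}] plus trailing text'
// JSON.parse throws on the trailing text, so the regex fallback still recovers
// [{ label: "", bbox_2d: [0.1, 0.2, 0.5, 0.6] }]; denormalizeBox then maps
// those fractions onto a 640x480 canvas as [64, 96, 320, 288].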
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [videoUrl] = useState(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState("");
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState("");
  const [canvasDims, setCanvasDims] = useState<{ w: number; h: number } | null>(null);
  const [videoDims, setVideoDims] = useState<{ w: number; h: number } | null>(null);
  const [inferenceStatus, setInferenceStatus] = useState("");

  const videoRef = useRef<HTMLVideoElement | null>(null);
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  const boxHistoryRef = useRef<any[]>([]);

  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Keep the overlay video in lockstep with the main video (play/pause/seek/time).
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    const onPlay = () => {
      if (overlay.paused) overlay.play();
    };
    const onPause = () => {
      if (!overlay.paused) overlay.pause();
    };
    // Re-sync whenever the two drift apart by more than 50 ms.
    const onSeekOrTime = () => {
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
        overlay.currentTime = main.currentTime;
      }
    };
    main.addEventListener("play", onPlay);
    main.addEventListener("pause", onPause);
    main.addEventListener("seeked", onSeekOrTime);
    main.addEventListener("timeupdate", onSeekOrTime);
    return () => {
      main.removeEventListener("play", onPlay);
      main.removeEventListener("pause", onPause);
      main.removeEventListener("seeked", onSeekOrTime);
      main.removeEventListener("timeupdate", onSeekOrTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);

  // Autoplay the hidden processing video when File mode becomes active.
  useEffect(() => {
    if (mode === "File" && processingVideoRef.current) {
      processingVideoRef.current.play().catch(() => {});
    }
  }, [mode, videoUrl, uploadedUrl]);

  const processVideoFrame = async () => {
    if (!processingVideoRef.current || !canvasRef.current) return;
    const video = processingVideoRef.current;
    const canvas = canvasRef.current;
    if (video.paused || video.ended || video.videoWidth === 0) return;
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    await runInference(video, prompt, (output: string) => {
      setDebugOutput(output);
      const boxes = extractAllBoundingBoxes(output);
      // Box persistence: keep boxes alive for 2 seconds so they survive the
      // gap between inference ticks.
      const now = Date.now();
      if (boxes.length > 0) {
        boxHistoryRef.current = boxHistoryRef.current.filter((b) => now - b.timestamp < 2000);
        boxHistoryRef.current.push(...boxes.map((box) => ({ ...box, timestamp: now })));
      }
      // Draw every box seen in the last 2 seconds.
      const boxHistory = boxHistoryRef.current.filter((b) => now - b.timestamp < 2000);
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      if (boxHistory.length > 0) {
        // Canvas was sized to the video above, so both scales are 1 here;
        // kept in case the sizing ever diverges.
        const scaleX = canvas.width / video.videoWidth;
        const scaleY = canvas.height / video.videoHeight;
        // Draw all boxes, even when bbox_2d is an array of arrays.
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes under one label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({ ...b, bbox_2d: denormalizeBox(arr, canvas.width, canvas.height) });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({ ...b, bbox_2d: denormalizeBox(b.bbox_2d, canvas.width, canvas.height) });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, {
          color: "#FF00FF",
          lineWidth: 4,
          font: "20px Arial",
          scaleX,
          scaleY,
        });
      }
    });
  };

  const handleFileChange = (e: ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };
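  // Timing sketch (values assumed for illustration): inference fires once per
  // second while a box persists for 2 s, so a detection stamped at t = 1000 ms
  // remains drawable until just before t = 3000 ms, bridging about two ticks.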
  // Uploaded video mode: process frames on a 1 s interval while enabled.
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    const interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => clearInterval(interval);
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video mode: process frames on a 1 s interval while enabled.
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    const interval = setInterval(() => {
      processVideoFrame();
    }, 1000);
    return () => clearInterval(interval);
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);

  // File mode: process an uploaded image (only on button click).
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({ w: canvas.width, h: canvas.height });
    setVideoDims({ w: img.naturalWidth, h: img.naturalHeight });
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      const boxes = extractAllBoundingBoxes(output);
      console.log("Model output:", output);
      console.log("Parsed boxes:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
        // Canvas matches the image's natural size, so both scales are 1.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        // Denormalize in case the model returned 0-1 coordinates, mirroring
        // the video path.
        const denormalized = boxes.map((b) => ({ ...b, bbox_2d: denormalizeBox(b.bbox_2d, canvas.width, canvas.height) }));
        drawBoundingBoxesOnCanvas(ctx, denormalized, { scaleX, scaleY });
      } else {
        setInferenceStatus("No boxes detected or model output invalid.");
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };

  // File mode: start/stop processing of the uploaded video's frames.
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev) => !prev);
  };

  // Start/stop processing of the example video.
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev) => !prev);
  };

  // Draw a fixed magenta test box to verify the canvas overlay works.
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40, canvas.width / 4), Math.max(40, canvas.height / 4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };
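  // Note on cadence: inference runs on the 1 s intervals above, while the
  // overlay redraw below runs every 100 ms, so persisted boxes are repainted
  // roughly ten times between model updates.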
  // Continuously repaint the overlay canvas from the box history.
  useEffect(() => {
    const draw = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      if (overlayVideo.videoWidth === 0) return;
      canvas.width = overlayVideo.videoWidth;
      canvas.height = overlayVideo.videoHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const now = Date.now();
      const boxHistory = boxHistoryRef.current.filter((b) => now - b.timestamp < 2000);
      if (boxHistory.length > 0) {
        const scaleX = canvas.width / overlayVideo.videoWidth;
        const scaleY = canvas.height / overlayVideo.videoHeight;
        // Draw all boxes, even when bbox_2d is an array of arrays.
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes under one label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({ ...b, bbox_2d: denormalizeBox(arr, canvas.width, canvas.height) });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({ ...b, bbox_2d: denormalizeBox(b.bbox_2d, canvas.width, canvas.height) });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, {
          color: "#FF00FF",
          lineWidth: 4,
          font: "20px Arial",
          scaleX,
          scaleY,
        });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    return () => clearInterval(interval);
  }, [overlayVideoRef, canvasRef]);
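  // Apparent layout (inferred from the refs; wrapper markup and styling in the
  // JSX below are assumptions): the main video is the one the user controls,
  // the overlay video mirrors it beneath the annotation canvas, and the hidden
  // processing video supplies frames to runInference.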
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
{inferenceStatus}
{/* Mode Selector */}
{MODES.map((m) => ( ))}
{/* Mode Content */}
{mode === "File" && (