import React, { useState, useRef, useEffect } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

/** Input sources the captioning view can consume. */
const MODES = ["Webcam", "URL", "File"] as const;
type Mode = typeof MODES[number];

/** Demo video served from the app's public assets. */
const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
/** Default detection prompt: asks the VLM for a JSON array of labeled pixel-space boxes. */
const EXAMPLE_PROMPT =
  "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";

/** One detection: a text label plus an axis-aligned [x1, y1, x2, y2] pixel box. */
type DetectedBox = { label: string; bbox_2d: number[] };

/**
 * Parse the "flat" model output shape `["label", [x1,y1,x2,y2], [x1,y1,x2,y2], ...]`
 * into one DetectedBox per bbox, all sharing the leading label.
 * Returns [] when the input does not match that shape.
 */
function parseFlatBoxArray(arr: any[]): DetectedBox[] {
  if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
    const label = arr[0];
    // Drop tail entries that are not arrays: previously they produced
    // malformed { bbox_2d: <non-array> } objects that normalizeBoxes had
    // to reject downstream; filtering here keeps the result well-formed.
    return arr
      .slice(1)
      .filter((bbox) => Array.isArray(bbox))
      .map((bbox) => ({ label, bbox_2d: bbox }));
  }
  return [];
}

/**
 * Coerce arbitrary model output into a clean list of DetectedBox.
 *
 * Accepted containers: an object carrying an `image` array, a bare array of
 * box objects, or a single box object. Each candidate's `bbox_2d` may be the
 * corner-pair form [[x1, y1], [x2, y2]] (flattened here) or the flat form
 * [x1, y1, x2, y2]; anything else is dropped.
 */
function normalizeBoxes(raw: any): DetectedBox[] {
  if (!raw) return [];
  // Fix: the original `let boxes = []` evolved as any[]; type it explicitly.
  let candidates: any[] = [];
  if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
    candidates = raw.image;
  } else if (Array.isArray(raw)) {
    candidates = raw;
  } else if (typeof raw === "object" && raw !== null) {
    candidates = [raw];
  }
  return candidates
    .map((obj: any): DetectedBox | null => {
      if (!obj || !obj.bbox_2d) return null;
      let bbox = obj.bbox_2d;
      // Corner-pair form [[x1, y1], [x2, y2]] -> flat [x1, y1, x2, y2].
      if (
        Array.isArray(bbox) &&
        bbox.length === 2 &&
        Array.isArray(bbox[0]) &&
        Array.isArray(bbox[1]) &&
        bbox[0].length === 2 &&
        bbox[1].length === 2
      ) {
        bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
      }
      // Keep only a flat numeric [x1, y1, x2, y2]; everything else is skipped.
      if (Array.isArray(bbox) && bbox.length === 4 && bbox.every((v: any) => typeof v === "number")) {
        return { ...obj, bbox_2d: bbox };
      }
      return null;
    })
    // Fix: a typed predicate instead of `(obj: any) => obj`, so the return
    // type is DetectedBox[] rather than any[]. Mapped values are objects,
    // hence always truthy; filtering on !== null is behaviorally identical.
    .filter((obj): obj is DetectedBox => obj !== null);
}

/** True when the uploaded file's MIME type marks it as an image. */
function isImageFile(file: File) {
  return file.type.startsWith("image/");
}

/** True when the uploaded file's MIME type marks it as a video. */
function isVideoFile(file: File) {
  return file.type.startsWith("video/");
}

/**
 * Captioning view that runs VLM detection over one of three sources
 * (webcam stream, video URL, or uploaded image/video file) and overlays
 * the returned bounding boxes on a canvas.
 */
export default function MultiSourceCaptioningView() {
// --- UI & processing state -------------------------------------------------
  // NOTE(review): useState(null)/useRef(null) calls below carry no generic
  // parameters — type arguments appear to have been stripped when the file
  // was reformatted; confirm against the original source.
  const [mode, setMode] = useState("File");
  const [videoUrl, setVideoUrl] = useState(EXAMPLE_VIDEO_URL); // URL actually being played
  const [inputUrl, setInputUrl] = useState(EXAMPLE_VIDEO_URL); // URL typed into the input box
  const [prompt, setPrompt] = useState(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false); // image inference in flight
  const [error, setError] = useState(null);
  const [webcamActive, setWebcamActive] = useState(false);
  const [uploadedFile, setUploadedFile] = useState(null);
  const [uploadedUrl, setUploadedUrl] = useState(""); // object URL for the uploaded file
  const [videoProcessing, setVideoProcessing] = useState(false); // uploaded-video loop on/off
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false); // example-video loop on/off
  const [urlProcessing, setUrlProcessing] = useState(false); // URL-video loop on/off
  const [debugOutput, setDebugOutput] = useState(""); // raw model output for display
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState("");
  // --- DOM refs --------------------------------------------------------------
  const videoRef = useRef(null); // main <video> (webcam stream or file/URL playback)
  const overlayVideoRef = useRef(null); // NEW: overlay video
  const canvasRef = useRef(null); // canvas the boxes are drawn onto
  const imageRef = useRef(null); // <img> for still-image mode
  const webcamStreamRef = useRef(null); // MediaStream so tracks can be stopped on teardown
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
  // Persistent boxes state: {boxes: [...], timestamp: number}
  // Detections are kept for BOX_PERSIST_MS so boxes do not flicker between
  // inference passes (inference runs far less often than the draw loop).
  const [persistentBoxes, setPersistentBoxes] = useState<{boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]>([]);
  const BOX_PERSIST_MS = 2000; // 2 seconds
  // Helper: Add new boxes with timestamp.
  // Also prunes entries older than BOX_PERSIST_MS while appending.
  const addBoxesWithTimestamp = (boxes: {label: string, bbox_2d: number[]}[]) => {
    if (!boxes || boxes.length === 0) return;
    setPersistentBoxes((prev: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}[]) => [
      ...prev.filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => Date.now() - entry.timestamp < BOX_PERSIST_MS),
      { boxes, timestamp: Date.now() }
    ]);
  };
  // Helper: Get all boxes from last 2 seconds (flattened across entries).
  const getCurrentBoxes = () => {
    const now = Date.now();
    return persistentBoxes
      .filter((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => now - entry.timestamp < BOX_PERSIST_MS)
      .flatMap((entry: {boxes: {label: string, bbox_2d: number[]}[], timestamp: number}) => entry.boxes);
  };
  // Synchronize overlay video with main video
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    // Sync play/pause
    // NOTE(review): syncPlay is defined but never attached to any event —
    // presumably dead code or a missed addEventListener; confirm intent.
    const syncPlay = () => {
      if (main.paused !== overlay.paused) main.paused ? overlay.pause() : overlay.play();
    };
    main.addEventListener('play', () => overlay.play());
    main.addEventListener('pause', () => overlay.pause());
    // Sync seeking
    const syncTime = () => {
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) overlay.currentTime = main.currentTime;
    };
    main.addEventListener('seeked', syncTime);
    main.addEventListener('timeupdate', syncTime);
    // Clean up
    // NOTE(review): the 'play'/'pause' removeEventListener calls pass fresh
    // arrow functions, not the ones registered above, so those listeners are
    // never actually removed and leak across re-runs of this effect.
    return () => {
      main.removeEventListener('play', () => overlay.play());
      main.removeEventListener('pause', () => overlay.pause());
      main.removeEventListener('seeked', syncTime);
      main.removeEventListener('timeupdate', syncTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
  // Update: processVideoFrame now adds boxes to persistentBoxes.
  // Grabs the current video frame, runs inference on it, parses/normalizes
  // the model output, and stores any boxes in the persistent buffer.
  const processVideoFrame = async () => {
    if (!videoRef.current || !canvasRef.current) return;
    const video = videoRef.current;
    const canvas = canvasRef.current;
    if (video.paused || video.ended || video.videoWidth === 0) return;
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    await runInference(video, prompt, (output: string) => {
      setDebugOutput(output);
      let boxes = extractJsonFromMarkdown(output) || [];
      // output is declared string; this branch only fires if the callback can
      // actually deliver an array — TODO confirm runInference's payload type.
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        addBoxesWithTimestamp(boxes); // <-- Add to persistent state
      }
    });
  };
  // Draw persistent boxes on every frame (10 fps redraw loop, independent of
  // the 1 s inference cadence; boxes linger for BOX_PERSIST_MS).
  useEffect(() => {
    const draw = () => {
      if (!videoRef.current || !canvasRef.current) return;
      const video = videoRef.current;
      const canvas = canvasRef.current;
      if (video.videoWidth === 0) return;
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const boxes = getCurrentBoxes();
      if (boxes.length > 0) {
        // Canvas is sized to the video above, so both scales are 1 here.
        const scaleX = canvas.width / video.videoWidth;
        const scaleY = canvas.height / video.videoHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    return () => clearInterval(interval);
  }, [persistentBoxes, videoRef, canvasRef]);
  // Store the chosen file, mint an object URL for playback/preview, and reset
  // all per-source processing flags.
  // NOTE(review): React.ChangeEvent has no type argument (likely stripped);
  // presumably React.ChangeEvent<HTMLInputElement> — confirm.
  const handleFileChange = (e: React.ChangeEvent) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };
  // Webcam setup and teardown (unchanged)
  useEffect(() => {
    if (mode !== "Webcam") {
      // Leaving webcam mode: stop all tracks so the camera light turns off.
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
      return;
    }
    const setupWebcam = async () => {
      try {
        setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
      } catch (e) {
        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
        setWebcamActive(false);
      }
    };
    setupWebcam();
    return () => {
      if (webcamStreamRef.current) {
        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
        webcamStreamRef.current = null;
      }
      setWebcamActive(false);
    };
  }, [mode]);
  // Webcam mode: process frames with setInterval (one inference per second).
  useEffect(() => {
    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
    // NOTE(review): ReturnType has no type argument here (and in the three
    // effects below) — presumably ReturnType<typeof setInterval>, lost in
    // formatting; as written this does not type-check.
    let interval: ReturnType | null = null;
    interval = setInterval(() => { processVideoFrame(); }, 1000);
    return () => { if (interval) clearInterval(interval); };
  }, [mode, isLoaded, prompt, runInference, webcamActive]);
  // URL mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "URL" || !isLoaded || !urlProcessing) return;
    let interval: ReturnType | null = null;
    interval = setInterval(() => { processVideoFrame(); }, 1000);
    return () => { if (interval) clearInterval(interval); };
  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
  // File video mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    let interval: ReturnType | null = null;
    interval = setInterval(() => { processVideoFrame(); }, 1000);
    return () => { if (interval) clearInterval(interval); };
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
  // Example video mode: process frames with setInterval (only when no file
  // has been uploaded, i.e. the bundled example video is showing).
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    let interval: ReturnType | null = null;
    interval = setInterval(() => { processVideoFrame(); }, 1000);
    return () => { if (interval) clearInterval(interval); };
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
  // File mode: process uploaded image (only on button click).
  // Draws the image to the canvas, runs one inference pass, and overlays the
  // parsed boxes; console.log calls are left in as debugging aids.
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({w:canvas.width,h:canvas.height});
    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      // Redraw the image so boxes land on a clean frame.
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      let boxes = extractJsonFromMarkdown(output) || [];
      if (boxes.length === 0 && Array.isArray(output)) {
        boxes = parseFlatBoxArray(output);
      }
      boxes = normalizeBoxes(boxes);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // Canvas was sized to naturalWidth/Height above, so scales are 1.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };
  // File mode: process uploaded video frames (start/stop)
  const handleToggleVideoProcessing = () => { setVideoProcessing((prev) => !prev); };
  // Handle start/stop for example video processing
  const handleToggleExampleProcessing = () => { setExampleProcessing((prev) => !prev); };
  // Handle start/stop for URL video processing
  const handleToggleUrlProcessing = () => { setUrlProcessing((prev) => !prev); };
  // Test draw box function: paints a fixed magenta rectangle so the canvas
  // overlay pipeline can be checked without running the model.
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };
  return (
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
{inferenceStatus}
{/* Mode Selector */}
{MODES.map((m) => ( ))}
{/* Mode Content */}
{mode === "Webcam" && (