Quazim0t0 committed on
Commit
83c414e
·
verified ·
1 Parent(s): 9eb35d5

Upload 37 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -6,7 +6,15 @@ const MODES = ["Webcam", "URL", "File"] as const;
6
  type Mode = typeof MODES[number];
7
 
8
  const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
9
- const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";
 
 
 
 
 
 
 
 
10
 
11
  function isImageFile(file: File) {
12
  return file.type.startsWith("image/");
@@ -74,68 +82,69 @@ export default function MultiSourceCaptioningView() {
74
  };
75
  }, [mode]);
76
 
77
- // Process webcam frames (unchanged)
 
78
  useEffect(() => {
79
  if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
80
- let interval: ReturnType<typeof setInterval> | null = null;
81
- const processVideoFrame = async () => {
82
- if (!videoRef.current || !canvasRef.current) return;
83
- const video = videoRef.current;
84
- const canvas = canvasRef.current;
85
- if (video.videoWidth === 0) return;
86
- canvas.width = video.videoWidth;
87
- canvas.height = video.videoHeight;
88
- const ctx = canvas.getContext("2d");
89
- if (!ctx) return;
90
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
91
- await runInference(video, prompt, (output: string) => {
92
- setDebugOutput(output);
93
- setInferenceStatus("Inference complete.");
94
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
95
- const boxes = extractJsonFromMarkdown(output) || [];
96
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
97
- drawBoundingBoxesOnCanvas(ctx, boxes);
98
- });
99
- };
100
- interval = setInterval(() => {
101
- processVideoFrame();
102
- }, 1000);
103
- return () => {
104
- if (interval) clearInterval(interval);
105
- };
106
  }, [mode, isLoaded, prompt, runInference, webcamActive]);
107
 
108
- // URL mode: process video frames only when urlProcessing is true
109
  useEffect(() => {
110
  if (mode !== "URL" || !isLoaded || !urlProcessing) return;
111
- let interval: ReturnType<typeof setInterval> | null = null;
112
- const processVideoFrame = async () => {
113
- if (!videoRef.current || !canvasRef.current) return;
114
- const video = videoRef.current;
115
- const canvas = canvasRef.current;
116
- if (video.paused || video.ended || video.videoWidth === 0) return;
117
- canvas.width = video.videoWidth;
118
- canvas.height = video.videoHeight;
119
- const ctx = canvas.getContext("2d");
120
- if (!ctx) return;
121
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
122
- await runInference(video, prompt, (output: string) => {
123
- setDebugOutput(output);
124
- setInferenceStatus("Inference complete.");
125
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
126
- const boxes = extractJsonFromMarkdown(output) || [];
127
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
128
- drawBoundingBoxesOnCanvas(ctx, boxes);
129
- });
130
- };
131
- interval = setInterval(() => {
132
- processVideoFrame();
133
- }, 1000);
134
- return () => {
135
- if (interval) clearInterval(interval);
136
- };
137
  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  // File mode: process uploaded image (only on button click)
140
  const handleProcessImage = async () => {
141
  if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
@@ -155,7 +164,10 @@ export default function MultiSourceCaptioningView() {
155
  setDebugOutput(output);
156
  setInferenceStatus("Inference complete.");
157
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
158
- const boxes = extractJsonFromMarkdown(output) || [];
 
 
 
159
  if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
160
  drawBoundingBoxesOnCanvas(ctx, boxes);
161
  setImageProcessed(true);
@@ -164,79 +176,6 @@ export default function MultiSourceCaptioningView() {
164
  };
165
 
166
  // File mode: process uploaded video frames (start/stop)
167
- useEffect(() => {
168
- if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
169
- let interval: ReturnType<typeof setInterval> | null = null;
170
- const processVideoFrame = async () => {
171
- if (!videoRef.current || !canvasRef.current) return;
172
- const video = videoRef.current;
173
- const canvas = canvasRef.current;
174
- if (video.paused || video.ended || video.videoWidth === 0) return;
175
- canvas.width = video.videoWidth;
176
- canvas.height = video.videoHeight;
177
- const ctx = canvas.getContext("2d");
178
- if (!ctx) return;
179
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
180
- await runInference(video, prompt, (output: string) => {
181
- setDebugOutput(output);
182
- setInferenceStatus("Inference complete.");
183
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
184
- const boxes = extractJsonFromMarkdown(output) || [];
185
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
186
- drawBoundingBoxesOnCanvas(ctx, boxes);
187
- });
188
- };
189
- interval = setInterval(() => {
190
- processVideoFrame();
191
- }, 1000);
192
- return () => {
193
- if (interval) clearInterval(interval);
194
- };
195
- }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
196
-
197
- // File mode: process example video frames (start/stop)
198
- useEffect(() => {
199
- if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
200
- let interval: ReturnType<typeof setInterval> | null = null;
201
- const processVideoFrame = async () => {
202
- if (!videoRef.current || !canvasRef.current) return;
203
- const video = videoRef.current;
204
- const canvas = canvasRef.current;
205
- if (video.paused || video.ended || video.videoWidth === 0) return;
206
- canvas.width = video.videoWidth;
207
- canvas.height = video.videoHeight;
208
- const ctx = canvas.getContext("2d");
209
- if (!ctx) return;
210
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
211
- await runInference(video, prompt, (output: string) => {
212
- setDebugOutput(output);
213
- setInferenceStatus("Inference complete.");
214
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
215
- const boxes = extractJsonFromMarkdown(output) || [];
216
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
217
- drawBoundingBoxesOnCanvas(ctx, boxes);
218
- });
219
- };
220
- interval = setInterval(() => {
221
- processVideoFrame();
222
- }, 1000);
223
- return () => {
224
- if (interval) clearInterval(interval);
225
- };
226
- }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
227
-
228
- // Handle file upload
229
- const handleFileChange = (e: any) => {
230
- const file = e.target.files?.[0] || null;
231
- setUploadedFile(file);
232
- setUploadedUrl(file ? URL.createObjectURL(file) : "");
233
- setError(null);
234
- setImageProcessed(false);
235
- setVideoProcessing(false);
236
- setExampleProcessing(false);
237
- };
238
-
239
- // Handle start/stop for video processing
240
  const handleToggleVideoProcessing = () => {
241
  setVideoProcessing((prev) => !prev);
242
  };
 
6
  type Mode = typeof MODES[number];
7
 
8
  const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
9
+ const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
+
11
+ function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
12
+ if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
13
+ const label = arr[0];
14
+ return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
15
+ }
16
+ return [];
17
+ }
18
 
19
  function isImageFile(file: File) {
20
  return file.type.startsWith("image/");
 
82
  };
83
  }, [mode]);
84
 
85
+ // Replace setInterval-based frame processing with an async loop for all video modes
86
+ // Example for webcam mode:
87
  useEffect(() => {
88
  if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
89
+ let running = true;
90
+ async function processLoop() {
91
+ while (running) {
92
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
93
+ await processVideoFrame();
94
+ }
95
+ await new Promise(res => setTimeout(res, 1000)); // 1 FPS
96
+ }
97
+ }
98
+ processLoop();
99
+ return () => { running = false; };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  }, [mode, isLoaded, prompt, runInference, webcamActive]);
101
 
102
+ // Repeat for URL, File video, Example video modes:
103
  useEffect(() => {
104
  if (mode !== "URL" || !isLoaded || !urlProcessing) return;
105
+ let running = true;
106
+ async function processLoop() {
107
+ while (running) {
108
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
109
+ await processVideoFrame();
110
+ }
111
+ await new Promise(res => setTimeout(res, 1000));
112
+ }
113
+ }
114
+ processLoop();
115
+ return () => { running = false; };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
117
 
118
+ useEffect(() => {
119
+ if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
120
+ let running = true;
121
+ async function processLoop() {
122
+ while (running) {
123
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
124
+ await processVideoFrame();
125
+ }
126
+ await new Promise(res => setTimeout(res, 1000));
127
+ }
128
+ }
129
+ processLoop();
130
+ return () => { running = false; };
131
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
132
+
133
+ useEffect(() => {
134
+ if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
135
+ let running = true;
136
+ async function processLoop() {
137
+ while (running) {
138
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
139
+ await processVideoFrame();
140
+ }
141
+ await new Promise(res => setTimeout(res, 1000));
142
+ }
143
+ }
144
+ processLoop();
145
+ return () => { running = false; };
146
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
147
+
148
  // File mode: process uploaded image (only on button click)
149
  const handleProcessImage = async () => {
150
  if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
 
164
  setDebugOutput(output);
165
  setInferenceStatus("Inference complete.");
166
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
167
+ let boxes = extractJsonFromMarkdown(output) || [];
168
+ if (boxes.length === 0 && Array.isArray(output)) {
169
+ boxes = parseFlatBoxArray(output);
170
+ }
171
  if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
172
  drawBoundingBoxesOnCanvas(ctx, boxes);
173
  setImageProcessed(true);
 
176
  };
177
 
178
  // File mode: process uploaded video frames (start/stop)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
// Start/stop button handler: flips the video-processing flag via a functional
// update so it is safe against stale state.
const handleToggleVideoProcessing = () => {
  setVideoProcessing(current => !current);
};