Quazim0t0 committed on
Commit
43aa15a
·
verified ·
1 Parent(s): 457f6cc

Upload 51 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -75,6 +75,40 @@ function denormalizeBox(box: number[], width: number, height: number) {
75
  return box;
76
  }
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  export default function MultiSourceCaptioningView() {
79
  const [mode, setMode] = useState<Mode>("File");
80
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
@@ -147,12 +181,8 @@ export default function MultiSourceCaptioningView() {
147
  if (!ctx) return;
148
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
149
  await runInference(video, prompt, (output: string) => {
150
- setDebugOutput(output); // <-- Ensure Raw Model Output is updated
151
- let boxes = extractJsonFromMarkdown(output) || [];
152
- if (boxes.length === 0 && Array.isArray(output)) {
153
- boxes = parseFlatBoxArray(output);
154
- }
155
- boxes = normalizeBoxes(boxes);
156
  // Box persistence logic (2 seconds)
157
  const now = Date.now();
158
  if (Array.isArray(boxes) && boxes.length > 0) {
@@ -165,7 +195,28 @@ export default function MultiSourceCaptioningView() {
165
  if (boxHistory.length > 0) {
166
  const scaleX = canvas.width / video.videoWidth;
167
  const scaleY = canvas.height / video.videoHeight;
168
- drawBoundingBoxesOnCanvas(ctx, boxHistory, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  }
170
  });
171
  };
@@ -281,11 +332,7 @@ export default function MultiSourceCaptioningView() {
281
  setDebugOutput(output);
282
  setInferenceStatus("Inference complete.");
283
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
284
- let boxes = extractJsonFromMarkdown(output) || [];
285
- if (boxes.length === 0 && Array.isArray(output)) {
286
- boxes = parseFlatBoxArray(output);
287
- }
288
- boxes = normalizeBoxes(boxes);
289
  console.log("Model output:", output);
290
  console.log("Boxes after normalization:", boxes);
291
  console.log("Canvas size:", canvas.width, canvas.height);
 
75
  return box;
76
  }
77
 
78
+ // Add this robust fallback parser near the top
79
+ function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
80
+ // Try to parse as JSON first
81
+ try {
82
+ const parsed = JSON.parse(output);
83
+ if (Array.isArray(parsed)) {
84
+ const result: { label: string, bbox_2d: number[] }[] = [];
85
+ for (const obj of parsed) {
86
+ if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
87
+ if (Array.isArray(obj.bbox_2d[0])) {
88
+ for (const arr of obj.bbox_2d) {
89
+ if (Array.isArray(arr) && arr.length === 4) {
90
+ result.push({ label: obj.label, bbox_2d: arr });
91
+ }
92
+ }
93
+ } else if (obj.bbox_2d.length === 4) {
94
+ result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
95
+ }
96
+ }
97
+ }
98
+ if (result.length > 0) return result;
99
+ }
100
+ } catch (e) {}
101
+ // Fallback: extract all [x1, y1, x2, y2] arrays from the string
102
+ const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
103
+ const boxes: { label: string, bbox_2d: number[] }[] = [];
104
+ let match;
105
+ while ((match = boxRegex.exec(output)) !== null) {
106
+ const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
107
+ boxes.push({ label: '', bbox_2d: arr });
108
+ }
109
+ return boxes;
110
+ }
111
+
112
  export default function MultiSourceCaptioningView() {
113
  const [mode, setMode] = useState<Mode>("File");
114
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
 
181
  if (!ctx) return;
182
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
183
  await runInference(video, prompt, (output: string) => {
184
+ setDebugOutput(output);
185
+ let boxes = extractAllBoundingBoxes(output);
 
 
 
 
186
  // Box persistence logic (2 seconds)
187
  const now = Date.now();
188
  if (Array.isArray(boxes) && boxes.length > 0) {
 
195
  if (boxHistory.length > 0) {
196
  const scaleX = canvas.width / video.videoWidth;
197
  const scaleY = canvas.height / video.videoHeight;
198
+ // Fix: Draw all boxes, even if bbox_2d is an array of arrays
199
+ const denormalizedBoxes: any[] = [];
200
+ for (const b of boxHistory) {
201
+ if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
202
+ // Multiple boxes per label
203
+ for (const arr of b.bbox_2d) {
204
+ if (Array.isArray(arr) && arr.length === 4) {
205
+ denormalizedBoxes.push({
206
+ ...b,
207
+ bbox_2d: denormalizeBox(arr, canvas.width, canvas.height)
208
+ });
209
+ }
210
+ }
211
+ } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
212
+ // Single box
213
+ denormalizedBoxes.push({
214
+ ...b,
215
+ bbox_2d: denormalizeBox(b.bbox_2d, canvas.width, canvas.height)
216
+ });
217
+ }
218
+ }
219
+ drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
220
  }
221
  });
222
  };
 
332
  setDebugOutput(output);
333
  setInferenceStatus("Inference complete.");
334
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
335
+ let boxes = extractAllBoundingBoxes(output);
 
 
 
 
336
  console.log("Model output:", output);
337
  console.log("Boxes after normalization:", boxes);
338
  console.log("Canvas size:", canvas.width, canvas.height);