Spaces:

Quazim0t0
/

FastVLMBoxes

Running

App Files Files Community

Quazim0t0 committed 2 days ago

Commit

35bd577

verified ·

1 Parent(s): 584b8d0

Upload 38 files

Browse files

Files changed (2) hide show

src/components/MultiSourceCaptioningView.tsx +135 -44
src/workers/inferenceWorker.ts +9 -0

src/components/MultiSourceCaptioningView.tsx CHANGED Viewed

@@ -62,6 +62,27 @@ function isVideoFile(file: File) {
   return file.type.startsWith("video/");
 }
 export default function MultiSourceCaptioningView() {
   const [mode, setMode] = useState<Mode>("File");
   const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
@@ -80,6 +101,8 @@ export default function MultiSourceCaptioningView() {
   const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
   const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
@@ -87,6 +110,31 @@ export default function MultiSourceCaptioningView() {
   const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
   const processVideoFrame = async () => {
     if (!videoRef.current || !canvasRef.current) return;
     const video = videoRef.current;
@@ -97,28 +145,46 @@ export default function MultiSourceCaptioningView() {
     const ctx = canvas.getContext("2d");
     if (!ctx) return;
     ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
-    await runInference(video, prompt, (output: string) => {
-      setDebugOutput(output); // <-- Ensure Raw Model Output is updated
-      let boxes = extractJsonFromMarkdown(output) || [];
-      if (boxes.length === 0 && Array.isArray(output)) {
-        boxes = parseFlatBoxArray(output);
-      }
-      boxes = normalizeBoxes(boxes);
-      console.log("Model output:", output);
-      console.log("Boxes after normalization:", boxes);
-      console.log("Canvas size:", canvas.width, canvas.height);
-      if (boxes.length > 0) {
-        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
-        console.log("First box coords:", x1, y1, x2, y2);
-      }
-      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-      if (Array.isArray(boxes) && boxes.length > 0) {
-        const scaleX = canvas.width / video.videoWidth;
-        const scaleY = canvas.height / video.videoHeight;
-        ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
-        drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
       }
-    });
   };
   const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
@@ -228,30 +294,55 @@ export default function MultiSourceCaptioningView() {
     setProcessing(true);
     setError(null);
     setInferenceStatus("Running inference...");
-    await runInference(img, prompt, (output: string) => {
-      setDebugOutput(output);
-      setInferenceStatus("Inference complete.");
-      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-      let boxes = extractJsonFromMarkdown(output) || [];
-      if (boxes.length === 0 && Array.isArray(output)) {
-        boxes = parseFlatBoxArray(output);
-      }
-      boxes = normalizeBoxes(boxes);
-      console.log("Model output:", output);
-      console.log("Boxes after normalization:", boxes);
-      console.log("Canvas size:", canvas.width, canvas.height);
-      if (boxes.length > 0) {
-        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
-        console.log("First box coords:", x1, y1, x2, y2);
-      }
-      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-      if (Array.isArray(boxes) && boxes.length > 0) {
-        const scaleX = canvas.width / img.naturalWidth;
-        const scaleY = canvas.height / img.naturalHeight;
-        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
       }
-      setImageProcessed(true);
-    });
     setProcessing(false);
   };

   return file.type.startsWith("video/");
 }
+/**
+ * Draws a video frame or an image onto a scratch canvas at the element's
+ * intrinsic resolution and returns the resulting pixels.
+ *
+ * @param media - Video element (its current frame is used) or image element.
+ * @returns The pixel data, or `null` when the element reports no positive
+ *   intrinsic size (e.g. nothing has loaded yet) or no 2D context is available.
+ * @throws DOMException (`SecurityError`) from `getImageData` if the canvas was
+ *   tainted by cross-origin media.
+ */
+function getImageDataFromElement(media: HTMLVideoElement | HTMLImageElement): ImageData | null {
+  let width: number;
+  let height: number;
+  if (media instanceof HTMLVideoElement) {
+    width = media.videoWidth;
+    height = media.videoHeight;
+  } else {
+    width = media.naturalWidth;
+    height = media.naturalHeight;
+  }
+  // getImageData throws IndexSizeError for a zero-sized rectangle, so bail out
+  // before touching the canvas. The negated form also rejects NaN/undefined.
+  if (!(width > 0 && height > 0)) return null;
+  const canvas = document.createElement("canvas");
+  canvas.width = width;
+  canvas.height = height;
+  const ctx = canvas.getContext("2d");
+  if (!ctx) return null;
+  ctx.drawImage(media, 0, 0, width, height);
+  return ctx.getImageData(0, 0, width, height);
+}
 export default function MultiSourceCaptioningView() {
   const [mode, setMode] = useState<Mode>("File");
   const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
   const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
   const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
+  const inferenceWorkerRef = useRef<Worker | null>(null);
+  const [useWorker, setUseWorker] = useState(true); // Toggle for worker usage
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
   const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
+  // Owns the inference worker's lifetime: a worker exists only while `useWorker`
+  // is on, and it is terminated whenever the toggle changes or the component unmounts.
+  useEffect(() => {
+    const disposeWorker = () => {
+      inferenceWorkerRef.current?.terminate();
+      inferenceWorkerRef.current = null;
+    };
+    if (!useWorker) return disposeWorker;
+    // Keep `new URL(...)` inline inside `new Worker(...)`; the expression is left
+    // in its original shape on purpose.
+    inferenceWorkerRef.current = new Worker(
+      new URL('../workers/inferenceWorker.ts', import.meta.url),
+      { type: 'module' }
+    );
+    return disposeWorker;
+  }, [useWorker]);
+  /**
+   * Sends one frame/image plus prompt to the inference worker and resolves with
+   * whatever the worker posts back.
+   *
+   * Only one request can be tracked at a time: the worker's `onmessage` /
+   * `onerror` slots are reused per call, so overlapping calls are not correlated.
+   *
+   * @param media - Element whose pixels are sent to the worker.
+   * @param prompt - Prompt forwarded alongside the pixels.
+   * @returns Promise resolving with the worker's message data; rejects with an
+   *   `Error` when there is no worker, no pixel data, or the worker reports an error.
+   */
+  const runInferenceInWorker = (media: HTMLVideoElement | HTMLImageElement, prompt: string) => {
+    return new Promise((resolve, reject) => {
+      // Capture the worker once so every step below talks to the same instance.
+      const worker = inferenceWorkerRef.current;
+      if (!worker) {
+        reject(new Error('No worker'));
+        return;
+      }
+      const imageData = getImageDataFromElement(media);
+      if (!imageData) {
+        reject(new Error('Could not get image data'));
+        return;
+      }
+      // Detach both handlers once the request settles so stale closures never fire.
+      const detach = () => {
+        worker.onmessage = null;
+        worker.onerror = null;
+      };
+      worker.onmessage = (event) => {
+        detach();
+        resolve(event.data);
+      };
+      worker.onerror = (err) => {
+        detach();
+        reject(new Error(err.message || 'Worker error'));
+      };
+      worker.postMessage({ imageData, prompt });
+    });
+  };
   const processVideoFrame = async () => {
     if (!videoRef.current || !canvasRef.current) return;
     const video = videoRef.current;
     const ctx = canvas.getContext("2d");
     if (!ctx) return;
     ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
+    if (useWorker && inferenceWorkerRef.current) {
+      try {
+        const output = await runInferenceInWorker(video, prompt);
+        setDebugOutput(JSON.stringify(output, null, 2));
+        let boxes = normalizeBoxes(output);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / video.videoWidth;
+          const scaleY = canvas.height / video.videoHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+      } catch (err) {
+        setInferenceStatus("Worker inference failed, falling back to main thread.");
+        // fallback to main-thread inference
+        await runInference(video, prompt, (output: string) => {
+          setDebugOutput(output);
+          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+          if (Array.isArray(boxes) && boxes.length > 0) {
+            const scaleX = canvas.width / video.videoWidth;
+            const scaleY = canvas.height / video.videoHeight;
+            ctx.clearRect(0, 0, canvas.width, canvas.height);
+            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+          }
+        });
       }
+    } else {
+      await runInference(video, prompt, (output: string) => {
+        setDebugOutput(output);
+        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / video.videoWidth;
+          const scaleY = canvas.height / video.videoHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+      });
+    }
   };
   const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
     setProcessing(true);
     setError(null);
     setInferenceStatus("Running inference...");
+    if (useWorker && inferenceWorkerRef.current) {
+      try {
+        const output = await runInferenceInWorker(img, prompt);
+        setDebugOutput(JSON.stringify(output, null, 2));
+        setInferenceStatus("Inference complete.");
+        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+        let boxes = normalizeBoxes(output);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / img.naturalWidth;
+          const scaleY = canvas.height / img.naturalHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+        setImageProcessed(true);
+      } catch (err) {
+        setInferenceStatus("Worker inference failed, falling back to main thread.");
+        // fallback to main-thread inference
+        await runInference(img, prompt, (output: string) => {
+          setDebugOutput(output);
+          setInferenceStatus("Inference complete.");
+          ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+          if (Array.isArray(boxes) && boxes.length > 0) {
+            const scaleX = canvas.width / img.naturalWidth;
+            const scaleY = canvas.height / img.naturalHeight;
+            ctx.clearRect(0, 0, canvas.width, canvas.height);
+            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+          }
+          setImageProcessed(true);
+        });
       }
+    } else {
+      await runInference(img, prompt, (output: string) => {
+        setDebugOutput(output);
+        setInferenceStatus("Inference complete.");
+        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / img.naturalWidth;
+          const scaleY = canvas.height / img.naturalHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+        setImageProcessed(true);
+      });
+    }
     setProcessing(false);
   };

src/workers/inferenceWorker.ts ADDED Viewed

	@@ -0,0 +1,9 @@

+// src/workers/inferenceWorker.ts
+// Placeholder inference worker: every incoming message is answered with the
+// same hard-coded detection list, regardless of the image or prompt received.
+self.onmessage = async (event) => {
+  // Request payload fields; the stub reads them but does not use them yet.
+  const { imageData, prompt } = event.data;
+  // TODO: Import and run your real model inference here
+  // For now, just echo a fake result for testing
+  const fakeDetections = [{ label: "person", bbox_2d: [[100, 50], [200, 300]] }];
+  self.postMessage(fakeDetections);
+};
+export {};