Quazim0t0 committed
Commit 98c6726 · verified · 1 Parent(s): f90ffff

Upload 50 files

.gitignore CHANGED
@@ -1,23 +1,23 @@
-# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
-
-# dependencies
-/node_modules
-/.pnp
-.pnp.js
-
-# testing
-/coverage
-
-# production
-/build
-
-# misc
-.DS_Store
-.env.local
-.env.development.local
-.env.test.local
-.env.production.local
-
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.js
+
+# testing
+/coverage
+
+# production
+/build
+
+# misc
+.DS_Store
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
public/index.html CHANGED
@@ -1,43 +1,43 @@
-<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <meta charset="utf-8" />
-    <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <meta name="theme-color" content="#000000" />
-    <meta
-      name="description"
-      content="Web site created using create-react-app"
-    />
-    <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
-    <!--
-      manifest.json provides metadata used when your web app is installed on a
-      user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
-    -->
-    <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
-    <!--
-      Notice the use of %PUBLIC_URL% in the tags above.
-      It will be replaced with the URL of the `public` folder during the build.
-      Only files inside the `public` folder can be referenced from the HTML.
-
-      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
-      work correctly both with client-side routing and a non-root public URL.
-      Learn how to configure a non-root public URL by running `npm run build`.
-    -->
-    <title>React App</title>
-  </head>
-  <body>
-    <noscript>You need to enable JavaScript to run this app.</noscript>
-    <div id="root"></div>
-    <!--
-      This HTML file is a template.
-      If you open it directly in the browser, you will see an empty page.
-
-      You can add webfonts, meta tags, or analytics to this file.
-      The build step will place the bundled scripts into the <body> tag.
-
-      To begin the development, run `npm start` or `yarn start`.
-      To create a production bundle, use `npm run build` or `yarn build`.
-    -->
-  </body>
-</html>
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <meta name="theme-color" content="#000000" />
+    <meta
+      name="description"
+      content="Web site created using create-react-app"
+    />
+    <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
+    <!--
+      manifest.json provides metadata used when your web app is installed on a
+      user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
+    -->
+    <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
+    <!--
+      Notice the use of %PUBLIC_URL% in the tags above.
+      It will be replaced with the URL of the `public` folder during the build.
+      Only files inside the `public` folder can be referenced from the HTML.
+
+      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
+      work correctly both with client-side routing and a non-root public URL.
+      Learn how to configure a non-root public URL by running `npm run build`.
+    -->
+    <title>React App</title>
+  </head>
+  <body>
+    <noscript>You need to enable JavaScript to run this app.</noscript>
+    <div id="root"></div>
+    <!--
+      This HTML file is a template.
+      If you open it directly in the browser, you will see an empty page.
+
+      You can add webfonts, meta tags, or analytics to this file.
+      The build step will place the bundled scripts into the <body> tag.
+
+      To begin the development, run `npm start` or `yarn start`.
+      To create a production bundle, use `npm run build` or `yarn build`.
+    -->
+  </body>
+</html>
public/manifest.json CHANGED
@@ -1,25 +1,25 @@
-{
-  "short_name": "React App",
-  "name": "Create React App Sample",
-  "icons": [
-    {
-      "src": "favicon.ico",
-      "sizes": "64x64 32x32 24x24 16x16",
-      "type": "image/x-icon"
-    },
-    {
-      "src": "logo192.png",
-      "type": "image/png",
-      "sizes": "192x192"
-    },
-    {
-      "src": "logo512.png",
-      "type": "image/png",
-      "sizes": "512x512"
-    }
-  ],
-  "start_url": ".",
-  "display": "standalone",
-  "theme_color": "#000000",
-  "background_color": "#ffffff"
-}
+{
+  "short_name": "React App",
+  "name": "Create React App Sample",
+  "icons": [
+    {
+      "src": "favicon.ico",
+      "sizes": "64x64 32x32 24x24 16x16",
+      "type": "image/x-icon"
+    },
+    {
+      "src": "logo192.png",
+      "type": "image/png",
+      "sizes": "192x192"
+    },
+    {
+      "src": "logo512.png",
+      "type": "image/png",
+      "sizes": "512x512"
+    }
+  ],
+  "start_url": ".",
+  "display": "standalone",
+  "theme_color": "#000000",
+  "background_color": "#ffffff"
+}
public/robots.txt CHANGED
@@ -1,3 +1,3 @@
-# https://www.robotstxt.org/robotstxt.html
-User-agent: *
-Disallow:
+# https://www.robotstxt.org/robotstxt.html
+User-agent: *
+Disallow:
src/App.css CHANGED
@@ -1,38 +1,38 @@
-.App {
-  text-align: center;
-}
-
-.App-logo {
-  height: 40vmin;
-  pointer-events: none;
-}
-
-@media (prefers-reduced-motion: no-preference) {
-  .App-logo {
-    animation: App-logo-spin infinite 20s linear;
-  }
-}
-
-.App-header {
-  background-color: #282c34;
-  min-height: 100vh;
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  font-size: calc(10px + 2vmin);
-  color: white;
-}
-
-.App-link {
-  color: #61dafb;
-}
-
-@keyframes App-logo-spin {
-  from {
-    transform: rotate(0deg);
-  }
-  to {
-    transform: rotate(360deg);
-  }
-}
+.App {
+  text-align: center;
+}
+
+.App-logo {
+  height: 40vmin;
+  pointer-events: none;
+}
+
+@media (prefers-reduced-motion: no-preference) {
+  .App-logo {
+    animation: App-logo-spin infinite 20s linear;
+  }
+}
+
+.App-header {
+  background-color: #282c34;
+  min-height: 100vh;
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  justify-content: center;
+  font-size: calc(10px + 2vmin);
+  color: white;
+}
+
+.App-link {
+  color: #61dafb;
+}
+
+@keyframes App-logo-spin {
+  from {
+    transform: rotate(0deg);
+  }
+  to {
+    transform: rotate(360deg);
+  }
+}
src/App.js CHANGED
@@ -1,25 +1,25 @@
-import logo from './logo.svg';
-import './App.css';
-
-function App() {
-  return (
-    <div className="App">
-      <header className="App-header">
-        <img src={logo} className="App-logo" alt="logo" />
-        <p>
-          Edit <code>src/App.js</code> and save to reload.
-        </p>
-        <a
-          className="App-link"
-          href="https://reactjs.org"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          Learn React
-        </a>
-      </header>
-    </div>
-  );
-}
-
-export default App;
+import logo from './logo.svg';
+import './App.css';
+
+function App() {
+  return (
+    <div className="App">
+      <header className="App-header">
+        <img src={logo} className="App-logo" alt="logo" />
+        <p>
+          Edit <code>src/App.js</code> and save to reload.
+        </p>
+        <a
+          className="App-link"
+          href="https://reactjs.org"
+          target="_blank"
+          rel="noopener noreferrer"
+        >
+          Learn React
+        </a>
+      </header>
+    </div>
+  );
+}
+
+export default App;
src/App.test.js CHANGED
@@ -1,8 +1,8 @@
-import { render, screen } from '@testing-library/react';
-import App from './App';
-
-test('renders learn react link', () => {
-  render(<App />);
-  const linkElement = screen.getByText(/learn react/i);
-  expect(linkElement).toBeInTheDocument();
-});
+import { render, screen } from '@testing-library/react';
+import App from './App';
+
+test('renders learn react link', () => {
+  render(<App />);
+  const linkElement = screen.getByText(/learn react/i);
+  expect(linkElement).toBeInTheDocument();
+});
src/App.tsx CHANGED
@@ -11,7 +11,8 @@ export default function App() {
       await loadModel();
       setStarted(true);
     } catch (e) {
-      // error is handled by context
+      // error is handled by context, could log here if needed
+      console.error("Failed to load model:", e);
     }
   };
 
@@ -27,6 +28,9 @@ export default function App() {
           {isLoading ? "Loading Model..." : "Load Model"}
         </button>
         {error && <div className="text-red-400 mt-2">Model error: {error}</div>}
+        <p className="text-sm text-gray-400 mt-2">
+          Model will download on first load. This may take a moment.
+        </p>
       </div>
     );
   }
@@ -37,4 +41,4 @@ export default function App() {
       <MultiSourceCaptioningView />
     </div>
   );
-}
+}
src/components/BoxAnnotator.ts CHANGED
@@ -16,6 +16,7 @@ export function extractJsonFromMarkdown(markdown: string): any[] | null {
     if (typeof parsed === "object" && parsed !== null) return [parsed]; // <-- Wrap object in array
     return null;
   } catch {
+    console.error("Failed to parse JSON from markdown:", jsonString);
     return null;
   }
 }
@@ -31,7 +32,15 @@ export function drawBoundingBoxesOnCanvas(
   boxes: { bbox_2d: number[]; label?: string }[],
   options?: { color?: string; lineWidth?: number; font?: string, scaleX?: number, scaleY?: number }
 ) {
-  if (!Array.isArray(boxes)) return; // Prevent errors if boxes is undefined/null
+  if (!Array.isArray(boxes)) {
+    console.warn("drawBoundingBoxesOnCanvas: 'boxes' is not an array or is null/undefined.", boxes);
+    return;
+  }
+  if (boxes.length === 0) {
+    // console.log("drawBoundingBoxesOnCanvas: 'boxes' array is empty, nothing to draw.");
+    return;
+  }
+
   const color = options?.color || "#00FF00";
   const lineWidth = options?.lineWidth || 2;
   const font = options?.font || "16px Arial";
@@ -54,9 +63,10 @@
     ctx.rect(sx1, sy1, sx2 - sx1, sy2 - sy1);
     ctx.stroke();
     if (obj.label) {
-      ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 10 ? sy1 + 16 : sy1 - 4);
+      // Adjust text position to ensure visibility, especially if near top edge
+      ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 16 ? sy1 + 16 : sy1 - 4);
    }
  });
 
  ctx.restore();
-}
+}
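
A minimal sketch of how these two exported helpers compose (the canvas lookup and the modelOutput string below are hypothetical stand-ins; only extractJsonFromMarkdown and drawBoundingBoxesOnCanvas come from the file above):

    import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

    // Hypothetical model reply: JSON wrapped in a markdown fence, as the detection prompt requests.
    const modelOutput = "```json\n[{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]\n```";

    const canvas = document.querySelector("canvas");
    const ctx = canvas?.getContext("2d");
    if (ctx) {
      // extractJsonFromMarkdown returns null when no JSON can be parsed, so default to []
      const boxes = extractJsonFromMarkdown(modelOutput) ?? [];
      // scaleX/scaleY map model pixel coordinates onto the canvas; 1 assumes matching resolution
      drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
    }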
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,13 +1,22 @@
-import { useState, useRef, useEffect } from "react";
+import React, { useState, useRef, useEffect, useCallback } from "react";
 import { useVLMContext } from "../context/useVLMContext";
 import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
 
 const MODES = ["Webcam", "URL", "File"] as const;
 type Mode = typeof MODES[number];
 
-const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
+const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
 const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
 
+// Helper functions (remain the same)
+function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
+  if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
+    const label = arr[0];
+    return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
+  }
+  return [];
+}
+
 function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
   if (!raw) return [];
   let boxes = [];
@@ -22,7 +31,6 @@ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
     .map((obj: any) => {
       if (!obj || !obj.bbox_2d) return null;
       let bbox = obj.bbox_2d;
-      // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
       if (
         Array.isArray(bbox) &&
         bbox.length === 2 &&
@@ -33,7 +41,6 @@ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
       ) {
         bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
       }
-      // If bbox_2d is [x1, y1, x2, y2], use as-is
       if (
         Array.isArray(bbox) &&
         bbox.length === 4 &&
@@ -41,7 +48,6 @@ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
       ) {
         return { ...obj, bbox_2d: bbox };
       }
-      // Otherwise, skip
      return null;
    })
    .filter((obj: any) => obj);
@@ -54,329 +60,365 @@ function isVideoFile(file: File) {
   return file.type.startsWith("video/");
 }
 
-// Utility to get ImageData from a video or image element
-function getImageDataFromElement(media: HTMLVideoElement | HTMLImageElement): ImageData | null {
-  const canvas = document.createElement("canvas");
-  let width = 0, height = 0;
-  if (media instanceof HTMLVideoElement) {
-    width = media.videoWidth;
-    height = media.videoHeight;
-  } else if (media instanceof HTMLImageElement) {
-    width = media.naturalWidth;
-    height = media.naturalHeight;
-  } else {
-    return null;
-  }
-  canvas.width = width;
-  canvas.height = height;
-  const ctx = canvas.getContext("2d");
-  if (!ctx) return null;
-  ctx.drawImage(media, 0, 0, width, height);
-  return ctx.getImageData(0, 0, width, height);
-}
-
 export default function MultiSourceCaptioningView() {
   const [mode, setMode] = useState<Mode>("File");
-  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
-  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
+  const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
   const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
-  const [processing, setProcessing] = useState(false);
+  const [processingState, setProcessingState] = useState(false); // General processing indicator
   const [error, setError] = useState<string | null>(null);
-  const [webcamActive, setWebcamActive] = useState(false);
-  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
-  const [uploadedUrl, setUploadedUrl] = useState<string>("");
-  const [videoProcessing, setVideoProcessing] = useState(false);
-  const [imageProcessed, setImageProcessed] = useState(false);
-  const [exampleProcessing, setExampleProcessing] = useState(false);
-  const [urlProcessing, setUrlProcessing] = useState(false);
-  const [debugOutput, setDebugOutput] = useState<string>("");
-  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
-  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
+  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream
+  const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
-  const inferenceWorkerRef = useRef<Worker | null>(null);
-  const [useWorker] = useState(true);
+  const [debugOutput, setDebugOutput] = useState<string>("");
+
+  // Refs for the two video elements and the canvas
+  const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
+  const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
+  const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
+  const imageRef = useRef<HTMLImageElement>(null); // For image file processing
 
-  const videoRef = useRef<HTMLVideoElement | null>(null);
-  const canvasRef = useRef<HTMLCanvasElement | null>(null);
-  const imageRef = useRef<HTMLImageElement | null>(null);
-  const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
 
-  useEffect(() => {
-    if (useWorker) {
-      inferenceWorkerRef.current = new Worker(
-        new URL('../workers/inferenceWorker.ts', import.meta.url),
-        { type: 'module' }
-      );
+  // --- Drawing Loop for the Visible Display ---
+  // This loop runs constantly to draw the latest boxes on the display video
+  const drawDisplayCanvas = useCallback(() => {
+    const displayVideo = displayVideoRef.current;
+    const canvas = canvasRef.current;
+    const ctx = canvas?.getContext('2d');
+
+    if (!displayVideo || !canvas || !ctx) {
+      return;
    }
-    return () => {
-      inferenceWorkerRef.current?.terminate();
-      inferenceWorkerRef.current = null;
-    };
-  }, [useWorker]);
-
-  // Helper to run inference in worker
-  const runInferenceInWorker = (media: HTMLVideoElement | HTMLImageElement, prompt: string) => {
-    return new Promise((resolve, reject) => {
-      if (!inferenceWorkerRef.current) return reject('No worker');
-      const imageData = getImageDataFromElement(media);
-      if (!imageData) return reject('Could not get image data');
-      inferenceWorkerRef.current.onmessage = (event) => resolve(event.data);
-      inferenceWorkerRef.current.onerror = (err) => reject(err);
-      inferenceWorkerRef.current.postMessage({ imageData, prompt });
-    });
-  };
 
-  const processVideoFrame = async () => {
-    if (!videoRef.current || !canvasRef.current) return;
-    const video = videoRef.current;
-    const canvas = canvasRef.current;
-    if (video.paused || video.ended || video.videoWidth === 0) return;
-    canvas.width = video.videoWidth;
-    canvas.height = video.videoHeight;
-    const ctx = canvas.getContext("2d");
-    if (!ctx) return;
-    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
-    if (useWorker && inferenceWorkerRef.current) {
-      try {
-        const output = await runInferenceInWorker(video, prompt);
-        setDebugOutput(JSON.stringify(output, null, 2));
-        let boxes = normalizeBoxes(output);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / video.videoWidth;
-          const scaleY = canvas.height / video.videoHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-        }
-      } catch (err) {
-        setInferenceStatus("Worker inference failed, falling back to main thread.");
-        // fallback to main-thread inference
-        await runInference(video, prompt, (output: string) => {
-          setDebugOutput(output);
-          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-          if (Array.isArray(boxes) && boxes.length > 0) {
-            const scaleX = canvas.width / video.videoWidth;
-            const scaleY = canvas.height / video.videoHeight;
-            ctx.clearRect(0, 0, canvas.width, canvas.height);
-            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-          }
-        });
-      }
-    } else {
-      await runInference(video, prompt, (output: string) => {
-        setDebugOutput(output);
-        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / video.videoWidth;
-          const scaleY = canvas.height / video.videoHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-        }
-      });
+    // Adjust canvas size to match the display video's dimensions
+    if (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight) {
+      canvas.width = displayVideo.videoWidth;
+      canvas.height = displayVideo.videoHeight;
    }
-  };
 
-  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
-    const file = e.target.files?.[0] || null;
-    setUploadedFile(file);
-    setUploadedUrl(file ? URL.createObjectURL(file) : "");
-    setError(null);
-    setImageProcessed(false);
-    setVideoProcessing(false);
-    setExampleProcessing(false);
-  };
+    // Clear the canvas each frame
+    ctx.clearRect(0, 0, canvas.width, canvas.height);
 
-  // Webcam setup and teardown (unchanged)
-  useEffect(() => {
-    if (mode !== "Webcam") {
-      if (webcamStreamRef.current) {
-        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
-        webcamStreamRef.current = null;
-      }
-      setWebcamActive(false);
-      return;
+    // Draw the latest bounding boxes
+    const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
+    const scaleY = canvas.height / (displayVideo.videoHeight || 1);
+    drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+
+    // Only request next frame if video is playing to avoid unnecessary redraws when paused/ended
+    if (!displayVideo.paused && !displayVideo.ended) {
+      requestAnimationFrame(drawDisplayCanvas);
    }
-    const setupWebcam = async () => {
-      try {
-        setError(null);
-        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
-        webcamStreamRef.current = stream;
-        if (videoRef.current) {
-          videoRef.current.srcObject = stream;
-          setWebcamActive(true);
+  }, [latestBoxes]); // Re-create if latestBoxes changes
+
+  // Effect to start the display drawing loop when the display video is ready
+  useEffect(() => {
+    const displayVideo = displayVideoRef.current;
+    if (displayVideo) {
+      const handleVideoReady = () => {
+        // Start the requestAnimationFrame loop once the video has loaded metadata
+        if (displayVideo.readyState >= 1) { // HAVE_METADATA
+          requestAnimationFrame(drawDisplayCanvas);
        }
-      } catch (e) {
-        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
-        setWebcamActive(false);
-      }
-    };
-    setupWebcam();
-    return () => {
-      if (webcamStreamRef.current) {
-        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
-        webcamStreamRef.current = null;
+      };
+      displayVideo.addEventListener('loadedmetadata', handleVideoReady);
+      // Also check if video is already ready (e.g., on component re-mount)
+      if (displayVideo.readyState >= 1) {
+        requestAnimationFrame(drawDisplayCanvas);
      }
-      setWebcamActive(false);
-    };
-  }, [mode]);
+      return () => {
+        displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
+      };
+    }
+  }, [drawDisplayCanvas]);
 
-  // Webcam mode: process frames with setInterval
+  // --- FastVLM Processing Loop (from hidden video/image) ---
+  // This interval loop controls when FastVLM processes a frame
  useEffect(() => {
-    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
+    const vlmVideo = vlmVideoRef.current;
+    const isVideoMode = (mode === "Webcam" || (mode === "URL" && vlmVideo?.src) || (mode === "File" && vlmVideo?.src && isVideoFile(uploadedFile || null)));
+
+    if (!isLoaded || !vlmVideo || !isVideoMode) {
+      // If not in a video mode or VLM/video not ready, ensure processing stops
+      setProcessingState(false);
+      return;
+    }
+
    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
-    return () => {
-      if (interval) clearInterval(interval);
+
+    const startVLMProcessing = () => {
+      if (interval) clearInterval(interval); // Clear any old interval
+
+      interval = setInterval(async () => {
+        if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) {
+          return; // Skip if video not ready, paused, ended, or already processing
+        }
+
+        setProcessingState(true);
+        setInferenceStatus("Running inference...");
+        setError(null); // Clear previous errors
+
+        try {
+          // Create a temporary offscreen canvas to get image data from the VLM video
+          const tempCanvas = document.createElement('canvas');
+          tempCanvas.width = vlmVideo.videoWidth;
+          tempCanvas.height = vlmVideo.videoHeight;
+          const tempCtx = tempCanvas.getContext('2d', { willReadFrequently: true });
+
+          if (tempCtx && vlmVideo.readyState >= 2) { // HAVE_CURRENT_DATA
+            tempCtx.drawImage(vlmVideo, 0, 0, tempCanvas.width, tempCanvas.height);
+            const imageData = tempCtx.getImageData(0, 0, tempCanvas.width, tempCanvas.height);
+
+            const modelOutput = await runInference(imageData, prompt); // Pass ImageData
+            setDebugOutput(modelOutput); // Update raw model output
+
+            let boxes = extractJsonFromMarkdown(modelOutput) || [];
+            if (boxes.length === 0 && Array.isArray(modelOutput)) { // Fallback for direct array output
+              // This condition `Array.isArray(modelOutput)` is unlikely if modelOutput is string,
+              // so ensure `extractJsonFromMarkdown` is robust or `runInference` returns expected string
+            }
+            boxes = normalizeBoxes(boxes);
+
+            setLatestBoxes(boxes); // Update state, triggers display canvas redraw
+            setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
+          } else {
+            setInferenceStatus("Video not ready for processing.");
+          }
+        } catch (e) {
+          setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
+          setLatestBoxes([]);
+          setInferenceStatus("Inference failed.");
+        } finally {
+          setProcessingState(false); // Processing finished
+        }
+      }, 200); // Inference interval (e.g., 5 frames per second)
    };
-  }, [mode, isLoaded, prompt, runInference, webcamActive]);
 
-  // URL mode: process frames with setInterval
-  useEffect(() => {
-    if (mode !== "URL" || !isLoaded || !urlProcessing) return;
-    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
-    return () => {
+    const stopVLMProcessing = () => {
      if (interval) clearInterval(interval);
+      interval = null;
+      setProcessingState(false);
+      setInferenceStatus("Stopped processing.");
    };
-  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
 
-  // File video mode: process frames with setInterval
-  useEffect(() => {
-    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
-    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
+    // Start/stop processing based on video playback events
+    vlmVideo.addEventListener('play', startVLMProcessing);
+    vlmVideo.addEventListener('pause', stopVLMProcessing);
+    vlmVideo.addEventListener('ended', stopVLMProcessing);
+
+    // Initial check if video is already playing (e.g., after initial load/autoplay)
+    if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
+      startVLMProcessing();
+    }
+
+    // Cleanup function for useEffect
    return () => {
-      if (interval) clearInterval(interval);
+      stopVLMProcessing();
+      vlmVideo.removeEventListener('play', startVLMProcessing);
+      vlmVideo.removeEventListener('pause', stopVLMProcessing);
+      vlmVideo.removeEventListener('ended', stopVLMProcessing);
    };
-  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
+  }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Added uploadedFile for file mode re-trigger
 
-  // Example video mode: process frames with setInterval
+  // --- Media Source Handling ---
+
+  // Cleanup for media stream and object URLs
+  const cleanupMediaSource = useCallback(() => {
+    if (mediaStream) {
+      mediaStream.getTracks().forEach(track => track.stop());
+      setMediaStream(null);
+    }
+    // Revoke any created blob URLs (for file inputs)
+    if (displayVideoRef.current?.src.startsWith('blob:')) {
+      URL.revokeObjectURL(displayVideoRef.current.src);
+      displayVideoRef.current.src = "";
+    }
+    if (vlmVideoRef.current?.src.startsWith('blob:')) {
+      URL.revokeObjectURL(vlmVideoRef.current.src);
+      vlmVideoRef.current.src = "";
+    }
+    setLatestBoxes([]); // Clear boxes when source changes
+    setError(null);
+    setInferenceStatus("");
+    setDebugOutput("");
+  }, [mediaStream]);
+
+  // Handle changing the mode (Webcam, URL, File)
  useEffect(() => {
-    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
-    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
-    return () => {
-      if (interval) clearInterval(interval);
-    };
-  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
+    cleanupMediaSource(); // Clean up previous source
 
-  // File mode: process uploaded image (only on button click)
-  const handleProcessImage = async () => {
-    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
-    const img = imageRef.current;
-    const canvas = canvasRef.current;
-    canvas.width = img.naturalWidth;
-    canvas.height = img.naturalHeight;
-    setCanvasDims({w:canvas.width,h:canvas.height});
-    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
-    const ctx = canvas.getContext("2d");
-    if (!ctx) return;
-    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-    setProcessing(true);
+    const displayVideo = displayVideoRef.current;
+    const vlmVideo = vlmVideoRef.current;
+
+    if (!displayVideo || !vlmVideo) return;
+
+    // Reset srcObject/src to ensure fresh start
+    displayVideo.srcObject = null;
+    vlmVideo.srcObject = null;
+    displayVideo.src = "";
+    vlmVideo.src = "";
+
+    setLatestBoxes([]); // Clear boxes on mode change
    setError(null);
-    setInferenceStatus("Running inference...");
-    if (useWorker && inferenceWorkerRef.current) {
-      try {
-        const output = await runInferenceInWorker(img, prompt);
-        setDebugOutput(JSON.stringify(output, null, 2));
-        setInferenceStatus("Inference complete.");
-        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-        let boxes = normalizeBoxes(output);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / img.naturalWidth;
-          const scaleY = canvas.height / img.naturalHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+    setInferenceStatus("");
+    setDebugOutput("");
+
+    // Special handling for initial file mode to load example video
+    if (mode === "File" && !uploadedFile) {
+      displayVideo.src = EXAMPLE_VIDEO_URL;
+      vlmVideo.src = EXAMPLE_VIDEO_URL;
+      displayVideo.load(); vlmVideo.load(); // Load the video
+      displayVideo.play().catch(e => console.error("Error playing example display video:", e));
+      vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
+    }
+  }, [mode, uploadedFile, cleanupMediaSource]); // Added uploadedFile to ensure re-trigger for file mode
+
+  // Handle Webcam Input
+  const handleWebcamInput = useCallback(async () => {
+    cleanupMediaSource(); // Clean up any active stream
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+      setMediaStream(stream); // Store stream to manage it
+
+      if (displayVideoRef.current && vlmVideoRef.current) {
+        displayVideoRef.current.srcObject = stream;
+        vlmVideoRef.current.srcObject = stream;
+        // Programmatically play both videos
+        displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
+        vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
+      }
+      setMode("Webcam");
+    } catch (e) {
+      setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
+      setMediaStream(null);
+      setLatestBoxes([]);
+      setInferenceStatus("Webcam access denied or failed.");
+    }
+  }, [cleanupMediaSource]);
+
+  // Handle URL Input (when Load button is clicked)
+  const handleLoadUrl = useCallback(() => {
+    cleanupMediaSource(); // Clean up any active stream
+
+    const url = currentUrlInput;
+    if (!url) {
+      setError("Please enter a valid URL.");
+      return;
+    }
+
+    if (displayVideoRef.current && vlmVideoRef.current) {
+      displayVideoRef.current.src = url;
+      vlmVideoRef.current.src = url;
+      displayVideoRef.current.load(); vlmVideoRef.current.load(); // Load the video
+      displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
+      vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
+      setMode("URL");
+    }
+  }, [currentUrlInput, cleanupMediaSource]);
+
+  // Handle File Input
+  const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
+    cleanupMediaSource(); // Clean up any active stream
+
+    const file = e.target.files?.[0] || null;
+    if (file) {
+      const fileUrl = URL.createObjectURL(file); // Create blob URL for the file
+      // Store the file to differentiate image/video and manage its URL
+      setUploadedFile(file);
+
+      if (isImageFile(file)) {
+        // For images, we handle processing on a button click, not a continuous loop
+        // The imageRef will display the image
+        // The canvas will be used for processing and drawing
+        setError(null);
+        setMode("File");
+      } else if (isVideoFile(file)) {
+        if (displayVideoRef.current && vlmVideoRef.current) {
+          displayVideoRef.current.src = fileUrl;
+          vlmVideoRef.current.src = fileUrl;
+          displayVideoRef.current.load(); vlmVideoRef.current.load();
+          displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
+          vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
+          setMode("File");
        }
-        setImageProcessed(true);
-      } catch (err) {
-        setInferenceStatus("Worker inference failed, falling back to main thread.");
-        // fallback to main-thread inference
-        await runInference(img, prompt, (output: string) => {
-          setDebugOutput(output);
-          setInferenceStatus("Inference complete.");
-          ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-          if (Array.isArray(boxes) && boxes.length > 0) {
-            const scaleX = canvas.width / img.naturalWidth;
-            const scaleY = canvas.height / img.naturalHeight;
-            ctx.clearRect(0, 0, canvas.width, canvas.height);
-            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-          }
-          setImageProcessed(true);
-        });
+      } else {
+        setError("Unsupported file type. Please upload an image or video.");
+        setUploadedFile(null);
+        if (fileUrl) URL.revokeObjectURL(fileUrl); // Clean up invalid file URL
      }
    } else {
-      await runInference(img, prompt, (output: string) => {
-        setDebugOutput(output);
-        setInferenceStatus("Inference complete.");
-        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / img.naturalWidth;
-          const scaleY = canvas.height / img.naturalHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+      setUploadedFile(null); // Clear file if nothing selected
+      // If no file selected, revert to example video if in File mode
+      if (mode === "File") {
+        if (displayVideoRef.current && vlmVideoRef.current) {
+          displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
+          vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
+          displayVideoRef.current.load(); vlmVideoRef.current.load();
+          displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
+          vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
        }
-        setImageProcessed(true);
-      });
+      }
    }
-    setProcessing(false);
-  };
+  }, [cleanupMediaSource, mode]);
 
-  // File mode: process uploaded video frames (start/stop)
-  const handleToggleVideoProcessing = () => {
-    setVideoProcessing((prev) => !prev);
-  };
-
-  // Handle start/stop for example video processing
-  const handleToggleExampleProcessing = () => {
-    setExampleProcessing((prev) => !prev);
-  };
 
-  // Handle start/stop for URL video processing
-  const handleToggleUrlProcessing = () => {
-    setUrlProcessing((prev) => !prev);
-  };
+  // Handler for processing an uploaded image file (one-time inference)
+  const handleProcessImage = async () => {
+    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
+      setError("Image or model not ready for processing.");
+      return;
+    }
 
-  // Test draw box function
-  const handleTestDrawBox = () => {
-    if (!canvasRef.current) return;
+    const img = imageRef.current;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
-    ctx.clearRect(0, 0, canvas.width, canvas.height);
-    ctx.strokeStyle = "#FF00FF";
-    ctx.lineWidth = 4;
-    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
-    ctx.font = "20px Arial";
-    ctx.fillStyle = "#FF00FF";
-    ctx.fillText("Test Box", 50, 35);
+
+    // Ensure canvas dimensions match image for processing and display
+    canvas.width = img.naturalWidth;
+    canvas.height = img.naturalHeight;
+
+    setProcessingState(true);
+    setError(null);
+    setInferenceStatus("Running image inference...");
+
+    try {
+      // Draw image to canvas to get ImageData for inference
+      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+      const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
+
+      const modelOutput = await runInference(imageData, prompt);
+      setDebugOutput(modelOutput);
+      setInferenceStatus("Image inference complete.");
+
+      // Clear canvas and redraw image before drawing boxes
+      ctx.clearRect(0, 0, canvas.width, canvas.height);
+      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+
+      let boxes = extractJsonFromMarkdown(modelOutput) || [];
+      boxes = normalizeBoxes(boxes);
+      setLatestBoxes(boxes); // Update latestBoxes for display
+
+      if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
+    } catch (e) {
+      setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
+      setLatestBoxes([]);
+      setInferenceStatus("Image inference failed.");
+    } finally {
+      setProcessingState(false);
+    }
  };
 
+  // --- Rendered UI ---
  return (
-    <div className="absolute inset-0 text-white">
+    <div className="absolute inset-0 text-white flex flex-col">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
-      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
-      <div className="flex flex-col items-center justify-center h-full w-full">
+      <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div> {/* Adjusted top margin */}
+
+      <div className="flex flex-col items-center justify-center flex-1 w-full p-4"> {/* Added padding */}
        {/* Mode Selector */}
-        <div className="mb-6">
+        <div className="mb-6 mt-4"> {/* Increased margin-top for selector */}
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
@@ -385,6 +427,7 @@ export default function MultiSourceCaptioningView() {
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
+                disabled={!isLoaded && m !== "File"} // Disable if model not loaded, except for initial file view
              >
                {m}
              </button>
@@ -392,212 +435,130 @@ export default function MultiSourceCaptioningView() {
          </div>
        </div>
 
-        {/* Mode Content */}
-        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
-          {mode === "Webcam" && (
-            <div className="w-full text-center flex flex-col items-center">
-              <div className="mb-4 w-full max-w-xl">
-                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-                <textarea
-                  className="w-full p-2 rounded-lg text-black"
-                  rows={3}
-                  value={prompt}
-                  onChange={(e) => setPrompt(e.target.value)}
-                />
-              </div>
-              <div className="relative w-full max-w-xl">
-                <video
-                  ref={videoRef}
-                  autoPlay
-                  muted
-                  playsInline
-                  className="w-full rounded-lg shadow-lg mb-2"
-                  style={{ background: "#222" }}
-                />
-                <canvas
-                  ref={canvasRef}
-                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                  style={{ zIndex: 10, pointerEvents: "none" }}
-                />
-              </div>
-              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
-              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
-            </div>
-          )}
-          {mode === "URL" && (
-            <div className="w-full text-center flex flex-col items-center">
-              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
-              <div className="flex w-full max-w-xl mb-4">
-                <input
-                  type="text"
-                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
-                  value={inputUrl}
-                  onChange={(e) => setInputUrl(e.target.value)}
-                  placeholder="Paste video URL here"
-                />
-                <button
-                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
-                  onClick={() => setVideoUrl(inputUrl)}
-                >
-                  Load
-                </button>
-              </div>
-              <div className="mb-4 w-full max-w-xl">
-                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-                <textarea
-                  className="w-full p-2 rounded-lg text-black"
-                  rows={3}
-                  value={prompt}
-                  onChange={(e) => setPrompt(e.target.value)}
-                />
-              </div>
-              <div className="relative w-full max-w-xl">
-                <video
-                  ref={videoRef}
-                  src={videoUrl}
-                  controls
-                  autoPlay
-                  loop
-                  className="w-full rounded-lg shadow-lg mb-2"
-                  style={{ background: "#222" }}
-                />
-                <canvas
-                  ref={canvasRef}
-                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                  style={{ zIndex: 10, pointerEvents: "none" }}
-                />
-                <button
-                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
-                  onClick={handleToggleUrlProcessing}
-                >
-                  {urlProcessing ? "Stop Processing" : "Start Processing"}
-                </button>
-              </div>
-              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
-              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+        {/* Dynamic Content Area */}
+        <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
+          {/* Prompt Input (Common to all modes) */}
+          <div className="mb-4 w-full max-w-xl">
+            <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+            <textarea
+              className="w-full p-2 rounded-lg text-black"
+              rows={3}
+              value={prompt}
+              onChange={(e) => setPrompt(e.target.value)}
+              disabled={processingState}
+            />
+          </div>
+
+          {/* Video/Image Display and Canvas Overlay */}
+          <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
+            {/* Conditional rendering for image vs video display */}
+            {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
+              <img
+                ref={imageRef}
+                src={URL.createObjectURL(uploadedFile)} // Use object URL for display
+                alt="Uploaded"
+                className="max-w-full max-h-full block object-contain"
+                style={{ position: 'absolute' }}
+                onLoad={() => {
+                  // This is important to ensure canvas matches image size for single image processing
+                  if (imageRef.current && canvasRef.current) {
+                    canvasRef.current.width = imageRef.current.naturalWidth;
+                    canvasRef.current.height = imageRef.current.naturalHeight;
+                  }
+                }}
+              />
+            ) : (
+              <video
+                ref={displayVideoRef}
+                autoPlay
+                muted
+                playsInline
+                loop // Loop for URL and File videos
+                className="max-w-full max-h-full block object-contain"
+                style={{ position: 'absolute' }}
+              />
+            )}
+            <canvas
+              ref={canvasRef}
+              className="absolute top-0 left-0 w-full h-full pointer-events-none"
+              style={{ zIndex: 10 }}
+            />
+          </div>
+
+          {/* Controls specific to each mode */}
+          <div className="mt-4 flex flex-col items-center gap-2">
+            {mode === "Webcam" && (
              <button
-                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
-                onClick={handleTestDrawBox}
+                className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
+                onClick={handleWebcamInput} // This button sets up/starts webcam
+                disabled={processingState || !isLoaded}
              >
-                Test Draw Box
+                {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
              </button>
-              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
-                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
-                <div>Raw Model Output:</div>
-                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
-              </div>
-            </div>
-          )}
-          {mode === "File" && (
-            <div className="w-full text-center flex flex-col items-center">
-              <div className="mb-4 w-full max-w-xl">
-                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-                <textarea
-                  className="w-full p-2 rounded-lg text-black"
-                  rows={3}
-                  value={prompt}
-                  onChange={(e) => setPrompt(e.target.value)}
-                />
-              </div>
-              <div className="mb-4 w-full max-w-xl">
+            )}
+
+            {mode === "URL" && (
+              <>
+                <div className="flex w-full max-w-xl">
+                  <input
+                    type="text"
+                    className="flex-1 px-4 py-2 rounded-l-lg text-black"
+                    value={currentUrlInput}
+                    onChange={(e) => setCurrentUrlInput(e.target.value)}
+                    placeholder="Paste video URL here"
+                    disabled={processingState}
+                  />
+                  <button
+                    className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
+                    onClick={handleLoadUrl}
+                    disabled={processingState || !isLoaded}
+                  >
+                    Load URL
+                  </button>
+                </div>
+              </>
+            )}
+
+            {mode === "File" && (
+              <>
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
-                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
+                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
+                  disabled={processingState}
                />
-              </div>
-              {/* Show uploaded image */}
-              {uploadedFile && isImageFile(uploadedFile) && (
-                <div className="relative w-full max-w-xl">
-                  <img
-                    ref={imageRef}
-                    src={uploadedUrl}
-                    alt="Uploaded"
-                    className="w-full rounded-lg shadow-lg mb-2"
-                    style={{ background: "#222" }}
-                  />
-                  <canvas
-                    ref={canvasRef}
-                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                    style={{ zIndex: 10, pointerEvents: "none" }}
-                  />
+                {uploadedFile && isImageFile(uploadedFile) && (
                  <button
-                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
+                    className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                    onClick={handleProcessImage}
-                    disabled={processing}
+                    disabled={processingState || !isLoaded}
                  >
-                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
+                    {processingState ? "Processing Image..." : "Process Image"}
                  </button>
-                </div>
-              )}
-              {/* Show uploaded video */}
-              {uploadedFile && isVideoFile(uploadedFile) && (
-                <div className="relative w-full max-w-xl">
-                  <video
-                    ref={videoRef}
-                    src={uploadedUrl}
-                    controls
-                    autoPlay
-                    loop
-                    className="w-full rounded-lg shadow-lg mb-2"
-                    style={{ background: "#222" }}
-                  />
-                  <canvas
-                    ref={canvasRef}
-                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                    style={{ zIndex: 10, pointerEvents: "none" }}
-                  />
-                  <button
-                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
-                    onClick={handleToggleVideoProcessing}
-                  >
-                    {videoProcessing ? "Stop Processing" : "Start Processing"}
-                  </button>
-                </div>
-              )}
-              {/* Show example video if no file uploaded */}
-              {!uploadedFile && (
-                <div className="relative w-full max-w-xl">
-                  <video
-                    ref={videoRef}
-                    src={EXAMPLE_VIDEO_URL}
-                    controls
-                    autoPlay
-                    loop
-                    className="w-full rounded-lg shadow-lg mb-2"
-                    style={{ background: "#222" }}
-                  />
-                  <canvas
-                    ref={canvasRef}
-                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                    style={{ zIndex: 10, pointerEvents: "none" }}
-                  />
-                  <button
-                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
-                    onClick={handleToggleExampleProcessing}
-                  >
-                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
-                  </button>
-                </div>
-              )}
-              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
-              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
-              <button
-                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
-                onClick={handleTestDrawBox}
-              >
-                Test Draw Box
-              </button>
-              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
-                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
-                <div>Raw Model Output:</div>
-                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
-              </div>
-            </div>
-          )}
+                )}
+              </>
+            )}
+          </div>
+
+          {/* Error and Debug Output */}
+          {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
+          <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
+            <div>Raw Model Output:</div>
+            <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
+          </div>
        </div>
      </div>
+
+      {/* Hidden Video for VLM processing - this must be rendered always */}
+      <video
+        ref={vlmVideoRef}
+        autoPlay
+        muted
+        playsInline
+        loop // Loop for URL and File videos
+        style={{ display: 'none' }} // Hidden from view
+      />
    </div>
  );
-}
+}
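
The rewrite's core pattern is a hidden video element sampled on a setInterval for inference, feeding a requestAnimationFrame loop that repaints the visible overlay with the latest boxes. A stripped-down sketch of that pattern, assuming runInference accepts ImageData and resolves to a string as in the diff (the helper names and wiring here are illustrative, not the component's actual code):

    import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./components/BoxAnnotator";

    declare function runInference(frame: ImageData, prompt: string): Promise<string>; // stand-in for the context function

    let latestBoxes: { label: string; bbox_2d: number[] }[] = [];
    let busy = false; // drop frames while an inference is in flight

    function startLoops(vlmVideo: HTMLVideoElement, overlay: HTMLCanvasElement, prompt: string) {
      const grab = document.createElement("canvas"); // offscreen canvas for frame capture

      // Inference loop: sample the hidden video about 5x/second, matching the diff's 200 ms interval.
      setInterval(async () => {
        if (busy || vlmVideo.paused || vlmVideo.videoWidth === 0) return;
        busy = true;
        try {
          grab.width = vlmVideo.videoWidth;
          grab.height = vlmVideo.videoHeight;
          const g = grab.getContext("2d", { willReadFrequently: true })!;
          g.drawImage(vlmVideo, 0, 0);
          const out = await runInference(g.getImageData(0, 0, grab.width, grab.height), prompt);
          latestBoxes = extractJsonFromMarkdown(out) ?? [];
        } finally {
          busy = false;
        }
      }, 200);

      // Draw loop: repaint the overlay every animation frame with whatever boxes are newest.
      const ctx = overlay.getContext("2d")!;
      (function draw() {
        ctx.clearRect(0, 0, overlay.width, overlay.height);
        drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, scaleX: 1, scaleY: 1 });
        requestAnimationFrame(draw);
      })();
    }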
src/index.js CHANGED
@@ -1,17 +1,17 @@
-import React from 'react';
-import ReactDOM from 'react-dom/client';
-import './index.css';
-import App from './App';
-import reportWebVitals from './reportWebVitals';
-
-const root = ReactDOM.createRoot(document.getElementById('root'));
-root.render(
-  <React.StrictMode>
-    <App />
-  </React.StrictMode>
-);
-
-// If you want to start measuring performance in your app, pass a function
-// to log results (for example: reportWebVitals(console.log))
-// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
-reportWebVitals();
+import React from 'react';
+import ReactDOM from 'react-dom/client';
+import './index.css';
+import App from './App';
+import reportWebVitals from './reportWebVitals';
+
+const root = ReactDOM.createRoot(document.getElementById('root'));
+root.render(
+  <React.StrictMode>
+    <App />
+  </React.StrictMode>
+);
+
+// If you want to start measuring performance in your app, pass a function
+// to log results (for example: reportWebVitals(console.log))
+// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
+reportWebVitals();
src/reportWebVitals.js CHANGED
@@ -1,13 +1,13 @@
1
- const reportWebVitals = onPerfEntry => {
2
- if (onPerfEntry && onPerfEntry instanceof Function) {
3
- import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4
- getCLS(onPerfEntry);
5
- getFID(onPerfEntry);
6
- getFCP(onPerfEntry);
7
- getLCP(onPerfEntry);
8
- getTTFB(onPerfEntry);
9
- });
10
- }
11
- };
12
-
13
- export default reportWebVitals;
 
1
+ const reportWebVitals = onPerfEntry => {
2
+ if (onPerfEntry && onPerfEntry instanceof Function) {
3
+ import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4
+ getCLS(onPerfEntry);
5
+ getFID(onPerfEntry);
6
+ getFCP(onPerfEntry);
7
+ getLCP(onPerfEntry);
8
+ getTTFB(onPerfEntry);
9
+ });
10
+ }
11
+ };
12
+
13
+ export default reportWebVitals;
src/setupTests.js CHANGED
@@ -1,5 +1,5 @@
1
- // jest-dom adds custom jest matchers for asserting on DOM nodes.
2
- // allows you to do things like:
3
- // expect(element).toHaveTextContent(/react/i)
4
- // learn more: https://github.com/testing-library/jest-dom
5
- import '@testing-library/jest-dom';
 
1
+ // jest-dom adds custom jest matchers for asserting on DOM nodes.
2
+ // allows you to do things like:
3
+ // expect(element).toHaveTextContent(/react/i)
4
+ // learn more: https://github.com/testing-library/jest-dom
5
+ import '@testing-library/jest-dom';