Quazim0t0 committed on
Commit
b7c497f
·
verified ·
1 Parent(s): 67f9568

Upload 51 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,717 +1,721 @@
1
- import * as React from "react";
2
- import { useState, useRef, useEffect } from "react";
3
- import { useVLMContext } from "../context/useVLMContext";
4
- import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
5
-
6
- const MODES = ["File"] as const;
7
- type Mode = typeof MODES[number];
8
-
9
- const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
10
- const EXAMPLE_PROMPT = "Describe the video";
11
-
12
- function isImageFile(file: File) {
13
- return file.type.startsWith("image/");
14
- }
15
- function isVideoFile(file: File) {
16
- return file.type.startsWith("video/");
17
- }
18
-
19
- function denormalizeBox(box: number[], width: number, height: number) {
20
- // If all values are between 0 and 1, treat as normalized
21
- if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
22
- return [
23
- box[0] * width,
24
- box[1] * height,
25
- box[2] * width,
26
- box[3] * height
27
- ];
28
- }
29
- return box;
30
- }
31
-
32
- // Add this robust fallback parser near the top
33
- function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
34
- // Try to parse as JSON first
35
- try {
36
- const parsed = JSON.parse(output);
37
- if (Array.isArray(parsed)) {
38
- const result: { label: string, bbox_2d: number[] }[] = [];
39
- for (const obj of parsed) {
40
- if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
41
- if (Array.isArray(obj.bbox_2d[0])) {
42
- for (const arr of obj.bbox_2d) {
43
- if (Array.isArray(arr) && arr.length === 4) {
44
- result.push({ label: obj.label, bbox_2d: arr });
45
- }
46
- }
47
- } else if (obj.bbox_2d.length === 4) {
48
- result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
49
- }
50
- }
51
- }
52
- if (result.length > 0) return result;
53
- }
54
- } catch (e) {}
55
- // Fallback: extract all [x1, y1, x2, y2] arrays from the string
56
- const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
57
- const boxes: { label: string, bbox_2d: number[] }[] = [];
58
- let match;
59
- while ((match = boxRegex.exec(output)) !== null) {
60
- const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
61
- boxes.push({ label: '', bbox_2d: arr });
62
- }
63
- return boxes;
64
- }
65
-
66
- // NOTE: You must install onnxruntime-web:
67
- // npm install onnxruntime-web
68
- // @ts-ignore
69
- import * as ort from 'onnxruntime-web';
70
- // If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
71
-
72
- // Set your YOLOv8 ONNX model URL here:
73
- const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE
74
-
75
- // Add these constants to match the YOLOv8 input size
76
- const YOLOV8_INPUT_WIDTH = 640;
77
- const YOLOV8_INPUT_HEIGHT = 480;
78
-
79
- // 1. Load the ONNX model once
80
- let yoloSession: ort.InferenceSession | null = null;
81
- // Add a busy flag to prevent concurrent YOLOv8 inferences
82
- let isYoloBusy = false;
83
- async function loadYoloModel() {
84
- if (!yoloSession) {
85
- yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
86
- }
87
- return yoloSession;
88
- }
89
-
90
- // COCO class names for YOLOv8
91
- const YOLO_CLASSES: string[] = [
92
- "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
93
- "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
94
- "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
95
- "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
96
- "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
97
- "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
98
- "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
99
- "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
100
- ];
101
-
102
- // Preprocess video frame to YOLOv8 input tensor [1,3,640,640]
103
- function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
104
- const width = 640;
105
- const height = 480;
106
- const canvas = document.createElement('canvas');
107
- canvas.width = width;
108
- canvas.height = height;
109
- const ctx = canvas.getContext('2d');
110
- if (!ctx) throw new Error('Could not get 2D context');
111
- ctx.drawImage(video, 0, 0, width, height);
112
- const imageData = ctx.getImageData(0, 0, width, height);
113
- const { data } = imageData;
114
- // Convert to Float32Array [1,3,480,640], normalize to [0,1]
115
- const floatData = new Float32Array(1 * 3 * height * width);
116
- for (let i = 0; i < width * height; i++) {
117
- floatData[i] = data[i * 4] / 255; // R
118
- floatData[i + width * height] = data[i * 4 + 1] / 255; // G
119
- floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
120
- }
121
- return new ort.Tensor('float32', floatData, [1, 3, height, width]);
122
- }
123
-
124
- // Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
125
- function postprocessYoloOutput(output: ort.Tensor) {
126
- // output.dims: [1, num_detections, 6]
127
- const data = output.data;
128
- const numDetections = output.dims[1];
129
- const results = [];
130
- for (let i = 0; i < numDetections; i++) {
131
- const offset = i * 6;
132
- const x1 = data[offset];
133
- const y1 = data[offset + 1];
134
- const x2 = data[offset + 2];
135
- const y2 = data[offset + 3];
136
- const score = data[offset + 4];
137
- const classId = data[offset + 5];
138
- if (score < 0.2) continue; // adjust threshold as needed
139
- results.push({
140
- bbox: [x1, y1, x2, y2],
141
- label: YOLO_CLASSES[classId] || `class_${classId}`,
142
- score
143
- });
144
- }
145
- return results;
146
- }
147
-
148
- // Helper type guard for annotation
149
- function hasAnnotation(obj: any): obj is { annotation: string } {
150
- return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
151
- }
152
-
153
- export default function MultiSourceCaptioningView() {
154
- const [mode, setMode] = useState<Mode>("File");
155
- const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
156
- const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
157
- const [processing, setProcessing] = useState(false);
158
- const [error, setError] = useState<string | null>(null);
159
- const [uploadedFile, setUploadedFile] = useState<File | null>(null);
160
- const [uploadedUrl, setUploadedUrl] = useState<string>("");
161
- const [videoProcessing, setVideoProcessing] = useState(false);
162
- const [imageProcessed, setImageProcessed] = useState(false);
163
- const [exampleProcessing, setExampleProcessing] = useState(false);
164
- const [debugOutput, setDebugOutput] = useState<string>("");
165
- const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
166
- const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
167
- const [inferenceStatus, setInferenceStatus] = useState<string>("");
168
- const [showProcessingVideo, setShowProcessingVideo] = useState(false);
169
-
170
- const videoRef = useRef<HTMLVideoElement | null>(null);
171
- const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
172
- const processingVideoRef = useRef<HTMLVideoElement | null>(null);
173
- const canvasRef = useRef<HTMLCanvasElement | null>(null);
174
- const imageRef = useRef<HTMLImageElement | null>(null);
175
- const boxHistoryRef = useRef<any[]>([]);
176
- // Add a ref to store the latest YOLOv8 results (with optional FastVLM annotation)
177
- const lastYoloBoxesRef = React.useRef<any[]>([]);
178
- const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
179
-
180
- // Remove videoProcessingRef and exampleProcessingRef
181
- // Add a single processingLoopRef
182
- const processingLoopRef = React.useRef(false);
183
-
184
- const processVideoLoop = async () => {
185
- if (!processingLoopRef.current) return;
186
- if (isYoloBusy) {
187
- // Optionally log: "Inference already running, skipping frame"
188
- requestAnimationFrame(processVideoLoop);
189
- return;
190
- }
191
- await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
192
- // Schedule the next frame as soon as possible
193
- requestAnimationFrame(processVideoLoop);
194
- };
195
- const processExampleLoop = async () => {
196
- while (processingLoopRef.current) {
197
- await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
198
- await new Promise(res => setTimeout(res, 1000));
199
- }
200
- };
201
-
202
- // Set your YOLOv8 ONNX backend API endpoint here:
203
- // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE
204
-
205
- // Add this useEffect for overlay video synchronization
206
- useEffect(() => {
207
- const main = videoRef.current;
208
- const overlay = overlayVideoRef.current;
209
- if (!main || !overlay) return;
210
- // Sync play/pause
211
- const onPlay = () => { if (overlay.paused) overlay.play(); };
212
- const onPause = () => { if (!overlay.paused) overlay.pause(); };
213
- // Sync seeking and time
214
- const onSeekOrTime = () => {
215
- if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
216
- overlay.currentTime = main.currentTime;
217
- }
218
- };
219
- main.addEventListener('play', onPlay);
220
- main.addEventListener('pause', onPause);
221
- main.addEventListener('seeked', onSeekOrTime);
222
- main.addEventListener('timeupdate', onSeekOrTime);
223
- // Clean up
224
- return () => {
225
- main.removeEventListener('play', onPlay);
226
- main.removeEventListener('pause', onPause);
227
- main.removeEventListener('seeked', onSeekOrTime);
228
- main.removeEventListener('timeupdate', onSeekOrTime);
229
- };
230
- }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);
231
-
232
- useEffect(() => {
233
- if ((mode === "File") && processingVideoRef.current) {
234
- processingVideoRef.current.play().catch(() => {});
235
- }
236
- }, [mode, videoUrl, uploadedUrl]);
237
-
238
- // Remove old prompt-based box extraction logic and only use the above for video frames.
239
-
240
- const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
241
- const file = e.target.files?.[0] || null;
242
- setUploadedFile(file);
243
- setUploadedUrl(file ? URL.createObjectURL(file) : "");
244
- setError(null);
245
- setImageProcessed(false);
246
- setVideoProcessing(false);
247
- setExampleProcessing(false);
248
- };
249
-
250
- // Webcam mode: process frames with setInterval
251
- useEffect(() => {
252
- if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
253
- processVideoLoop();
254
- }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
255
-
256
- // Example video mode: process frames with setInterval
257
- useEffect(() => {
258
- if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
259
- processExampleLoop();
260
- }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
261
-
262
- // File mode: process uploaded image (only on button click)
263
- const handleProcessImage = async () => {
264
- if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
265
- const img = imageRef.current;
266
- const canvas = canvasRef.current;
267
- canvas.width = img.naturalWidth;
268
- canvas.height = img.naturalHeight;
269
- setCanvasDims({w:canvas.width,h:canvas.height});
270
- setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
271
- const ctx = canvas.getContext("2d");
272
- if (!ctx) return;
273
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
274
- setProcessing(true);
275
- setError(null);
276
- setInferenceStatus("Running inference...");
277
- await runInference(img, prompt, (output: string) => {
278
- setDebugOutput(output);
279
- setInferenceStatus("Inference complete.");
280
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
281
- let boxes = extractAllBoundingBoxes(output);
282
- console.log("Model output:", output);
283
- console.log("Boxes after normalization:", boxes);
284
- console.log("Canvas size:", canvas.width, canvas.height);
285
- if (boxes.length > 0) {
286
- const [x1, y1, x2, y2] = boxes[0].bbox_2d;
287
- console.log("First box coords:", x1, y1, x2, y2);
288
- }
289
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
290
- if (Array.isArray(boxes) && boxes.length > 0) {
291
- const scaleX = canvas.width / img.naturalWidth;
292
- const scaleY = canvas.height / img.naturalHeight;
293
- drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
294
- }
295
- setImageProcessed(true);
296
- });
297
- setProcessing(false);
298
- };
299
-
300
- // File mode: process uploaded video frames (start/stop)
301
- const handleToggleVideoProcessing = () => {
302
- setVideoProcessing((prev: boolean) => {
303
- const next = !prev;
304
- // Always stop all loops before starting
305
- processingLoopRef.current = false;
306
- setTimeout(() => {
307
- if (next) {
308
- processingLoopRef.current = true;
309
- processVideoLoop();
310
- }
311
- }, 50);
312
- return next;
313
- });
314
- };
315
-
316
- // Handle start/stop for example video processing
317
- const handleToggleExampleProcessing = () => {
318
- setExampleProcessing((prev: boolean) => {
319
- const next = !prev;
320
- // Always stop all loops before starting
321
- processingLoopRef.current = false;
322
- setTimeout(() => {
323
- if (next) {
324
- processingLoopRef.current = true;
325
- processVideoLoop();
326
- }
327
- }, 50);
328
- return next;
329
- });
330
- };
331
-
332
- // Test draw box function
333
- const handleTestDrawBox = () => {
334
- if (!canvasRef.current) return;
335
- const canvas = canvasRef.current;
336
- const ctx = canvas.getContext("2d");
337
- if (!ctx) return;
338
- ctx.clearRect(0, 0, canvas.width, canvas.height);
339
- ctx.strokeStyle = "#FF00FF";
340
- ctx.lineWidth = 4;
341
- ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
342
- ctx.font = "20px Arial";
343
- ctx.fillStyle = "#FF00FF";
344
- ctx.fillText("Test Box", 50, 35);
345
- };
346
-
347
- useEffect(() => {
348
- const draw = () => {
349
- const overlayVideo = overlayVideoRef.current;
350
- const canvas = canvasRef.current;
351
- if (!overlayVideo || !canvas) return;
352
- const displayWidth = overlayVideo.clientWidth;
353
- const displayHeight = overlayVideo.clientHeight;
354
- canvas.width = displayWidth;
355
- canvas.height = displayHeight;
356
- const ctx = canvas.getContext("2d");
357
- if (!ctx) return;
358
- ctx.clearRect(0, 0, canvas.width, canvas.height);
359
- const now = Date.now();
360
- const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
361
- if (boxHistory.length > 0) {
362
- // Fix: Draw all boxes, even if bbox_2d is an array of arrays
363
- const denormalizedBoxes: any[] = [];
364
- for (const b of boxHistory) {
365
- if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
366
- // Multiple boxes per label
367
- for (const arr of b.bbox_2d) {
368
- if (Array.isArray(arr) && arr.length === 4) {
369
- denormalizedBoxes.push({
370
- ...b,
371
- bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
372
- });
373
- }
374
- }
375
- } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
376
- // Single box
377
- denormalizedBoxes.push({
378
- ...b,
379
- bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
380
- });
381
- }
382
- }
383
- drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
384
- }
385
- };
386
- draw();
387
- const interval = setInterval(draw, 100);
388
- // Redraw on window resize
389
- const handleResize = () => draw();
390
- window.addEventListener('resize', handleResize);
391
- return () => {
392
- clearInterval(interval);
393
- window.removeEventListener('resize', handleResize);
394
- };
395
- }, [overlayVideoRef, canvasRef]);
396
-
397
- // Drawing loop: draws the latest YOLOv8 boxes every frame
398
- React.useEffect(() => {
399
- let running = true;
400
- function drawLoop() {
401
- if (!running) return;
402
- const overlayVideo = overlayVideoRef.current;
403
- const canvas = canvasRef.current;
404
- const processingVideo = processingVideoRef.current;
405
- if (canvas && overlayVideo && processingVideo) {
406
- // Set canvas size to match the visible video
407
- canvas.width = overlayVideo.clientWidth;
408
- canvas.height = overlayVideo.clientHeight;
409
- const ctx = canvas.getContext('2d');
410
- if (ctx) {
411
- ctx.clearRect(0, 0, canvas.width, canvas.height);
412
- // Draw all YOLOv8 boxes from last detection
413
- const yoloBoxes = lastYoloBoxesRef.current;
414
- yoloBoxes.forEach((obj: any) => {
415
- // Scale from YOLOv8 input size to canvas size
416
- const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
417
- const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
418
- const [x1, y1, x2, y2] = obj.bbox;
419
- const drawX = x1 * scaleX;
420
- const drawY = y1 * scaleY;
421
- const drawW = (x2 - x1) * scaleX;
422
- const drawH = (y2 - y1) * scaleY;
423
- ctx.strokeStyle = '#00FFFF';
424
- ctx.lineWidth = 5;
425
- ctx.strokeRect(drawX, drawY, drawW, drawH);
426
- ctx.font = 'bold 22px Arial';
427
- // Draw YOLOv8 label and confidence
428
- const yoloLabel = obj.label || '';
429
- const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
430
- const yoloText = `${yoloLabel}${yoloScore}`;
431
- ctx.fillStyle = 'rgba(0,0,0,0.7)';
432
- const yoloTextWidth = ctx.measureText(yoloText).width + 8;
433
- ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
434
- ctx.fillStyle = '#00FFFF';
435
- ctx.fillText(yoloText, drawX, drawY - 4);
436
- // Draw FastVLM annotation below the box if available
437
- if (hasAnnotation(obj)) {
438
- ctx.font = 'bold 18px Arial';
439
- ctx.fillStyle = 'rgba(0,0,0,0.7)';
440
- const annTextWidth = ctx.measureText(obj.annotation).width + 8;
441
- ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
442
- ctx.fillStyle = '#00FFFF';
443
- ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
444
- }
445
- });
446
- }
447
- }
448
- requestAnimationFrame(drawLoop);
449
- }
450
- drawLoop();
451
- return () => { running = false; };
452
- }, [overlayVideoRef, canvasRef, processingVideoRef]);
453
-
454
- // YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
455
- const yoloDetectionLoop = async () => {
456
- if (!processingLoopRef.current) return;
457
- if (isYoloBusy) {
458
- requestAnimationFrame(yoloDetectionLoop);
459
- return;
460
- }
461
- isYoloBusy = true;
462
- try {
463
- const processingVideo = processingVideoRef.current;
464
- if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
465
- isYoloBusy = false;
466
- requestAnimationFrame(yoloDetectionLoop);
467
- return;
468
- }
469
- // Run YOLOv8 detection
470
- const session = await loadYoloModel();
471
- const inputTensor = preprocessFrameToTensor(processingVideo);
472
- const feeds: Record<string, ort.Tensor> = {};
473
- feeds[session.inputNames[0]] = inputTensor;
474
- const results = await session.run(feeds);
475
- const output = results[session.outputNames[0]];
476
- const detections = postprocessYoloOutput(output);
477
- lastYoloBoxesRef.current = detections;
478
- // Run FastVLM on the full frame (wait for YOLOv8 to finish)
479
- await runInference(processingVideo, prompt, (output: string) => {
480
- setDebugOutput(output);
481
- });
482
- } catch (err) {
483
- console.error('YOLOv8+FastVLM error:', err);
484
- } finally {
485
- isYoloBusy = false;
486
- requestAnimationFrame(yoloDetectionLoop);
487
- }
488
- };
489
-
490
- // Add this effect after the processing loop and toggle handlers
491
- useEffect(() => {
492
- // Stop processing loop on video source change or processing toggle
493
- processingLoopRef.current = false;
494
- // Start processing loop for the correct video after refs update
495
- setTimeout(() => {
496
- if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
497
- processingLoopRef.current = true;
498
- yoloDetectionLoop();
499
- } else if (exampleProcessing && !uploadedFile) {
500
- processingLoopRef.current = true;
501
- yoloDetectionLoop();
502
- }
503
- }, 100);
504
- // eslint-disable-next-line
505
- }, [uploadedFile, videoProcessing, exampleProcessing]);
506
-
507
- return (
508
- <div className="absolute inset-0 text-white">
509
- <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
510
- {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
511
- </div>
512
- <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
513
- <div className="flex flex-col items-center justify-center h-full w-full">
514
- {/* Mode Selector */}
515
- <div className="mb-6">
516
- <div className="flex space-x-4">
517
- {MODES.map((m) => (
518
- <button
519
- key={m}
520
- className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
521
- mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
522
- }`}
523
- onClick={() => setMode(m)}
524
- >
525
- {m}
526
- </button>
527
- ))}
528
- </div>
529
- </div>
530
-
531
- {/* Mode Content */}
532
- <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
533
- {mode === "File" && (
534
- <div className="w-full text-center flex flex-col items-center">
535
- <div className="mb-4 w-full max-w-xl">
536
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
537
- <textarea
538
- className="w-full p-2 rounded-lg text-black"
539
- rows={3}
540
- value={prompt}
541
- onChange={(e) => setPrompt(e.target.value)}
542
- />
543
- </div>
544
- <div className="mb-4 w-full max-w-xl">
545
- <input
546
- type="file"
547
- accept="image/*,video/*"
548
- onChange={handleFileChange}
549
- className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
550
- />
551
- </div>
552
- {/* Add toggle button above video area */}
553
- <div className="mb-2 w-full max-w-xl flex justify-end">
554
- <button
555
- className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
556
- onClick={() => setShowProcessingVideo(v => !v)}
557
- type="button"
558
- >
559
- {showProcessingVideo ? 'Hide' : 'Show'} Processed Video
560
- </button>
561
- </div>
562
- {/* Show uploaded image */}
563
- {uploadedFile && isImageFile(uploadedFile) && (
564
- <div className="relative w-full max-w-xl">
565
- <img
566
- ref={imageRef}
567
- src={uploadedUrl}
568
- alt="Uploaded"
569
- className="w-full rounded-lg shadow-lg mb-2"
570
- style={{ background: "#222" }}
571
- />
572
- <canvas
573
- ref={canvasRef}
574
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
575
- style={{ zIndex: 10, pointerEvents: "none" }}
576
- />
577
- <button
578
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
579
- onClick={handleProcessImage}
580
- disabled={processing}
581
- >
582
- {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
583
- </button>
584
- </div>
585
- )}
586
- {/* Show uploaded video */}
587
- {uploadedFile && isVideoFile(uploadedFile) && (
588
- <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
589
- {/* Visible overlay video for user */}
590
- <video
591
- ref={overlayVideoRef}
592
- src={uploadedUrl}
593
- controls
594
- autoPlay
595
- loop
596
- muted
597
- playsInline
598
- className="w-full rounded-lg shadow-lg mb-2"
599
- style={{ background: "#222", display: "block" }}
600
- onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
601
- if (canvasRef.current) {
602
- canvasRef.current.width = e.currentTarget.clientWidth;
603
- canvasRef.current.height = e.currentTarget.clientHeight;
604
- }
605
- }}
606
- onResize={() => {
607
- if (canvasRef.current && overlayVideoRef.current) {
608
- canvasRef.current.width = overlayVideoRef.current.clientWidth;
609
- canvasRef.current.height = overlayVideoRef.current.clientHeight;
610
- }
611
- }}
612
- />
613
- {/* Canvas overlay */}
614
- <canvas
615
- ref={canvasRef}
616
- style={{
617
- position: "absolute",
618
- top: 0,
619
- left: 0,
620
- width: "100%",
621
- height: "100%",
622
- zIndex: 100,
623
- pointerEvents: "none",
624
- display: "block"
625
- }}
626
- width={overlayVideoRef.current?.clientWidth || 640}
627
- height={overlayVideoRef.current?.clientHeight || 480}
628
- />
629
- {/* Hidden or visible processing video for FastVLM/canvas */}
630
- <video
631
- ref={processingVideoRef}
632
- src={uploadedUrl}
633
- autoPlay
634
- loop
635
- muted
636
- playsInline
637
- style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
638
- onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
639
- />
640
- <button
641
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
642
- onClick={handleToggleVideoProcessing}
643
- >
644
- {videoProcessing ? "Stop Processing" : "Start Processing"}
645
- </button>
646
- </div>
647
- )}
648
- {/* Show example video if no file uploaded */}
649
- {!uploadedFile && (
650
- <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
651
- {/* Visible overlay video for user */}
652
- <video
653
- ref={overlayVideoRef}
654
- src={EXAMPLE_VIDEO_URL}
655
- controls
656
- autoPlay
657
- loop
658
- muted
659
- playsInline
660
- className="w-full rounded-lg shadow-lg mb-2"
661
- style={{ background: "#222", display: "block" }}
662
- />
663
- {/* Canvas overlay */}
664
- <canvas
665
- ref={canvasRef}
666
- style={{
667
- position: "absolute",
668
- top: 0,
669
- left: 0,
670
- width: "100%",
671
- height: "100%",
672
- zIndex: 100,
673
- pointerEvents: "none",
674
- display: "block"
675
- }}
676
- width={overlayVideoRef.current?.clientWidth || 640}
677
- height={overlayVideoRef.current?.clientHeight || 480}
678
- />
679
- {/* Hidden or visible processing video for FastVLM/canvas */}
680
- <video
681
- ref={processingVideoRef}
682
- src={EXAMPLE_VIDEO_URL}
683
- autoPlay
684
- loop
685
- muted
686
- playsInline
687
- style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
688
- onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
689
- />
690
- <button
691
- className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
692
- onClick={handleToggleExampleProcessing}
693
- >
694
- {exampleProcessing ? "Stop Processing" : "Start Processing"}
695
- </button>
696
- </div>
697
- )}
698
- {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
699
- {error && <div className="text-red-400 mt-2">Error: {error}</div>}
700
- <button
701
- className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
702
- onClick={handleTestDrawBox}
703
- >
704
- Test Draw Box
705
- </button>
706
- <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
707
- <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
708
- <div>Raw Model Output:</div>
709
- <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
710
- </div>
711
- </div>
712
- )}
713
- </div>
714
- </div>
715
- </div>
716
- );
 
 
 
 
717
  }
 
1
+ import * as React from "react";
2
+ import { useState, useRef, useEffect } from "react";
3
+ import { useVLMContext } from "../context/useVLMContext";
4
+ import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
5
+
6
+ const MODES = ["File"] as const;
7
+ type Mode = typeof MODES[number];
8
+
9
+ const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
10
+ const EXAMPLE_PROMPT = "Describe the video";
11
+
12
+ function isImageFile(file: File) {
13
+ return file.type.startsWith("image/");
14
+ }
15
+ function isVideoFile(file: File) {
16
+ return file.type.startsWith("video/");
17
+ }
18
+
19
+ function denormalizeBox(box: number[], width: number, height: number) {
20
+ // If all values are between 0 and 1, treat as normalized
21
+ if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
22
+ return [
23
+ box[0] * width,
24
+ box[1] * height,
25
+ box[2] * width,
26
+ box[3] * height
27
+ ];
28
+ }
29
+ return box;
30
+ }
31
+
32
+ // Add this robust fallback parser near the top
33
+ function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
34
+ // Try to parse as JSON first
35
+ try {
36
+ const parsed = JSON.parse(output);
37
+ if (Array.isArray(parsed)) {
38
+ const result: { label: string, bbox_2d: number[] }[] = [];
39
+ for (const obj of parsed) {
40
+ if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
41
+ if (Array.isArray(obj.bbox_2d[0])) {
42
+ for (const arr of obj.bbox_2d) {
43
+ if (Array.isArray(arr) && arr.length === 4) {
44
+ result.push({ label: obj.label, bbox_2d: arr });
45
+ }
46
+ }
47
+ } else if (obj.bbox_2d.length === 4) {
48
+ result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
49
+ }
50
+ }
51
+ }
52
+ if (result.length > 0) return result;
53
+ }
54
+ } catch (e) {}
55
+ // Fallback: extract all [x1, y1, x2, y2] arrays from the string
56
+ const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
57
+ const boxes: { label: string, bbox_2d: number[] }[] = [];
58
+ let match;
59
+ while ((match = boxRegex.exec(output)) !== null) {
60
+ const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
61
+ boxes.push({ label: '', bbox_2d: arr });
62
+ }
63
+ return boxes;
64
+ }
65
+
66
+ // NOTE: You must install onnxruntime-web:
67
+ // npm install onnxruntime-web
68
+ // @ts-ignore
69
+ import * as ort from 'onnxruntime-web';
70
+ // If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';
71
+
72
+ // Set your YOLOv8 ONNX model URL here:
73
+ const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx"; // <-- PUT YOUR ONNX FILE URL HERE
74
+
75
+ // Add these constants to match the YOLOv8 input size
76
+ const YOLOV8_INPUT_WIDTH = 640;
77
+ const YOLOV8_INPUT_HEIGHT = 480;
78
+
79
+ // 1. Load the ONNX model once
80
+ let yoloSession: ort.InferenceSession | null = null;
81
+ // Add a busy flag to prevent concurrent YOLOv8 inferences
82
+ let isYoloBusy = false;
83
+ async function loadYoloModel() {
84
+ if (!yoloSession) {
85
+ yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
86
+ }
87
+ return yoloSession;
88
+ }
89
+
90
// COCO class names for YOLOv8 (80 classes). The array index is the class id
// emitted by the model, so the order must not be changed.
const YOLO_CLASSES: string[] = [
  "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
  "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
  "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
];
101
+
102
+ // Preprocess video frame to YOLOv8 input tensor [1,3,640,640]
103
+ function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
104
+ const width = 640;
105
+ const height = 480;
106
+ const canvas = document.createElement('canvas');
107
+ canvas.width = width;
108
+ canvas.height = height;
109
+ const ctx = canvas.getContext('2d');
110
+ if (!ctx) throw new Error('Could not get 2D context');
111
+ ctx.drawImage(video, 0, 0, width, height);
112
+ const imageData = ctx.getImageData(0, 0, width, height);
113
+ const { data } = imageData;
114
+ // Convert to Float32Array [1,3,480,640], normalize to [0,1]
115
+ const floatData = new Float32Array(1 * 3 * height * width);
116
+ for (let i = 0; i < width * height; i++) {
117
+ floatData[i] = data[i * 4] / 255; // R
118
+ floatData[i + width * height] = data[i * 4 + 1] / 255; // G
119
+ floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
120
+ }
121
+ return new ort.Tensor('float32', floatData, [1, 3, height, width]);
122
+ }
123
+
124
+ // Update postprocessYoloOutput to remove unused inputWidth and inputHeight parameters
125
+ function postprocessYoloOutput(output: ort.Tensor) {
126
+ // output.dims: [1, num_detections, 6]
127
+ const data = output.data;
128
+ const numDetections = output.dims[1];
129
+ const results = [];
130
+ for (let i = 0; i < numDetections; i++) {
131
+ const offset = i * 6;
132
+ const x1 = data[offset];
133
+ const y1 = data[offset + 1];
134
+ const x2 = data[offset + 2];
135
+ const y2 = data[offset + 3];
136
+ const score = data[offset + 4];
137
+ const classId = data[offset + 5];
138
+ if (score < 0.2) continue; // adjust threshold as needed
139
+ results.push({
140
+ bbox: [x1, y1, x2, y2],
141
+ label: YOLO_CLASSES[classId] || `class_${classId}`,
142
+ score
143
+ });
144
+ }
145
+ return results;
146
+ }
147
+
148
+ // Helper type guard for annotation
149
+ function hasAnnotation(obj: any): obj is { annotation: string } {
150
+ return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
151
+ }
152
+
153
/**
 * Multi-source captioning view.
 *
 * Plays a video (an uploaded file, an uploaded image, or the bundled example
 * video), runs YOLOv8 object detection plus FastVLM inference against a
 * hidden "processing" copy of the media, and draws the resulting boxes and
 * labels onto a canvas overlaid on the visible player.
 */
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  // Read-only: no setter is ever destructured, so the example URL is fixed.
  const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [showProcessingVideo, setShowProcessingVideo] = useState(false);

  // NOTE(review): videoRef is registered in the sync effect below but never
  // attached to any element in this component's JSX — confirm whether the
  // play/seek sync effect can ever fire.
  const videoRef = useRef<HTMLVideoElement | null>(null);
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  // NOTE(review): boxHistoryRef is only ever read/filtered here, never
  // written in this component — presumably populated elsewhere; verify.
  const boxHistoryRef = useRef<any[]>([]);
  // Add a ref to store the latest YOLOv8 results (with optional FastVLM annotation)
  const lastYoloBoxesRef = React.useRef<any[]>([]);
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Remove videoProcessingRef and exampleProcessingRef
  // Add a single processingLoopRef
  // Acts as a run/stop latch shared by all processing loops.
  const processingLoopRef = React.useRef(false);

  // Drives per-frame detection while the latch is set; skips frames while a
  // YOLOv8 inference is already running.
  const processVideoLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      // Optionally log: "Inference already running, skipping frame"
      requestAnimationFrame(processVideoLoop);
      return;
    }
    await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
    // Schedule the next frame as soon as possible
    requestAnimationFrame(processVideoLoop);
  };
  // Example-video variant: throttled to roughly one detection per second.
  const processExampleLoop = async () => {
    while (processingLoopRef.current) {
      await yoloDetectionLoop(); // Replaced processVideoFrame with yoloDetectionLoop
      await new Promise(res => setTimeout(res, 1000));
    }
  };

  // Set your YOLOv8 ONNX backend API endpoint here:
  // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect"; // <-- PUT YOUR ENDPOINT HERE

  // Add this useEffect for overlay video synchronization
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    // Sync play/pause
    const onPlay = () => { if (overlay.paused) overlay.play(); };
    const onPause = () => { if (!overlay.paused) overlay.pause(); };
    // Sync seeking and time
    const onSeekOrTime = () => {
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
        overlay.currentTime = main.currentTime;
      }
    };
    main.addEventListener('play', onPlay);
    main.addEventListener('pause', onPause);
    main.addEventListener('seeked', onSeekOrTime);
    main.addEventListener('timeupdate', onSeekOrTime);
    // Clean up
    return () => {
      main.removeEventListener('play', onPlay);
      main.removeEventListener('pause', onPause);
      main.removeEventListener('seeked', onSeekOrTime);
      main.removeEventListener('timeupdate', onSeekOrTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);

  // Kick the hidden processing video whenever the source or mode changes;
  // the empty catch swallows autoplay-policy rejections on purpose.
  useEffect(() => {
    if ((mode === "File") && processingVideoRef.current) {
      processingVideoRef.current.play().catch(() => {});
    }
  }, [mode, videoUrl, uploadedUrl]);

  // Remove old prompt-based box extraction logic and only use the above for video frames.

  // Reset all processing state when the user picks a new file.
  // NOTE(review): the previous object URL is never revoked
  // (URL.revokeObjectURL), so repeated uploads leak blob URLs — confirm.
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };

  // Webcam mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    processVideoLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video mode: process frames with setInterval
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    processExampleLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);

  // File mode: process uploaded image (only on button click).
  // Draws the image to the canvas, runs FastVLM, extracts bounding boxes from
  // the raw model output, and redraws the image plus boxes in the callback.
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({w:canvas.width,h:canvas.height});
    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      let boxes = extractAllBoundingBoxes(output);
      console.log("Model output:", output);
      console.log("Boxes after normalization:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        // Canvas is sized to naturalWidth/Height above, so these scales are
        // currently 1:1; kept for when the canvas is sized differently.
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };

  // File mode: process uploaded video frames (start/stop)
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      // Small delay lets any in-flight loop iteration observe the latch
      // before it is re-armed.
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Handle start/stop for example video processing
  // NOTE(review): this starts processVideoLoop, not processExampleLoop, so
  // the example video is processed at full frame rate rather than the 1 s
  // throttle — confirm that is intended.
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Test draw box function — draws a fixed magenta rectangle so the user can
  // verify the canvas overlay is visible and positioned correctly.
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };

  // Interval-driven overlay renderer: every 100 ms redraws boxes from
  // boxHistoryRef that are younger than 2 s, denormalizing [0,1] coords to
  // the displayed video size.
  // NOTE(review): this effect and the rAF drawLoop effect below both clear
  // and repaint the same canvasRef each tick — they will fight over the
  // overlay; confirm which one is authoritative.
  useEffect(() => {
    const draw = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      const displayWidth = overlayVideo.clientWidth;
      const displayHeight = overlayVideo.clientHeight;
      canvas.width = displayWidth;
      canvas.height = displayHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const now = Date.now();
      const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
      if (boxHistory.length > 0) {
        // Fix: Draw all boxes, even if bbox_2d is an array of arrays
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes per label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({
                  ...b,
                  bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
                });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({
              ...b,
              bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
            });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    // Redraw on window resize
    const handleResize = () => draw();
    window.addEventListener('resize', handleResize);
    return () => {
      clearInterval(interval);
      window.removeEventListener('resize', handleResize);
    };
  }, [overlayVideoRef, canvasRef]);

  // Drawing loop: draws the latest YOLOv8 boxes every frame
  React.useEffect(() => {
    let running = true;
    function drawLoop() {
      if (!running) return;
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      const processingVideo = processingVideoRef.current;
      if (canvas && overlayVideo && processingVideo) {
        // Set canvas size to match the visible video
        canvas.width = overlayVideo.clientWidth;
        canvas.height = overlayVideo.clientHeight;
        const ctx = canvas.getContext('2d');
        if (ctx) {
          ctx.clearRect(0, 0, canvas.width, canvas.height);
          // Draw all YOLOv8 boxes from last detection
          const yoloBoxes = lastYoloBoxesRef.current;
          yoloBoxes.forEach((obj: any) => {
            // Scale from YOLOv8 input size to canvas size
            const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
            const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
            const [x1, y1, x2, y2] = obj.bbox;
            const drawX = x1 * scaleX;
            const drawY = y1 * scaleY;
            const drawW = (x2 - x1) * scaleX;
            const drawH = (y2 - y1) * scaleY;
            ctx.strokeStyle = '#00FFFF';
            ctx.lineWidth = 5;
            ctx.strokeRect(drawX, drawY, drawW, drawH);
            ctx.font = 'bold 22px Arial';
            // Draw YOLOv8 label and confidence
            const yoloLabel = obj.label || '';
            const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
            const yoloText = `${yoloLabel}${yoloScore}`;
            ctx.fillStyle = 'rgba(0,0,0,0.7)';
            const yoloTextWidth = ctx.measureText(yoloText).width + 8;
            ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
            ctx.fillStyle = '#00FFFF';
            ctx.fillText(yoloText, drawX, drawY - 4);
            // Draw FastVLM annotation below the box if available
            if (hasAnnotation(obj)) {
              ctx.font = 'bold 18px Arial';
              ctx.fillStyle = 'rgba(0,0,0,0.7)';
              const annTextWidth = ctx.measureText(obj.annotation).width + 8;
              ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
              ctx.fillStyle = '#00FFFF';
              ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
            }
          });
        }
      }
      requestAnimationFrame(drawLoop);
    }
    drawLoop();
    return () => { running = false; };
  }, [overlayVideoRef, canvasRef, processingVideoRef]);

  // YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
  const yoloDetectionLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      requestAnimationFrame(yoloDetectionLoop);
      return;
    }
    isYoloBusy = true;
    try {
      const processingVideo = processingVideoRef.current;
      if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
        isYoloBusy = false;
        requestAnimationFrame(yoloDetectionLoop);
        return;
      }
      // Run YOLOv8 detection
      const session = await loadYoloModel();
      const inputTensor = preprocessFrameToTensor(processingVideo);
      const feeds: Record<string, ort.Tensor> = {};
      feeds[session.inputNames[0]] = inputTensor;
      const results = await session.run(feeds);
      const output = results[session.outputNames[0]];
      const detections = postprocessYoloOutput(output);
      lastYoloBoxesRef.current = detections;
      // Run FastVLM on the full frame (wait for YOLOv8 to finish)
      await runInference(processingVideo, prompt, (output: string) => {
        setDebugOutput(output);
      });
    } catch (err) {
      console.error('YOLOv8+FastVLM error:', err);
    } finally {
      isYoloBusy = false;
      // Re-arms itself every pass; the processingLoopRef check at the top is
      // what eventually stops the chain.
      requestAnimationFrame(yoloDetectionLoop);
    }
  };

  // Add this effect after the processing loop and toggle handlers
  useEffect(() => {
    // Stop processing loop on video source change or processing toggle
    processingLoopRef.current = false;
    // Start processing loop for the correct video after refs update
    setTimeout(() => {
      if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      } else if (exampleProcessing && !uploadedFile) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      }
    }, 100);
    // eslint-disable-next-line
  }, [uploadedFile, videoProcessing, exampleProcessing]);

  return (
    <div className="absolute inset-0 text-white">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>

        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Add toggle button above video area */}
              <div className="mb-2 w-full max-w-xl flex justify-end">
                <button
                  className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
                  onClick={() => setShowProcessingVideo(v => !v)}
                  type="button"
                >
                  {showProcessingVideo ? 'Hide' : 'Show'} Processed Video
                </button>
              </div>
              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                    onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
                      if (canvasRef.current) {
                        canvasRef.current.width = e.currentTarget.clientWidth;
                        canvasRef.current.height = e.currentTarget.clientHeight;
                      }
                    }}
                    onResize={() => {
                      if (canvasRef.current && overlayVideoRef.current) {
                        canvasRef.current.width = overlayVideoRef.current.clientWidth;
                        canvasRef.current.height = overlayVideoRef.current.clientHeight;
                      }
                    }}
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={uploadedUrl}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* Show example video if no file uploaded */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              {/* Debug panel: raw dimensions and the last raw model output */}
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}