Spaces:

Quazim0t0
/

FastVLMBoxes

Running

App Files Files Community

Quazim0t0 committed 1 day ago

Commit

d6cc922

verified ·

1 Parent(s): b5e736c

Upload 51 files

Browse files

Files changed (2) hide show

README.md +12 -12
src/components/MultiSourceCaptioningView.tsx +2 -80

README.md CHANGED Viewed

@@ -1,13 +1,13 @@
----
-title: FastVLMBoxes (Use File Upload, Not Webcam)
-emoji: 📈
-colorFrom: purple
-colorTo: pink
-sdk: static
-pinned: false
-app_build_command: npm run build
-app_file: dist/index.html
-short_description: Real-time video boxing powered by FastVLM
----
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: FastVLM WebGPU
+emoji: 🍎
+colorFrom: blue
+colorTo: green
+sdk: static
+pinned: false
+app_build_command: npm run build
+app_file: dist/index.html
+short_description: Real-time video captioning powered by FastVLM
+---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

src/components/MultiSourceCaptioningView.tsx CHANGED Viewed

@@ -2,11 +2,11 @@ import { useState, useRef, useEffect } from "react";
 import { useVLMContext } from "../context/useVLMContext";
 import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
-const MODES = ["Webcam", "File"] as const;
 type Mode = typeof MODES[number];
 const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
-const EXAMPLE_PROMPT = "Detect each individual bird in the image. The birds are moving. For each object, output a JSON array of objects with fields. Each bird should have its own ([x1, y1, x2, y2]) where coordinates are in pixel values. This should be used to draw a box using the points around the bird. Follow the format of this Example: [x1, y1, x2, y2], [x1, y1, x2, y2]";
 function isImageFile(file: File) {
   return file.type.startsWith("image/");
@@ -68,7 +68,6 @@ export default function MultiSourceCaptioningView() {
   const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
   const [processing, setProcessing] = useState(false);
   const [error, setError] = useState<string | null>(null);
-  const [webcamActive, setWebcamActive] = useState(false);
   const [uploadedFile, setUploadedFile] = useState<File | null>(null);
   const [uploadedUrl, setUploadedUrl] = useState<string>("");
   const [videoProcessing, setVideoProcessing] = useState(false);
@@ -85,7 +84,6 @@ export default function MultiSourceCaptioningView() {
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
   const imageRef = useRef<HTMLImageElement | null>(null);
   const boxHistoryRef = useRef<any[]>([]);
-  const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
   // Add this useEffect for overlay video synchronization
@@ -182,53 +180,7 @@ export default function MultiSourceCaptioningView() {
     setExampleProcessing(false);
   };
-  // Webcam setup and teardown (unchanged)
-  useEffect(() => {
-    if (mode !== "Webcam") {
-      if (webcamStreamRef.current) {
-        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
-        webcamStreamRef.current = null;
-      }
-      setWebcamActive(false);
-      return;
-    }
-    const setupWebcam = async () => {
-      try {
-        setError(null);
-        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
-        webcamStreamRef.current = stream;
-        if (videoRef.current) {
-          videoRef.current.srcObject = stream;
-          setWebcamActive(true);
-        }
-      } catch (e) {
-        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
-        setWebcamActive(false);
-      }
-    };
-    setupWebcam();
-    return () => {
-      if (webcamStreamRef.current) {
-        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
-        webcamStreamRef.current = null;
-      }
-      setWebcamActive(false);
-    };
-  }, [mode]);
   // Webcam mode: process frames with setInterval
-  useEffect(() => {
-    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
-    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
-    return () => {
-      if (interval) clearInterval(interval);
-    };
-  }, [mode, isLoaded, prompt, runInference, webcamActive]);
-  // File video mode: process frames with setInterval
   useEffect(() => {
     if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
     let interval: ReturnType<typeof setInterval> | null = null;
@@ -386,36 +338,6 @@ export default function MultiSourceCaptioningView() {
         {/* Mode Content */}
         <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
-          {mode === "Webcam" && (
-            <div className="w-full text-center flex flex-col items-center">
-              <div className="mb-4 w-full max-w-xl">
-                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-                <textarea
-                  className="w-full p-2 rounded-lg text-black"
-                  rows={3}
-                  value={prompt}
-                  onChange={(e) => setPrompt(e.target.value)}
-                />
-              </div>
-              <div className="relative w-full max-w-xl">
-                <video
-                  ref={videoRef}
-                  autoPlay
-                  muted
-                  playsInline
-                  className="w-full rounded-lg shadow-lg mb-2"
-                  style={{ background: "#222" }}
-                />
-                <canvas
-                  ref={canvasRef}
-                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                  style={{ zIndex: 10, pointerEvents: "none" }}
-                />
-              </div>
-              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
-              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
-            </div>
-          )}
           {mode === "File" && (
             <div className="w-full text-center flex flex-col items-center">
               <div className="mb-4 w-full max-w-xl">

 import { useVLMContext } from "../context/useVLMContext";
 import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
+const MODES = ["File"] as const;
 type Mode = typeof MODES[number];
 const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
+const EXAMPLE_PROMPT = "Detect each individual animated characters in the image. The characters are moving. For each character, output a JSON array of objects with fields. Each character should have its own ([x1, y1, x2, y2]) where coordinates are in pixel values. No coordinates should be the same. This should be used to draw a box using the points around the character. This is an example of two boxes, the format of this : [x1, y1, x2, y2], [x1, y1, x2, y2]";
 function isImageFile(file: File) {
   return file.type.startsWith("image/");
   const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
   const [processing, setProcessing] = useState(false);
   const [error, setError] = useState<string | null>(null);
   const [uploadedFile, setUploadedFile] = useState<File | null>(null);
   const [uploadedUrl, setUploadedUrl] = useState<string>("");
   const [videoProcessing, setVideoProcessing] = useState(false);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
   const imageRef = useRef<HTMLImageElement | null>(null);
   const boxHistoryRef = useRef<any[]>([]);
   const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
   // Add this useEffect for overlay video synchronization
     setExampleProcessing(false);
   };
  // File video mode: process frames with setInterval
   useEffect(() => {
     if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
     let interval: ReturnType<typeof setInterval> | null = null;
         {/* Mode Content */}
         <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
           {mode === "File" && (
             <div className="w-full text-center flex flex-col items-center">
               <div className="mb-4 w-full max-w-xl">