Spaces:

Quazim0t0
/

FastVLMBoxes

Running

App Files Files Community

Quazim0t0 committed 2 days ago

Commit

5bcc8b4

verified ·

1 Parent(s): d6ad5d9

Upload 36 files

Browse files

Files changed (1) hide show

src/components/MultiSourceCaptioningView.tsx +169 -12

src/components/MultiSourceCaptioningView.tsx CHANGED Viewed

@@ -9,21 +9,31 @@ const EXAMPLE_VIDEO_URL =
   "https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";
 const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";
 export default function MultiSourceCaptioningView() {
-  const [mode, setMode] = useState<Mode>("URL");
   const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
   const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
   const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
   const [processing, setProcessing] = useState(false);
   const [error, setError] = useState<string | null>(null);
   const [webcamActive, setWebcamActive] = useState(false);
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
   const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, runInference } = useVLMContext();
-  // Webcam setup and teardown
   useEffect(() => {
     if (mode !== "Webcam") {
       if (webcamStreamRef.current) {
@@ -57,7 +67,7 @@ export default function MultiSourceCaptioningView() {
     };
   }, [mode]);
-  // Process webcam frames
   useEffect(() => {
     if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
     let interval: ReturnType<typeof setInterval> | null = null;
@@ -74,16 +84,13 @@ export default function MultiSourceCaptioningView() {
       try {
         setProcessing(true);
         setError(null);
-        // Use FastVLM inference on the current frame
         const fakeVideo = {
           videoWidth: canvas.width,
           videoHeight: canvas.height,
           getContext: () => ctx,
         } as unknown as HTMLVideoElement;
         const result = await runInference(fakeVideo, prompt);
-        // Clear canvas and redraw frame
         ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
-        // Parse and draw boxes
         const boxes = extractJsonFromMarkdown(result) || [];
         drawBoundingBoxesOnCanvas(ctx, boxes);
       } catch (e) {
@@ -100,7 +107,7 @@ export default function MultiSourceCaptioningView() {
     };
   }, [mode, isLoaded, prompt, runInference, webcamActive]);
-  // Process video frames for URL mode
   useEffect(() => {
     if (mode !== "URL" || !isLoaded) return;
     let interval: ReturnType<typeof setInterval> | null = null;
@@ -117,16 +124,13 @@ export default function MultiSourceCaptioningView() {
       try {
         setProcessing(true);
         setError(null);
-        // Use FastVLM inference on the current frame
         const fakeVideo = {
           videoWidth: canvas.width,
           videoHeight: canvas.height,
           getContext: () => ctx,
         } as unknown as HTMLVideoElement;
         const result = await runInference(fakeVideo, prompt);
-        // Clear canvas and redraw frame
         ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
-        // Parse and draw boxes
         const boxes = extractJsonFromMarkdown(result) || [];
         drawBoundingBoxesOnCanvas(ctx, boxes);
       } catch (e) {
@@ -143,6 +147,86 @@ export default function MultiSourceCaptioningView() {
     };
   }, [mode, isLoaded, prompt, runInference]);
   return (
     <div className="absolute inset-0 text-white">
       <div className="flex flex-col items-center justify-center h-full w-full">
@@ -243,8 +327,81 @@ export default function MultiSourceCaptioningView() {
             </div>
           )}
           {mode === "File" && (
-            <div className="w-full text-center">
-              <p className="mb-4">Upload a video or image file for detection (coming soon).</p>
             </div>
           )}
         </div>

   "https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";
 const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";
+function isImageFile(file: File) { // Reports whether the file's MIME type (file.type) begins with "image/".
+  return file.type.startsWith("image/");
+}
+function isVideoFile(file: File) { // Reports whether the file's MIME type (file.type) begins with "video/".
+  return file.type.startsWith("video/");
+}
 export default function MultiSourceCaptioningView() {
+  const [mode, setMode] = useState<Mode>("File");
   const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
   const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
   const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
   const [processing, setProcessing] = useState(false);
   const [error, setError] = useState<string | null>(null);
   const [webcamActive, setWebcamActive] = useState(false);
+  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
+  const [uploadedUrl, setUploadedUrl] = useState<string>("");
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
+  const imageRef = useRef<HTMLImageElement | null>(null);
   const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, runInference } = useVLMContext();
+  // Webcam setup and teardown (unchanged)
   useEffect(() => {
     if (mode !== "Webcam") {
       if (webcamStreamRef.current) {
     };
   }, [mode]);
+  // Process webcam frames (unchanged)
   useEffect(() => {
     if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
     let interval: ReturnType<typeof setInterval> | null = null;
       try {
         setProcessing(true);
         setError(null);
         const fakeVideo = {
           videoWidth: canvas.width,
           videoHeight: canvas.height,
           getContext: () => ctx,
         } as unknown as HTMLVideoElement;
         const result = await runInference(fakeVideo, prompt);
         ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
         const boxes = extractJsonFromMarkdown(result) || [];
         drawBoundingBoxesOnCanvas(ctx, boxes);
       } catch (e) {
     };
   }, [mode, isLoaded, prompt, runInference, webcamActive]);
+  // Process video frames for URL mode (unchanged)
   useEffect(() => {
     if (mode !== "URL" || !isLoaded) return;
     let interval: ReturnType<typeof setInterval> | null = null;
       try {
         setProcessing(true);
         setError(null);
         const fakeVideo = {
           videoWidth: canvas.width,
           videoHeight: canvas.height,
           getContext: () => ctx,
         } as unknown as HTMLVideoElement;
         const result = await runInference(fakeVideo, prompt);
         ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
         const boxes = extractJsonFromMarkdown(result) || [];
         drawBoundingBoxesOnCanvas(ctx, boxes);
       } catch (e) {
     };
   }, [mode, isLoaded, prompt, runInference]);
+  // File mode: run detection on the uploaded image.
+  // Re-runs whenever the mode, model readiness, prompt, inference function or
+  // uploaded file changes. If the <img> has already finished loading the
+  // current source when the effect runs (for example only the prompt changed),
+  // detection starts immediately; otherwise it waits for the `load` event.
+  // The cleanup detaches the listener and marks the run as cancelled so a
+  // superseded run never draws on the canvas or reports an error.
+  useEffect(() => {
+    if (mode !== "File" || !isLoaded || !uploadedFile || !isImageFile(uploadedFile)) return;
+    const img = imageRef.current;
+    const canvas = canvasRef.current;
+    if (!img || !canvas) return;
+    let cancelled = false;
+    const detect = async () => {
+      if (cancelled) return;
+      // Size the canvas to the image's intrinsic resolution before drawing.
+      canvas.width = img.naturalWidth;
+      canvas.height = img.naturalHeight;
+      const ctx = canvas.getContext("2d");
+      if (!ctx) return;
+      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+      try {
+        setProcessing(true);
+        setError(null);
+        // runInference is handed an object exposing the canvas size and 2D
+        // context in place of a real video element.
+        const fakeVideo = {
+          videoWidth: canvas.width,
+          videoHeight: canvas.height,
+          getContext: () => ctx,
+        } as unknown as HTMLVideoElement;
+        const result = await runInference(fakeVideo, prompt);
+        // A newer run (or unmount) took over while inference was pending.
+        if (cancelled) return;
+        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+        const boxes = extractJsonFromMarkdown(result) || [];
+        drawBoundingBoxesOnCanvas(ctx, boxes);
+      } catch (e) {
+        if (!cancelled) setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        setProcessing(false);
+      }
+    };
+    const onLoad = () => {
+      void detect();
+    };
+    // `complete` can still describe the previous source right after `src`
+    // changed, so also require that the loaded source is the current one.
+    const alreadyLoaded =
+      img.complete && img.naturalWidth > 0 && img.currentSrc === img.src;
+    if (alreadyLoaded) {
+      void detect();
+    } else {
+      img.addEventListener("load", onLoad);
+    }
+    return () => {
+      cancelled = true;
+      img.removeEventListener("load", onLoad);
+    };
+  }, [mode, isLoaded, prompt, runInference, uploadedFile]);
+  // File mode: run detection on frames of the uploaded video.
+  // Once per second the current frame is copied to the canvas, passed to
+  // runInference, and the returned boxes are drawn over a fresh copy of the
+  // frame. A tick is skipped while the previous inference is still pending so
+  // runs never overlap on the shared canvas, and the cleanup stops the timer
+  // and prevents an in-flight run from drawing or reporting an error.
+  useEffect(() => {
+    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile)) return;
+    let cancelled = false;
+    let inFlight = false;
+    const processFrame = async () => {
+      if (cancelled || inFlight) return;
+      if (!videoRef.current || !canvasRef.current) return;
+      const video = videoRef.current;
+      const canvas = canvasRef.current;
+      // Nothing to analyse until the video is playing and has known dimensions.
+      if (video.paused || video.ended || video.videoWidth === 0) return;
+      canvas.width = video.videoWidth;
+      canvas.height = video.videoHeight;
+      const ctx = canvas.getContext("2d");
+      if (!ctx) return;
+      ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
+      inFlight = true;
+      try {
+        setProcessing(true);
+        setError(null);
+        // runInference is handed an object exposing the canvas size and 2D
+        // context in place of a real video element.
+        const fakeVideo = {
+          videoWidth: canvas.width,
+          videoHeight: canvas.height,
+          getContext: () => ctx,
+        } as unknown as HTMLVideoElement;
+        const result = await runInference(fakeVideo, prompt);
+        // The effect was torn down while inference was pending.
+        if (cancelled) return;
+        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
+        const boxes = extractJsonFromMarkdown(result) || [];
+        drawBoundingBoxesOnCanvas(ctx, boxes);
+      } catch (e) {
+        if (!cancelled) setError(e instanceof Error ? e.message : String(e));
+      } finally {
+        inFlight = false;
+        setProcessing(false);
+      }
+    };
+    const interval = setInterval(() => {
+      void processFrame();
+    }, 1000);
+    return () => {
+      cancelled = true;
+      clearInterval(interval);
+    };
+  }, [mode, isLoaded, prompt, runInference, uploadedFile]);
+  // Handle file upload: store the first selected file together with a blob URL
+  // for previewing it, or clear both when the selection is empty.
+  // The blob URL of the previously selected file is revoked first so repeated
+  // uploads do not accumulate unreleased blobs. A file that is neither an image
+  // nor a video is rejected with an error message, because no preview or
+  // detection path exists for it.
+  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    const file = e.target.files?.[0] ?? null;
+    if (uploadedUrl) URL.revokeObjectURL(uploadedUrl);
+    if (file && !isImageFile(file) && !isVideoFile(file)) {
+      setUploadedFile(null);
+      setUploadedUrl("");
+      setError("Unsupported file type. Please choose an image or a video.");
+      return;
+    }
+    setUploadedFile(file);
+    setUploadedUrl(file ? URL.createObjectURL(file) : "");
+    setError(null);
+  };
   return (
     <div className="absolute inset-0 text-white">
       <div className="flex flex-col items-center justify-center h-full w-full">
             </div>
           )}
           {mode === "File" && (
+            <div className="w-full text-center flex flex-col items-center">
+              <div className="mb-4 w-full max-w-xl">
+                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+                <textarea
+                  className="w-full p-2 rounded-lg text-black"
+                  rows={3}
+                  value={prompt}
+                  onChange={(e) => setPrompt(e.target.value)}
+                />
+              </div>
+              <div className="mb-4 w-full max-w-xl">
+                <input
+                  type="file"
+                  accept="image/*,video/*"
+                  onChange={handleFileChange}
+                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
+                />
+              </div>
+              {/* Show uploaded image */}
+              {uploadedFile && isImageFile(uploadedFile) && (
+                <div className="relative w-full max-w-xl">
+                  <img
+                    ref={imageRef}
+                    src={uploadedUrl}
+                    alt="Uploaded"
+                    className="w-full rounded-lg shadow-lg mb-2"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 10, pointerEvents: "none" }}
+                  />
+                </div>
+              )}
+              {/* Show uploaded video */}
+              {uploadedFile && isVideoFile(uploadedFile) && (
+                <div className="relative w-full max-w-xl">
+                  <video
+                    ref={videoRef}
+                    src={uploadedUrl}
+                    controls
+                    autoPlay
+                    loop
+                    className="w-full rounded-lg shadow-lg mb-2"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 10, pointerEvents: "none" }}
+                  />
+                </div>
+              )}
+              {/* Show example video if no file uploaded */}
+              {!uploadedFile && (
+                <div className="relative w-full max-w-xl">
+                  <video
+                    ref={videoRef}
+                    src={EXAMPLE_VIDEO_URL}
+                    controls
+                    autoPlay
+                    loop
+                    className="w-full rounded-lg shadow-lg mb-2"
+                    style={{ background: "#222" }}
+                  />
+                  <canvas
+                    ref={canvasRef}
+                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
+                    style={{ zIndex: 10, pointerEvents: "none" }}
+                  />
+                </div>
+              )}
+              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
+              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
             </div>
           )}
         </div>