Quazim0t0 committed on
Commit
d6cc922
·
verified ·
1 Parent(s): b5e736c

Upload 51 files

Browse files
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: FastVLMBoxes (Use File Upload, Not Webcam)
3
- emoji: 📈
4
- colorFrom: purple
5
- colorTo: pink
6
- sdk: static
7
- pinned: false
8
- app_build_command: npm run build
9
- app_file: dist/index.html
10
- short_description: Real-time video boxing powered by FastVLM
11
- ---
12
-
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: FastVLM WebGPU
3
+ emoji: 🍎
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: static
7
+ pinned: false
8
+ app_build_command: npm run build
9
+ app_file: dist/index.html
10
+ short_description: Real-time video captioning powered by FastVLM
11
+ ---
12
+
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -2,11 +2,11 @@ import { useState, useRef, useEffect } from "react";
2
  import { useVLMContext } from "../context/useVLMContext";
3
  import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
 
5
- const MODES = ["Webcam", "File"] as const;
6
  type Mode = typeof MODES[number];
7
 
8
  const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
9
- const EXAMPLE_PROMPT = "Detect each individual bird in the image. The birds are moving. For each object, output a JSON array of objects with fields. Each bird should have its own ([x1, y1, x2, y2]) where coordinates are in pixel values. This should be used to draw a box using the points around the bird. Follow the format of this Example: [x1, y1, x2, y2], [x1, y1, x2, y2]";
10
 
11
  function isImageFile(file: File) {
12
  return file.type.startsWith("image/");
@@ -68,7 +68,6 @@ export default function MultiSourceCaptioningView() {
68
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
69
  const [processing, setProcessing] = useState(false);
70
  const [error, setError] = useState<string | null>(null);
71
- const [webcamActive, setWebcamActive] = useState(false);
72
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
73
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
74
  const [videoProcessing, setVideoProcessing] = useState(false);
@@ -85,7 +84,6 @@ export default function MultiSourceCaptioningView() {
85
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
86
  const imageRef = useRef<HTMLImageElement | null>(null);
87
  const boxHistoryRef = useRef<any[]>([]);
88
- const webcamStreamRef = useRef<MediaStream | null>(null);
89
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
90
 
91
  // Add this useEffect for overlay video synchronization
@@ -182,53 +180,7 @@ export default function MultiSourceCaptioningView() {
182
  setExampleProcessing(false);
183
  };
184
 
185
- // Webcam setup and teardown (unchanged)
186
- useEffect(() => {
187
- if (mode !== "Webcam") {
188
- if (webcamStreamRef.current) {
189
- webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
190
- webcamStreamRef.current = null;
191
- }
192
- setWebcamActive(false);
193
- return;
194
- }
195
- const setupWebcam = async () => {
196
- try {
197
- setError(null);
198
- const stream = await navigator.mediaDevices.getUserMedia({ video: true });
199
- webcamStreamRef.current = stream;
200
- if (videoRef.current) {
201
- videoRef.current.srcObject = stream;
202
- setWebcamActive(true);
203
- }
204
- } catch (e) {
205
- setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
206
- setWebcamActive(false);
207
- }
208
- };
209
- setupWebcam();
210
- return () => {
211
- if (webcamStreamRef.current) {
212
- webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
213
- webcamStreamRef.current = null;
214
- }
215
- setWebcamActive(false);
216
- };
217
- }, [mode]);
218
-
219
  // Webcam mode: process frames with setInterval
220
- useEffect(() => {
221
- if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
222
- let interval: ReturnType<typeof setInterval> | null = null;
223
- interval = setInterval(() => {
224
- processVideoFrame();
225
- }, 1000);
226
- return () => {
227
- if (interval) clearInterval(interval);
228
- };
229
- }, [mode, isLoaded, prompt, runInference, webcamActive]);
230
-
231
- // File video mode: process frames with setInterval
232
  useEffect(() => {
233
  if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
234
  let interval: ReturnType<typeof setInterval> | null = null;
@@ -386,36 +338,6 @@ export default function MultiSourceCaptioningView() {
386
 
387
  {/* Mode Content */}
388
  <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
389
- {mode === "Webcam" && (
390
- <div className="w-full text-center flex flex-col items-center">
391
- <div className="mb-4 w-full max-w-xl">
392
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
393
- <textarea
394
- className="w-full p-2 rounded-lg text-black"
395
- rows={3}
396
- value={prompt}
397
- onChange={(e) => setPrompt(e.target.value)}
398
- />
399
- </div>
400
- <div className="relative w-full max-w-xl">
401
- <video
402
- ref={videoRef}
403
- autoPlay
404
- muted
405
- playsInline
406
- className="w-full rounded-lg shadow-lg mb-2"
407
- style={{ background: "#222" }}
408
- />
409
- <canvas
410
- ref={canvasRef}
411
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
412
- style={{ zIndex: 10, pointerEvents: "none" }}
413
- />
414
- </div>
415
- {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
416
- {error && <div className="text-red-400 mt-2">Error: {error}</div>}
417
- </div>
418
- )}
419
  {mode === "File" && (
420
  <div className="w-full text-center flex flex-col items-center">
421
  <div className="mb-4 w-full max-w-xl">
 
2
  import { useVLMContext } from "../context/useVLMContext";
3
  import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
 
5
+ const MODES = ["File"] as const;
6
  type Mode = typeof MODES[number];
7
 
8
  const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
9
+ const EXAMPLE_PROMPT = "Detect each individual animated characters in the image. The characters are moving. For each character, output a JSON array of objects with fields. Each character should have its own ([x1, y1, x2, y2]) where coordinates are in pixel values. No coordinates should be the same. This should be used to draw a box using the points around the character. This is an example of two boxes, the format of this : [x1, y1, x2, y2], [x1, y1, x2, y2]";
10
 
11
  function isImageFile(file: File) {
12
  return file.type.startsWith("image/");
 
68
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
69
  const [processing, setProcessing] = useState(false);
70
  const [error, setError] = useState<string | null>(null);
 
71
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
72
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
73
  const [videoProcessing, setVideoProcessing] = useState(false);
 
84
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
85
  const imageRef = useRef<HTMLImageElement | null>(null);
86
  const boxHistoryRef = useRef<any[]>([]);
 
87
  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
88
 
89
  // Add this useEffect for overlay video synchronization
 
180
  setExampleProcessing(false);
181
  };
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  // File video mode: process frames with setInterval
 
 
 
 
 
 
 
 
 
 
 
 
184
  useEffect(() => {
185
  if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
186
  let interval: ReturnType<typeof setInterval> | null = null;
 
338
 
339
  {/* Mode Content */}
340
  <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  {mode === "File" && (
342
  <div className="w-full text-center flex flex-col items-center">
343
  <div className="mb-4 w-full max-w-xl">