Spaces:

Quazim0t0
/

FastVLMBoxes

Running

App Files Files Community

Quazim0t0 committed 1 day ago

Commit

13cb9dc

verified ·

1 Parent(s): 35bd577

Upload 38 files

Browse files

Files changed (2) hide show

src/components/MultiSourceCaptioningView.tsx +1 -9
src/workers/inferenceWorker.ts +65 -4

src/components/MultiSourceCaptioningView.tsx CHANGED Viewed

@@ -8,14 +8,6 @@ type Mode = typeof MODES[number];
 const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
 const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
-function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
-  if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
-    const label = arr[0];
-    return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
-  }
-  return [];
-}
 function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
   if (!raw) return [];
   let boxes = [];
@@ -102,7 +94,7 @@ export default function MultiSourceCaptioningView() {
   const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
   const inferenceWorkerRef = useRef<Worker | null>(null);
-  const [useWorker, setUseWorker] = useState(true); // Toggle for worker usage
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);

 const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
 const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
 function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
   if (!raw) return [];
   let boxes = [];
   const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
   const inferenceWorkerRef = useRef<Worker | null>(null);
+  const [useWorker] = useState(true);
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);

src/workers/inferenceWorker.ts CHANGED Viewed

@@ -1,9 +1,70 @@
 // src/workers/inferenceWorker.ts
 self.onmessage = async (event) => {
   const { imageData, prompt } = event.data;
-  // TODO: Import and run your real model inference here
-  // For now, just echo a fake result for testing
-  const result = [{ label: "person", bbox_2d: [[100, 50], [200, 300]] }];
-  self.postMessage(result);
 };
 export {};

 // src/workers/inferenceWorker.ts
+// Import transformers.js from CDN
+importScripts('https://cdn.jsdelivr.net/npm/@xenova/[email protected]/dist/transformers.min.js');
+let processor = null; // Set by loadModelAndProcessor() from AutoProcessor.from_pretrained(MODEL_ID).
+let model = null; // Set by loadModelAndProcessor() from AutoModelForImageTextToText.from_pretrained(MODEL_ID).
+let isLoaded = false; // Flipped to true once both processor and model have finished loading.
+const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX"; // Model identifier passed to both from_pretrained calls.
+// In-flight (or completed) load shared by all callers; null when no load has
+// been started or the previous attempt failed.
+let loadPromise: Promise<void> | null = null;
+/**
+ * Loads the processor and model for MODEL_ID once and stores them in the
+ * module-level `processor` and `model` variables.
+ *
+ * Messages that arrive while the first load is still running await the same
+ * promise instead of starting a second load. If loading fails, the cached
+ * promise is cleared so a later call can retry, and the error is rethrown.
+ */
+async function loadModelAndProcessor() {
+  if (isLoaded) return;
+  if (loadPromise === null) {
+    loadPromise = (async () => {
+      // A worker has no `window`; its global scope is `self`.
+      // @ts-ignore
+      processor = await self.transformers.AutoProcessor.from_pretrained(MODEL_ID);
+      // @ts-ignore
+      model = await self.transformers.AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
+        dtype: {
+          embed_tokens: "fp16",
+          vision_encoder: "q4",
+          decoder_model_merged: "q4",
+        },
+        device: "webgpu",
+      });
+      isLoaded = true;
+    })().catch((error) => {
+      // Reset asynchronously (after the assignment above) so the next call retries.
+      loadPromise = null;
+      throw error;
+    });
+  }
+  await loadPromise;
+}
 self.onmessage = async (event) => {
   const { imageData, prompt } = event.data;
+  // Handles one inference request. The message must carry `imageData`
+  // (an object with `data`, `width` and `height`) and a text `prompt`.
+  // The decoded model output is posted back as a trimmed string.
+  await loadModelAndProcessor();
+  // Convert imageData to RawImage. The trailing 4 is the channel count
+  // (presumably RGBA pixels from a canvas ImageData; confirm against caller).
+  // A worker has no `window`; the library global is reached through `self`.
+  // @ts-ignore
+  const rawImg = new self.transformers.RawImage(
+    imageData.data,
+    imageData.width,
+    imageData.height,
+    4
+  );
+  const messages = [
+    {
+      role: "system",
+      content: `You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.`,
+    },
+    { role: "user", content: `<image>${prompt}` },
+  ];
+  // @ts-ignore
+  const chatPrompt = processor.apply_chat_template(messages, { add_generation_prompt: true });
+  // @ts-ignore
+  const inputs = await processor(rawImg, chatPrompt, { add_special_tokens: false });
+  // Greedy decoding (do_sample: false), capped at 512 new tokens.
+  // @ts-ignore
+  const outputs = await model.generate({
+    ...inputs,
+    max_new_tokens: 512,
+    do_sample: false,
+    repetition_penalty: 1.2,
+  });
+  // Skip the first `input_ids` length positions so only newly generated
+  // tokens are decoded, not the prompt.
+  // @ts-ignore
+  const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), {
+    skip_special_tokens: true,
+  });
+  // Return the decoded result (should be a string or JSON)
+  self.postMessage(decoded[0].trim());
 };
 export {};