Spaces:

Quazim0t0
/

FastVLMBoxes

Running

File size: 1,942 Bytes

/// <reference lib="webworker" />
// Load the Transformers.js bundle into this worker via importScripts; the rest of
// this file reads the library from a `transformers` global.
// NOTE(review): the package specifier in the URL below contains "[email protected]",
// which looks like an email-obfuscation artifact where a version tag
// ("@xenova/transformers@<version>") would normally be — confirm the intended URL.
// @ts-ignore
importScripts('https://cdn.jsdelivr.net/npm/@xenova/[email protected]/dist/transformers.min.js');

/** Model identifier handed to every `from_pretrained` call in this worker. */
const MODEL_ID = 'onnx-community/FastVLM-0.5B-ONNX';

// Handles filled in by loadModelAndProcessor(); both remain null until it has run.
let processor: any = null,
  model: any = null;

// Becomes true once both handles above have been assigned.
let isLoaded = false;

// Promise for the load currently in progress (or already finished). Sharing it means
// messages that arrive while the first load is still running await the same work
// instead of each starting their own.
let loadInFlight: Promise<void> | null = null;

/**
 * Loads the processor and the model for MODEL_ID exactly once and stores them in the
 * module-level `processor` / `model` variables, then sets `isLoaded`.
 *
 * Safe to call repeatedly and concurrently: calls made after a successful load return
 * immediately, and calls made during a load wait for that same load.
 *
 * @returns A promise that resolves once both handles are ready.
 * @throws Re-throws whatever `from_pretrained` rejects with; the cached promise is
 *   cleared in that case so a later call can retry.
 */
async function loadModelAndProcessor() {
  if (isLoaded) return;

  if (loadInFlight === null) {
    loadInFlight = (async () => {
      // A worker has no `window`; the library global must be read from the worker's
      // own global object, which `globalThis` refers to in every environment.
      const { AutoProcessor, AutoModelForImageTextToText } = (globalThis as any).transformers;

      processor = await AutoProcessor.from_pretrained(MODEL_ID);
      model = await AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
        // Per-component precision settings, keyed by sub-model name.
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
        device: "webgpu",
      });
      isLoaded = true;
    })().catch((err) => {
      // Forget the failed attempt so the next request can try again.
      loadInFlight = null;
      throw err;
    });
  }

  await loadInFlight;
}

/**
 * Handles one inference request from the owning thread.
 *
 * Expects `event.data` to carry `imageData` (with `data`, `width`, `height`) and a
 * text `prompt`. Ensures the model is loaded, runs generation on the image + prompt,
 * and posts the trimmed decoded text of the first sequence back to the caller.
 *
 * Errors are not caught here: a failure in loading or generation rejects the promise
 * returned by this handler and no message is posted.
 */
self.onmessage = async (event) => {
  const { imageData, prompt } = event.data;
  await loadModelAndProcessor();

  // A worker has no `window`; read the library global from the worker's own global
  // object instead, which `globalThis` refers to in every environment.
  const { RawImage } = (globalThis as any).transformers;

  // Wrap the raw pixel buffer. The trailing 4 is the channel count
  // (assumes imageData is RGBA, like a canvas ImageData — confirm against caller).
  const rawImg = new RawImage(
    imageData.data,
    imageData.width,
    imageData.height,
    4
  );

  const messages = [
    {
      role: "system",
      content: `You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.`,
    },
    { role: "user", content: `<image>${prompt}` },
  ];
  const chatPrompt = processor.apply_chat_template(messages, { add_generation_prompt: true });
  const inputs = await processor(rawImg, chatPrompt, { add_special_tokens: false });

  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: 512,
    do_sample: false,
    repetition_penalty: 1.2,
  });

  // Slice along the last axis starting at the prompt length so that only the tokens
  // produced after the prompt are decoded.
  const promptLength = inputs.input_ids.dims.at(-1);
  const decoded = processor.batch_decode(outputs.slice(null, [promptLength, null]), {
    skip_special_tokens: true,
  });

  // Reply with the text of the first (only) sequence.
  self.postMessage(decoded[0].trim());
};

export {};