///
// @ts-ignore
importScripts('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.13.0/dist/transformers.min.js');
// Module-level state shared by the loader and the message handler.
// Processor instance returned by AutoProcessor.from_pretrained; null until loaded.
let processor: any = null;
// Model instance returned by AutoModelForImageTextToText.from_pretrained; null until loaded.
let model: any = null;
// Set to true once both the processor and the model have been assigned.
let isLoaded = false;
// Model identifier passed to both from_pretrained calls.
const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
// In-flight load shared by every caller, so messages that arrive while the
// first load is still running wait on it instead of starting a second one.
let loadPromise: Promise<void> | null = null;

/**
 * Loads the processor and model for MODEL_ID exactly once and stores them in
 * the module-level `processor` and `model` variables.
 *
 * Concurrent callers share one in-flight load. If the load fails, the cached
 * promise is cleared so that a later call can retry.
 *
 * @returns A promise that resolves once `processor`, `model` and `isLoaded`
 *          have all been set.
 * @throws Error if the Transformers.js global is not available, or whatever
 *         the underlying `from_pretrained` calls reject with.
 */
async function loadModelAndProcessor(): Promise<void> {
  if (isLoaded) return;

  if (loadPromise === null) {
    loadPromise = (async () => {
      // `window` does not exist in a worker global scope, so the library is
      // looked up on `globalThis` (the worker's `self`) instead.
      // NOTE(review): assumes the script loaded via importScripts exposes a
      // `transformers` global - confirm against the bundle actually served.
      const lib = (globalThis as any).transformers;
      if (lib == null) {
        throw new Error("Transformers.js global `transformers` is not available in this worker");
      }

      processor = await lib.AutoProcessor.from_pretrained(MODEL_ID);
      model = await lib.AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
        // Per-component precision / quantisation selection.
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
        device: "webgpu",
      });
      isLoaded = true;
    })().catch((err: unknown) => {
      // Drop the failed attempt so the next request can try again.
      loadPromise = null;
      throw err;
    });
  }

  await loadPromise;
}
/**
 * Handles one inference request sent to the worker.
 *
 * Expects `event.data` to carry `imageData` (with `data`, `width` and
 * `height` fields) and a text `prompt`. Ensures the model is loaded, runs
 * greedy generation on the image + chat prompt, and posts the decoded,
 * trimmed answer back as a string.
 */
self.onmessage = async (event) => {
  const { imageData, prompt } = event.data;
  await loadModelAndProcessor();

  // `window` does not exist in a worker global scope, so the library is
  // looked up on `globalThis` (the worker's `self`) instead.
  const lib = (globalThis as any).transformers;
  if (lib == null) {
    throw new Error("Transformers.js global `transformers` is not available in this worker");
  }

  // Wrap the raw pixel buffer as a RawImage; the final argument is the
  // channel count (4).
  const rawImg = new lib.RawImage(
    imageData.data,
    imageData.width,
    imageData.height,
    4
  );

  // NOTE(review): published FastVLM usage appears to prefix the user content
  // with an `<image>` placeholder - confirm whether callers already include
  // it in `prompt`, since it is not added here.
  const messages = [
    {
      role: "system",
      content: `You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.`,
    },
    { role: "user", content: `${prompt}` },
  ];

  const chatPrompt = processor.apply_chat_template(messages, { add_generation_prompt: true });
  const inputs = await processor(rawImg, chatPrompt, { add_special_tokens: false });

  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: 512,
    do_sample: false,
    repetition_penalty: 1.2,
  });

  // Slice from the prompt length onwards (last dim of input_ids) so that only
  // the newly generated tokens are decoded.
  const promptLength = inputs.input_ids.dims.at(-1);
  const decoded = processor.batch_decode(outputs.slice(null, [promptLength, null]), {
    skip_special_tokens: true,
  });

  // Reply with the first decoded sequence as a trimmed string.
  self.postMessage(decoded[0].trim());
};
export {};