Spaces:
Running
Running
/// <reference lib="webworker" />

// Pull the transformers library into this worker via a classic-script import.
// NOTE(review): the version segment of this URL reads "[email protected]", which
// looks like e-mail-obfuscation residue from the page this file was copied
// from. The real `@xenova/transformers@<version>` specifier cannot be
// recovered from here -- restore it before relying on this import.
// @ts-ignore
importScripts('https://cdn.jsdelivr.net/npm/@xenova/[email protected]/dist/transformers.min.js');

// Handles populated by loadModelAndProcessor(); null until loading finishes.
let processor: any = null;
let model: any = null;

// Set to true once both the processor and the model have been loaded.
let isLoaded = false;

// Model repository id handed to from_pretrained() for processor and model.
const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
// In-flight (or settled) load shared by all callers, so messages that arrive
// while the first load is still running do not start a second one.
let loadPromise: Promise<void> | null = null;

/**
 * Loads the processor and the model for MODEL_ID and stores them in the
 * module-level `processor` and `model` variables.
 *
 * Returns immediately once `isLoaded` is true. Calls made while a load is in
 * progress await that same load. If loading fails, the cached promise is
 * cleared so a later call can retry, and the error is rethrown to the caller.
 */
async function loadModelAndProcessor(): Promise<void> {
  if (isLoaded) return;

  if (loadPromise === null) {
    loadPromise = (async () => {
      // A worker has no `window`; its global object is `self`.
      // assumes the imported script exposes a `transformers` global -- TODO confirm
      // @ts-ignore
      const lib = self.transformers;

      // @ts-ignore
      processor = await lib.AutoProcessor.from_pretrained(MODEL_ID);
      // @ts-ignore
      model = await lib.AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
        // Per-component dtype selection passed through to the library.
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
        device: "webgpu",
      });
      isLoaded = true;
    })().catch((err: unknown) => {
      // Forget the failed attempt so the next call starts a fresh load.
      loadPromise = null;
      throw err;
    });
  }

  await loadPromise;
}
/**
 * Handles one request posted to this worker.
 *
 * Reads `imageData` and `prompt` from `event.data`, makes sure the model and
 * processor are loaded, runs generation on the image plus prompt, and posts
 * the first decoded result back as a trimmed string.
 */
self.onmessage = async (event) => {
  const { imageData, prompt } = event.data;
  await loadModelAndProcessor();

  // Wrap the incoming pixel buffer in the library's RawImage type.
  // A worker has no `window`; its global object is `self`.
  // The final argument 4 is presumably the channel count (RGBA) -- TODO confirm.
  // @ts-ignore
  const rawImg = new self.transformers.RawImage(
    imageData.data,
    imageData.width,
    imageData.height,
    4
  );

  // Two-turn conversation: a fixed system instruction, then the user's prompt
  // prefixed with the `<image>` marker.
  const messages = [
    {
      role: "system",
      content: `You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.`,
    },
    { role: "user", content: `<image>${prompt}` },
  ];

  // @ts-ignore
  const chatPrompt = processor.apply_chat_template(messages, { add_generation_prompt: true });
  // @ts-ignore
  const inputs = await processor(rawImg, chatPrompt, { add_special_tokens: false });

  // Greedy decoding (do_sample: false) with a repetition penalty.
  // @ts-ignore
  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: 512,
    do_sample: false,
    repetition_penalty: 1.2,
  });

  // Slice the second axis from the prompt length onward before decoding, so
  // presumably only the newly generated tokens are turned into text.
  // @ts-ignore
  const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), {
    skip_special_tokens: true,
  });

  // Reply with the first decoded entry, whitespace-trimmed.
  self.postMessage(decoded[0].trim());
};
// Empty export so TypeScript treats this file as a module.
export {};