// Web worker: runs FastVLM image-to-text inference off the main thread.
// The transformers.js UMD bundle loaded via importScripts attaches itself to
// the *worker* global scope (`self`) — there is no `window` inside a worker.
// @ts-ignore
importScripts('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.13.0/dist/transformers.min.js');

const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";

// Lazily-initialized inference state, shared across all messages.
let processor: any = null;
let model: any = null;

// Memoized in-flight load. Messages that arrive while the (slow, network-bound)
// load is still running await the SAME promise instead of each kicking off a
// duplicate model download — the original boolean `isLoaded` flag did not
// guard against that race.
let loadPromise: Promise<void> | null = null;

/**
 * Loads the processor and model exactly once; subsequent (and concurrent)
 * callers share the first load's promise.
 */
function loadModelAndProcessor(): Promise<void> {
  if (loadPromise === null) {
    loadPromise = (async () => {
      // BUG FIX: the original read `window.transformers`; `window` is
      // undefined in a worker, so every call threw ReferenceError. The UMD
      // bundle lives on the worker global `self`.
      // @ts-ignore
      const tf = (self as any).transformers;
      processor = await tf.AutoProcessor.from_pretrained(MODEL_ID);
      model = await tf.AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
        // Quantized / half-precision weights keep GPU memory usage low.
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
        device: "webgpu",
      });
    })();
  }
  return loadPromise;
}

// Message contract: receives { imageData, prompt } (imageData is an
// ImageData-like object: RGBA bytes + width/height) and posts back the
// model's trimmed text answer. Errors propagate to the worker's `onerror`,
// as in the original.
self.onmessage = async (event) => {
  const { imageData, prompt } = event.data;
  await loadModelAndProcessor();

  // Wrap the raw RGBA pixel buffer (4 channels) in a transformers.js RawImage.
  // @ts-ignore
  const rawImg = new (self as any).transformers.RawImage(
    imageData.data,
    imageData.width,
    imageData.height,
    4
  );

  const messages = [
    {
      role: "system",
      content: `You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.`,
    },
    { role: "user", content: `${prompt}` },
  ];

  // Build the chat-formatted prompt, then tokenize it together with the image.
  // @ts-ignore
  const chatPrompt = processor.apply_chat_template(messages, { add_generation_prompt: true });
  // @ts-ignore
  const inputs = await processor(rawImg, chatPrompt, { add_special_tokens: false });

  // @ts-ignore
  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: 512,
    do_sample: false, // greedy decoding for deterministic answers
    repetition_penalty: 1.2,
  });

  // Slice off the prompt tokens so only the newly generated text is decoded.
  // @ts-ignore
  const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), {
    skip_special_tokens: true,
  });

  self.postMessage(decoded[0].trim());
};

export {};