Quazim0t0 committed (verified)
Commit 13cb9dc · Parent(s): 35bd577

Upload 38 files

src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -8,14 +8,6 @@ type Mode = typeof MODES[number];
 const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
 const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
 
-function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
-  if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
-    const label = arr[0];
-    return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
-  }
-  return [];
-}
-
 function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
   if (!raw) return [];
   let boxes = [];
@@ -102,7 +94,7 @@ export default function MultiSourceCaptioningView() {
   const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
   const inferenceWorkerRef = useRef<Worker | null>(null);
-  const [useWorker, setUseWorker] = useState(true); // Toggle for worker usage
+  const [useWorker] = useState(true);
 
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
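For reference, EXAMPLE_PROMPT asks the model to answer with a JSON array of { label, bbox_2d } objects. The sketch below shows one minimal way to parse that shape; the helper name and error handling are illustrative assumptions and are not the component's normalizeBoxes, whose body is only partly visible in this diff.

// Illustrative parser for the format EXAMPLE_PROMPT requests (hypothetical helper,
// not part of this commit): keep only well-formed { label, bbox_2d } entries.
function parseDetections(text: string): { label: string; bbox_2d: number[] }[] {
  try {
    const raw = JSON.parse(text);
    if (!Array.isArray(raw)) return [];
    return raw.filter(
      (b) => typeof b?.label === "string" && Array.isArray(b?.bbox_2d) && b.bbox_2d.length === 4
    );
  } catch {
    return []; // the model did not return valid JSON
  }
}

// parseDetections('[{"label": "person", "bbox_2d": [100, 50, 200, 300]}]')
// -> [{ label: "person", bbox_2d: [100, 50, 200, 300] }]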
src/workers/inferenceWorker.ts CHANGED
@@ -1,9 +1,70 @@
 // src/workers/inferenceWorker.ts
+
+// Import transformers.js from CDN
+importScripts('https://cdn.jsdelivr.net/npm/@xenova/[email protected]/dist/transformers.min.js');
+
+let processor = null;
+let model = null;
+let isLoaded = false;
+
+const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";
+
+async function loadModelAndProcessor() {
+  if (isLoaded) return;
+  // @ts-ignore
+  processor = await window.transformers.AutoProcessor.from_pretrained(MODEL_ID);
+  // @ts-ignore
+  model = await window.transformers.AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
+    dtype: {
+      embed_tokens: "fp16",
+      vision_encoder: "q4",
+      decoder_model_merged: "q4",
+    },
+    device: "webgpu",
+  });
+  isLoaded = true;
+}
+
 self.onmessage = async (event) => {
   const { imageData, prompt } = event.data;
-  // TODO: Import and run your real model inference here
-  // For now, just echo a fake result for testing
-  const result = [{ label: "person", bbox_2d: [[100, 50], [200, 300]] }];
-  self.postMessage(result);
+  await loadModelAndProcessor();
+
+  // Convert imageData to RawImage
+  // @ts-ignore
+  const rawImg = new window.transformers.RawImage(
+    imageData.data,
+    imageData.width,
+    imageData.height,
+    4
+  );
+
+  const messages = [
+    {
+      role: "system",
+      content: `You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.`,
+    },
+    { role: "user", content: `<image>${prompt}` },
+  ];
+  // @ts-ignore
+  const chatPrompt = processor.apply_chat_template(messages, { add_generation_prompt: true });
+  // @ts-ignore
+  const inputs = await processor(rawImg, chatPrompt, { add_special_tokens: false });
+
+  // @ts-ignore
+  const outputs = await model.generate({
+    ...inputs,
+    max_new_tokens: 512,
+    do_sample: false,
+    repetition_penalty: 1.2,
+  });
+
+  // @ts-ignore
+  const decoded = processor.batch_decode(outputs.slice(null, [inputs.input_ids.dims.at(-1), null]), {
+    skip_special_tokens: true,
+  });
+
+  // Return the decoded result (should be a string or JSON)
+  self.postMessage(decoded[0].trim());
 };
+
 export {};
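As a usage sketch of the new worker (the element lookups, worker URL, and prompt string below are placeholders; the real wiring lives in MultiSourceCaptioningView via inferenceWorkerRef), the caller captures a video frame as ImageData, posts { imageData, prompt }, and gets the decoded model text back. Note that importScripts() is only available in a classic (non-module) worker.

// Illustrative caller (not part of this commit): grab the current video frame,
// send it to the inference worker, and log the reply.
const video = document.querySelector("video") as HTMLVideoElement;
const canvas = document.createElement("canvas");
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
const ctx = canvas.getContext("2d", { willReadFrequently: true })!;
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);

// The worker URL depends on the bundler; it must be created as a classic worker
// because the script relies on importScripts().
const worker = new Worker("/src/workers/inferenceWorker.ts");
worker.onmessage = (e: MessageEvent<string>) => {
  // The worker replies with decoded[0].trim(): plain model text that may contain the JSON boxes.
  console.log("model output:", e.data);
};
worker.postMessage({ imageData, prompt: "Detect all people in the image." });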