Quazim0t0 committed
Commit 35bd577 · verified · 1 Parent(s): 584b8d0

Upload 38 files

src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -62,6 +62,27 @@ function isVideoFile(file: File) {
   return file.type.startsWith("video/");
 }
 
+// Utility to get ImageData from a video or image element
+function getImageDataFromElement(media: HTMLVideoElement | HTMLImageElement): ImageData | null {
+  const canvas = document.createElement("canvas");
+  let width = 0, height = 0;
+  if (media instanceof HTMLVideoElement) {
+    width = media.videoWidth;
+    height = media.videoHeight;
+  } else if (media instanceof HTMLImageElement) {
+    width = media.naturalWidth;
+    height = media.naturalHeight;
+  } else {
+    return null;
+  }
+  canvas.width = width;
+  canvas.height = height;
+  const ctx = canvas.getContext("2d");
+  if (!ctx) return null;
+  ctx.drawImage(media, 0, 0, width, height);
+  return ctx.getImageData(0, 0, width, height);
+}
+
 export default function MultiSourceCaptioningView() {
   const [mode, setMode] = useState<Mode>("File");
   const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
@@ -80,6 +101,8 @@ export default function MultiSourceCaptioningView() {
   const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
   const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
+  const inferenceWorkerRef = useRef<Worker | null>(null);
+  const [useWorker, setUseWorker] = useState(true); // Toggle for worker usage
 
   const videoRef = useRef<HTMLVideoElement | null>(null);
   const canvasRef = useRef<HTMLCanvasElement | null>(null);
@@ -87,6 +110,31 @@
   const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
 
+  useEffect(() => {
+    if (useWorker) {
+      inferenceWorkerRef.current = new Worker(
+        new URL('../workers/inferenceWorker.ts', import.meta.url),
+        { type: 'module' }
+      );
+    }
+    return () => {
+      inferenceWorkerRef.current?.terminate();
+      inferenceWorkerRef.current = null;
+    };
+  }, [useWorker]);
+
+  // Helper to run inference in worker
+  const runInferenceInWorker = (media: HTMLVideoElement | HTMLImageElement, prompt: string) => {
+    return new Promise((resolve, reject) => {
+      if (!inferenceWorkerRef.current) return reject('No worker');
+      const imageData = getImageDataFromElement(media);
+      if (!imageData) return reject('Could not get image data');
+      inferenceWorkerRef.current.onmessage = (event) => resolve(event.data);
+      inferenceWorkerRef.current.onerror = (err) => reject(err);
+      inferenceWorkerRef.current.postMessage({ imageData, prompt });
+    });
+  };
+
   const processVideoFrame = async () => {
     if (!videoRef.current || !canvasRef.current) return;
     const video = videoRef.current;
@@ -97,28 +145,46 @@
     const ctx = canvas.getContext("2d");
     if (!ctx) return;
     ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
-    await runInference(video, prompt, (output: string) => {
-      setDebugOutput(output); // <-- Ensure Raw Model Output is updated
-      let boxes = extractJsonFromMarkdown(output) || [];
-      if (boxes.length === 0 && Array.isArray(output)) {
-        boxes = parseFlatBoxArray(output);
-      }
-      boxes = normalizeBoxes(boxes);
-      console.log("Model output:", output);
-      console.log("Boxes after normalization:", boxes);
-      console.log("Canvas size:", canvas.width, canvas.height);
-      if (boxes.length > 0) {
-        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
-        console.log("First box coords:", x1, y1, x2, y2);
-      }
-      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-      if (Array.isArray(boxes) && boxes.length > 0) {
-        const scaleX = canvas.width / video.videoWidth;
-        const scaleY = canvas.height / video.videoHeight;
-        ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
-        drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
+    if (useWorker && inferenceWorkerRef.current) {
+      try {
+        const output = await runInferenceInWorker(video, prompt);
+        setDebugOutput(JSON.stringify(output, null, 2));
+        let boxes = normalizeBoxes(output);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / video.videoWidth;
+          const scaleY = canvas.height / video.videoHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+      } catch (err) {
+        setInferenceStatus("Worker inference failed, falling back to main thread.");
+        // fallback to main-thread inference
+        await runInference(video, prompt, (output: string) => {
+          setDebugOutput(output);
+          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+          if (Array.isArray(boxes) && boxes.length > 0) {
+            const scaleX = canvas.width / video.videoWidth;
+            const scaleY = canvas.height / video.videoHeight;
+            ctx.clearRect(0, 0, canvas.width, canvas.height);
+            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+          }
+        });
       }
-    });
+    } else {
+      await runInference(video, prompt, (output: string) => {
+        setDebugOutput(output);
+        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / video.videoWidth;
+          const scaleY = canvas.height / video.videoHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+      });
+    }
   };
 
   const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
@@ -228,30 +294,55 @@
     setProcessing(true);
     setError(null);
     setInferenceStatus("Running inference...");
-    await runInference(img, prompt, (output: string) => {
-      setDebugOutput(output);
-      setInferenceStatus("Inference complete.");
-      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-      let boxes = extractJsonFromMarkdown(output) || [];
-      if (boxes.length === 0 && Array.isArray(output)) {
-        boxes = parseFlatBoxArray(output);
-      }
-      boxes = normalizeBoxes(boxes);
-      console.log("Model output:", output);
-      console.log("Boxes after normalization:", boxes);
-      console.log("Canvas size:", canvas.width, canvas.height);
-      if (boxes.length > 0) {
-        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
-        console.log("First box coords:", x1, y1, x2, y2);
-      }
-      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-      if (Array.isArray(boxes) && boxes.length > 0) {
-        const scaleX = canvas.width / img.naturalWidth;
-        const scaleY = canvas.height / img.naturalHeight;
-        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
+    if (useWorker && inferenceWorkerRef.current) {
+      try {
+        const output = await runInferenceInWorker(img, prompt);
+        setDebugOutput(JSON.stringify(output, null, 2));
+        setInferenceStatus("Inference complete.");
+        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+        let boxes = normalizeBoxes(output);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / img.naturalWidth;
+          const scaleY = canvas.height / img.naturalHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+        setImageProcessed(true);
+      } catch (err) {
+        setInferenceStatus("Worker inference failed, falling back to main thread.");
+        // fallback to main-thread inference
+        await runInference(img, prompt, (output: string) => {
+          setDebugOutput(output);
+          setInferenceStatus("Inference complete.");
+          ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+          if (Array.isArray(boxes) && boxes.length > 0) {
+            const scaleX = canvas.width / img.naturalWidth;
+            const scaleY = canvas.height / img.naturalHeight;
+            ctx.clearRect(0, 0, canvas.width, canvas.height);
+            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+          }
+          setImageProcessed(true);
+        });
       }
-      setImageProcessed(true);
-    });
+    } else {
+      await runInference(img, prompt, (output: string) => {
+        setDebugOutput(output);
+        setInferenceStatus("Inference complete.");
+        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
+        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
+        if (Array.isArray(boxes) && boxes.length > 0) {
+          const scaleX = canvas.width / img.naturalWidth;
+          const scaleY = canvas.height / img.naturalHeight;
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+        }
+        setImageProcessed(true);
+      });
+    }
    setProcessing(false);
   };
 
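A note on the worker protocol above: runInferenceInWorker reassigns the worker's onmessage and onerror handlers on every call, so two in-flight requests (for example, overlapping video frames) can consume each other's responses, and postMessage({ imageData, prompt }) structured-clones the full pixel buffer for every frame. Below is a minimal sketch of a hardened client, assuming the worker were extended to echo back a request id with each result; neither that field nor createInferenceClient exists in this commit.

let nextRequestId = 0;
const pending = new Map<number, (result: unknown) => void>();

// Wraps a Worker in a request/response client keyed by id, so concurrent
// calls cannot steal each other's replies.
function createInferenceClient(worker: Worker) {
  // One persistent handler instead of reassigning onmessage per request.
  worker.onmessage = (event: MessageEvent<{ id: number; result: unknown }>) => {
    const resolve = pending.get(event.data.id);
    if (resolve) {
      pending.delete(event.data.id);
      resolve(event.data.result);
    }
  };
  return (imageData: ImageData, prompt: string) =>
    new Promise<unknown>((resolve) => {
      const id = nextRequestId++;
      pending.set(id, resolve);
      // Transferring the pixel buffer avoids a per-frame copy; the ImageData
      // is detached (unusable) on the main thread afterwards.
      worker.postMessage({ id, imageData, prompt }, [imageData.data.buffer as ArrayBuffer]);
    });
}

With this shape, the component would create the client once inside the useEffect and call it from both the video and image paths in place of runInferenceInWorker.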
src/workers/inferenceWorker.ts ADDED
@@ -0,0 +1,9 @@
+// src/workers/inferenceWorker.ts
+self.onmessage = async (event) => {
+  const { imageData, prompt } = event.data;
+  // TODO: Import and run your real model inference here
+  // For now, just echo a fake result for testing
+  const result = [{ label: "person", bbox_2d: [[100, 50], [200, 300]] }];
+  self.postMessage(result);
+};
+export {};
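The worker only echoes a hard-coded detection, so boxes drawn in worker mode are placeholders until the TODO is filled in (normalizeBoxes in the component is presumably what reconciles box formats such as the stub's nested [x, y] pairs). A minimal sketch of one way to complete it, assuming a hypothetical runModel(canvas, prompt) entry point; no such module ships in this commit, and workers cannot call document.createElement, hence OffscreenCanvas:

// Hypothetical model entry point; replace with the app's real inference code.
declare function runModel(canvas: OffscreenCanvas, prompt: string): Promise<unknown>;

self.onmessage = async (event: MessageEvent<{ imageData: ImageData; prompt: string }>) => {
  const { imageData, prompt } = event.data;
  // Repaint the received pixels onto an OffscreenCanvas so the model gets a
  // drawable surface inside the worker.
  const canvas = new OffscreenCanvas(imageData.width, imageData.height);
  const ctx = canvas.getContext("2d");
  if (!ctx) {
    self.postMessage([]); // report "no detections" rather than leaving the caller hanging
    return;
  }
  ctx.putImageData(imageData, 0, 0);
  self.postMessage(await runModel(canvas, prompt));
};
export {};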