Quazim0t0 committed on
Commit
43aa15a
·
verified ·
1 Parent(s): 457f6cc

Upload 51 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -75,6 +75,40 @@ function denormalizeBox(box: number[], width: number, height: number) {
75
  return box;
76
  }
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  export default function MultiSourceCaptioningView() {
79
  const [mode, setMode] = useState<Mode>("File");
80
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
@@ -147,12 +181,8 @@ export default function MultiSourceCaptioningView() {
147
  if (!ctx) return;
148
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
149
  await runInference(video, prompt, (output: string) => {
150
- setDebugOutput(output); // <-- Ensure Raw Model Output is updated
151
- let boxes = extractJsonFromMarkdown(output) || [];
152
- if (boxes.length === 0 && Array.isArray(output)) {
153
- boxes = parseFlatBoxArray(output);
154
- }
155
- boxes = normalizeBoxes(boxes);
156
  // Box persistence logic (2 seconds)
157
  const now = Date.now();
158
  if (Array.isArray(boxes) && boxes.length > 0) {
@@ -165,7 +195,28 @@ export default function MultiSourceCaptioningView() {
165
  if (boxHistory.length > 0) {
166
  const scaleX = canvas.width / video.videoWidth;
167
  const scaleY = canvas.height / video.videoHeight;
168
- drawBoundingBoxesOnCanvas(ctx, boxHistory, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  }
170
  });
171
  };
@@ -281,11 +332,7 @@ export default function MultiSourceCaptioningView() {
281
  setDebugOutput(output);
282
  setInferenceStatus("Inference complete.");
283
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
284
- let boxes = extractJsonFromMarkdown(output) || [];
285
- if (boxes.length === 0 && Array.isArray(output)) {
286
- boxes = parseFlatBoxArray(output);
287
- }
288
- boxes = normalizeBoxes(boxes);
289
  console.log("Model output:", output);
290
  console.log("Boxes after normalization:", boxes);
291
  console.log("Canvas size:", canvas.width, canvas.height);
 
75
  return box;
76
  }
77
 
78
+ // Add this robust fallback parser near the top
79
+ function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
80
+ // Try to parse as JSON first
81
+ try {
82
+ const parsed = JSON.parse(output);
83
+ if (Array.isArray(parsed)) {
84
+ const result: { label: string, bbox_2d: number[] }[] = [];
85
+ for (const obj of parsed) {
86
+ if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
87
+ if (Array.isArray(obj.bbox_2d[0])) {
88
+ for (const arr of obj.bbox_2d) {
89
+ if (Array.isArray(arr) && arr.length === 4) {
90
+ result.push({ label: obj.label, bbox_2d: arr });
91
+ }
92
+ }
93
+ } else if (obj.bbox_2d.length === 4) {
94
+ result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
95
+ }
96
+ }
97
+ }
98
+ if (result.length > 0) return result;
99
+ }
100
+ } catch (e) {}
101
+ // Fallback: extract all [x1, y1, x2, y2] arrays from the string
102
+ const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
103
+ const boxes: { label: string, bbox_2d: number[] }[] = [];
104
+ let match;
105
+ while ((match = boxRegex.exec(output)) !== null) {
106
+ const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
107
+ boxes.push({ label: '', bbox_2d: arr });
108
+ }
109
+ return boxes;
110
+ }
111
+
112
  export default function MultiSourceCaptioningView() {
113
  const [mode, setMode] = useState<Mode>("File");
114
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
 
181
  if (!ctx) return;
182
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
183
  await runInference(video, prompt, (output: string) => {
184
+ setDebugOutput(output);
185
+ let boxes = extractAllBoundingBoxes(output);
 
 
 
 
186
  // Box persistence logic (2 seconds)
187
  const now = Date.now();
188
  if (Array.isArray(boxes) && boxes.length > 0) {
 
195
  if (boxHistory.length > 0) {
196
  const scaleX = canvas.width / video.videoWidth;
197
  const scaleY = canvas.height / video.videoHeight;
198
+ // Fix: Draw all boxes, even if bbox_2d is an array of arrays
199
+ const denormalizedBoxes: any[] = [];
200
+ for (const b of boxHistory) {
201
+ if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
202
+ // Multiple boxes per label
203
+ for (const arr of b.bbox_2d) {
204
+ if (Array.isArray(arr) && arr.length === 4) {
205
+ denormalizedBoxes.push({
206
+ ...b,
207
+ bbox_2d: denormalizeBox(arr, canvas.width, canvas.height)
208
+ });
209
+ }
210
+ }
211
+ } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
212
+ // Single box
213
+ denormalizedBoxes.push({
214
+ ...b,
215
+ bbox_2d: denormalizeBox(b.bbox_2d, canvas.width, canvas.height)
216
+ });
217
+ }
218
+ }
219
+ drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
220
  }
221
  });
222
  };
 
332
  setDebugOutput(output);
333
  setInferenceStatus("Inference complete.");
334
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
335
+ let boxes = extractAllBoundingBoxes(output);
 
 
 
 
336
  console.log("Model output:", output);
337
  console.log("Boxes after normalization:", boxes);
338
  console.log("Canvas size:", canvas.width, canvas.height);