Spaces:
Running
Running
Upload 51 files
Browse files
src/components/MultiSourceCaptioningView.tsx
CHANGED
@@ -75,6 +75,40 @@ function denormalizeBox(box: number[], width: number, height: number) {
|
|
75 |
return box;
|
76 |
}
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
export default function MultiSourceCaptioningView() {
|
79 |
const [mode, setMode] = useState<Mode>("File");
|
80 |
const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
|
@@ -147,12 +181,8 @@ export default function MultiSourceCaptioningView() {
|
|
147 |
if (!ctx) return;
|
148 |
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
149 |
await runInference(video, prompt, (output: string) => {
|
150 |
-
setDebugOutput(output);
|
151 |
-
let boxes =
|
152 |
-
if (boxes.length === 0 && Array.isArray(output)) {
|
153 |
-
boxes = parseFlatBoxArray(output);
|
154 |
-
}
|
155 |
-
boxes = normalizeBoxes(boxes);
|
156 |
// Box persistence logic (2 seconds)
|
157 |
const now = Date.now();
|
158 |
if (Array.isArray(boxes) && boxes.length > 0) {
|
@@ -165,7 +195,28 @@ export default function MultiSourceCaptioningView() {
|
|
165 |
if (boxHistory.length > 0) {
|
166 |
const scaleX = canvas.width / video.videoWidth;
|
167 |
const scaleY = canvas.height / video.videoHeight;
|
168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
}
|
170 |
});
|
171 |
};
|
@@ -281,11 +332,7 @@ export default function MultiSourceCaptioningView() {
|
|
281 |
setDebugOutput(output);
|
282 |
setInferenceStatus("Inference complete.");
|
283 |
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
284 |
-
let boxes =
|
285 |
-
if (boxes.length === 0 && Array.isArray(output)) {
|
286 |
-
boxes = parseFlatBoxArray(output);
|
287 |
-
}
|
288 |
-
boxes = normalizeBoxes(boxes);
|
289 |
console.log("Model output:", output);
|
290 |
console.log("Boxes after normalization:", boxes);
|
291 |
console.log("Canvas size:", canvas.width, canvas.height);
|
|
|
75 |
return box;
|
76 |
}
|
77 |
|
78 |
+
// Robust parser for model output: tries JSON first, then falls back to a regex scan for [x1, y1, x2, y2] arrays.
|
79 |
+
/**
 * Extracts bounding boxes from raw model output.
 *
 * Strategy:
 *  1. Parse `output` as JSON. If it is an array, collect every entry that has
 *     a truthy `label` and a `bbox_2d` that is either one box (four numbers)
 *     or a list of boxes; a list yields one result per valid inner box, all
 *     sharing the entry's label.
 *  2. If JSON parsing fails or produced no boxes, scan the text for every
 *     `[a, b, c, d]` group of four unsigned decimal numbers and return those
 *     with an empty label.
 *
 * Only boxes made of exactly four finite numbers are returned, so malformed
 * tokens (for example a bare `.` or a string coordinate) never reach callers
 * as `NaN` or non-numeric values.
 *
 * @param output Raw text produced by the model.
 * @returns The extracted boxes in order of appearance; empty if none are found.
 */
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
  // A usable box is exactly four finite numbers.
  const isBox = (value: unknown): value is number[] =>
    Array.isArray(value) &&
    value.length === 4 &&
    value.every((n) => typeof n === "number" && Number.isFinite(n));

  // Try to parse as JSON first
  try {
    const parsed: unknown = JSON.parse(output);
    if (Array.isArray(parsed)) {
      const entries: unknown[] = parsed;
      const result: { label: string, bbox_2d: number[] }[] = [];
      for (const entry of entries) {
        if (typeof entry !== "object" || entry === null) continue;
        const { label, bbox_2d } = entry as { label?: unknown; bbox_2d?: unknown };
        // Entries without a usable label are skipped here; the regex pass
        // below may still recover their coordinates if nothing else is found.
        if ((typeof label !== "string" && typeof label !== "number") || !label) continue;
        if (!Array.isArray(bbox_2d)) continue;
        // bbox_2d is either a single box or a list of boxes for one label.
        const candidates: unknown[] = Array.isArray(bbox_2d[0]) ? bbox_2d : [bbox_2d];
        for (const candidate of candidates) {
          if (isBox(candidate)) {
            result.push({ label: String(label), bbox_2d: candidate });
          }
        }
      }
      if (result.length > 0) return result;
    }
  } catch {
    // Not valid JSON: intentionally fall through to the regex scan below.
  }

  // Fallback: extract all [x1, y1, x2, y2] arrays from the string
  const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
  const boxes: { label: string, bbox_2d: number[] }[] = [];
  let match: RegExpExecArray | null;
  while ((match = boxRegex.exec(output)) !== null) {
    const coords = match.slice(1, 5).map((token) => parseFloat(token));
    // `[0-9.]+` also matches tokens such as "." which parse to NaN; drop those.
    if (coords.every((n) => Number.isFinite(n))) {
      boxes.push({ label: '', bbox_2d: coords });
    }
  }
  return boxes;
}
|
111 |
+
|
112 |
export default function MultiSourceCaptioningView() {
|
113 |
const [mode, setMode] = useState<Mode>("File");
|
114 |
const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
|
|
|
181 |
if (!ctx) return;
|
182 |
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
183 |
await runInference(video, prompt, (output: string) => {
|
184 |
+
setDebugOutput(output);
|
185 |
+
let boxes = extractAllBoundingBoxes(output);
|
|
|
|
|
|
|
|
|
186 |
// Box persistence logic (2 seconds)
|
187 |
const now = Date.now();
|
188 |
if (Array.isArray(boxes) && boxes.length > 0) {
|
|
|
195 |
if (boxHistory.length > 0) {
|
196 |
const scaleX = canvas.width / video.videoWidth;
|
197 |
const scaleY = canvas.height / video.videoHeight;
|
198 |
+
// Fix: Draw all boxes, even if bbox_2d is an array of arrays
|
199 |
+
const denormalizedBoxes: any[] = [];
|
200 |
+
for (const b of boxHistory) {
|
201 |
+
if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
|
202 |
+
// Multiple boxes per label
|
203 |
+
for (const arr of b.bbox_2d) {
|
204 |
+
if (Array.isArray(arr) && arr.length === 4) {
|
205 |
+
denormalizedBoxes.push({
|
206 |
+
...b,
|
207 |
+
bbox_2d: denormalizeBox(arr, canvas.width, canvas.height)
|
208 |
+
});
|
209 |
+
}
|
210 |
+
}
|
211 |
+
} else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
|
212 |
+
// Single box
|
213 |
+
denormalizedBoxes.push({
|
214 |
+
...b,
|
215 |
+
bbox_2d: denormalizeBox(b.bbox_2d, canvas.width, canvas.height)
|
216 |
+
});
|
217 |
+
}
|
218 |
+
}
|
219 |
+
drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
|
220 |
}
|
221 |
});
|
222 |
};
|
|
|
332 |
setDebugOutput(output);
|
333 |
setInferenceStatus("Inference complete.");
|
334 |
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
335 |
+
let boxes = extractAllBoundingBoxes(output);
|
|
|
|
|
|
|
|
|
336 |
console.log("Model output:", output);
|
337 |
console.log("Boxes after normalization:", boxes);
|
338 |
console.log("Canvas size:", canvas.width, canvas.height);
|