Spaces:
Running
Running
Upload 51 files
Browse files
src/components/MultiSourceCaptioningView.tsx
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import { useState, useRef, useEffect } from "react";
|
2 |
import { useVLMContext } from "../context/useVLMContext";
|
3 |
-
import {
|
4 |
|
5 |
// Names of the three selectable modes. `as const` keeps the literal strings
// in the type (a readonly tuple) so a union type can be derived from it.
const MODES = ["Webcam", "URL", "File"] as const;
|
6 |
// Union of the literal strings listed in MODES ("Webcam" | "URL" | "File").
type Mode = typeof MODES[number];
|
@@ -8,53 +8,6 @@ type Mode = typeof MODES[number];
|
|
8 |
// Path of an example video file.
// NOTE(review): presumably the default source for the "URL" mode — confirm against usage.
const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
|
9 |
// Example prompt text: asks for every bird in the image to be reported as a
// JSON array of objects with a string 'label' and a 'bbox_2d' of
// [x1, y1, x2, y2] pixel coordinates.
const EXAMPLE_PROMPT = "Detect all birds in the image. For each bird, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"bird\", \"bbox_2d\": [x1, y1, x2, y2]}]";
|
10 |
|
11 |
-
/**
 * Expands a "flat" detection array of the form
 * `[label, bbox, bbox, ...]` into one `{ label, bbox_2d }` object per box.
 *
 * The input is recognised only when its first element is a string and its
 * second element is an array; any other shape (including an empty or
 * non-array argument) yields an empty list.
 *
 * Entries after the label that are not arrays are dropped rather than being
 * emitted as a bogus `bbox_2d`. The contents of each box array are not
 * validated here.
 *
 * @param arr - Candidate flat array, typically parsed from model output.
 * @returns One entry per box array found after the label, all sharing that label.
 */
function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
  // Guard against null/undefined or non-array input, which would otherwise
  // throw on index access.
  if (!Array.isArray(arr)) return [];

  const [label, firstBox] = arr;
  if (typeof label !== "string" || !Array.isArray(firstBox)) {
    return [];
  }

  return arr
    .slice(1)
    // Skip stray non-array entries (e.g. a trailing string or null) so every
    // returned bbox_2d is at least an array.
    .filter((entry) => Array.isArray(entry))
    .map((bbox) => ({ label, bbox_2d: bbox }));
}
|
18 |
-
|
19 |
-
/**
 * Normalises loosely-shaped detection output into a list of boxes whose
 * `bbox_2d` is always a flat `[x1, y1, x2, y2]` array of finite numbers.
 *
 * Accepted inputs:
 * - an object with an `image` array (the array is used as the box list),
 * - an array of box objects,
 * - a single box object (wrapped into a one-element list).
 * Any other input (falsy values, strings, numbers) produces an empty list.
 *
 * For each candidate, `bbox_2d` may be either `[x1, y1, x2, y2]` or
 * `[[x1, y1], [x2, y2]]`; the nested form is flattened. Candidates without a
 * `bbox_2d`, or whose coordinates are not four finite numbers, are skipped.
 * All other properties of a kept candidate are copied through unchanged;
 * `label` itself is not validated.
 *
 * @param raw - Parsed model output in one of the shapes above.
 * @returns The valid boxes, in their original order.
 */
function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
  if (!raw) return [];

  // Work out which value holds the list of candidate boxes.
  let candidates: unknown[] = [];
  if (typeof raw === "object" && Array.isArray(raw.image)) {
    candidates = raw.image;
  } else if (Array.isArray(raw)) {
    candidates = raw;
  } else if (typeof raw === "object") {
    candidates = [raw];
  }

  const normalized: { label: string, bbox_2d: number[] }[] = [];
  for (const candidate of candidates) {
    const entry: any = candidate;
    if (!entry || !entry.bbox_2d) continue;

    let bbox: unknown = entry.bbox_2d;

    // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
    if (
      Array.isArray(bbox) &&
      bbox.length === 2 &&
      Array.isArray(bbox[0]) &&
      Array.isArray(bbox[1]) &&
      bbox[0].length === 2 &&
      bbox[1].length === 2
    ) {
      bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
    }

    // Keep only flat 4-element boxes. Number.isFinite (rather than a bare
    // typeof check) also rejects NaN and +/-Infinity, which are of type
    // "number" but cannot be used as coordinates.
    if (
      Array.isArray(bbox) &&
      bbox.length === 4 &&
      bbox.every((v) => Number.isFinite(v))
    ) {
      normalized.push({ ...entry, bbox_2d: bbox });
    }
    // Otherwise, skip
  }
  return normalized;
}
|
57 |
-
|
58 |
/**
 * Reports whether a file's MIME type belongs to the `image/` family.
 *
 * @param file - The file whose `type` string is inspected.
 * @returns `true` when `file.type` begins with `"image/"`, otherwise `false`.
 */
function isImageFile(file: File) {
  const { type: mimeType } = file;
  return mimeType.startsWith("image/");
}
|
|
|
1 |
import { useState, useRef, useEffect } from "react";
|
2 |
import { useVLMContext } from "../context/useVLMContext";
|
3 |
+
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
|
4 |
|
5 |
// Names of the three selectable modes. `as const` keeps the literal strings
// in the type (a readonly tuple) so a union type can be derived from it.
const MODES = ["Webcam", "URL", "File"] as const;
|
6 |
// Union of the literal strings listed in MODES ("Webcam" | "URL" | "File").
type Mode = typeof MODES[number];
|
|
|
8 |
// Path of an example video file.
// NOTE(review): presumably the default source for the "URL" mode — confirm against usage.
const EXAMPLE_VIDEO_URL = "/space/videos/1.mp4";
|
9 |
// Example prompt text: asks for every bird in the image to be reported as a
// JSON array of objects with a string 'label' and a 'bbox_2d' of
// [x1, y1, x2, y2] pixel coordinates.
const EXAMPLE_PROMPT = "Detect all birds in the image. For each bird, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"bird\", \"bbox_2d\": [x1, y1, x2, y2]}]";
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
/**
 * Tells image files apart from everything else by looking at the MIME type.
 *
 * @param file - The file to classify; only its `type` string is read.
 * @returns Whether `file.type` starts with the `"image/"` prefix.
 */
function isImageFile(file: File) {
  const mimeType = file.type;
  const looksLikeImage = mimeType.startsWith("image/");
  return looksLikeImage;
}
|