Spaces:
Running
Running
Upload 37 files
Browse files
src/components/MultiSourceCaptioningView.tsx
CHANGED
@@ -6,7 +6,15 @@ const MODES = ["Webcam", "URL", "File"] as const;
|
|
6 |
type Mode = typeof MODES[number];
|
7 |
|
8 |
const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
|
9 |
-
const EXAMPLE_PROMPT = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
function isImageFile(file: File) {
|
12 |
return file.type.startsWith("image/");
|
@@ -74,68 +82,69 @@ export default function MultiSourceCaptioningView() {
|
|
74 |
};
|
75 |
}, [mode]);
|
76 |
|
77 |
-
//
|
|
|
78 |
useEffect(() => {
|
79 |
if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
|
80 |
-
let
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
await runInference(video, prompt, (output: string) => {
|
92 |
-
setDebugOutput(output);
|
93 |
-
setInferenceStatus("Inference complete.");
|
94 |
-
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
95 |
-
const boxes = extractJsonFromMarkdown(output) || [];
|
96 |
-
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
97 |
-
drawBoundingBoxesOnCanvas(ctx, boxes);
|
98 |
-
});
|
99 |
-
};
|
100 |
-
interval = setInterval(() => {
|
101 |
-
processVideoFrame();
|
102 |
-
}, 1000);
|
103 |
-
return () => {
|
104 |
-
if (interval) clearInterval(interval);
|
105 |
-
};
|
106 |
}, [mode, isLoaded, prompt, runInference, webcamActive]);
|
107 |
|
108 |
-
// URL
|
109 |
useEffect(() => {
|
110 |
if (mode !== "URL" || !isLoaded || !urlProcessing) return;
|
111 |
-
let
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
await runInference(video, prompt, (output: string) => {
|
123 |
-
setDebugOutput(output);
|
124 |
-
setInferenceStatus("Inference complete.");
|
125 |
-
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
126 |
-
const boxes = extractJsonFromMarkdown(output) || [];
|
127 |
-
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
128 |
-
drawBoundingBoxesOnCanvas(ctx, boxes);
|
129 |
-
});
|
130 |
-
};
|
131 |
-
interval = setInterval(() => {
|
132 |
-
processVideoFrame();
|
133 |
-
}, 1000);
|
134 |
-
return () => {
|
135 |
-
if (interval) clearInterval(interval);
|
136 |
-
};
|
137 |
}, [mode, isLoaded, prompt, runInference, urlProcessing]);
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
// File mode: process uploaded image (only on button click)
|
140 |
const handleProcessImage = async () => {
|
141 |
if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
|
@@ -155,7 +164,10 @@ export default function MultiSourceCaptioningView() {
|
|
155 |
setDebugOutput(output);
|
156 |
setInferenceStatus("Inference complete.");
|
157 |
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
158 |
-
|
|
|
|
|
|
|
159 |
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
160 |
drawBoundingBoxesOnCanvas(ctx, boxes);
|
161 |
setImageProcessed(true);
|
@@ -164,79 +176,6 @@ export default function MultiSourceCaptioningView() {
|
|
164 |
};
|
165 |
|
166 |
// File mode: process uploaded video frames (start/stop)
|
167 |
-
useEffect(() => {
|
168 |
-
if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
|
169 |
-
let interval: ReturnType<typeof setInterval> | null = null;
|
170 |
-
const processVideoFrame = async () => {
|
171 |
-
if (!videoRef.current || !canvasRef.current) return;
|
172 |
-
const video = videoRef.current;
|
173 |
-
const canvas = canvasRef.current;
|
174 |
-
if (video.paused || video.ended || video.videoWidth === 0) return;
|
175 |
-
canvas.width = video.videoWidth;
|
176 |
-
canvas.height = video.videoHeight;
|
177 |
-
const ctx = canvas.getContext("2d");
|
178 |
-
if (!ctx) return;
|
179 |
-
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
180 |
-
await runInference(video, prompt, (output: string) => {
|
181 |
-
setDebugOutput(output);
|
182 |
-
setInferenceStatus("Inference complete.");
|
183 |
-
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
184 |
-
const boxes = extractJsonFromMarkdown(output) || [];
|
185 |
-
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
186 |
-
drawBoundingBoxesOnCanvas(ctx, boxes);
|
187 |
-
});
|
188 |
-
};
|
189 |
-
interval = setInterval(() => {
|
190 |
-
processVideoFrame();
|
191 |
-
}, 1000);
|
192 |
-
return () => {
|
193 |
-
if (interval) clearInterval(interval);
|
194 |
-
};
|
195 |
-
}, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
|
196 |
-
|
197 |
-
// File mode: process example video frames (start/stop)
|
198 |
-
useEffect(() => {
|
199 |
-
if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
|
200 |
-
let interval: ReturnType<typeof setInterval> | null = null;
|
201 |
-
const processVideoFrame = async () => {
|
202 |
-
if (!videoRef.current || !canvasRef.current) return;
|
203 |
-
const video = videoRef.current;
|
204 |
-
const canvas = canvasRef.current;
|
205 |
-
if (video.paused || video.ended || video.videoWidth === 0) return;
|
206 |
-
canvas.width = video.videoWidth;
|
207 |
-
canvas.height = video.videoHeight;
|
208 |
-
const ctx = canvas.getContext("2d");
|
209 |
-
if (!ctx) return;
|
210 |
-
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
211 |
-
await runInference(video, prompt, (output: string) => {
|
212 |
-
setDebugOutput(output);
|
213 |
-
setInferenceStatus("Inference complete.");
|
214 |
-
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
|
215 |
-
const boxes = extractJsonFromMarkdown(output) || [];
|
216 |
-
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
217 |
-
drawBoundingBoxesOnCanvas(ctx, boxes);
|
218 |
-
});
|
219 |
-
};
|
220 |
-
interval = setInterval(() => {
|
221 |
-
processVideoFrame();
|
222 |
-
}, 1000);
|
223 |
-
return () => {
|
224 |
-
if (interval) clearInterval(interval);
|
225 |
-
};
|
226 |
-
}, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
|
227 |
-
|
228 |
-
// Handle file upload
|
229 |
-
const handleFileChange = (e: any) => {
|
230 |
-
const file = e.target.files?.[0] || null;
|
231 |
-
setUploadedFile(file);
|
232 |
-
setUploadedUrl(file ? URL.createObjectURL(file) : "");
|
233 |
-
setError(null);
|
234 |
-
setImageProcessed(false);
|
235 |
-
setVideoProcessing(false);
|
236 |
-
setExampleProcessing(false);
|
237 |
-
};
|
238 |
-
|
239 |
-
// Handle start/stop for video processing
|
240 |
const handleToggleVideoProcessing = () => {
|
241 |
setVideoProcessing((prev) => !prev);
|
242 |
};
|
|
|
6 |
// Union of the string literals listed in the MODES tuple ("Webcam" | "URL" | "File").
type Mode = typeof MODES[number];
|
7 |
|
8 |
// Root-relative URL of the example video.
// NOTE(review): presumably served from the app's static assets — confirm.
const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
|
9 |
+
// Example prompt text: asks for every person in the image as a JSON array of
// { label, bbox_2d: [x1, y1, x2, y2] } objects with pixel coordinates.
const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
|
10 |
+
|
11 |
+
/**
 * Converts a "flat" detection array of the form `[label, bbox, bbox, ...]`
 * into one `{ label, bbox_2d }` object per bounding box.
 *
 * The input is recognised only when its first element is a string and its
 * second element is an array; anything else yields an empty result.
 * Entries after the label that are not `[x1, y1, x2, y2]` arrays of four
 * finite numbers are skipped, so stray strings, `null`s or malformed boxes
 * never reach the returned list.
 *
 * @param arr - Candidate flat array, typically parsed model output.
 * @returns One labelled box per valid bounding box, in input order.
 */
function parseFlatBoxArray(arr: readonly unknown[]): { label: string; bbox_2d: number[] }[] {
  const [label, firstBox] = arr;
  if (typeof label !== "string" || !Array.isArray(firstBox)) {
    return [];
  }

  // A usable box is exactly four finite numbers: [x1, y1, x2, y2].
  const isBbox = (value: unknown): value is number[] =>
    Array.isArray(value) &&
    value.length === 4 &&
    value.every((n) => typeof n === "number" && Number.isFinite(n));

  return arr
    .slice(1)
    .filter(isBbox)
    .map((bbox_2d) => ({ label, bbox_2d }));
}
|
18 |
|
19 |
function isImageFile(file: File) {
|
20 |
return file.type.startsWith("image/");
|
|
|
82 |
};
|
83 |
}, [mode]);
|
84 |
|
85 |
+
// Video modes process frames in an awaited async loop rather than setInterval, so each frame finishes before the next one starts.
|
86 |
+
// Webcam mode:
|
87 |
useEffect(() => {
  // Runs only in Webcam mode, once the model is loaded and the webcam is active.
  if (mode !== "Webcam" || !isLoaded || !webcamActive) return;

  // Cleared by the cleanup below so the loop exits when the effect is torn down.
  let running = true;

  // Processes one frame at a time, then waits one second before the next.
  async function processLoop(): Promise<void> {
    while (running) {
      const video = videoRef.current;
      // Skip frames while the video is paused, ended, or has no decoded size yet.
      if (video && !video.paused && !video.ended && video.videoWidth > 0) {
        try {
          await processVideoFrame();
        } catch (err: unknown) {
          // One failed frame must not end the loop or become an unhandled rejection.
          console.error("Webcam frame processing failed:", err);
        }
      }
      // The effect may have been cleaned up while the frame was in flight.
      if (!running) break;
      await new Promise<void>((resolve) => setTimeout(resolve, 1000)); // 1 FPS
    }
  }

  void processLoop();

  return () => {
    running = false;
  };
}, [mode, isLoaded, prompt, runInference, webcamActive]);
|
101 |
|
102 |
+
// URL mode (the uploaded-video and example-video effects below follow the same loop pattern):
|
103 |
useEffect(() => {
  // Runs only in URL mode, once the model is loaded and URL processing is switched on.
  if (mode !== "URL" || !isLoaded || !urlProcessing) return;

  // Cleared by the cleanup below so the loop exits when the effect is torn down.
  let running = true;

  // Processes one frame at a time, then waits one second before the next.
  async function processLoop(): Promise<void> {
    while (running) {
      const video = videoRef.current;
      // Skip frames while the video is paused, ended, or has no decoded size yet.
      if (video && !video.paused && !video.ended && video.videoWidth > 0) {
        try {
          await processVideoFrame();
        } catch (err: unknown) {
          // One failed frame must not end the loop or become an unhandled rejection.
          console.error("URL video frame processing failed:", err);
        }
      }
      // The effect may have been cleaned up while the frame was in flight.
      if (!running) break;
      await new Promise<void>((resolve) => setTimeout(resolve, 1000));
    }
  }

  void processLoop();

  return () => {
    running = false;
  };
}, [mode, isLoaded, prompt, runInference, urlProcessing]);
|
117 |
|
118 |
+
useEffect(() => {
  // Runs only in File mode with an uploaded video, once the model is loaded
  // and video processing is switched on.
  if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;

  // Cleared by the cleanup below so the loop exits when the effect is torn down.
  let running = true;

  // Processes one frame at a time, then waits one second before the next.
  async function processLoop(): Promise<void> {
    while (running) {
      const video = videoRef.current;
      // Skip frames while the video is paused, ended, or has no decoded size yet.
      if (video && !video.paused && !video.ended && video.videoWidth > 0) {
        try {
          await processVideoFrame();
        } catch (err: unknown) {
          // One failed frame must not end the loop or become an unhandled rejection.
          console.error("Uploaded video frame processing failed:", err);
        }
      }
      // The effect may have been cleaned up while the frame was in flight.
      if (!running) break;
      await new Promise<void>((resolve) => setTimeout(resolve, 1000));
    }
  }

  void processLoop();

  return () => {
    running = false;
  };
}, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
|
132 |
+
|
133 |
+
useEffect(() => {
  // Runs only in File mode when no file is uploaded, once the model is loaded
  // and example processing is switched on.
  if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;

  // Cleared by the cleanup below so the loop exits when the effect is torn down.
  let running = true;

  // Processes one frame at a time, then waits one second before the next.
  async function processLoop(): Promise<void> {
    while (running) {
      const video = videoRef.current;
      // Skip frames while the video is paused, ended, or has no decoded size yet.
      if (video && !video.paused && !video.ended && video.videoWidth > 0) {
        try {
          await processVideoFrame();
        } catch (err: unknown) {
          // One failed frame must not end the loop or become an unhandled rejection.
          console.error("Example video frame processing failed:", err);
        }
      }
      // The effect may have been cleaned up while the frame was in flight.
      if (!running) break;
      await new Promise<void>((resolve) => setTimeout(resolve, 1000));
    }
  }

  void processLoop();

  return () => {
    running = false;
  };
}, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
|
147 |
+
|
148 |
// File mode: process uploaded image (only on button click)
|
149 |
const handleProcessImage = async () => {
|
150 |
if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
|
|
|
164 |
setDebugOutput(output);
|
165 |
setInferenceStatus("Inference complete.");
|
166 |
ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
|
167 |
+
let boxes = extractJsonFromMarkdown(output) || [];
|
168 |
+
if (boxes.length === 0 && Array.isArray(output)) {
|
169 |
+
boxes = parseFlatBoxArray(output);
|
170 |
+
}
|
171 |
if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
|
172 |
drawBoundingBoxesOnCanvas(ctx, boxes);
|
173 |
setImageProcessed(true);
|
|
|
176 |
};
|
177 |
|
178 |
// File mode: process uploaded video frames (start/stop)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
// Start/stop control: flips the video-processing flag based on its previous value.
const handleToggleVideoProcessing = () => {
  setVideoProcessing((wasProcessing) => {
    return !wasProcessing;
  });
};
|