Quazim0t0 committed on
Commit
83c414e
·
verified ·
1 Parent(s): 9eb35d5

Upload 37 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -6,7 +6,15 @@ const MODES = ["Webcam", "URL", "File"] as const;
6
  type Mode = typeof MODES[number];
7
 
8
  const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
9
- const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";
 
 
 
 
 
 
 
 
10
 
11
  function isImageFile(file: File) {
12
  return file.type.startsWith("image/");
@@ -74,68 +82,69 @@ export default function MultiSourceCaptioningView() {
74
  };
75
  }, [mode]);
76
 
77
- // Process webcam frames (unchanged)
 
78
  useEffect(() => {
79
  if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
80
- let interval: ReturnType<typeof setInterval> | null = null;
81
- const processVideoFrame = async () => {
82
- if (!videoRef.current || !canvasRef.current) return;
83
- const video = videoRef.current;
84
- const canvas = canvasRef.current;
85
- if (video.videoWidth === 0) return;
86
- canvas.width = video.videoWidth;
87
- canvas.height = video.videoHeight;
88
- const ctx = canvas.getContext("2d");
89
- if (!ctx) return;
90
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
91
- await runInference(video, prompt, (output: string) => {
92
- setDebugOutput(output);
93
- setInferenceStatus("Inference complete.");
94
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
95
- const boxes = extractJsonFromMarkdown(output) || [];
96
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
97
- drawBoundingBoxesOnCanvas(ctx, boxes);
98
- });
99
- };
100
- interval = setInterval(() => {
101
- processVideoFrame();
102
- }, 1000);
103
- return () => {
104
- if (interval) clearInterval(interval);
105
- };
106
  }, [mode, isLoaded, prompt, runInference, webcamActive]);
107
 
108
- // URL mode: process video frames only when urlProcessing is true
109
  useEffect(() => {
110
  if (mode !== "URL" || !isLoaded || !urlProcessing) return;
111
- let interval: ReturnType<typeof setInterval> | null = null;
112
- const processVideoFrame = async () => {
113
- if (!videoRef.current || !canvasRef.current) return;
114
- const video = videoRef.current;
115
- const canvas = canvasRef.current;
116
- if (video.paused || video.ended || video.videoWidth === 0) return;
117
- canvas.width = video.videoWidth;
118
- canvas.height = video.videoHeight;
119
- const ctx = canvas.getContext("2d");
120
- if (!ctx) return;
121
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
122
- await runInference(video, prompt, (output: string) => {
123
- setDebugOutput(output);
124
- setInferenceStatus("Inference complete.");
125
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
126
- const boxes = extractJsonFromMarkdown(output) || [];
127
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
128
- drawBoundingBoxesOnCanvas(ctx, boxes);
129
- });
130
- };
131
- interval = setInterval(() => {
132
- processVideoFrame();
133
- }, 1000);
134
- return () => {
135
- if (interval) clearInterval(interval);
136
- };
137
  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  // File mode: process uploaded image (only on button click)
140
  const handleProcessImage = async () => {
141
  if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
@@ -155,7 +164,10 @@ export default function MultiSourceCaptioningView() {
155
  setDebugOutput(output);
156
  setInferenceStatus("Inference complete.");
157
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
158
- const boxes = extractJsonFromMarkdown(output) || [];
 
 
 
159
  if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
160
  drawBoundingBoxesOnCanvas(ctx, boxes);
161
  setImageProcessed(true);
@@ -164,79 +176,6 @@ export default function MultiSourceCaptioningView() {
164
  };
165
 
166
  // File mode: process uploaded video frames (start/stop)
167
- useEffect(() => {
168
- if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
169
- let interval: ReturnType<typeof setInterval> | null = null;
170
- const processVideoFrame = async () => {
171
- if (!videoRef.current || !canvasRef.current) return;
172
- const video = videoRef.current;
173
- const canvas = canvasRef.current;
174
- if (video.paused || video.ended || video.videoWidth === 0) return;
175
- canvas.width = video.videoWidth;
176
- canvas.height = video.videoHeight;
177
- const ctx = canvas.getContext("2d");
178
- if (!ctx) return;
179
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
180
- await runInference(video, prompt, (output: string) => {
181
- setDebugOutput(output);
182
- setInferenceStatus("Inference complete.");
183
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
184
- const boxes = extractJsonFromMarkdown(output) || [];
185
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
186
- drawBoundingBoxesOnCanvas(ctx, boxes);
187
- });
188
- };
189
- interval = setInterval(() => {
190
- processVideoFrame();
191
- }, 1000);
192
- return () => {
193
- if (interval) clearInterval(interval);
194
- };
195
- }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
196
-
197
- // File mode: process example video frames (start/stop)
198
- useEffect(() => {
199
- if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
200
- let interval: ReturnType<typeof setInterval> | null = null;
201
- const processVideoFrame = async () => {
202
- if (!videoRef.current || !canvasRef.current) return;
203
- const video = videoRef.current;
204
- const canvas = canvasRef.current;
205
- if (video.paused || video.ended || video.videoWidth === 0) return;
206
- canvas.width = video.videoWidth;
207
- canvas.height = video.videoHeight;
208
- const ctx = canvas.getContext("2d");
209
- if (!ctx) return;
210
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
211
- await runInference(video, prompt, (output: string) => {
212
- setDebugOutput(output);
213
- setInferenceStatus("Inference complete.");
214
- ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
215
- const boxes = extractJsonFromMarkdown(output) || [];
216
- if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
217
- drawBoundingBoxesOnCanvas(ctx, boxes);
218
- });
219
- };
220
- interval = setInterval(() => {
221
- processVideoFrame();
222
- }, 1000);
223
- return () => {
224
- if (interval) clearInterval(interval);
225
- };
226
- }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
227
-
228
- // Handle file upload
229
- const handleFileChange = (e: any) => {
230
- const file = e.target.files?.[0] || null;
231
- setUploadedFile(file);
232
- setUploadedUrl(file ? URL.createObjectURL(file) : "");
233
- setError(null);
234
- setImageProcessed(false);
235
- setVideoProcessing(false);
236
- setExampleProcessing(false);
237
- };
238
-
239
- // Handle start/stop for video processing
240
  const handleToggleVideoProcessing = () => {
241
  setVideoProcessing((prev) => !prev);
242
  };
 
6
  type Mode = typeof MODES[number];
7
 
8
  const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
9
+ const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
+
11
+ function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
12
+ if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
13
+ const label = arr[0];
14
+ return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
15
+ }
16
+ return [];
17
+ }
18
 
19
  function isImageFile(file: File) {
20
  return file.type.startsWith("image/");
 
82
  };
83
  }, [mode]);
84
 
85
+ // Replace setInterval-based frame processing with an async loop for all video modes
86
+ // Example for webcam mode:
87
  useEffect(() => {
88
  if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
89
+ let running = true;
90
+ async function processLoop() {
91
+ while (running) {
92
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
93
+ await processVideoFrame();
94
+ }
95
+ await new Promise(res => setTimeout(res, 1000)); // 1 FPS
96
+ }
97
+ }
98
+ processLoop();
99
+ return () => { running = false; };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  }, [mode, isLoaded, prompt, runInference, webcamActive]);
101
 
102
+ // Repeat for URL, File video, Example video modes:
103
  useEffect(() => {
104
  if (mode !== "URL" || !isLoaded || !urlProcessing) return;
105
+ let running = true;
106
+ async function processLoop() {
107
+ while (running) {
108
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
109
+ await processVideoFrame();
110
+ }
111
+ await new Promise(res => setTimeout(res, 1000));
112
+ }
113
+ }
114
+ processLoop();
115
+ return () => { running = false; };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
117
 
118
+ useEffect(() => {
119
+ if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
120
+ let running = true;
121
+ async function processLoop() {
122
+ while (running) {
123
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
124
+ await processVideoFrame();
125
+ }
126
+ await new Promise(res => setTimeout(res, 1000));
127
+ }
128
+ }
129
+ processLoop();
130
+ return () => { running = false; };
131
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
132
+
133
+ useEffect(() => {
134
+ if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
135
+ let running = true;
136
+ async function processLoop() {
137
+ while (running) {
138
+ if (videoRef.current && !videoRef.current.paused && !videoRef.current.ended && videoRef.current.videoWidth > 0) {
139
+ await processVideoFrame();
140
+ }
141
+ await new Promise(res => setTimeout(res, 1000));
142
+ }
143
+ }
144
+ processLoop();
145
+ return () => { running = false; };
146
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
147
+
148
  // File mode: process uploaded image (only on button click)
149
  const handleProcessImage = async () => {
150
  if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
 
164
  setDebugOutput(output);
165
  setInferenceStatus("Inference complete.");
166
  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
167
+ let boxes = extractJsonFromMarkdown(output) || [];
168
+ if (boxes.length === 0 && Array.isArray(output)) {
169
+ boxes = parseFlatBoxArray(output);
170
+ }
171
  if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
172
  drawBoundingBoxesOnCanvas(ctx, boxes);
173
  setImageProcessed(true);
 
176
  };
177
 
178
  // File mode: process uploaded video frames (start/stop)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
// Start/stop button handler: flips the video-processing flag via a functional
// update so it is safe against stale state.
const handleToggleVideoProcessing = () => {
  setVideoProcessing(current => !current);
};