Quazim0t0 committed on
Commit
96b5215
·
verified ·
1 Parent(s): 20197c2

Upload 51 files

Browse files
src/App.tsx CHANGED
@@ -11,8 +11,7 @@ export default function App() {
11
  await loadModel();
12
  setStarted(true);
13
  } catch (e) {
14
- // error is handled by context, could log here if needed
15
- console.error("Failed to load model:", e);
16
  }
17
  };
18
 
@@ -28,9 +27,6 @@ export default function App() {
28
  {isLoading ? "Loading Model..." : "Load Model"}
29
  </button>
30
  {error && <div className="text-red-400 mt-2">Model error: {error}</div>}
31
- <p className="text-sm text-gray-400 mt-2">
32
- Model will download on first load. This may take a moment.
33
- </p>
34
  </div>
35
  );
36
  }
@@ -41,4 +37,4 @@ export default function App() {
41
  <MultiSourceCaptioningView />
42
  </div>
43
  );
44
- }
 
11
  await loadModel();
12
  setStarted(true);
13
  } catch (e) {
14
+ // error is handled by context
 
15
  }
16
  };
17
 
 
27
  {isLoading ? "Loading Model..." : "Load Model"}
28
  </button>
29
  {error && <div className="text-red-400 mt-2">Model error: {error}</div>}
 
 
 
30
  </div>
31
  );
32
  }
 
37
  <MultiSourceCaptioningView />
38
  </div>
39
  );
40
+ }
src/components/BoxAnnotator.ts CHANGED
@@ -16,7 +16,6 @@ export function extractJsonFromMarkdown(markdown: string): any[] | null {
16
  if (typeof parsed === "object" && parsed !== null) return [parsed]; // <-- Wrap object in array
17
  return null;
18
  } catch {
19
- console.error("Failed to parse JSON from markdown:", jsonString);
20
  return null;
21
  }
22
  }
@@ -32,15 +31,7 @@ export function drawBoundingBoxesOnCanvas(
32
  boxes: { bbox_2d: number[]; label?: string }[],
33
  options?: { color?: string; lineWidth?: number; font?: string, scaleX?: number, scaleY?: number }
34
  ) {
35
- if (!Array.isArray(boxes)) {
36
- console.warn("drawBoundingBoxesOnCanvas: 'boxes' is not an array or is null/undefined.", boxes);
37
- return;
38
- }
39
- if (boxes.length === 0) {
40
- // console.log("drawBoundingBoxesOnCanvas: 'boxes' array is empty, nothing to draw.");
41
- return;
42
- }
43
-
44
  const color = options?.color || "#00FF00";
45
  const lineWidth = options?.lineWidth || 2;
46
  const font = options?.font || "16px Arial";
@@ -63,10 +54,9 @@ export function drawBoundingBoxesOnCanvas(
63
  ctx.rect(sx1, sy1, sx2 - sx1, sy2 - sy1);
64
  ctx.stroke();
65
  if (obj.label) {
66
- // Adjust text position to ensure visibility, especially if near top edge
67
- ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 16 ? sy1 + 16 : sy1 - 4);
68
  }
69
  });
70
 
71
  ctx.restore();
72
- }
 
16
  if (typeof parsed === "object" && parsed !== null) return [parsed]; // <-- Wrap object in array
17
  return null;
18
  } catch {
 
19
  return null;
20
  }
21
  }
 
31
  boxes: { bbox_2d: number[]; label?: string }[],
32
  options?: { color?: string; lineWidth?: number; font?: string, scaleX?: number, scaleY?: number }
33
  ) {
34
+ if (!Array.isArray(boxes)) return; // Prevent errors if boxes is undefined/null
 
 
 
 
 
 
 
 
35
  const color = options?.color || "#00FF00";
36
  const lineWidth = options?.lineWidth || 2;
37
  const font = options?.font || "16px Arial";
 
54
  ctx.rect(sx1, sy1, sx2 - sx1, sy2 - sy1);
55
  ctx.stroke();
56
  if (obj.label) {
57
+ ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 10 ? sy1 + 16 : sy1 - 4);
 
58
  }
59
  });
60
 
61
  ctx.restore();
62
+ }
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,533 +1,520 @@
1
- import React, { useState, useRef, useEffect, useCallback } from "react";
2
- import { useVLMContext } from "../context/useVLMContext";
3
- import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
-
5
- const MODES = ["Webcam", "URL", "File"] as const;
6
- type Mode = typeof MODES[number];
7
-
8
- const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
9
- const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
-
11
- // Helper function: normalizeBoxes remains as it is used
12
- function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
13
- if (!raw) return [];
14
- let boxes = [];
15
- if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
16
- boxes = raw.image;
17
- } else if (Array.isArray(raw)) {
18
- boxes = raw;
19
- } else if (typeof raw === "object" && raw !== null) {
20
- boxes = [raw];
21
- }
22
- return boxes
23
- .map((obj: any) => {
24
- if (!obj || !obj.bbox_2d) return null;
25
- let bbox = obj.bbox_2d;
26
- if (
27
- Array.isArray(bbox) &&
28
- bbox.length === 2 &&
29
- Array.isArray(bbox[0]) &&
30
- Array.isArray(bbox[1]) &&
31
- bbox[0].length === 2 &&
32
- bbox[1].length === 2
33
- ) {
34
- bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
35
- }
36
- if (
37
- Array.isArray(bbox) &&
38
- bbox.length === 4 &&
39
- bbox.every((v: any) => typeof v === "number")
40
- ) {
41
- return { ...obj, bbox_2d: bbox };
42
- }
43
- return null;
44
- })
45
- .filter((obj: any) => obj);
46
- }
47
-
48
- function isImageFile(file: File) {
49
- return file.type.startsWith("image/");
50
- }
51
- function isVideoFile(file: File) {
52
- return file.type.startsWith("video/");
53
- }
54
-
55
- export default function MultiSourceCaptioningView() {
56
- const [mode, setMode] = useState<Mode>("File");
57
- const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
58
- const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
59
- const [processingState, setProcessingState] = useState(false); // General processing indicator
60
- const [error, setError] = useState<string | null>(null);
61
- const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream
62
- const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
63
- const [inferenceStatus, setInferenceStatus] = useState<string>("");
64
- const [debugOutput, setDebugOutput] = useState<string>("");
65
- const [uploadedFile, setUploadedFile] = useState<File | null>(null); // <<< ADDED THIS STATE
66
-
67
- // Refs for the two video elements and the canvas
68
- const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
69
- const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
70
- const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
71
- const imageRef = useRef<HTMLImageElement>(null); // For image file processing
72
-
73
- const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
74
-
75
- // --- Drawing Loop for the Visible Display ---
76
- // This loop runs constantly to draw the latest boxes on the display video
77
- const drawDisplayCanvas = useCallback(() => {
78
- const displayVideo = displayVideoRef.current;
79
- const canvas = canvasRef.current;
80
- const ctx = canvas?.getContext('2d');
81
-
82
- if (!displayVideo || !canvas || !ctx) {
83
- return;
84
- }
85
-
86
- // Adjust canvas size to match the display video's dimensions
87
- // Only adjust if video has valid dimensions
88
- if (displayVideo.videoWidth > 0 && displayVideo.videoHeight > 0 &&
89
- (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight)) {
90
- canvas.width = displayVideo.videoWidth;
91
- canvas.height = displayVideo.videoHeight;
92
- }
93
-
94
- // Clear the canvas each frame
95
- ctx.clearRect(0, 0, canvas.width, canvas.height);
96
-
97
- // Draw the latest bounding boxes
98
- const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
99
- const scaleY = canvas.height / (displayVideo.videoHeight || 1);
100
- drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
101
-
102
- // Only request next frame if video is playing to avoid unnecessary redraws when paused/ended
103
- if (!displayVideo.paused && !displayVideo.ended) {
104
- requestAnimationFrame(drawDisplayCanvas);
105
- }
106
- }, [latestBoxes]); // Re-create if latestBoxes changes
107
-
108
- // Effect to start the display drawing loop when the display video is ready
109
- useEffect(() => {
110
- const displayVideo = displayVideoRef.current;
111
- if (displayVideo) {
112
- const handleVideoReady = () => {
113
- if (displayVideo.readyState >= 1) { // HAVE_METADATA
114
- requestAnimationFrame(drawDisplayCanvas);
115
- }
116
- };
117
- displayVideo.addEventListener('loadedmetadata', handleVideoReady);
118
- displayVideo.addEventListener('play', handleVideoReady); // Also start on play
119
- // Also check if video is already ready (e.g., on component re-mount or autoplay)
120
- if (displayVideo.readyState >= 1) {
121
- requestAnimationFrame(drawDisplayCanvas);
122
- }
123
- return () => {
124
- displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
125
- displayVideo.removeEventListener('play', handleVideoReady);
126
- };
127
- }
128
- }, [drawDisplayCanvas]);
129
-
130
- // --- FastVLM Processing Loop (from hidden video) ---
131
- // This interval loop controls when FastVLM processes a frame
132
- useEffect(() => {
133
- const vlmVideo = vlmVideoRef.current;
134
- // Determine if we are in a video-based mode that requires continuous processing
135
- const isVideoModeActive = (
136
- mode === "Webcam" ||
137
- (mode === "URL" && !!vlmVideo?.src) || // Check if URL video is loaded
138
- (mode === "File" && !!vlmVideo?.src && uploadedFile && isVideoFile(uploadedFile))
139
- );
140
-
141
- if (!isLoaded || !vlmVideo || !isVideoModeActive) {
142
- setProcessingState(false);
143
- return;
144
- }
145
-
146
- let interval: ReturnType<typeof setInterval> | null = null;
147
-
148
- const startVLMProcessing = () => {
149
- if (interval) clearInterval(interval); // Clear any old interval
150
-
151
- interval = setInterval(async () => {
152
- if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) {
153
- return; // Skip if video not ready, paused, ended, or already processing
154
- }
155
-
156
- setProcessingState(true);
157
- setInferenceStatus("Running inference...");
158
- setError(null);
159
-
160
- try {
161
- // Pass the HTMLVideoElement directly to runInference
162
- const modelOutput = await runInference(vlmVideo, prompt); // <<< FIXED: Pass video element directly
163
- setDebugOutput(modelOutput);
164
-
165
- let boxes = extractJsonFromMarkdown(modelOutput) || [];
166
- boxes = normalizeBoxes(boxes);
167
-
168
- setLatestBoxes(boxes);
169
- setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
170
- } catch (e) {
171
- setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
172
- setLatestBoxes([]);
173
- setInferenceStatus("Inference failed.");
174
- } finally {
175
- setProcessingState(false);
176
- }
177
- }, 200); // Inference interval (e.g., 5 frames per second)
178
- };
179
-
180
- const stopVLMProcessing = () => {
181
- if (interval) clearInterval(interval);
182
- interval = null;
183
- setProcessingState(false);
184
- setInferenceStatus("Stopped processing.");
185
- };
186
-
187
- vlmVideo.addEventListener('play', startVLMProcessing);
188
- vlmVideo.addEventListener('pause', stopVLMProcessing);
189
- vlmVideo.addEventListener('ended', stopVLMProcessing);
190
- vlmVideo.addEventListener('loadeddata', startVLMProcessing); // Also start on loadeddata for better reliability
191
-
192
- // Initial check if video is already playing or ready
193
- if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
194
- startVLMProcessing();
195
- }
196
-
197
- return () => {
198
- stopVLMProcessing();
199
- vlmVideo.removeEventListener('play', startVLMProcessing);
200
- vlmVideo.removeEventListener('pause', stopVLMProcessing);
201
- vlmVideo.removeEventListener('ended', stopVLMProcessing);
202
- vlmVideo.removeEventListener('loadeddata', startVLMProcessing);
203
- };
204
- }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Keep uploadedFile for re-trigger on file change
205
-
206
- // --- Media Source Handling ---
207
-
208
- // Cleanup for media stream and object URLs
209
- const cleanupMediaSource = useCallback(() => {
210
- if (mediaStream) {
211
- mediaStream.getTracks().forEach(track => track.stop());
212
- setMediaStream(null);
213
- }
214
- if (displayVideoRef.current?.src.startsWith('blob:')) {
215
- URL.revokeObjectURL(displayVideoRef.current.src);
216
- displayVideoRef.current.src = "";
217
- }
218
- if (vlmVideoRef.current?.src.startsWith('blob:')) {
219
- URL.revokeObjectURL(vlmVideoRef.current.src);
220
- vlmVideoRef.current.src = "";
221
- }
222
- setLatestBoxes([]);
223
- setError(null);
224
- setInferenceStatus("");
225
- setDebugOutput("");
226
- setUploadedFile(null); // <<< ADDED: Clear uploaded file on source change
227
- }, [mediaStream]);
228
-
229
- // Handle changing the mode (Webcam, URL, File)
230
- useEffect(() => {
231
- cleanupMediaSource();
232
-
233
- const displayVideo = displayVideoRef.current;
234
- const vlmVideo = vlmVideoRef.current;
235
-
236
- if (!displayVideo || !vlmVideo) return;
237
-
238
- // Reset srcObject/src to ensure fresh start
239
- displayVideo.srcObject = null;
240
- vlmVideo.srcObject = null;
241
- displayVideo.src = "";
242
- vlmVideo.src = "";
243
-
244
- // Special handling for initial "File" mode to load example video if no file is selected
245
- if (mode === "File" && !uploadedFile) { // <<< FIXED: Check uploadedFile here
246
- displayVideo.src = EXAMPLE_VIDEO_URL;
247
- vlmVideo.src = EXAMPLE_VIDEO_URL;
248
- displayVideo.load(); vlmVideo.load();
249
- displayVideo.play().catch(e => console.error("Error playing example display video:", e));
250
- vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
251
- }
252
- }, [mode, uploadedFile, cleanupMediaSource]); // Added uploadedFile to ensure re-trigger for file mode
253
-
254
- // Handle Webcam Input
255
- const handleWebcamInput = useCallback(async () => {
256
- cleanupMediaSource();
257
- try {
258
- const stream = await navigator.mediaDevices.getUserMedia({ video: true });
259
- setMediaStream(stream);
260
-
261
- if (displayVideoRef.current && vlmVideoRef.current) {
262
- displayVideoRef.current.srcObject = stream;
263
- vlmVideoRef.current.srcObject = stream;
264
- displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
265
- vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
266
- }
267
- setMode("Webcam");
268
- } catch (e) {
269
- setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
270
- setMediaStream(null);
271
- setLatestBoxes([]);
272
- setInferenceStatus("Webcam access denied or failed.");
273
- }
274
- }, [cleanupMediaSource]);
275
-
276
- // Handle URL Input (when Load button is clicked)
277
- const handleLoadUrl = useCallback(() => {
278
- cleanupMediaSource();
279
-
280
- const url = currentUrlInput;
281
- if (!url) {
282
- setError("Please enter a valid URL.");
283
- return;
284
- }
285
-
286
- if (displayVideoRef.current && vlmVideoRef.current) {
287
- displayVideoRef.current.src = url;
288
- vlmVideoRef.current.src = url;
289
- displayVideoRef.current.load(); vlmVideoRef.current.load();
290
- displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
291
- vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
292
- setMode("URL");
293
- }
294
- }, [currentUrlInput, cleanupMediaSource]);
295
-
296
- // Handle File Input
297
- const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
298
- cleanupMediaSource();
299
-
300
- const file = e.target.files?.[0] || null;
301
- setUploadedFile(file); // <<< FIXED: Set uploadedFile state here
302
-
303
- if (file) {
304
- const fileUrl = URL.createObjectURL(file);
305
-
306
- if (isImageFile(file)) {
307
- // Image file, will be handled by imageRef and single processing logic
308
- setMode("File"); // Ensure mode is "File"
309
- // No direct video assignment needed here, imageRef handles display
310
- } else if (isVideoFile(file)) {
311
- if (displayVideoRef.current && vlmVideoRef.current) {
312
- displayVideoRef.current.src = fileUrl;
313
- vlmVideoRef.current.src = fileUrl;
314
- displayVideoRef.current.load(); vlmVideoRef.current.load();
315
- displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
316
- vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
317
- setMode("File"); // Ensure mode is "File"
318
- }
319
- } else {
320
- setError("Unsupported file type. Please upload an image or video.");
321
- setUploadedFile(null); // <<< FIXED: Clear uploadedFile on error
322
- if (fileUrl) URL.revokeObjectURL(fileUrl);
323
- }
324
- } else {
325
- setUploadedFile(null); // <<< FIXED: Clear uploadedFile if no file selected
326
- // If no file selected, revert to example video if in File mode
327
- if (mode === "File") {
328
- if (displayVideoRef.current && vlmVideoRef.current) {
329
- displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
330
- vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
331
- displayVideoRef.current.load(); vlmVideoRef.current.load();
332
- displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
333
- vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
334
- }
335
- }
336
- }
337
- }, [cleanupMediaSource, mode]);
338
-
339
-
340
- // Handler for processing an uploaded image file (one-time inference)
341
- const handleProcessImage = async () => {
342
- if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
343
- setError("Image or model not ready for processing, or no image file selected.");
344
- return;
345
- }
346
-
347
- const img = imageRef.current;
348
- const canvas = canvasRef.current;
349
- const ctx = canvas.getContext("2d");
350
- if (!ctx) return;
351
-
352
- canvas.width = img.naturalWidth;
353
- canvas.height = img.naturalHeight;
354
-
355
- setProcessingState(true);
356
- setError(null);
357
- setInferenceStatus("Running image inference...");
358
-
359
- try {
360
- // Pass the HTMLImageElement directly to runInference
361
- const modelOutput = await runInference(img, prompt); // <<< FIXED: Pass image element directly
362
- setDebugOutput(modelOutput);
363
- setInferenceStatus("Image inference complete.");
364
-
365
- ctx.clearRect(0, 0, canvas.width, canvas.height);
366
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height); // Redraw image
367
-
368
- let boxes = extractJsonFromMarkdown(modelOutput) || [];
369
- boxes = normalizeBoxes(boxes);
370
- setLatestBoxes(boxes);
371
-
372
- if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
373
- } catch (e) {
374
- setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
375
- setLatestBoxes([]);
376
- setInferenceStatus("Image inference failed.");
377
- } finally {
378
- setProcessingState(false);
379
- }
380
- };
381
-
382
- // --- Rendered UI ---
383
- return (
384
- <div className="absolute inset-0 text-white flex flex-col">
385
- <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
386
- {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
387
- </div>
388
- <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div>
389
-
390
- <div className="flex flex-col items-center justify-center flex-1 w-full p-4">
391
- {/* Mode Selector */}
392
- <div className="mb-6 mt-4">
393
- <div className="flex space-x-4">
394
- {MODES.map((m) => (
395
- <button
396
- key={m}
397
- className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
398
- mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
399
- }`}
400
- onClick={() => setMode(m)}
401
- disabled={!isLoaded && m !== "File"}
402
- >
403
- {m}
404
- </button>
405
- ))}
406
- </div>
407
- </div>
408
-
409
- {/* Dynamic Content Area */}
410
- <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
411
- {/* Prompt Input (Common to all modes) */}
412
- <div className="mb-4 w-full max-w-xl">
413
- <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
414
- <textarea
415
- className="w-full p-2 rounded-lg text-black"
416
- rows={3}
417
- value={prompt}
418
- onChange={(e) => setPrompt(e.target.value)}
419
- disabled={processingState}
420
- />
421
- </div>
422
-
423
- {/* Video/Image Display and Canvas Overlay */}
424
- <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
425
- {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
426
- <img
427
- ref={imageRef}
428
- src={URL.createObjectURL(uploadedFile)}
429
- alt="Uploaded"
430
- className="max-w-full max-h-full block object-contain"
431
- style={{ position: 'absolute' }}
432
- onLoad={() => {
433
- if (imageRef.current && canvasRef.current) {
434
- canvasRef.current.width = imageRef.current.naturalWidth;
435
- canvasRef.current.height = imageRef.current.naturalHeight;
436
- }
437
- }}
438
- />
439
- ) : (
440
- <video
441
- ref={displayVideoRef}
442
- autoPlay
443
- muted
444
- playsInline
445
- loop
446
- className="max-w-full max-h-full block object-contain"
447
- style={{ position: 'absolute' }}
448
- />
449
- )}
450
- <canvas
451
- ref={canvasRef}
452
- className="absolute top-0 left-0 w-full h-full pointer-events-none"
453
- style={{ zIndex: 10 }}
454
- />
455
- </div>
456
-
457
- {/* Controls specific to each mode */}
458
- <div className="mt-4 flex flex-col items-center gap-2">
459
- {mode === "Webcam" && (
460
- <button
461
- className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
462
- onClick={handleWebcamInput}
463
- disabled={processingState || !isLoaded}
464
- >
465
- {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
466
- </button>
467
- )}
468
-
469
- {mode === "URL" && (
470
- <>
471
- <div className="flex w-full max-w-xl">
472
- <input
473
- type="text"
474
- className="flex-1 px-4 py-2 rounded-l-lg text-black"
475
- value={currentUrlInput}
476
- onChange={(e) => setCurrentUrlInput(e.target.value)}
477
- placeholder="Paste video URL here"
478
- disabled={processingState}
479
- />
480
- <button
481
- className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
482
- onClick={handleLoadUrl}
483
- disabled={processingState || !isLoaded}
484
- >
485
- Load URL
486
- </button>
487
- </div>
488
- </>
489
- )}
490
-
491
- {mode === "File" && (
492
- <>
493
- <input
494
- type="file"
495
- accept="image/*,video/*"
496
- onChange={handleFileChange}
497
- className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
498
- disabled={processingState}
499
- />
500
- {uploadedFile && isImageFile(uploadedFile) && ( // <<< FIXED: Check uploadedFile here
501
- <button
502
- className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
503
- onClick={handleProcessImage}
504
- disabled={processingState || !isLoaded}
505
- >
506
- {processingState ? "Processing Image..." : "Process Image"}
507
- </button>
508
- )}
509
- </>
510
- )}
511
- </div>
512
-
513
- {/* Error and Debug Output */}
514
- {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
515
- <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
516
- <div>Raw Model Output:</div>
517
- <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
518
- </div>
519
- </div>
520
- </div>
521
-
522
- {/* Hidden Video for VLM processing - this must be rendered always */}
523
- <video
524
- ref={vlmVideoRef}
525
- autoPlay
526
- muted
527
- playsInline
528
- loop
529
- style={{ display: 'none' }} // Hidden from view
530
- />
531
- </div>
532
- );
533
- }
 
1
+ import { useState, useRef, useEffect } from "react";
2
+ import { useVLMContext } from "../context/useVLMContext";
3
+ import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
4
+
5
+ const MODES = ["Webcam", "URL", "File"] as const;
6
+ type Mode = typeof MODES[number];
7
+
8
+ const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
9
+ const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
10
+
11
+ function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
12
+ if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
13
+ const label = arr[0];
14
+ return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
15
+ }
16
+ return [];
17
+ }
18
+
19
+ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
20
+ if (!raw) return [];
21
+ let boxes = [];
22
+ if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) {
23
+ boxes = raw.image;
24
+ } else if (Array.isArray(raw)) {
25
+ boxes = raw;
26
+ } else if (typeof raw === "object" && raw !== null) {
27
+ boxes = [raw];
28
+ }
29
+ return boxes
30
+ .map((obj: any) => {
31
+ if (!obj || !obj.bbox_2d) return null;
32
+ let bbox = obj.bbox_2d;
33
+ // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
34
+ if (
35
+ Array.isArray(bbox) &&
36
+ bbox.length === 2 &&
37
+ Array.isArray(bbox[0]) &&
38
+ Array.isArray(bbox[1]) &&
39
+ bbox[0].length === 2 &&
40
+ bbox[1].length === 2
41
+ ) {
42
+ bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
43
+ }
44
+ // If bbox_2d is [x1, y1, x2, y2], use as-is
45
+ if (
46
+ Array.isArray(bbox) &&
47
+ bbox.length === 4 &&
48
+ bbox.every((v: any) => typeof v === "number")
49
+ ) {
50
+ return { ...obj, bbox_2d: bbox };
51
+ }
52
+ // Otherwise, skip
53
+ return null;
54
+ })
55
+ .filter((obj: any) => obj);
56
+ }
57
+
58
+ function isImageFile(file: File) {
59
+ return file.type.startsWith("image/");
60
+ }
61
+ function isVideoFile(file: File) {
62
+ return file.type.startsWith("video/");
63
+ }
64
+
65
+ export default function MultiSourceCaptioningView() {
66
+ const [mode, setMode] = useState<Mode>("File");
67
+ const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
68
+ const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
69
+ const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
70
+ const [processing, setProcessing] = useState(false);
71
+ const [error, setError] = useState<string | null>(null);
72
+ const [webcamActive, setWebcamActive] = useState(false);
73
+ const [uploadedFile, setUploadedFile] = useState<File | null>(null);
74
+ const [uploadedUrl, setUploadedUrl] = useState<string>("");
75
+ const [videoProcessing, setVideoProcessing] = useState(false);
76
+ const [imageProcessed, setImageProcessed] = useState(false);
77
+ const [exampleProcessing, setExampleProcessing] = useState(false);
78
+ const [urlProcessing, setUrlProcessing] = useState(false);
79
+ const [debugOutput, setDebugOutput] = useState<string>("");
80
+ const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
81
+ const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
82
+ const [inferenceStatus, setInferenceStatus] = useState<string>("");
83
+
84
+ const videoRef = useRef<HTMLVideoElement | null>(null);
85
+ const canvasRef = useRef<HTMLCanvasElement | null>(null);
86
+ const imageRef = useRef<HTMLImageElement | null>(null);
87
+ const webcamStreamRef = useRef<MediaStream | null>(null);
88
+ const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
89
+
90
+ const processVideoFrame = async () => {
91
+ if (!videoRef.current || !canvasRef.current) return;
92
+ const video = videoRef.current;
93
+ const canvas = canvasRef.current;
94
+ if (video.paused || video.ended || video.videoWidth === 0) return;
95
+ canvas.width = video.videoWidth;
96
+ canvas.height = video.videoHeight;
97
+ const ctx = canvas.getContext("2d");
98
+ if (!ctx) return;
99
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
100
+ await runInference(video, prompt, (output: string) => {
101
+ setDebugOutput(output); // <-- Ensure Raw Model Output is updated
102
+ let boxes = extractJsonFromMarkdown(output) || [];
103
+ if (boxes.length === 0 && Array.isArray(output)) {
104
+ boxes = parseFlatBoxArray(output);
105
+ }
106
+ boxes = normalizeBoxes(boxes);
107
+ console.log("Model output:", output);
108
+ console.log("Boxes after normalization:", boxes);
109
+ console.log("Canvas size:", canvas.width, canvas.height);
110
+ if (boxes.length > 0) {
111
+ const [x1, y1, x2, y2] = boxes[0].bbox_2d;
112
+ console.log("First box coords:", x1, y1, x2, y2);
113
+ }
114
+ if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
115
+ if (Array.isArray(boxes) && boxes.length > 0) {
116
+ const scaleX = canvas.width / video.videoWidth;
117
+ const scaleY = canvas.height / video.videoHeight;
118
+ ctx.clearRect(0, 0, canvas.width, canvas.height); // Clear canvas before drawing boxes
119
+ drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Use visible color and thick line
120
+ }
121
+ });
122
+ };
123
+
124
+ const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
125
+ const file = e.target.files?.[0] || null;
126
+ setUploadedFile(file);
127
+ setUploadedUrl(file ? URL.createObjectURL(file) : "");
128
+ setError(null);
129
+ setImageProcessed(false);
130
+ setVideoProcessing(false);
131
+ setExampleProcessing(false);
132
+ };
133
+
134
+ // Webcam setup and teardown (unchanged)
135
+ useEffect(() => {
136
+ if (mode !== "Webcam") {
137
+ if (webcamStreamRef.current) {
138
+ webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
139
+ webcamStreamRef.current = null;
140
+ }
141
+ setWebcamActive(false);
142
+ return;
143
+ }
144
+ const setupWebcam = async () => {
145
+ try {
146
+ setError(null);
147
+ const stream = await navigator.mediaDevices.getUserMedia({ video: true });
148
+ webcamStreamRef.current = stream;
149
+ if (videoRef.current) {
150
+ videoRef.current.srcObject = stream;
151
+ setWebcamActive(true);
152
+ }
153
+ } catch (e) {
154
+ setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
155
+ setWebcamActive(false);
156
+ }
157
+ };
158
+ setupWebcam();
159
+ return () => {
160
+ if (webcamStreamRef.current) {
161
+ webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
162
+ webcamStreamRef.current = null;
163
+ }
164
+ setWebcamActive(false);
165
+ };
166
+ }, [mode]);
167
+
168
+ // Webcam mode: process frames with setInterval
169
+ useEffect(() => {
170
+ if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
171
+ let interval: ReturnType<typeof setInterval> | null = null;
172
+ interval = setInterval(() => {
173
+ processVideoFrame();
174
+ }, 1000);
175
+ return () => {
176
+ if (interval) clearInterval(interval);
177
+ };
178
+ }, [mode, isLoaded, prompt, runInference, webcamActive]);
179
+
180
+ // URL mode: process frames with setInterval
181
+ useEffect(() => {
182
+ if (mode !== "URL" || !isLoaded || !urlProcessing) return;
183
+ let interval: ReturnType<typeof setInterval> | null = null;
184
+ interval = setInterval(() => {
185
+ processVideoFrame();
186
+ }, 1000);
187
+ return () => {
188
+ if (interval) clearInterval(interval);
189
+ };
190
+ }, [mode, isLoaded, prompt, runInference, urlProcessing]);
191
+
192
+ // File video mode: process frames with setInterval
193
+ useEffect(() => {
194
+ if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
195
+ let interval: ReturnType<typeof setInterval> | null = null;
196
+ interval = setInterval(() => {
197
+ processVideoFrame();
198
+ }, 1000);
199
+ return () => {
200
+ if (interval) clearInterval(interval);
201
+ };
202
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
203
+
204
+ // Example video mode: process frames with setInterval
205
+ useEffect(() => {
206
+ if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
207
+ let interval: ReturnType<typeof setInterval> | null = null;
208
+ interval = setInterval(() => {
209
+ processVideoFrame();
210
+ }, 1000);
211
+ return () => {
212
+ if (interval) clearInterval(interval);
213
+ };
214
+ }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
215
+
216
+ // File mode: process uploaded image (only on button click)
217
+ const handleProcessImage = async () => {
218
+ if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
219
+ const img = imageRef.current;
220
+ const canvas = canvasRef.current;
221
+ canvas.width = img.naturalWidth;
222
+ canvas.height = img.naturalHeight;
223
+ setCanvasDims({w:canvas.width,h:canvas.height});
224
+ setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
225
+ const ctx = canvas.getContext("2d");
226
+ if (!ctx) return;
227
+ ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
228
+ setProcessing(true);
229
+ setError(null);
230
+ setInferenceStatus("Running inference...");
231
+ await runInference(img, prompt, (output: string) => {
232
+ setDebugOutput(output);
233
+ setInferenceStatus("Inference complete.");
234
+ ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
235
+ let boxes = extractJsonFromMarkdown(output) || [];
236
+ if (boxes.length === 0 && Array.isArray(output)) {
237
+ boxes = parseFlatBoxArray(output);
238
+ }
239
+ boxes = normalizeBoxes(boxes);
240
+ console.log("Model output:", output);
241
+ console.log("Boxes after normalization:", boxes);
242
+ console.log("Canvas size:", canvas.width, canvas.height);
243
+ if (boxes.length > 0) {
244
+ const [x1, y1, x2, y2] = boxes[0].bbox_2d;
245
+ console.log("First box coords:", x1, y1, x2, y2);
246
+ }
247
+ if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
248
+ if (Array.isArray(boxes) && boxes.length > 0) {
249
+ const scaleX = canvas.width / img.naturalWidth;
250
+ const scaleY = canvas.height / img.naturalHeight;
251
+ drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
252
+ }
253
+ setImageProcessed(true);
254
+ });
255
+ setProcessing(false);
256
+ };
257
+
258
+ // File mode: process uploaded video frames (start/stop)
259
+ const handleToggleVideoProcessing = () => {
260
+ setVideoProcessing((prev) => !prev);
261
+ };
262
+
263
+ // Handle start/stop for example video processing
264
+ const handleToggleExampleProcessing = () => {
265
+ setExampleProcessing((prev) => !prev);
266
+ };
267
+
268
+ // Handle start/stop for URL video processing
269
+ const handleToggleUrlProcessing = () => {
270
+ setUrlProcessing((prev) => !prev);
271
+ };
272
+
273
+ // Test draw box function
274
+ const handleTestDrawBox = () => {
275
+ if (!canvasRef.current) return;
276
+ const canvas = canvasRef.current;
277
+ const ctx = canvas.getContext("2d");
278
+ if (!ctx) return;
279
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
280
+ ctx.strokeStyle = "#FF00FF";
281
+ ctx.lineWidth = 4;
282
+ ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
283
+ ctx.font = "20px Arial";
284
+ ctx.fillStyle = "#FF00FF";
285
+ ctx.fillText("Test Box", 50, 35);
286
+ };
287
+
288
+ return (
289
+ <div className="absolute inset-0 text-white">
290
+ <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
291
+ {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
292
+ </div>
293
+ <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
294
+ <div className="flex flex-col items-center justify-center h-full w-full">
295
+ {/* Mode Selector */}
296
+ <div className="mb-6">
297
+ <div className="flex space-x-4">
298
+ {MODES.map((m) => (
299
+ <button
300
+ key={m}
301
+ className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
302
+ mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
303
+ }`}
304
+ onClick={() => setMode(m)}
305
+ >
306
+ {m}
307
+ </button>
308
+ ))}
309
+ </div>
310
+ </div>
311
+
312
+ {/* Mode Content */}
313
+ <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
314
+ {mode === "Webcam" && (
315
+ <div className="w-full text-center flex flex-col items-center">
316
+ <div className="mb-4 w-full max-w-xl">
317
+ <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
318
+ <textarea
319
+ className="w-full p-2 rounded-lg text-black"
320
+ rows={3}
321
+ value={prompt}
322
+ onChange={(e) => setPrompt(e.target.value)}
323
+ />
324
+ </div>
325
+ <div className="relative w-full max-w-xl">
326
+ <video
327
+ ref={videoRef}
328
+ autoPlay
329
+ muted
330
+ playsInline
331
+ className="w-full rounded-lg shadow-lg mb-2"
332
+ style={{ background: "#222" }}
333
+ />
334
+ <canvas
335
+ ref={canvasRef}
336
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
337
+ style={{ zIndex: 10, pointerEvents: "none" }}
338
+ />
339
+ </div>
340
+ {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
341
+ {error && <div className="text-red-400 mt-2">Error: {error}</div>}
342
+ </div>
343
+ )}
344
+ {mode === "URL" && (
345
+ <div className="w-full text-center flex flex-col items-center">
346
+ <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
347
+ <div className="flex w-full max-w-xl mb-4">
348
+ <input
349
+ type="text"
350
+ className="flex-1 px-4 py-2 rounded-l-lg text-black"
351
+ value={inputUrl}
352
+ onChange={(e) => setInputUrl(e.target.value)}
353
+ placeholder="Paste video URL here"
354
+ />
355
+ <button
356
+ className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
357
+ onClick={() => setVideoUrl(inputUrl)}
358
+ >
359
+ Load
360
+ </button>
361
+ </div>
362
+ <div className="mb-4 w-full max-w-xl">
363
+ <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
364
+ <textarea
365
+ className="w-full p-2 rounded-lg text-black"
366
+ rows={3}
367
+ value={prompt}
368
+ onChange={(e) => setPrompt(e.target.value)}
369
+ />
370
+ </div>
371
+ <div className="relative w-full max-w-xl">
372
+ <video
373
+ ref={videoRef}
374
+ src={videoUrl}
375
+ controls
376
+ autoPlay
377
+ loop
378
+ className="w-full rounded-lg shadow-lg mb-2"
379
+ style={{ background: "#222" }}
380
+ />
381
+ <canvas
382
+ ref={canvasRef}
383
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
384
+ style={{ zIndex: 10, pointerEvents: "none" }}
385
+ />
386
+ <button
387
+ className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
388
+ onClick={handleToggleUrlProcessing}
389
+ >
390
+ {urlProcessing ? "Stop Processing" : "Start Processing"}
391
+ </button>
392
+ </div>
393
+ {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
394
+ {error && <div className="text-red-400 mt-2">Error: {error}</div>}
395
+ <button
396
+ className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
397
+ onClick={handleTestDrawBox}
398
+ >
399
+ Test Draw Box
400
+ </button>
401
+ <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
402
+ <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
403
+ <div>Raw Model Output:</div>
404
+ <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
405
+ </div>
406
+ </div>
407
+ )}
408
+ {mode === "File" && (
409
+ <div className="w-full text-center flex flex-col items-center">
410
+ <div className="mb-4 w-full max-w-xl">
411
+ <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
412
+ <textarea
413
+ className="w-full p-2 rounded-lg text-black"
414
+ rows={3}
415
+ value={prompt}
416
+ onChange={(e) => setPrompt(e.target.value)}
417
+ />
418
+ </div>
419
+ <div className="mb-4 w-full max-w-xl">
420
+ <input
421
+ type="file"
422
+ accept="image/*,video/*"
423
+ onChange={handleFileChange}
424
+ className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
425
+ />
426
+ </div>
427
+ {/* Show uploaded image */}
428
+ {uploadedFile && isImageFile(uploadedFile) && (
429
+ <div className="relative w-full max-w-xl">
430
+ <img
431
+ ref={imageRef}
432
+ src={uploadedUrl}
433
+ alt="Uploaded"
434
+ className="w-full rounded-lg shadow-lg mb-2"
435
+ style={{ background: "#222" }}
436
+ />
437
+ <canvas
438
+ ref={canvasRef}
439
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
440
+ style={{ zIndex: 10, pointerEvents: "none" }}
441
+ />
442
+ <button
443
+ className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
444
+ onClick={handleProcessImage}
445
+ disabled={processing}
446
+ >
447
+ {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
448
+ </button>
449
+ </div>
450
+ )}
451
+ {/* Show uploaded video */}
452
+ {uploadedFile && isVideoFile(uploadedFile) && (
453
+ <div className="relative w-full max-w-xl">
454
+ <video
455
+ ref={videoRef}
456
+ src={uploadedUrl}
457
+ controls
458
+ autoPlay
459
+ loop
460
+ className="w-full rounded-lg shadow-lg mb-2"
461
+ style={{ background: "#222" }}
462
+ />
463
+ <canvas
464
+ ref={canvasRef}
465
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
466
+ style={{ zIndex: 10, pointerEvents: "none" }}
467
+ />
468
+ <button
469
+ className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
470
+ onClick={handleToggleVideoProcessing}
471
+ >
472
+ {videoProcessing ? "Stop Processing" : "Start Processing"}
473
+ </button>
474
+ </div>
475
+ )}
476
+ {/* Show example video if no file uploaded */}
477
+ {!uploadedFile && (
478
+ <div className="relative w-full max-w-xl">
479
+ <video
480
+ ref={videoRef}
481
+ src={EXAMPLE_VIDEO_URL}
482
+ controls
483
+ autoPlay
484
+ loop
485
+ className="w-full rounded-lg shadow-lg mb-2"
486
+ style={{ background: "#222" }}
487
+ />
488
+ <canvas
489
+ ref={canvasRef}
490
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
491
+ style={{ zIndex: 10, pointerEvents: "none" }}
492
+ />
493
+ <button
494
+ className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
495
+ onClick={handleToggleExampleProcessing}
496
+ >
497
+ {exampleProcessing ? "Stop Processing" : "Start Processing"}
498
+ </button>
499
+ </div>
500
+ )}
501
+ {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
502
+ {error && <div className="text-red-400 mt-2">Error: {error}</div>}
503
+ <button
504
+ className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
505
+ onClick={handleTestDrawBox}
506
+ >
507
+ Test Draw Box
508
+ </button>
509
+ <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
510
+ <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
511
+ <div>Raw Model Output:</div>
512
+ <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
513
+ </div>
514
+ </div>
515
+ )}
516
+ </div>
517
+ </div>
518
+ </div>
519
+ );
520
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
src/index.js CHANGED
@@ -14,4 +14,4 @@ root.render(
14
  // If you want to start measuring performance in your app, pass a function
15
  // to log results (for example: reportWebVitals(console.log))
16
  // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
17
- reportWebVitals();
 
14
  // If you want to start measuring performance in your app, pass a function
15
  // to log results (for example: reportWebVitals(console.log))
16
  // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
17
+ reportWebVitals();