Quazim0t0 committed
Commit 98c6726 · verified · 1 Parent(s): f90ffff

Upload 50 files

.gitignore CHANGED
@@ -1,23 +1,23 @@
-# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
-
-# dependencies
-/node_modules
-/.pnp
-.pnp.js
-
-# testing
-/coverage
-
-# production
-/build
-
-# misc
-.DS_Store
-.env.local
-.env.development.local
-.env.test.local
-.env.production.local
-
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.js
+
+# testing
+/coverage
+
+# production
+/build
+
+# misc
+.DS_Store
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
public/index.html CHANGED
@@ -1,43 +1,43 @@
-<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <meta charset="utf-8" />
-    <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <meta name="theme-color" content="#000000" />
-    <meta
-      name="description"
-      content="Web site created using create-react-app"
-    />
-    <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
-    <!--
-      manifest.json provides metadata used when your web app is installed on a
-      user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
-    -->
-    <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
-    <!--
-      Notice the use of %PUBLIC_URL% in the tags above.
-      It will be replaced with the URL of the `public` folder during the build.
-      Only files inside the `public` folder can be referenced from the HTML.
-
-      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
-      work correctly both with client-side routing and a non-root public URL.
-      Learn how to configure a non-root public URL by running `npm run build`.
-    -->
-    <title>React App</title>
-  </head>
-  <body>
-    <noscript>You need to enable JavaScript to run this app.</noscript>
-    <div id="root"></div>
-    <!--
-      This HTML file is a template.
-      If you open it directly in the browser, you will see an empty page.
-
-      You can add webfonts, meta tags, or analytics to this file.
-      The build step will place the bundled scripts into the <body> tag.
-
-      To begin the development, run `npm start` or `yarn start`.
-      To create a production bundle, use `npm run build` or `yarn build`.
-    -->
-  </body>
-</html>
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <meta name="theme-color" content="#000000" />
+    <meta
+      name="description"
+      content="Web site created using create-react-app"
+    />
+    <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
+    <!--
+      manifest.json provides metadata used when your web app is installed on a
+      user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
+    -->
+    <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
+    <!--
+      Notice the use of %PUBLIC_URL% in the tags above.
+      It will be replaced with the URL of the `public` folder during the build.
+      Only files inside the `public` folder can be referenced from the HTML.
+
+      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
+      work correctly both with client-side routing and a non-root public URL.
+      Learn how to configure a non-root public URL by running `npm run build`.
+    -->
+    <title>React App</title>
+  </head>
+  <body>
+    <noscript>You need to enable JavaScript to run this app.</noscript>
+    <div id="root"></div>
+    <!--
+      This HTML file is a template.
+      If you open it directly in the browser, you will see an empty page.
+
+      You can add webfonts, meta tags, or analytics to this file.
+      The build step will place the bundled scripts into the <body> tag.
+
+      To begin the development, run `npm start` or `yarn start`.
+      To create a production bundle, use `npm run build` or `yarn build`.
+    -->
+  </body>
+</html>
public/manifest.json CHANGED
@@ -1,25 +1,25 @@
-{
-  "short_name": "React App",
-  "name": "Create React App Sample",
-  "icons": [
-    {
-      "src": "favicon.ico",
-      "sizes": "64x64 32x32 24x24 16x16",
-      "type": "image/x-icon"
-    },
-    {
-      "src": "logo192.png",
-      "type": "image/png",
-      "sizes": "192x192"
-    },
-    {
-      "src": "logo512.png",
-      "type": "image/png",
-      "sizes": "512x512"
-    }
-  ],
-  "start_url": ".",
-  "display": "standalone",
-  "theme_color": "#000000",
-  "background_color": "#ffffff"
-}
+{
+  "short_name": "React App",
+  "name": "Create React App Sample",
+  "icons": [
+    {
+      "src": "favicon.ico",
+      "sizes": "64x64 32x32 24x24 16x16",
+      "type": "image/x-icon"
+    },
+    {
+      "src": "logo192.png",
+      "type": "image/png",
+      "sizes": "192x192"
+    },
+    {
+      "src": "logo512.png",
+      "type": "image/png",
+      "sizes": "512x512"
+    }
+  ],
+  "start_url": ".",
+  "display": "standalone",
+  "theme_color": "#000000",
+  "background_color": "#ffffff"
+}
public/robots.txt CHANGED
@@ -1,3 +1,3 @@
-# https://www.robotstxt.org/robotstxt.html
-User-agent: *
-Disallow:
+# https://www.robotstxt.org/robotstxt.html
+User-agent: *
+Disallow:
src/App.css CHANGED
@@ -1,38 +1,38 @@
-.App {
-  text-align: center;
-}
-
-.App-logo {
-  height: 40vmin;
-  pointer-events: none;
-}
-
-@media (prefers-reduced-motion: no-preference) {
-  .App-logo {
-    animation: App-logo-spin infinite 20s linear;
-  }
-}
-
-.App-header {
-  background-color: #282c34;
-  min-height: 100vh;
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  font-size: calc(10px + 2vmin);
-  color: white;
-}
-
-.App-link {
-  color: #61dafb;
-}
-
-@keyframes App-logo-spin {
-  from {
-    transform: rotate(0deg);
-  }
-  to {
-    transform: rotate(360deg);
-  }
-}
+.App {
+  text-align: center;
+}
+
+.App-logo {
+  height: 40vmin;
+  pointer-events: none;
+}
+
+@media (prefers-reduced-motion: no-preference) {
+  .App-logo {
+    animation: App-logo-spin infinite 20s linear;
+  }
+}
+
+.App-header {
+  background-color: #282c34;
+  min-height: 100vh;
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  justify-content: center;
+  font-size: calc(10px + 2vmin);
+  color: white;
+}
+
+.App-link {
+  color: #61dafb;
+}
+
+@keyframes App-logo-spin {
+  from {
+    transform: rotate(0deg);
+  }
+  to {
+    transform: rotate(360deg);
+  }
+}
src/App.js CHANGED
@@ -1,25 +1,25 @@
-import logo from './logo.svg';
-import './App.css';
-
-function App() {
-  return (
-    <div className="App">
-      <header className="App-header">
-        <img src={logo} className="App-logo" alt="logo" />
-        <p>
-          Edit <code>src/App.js</code> and save to reload.
-        </p>
-        <a
-          className="App-link"
-          href="https://reactjs.org"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          Learn React
-        </a>
-      </header>
-    </div>
-  );
-}
-
-export default App;
+import logo from './logo.svg';
+import './App.css';
+
+function App() {
+  return (
+    <div className="App">
+      <header className="App-header">
+        <img src={logo} className="App-logo" alt="logo" />
+        <p>
+          Edit <code>src/App.js</code> and save to reload.
+        </p>
+        <a
+          className="App-link"
+          href="https://reactjs.org"
+          target="_blank"
+          rel="noopener noreferrer"
+        >
+          Learn React
+        </a>
+      </header>
+    </div>
+  );
+}
+
+export default App;
src/App.test.js CHANGED
@@ -1,8 +1,8 @@
-import { render, screen } from '@testing-library/react';
-import App from './App';
-
-test('renders learn react link', () => {
-  render(<App />);
-  const linkElement = screen.getByText(/learn react/i);
-  expect(linkElement).toBeInTheDocument();
-});
+import { render, screen } from '@testing-library/react';
+import App from './App';
+
+test('renders learn react link', () => {
+  render(<App />);
+  const linkElement = screen.getByText(/learn react/i);
+  expect(linkElement).toBeInTheDocument();
+});
src/App.tsx CHANGED
@@ -11,7 +11,8 @@ export default function App() {
       await loadModel();
       setStarted(true);
     } catch (e) {
-      // error is handled by context
+      // error is handled by context, could log here if needed
+      console.error("Failed to load model:", e);
     }
   };
 
@@ -27,6 +28,9 @@ export default function App() {
           {isLoading ? "Loading Model..." : "Load Model"}
         </button>
         {error && <div className="text-red-400 mt-2">Model error: {error}</div>}
+        <p className="text-sm text-gray-400 mt-2">
+          Model will download on first load. This may take a moment.
+        </p>
       </div>
     );
   }
@@ -37,4 +41,4 @@ export default function App() {
       <MultiSourceCaptioningView />
     </div>
   );
-}
+}
src/components/BoxAnnotator.ts CHANGED
@@ -16,6 +16,7 @@ export function extractJsonFromMarkdown(markdown: string): any[] | null {
     if (typeof parsed === "object" && parsed !== null) return [parsed]; // <-- Wrap object in array
     return null;
   } catch {
+    console.error("Failed to parse JSON from markdown:", jsonString);
     return null;
   }
 }
@@ -31,7 +32,15 @@ export function drawBoundingBoxesOnCanvas(
   boxes: { bbox_2d: number[]; label?: string }[],
   options?: { color?: string; lineWidth?: number; font?: string, scaleX?: number, scaleY?: number }
 ) {
-  if (!Array.isArray(boxes)) return; // Prevent errors if boxes is undefined/null
+  if (!Array.isArray(boxes)) {
+    console.warn("drawBoundingBoxesOnCanvas: 'boxes' is not an array or is null/undefined.", boxes);
+    return;
+  }
+  if (boxes.length === 0) {
+    // console.log("drawBoundingBoxesOnCanvas: 'boxes' array is empty, nothing to draw.");
+    return;
+  }
+
   const color = options?.color || "#00FF00";
   const lineWidth = options?.lineWidth || 2;
   const font = options?.font || "16px Arial";
@@ -54,9 +63,10 @@
     ctx.rect(sx1, sy1, sx2 - sx1, sy2 - sy1);
     ctx.stroke();
     if (obj.label) {
-      ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 10 ? sy1 + 16 : sy1 - 4);
+      // Adjust text position to ensure visibility, especially if near top edge
+      ctx.fillText(obj.label, sx1 + 4, sy1 - 4 < 16 ? sy1 + 16 : sy1 - 4);
    }
  });
 
  ctx.restore();
-}
+}
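
A minimal sketch of how these two exported helpers compose (the canvas lookup and the modelOutput string below are hypothetical stand-ins; only extractJsonFromMarkdown and drawBoundingBoxesOnCanvas come from the file above):

    import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

    // Hypothetical model reply: JSON wrapped in a markdown fence, as the detection prompt requests.
    const modelOutput = "```json\n[{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]\n```";

    const canvas = document.querySelector("canvas");
    const ctx = canvas?.getContext("2d");
    if (ctx) {
      // extractJsonFromMarkdown returns null when no JSON can be parsed, so default to []
      const boxes = extractJsonFromMarkdown(modelOutput) ?? [];
      // scaleX/scaleY map model pixel coordinates onto the canvas; 1 assumes matching resolution
      drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
    }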
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -1,13 +1,22 @@
-import { useState, useRef, useEffect } from "react";
+import React, { useState, useRef, useEffect, useCallback } from "react";
 import { useVLMContext } from "../context/useVLMContext";
 import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator";
 
 const MODES = ["Webcam", "URL", "File"] as const;
 type Mode = typeof MODES[number];
 
-const EXAMPLE_VIDEO_URL = "/videos/1.mp4";
+const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct
 const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]";
 
+// Helper functions (remain the same)
+function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] {
+  if (typeof arr[0] === "string" && Array.isArray(arr[1])) {
+    const label = arr[0];
+    return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox }));
+  }
+  return [];
+}
+
 function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
   if (!raw) return [];
   let boxes = [];
@@ -22,7 +31,6 @@ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
     .map((obj: any) => {
       if (!obj || !obj.bbox_2d) return null;
       let bbox = obj.bbox_2d;
-      // If bbox_2d is [[x1, y1], [x2, y2]], convert to [x1, y1, x2, y2]
       if (
         Array.isArray(bbox) &&
         bbox.length === 2 &&
@@ -33,7 +41,6 @@ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
       ) {
         bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]];
       }
-      // If bbox_2d is [x1, y1, x2, y2], use as-is
       if (
         Array.isArray(bbox) &&
         bbox.length === 4 &&
@@ -41,7 +48,6 @@ function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] {
       ) {
         return { ...obj, bbox_2d: bbox };
       }
-      // Otherwise, skip
      return null;
    })
    .filter((obj: any) => obj);
@@ -54,329 +60,365 @@ function isVideoFile(file: File) {
   return file.type.startsWith("video/");
 }
 
-// Utility to get ImageData from a video or image element
-function getImageDataFromElement(media: HTMLVideoElement | HTMLImageElement): ImageData | null {
-  const canvas = document.createElement("canvas");
-  let width = 0, height = 0;
-  if (media instanceof HTMLVideoElement) {
-    width = media.videoWidth;
-    height = media.videoHeight;
-  } else if (media instanceof HTMLImageElement) {
-    width = media.naturalWidth;
-    height = media.naturalHeight;
-  } else {
-    return null;
-  }
-  canvas.width = width;
-  canvas.height = height;
-  const ctx = canvas.getContext("2d");
-  if (!ctx) return null;
-  ctx.drawImage(media, 0, 0, width, height);
-  return ctx.getImageData(0, 0, width, height);
-}
-
 export default function MultiSourceCaptioningView() {
   const [mode, setMode] = useState<Mode>("File");
-  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
-  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
+  const [currentUrlInput, setCurrentUrlInput] = useState<string>(EXAMPLE_VIDEO_URL);
   const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
-  const [processing, setProcessing] = useState(false);
+  const [processingState, setProcessingState] = useState(false); // General processing indicator
   const [error, setError] = useState<string | null>(null);
-  const [webcamActive, setWebcamActive] = useState(false);
-  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
-  const [uploadedUrl, setUploadedUrl] = useState<string>("");
-  const [videoProcessing, setVideoProcessing] = useState(false);
-  const [imageProcessed, setImageProcessed] = useState(false);
-  const [exampleProcessing, setExampleProcessing] = useState(false);
-  const [urlProcessing, setUrlProcessing] = useState(false);
-  const [debugOutput, setDebugOutput] = useState<string>("");
-  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
-  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
+  const [mediaStream, setMediaStream] = useState<MediaStream | null>(null); // For webcam stream
+  const [latestBoxes, setLatestBoxes] = useState<any[]>([]); // State for boxes to draw
   const [inferenceStatus, setInferenceStatus] = useState<string>("");
-  const inferenceWorkerRef = useRef<Worker | null>(null);
-  const [useWorker] = useState(true);
+  const [debugOutput, setDebugOutput] = useState<string>("");
+
+  // Refs for the two video elements and the canvas
+  const displayVideoRef = useRef<HTMLVideoElement>(null); // The visible video
+  const vlmVideoRef = useRef<HTMLVideoElement>(null); // The hidden video for VLM processing
+  const canvasRef = useRef<HTMLCanvasElement>(null); // The canvas overlay for drawing boxes
+  const imageRef = useRef<HTMLImageElement>(null); // For image file processing
 
-  const videoRef = useRef<HTMLVideoElement | null>(null);
-  const canvasRef = useRef<HTMLCanvasElement | null>(null);
-  const imageRef = useRef<HTMLImageElement | null>(null);
-  const webcamStreamRef = useRef<MediaStream | null>(null);
   const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();
 
-  useEffect(() => {
-    if (useWorker) {
-      inferenceWorkerRef.current = new Worker(
-        new URL('../workers/inferenceWorker.ts', import.meta.url),
-        { type: 'module' }
-      );
+  // --- Drawing Loop for the Visible Display ---
+  // This loop runs constantly to draw the latest boxes on the display video
+  const drawDisplayCanvas = useCallback(() => {
+    const displayVideo = displayVideoRef.current;
+    const canvas = canvasRef.current;
+    const ctx = canvas?.getContext('2d');
+
+    if (!displayVideo || !canvas || !ctx) {
+      return;
    }
-    return () => {
-      inferenceWorkerRef.current?.terminate();
-      inferenceWorkerRef.current = null;
-    };
-  }, [useWorker]);
-
-  // Helper to run inference in worker
-  const runInferenceInWorker = (media: HTMLVideoElement | HTMLImageElement, prompt: string) => {
-    return new Promise((resolve, reject) => {
-      if (!inferenceWorkerRef.current) return reject('No worker');
-      const imageData = getImageDataFromElement(media);
-      if (!imageData) return reject('Could not get image data');
-      inferenceWorkerRef.current.onmessage = (event) => resolve(event.data);
-      inferenceWorkerRef.current.onerror = (err) => reject(err);
-      inferenceWorkerRef.current.postMessage({ imageData, prompt });
-    });
-  };
 
-  const processVideoFrame = async () => {
-    if (!videoRef.current || !canvasRef.current) return;
-    const video = videoRef.current;
-    const canvas = canvasRef.current;
-    if (video.paused || video.ended || video.videoWidth === 0) return;
-    canvas.width = video.videoWidth;
-    canvas.height = video.videoHeight;
-    const ctx = canvas.getContext("2d");
-    if (!ctx) return;
-    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
-    if (useWorker && inferenceWorkerRef.current) {
-      try {
-        const output = await runInferenceInWorker(video, prompt);
-        setDebugOutput(JSON.stringify(output, null, 2));
-        let boxes = normalizeBoxes(output);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / video.videoWidth;
-          const scaleY = canvas.height / video.videoHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-        }
-      } catch (err) {
-        setInferenceStatus("Worker inference failed, falling back to main thread.");
-        // fallback to main-thread inference
-        await runInference(video, prompt, (output: string) => {
-          setDebugOutput(output);
-          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-          if (Array.isArray(boxes) && boxes.length > 0) {
-            const scaleX = canvas.width / video.videoWidth;
-            const scaleY = canvas.height / video.videoHeight;
-            ctx.clearRect(0, 0, canvas.width, canvas.height);
-            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-          }
-        });
-      }
-    } else {
-      await runInference(video, prompt, (output: string) => {
-        setDebugOutput(output);
-        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / video.videoWidth;
-          const scaleY = canvas.height / video.videoHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-        }
-      });
+    // Adjust canvas size to match the display video's dimensions
+    if (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight) {
+      canvas.width = displayVideo.videoWidth;
+      canvas.height = displayVideo.videoHeight;
    }
-  };
 
-  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
-    const file = e.target.files?.[0] || null;
-    setUploadedFile(file);
-    setUploadedUrl(file ? URL.createObjectURL(file) : "");
-    setError(null);
-    setImageProcessed(false);
-    setVideoProcessing(false);
-    setExampleProcessing(false);
-  };
+    // Clear the canvas each frame
+    ctx.clearRect(0, 0, canvas.width, canvas.height);
 
-  // Webcam setup and teardown (unchanged)
-  useEffect(() => {
-    if (mode !== "Webcam") {
-      if (webcamStreamRef.current) {
-        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
-        webcamStreamRef.current = null;
-      }
-      setWebcamActive(false);
-      return;
+    // Draw the latest bounding boxes
+    const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero
+    const scaleY = canvas.height / (displayVideo.videoHeight || 1);
+    drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+
+    // Only request next frame if video is playing to avoid unnecessary redraws when paused/ended
+    if (!displayVideo.paused && !displayVideo.ended) {
+      requestAnimationFrame(drawDisplayCanvas);
    }
-    const setupWebcam = async () => {
-      try {
-        setError(null);
-        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
-        webcamStreamRef.current = stream;
-        if (videoRef.current) {
-          videoRef.current.srcObject = stream;
-          setWebcamActive(true);
+  }, [latestBoxes]); // Re-create if latestBoxes changes
+
+  // Effect to start the display drawing loop when the display video is ready
+  useEffect(() => {
+    const displayVideo = displayVideoRef.current;
+    if (displayVideo) {
+      const handleVideoReady = () => {
+        // Start the requestAnimationFrame loop once the video has loaded metadata
+        if (displayVideo.readyState >= 1) { // HAVE_METADATA
+          requestAnimationFrame(drawDisplayCanvas);
        }
-      } catch (e) {
-        setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
-        setWebcamActive(false);
-      }
-    };
-    setupWebcam();
-    return () => {
-      if (webcamStreamRef.current) {
-        webcamStreamRef.current.getTracks().forEach((track: MediaStreamTrack) => track.stop());
-        webcamStreamRef.current = null;
+      };
+      displayVideo.addEventListener('loadedmetadata', handleVideoReady);
+      // Also check if video is already ready (e.g., on component re-mount)
+      if (displayVideo.readyState >= 1) {
+        requestAnimationFrame(drawDisplayCanvas);
      }
-      setWebcamActive(false);
-    };
-  }, [mode]);
+      return () => {
+        displayVideo.removeEventListener('loadedmetadata', handleVideoReady);
+      };
+    }
+  }, [drawDisplayCanvas]);
 
-  // Webcam mode: process frames with setInterval
+  // --- FastVLM Processing Loop (from hidden video/image) ---
+  // This interval loop controls when FastVLM processes a frame
  useEffect(() => {
-    if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
+    const vlmVideo = vlmVideoRef.current;
+    const isVideoMode = (mode === "Webcam" || (mode === "URL" && vlmVideo?.src) || (mode === "File" && vlmVideo?.src && isVideoFile(uploadedFile || null)));
+
+    if (!isLoaded || !vlmVideo || !isVideoMode) {
+      // If not in a video mode or VLM/video not ready, ensure processing stops
+      setProcessingState(false);
+      return;
+    }
+
    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
-    return () => {
-      if (interval) clearInterval(interval);
+
+    const startVLMProcessing = () => {
+      if (interval) clearInterval(interval); // Clear any old interval
+
+      interval = setInterval(async () => {
+        if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) {
+          return; // Skip if video not ready, paused, ended, or already processing
+        }
+
+        setProcessingState(true);
+        setInferenceStatus("Running inference...");
+        setError(null); // Clear previous errors
+
+        try {
+          // Create a temporary offscreen canvas to get image data from the VLM video
+          const tempCanvas = document.createElement('canvas');
+          tempCanvas.width = vlmVideo.videoWidth;
+          tempCanvas.height = vlmVideo.videoHeight;
+          const tempCtx = tempCanvas.getContext('2d', { willReadFrequently: true });
+
+          if (tempCtx && vlmVideo.readyState >= 2) { // HAVE_CURRENT_DATA
+            tempCtx.drawImage(vlmVideo, 0, 0, tempCanvas.width, tempCanvas.height);
+            const imageData = tempCtx.getImageData(0, 0, tempCanvas.width, tempCanvas.height);
+
+            const modelOutput = await runInference(imageData, prompt); // Pass ImageData
+            setDebugOutput(modelOutput); // Update raw model output
+
+            let boxes = extractJsonFromMarkdown(modelOutput) || [];
+            if (boxes.length === 0 && Array.isArray(modelOutput)) { // Fallback for direct array output
+              // This condition `Array.isArray(modelOutput)` is unlikely if modelOutput is string,
+              // so ensure `extractJsonFromMarkdown` is robust or `runInference` returns expected string
+            }
+            boxes = normalizeBoxes(boxes);
+
+            setLatestBoxes(boxes); // Update state, triggers display canvas redraw
+            setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected.");
+          } else {
+            setInferenceStatus("Video not ready for processing.");
+          }
+        } catch (e) {
+          setError("Inference error: " + (e instanceof Error ? e.message : String(e)));
+          setLatestBoxes([]);
+          setInferenceStatus("Inference failed.");
+        } finally {
+          setProcessingState(false); // Processing finished
+        }
+      }, 200); // Inference interval (e.g., 5 frames per second)
    };
-  }, [mode, isLoaded, prompt, runInference, webcamActive]);
 
-  // URL mode: process frames with setInterval
-  useEffect(() => {
-    if (mode !== "URL" || !isLoaded || !urlProcessing) return;
-    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
-    return () => {
+    const stopVLMProcessing = () => {
      if (interval) clearInterval(interval);
+      interval = null;
+      setProcessingState(false);
+      setInferenceStatus("Stopped processing.");
    };
-  }, [mode, isLoaded, prompt, runInference, urlProcessing]);
 
-  // File video mode: process frames with setInterval
-  useEffect(() => {
-    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
-    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
+    // Start/stop processing based on video playback events
+    vlmVideo.addEventListener('play', startVLMProcessing);
+    vlmVideo.addEventListener('pause', stopVLMProcessing);
+    vlmVideo.addEventListener('ended', stopVLMProcessing);
+
+    // Initial check if video is already playing (e.g., after initial load/autoplay)
+    if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) {
+      startVLMProcessing();
+    }
+
+    // Cleanup function for useEffect
    return () => {
-      if (interval) clearInterval(interval);
+      stopVLMProcessing();
+      vlmVideo.removeEventListener('play', startVLMProcessing);
+      vlmVideo.removeEventListener('pause', stopVLMProcessing);
+      vlmVideo.removeEventListener('ended', stopVLMProcessing);
    };
-  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);
+  }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Added uploadedFile for file mode re-trigger
 
-  // Example video mode: process frames with setInterval
+  // --- Media Source Handling ---
+
+  // Cleanup for media stream and object URLs
+  const cleanupMediaSource = useCallback(() => {
+    if (mediaStream) {
+      mediaStream.getTracks().forEach(track => track.stop());
+      setMediaStream(null);
+    }
+    // Revoke any created blob URLs (for file inputs)
+    if (displayVideoRef.current?.src.startsWith('blob:')) {
+      URL.revokeObjectURL(displayVideoRef.current.src);
+      displayVideoRef.current.src = "";
+    }
+    if (vlmVideoRef.current?.src.startsWith('blob:')) {
+      URL.revokeObjectURL(vlmVideoRef.current.src);
+      vlmVideoRef.current.src = "";
+    }
+    setLatestBoxes([]); // Clear boxes when source changes
+    setError(null);
+    setInferenceStatus("");
+    setDebugOutput("");
+  }, [mediaStream]);
+
+  // Handle changing the mode (Webcam, URL, File)
  useEffect(() => {
-    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
-    let interval: ReturnType<typeof setInterval> | null = null;
-    interval = setInterval(() => {
-      processVideoFrame();
-    }, 1000);
-    return () => {
-      if (interval) clearInterval(interval);
-    };
-  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
+    cleanupMediaSource(); // Clean up previous source
 
-  // File mode: process uploaded image (only on button click)
-  const handleProcessImage = async () => {
-    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
-    const img = imageRef.current;
-    const canvas = canvasRef.current;
-    canvas.width = img.naturalWidth;
-    canvas.height = img.naturalHeight;
-    setCanvasDims({w:canvas.width,h:canvas.height});
-    setVideoDims({w:img.naturalWidth,h:img.naturalHeight});
-    const ctx = canvas.getContext("2d");
-    if (!ctx) return;
-    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-    setProcessing(true);
+    const displayVideo = displayVideoRef.current;
+    const vlmVideo = vlmVideoRef.current;
+
+    if (!displayVideo || !vlmVideo) return;
+
+    // Reset srcObject/src to ensure fresh start
+    displayVideo.srcObject = null;
+    vlmVideo.srcObject = null;
+    displayVideo.src = "";
+    vlmVideo.src = "";
+
+    setLatestBoxes([]); // Clear boxes on mode change
    setError(null);
-    setInferenceStatus("Running inference...");
-    if (useWorker && inferenceWorkerRef.current) {
-      try {
-        const output = await runInferenceInWorker(img, prompt);
-        setDebugOutput(JSON.stringify(output, null, 2));
-        setInferenceStatus("Inference complete.");
-        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-        let boxes = normalizeBoxes(output);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / img.naturalWidth;
-          const scaleY = canvas.height / img.naturalHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+    setInferenceStatus("");
+    setDebugOutput("");
+
+    // Special handling for initial file mode to load example video
+    if (mode === "File" && !uploadedFile) {
+      displayVideo.src = EXAMPLE_VIDEO_URL;
+      vlmVideo.src = EXAMPLE_VIDEO_URL;
+      displayVideo.load(); vlmVideo.load(); // Load the video
+      displayVideo.play().catch(e => console.error("Error playing example display video:", e));
+      vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e));
+    }
+  }, [mode, uploadedFile, cleanupMediaSource]); // Added uploadedFile to ensure re-trigger for file mode
+
+  // Handle Webcam Input
+  const handleWebcamInput = useCallback(async () => {
+    cleanupMediaSource(); // Clean up any active stream
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+      setMediaStream(stream); // Store stream to manage it
+
+      if (displayVideoRef.current && vlmVideoRef.current) {
+        displayVideoRef.current.srcObject = stream;
+        vlmVideoRef.current.srcObject = stream;
+        // Programmatically play both videos
+        displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
+        vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
+      }
+      setMode("Webcam");
+    } catch (e) {
+      setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e)));
+      setMediaStream(null);
+      setLatestBoxes([]);
+      setInferenceStatus("Webcam access denied or failed.");
+    }
+  }, [cleanupMediaSource]);
+
+  // Handle URL Input (when Load button is clicked)
+  const handleLoadUrl = useCallback(() => {
+    cleanupMediaSource(); // Clean up any active stream
+
+    const url = currentUrlInput;
+    if (!url) {
+      setError("Please enter a valid URL.");
+      return;
+    }
+
+    if (displayVideoRef.current && vlmVideoRef.current) {
+      displayVideoRef.current.src = url;
+      vlmVideoRef.current.src = url;
+      displayVideoRef.current.load(); vlmVideoRef.current.load(); // Load the video
+      displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
+      vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
+      setMode("URL");
+    }
+  }, [currentUrlInput, cleanupMediaSource]);
+
+  // Handle File Input
+  const handleFileChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
+    cleanupMediaSource(); // Clean up any active stream
+
+    const file = e.target.files?.[0] || null;
+    if (file) {
+      const fileUrl = URL.createObjectURL(file); // Create blob URL for the file
+      // Store the file to differentiate image/video and manage its URL
+      setUploadedFile(file);
+
+      if (isImageFile(file)) {
+        // For images, we handle processing on a button click, not a continuous loop
+        // The imageRef will display the image
+        // The canvas will be used for processing and drawing
+        setError(null);
+        setMode("File");
+      } else if (isVideoFile(file)) {
+        if (displayVideoRef.current && vlmVideoRef.current) {
+          displayVideoRef.current.src = fileUrl;
+          vlmVideoRef.current.src = fileUrl;
+          displayVideoRef.current.load(); vlmVideoRef.current.load();
+          displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e));
+          vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e));
+          setMode("File");
        }
-        setImageProcessed(true);
-      } catch (err) {
-        setInferenceStatus("Worker inference failed, falling back to main thread.");
-        // fallback to main-thread inference
-        await runInference(img, prompt, (output: string) => {
-          setDebugOutput(output);
-          setInferenceStatus("Inference complete.");
-          ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-          let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-          if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-          if (Array.isArray(boxes) && boxes.length > 0) {
-            const scaleX = canvas.width / img.naturalWidth;
-            const scaleY = canvas.height / img.naturalHeight;
-            ctx.clearRect(0, 0, canvas.width, canvas.height);
-            drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
-          }
-          setImageProcessed(true);
-        });
+      } else {
+        setError("Unsupported file type. Please upload an image or video.");
+        setUploadedFile(null);
+        if (fileUrl) URL.revokeObjectURL(fileUrl); // Clean up invalid file URL
      }
    } else {
-      await runInference(img, prompt, (output: string) => {
-        setDebugOutput(output);
-        setInferenceStatus("Inference complete.");
-        ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
-        let boxes = normalizeBoxes(extractJsonFromMarkdown(output) || []);
-        if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
-        if (Array.isArray(boxes) && boxes.length > 0) {
-          const scaleX = canvas.width / img.naturalWidth;
-          const scaleY = canvas.height / img.naturalHeight;
-          ctx.clearRect(0, 0, canvas.width, canvas.height);
-          drawBoundingBoxesOnCanvas(ctx, boxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY });
+      setUploadedFile(null); // Clear file if nothing selected
+      // If no file selected, revert to example video if in File mode
+      if (mode === "File") {
+        if (displayVideoRef.current && vlmVideoRef.current) {
+          displayVideoRef.current.src = EXAMPLE_VIDEO_URL;
+          vlmVideoRef.current.src = EXAMPLE_VIDEO_URL;
+          displayVideoRef.current.load(); vlmVideoRef.current.load();
+          displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e));
+          vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e));
        }
-        setImageProcessed(true);
-      });
+      }
    }
-    setProcessing(false);
-  };
+  }, [cleanupMediaSource, mode]);
 
-  // File mode: process uploaded video frames (start/stop)
-  const handleToggleVideoProcessing = () => {
-    setVideoProcessing((prev) => !prev);
-  };
-
-  // Handle start/stop for example video processing
-  const handleToggleExampleProcessing = () => {
-    setExampleProcessing((prev) => !prev);
-  };
 
-  // Handle start/stop for URL video processing
-  const handleToggleUrlProcessing = () => {
-    setUrlProcessing((prev) => !prev);
-  };
+  // Handler for processing an uploaded image file (one-time inference)
+  const handleProcessImage = async () => {
+    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) {
+      setError("Image or model not ready for processing.");
+      return;
+    }
 
-  // Test draw box function
-  const handleTestDrawBox = () => {
-    if (!canvasRef.current) return;
+    const img = imageRef.current;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
-    ctx.clearRect(0, 0, canvas.width, canvas.height);
-    ctx.strokeStyle = "#FF00FF";
-    ctx.lineWidth = 4;
-    ctx.strokeRect(40, 40, Math.max(40,canvas.width/4), Math.max(40,canvas.height/4));
-    ctx.font = "20px Arial";
-    ctx.fillStyle = "#FF00FF";
-    ctx.fillText("Test Box", 50, 35);
+
+    // Ensure canvas dimensions match image for processing and display
+    canvas.width = img.naturalWidth;
+    canvas.height = img.naturalHeight;
+
+    setProcessingState(true);
+    setError(null);
+    setInferenceStatus("Running image inference...");
+
+    try {
+      // Draw image to canvas to get ImageData for inference
+      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+      const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
+
+      const modelOutput = await runInference(imageData, prompt);
+      setDebugOutput(modelOutput);
+      setInferenceStatus("Image inference complete.");
+
+      // Clear canvas and redraw image before drawing boxes
+      ctx.clearRect(0, 0, canvas.width, canvas.height);
+      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
+
+      let boxes = extractJsonFromMarkdown(modelOutput) || [];
+      boxes = normalizeBoxes(boxes);
+      setLatestBoxes(boxes); // Update latestBoxes for display
+
+      if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected.");
+    } catch (e) {
+      setError("Image inference error: " + (e instanceof Error ? e.message : String(e)));
+      setLatestBoxes([]);
+      setInferenceStatus("Image inference failed.");
+    } finally {
+      setProcessingState(false);
+    }
  };
 
+  // --- Rendered UI ---
  return (
-    <div className="absolute inset-0 text-white">
+    <div className="absolute inset-0 text-white flex flex-col">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
-      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
-      <div className="flex flex-col items-center justify-center h-full w-full">
+      <div className="text-center text-sm text-blue-300 mt-10">{inferenceStatus}</div> {/* Adjusted top margin */}
+
+      <div className="flex flex-col items-center justify-center flex-1 w-full p-4"> {/* Added padding */}
        {/* Mode Selector */}
-        <div className="mb-6">
+        <div className="mb-6 mt-4"> {/* Increased margin-top for selector */}
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
@@ -385,6 +427,7 @@ export default function MultiSourceCaptioningView() {
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
+                disabled={!isLoaded && m !== "File"} // Disable if model not loaded, except for initial file view
              >
                {m}
              </button>
@@ -392,212 +435,130 @@ export default function MultiSourceCaptioningView() {
          </div>
        </div>
 
-        {/* Mode Content */}
-        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
-          {mode === "Webcam" && (
-            <div className="w-full text-center flex flex-col items-center">
-              <div className="mb-4 w-full max-w-xl">
-                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-                <textarea
-                  className="w-full p-2 rounded-lg text-black"
-                  rows={3}
-                  value={prompt}
-                  onChange={(e) => setPrompt(e.target.value)}
-                />
-              </div>
-              <div className="relative w-full max-w-xl">
-                <video
-                  ref={videoRef}
-                  autoPlay
-                  muted
-                  playsInline
-                  className="w-full rounded-lg shadow-lg mb-2"
-                  style={{ background: "#222" }}
-                />
-                <canvas
-                  ref={canvasRef}
-                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                  style={{ zIndex: 10, pointerEvents: "none" }}
-                />
-              </div>
-              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
-              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
-            </div>
-          )}
-          {mode === "URL" && (
-            <div className="w-full text-center flex flex-col items-center">
-              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
-              <div className="flex w-full max-w-xl mb-4">
-                <input
-                  type="text"
-                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
-                  value={inputUrl}
-                  onChange={(e) => setInputUrl(e.target.value)}
-                  placeholder="Paste video URL here"
-                />
-                <button
-                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
-                  onClick={() => setVideoUrl(inputUrl)}
-                >
-                  Load
-                </button>
-              </div>
-              <div className="mb-4 w-full max-w-xl">
-                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-                <textarea
-                  className="w-full p-2 rounded-lg text-black"
-                  rows={3}
-                  value={prompt}
-                  onChange={(e) => setPrompt(e.target.value)}
-                />
-              </div>
-              <div className="relative w-full max-w-xl">
-                <video
-                  ref={videoRef}
-                  src={videoUrl}
-                  controls
-                  autoPlay
-                  loop
-                  className="w-full rounded-lg shadow-lg mb-2"
-                  style={{ background: "#222" }}
-                />
-                <canvas
-                  ref={canvasRef}
-                  className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                  style={{ zIndex: 10, pointerEvents: "none" }}
-                />
-                <button
-                  className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
-                  onClick={handleToggleUrlProcessing}
-                >
-                  {urlProcessing ? "Stop Processing" : "Start Processing"}
-                </button>
-              </div>
-              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
-              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
+        {/* Dynamic Content Area */}
+        <div className="w-full max-w-4xl flex-1 flex flex-col items-center justify-center relative">
+          {/* Prompt Input (Common to all modes) */}
+          <div className="mb-4 w-full max-w-xl">
+            <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
+            <textarea
+              className="w-full p-2 rounded-lg text-black"
+              rows={3}
+              value={prompt}
+              onChange={(e) => setPrompt(e.target.value)}
+              disabled={processingState}
+            />
+          </div>
+
+          {/* Video/Image Display and Canvas Overlay */}
+          <div className="relative w-full" style={{ maxWidth: '1280px', aspectRatio: '16/9', backgroundColor: '#000', display: 'flex', justifyContent: 'center', alignItems: 'center' }}>
+            {/* Conditional rendering for image vs video display */}
+            {mode === "File" && uploadedFile && isImageFile(uploadedFile) ? (
+              <img
+                ref={imageRef}
+                src={URL.createObjectURL(uploadedFile)} // Use object URL for display
+                alt="Uploaded"
+                className="max-w-full max-h-full block object-contain"
+                style={{ position: 'absolute' }}
+                onLoad={() => {
+                  // This is important to ensure canvas matches image size for single image processing
+                  if (imageRef.current && canvasRef.current) {
+                    canvasRef.current.width = imageRef.current.naturalWidth;
+                    canvasRef.current.height = imageRef.current.naturalHeight;
+                  }
+                }}
+              />
+            ) : (
+              <video
+                ref={displayVideoRef}
+                autoPlay
+                muted
+                playsInline
+                loop // Loop for URL and File videos
+                className="max-w-full max-h-full block object-contain"
+                style={{ position: 'absolute' }}
+              />
+            )}
+            <canvas
+              ref={canvasRef}
+              className="absolute top-0 left-0 w-full h-full pointer-events-none"
+              style={{ zIndex: 10 }}
+            />
+          </div>
+
+          {/* Controls specific to each mode */}
+          <div className="mt-4 flex flex-col items-center gap-2">
+            {mode === "Webcam" && (
              <button
-                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
-                onClick={handleTestDrawBox}
+                className="px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
+                onClick={handleWebcamInput} // This button sets up/starts webcam
+                disabled={processingState || !isLoaded}
              >
-                Test Draw Box
+                {mediaStream ? "Restart Webcam" : "Start Webcam"} 📸
              </button>
-              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
-                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
-                <div>Raw Model Output:</div>
-                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
-              </div>
-            </div>
-          )}
-          {mode === "File" && (
-            <div className="w-full text-center flex flex-col items-center">
-              <div className="mb-4 w-full max-w-xl">
-                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
-                <textarea
-                  className="w-full p-2 rounded-lg text-black"
-                  rows={3}
-                  value={prompt}
-                  onChange={(e) => setPrompt(e.target.value)}
-                />
-              </div>
-              <div className="mb-4 w-full max-w-xl">
+            )}
+
+            {mode === "URL" && (
+              <>
+                <div className="flex w-full max-w-xl">
+                  <input
+                    type="text"
+                    className="flex-1 px-4 py-2 rounded-l-lg text-black"
+                    value={currentUrlInput}
+                    onChange={(e) => setCurrentUrlInput(e.target.value)}
+                    placeholder="Paste video URL here"
+                    disabled={processingState}
+                  />
+                  <button
+                    className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
+                    onClick={handleLoadUrl}
+                    disabled={processingState || !isLoaded}
+                  >
+                    Load URL
+                  </button>
+                </div>
+              </>
+            )}
+
+            {mode === "File" && (
+              <>
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
-                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
+                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700 disabled:opacity-50"
+                  disabled={processingState}
                />
-              </div>
-              {/* Show uploaded image */}
-              {uploadedFile && isImageFile(uploadedFile) && (
-                <div className="relative w-full max-w-xl">
-                  <img
-                    ref={imageRef}
-                    src={uploadedUrl}
-                    alt="Uploaded"
-                    className="w-full rounded-lg shadow-lg mb-2"
-                    style={{ background: "#222" }}
-                  />
-                  <canvas
-                    ref={canvasRef}
-                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                    style={{ zIndex: 10, pointerEvents: "none" }}
-                  />
+                {uploadedFile && isImageFile(uploadedFile) && (
                  <button
-                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
+                    className="mt-2 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold hover:bg-blue-700 disabled:opacity-50"
                    onClick={handleProcessImage}
-                    disabled={processing}
+                    disabled={processingState || !isLoaded}
                  >
-                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
+                    {processingState ? "Processing Image..." : "Process Image"}
                  </button>
-                </div>
-              )}
-              {/* Show uploaded video */}
-              {uploadedFile && isVideoFile(uploadedFile) && (
-                <div className="relative w-full max-w-xl">
-                  <video
-                    ref={videoRef}
-                    src={uploadedUrl}
-                    controls
-                    autoPlay
-                    loop
-                    className="w-full rounded-lg shadow-lg mb-2"
-                    style={{ background: "#222" }}
-                  />
-                  <canvas
-                    ref={canvasRef}
-                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                    style={{ zIndex: 10, pointerEvents: "none" }}
-                  />
-                  <button
-                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
-                    onClick={handleToggleVideoProcessing}
-                  >
-                    {videoProcessing ? "Stop Processing" : "Start Processing"}
-                  </button>
-                </div>
-              )}
-              {/* Show example video if no file uploaded */}
-              {!uploadedFile && (
-                <div className="relative w-full max-w-xl">
-                  <video
-                    ref={videoRef}
-                    src={EXAMPLE_VIDEO_URL}
-                    controls
-                    autoPlay
-                    loop
-                    className="w-full rounded-lg shadow-lg mb-2"
-                    style={{ background: "#222" }}
-                  />
-                  <canvas
-                    ref={canvasRef}
-                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
-                    style={{ zIndex: 10, pointerEvents: "none" }}
-                  />
-                  <button
-                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
-                    onClick={handleToggleExampleProcessing}
-                  >
-                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
-                  </button>
-                </div>
-              )}
-              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
-              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
-              <button
-                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
-                onClick={handleTestDrawBox}
-              >
-                Test Draw Box
-              </button>
-              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
-                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
-                <div>Raw Model Output:</div>
-                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
-              </div>
-            </div>
-          )}
+                )}
+              </>
+            )}
+          </div>
+
+          {/* Error and Debug Output */}
+          {error && <div className="text-red-400 mt-2 text-center">{error}</div>}
+          <div className="mt-4 p-2 bg-gray-800 rounded text-xs w-full max-w-xl">
+            <div>Raw Model Output:</div>
+            <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
+          </div>
        </div>
      </div>
+
+      {/* Hidden Video for VLM processing - this must be rendered always */}
+      <video
+        ref={vlmVideoRef}
+        autoPlay
+        muted
+        playsInline
+        loop // Loop for URL and File videos
+        style={{ display: 'none' }} // Hidden from view
+      />
    </div>
  );
-}
+}
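
The rewrite's core pattern is a hidden video element sampled on a setInterval for inference, feeding a requestAnimationFrame loop that repaints the visible overlay with the latest boxes. A stripped-down sketch of that pattern, assuming runInference accepts ImageData and resolves to a string as in the diff (the helper names and wiring here are illustrative, not the component's actual code):

    import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./components/BoxAnnotator";

    declare function runInference(frame: ImageData, prompt: string): Promise<string>; // stand-in for the context function

    let latestBoxes: { label: string; bbox_2d: number[] }[] = [];
    let busy = false; // drop frames while an inference is in flight

    function startLoops(vlmVideo: HTMLVideoElement, overlay: HTMLCanvasElement, prompt: string) {
      const grab = document.createElement("canvas"); // offscreen canvas for frame capture

      // Inference loop: sample the hidden video about 5x/second, matching the diff's 200 ms interval.
      setInterval(async () => {
        if (busy || vlmVideo.paused || vlmVideo.videoWidth === 0) return;
        busy = true;
        try {
          grab.width = vlmVideo.videoWidth;
          grab.height = vlmVideo.videoHeight;
          const g = grab.getContext("2d", { willReadFrequently: true })!;
          g.drawImage(vlmVideo, 0, 0);
          const out = await runInference(g.getImageData(0, 0, grab.width, grab.height), prompt);
          latestBoxes = extractJsonFromMarkdown(out) ?? [];
        } finally {
          busy = false;
        }
      }, 200);

      // Draw loop: repaint the overlay every animation frame with whatever boxes are newest.
      const ctx = overlay.getContext("2d")!;
      (function draw() {
        ctx.clearRect(0, 0, overlay.width, overlay.height);
        drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, scaleX: 1, scaleY: 1 });
        requestAnimationFrame(draw);
      })();
    }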
src/index.js CHANGED
@@ -1,17 +1,17 @@
-import React from 'react';
-import ReactDOM from 'react-dom/client';
-import './index.css';
-import App from './App';
-import reportWebVitals from './reportWebVitals';
-
-const root = ReactDOM.createRoot(document.getElementById('root'));
-root.render(
-  <React.StrictMode>
-    <App />
-  </React.StrictMode>
-);
-
-// If you want to start measuring performance in your app, pass a function
-// to log results (for example: reportWebVitals(console.log))
-// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
-reportWebVitals();
+import React from 'react';
+import ReactDOM from 'react-dom/client';
+import './index.css';
+import App from './App';
+import reportWebVitals from './reportWebVitals';
+
+const root = ReactDOM.createRoot(document.getElementById('root'));
+root.render(
+  <React.StrictMode>
+    <App />
+  </React.StrictMode>
+);
+
+// If you want to start measuring performance in your app, pass a function
+// to log results (for example: reportWebVitals(console.log))
+// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
+reportWebVitals();
src/reportWebVitals.js CHANGED
@@ -1,13 +1,13 @@
1
- const reportWebVitals = onPerfEntry => {
2
- if (onPerfEntry && onPerfEntry instanceof Function) {
3
- import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4
- getCLS(onPerfEntry);
5
- getFID(onPerfEntry);
6
- getFCP(onPerfEntry);
7
- getLCP(onPerfEntry);
8
- getTTFB(onPerfEntry);
9
- });
10
- }
11
- };
12
-
13
- export default reportWebVitals;
 
1
+ const reportWebVitals = onPerfEntry => {
2
+ if (onPerfEntry && onPerfEntry instanceof Function) {
3
+ import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4
+ getCLS(onPerfEntry);
5
+ getFID(onPerfEntry);
6
+ getFCP(onPerfEntry);
7
+ getLCP(onPerfEntry);
8
+ getTTFB(onPerfEntry);
9
+ });
10
+ }
11
+ };
12
+
13
+ export default reportWebVitals;
src/setupTests.js CHANGED
@@ -1,5 +1,5 @@
1
- // jest-dom adds custom jest matchers for asserting on DOM nodes.
2
- // allows you to do things like:
3
- // expect(element).toHaveTextContent(/react/i)
4
- // learn more: https://github.com/testing-library/jest-dom
5
- import '@testing-library/jest-dom';
 
1
+ // jest-dom adds custom jest matchers for asserting on DOM nodes.
2
+ // allows you to do things like:
3
+ // expect(element).toHaveTextContent(/react/i)
4
+ // learn more: https://github.com/testing-library/jest-dom
5
+ import '@testing-library/jest-dom';