Quazim0t0 committed on
Commit
5bcc8b4
·
verified ·
1 Parent(s): d6ad5d9

Upload 36 files

Browse files
src/components/MultiSourceCaptioningView.tsx CHANGED
@@ -9,21 +9,31 @@ const EXAMPLE_VIDEO_URL =
9
  "https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";
10
  const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";
11
 
 
 
 
 
 
 
 
12
  export default function MultiSourceCaptioningView() {
13
- const [mode, setMode] = useState<Mode>("URL");
14
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
15
  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
16
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
17
  const [processing, setProcessing] = useState(false);
18
  const [error, setError] = useState<string | null>(null);
19
  const [webcamActive, setWebcamActive] = useState(false);
 
 
20
 
21
  const videoRef = useRef<HTMLVideoElement | null>(null);
22
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
 
23
  const webcamStreamRef = useRef<MediaStream | null>(null);
24
  const { isLoaded, runInference } = useVLMContext();
25
 
26
- // Webcam setup and teardown
27
  useEffect(() => {
28
  if (mode !== "Webcam") {
29
  if (webcamStreamRef.current) {
@@ -57,7 +67,7 @@ export default function MultiSourceCaptioningView() {
57
  };
58
  }, [mode]);
59
 
60
- // Process webcam frames
61
  useEffect(() => {
62
  if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
63
  let interval: ReturnType<typeof setInterval> | null = null;
@@ -74,16 +84,13 @@ export default function MultiSourceCaptioningView() {
74
  try {
75
  setProcessing(true);
76
  setError(null);
77
- // Use FastVLM inference on the current frame
78
  const fakeVideo = {
79
  videoWidth: canvas.width,
80
  videoHeight: canvas.height,
81
  getContext: () => ctx,
82
  } as unknown as HTMLVideoElement;
83
  const result = await runInference(fakeVideo, prompt);
84
- // Clear canvas and redraw frame
85
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
86
- // Parse and draw boxes
87
  const boxes = extractJsonFromMarkdown(result) || [];
88
  drawBoundingBoxesOnCanvas(ctx, boxes);
89
  } catch (e) {
@@ -100,7 +107,7 @@ export default function MultiSourceCaptioningView() {
100
  };
101
  }, [mode, isLoaded, prompt, runInference, webcamActive]);
102
 
103
- // Process video frames for URL mode
104
  useEffect(() => {
105
  if (mode !== "URL" || !isLoaded) return;
106
  let interval: ReturnType<typeof setInterval> | null = null;
@@ -117,16 +124,13 @@ export default function MultiSourceCaptioningView() {
117
  try {
118
  setProcessing(true);
119
  setError(null);
120
- // Use FastVLM inference on the current frame
121
  const fakeVideo = {
122
  videoWidth: canvas.width,
123
  videoHeight: canvas.height,
124
  getContext: () => ctx,
125
  } as unknown as HTMLVideoElement;
126
  const result = await runInference(fakeVideo, prompt);
127
- // Clear canvas and redraw frame
128
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
129
- // Parse and draw boxes
130
  const boxes = extractJsonFromMarkdown(result) || [];
131
  drawBoundingBoxesOnCanvas(ctx, boxes);
132
  } catch (e) {
@@ -143,6 +147,86 @@ export default function MultiSourceCaptioningView() {
143
  };
144
  }, [mode, isLoaded, prompt, runInference]);
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  return (
147
  <div className="absolute inset-0 text-white">
148
  <div className="flex flex-col items-center justify-center h-full w-full">
@@ -243,8 +327,81 @@ export default function MultiSourceCaptioningView() {
243
  </div>
244
  )}
245
  {mode === "File" && (
246
- <div className="w-full text-center">
247
- <p className="mb-4">Upload a video or image file for detection (coming soon).</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  </div>
249
  )}
250
  </div>
 
9
  "https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";
10
  const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";
11
 
12
/**
 * Reports whether an uploaded file is an image.
 *
 * @param file - The file to classify.
 * @returns `true` when `file.type` starts with "image/", otherwise `false`.
 */
// NOTE(review): relies solely on `file.type`; a file whose type string is empty
// would be classified as not-an-image — confirm that is acceptable.
+ function isImageFile(file: File) {
13
+ return file.type.startsWith("image/");
14
+ }
15
/**
 * Reports whether an uploaded file is a video.
 *
 * @param file - The file to classify.
 * @returns `true` when `file.type` starts with "video/", otherwise `false`.
 */
// NOTE(review): relies solely on `file.type`; a file whose type string is empty
// would be classified as not-a-video — confirm that is acceptable.
+ function isVideoFile(file: File) {
16
+ return file.type.startsWith("video/");
17
+ }
18
+
19
  export default function MultiSourceCaptioningView() {
20
+ const [mode, setMode] = useState<Mode>("File");
21
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
22
  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
23
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
24
  const [processing, setProcessing] = useState(false);
25
  const [error, setError] = useState<string | null>(null);
26
  const [webcamActive, setWebcamActive] = useState(false);
27
+ const [uploadedFile, setUploadedFile] = useState<File | null>(null);
28
+ const [uploadedUrl, setUploadedUrl] = useState<string>("");
29
 
30
  const videoRef = useRef<HTMLVideoElement | null>(null);
31
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
32
+ const imageRef = useRef<HTMLImageElement | null>(null);
33
  const webcamStreamRef = useRef<MediaStream | null>(null);
34
  const { isLoaded, runInference } = useVLMContext();
35
 
36
+ // Webcam setup and teardown (unchanged)
37
  useEffect(() => {
38
  if (mode !== "Webcam") {
39
  if (webcamStreamRef.current) {
 
67
  };
68
  }, [mode]);
69
 
70
+ // Process webcam frames (unchanged)
71
  useEffect(() => {
72
  if (mode !== "Webcam" || !isLoaded || !webcamActive) return;
73
  let interval: ReturnType<typeof setInterval> | null = null;
 
84
  try {
85
  setProcessing(true);
86
  setError(null);
 
87
  const fakeVideo = {
88
  videoWidth: canvas.width,
89
  videoHeight: canvas.height,
90
  getContext: () => ctx,
91
  } as unknown as HTMLVideoElement;
92
  const result = await runInference(fakeVideo, prompt);
 
93
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
 
94
  const boxes = extractJsonFromMarkdown(result) || [];
95
  drawBoundingBoxesOnCanvas(ctx, boxes);
96
  } catch (e) {
 
107
  };
108
  }, [mode, isLoaded, prompt, runInference, webcamActive]);
109
 
110
+ // Process video frames for URL mode (unchanged)
111
  useEffect(() => {
112
  if (mode !== "URL" || !isLoaded) return;
113
  let interval: ReturnType<typeof setInterval> | null = null;
 
124
  try {
125
  setProcessing(true);
126
  setError(null);
 
127
  const fakeVideo = {
128
  videoWidth: canvas.width,
129
  videoHeight: canvas.height,
130
  getContext: () => ctx,
131
  } as unknown as HTMLVideoElement;
132
  const result = await runInference(fakeVideo, prompt);
 
133
  ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
 
134
  const boxes = extractJsonFromMarkdown(result) || [];
135
  drawBoundingBoxesOnCanvas(ctx, boxes);
136
  } catch (e) {
 
147
  };
148
  }, [mode, isLoaded, prompt, runInference]);
149
 
150
+ // File mode: process uploaded image
151
// Effect for File mode with an uploaded image. It only proceeds when the mode
// is "File", the model is loaded, and the uploaded file is an image. It then
// assigns an `onload` handler on the <img> element that sizes the canvas to the
// image's natural dimensions, draws the image, runs inference, and overlays the
// parsed bounding boxes.
//
// NOTE(review): the handler is assigned only when this effect runs. If the image
// has already finished loading by then, or the effect re-runs because only
// `prompt` changed (same src), the load event presumably will not fire again and
// no inference happens — confirm; checking `img.complete` would cover that case.
// NOTE(review): the effect returns no cleanup, so the `onload` handler and any
// in-flight inference outlive a dependency change or unmount — confirm intended.
+ useEffect(() => {
152
+ if (mode !== "File" || !isLoaded || !uploadedFile || !isImageFile(uploadedFile)) return;
153
+ const img = imageRef.current;
154
+ const canvas = canvasRef.current;
155
+ if (!img || !canvas) return;
156
+ img.onload = async () => {
157
// Match the canvas backing size to the image's intrinsic pixel size.
+ canvas.width = img.naturalWidth;
158
+ canvas.height = img.naturalHeight;
159
+ const ctx = canvas.getContext("2d");
160
+ if (!ctx) return;
161
+ ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
162
+ try {
163
+ setProcessing(true);
164
+ setError(null);
165
// Minimal stand-in object cast to HTMLVideoElement: it exposes only the
// canvas dimensions and a getContext() returning the 2D context.
// Presumably runInference reads the frame through these members — verify
// against the useVLMContext implementation.
+ const fakeVideo = {
166
+ videoWidth: canvas.width,
167
+ videoHeight: canvas.height,
168
+ getContext: () => ctx,
169
+ } as unknown as HTMLVideoElement;
170
+ const result = await runInference(fakeVideo, prompt);
171
// Repaint the image before drawing the boxes on top of it.
+ ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
172
// Fall back to an empty list when extraction returns a falsy value.
+ const boxes = extractJsonFromMarkdown(result) || [];
173
+ drawBoundingBoxesOnCanvas(ctx, boxes);
174
+ } catch (e) {
175
+ setError(e instanceof Error ? e.message : String(e));
176
+ } finally {
177
+ setProcessing(false);
178
+ }
179
+ };
180
+ }, [mode, isLoaded, prompt, runInference, uploadedFile]);
181
+
182
+ // File mode: process uploaded video frames
183
// Effect for File mode with an uploaded video. It only proceeds when the mode
// is "File", the model is loaded, and the uploaded file is a video. It starts a
// 1000 ms interval that copies the current video frame to the canvas, runs
// inference, and overlays the parsed bounding boxes. The cleanup clears the
// interval.
//
// NOTE(review): the interval calls the async processFrame without waiting for
// the previous call to settle, so a slow inference can overlap the next tick —
// consider an in-flight guard.
// NOTE(review): the cleanup stops new ticks but does not cancel an inference
// already awaiting; its result would still be drawn and state still set.
// NOTE(review): because `uploadedFile` must be set, the example video rendered
// when no file is uploaded is not processed by this effect — confirm intended.
+ useEffect(() => {
184
+ if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile)) return;
185
+ let interval: ReturnType<typeof setInterval> | null = null;
186
+ const processFrame = async () => {
187
+ if (!videoRef.current || !canvasRef.current) return;
188
+ const video = videoRef.current;
189
+ const canvas = canvasRef.current;
190
// Skip this tick when the video is paused, has ended, or reports zero width.
+ if (video.paused || video.ended || video.videoWidth === 0) return;
191
// Match the canvas backing size to the video's reported frame size.
+ canvas.width = video.videoWidth;
192
+ canvas.height = video.videoHeight;
193
+ const ctx = canvas.getContext("2d");
194
+ if (!ctx) return;
195
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
196
+ try {
197
+ setProcessing(true);
198
+ setError(null);
199
// Minimal stand-in object cast to HTMLVideoElement: it exposes only the
// canvas dimensions and a getContext() returning the 2D context.
// Presumably runInference reads the frame through these members — verify
// against the useVLMContext implementation.
+ const fakeVideo = {
200
+ videoWidth: canvas.width,
201
+ videoHeight: canvas.height,
202
+ getContext: () => ctx,
203
+ } as unknown as HTMLVideoElement;
204
+ const result = await runInference(fakeVideo, prompt);
205
// Repaint with the video's current frame before drawing the boxes; this
// is drawn after the await, so it may differ from the frame drawn above.
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
206
// Fall back to an empty list when extraction returns a falsy value.
+ const boxes = extractJsonFromMarkdown(result) || [];
207
+ drawBoundingBoxesOnCanvas(ctx, boxes);
208
+ } catch (e) {
209
+ setError(e instanceof Error ? e.message : String(e));
210
+ } finally {
211
+ setProcessing(false);
212
+ }
213
+ };
214
+ interval = setInterval(() => {
215
+ processFrame();
216
+ }, 1000);
217
+ return () => {
218
+ if (interval) clearInterval(interval);
219
+ };
220
+ }, [mode, isLoaded, prompt, runInference, uploadedFile]);
221
+
222
+ // Handle file upload
223
/**
 * Change handler for the file input: stores the first selected file (or null
 * when none is selected), stores an object URL for it (or "" when there is no
 * file), and clears any displayed error.
 *
 * @param e - Change event from the `<input type="file">` element.
 */
// NOTE(review): the object URL created here is never revoked in the visible
// code, so each new selection leaves the previous URL allocated — consider
// calling URL.revokeObjectURL on the old value when replacing it and on unmount.
+ const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
224
+ const file = e.target.files?.[0] || null;
225
+ setUploadedFile(file);
226
+ setUploadedUrl(file ? URL.createObjectURL(file) : "");
227
+ setError(null);
228
+ };
229
+
230
  return (
231
  <div className="absolute inset-0 text-white">
232
  <div className="flex flex-col items-center justify-center h-full w-full">
 
327
  </div>
328
  )}
329
  {mode === "File" && (
330
+ <div className="w-full text-center flex flex-col items-center">
331
+ <div className="mb-4 w-full max-w-xl">
332
+ <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
333
+ <textarea
334
+ className="w-full p-2 rounded-lg text-black"
335
+ rows={3}
336
+ value={prompt}
337
+ onChange={(e) => setPrompt(e.target.value)}
338
+ />
339
+ </div>
340
+ <div className="mb-4 w-full max-w-xl">
341
+ <input
342
+ type="file"
343
+ accept="image/*,video/*"
344
+ onChange={handleFileChange}
345
+ className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
346
+ />
347
+ </div>
348
+ {/* Show uploaded image */}
349
+ {uploadedFile && isImageFile(uploadedFile) && (
350
+ <div className="relative w-full max-w-xl">
351
+ <img
352
+ ref={imageRef}
353
+ src={uploadedUrl}
354
+ alt="Uploaded"
355
+ className="w-full rounded-lg shadow-lg mb-2"
356
+ style={{ background: "#222" }}
357
+ />
358
+ <canvas
359
+ ref={canvasRef}
360
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
361
+ style={{ zIndex: 10, pointerEvents: "none" }}
362
+ />
363
+ </div>
364
+ )}
365
+ {/* Show uploaded video */}
366
+ {uploadedFile && isVideoFile(uploadedFile) && (
367
+ <div className="relative w-full max-w-xl">
368
+ <video
369
+ ref={videoRef}
370
+ src={uploadedUrl}
371
+ controls
372
+ autoPlay
373
+ loop
374
+ className="w-full rounded-lg shadow-lg mb-2"
375
+ style={{ background: "#222" }}
376
+ />
377
+ <canvas
378
+ ref={canvasRef}
379
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
380
+ style={{ zIndex: 10, pointerEvents: "none" }}
381
+ />
382
+ </div>
383
+ )}
384
+ {/* Show example video if no file uploaded */}
385
+ {!uploadedFile && (
386
+ <div className="relative w-full max-w-xl">
387
+ <video
388
+ ref={videoRef}
389
+ src={EXAMPLE_VIDEO_URL}
390
+ controls
391
+ autoPlay
392
+ loop
393
+ className="w-full rounded-lg shadow-lg mb-2"
394
+ style={{ background: "#222" }}
395
+ />
396
+ <canvas
397
+ ref={canvasRef}
398
+ className="absolute top-0 left-0 w-full h-full pointer-events-none"
399
+ style={{ zIndex: 10, pointerEvents: "none" }}
400
+ />
401
+ </div>
402
+ )}
403
+ {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
404
+ {error && <div className="text-red-400 mt-2">Error: {error}</div>}
405
  </div>
406
  )}
407
  </div>