Spaces:
Running
Running
import React, { useState, useRef, useEffect } from "react"; | |
import { FASTVLM_BOXING_PROMPT } from "../constants"; | |
import { useVLMContext } from "../context/useVLMContext"; | |
import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator"; | |
/** Input sources the view can switch between, in the order their selector buttons are rendered. */
const MODES = ["Webcam", "URL", "File"] as const;

/** One of the entries of `MODES`. */
type Mode = (typeof MODES)[number];

/** Sample clip used as the initial value of both the URL text field and the loaded video. */
const EXAMPLE_VIDEO_URL =
  "https://dm0qx8t0i9gc9.cloudfront.net/watermarks/video/47Fj2US_gijjhliil/large-group-of-people-walking-at-city_rpem-bqvu__f51e7e41cf28b832502c9709c8eb2fd8__P360.mp4";

/** Initial text of the detection prompt editor. */
const EXAMPLE_PROMPT = "Find as many objects in the video and box them.";
/** Delay between two frame-analysis attempts, in milliseconds. */
const FRAME_INTERVAL_MS = 1000;

/** Turns an arbitrary thrown value into a printable message. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Builds the object handed to `runInference` for the frame currently painted on `canvas`.
 *
 * NOTE(review): this is not a real `HTMLVideoElement`; it only carries the canvas
 * dimensions and a `getContext` accessor, exactly as the previous inline objects did.
 * How `runInference` consumes it is not visible from this file — confirm against the
 * VLM context before changing its shape (passing the real <video> element may be the
 * better long-term fix).
 */
function buildFrameSource(
  canvas: HTMLCanvasElement,
  ctx: CanvasRenderingContext2D,
): HTMLVideoElement {
  const frameSource = {
    videoWidth: canvas.width,
    videoHeight: canvas.height,
    getContext: () => ctx,
  };
  // Single, explicit escape hatch instead of a scattered `@ts-ignore`.
  return frameSource as unknown as HTMLVideoElement;
}

/**
 * View that runs prompt-driven box detection on a video source and paints the
 * resulting boxes on a canvas overlaid on the video.
 *
 * Three modes are selectable: "Webcam" (live camera via `getUserMedia`), "URL"
 * (a video loaded from a user-supplied address) and "File" (placeholder text only).
 * While the model is loaded, one frame is analysed roughly every
 * `FRAME_INTERVAL_MS` milliseconds; a tick is skipped while a previous inference
 * is still running.
 */
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("URL");
  const [videoUrl, setVideoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [inputUrl, setInputUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [webcamActive, setWebcamActive] = useState(false);
  const videoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const webcamStreamRef = useRef<MediaStream | null>(null);
  // True while a runInference call is pending; shared across effect re-runs so a
  // prompt or mode change cannot start a second inference on top of a pending one.
  const inferenceBusyRef = useRef(false);
  const { isLoaded, runInference } = useVLMContext();

  // Webcam setup and teardown.
  useEffect(() => {
    const releaseStream = () => {
      webcamStreamRef.current?.getTracks().forEach((track) => track.stop());
      webcamStreamRef.current = null;
    };

    if (mode !== "Webcam") {
      releaseStream();
      setWebcamActive(false);
      return;
    }

    let cancelled = false;
    const acquireStream = async () => {
      try {
        setError(null);
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        if (cancelled) {
          // The mode changed (or the view unmounted) while the permission prompt
          // was pending: stop the tracks right away so the camera is not left on.
          stream.getTracks().forEach((track) => track.stop());
          return;
        }
        webcamStreamRef.current = stream;
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          setWebcamActive(true);
        }
      } catch (e) {
        if (cancelled) return;
        setError("Could not access webcam: " + describeError(e));
        setWebcamActive(false);
      }
    };
    void acquireStream();

    return () => {
      cancelled = true;
      releaseStream();
      setWebcamActive(false);
    };
  }, [mode]);

  // Frames are analysed in URL mode, and in Webcam mode once the stream is attached.
  const canProcess = isLoaded && (mode === "URL" || (mode === "Webcam" && webcamActive));

  // Periodic frame processing (shared by Webcam and URL modes).
  useEffect(() => {
    if (!canProcess) return;

    let cancelled = false;

    const processFrame = async () => {
      if (cancelled || inferenceBusyRef.current) return;
      const video = videoRef.current;
      const canvas = canvasRef.current;
      if (!video || !canvas || video.videoWidth === 0) return;
      // A paused or finished clip has no new frame to analyse.
      if (mode === "URL" && (video.paused || video.ended)) return;

      // Assigning the dimensions also resets the canvas content.
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.drawImage(video, 0, 0, canvas.width, canvas.height);

      inferenceBusyRef.current = true;
      try {
        setProcessing(true);
        setError(null);
        const result = await runInference(buildFrameSource(canvas, ctx), prompt);
        // Drop results that arrive after this effect instance was torn down.
        if (cancelled) return;
        // Repaint the current frame, then draw the parsed boxes on top of it.
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const boxes = extractJsonFromMarkdown(result) || [];
        drawBoundingBoxesOnCanvas(ctx, boxes);
      } catch (e) {
        if (!cancelled) setError(describeError(e));
      } finally {
        inferenceBusyRef.current = false;
        if (!cancelled) setProcessing(false);
      }
    };

    const timer = setInterval(() => {
      void processFrame();
    }, FRAME_INTERVAL_MS);

    return () => {
      cancelled = true;
      clearInterval(timer);
      // A pending inference no longer reports back, so reset the indicator here.
      setProcessing(false);
    };
  }, [canProcess, mode, prompt, runInference]);

  // Pieces of markup shared by the Webcam and URL layouts (only one is mounted at a time).
  const promptEditor = (
    <div className="mb-4 w-full max-w-xl">
      <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
      <textarea
        className="w-full p-2 rounded-lg text-black"
        rows={3}
        value={prompt}
        onChange={(e) => setPrompt(e.target.value)}
      />
    </div>
  );

  const overlayCanvas = (
    <canvas
      ref={canvasRef}
      className="absolute top-0 left-0 w-full h-full pointer-events-none"
      style={{ zIndex: 10, pointerEvents: "none" }}
    />
  );

  const statusLines = (
    <>
      {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
      {error && <div className="text-red-400 mt-2">Error: {error}</div>}
    </>
  );

  return (
    <div className="absolute inset-0 text-white">
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>
        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "Webcam" && (
            <div className="w-full text-center flex flex-col items-center">
              {promptEditor}
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  autoPlay
                  muted
                  playsInline
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                {overlayCanvas}
              </div>
              {statusLines}
            </div>
          )}
          {mode === "URL" && (
            <div className="w-full text-center flex flex-col items-center">
              <p className="mb-4">Enter a video stream URL (e.g., HTTP MP4, MJPEG, HLS, etc.):</p>
              <div className="flex w-full max-w-xl mb-4">
                <input
                  type="text"
                  className="flex-1 px-4 py-2 rounded-l-lg text-black"
                  value={inputUrl}
                  onChange={(e) => setInputUrl(e.target.value)}
                  placeholder="Paste video URL here"
                />
                <button
                  className="px-4 py-2 rounded-r-lg bg-blue-600 text-white font-semibold"
                  onClick={() => setVideoUrl(inputUrl)}
                >
                  Load
                </button>
              </div>
              {promptEditor}
              <div className="relative w-full max-w-xl">
                <video
                  ref={videoRef}
                  src={videoUrl}
                  controls
                  autoPlay
                  loop
                  className="w-full rounded-lg shadow-lg mb-2"
                  style={{ background: "#222" }}
                />
                {overlayCanvas}
              </div>
              {statusLines}
            </div>
          )}
          {mode === "File" && (
            <div className="w-full text-center">
              <p className="mb-4">Upload a video or image file for detection (coming soon).</p>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}