Xenova HF Staff commited on
Commit
61161af
·
verified ·
1 Parent(s): 26e9e10

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +254 -23
index.html CHANGED
@@ -1,29 +1,260 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
3
-
4
  <head>
5
- <meta charset="UTF-8" />
6
- <link rel="stylesheet" href="style.css" />
7
-
8
- <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9
- <title>Transformers.js - Object Detection</title>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  </head>
11
-
12
  <body>
13
- <h1>Object Detection w/ 🤗 Transformers.js</h1>
14
- <label id="container" for="upload">
15
- <svg width="25" height="25" viewBox="0 0 25 25" fill="none" xmlns="http://www.w3.org/2000/svg">
16
- <path fill="#000"
17
- d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z">
18
- </path>
19
- </svg>
20
- Click to upload image
21
- <label id="example">(or try example)</label>
22
- </label>
23
- <label id="status">Loading model...</label>
24
- <input id="upload" type="file" accept="image/*" />
25
-
26
- <script src="index.js" type="module"></script>
27
- </body>
28
 
29
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Camera Interaction App</title>
    <style>
        /* Centered single-column layout; panels rendered as white cards. */
        body {
            font-family: sans-serif;
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 20px;
            padding: 20px;
            background-color: #f0f0f0;
        }
        /* Shared card chrome for the control row and the text I/O panel. */
        .controls, .io-areas {
            display: flex;
            gap: 10px;
            align-items: center;
            background-color: #fff;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
        }
        /* The I/O panel stacks its children vertically, full width. */
        .io-areas {
            flex-direction: column;
            align-items: stretch;
        }
        textarea {
            width: 300px;
            height: 80px;
            padding: 8px;
            border: 1px solid #ccc;
            border-radius: 4px;
            font-size: 14px;
        }
        /* Live camera preview; black background while the stream loads. */
        #videoFeed {
            width: 480px;
            height: 360px;
            border: 2px solid #333;
            background-color: #000;
            border-radius: 8px;
        }
        #startButton {
            padding: 10px 20px;
            font-size: 16px;
            cursor: pointer;
            border: none;
            border-radius: 4px;
            color: white;
        }
        /* Button color reflects state: green = idle/start, red = running/stop. */
        #startButton.start {
            background-color: #28a745; /* Green */
        }
        #startButton.stop {
            background-color: #dc3545; /* Red */
        }
        label {
            font-weight: bold;
        }
        select {
            padding: 8px;
            border-radius: 4px;
            border: 1px solid #ccc;
        }
        /* Utility: keeps the frame-capture canvas out of the visible layout. */
        .hidden {
            display: none;
        }
    </style>
</head>
 
72
<body>

    <h1>Camera Interaction App</h1>

    <!-- Live webcam preview; playsinline avoids fullscreen takeover on iOS. -->
    <video id="videoFeed" autoplay playsinline></video>
    <canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->

    <!-- Instruction input (editable) and model response output (read-only). -->
    <div class="io-areas">
        <div>
            <label for="instructionText">Instruction:</label><br>
            <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
        </div>
        <div>
            <label for="responseText">Response:</label><br>
            <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
        </div>
    </div>

    <!-- Pause between successive capture/inference rounds, plus the Start/Stop toggle. -->
    <div class="controls">
        <label for="intervalSelect">Interval between 2 requests:</label>
        <select id="intervalSelect" name="Interval between 2 requests">
            <option value="0" selected>0ms</option>
            <option value="100">100ms</option>
            <option value="250">250ms</option>
            <option value="500">500ms</option>
            <option value="1000">1s</option>
            <option value="2000">2s</option>
        </select>
        <button id="startButton" class="start">Start</button>
    </div>

103
<script type="module">
// In-browser inference via Transformers.js, loaded from the jsDelivr CDN.
import {
    AutoProcessor,
    AutoModelForVision2Seq,
    RawImage
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js';

// DOM handles used by the capture/inference loop below.
const video = document.getElementById('videoFeed');
const canvas = document.getElementById('canvas');
const instructionText = document.getElementById('instructionText');
const responseText = document.getElementById('responseText');
const intervalSelect = document.getElementById('intervalSelect');
const startButton = document.getElementById('startButton');

instructionText.value = "In one sentence, what do you see?"; // default instruction

let stream;               // MediaStream from getUserMedia; set by initCamera()
let isProcessing = false; // true while the capture/inference loop is running

let processor, model;     // populated by initModel(); used by runLocalVisionInference()
124
/**
 * Load the SmolVLM processor and model onto WebGPU with mixed quantization,
 * populating the module-level `processor` and `model` bindings.
 */
async function initModel() {
    const MODEL_ID = 'HuggingFaceTB/SmolVLM-Instruct';
    const loadOptions = {
        dtype: {
            embed_tokens: 'fp16',
            vision_encoder: 'q4',
            decoder_model_merged: 'q4',
        },
        device: 'webgpu',
    };
    processor = await AutoProcessor.from_pretrained(MODEL_ID);
    model = await AutoModelForVision2Seq.from_pretrained(MODEL_ID, loadOptions);
}
136
+
137
/**
 * Request webcam access and wire the resulting stream into the <video>
 * element. On failure, reports the error in the response box and via alert().
 */
async function initCamera() {
    const constraints = { video: true, audio: false };
    try {
        stream = await navigator.mediaDevices.getUserMedia(constraints);
        video.srcObject = stream;
        responseText.value = "Camera access granted. Ready to start.";
    } catch (error) {
        console.error("Error accessing camera:", error);
        responseText.value = `Error accessing camera: ${error.name} - ${error.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
        alert(`Error accessing camera: ${error.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
    }
}
148
+
149
/**
 * Grab the current video frame as a RawImage (RGBA pixel data, 4 channels).
 * Returns null when the stream is not ready yet (no stream, or width still 0).
 */
function captureImage() {
    if (!stream || !video.videoWidth) {
        console.warn("Video stream not ready for capture.");
        return null;
    }
    // Only resize the backing canvas when the video dimensions change:
    // assigning width/height clears the canvas and reallocates its buffer,
    // which is wasted work on every frame of a fixed-size stream.
    if (canvas.width !== video.videoWidth || canvas.height !== video.videoHeight) {
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
    }
    const context = canvas.getContext('2d', { willReadFrequently: true });
    context.drawImage(video, 0, 0, canvas.width, canvas.height);
    const frame = context.getImageData(0, 0, canvas.width, canvas.height);
    return new RawImage(frame.data, frame.width, frame.height, 4);
}
161
+
162
// Run one round of vision-language inference on a captured frame.
// imgElement: RawImage frame from captureImage(); instruction: the user's
// prompt text. Returns the model's decoded reply, trimmed.
async function runLocalVisionInference(imgElement, instruction) {
    // Chat-style message: an image placeholder followed by the instruction.
    const messages = [{
        role: 'user',
        content: [
            { type: 'image' },
            { type: 'text', text: instruction }
        ]
    }];
    const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
    const inputs = await processor(text, [imgElement], { do_image_splitting: false });
    const generatedIds = await model.generate({ ...inputs, max_new_tokens: 100 });
    // Slice off the prompt tokens (input_ids length) so only the newly
    // generated tokens are decoded.
    const output = processor.batch_decode(
        generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
        { skip_special_tokens: true }
    );
    return output[0].trim();
}
179
+
180
/**
 * Capture one frame, run inference with the current instruction, and show
 * the reply (or an error message) in the response box. No-op when the loop
 * has been stopped.
 */
async function sendData() {
    if (!isProcessing) return;

    const prompt = instructionText.value;
    const frame = captureImage();
    if (!frame) {
        responseText.value = 'Capture failed';
        return;
    }

    try {
        responseText.value = await runLocalVisionInference(frame, prompt);
    } catch (e) {
        console.error(e);
        responseText.value = `Error: ${e.message}`;
    }
}
196
+
197
/** Promise-based delay helper: resolves (with undefined) after `ms` milliseconds. */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
200
+
201
/**
 * Repeatedly capture + infer until isProcessing is cleared, pausing for the
 * selected interval between rounds. The interval is read once at loop start
 * (the select is disabled while running, so it cannot change mid-loop).
 */
async function processingLoop() {
    const delayMs = parseInt(intervalSelect.value, 10);
    for (;;) {
        if (!isProcessing) break;
        await sendData();
        if (!isProcessing) break;
        await sleep(delayMs);
    }
}
209
+
210
/**
 * Start the capture/inference loop: lock the instruction and interval
 * inputs, flip the button into its red "Stop" state, and kick off
 * processingLoop(). Refuses to start when the camera is unavailable.
 */
function handleStart() {
    if (!stream) {
        responseText.value = "Camera not available. Cannot start.";
        alert("Camera not available. Please grant permission first.");
        return;
    }
    isProcessing = true;
    startButton.textContent = "Stop";
    startButton.classList.replace('start', 'stop');

    instructionText.disabled = true;
    intervalSelect.disabled = true;

    responseText.value = "Processing started...";

    // Fire-and-forget on purpose: the loop runs until handleStop() clears
    // isProcessing. Catch unexpected rejections so they don't go unhandled
    // and leave the UI stuck in the "Stop" state.
    processingLoop().catch((err) => {
        console.error("Processing loop failed:", err);
        handleStop();
        responseText.value = `Error: ${err.message}`;
    });
}
227
+
228
/**
 * Stop the loop and restore the idle UI state.
 */
function handleStop() {
    isProcessing = false;

    // Re-enable the inputs that were locked while processing.
    instructionText.disabled = false;
    intervalSelect.disabled = false;

    // Flip the button back to its green "Start" state.
    startButton.textContent = "Start";
    startButton.classList.replace('stop', 'start');

    if (responseText.value.startsWith("Processing started...")) {
        responseText.value = "Processing stopped.";
    }
}
239
+
240
// Single button toggles the processing loop on and off.
startButton.addEventListener('click', () => {
    if (isProcessing) {
        handleStop();
    } else {
        handleStart();
    }
});

// Load the model first (slow), then request camera access. Surface model
// load failures (e.g. no WebGPU support, network error) instead of letting
// the rejection go unhandled and leaving the page silently broken.
window.addEventListener('DOMContentLoaded', async () => {
    try {
        await initModel();
    } catch (err) {
        console.error("Error loading model:", err);
        responseText.value = `Error loading model: ${err.message}`;
        return;
    }
    await initCamera(); // handles and reports its own errors
});

// Release the camera when leaving the page.
window.addEventListener('beforeunload', () => {
    if (stream) {
        stream.getTracks().forEach(track => track.stop());
    }
});
258
+ </script>
259
+ </body>
260
+ </html>