<!-- Source: Hugging Face Space "index.html" by Xenova (HF Staff), commit 61161af (verified) -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Camera Interaction App</title>
<style>
/* Center the page content in a single vertical column. */
body {
font-family: sans-serif;
display: flex;
flex-direction: column;
align-items: center;
gap: 20px;
padding: 20px;
background-color: #f0f0f0;
}
/* Shared "card" look for the control bar and the text I/O panel. */
.controls, .io-areas {
display: flex;
gap: 10px;
align-items: center;
background-color: #fff;
padding: 15px;
border-radius: 8px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
/* The I/O panel stacks its instruction/response fields vertically. */
.io-areas {
flex-direction: column;
align-items: stretch;
}
textarea {
width: 300px;
height: 80px;
padding: 8px;
border: 1px solid #ccc;
border-radius: 4px;
font-size: 14px;
}
/* Live camera preview; black background shows while the stream loads. */
#videoFeed {
width: 480px;
height: 360px;
border: 2px solid #333;
background-color: #000;
border-radius: 8px;
}
#startButton {
padding: 10px 20px;
font-size: 16px;
cursor: pointer;
border: none;
border-radius: 4px;
color: white;
}
/* The script toggles these two classes as the loop starts/stops. */
#startButton.start {
background-color: #28a745; /* Green */
}
#startButton.stop {
background-color: #dc3545; /* Red */
}
label {
font-weight: bold;
}
select {
padding: 8px;
border-radius: 4px;
border: 1px solid #ccc;
}
/* Used for the off-screen capture canvas. */
.hidden {
display: none;
}
</style>
</head>
<body>
<h1>Camera Interaction App</h1>
<!-- playsinline keeps mobile Safari from going fullscreen; autoplay starts
the preview as soon as a stream is attached by the script. -->
<video id="videoFeed" autoplay playsinline></video>
<canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->
<div class="io-areas">
<div>
<label for="instructionText">Instruction:</label><br>
<!-- NOTE(review): inline style overrides the stylesheet's textarea size. -->
<textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
</div>
<div>
<label for="responseText">Response:</label><br>
<!-- Read-only: populated with model output by the script. -->
<textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
</div>
</div>
<div class="controls">
<label for="intervalSelect">Interval between 2 requests:</label>
<!-- Delay inserted between consecutive inference requests. -->
<select id="intervalSelect" name="Interval between 2 requests">
<option value="0" selected>0ms</option>
<option value="100">100ms</option>
<option value="250">250ms</option>
<option value="500">500ms</option>
<option value="1000">1s</option>
<option value="2000">2s</option>
</select>
<!-- Class toggles between .start/.stop to recolor the button. -->
<button id="startButton" class="start">Start</button>
</div>
<script type="module">
// In-browser vision-language demo: grabs webcam frames and runs SmolVLM
// locally via Transformers.js — no server round trips.
import {
AutoProcessor,
AutoModelForVision2Seq,
RawImage
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js';
// Cached DOM references (ids defined in the markup above).
const video = document.getElementById('videoFeed');
const canvas = document.getElementById('canvas');
const instructionText = document.getElementById('instructionText');
const responseText = document.getElementById('responseText');
const intervalSelect = document.getElementById('intervalSelect');
const startButton = document.getElementById('startButton');
instructionText.value = "In one sentence, what do you see?"; // default instruction
// Shared state: the active camera MediaStream, the run/stop flag for the
// processing loop, and the asynchronously loaded processor + model.
let stream;
let isProcessing = false;
let processor, model;
async function initModel() {
  // Download and initialize the SmolVLM processor and model once, targeting
  // WebGPU with mixed-precision weights to keep memory usage down.
  const MODEL_ID = 'HuggingFaceTB/SmolVLM-Instruct';
  processor = await AutoProcessor.from_pretrained(MODEL_ID);
  const loadOptions = {
    dtype: {
      embed_tokens: 'fp16',
      vision_encoder: 'q4',
      decoder_model_merged: 'q4'
    },
    device: "webgpu",
  };
  model = await AutoModelForVision2Seq.from_pretrained(MODEL_ID, loadOptions);
}
async function initCamera() {
  // Request a video-only stream and attach it to the preview element.
  // On failure, report the error both in the response box and via alert.
  const constraints = { video: true, audio: false };
  try {
    stream = await navigator.mediaDevices.getUserMedia(constraints);
    video.srcObject = stream;
    responseText.value = "Camera access granted. Ready to start.";
  } catch (err) {
    console.error("Error accessing camera:", err);
    responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
    alert(`Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
  }
}
function captureImage() {
  // Copy the current video frame onto the hidden canvas and wrap the raw
  // RGBA pixels in a RawImage. Returns null until the stream has a frame.
  if (!stream || !video.videoWidth) {
    console.warn("Video stream not ready for capture.");
    return null;
  }
  const { videoWidth: w, videoHeight: h } = video;
  canvas.width = w;
  canvas.height = h;
  const ctx = canvas.getContext('2d', { willReadFrequently: true });
  ctx.drawImage(video, 0, 0, w, h);
  const pixels = ctx.getImageData(0, 0, w, h);
  return new RawImage(pixels.data, pixels.width, pixels.height, 4);
}
async function runLocalVisionInference(imgElement, instruction) {
  // One image+text round trip through the local model: build a single-turn
  // chat, tokenize, generate, and decode only the newly produced tokens.
  const chat = [{
    role: 'user',
    content: [
      { type: 'image' },
      { type: 'text', text: instruction }
    ]
  }];
  const prompt = processor.apply_chat_template(chat, { add_generation_prompt: true });
  const inputs = await processor(prompt, [imgElement], { do_image_splitting: false });
  const generatedIds = await model.generate({ ...inputs, max_new_tokens: 100 });
  // Slice off the prompt tokens so the decoded text is the reply alone.
  const promptLength = inputs.input_ids.dims.at(-1);
  const decoded = processor.batch_decode(
    generatedIds.slice(null, [promptLength, null]),
    { skip_special_tokens: true }
  );
  return decoded[0].trim();
}
async function sendData() {
  // Perform one capture + inference round; no-op once processing stops.
  if (!isProcessing) return;
  const instruction = instructionText.value;
  const frame = captureImage();
  if (frame === null) {
    responseText.value = 'Capture failed';
    return;
  }
  try {
    responseText.value = await runLocalVisionInference(frame, instruction);
  } catch (e) {
    console.error(e);
    responseText.value = `Error: ${e.message}`;
  }
}
function sleep(ms) {
  // Promise-based delay used to pace the processing loop.
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
async function processingLoop() {
  // Keep sending frames until the user stops, pausing for the interval the
  // user selected when the loop was started (the select is disabled while
  // running, so reading it once here is sufficient).
  const delayMs = parseInt(intervalSelect.value, 10);
  for (;;) {
    if (!isProcessing) break;
    await sendData();
    if (!isProcessing) break;
    await sleep(delayMs);
  }
}
function handleStart() {
  // Begin the continuous capture/inference loop. Both the camera stream and
  // the model are initialized asynchronously on page load, so refuse to
  // start until each is ready — otherwise the loop would call into an
  // undefined processor/model and throw on every iteration.
  if (!stream) {
    responseText.value = "Camera not available. Cannot start.";
    alert("Camera not available. Please grant permission first.");
    return;
  }
  if (!processor || !model) {
    responseText.value = "Model is still loading. Please wait and try again.";
    return;
  }
  isProcessing = true;
  // Flip the button into its red "Stop" state and freeze the inputs so the
  // instruction/interval cannot change mid-run.
  startButton.textContent = "Stop";
  startButton.classList.replace('start', 'stop');
  instructionText.disabled = true;
  intervalSelect.disabled = true;
  responseText.value = "Processing started...";
  processingLoop();
}
function handleStop() {
  // Halt the loop (it checks isProcessing between steps) and restore the
  // idle UI state.
  isProcessing = false;
  startButton.textContent = "Start";
  startButton.classList.replace('stop', 'start');
  instructionText.disabled = false;
  intervalSelect.disabled = false;
  // Only replace the status line if no model response has overwritten it.
  const currentStatus = responseText.value;
  if (currentStatus.startsWith("Processing started...")) {
    responseText.value = "Processing stopped.";
  }
}
// Toggle between running and idle states on each click.
startButton.addEventListener('click', () => {
  if (isProcessing) {
    handleStop();
  } else {
    handleStart();
  }
});
window.addEventListener('DOMContentLoaded', async () => {
  // Load the model first, then request the camera. A failed model load
  // (e.g. no WebGPU support) previously rejected unhandled and silently
  // left the page broken; surface the error and still try the camera so
  // the user sees a working preview and a clear message.
  try {
    await initModel();
  } catch (err) {
    console.error("Error loading model:", err);
    responseText.value = `Error loading model: ${err.message}. A WebGPU-capable browser is required.`;
  }
  await initCamera();
});
window.addEventListener('beforeunload', () => {
  // Release the camera when leaving the page.
  if (stream) {
    stream.getTracks().forEach(track => track.stop());
  }
});
</script>
</body>
</html>