Xenova HF Staff commited on
Commit
61161af
·
verified ·
1 Parent(s): 26e9e10

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +254 -23
index.html CHANGED
@@ -1,29 +1,260 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
3
-
4
  <head>
5
- <meta charset="UTF-8" />
6
- <link rel="stylesheet" href="style.css" />
7
-
8
- <meta name="viewport" content="width=device-width, initial-scale=1.0" />
9
- <title>Transformers.js - Object Detection</title>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  </head>
11
-
12
  <body>
13
- <h1>Object Detection w/ 🤗 Transformers.js</h1>
14
- <label id="container" for="upload">
15
- <svg width="25" height="25" viewBox="0 0 25 25" fill="none" xmlns="http://www.w3.org/2000/svg">
16
- <path fill="#000"
17
- d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z">
18
- </path>
19
- </svg>
20
- Click to upload image
21
- <label id="example">(or try example)</label>
22
- </label>
23
- <label id="status">Loading model...</label>
24
- <input id="upload" type="file" accept="image/*" />
25
-
26
- <script src="index.js" type="module"></script>
27
- </body>
28
 
29
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Camera Interaction App</title>
    <style>
        /* Centered single-column layout; panels rendered as white cards. */
        body {
            font-family: sans-serif;
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 20px;
            padding: 20px;
            background-color: #f0f0f0;
        }
        /* Shared card chrome for the control row and the text I/O panel. */
        .controls, .io-areas {
            display: flex;
            gap: 10px;
            align-items: center;
            background-color: #fff;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
        }
        /* The I/O panel stacks its children vertically, full width. */
        .io-areas {
            flex-direction: column;
            align-items: stretch;
        }
        textarea {
            width: 300px;
            height: 80px;
            padding: 8px;
            border: 1px solid #ccc;
            border-radius: 4px;
            font-size: 14px;
        }
        /* Live camera preview; black background while the stream loads. */
        #videoFeed {
            width: 480px;
            height: 360px;
            border: 2px solid #333;
            background-color: #000;
            border-radius: 8px;
        }
        #startButton {
            padding: 10px 20px;
            font-size: 16px;
            cursor: pointer;
            border: none;
            border-radius: 4px;
            color: white;
        }
        /* Button color reflects state: green = idle/start, red = running/stop. */
        #startButton.start {
            background-color: #28a745; /* Green */
        }
        #startButton.stop {
            background-color: #dc3545; /* Red */
        }
        label {
            font-weight: bold;
        }
        select {
            padding: 8px;
            border-radius: 4px;
            border: 1px solid #ccc;
        }
        /* Utility: keeps the frame-capture canvas out of the visible layout. */
        .hidden {
            display: none;
        }
    </style>
</head>
 
72
<body>

    <h1>Camera Interaction App</h1>

    <!-- Live webcam preview; playsinline avoids fullscreen takeover on iOS. -->
    <video id="videoFeed" autoplay playsinline></video>
    <canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->

    <!-- Instruction input (editable) and model response output (read-only). -->
    <div class="io-areas">
        <div>
            <label for="instructionText">Instruction:</label><br>
            <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
        </div>
        <div>
            <label for="responseText">Response:</label><br>
            <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
        </div>
    </div>

    <!-- Pause between successive capture/inference rounds, plus the Start/Stop toggle. -->
    <div class="controls">
        <label for="intervalSelect">Interval between 2 requests:</label>
        <select id="intervalSelect" name="Interval between 2 requests">
            <option value="0" selected>0ms</option>
            <option value="100">100ms</option>
            <option value="250">250ms</option>
            <option value="500">500ms</option>
            <option value="1000">1s</option>
            <option value="2000">2s</option>
        </select>
        <button id="startButton" class="start">Start</button>
    </div>

103
<script type="module">
// In-browser inference via Transformers.js, loaded from the jsDelivr CDN.
import {
    AutoProcessor,
    AutoModelForVision2Seq,
    RawImage
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js';

// DOM handles used by the capture/inference loop below.
const video = document.getElementById('videoFeed');
const canvas = document.getElementById('canvas');
const instructionText = document.getElementById('instructionText');
const responseText = document.getElementById('responseText');
const intervalSelect = document.getElementById('intervalSelect');
const startButton = document.getElementById('startButton');

instructionText.value = "In one sentence, what do you see?"; // default instruction

let stream;               // MediaStream from getUserMedia; set by initCamera()
let isProcessing = false; // true while the capture/inference loop is running

let processor, model;     // populated by initModel(); used by runLocalVisionInference()
124
/**
 * Load the SmolVLM processor and model onto WebGPU with mixed quantization,
 * populating the module-level `processor` and `model` bindings.
 */
async function initModel() {
    const MODEL_ID = 'HuggingFaceTB/SmolVLM-Instruct';
    const loadOptions = {
        dtype: {
            embed_tokens: 'fp16',
            vision_encoder: 'q4',
            decoder_model_merged: 'q4',
        },
        device: 'webgpu',
    };
    processor = await AutoProcessor.from_pretrained(MODEL_ID);
    model = await AutoModelForVision2Seq.from_pretrained(MODEL_ID, loadOptions);
}
136
+
137
/**
 * Request webcam access and wire the resulting stream into the <video>
 * element. On failure, reports the error in the response box and via alert().
 */
async function initCamera() {
    const constraints = { video: true, audio: false };
    try {
        stream = await navigator.mediaDevices.getUserMedia(constraints);
        video.srcObject = stream;
        responseText.value = "Camera access granted. Ready to start.";
    } catch (error) {
        console.error("Error accessing camera:", error);
        responseText.value = `Error accessing camera: ${error.name} - ${error.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
        alert(`Error accessing camera: ${error.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
    }
}
148
+
149
/**
 * Grab the current video frame as a RawImage (RGBA pixel data, 4 channels).
 * Returns null when the stream is not ready yet (no stream, or width still 0).
 */
function captureImage() {
    if (!stream || !video.videoWidth) {
        console.warn("Video stream not ready for capture.");
        return null;
    }
    // Only resize the backing canvas when the video dimensions change:
    // assigning width/height clears the canvas and reallocates its buffer,
    // which is wasted work on every frame of a fixed-size stream.
    if (canvas.width !== video.videoWidth || canvas.height !== video.videoHeight) {
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
    }
    const context = canvas.getContext('2d', { willReadFrequently: true });
    context.drawImage(video, 0, 0, canvas.width, canvas.height);
    const frame = context.getImageData(0, 0, canvas.width, canvas.height);
    return new RawImage(frame.data, frame.width, frame.height, 4);
}
161
+
162
// Run one round of vision-language inference on a captured frame.
// imgElement: RawImage frame from captureImage(); instruction: the user's
// prompt text. Returns the model's decoded reply, trimmed.
async function runLocalVisionInference(imgElement, instruction) {
    // Chat-style message: an image placeholder followed by the instruction.
    const messages = [{
        role: 'user',
        content: [
            { type: 'image' },
            { type: 'text', text: instruction }
        ]
    }];
    const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
    const inputs = await processor(text, [imgElement], { do_image_splitting: false });
    const generatedIds = await model.generate({ ...inputs, max_new_tokens: 100 });
    // Slice off the prompt tokens (input_ids length) so only the newly
    // generated tokens are decoded.
    const output = processor.batch_decode(
        generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
        { skip_special_tokens: true }
    );
    return output[0].trim();
}
179
+
180
/**
 * Capture one frame, run inference with the current instruction, and show
 * the reply (or an error message) in the response box. No-op when the loop
 * has been stopped.
 */
async function sendData() {
    if (!isProcessing) return;

    const prompt = instructionText.value;
    const frame = captureImage();
    if (!frame) {
        responseText.value = 'Capture failed';
        return;
    }

    try {
        responseText.value = await runLocalVisionInference(frame, prompt);
    } catch (e) {
        console.error(e);
        responseText.value = `Error: ${e.message}`;
    }
}
196
+
197
/** Promise-based delay helper: resolves (with undefined) after `ms` milliseconds. */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
200
+
201
/**
 * Repeatedly capture + infer until isProcessing is cleared, pausing for the
 * selected interval between rounds. The interval is read once at loop start
 * (the select is disabled while running, so it cannot change mid-loop).
 */
async function processingLoop() {
    const delayMs = parseInt(intervalSelect.value, 10);
    for (;;) {
        if (!isProcessing) break;
        await sendData();
        if (!isProcessing) break;
        await sleep(delayMs);
    }
}
209
+
210
/**
 * Start the capture/inference loop: lock the instruction and interval
 * inputs, flip the button into its red "Stop" state, and kick off
 * processingLoop(). Refuses to start when the camera is unavailable.
 */
function handleStart() {
    if (!stream) {
        responseText.value = "Camera not available. Cannot start.";
        alert("Camera not available. Please grant permission first.");
        return;
    }
    isProcessing = true;
    startButton.textContent = "Stop";
    startButton.classList.replace('start', 'stop');

    instructionText.disabled = true;
    intervalSelect.disabled = true;

    responseText.value = "Processing started...";

    // Fire-and-forget on purpose: the loop runs until handleStop() clears
    // isProcessing. Catch unexpected rejections so they don't go unhandled
    // and leave the UI stuck in the "Stop" state.
    processingLoop().catch((err) => {
        console.error("Processing loop failed:", err);
        handleStop();
        responseText.value = `Error: ${err.message}`;
    });
}
227
+
228
/**
 * Stop the loop and restore the idle UI state.
 */
function handleStop() {
    isProcessing = false;

    // Re-enable the inputs that were locked while processing.
    instructionText.disabled = false;
    intervalSelect.disabled = false;

    // Flip the button back to its green "Start" state.
    startButton.textContent = "Start";
    startButton.classList.replace('stop', 'start');

    if (responseText.value.startsWith("Processing started...")) {
        responseText.value = "Processing stopped.";
    }
}
239
+
240
// Single button toggles the processing loop on and off.
startButton.addEventListener('click', () => {
    if (isProcessing) {
        handleStop();
    } else {
        handleStart();
    }
});

// Load the model first (slow), then request camera access. Surface model
// load failures (e.g. no WebGPU support, network error) instead of letting
// the rejection go unhandled and leaving the page silently broken.
window.addEventListener('DOMContentLoaded', async () => {
    try {
        await initModel();
    } catch (err) {
        console.error("Error loading model:", err);
        responseText.value = `Error loading model: ${err.message}`;
        return;
    }
    await initCamera(); // handles and reports its own errors
});

// Release the camera when leaving the page.
window.addEventListener('beforeunload', () => {
    if (stream) {
        stream.getTracks().forEach(track => track.stop());
    }
});
258
+ </script>
259
+ </body>
260
+ </html>