Xenova (HF Staff) and tudi2d committed · verified
Commit 108ddae · 1 parent: a42014f

Add loading indicators (#1)

- Add loading indicators (8f1d0a995d975ebc6baa5bd63d58010c29457914)
- Update index.html (77e02fc5beb202f47a6ad9c0c9363ef43240cfe4)

Co-authored-by: Philipp Hugenroth <[email protected]>
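In short, the commit wraps the video feed in a positioned container, layers a "Loading..." overlay on top of it, and toggles that overlay (plus status text) around model loading. A minimal sketch of the pattern, assuming the same #loadingOverlay element with its `display: none` default from the diff below; the `withLoadingOverlay` helper is illustrative, not part of the commit:

    const overlay = document.getElementById("loadingOverlay");

    // Show the overlay while an async task runs; "flex" centers the "Loading..." text.
    async function withLoadingOverlay(task) {
      overlay.style.display = "flex";
      try {
        return await task();
      } finally {
        overlay.style.display = "none"; // hidden again even if the task throws
      }
    }

Unlike the commit, which sets the display style before and after the awaits inside initModel, this variant uses try/finally so the overlay cannot get stuck visible after a failed load.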

Files changed (1)
  1. index.html +287 -217
index.html CHANGED
@@ -1,260 +1,330 @@
  <!DOCTYPE html>
  <html lang="en">
-   <head>
-     <meta charset="UTF-8">
-     <meta name="viewport" content="width=device-width, initial-scale=1.0">
      <title>Camera Interaction App</title>
      <style>
-       body {
-         font-family: sans-serif;
-         display: flex;
-         flex-direction: column;
-         align-items: center;
-         gap: 20px;
-         padding: 20px;
-         background-color: #f0f0f0;
-       }
-       .controls, .io-areas {
-         display: flex;
-         gap: 10px;
-         align-items: center;
-         background-color: #fff;
-         padding: 15px;
-         border-radius: 8px;
-         box-shadow: 0 2px 5px rgba(0,0,0,0.1);
-       }
-       .io-areas {
-         flex-direction: column;
-         align-items: stretch;
-       }
-       textarea {
-         width: 300px;
-         height: 80px;
-         padding: 8px;
-         border: 1px solid #ccc;
-         border-radius: 4px;
-         font-size: 14px;
-       }
-       #videoFeed {
-         width: 480px;
-         height: 360px;
-         border: 2px solid #333;
-         background-color: #000;
-         border-radius: 8px;
-       }
-       #startButton {
-         padding: 10px 20px;
-         font-size: 16px;
-         cursor: pointer;
-         border: none;
-         border-radius: 4px;
-         color: white;
-       }
-       #startButton.start {
-         background-color: #28a745; /* Green */
-       }
-       #startButton.stop {
-         background-color: #dc3545; /* Red */
-       }
-       label {
-         font-weight: bold;
-       }
-       select {
-         padding: 8px;
-         border-radius: 4px;
-         border: 1px solid #ccc;
-       }
-       .hidden {
-         display: none;
-       }
      </style>
-   </head>
-   <body>
-
      <h1>Camera Interaction App</h1>

-     <video id="videoFeed" autoplay playsinline></video>
-     <canvas id="canvas" class="hidden"></canvas> <!-- For capturing frames -->

      <div class="io-areas">
-       <div>
-         <label for="instructionText">Instruction:</label><br>
-         <textarea id="instructionText" style="height: 2em; width: 40em" name="Instruction"></textarea>
-       </div>
-       <div>
-         <label for="responseText">Response:</label><br>
-         <textarea id="responseText" style="height: 2em; width: 40em" name="Response" readonly placeholder="Server response will appear here..."></textarea>
-       </div>
      </div>

      <div class="controls">
-       <label for="intervalSelect">Interval between 2 requests:</label>
-       <select id="intervalSelect" name="Interval between 2 requests">
-         <option value="0" selected>0ms</option>
-         <option value="100">100ms</option>
-         <option value="250">250ms</option>
-         <option value="500">500ms</option>
-         <option value="1000">1s</option>
-         <option value="2000">2s</option>
-       </select>
-       <button id="startButton" class="start">Start</button>
      </div>

      <script type="module">
-       import {
-         AutoProcessor,
-         AutoModelForVision2Seq,
-         RawImage
-       } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js';

-       const video = document.getElementById('videoFeed');
-       const canvas = document.getElementById('canvas');
-       const instructionText = document.getElementById('instructionText');
-       const responseText = document.getElementById('responseText');
-       const intervalSelect = document.getElementById('intervalSelect');
-       const startButton = document.getElementById('startButton');

-       instructionText.value = "In one sentence, what do you see?"; // default instruction

-       let stream;
-       let isProcessing = false;

-       let processor, model;

-       async function initModel() {
-         const modelId = 'HuggingFaceTB/SmolVLM-Instruct';
-         processor = await AutoProcessor.from_pretrained(modelId);
-         model = await AutoModelForVision2Seq.from_pretrained(modelId, {
-           dtype: {
-             embed_tokens: 'fp16',
-             vision_encoder: 'q4',
-             decoder_model_merged: 'q4'
-           },
-           device: "webgpu",
-         });
-       }

-       async function initCamera() {
-         try {
-           stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: false });
-           video.srcObject = stream;
-           responseText.value = "Camera access granted. Ready to start.";
-         } catch (err) {
-           console.error("Error accessing camera:", err);
-           responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
-           alert(`Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`);
-         }
        }

-       function captureImage() {
-         if (!stream || !video.videoWidth) {
-           console.warn("Video stream not ready for capture.");
-           return null;
-         }
-         canvas.width = video.videoWidth;
-         canvas.height = video.videoHeight;
-         const context = canvas.getContext('2d', { willReadFrequently: true });
-         context.drawImage(video, 0, 0, canvas.width, canvas.height);
-         const frame = context.getImageData(0, 0, canvas.width, canvas.height);
-         return new RawImage(frame.data, frame.width, frame.height, 4);
        }

-       async function runLocalVisionInference(imgElement, instruction) {
-         const messages = [{
-           role: 'user',
-           content: [
-             { type: 'image' },
-             { type: 'text', text: instruction }
-           ]
-         }];
-         const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
-         const inputs = await processor(text, [imgElement], { do_image_splitting: false });
-         const generatedIds = await model.generate({ ...inputs, max_new_tokens: 100 });
-         const output = processor.batch_decode(
-           generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
-           { skip_special_tokens: true }
-         );
-         return output[0].trim();
-       }

-       async function sendData() {
-         if (!isProcessing) return;
-         const instruction = instructionText.value;
-         const rawImg = captureImage();
-         if (!rawImg) {
-           responseText.value = 'Capture failed';
-           return;
-         }
-         try {
-           const reply = await runLocalVisionInference(rawImg, instruction);
-           responseText.value = reply;
-         } catch (e) {
-           console.error(e);
-           responseText.value = `Error: ${e.message}`;
-         }
        }

-       function sleep(ms) {
-         return new Promise(resolve => setTimeout(resolve, ms));
        }

-       async function processingLoop() {
-         const intervalMs = parseInt(intervalSelect.value, 10);
-         while (isProcessing) {
-           await sendData();
-           if (!isProcessing) break;
-           await sleep(intervalMs);
-         }
        }

-       function handleStart() {
-         if (!stream) {
-           responseText.value = "Camera not available. Cannot start.";
-           alert("Camera not available. Please grant permission first.");
-           return;
-         }
-         isProcessing = true;
-         startButton.textContent = "Stop";
-         startButton.classList.replace('start', 'stop');

-         instructionText.disabled = true;
-         intervalSelect.disabled = true;

-         responseText.value = "Processing started...";

-         processingLoop();
        }

-       function handleStop() {
-         isProcessing = false;
-         startButton.textContent = "Start";
-         startButton.classList.replace('stop', 'start');

-         instructionText.disabled = false;
-         intervalSelect.disabled = false;
-         if (responseText.value.startsWith("Processing started...")) {
-           responseText.value = "Processing stopped.";
-         }
        }

-       startButton.addEventListener('click', () => {
-         if (isProcessing) {
-           handleStop();
-         } else {
-           handleStart();
-         }
-       });

-       window.addEventListener('DOMContentLoaded', async () => {
-         await initModel();
-         await initCamera();
-       });
-
-       window.addEventListener('beforeunload', () => {
-         if (stream) {
-           stream.getTracks().forEach(track => track.stop());
-         }
-       });
      </script>
-   </body>
  </html>
 
  <!DOCTYPE html>
  <html lang="en">
+   <head>
+     <meta charset="UTF-8" />
+     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
      <title>Camera Interaction App</title>
      <style>
+       body {
+         font-family: sans-serif;
+         display: flex;
+         flex-direction: column;
+         align-items: center;
+         gap: 20px;
+         padding: 20px;
+         background-color: #f0f0f0;
+       }
+       .controls,
+       .io-areas {
+         display: flex;
+         gap: 10px;
+         align-items: center;
+         background-color: #fff;
+         padding: 15px;
+         border-radius: 8px;
+         box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
+       }
+       .io-areas {
+         flex-direction: column;
+         align-items: stretch;
+       }
+       textarea {
+         width: 300px;
+         height: 80px;
+         padding: 8px;
+         border: 1px solid #ccc;
+         border-radius: 4px;
+         font-size: 14px;
+       }
+       #videoFeed {
+         display: block;
+         width: 100%;
+         height: 100%;
+         border-radius: 6px;
+         object-fit: cover;
+       }
+       #videoContainer {
+         position: relative;
+         width: 480px;
+         height: 360px;
+         border: 2px solid #333;
+         background-color: #000;
+         border-radius: 8px;
+         margin: 0 auto;
+       }
+       #loadingOverlay {
+         position: absolute;
+         top: 0;
+         left: 0;
+         width: 100%;
+         height: 100%;
+         display: none;
+         justify-content: center;
+         align-items: center;
+         background-color: rgba(0, 0, 0, 0.7);
+         z-index: 10;
+         border-radius: 6px;
+         color: #ffffff;
+         font-size: 1.5em;
+         font-weight: bold;
+       }
+       #startButton {
+         padding: 10px 20px;
+         font-size: 16px;
+         cursor: pointer;
+         border: none;
+         border-radius: 4px;
+         color: white;
+       }
+       #startButton.start {
+         background-color: #28a745; /* Green */
+       }
+       #startButton.stop {
+         background-color: #dc3545; /* Red */
+       }
+       label {
+         font-weight: bold;
+       }
+       select {
+         padding: 8px;
+         border-radius: 4px;
+         border: 1px solid #ccc;
+       }
+       .hidden {
+         display: none;
+       }
      </style>
+   </head>
+   <body>
      <h1>Camera Interaction App</h1>

+     <div id="videoContainer">
+       <video id="videoFeed" autoplay playsinline></video>
+       <div id="loadingOverlay">Loading...</div>
+     </div>
+     <canvas id="canvas" class="hidden"></canvas>
+     <!-- For capturing frames -->

      <div class="io-areas">
+       <div>
+         <label for="instructionText">Instruction:</label><br />
+         <textarea
+           id="instructionText"
+           style="height: 2em; width: 40em"
+           name="Instruction"
+         ></textarea>
+       </div>
+       <div>
+         <label for="responseText">Response:</label><br />
+         <textarea
+           id="responseText"
+           style="height: 2em; width: 40em"
+           name="Response"
+           readonly
+           placeholder="Server response will appear here..."
+         ></textarea>
+       </div>
      </div>

      <div class="controls">
+       <label for="intervalSelect">Interval between 2 requests:</label>
+       <select id="intervalSelect" name="Interval between 2 requests">
+         <option value="0" selected>0ms</option>
+         <option value="100">100ms</option>
+         <option value="250">250ms</option>
+         <option value="500">500ms</option>
+         <option value="1000">1s</option>
+         <option value="2000">2s</option>
+       </select>
+       <button id="startButton" class="start">Start</button>
      </div>

      <script type="module">
+       import {
+         AutoProcessor,
+         AutoModelForVision2Seq,
+         RawImage,
+       } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js";

+       const video = document.getElementById("videoFeed");
+       const canvas = document.getElementById("canvas");
+       const instructionText = document.getElementById("instructionText");
+       const responseText = document.getElementById("responseText");
+       const intervalSelect = document.getElementById("intervalSelect");
+       const startButton = document.getElementById("startButton");
+       const loadingOverlay = document.getElementById("loadingOverlay");

+       instructionText.value = "In one sentence, what do you see?"; // default instruction

+       let stream;
+       let isProcessing = false;

+       let processor, model;

+       async function initModel() {
+         const modelId = "HuggingFaceTB/SmolVLM-Instruct";
+         loadingOverlay.style.display = "flex";
+         responseText.value = "Loading processor...";
+         processor = await AutoProcessor.from_pretrained(modelId);
+         responseText.value = "Processor loaded. Loading model...";
+         model = await AutoModelForVision2Seq.from_pretrained(modelId, {
+           dtype: {
+             embed_tokens: "fp16",
+             vision_encoder: "q4",
+             decoder_model_merged: "q4",
+           },
+           device: "webgpu",
+         });
+         responseText.value = "Model loaded. Initializing camera...";
+         loadingOverlay.style.display = "none";
+       }

+       async function initCamera() {
+         try {
+           stream = await navigator.mediaDevices.getUserMedia({
+             video: true,
+             audio: false,
+           });
+           video.srcObject = stream;
+           responseText.value = "Camera access granted. Ready to start.";
+         } catch (err) {
+           console.error("Error accessing camera:", err);
+           responseText.value = `Error accessing camera: ${err.name} - ${err.message}. Please ensure permissions are granted and you are on HTTPS or localhost.`;
+           alert(
+             `Error accessing camera: ${err.name}. Make sure you've granted permission and are on HTTPS or localhost.`
+           );
          }
+       }

+       function captureImage() {
+         if (!stream || !video.videoWidth) {
+           console.warn("Video stream not ready for capture.");
+           return null;
          }
+         canvas.width = video.videoWidth;
+         canvas.height = video.videoHeight;
+         const context = canvas.getContext("2d", { willReadFrequently: true });
+         context.drawImage(video, 0, 0, canvas.width, canvas.height);
+         const frame = context.getImageData(0, 0, canvas.width, canvas.height);
+         return new RawImage(frame.data, frame.width, frame.height, 4);
+       }

+       async function runLocalVisionInference(imgElement, instruction) {
+         const messages = [
+           {
+             role: "user",
+             content: [{ type: "image" }, { type: "text", text: instruction }],
+           },
+         ];
+         const text = processor.apply_chat_template(messages, {
+           add_generation_prompt: true,
+         });
+         const inputs = await processor(text, [imgElement], {
+           do_image_splitting: false,
+         });
+         const generatedIds = await model.generate({
+           ...inputs,
+           max_new_tokens: 100,
+         });
+         const output = processor.batch_decode(
+           generatedIds.slice(null, [inputs.input_ids.dims.at(-1), null]),
+           { skip_special_tokens: true }
+         );
+         return output[0].trim();
+       }

+       async function sendData() {
+         if (!isProcessing) return;
+         const instruction = instructionText.value;
+         const rawImg = captureImage();
+         if (!rawImg) {
+           responseText.value = "Capture failed";
+           return;
+         }
+         try {
+           const reply = await runLocalVisionInference(rawImg, instruction);
+           responseText.value = reply;
+         } catch (e) {
+           console.error(e);
+           responseText.value = `Error: ${e.message}`;
          }
+       }
+
+       function sleep(ms) {
+         return new Promise((resolve) => setTimeout(resolve, ms));
+       }

+       async function processingLoop() {
+         const intervalMs = parseInt(intervalSelect.value, 10);
+         while (isProcessing) {
+           await sendData();
+           if (!isProcessing) break;
+           await sleep(intervalMs);
          }
+       }

+       function handleStart() {
+         if (!stream) {
+           responseText.value = "Camera not available. Cannot start.";
+           alert("Camera not available. Please grant permission first.");
+           return;
          }
+         isProcessing = true;
+         startButton.textContent = "Stop";
+         startButton.classList.replace("start", "stop");

+         instructionText.disabled = true;
+         intervalSelect.disabled = true;

+         responseText.value = "Processing started...";

+         processingLoop();
+       }

+       function handleStop() {
+         isProcessing = false;
+         startButton.textContent = "Start";
+         startButton.classList.replace("stop", "start");
+
+         instructionText.disabled = false;
+         intervalSelect.disabled = false;
+         if (responseText.value.startsWith("Processing started...")) {
+           responseText.value = "Processing stopped.";
          }
+       }

+       startButton.addEventListener("click", () => {
+         if (isProcessing) {
+           handleStop();
+         } else {
+           handleStart();
+         }
+       });

+       window.addEventListener("DOMContentLoaded", async () => {
+         // Check for WebGPU support
+         if (!navigator.gpu) {
+           const videoElement = document.getElementById("videoFeed");
+           const warningElement = document.createElement("p");
+           warningElement.textContent =
+             "WebGPU is not available in this browser.";
+           warningElement.style.color = "red";
+           warningElement.style.textAlign = "center";
+           videoElement.parentNode.insertBefore(
+             warningElement,
+             videoElement.nextSibling
+           );
          }

+         await initModel();
+         await initCamera();
+       });

+       window.addEventListener("beforeunload", () => {
+         if (stream) {
+           stream.getTracks().forEach((track) => track.stop());
+         }
+       });
      </script>
+   </body>
  </html>
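
A caveat on the new DOMContentLoaded handler: when `navigator.gpu` is missing it inserts a red warning paragraph but still falls through to `await initModel()`, which requests `device: "webgpu"` and will fail on such browsers. A stricter gate is a small change; the early return below is an illustrative addition, not part of the commit:

    window.addEventListener("DOMContentLoaded", async () => {
      if (!navigator.gpu) {
        responseText.value = "WebGPU is not available in this browser.";
        return; // skip model and camera setup instead of failing inside initModel()
      }
      await initModel();
      await initCamera();
    });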