Update index.backup5.html
index.backup5.html (CHANGED: +37 -89)
@@ -161,7 +161,7 @@
             <option value="quality">Highest Quality</option>
         </select>
         <div id="model-info">
-            TTS: Xenova/mms-tts-eng / STT: Xenova/whisper-tiny.en / LLM:
+            TTS: Xenova/mms-tts-eng / STT: Xenova/whisper-tiny.en / LLM: Xenova/Qwen1.5-0.5B-Chat
         </div>
     </div>
     <div id="visualizer"></div>
@@ -179,13 +179,12 @@
 
 env.localModelPath = './models';
 
-//BELOW 5 statements added by RAHUL
 // Configure environment before initializing pipelines
 env.backends = ['wasm'];
 env.wasm = env.wasm || {};
-env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]/';
-env.wasm.simd = true;
-env.numThreads = navigator.hardwareConcurrency || 4;
+env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]/';
+env.wasm.simd = true;
+env.numThreads = navigator.hardwareConcurrency || 4;
 
 const conversationDiv = document.getElementById('conversation');
 const startButton = document.getElementById('startButton');
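
Note: a minimal standalone sketch of this backend setup is shown below for reference. The import URL matches the @xenova/transformers 2.17.2 CDN build referenced above; the env.backends, env.wasm.* and env.numThreads option names simply mirror what this diff writes and are assumptions rather than the library's documented API.

// Sketch: configure transformers.js for the WASM backend before any pipeline is created.
import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]';

env.localModelPath = './models';                      // prefer locally hosted model files
env.backends = ['wasm'];                              // as written in this diff
env.wasm = env.wasm || {};
env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]/'; // where the .wasm binaries load from
env.wasm.simd = true;                                 // use SIMD builds when the browser supports them
env.numThreads = navigator.hardwareConcurrency || 4;  // thread-count hint
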
@@ -199,6 +198,7 @@
 let myvad;
 let sttPipeline;
 let ttsPipeline;
+let llmPipeline;
 let audioContext;
 let analyser;
 let dataArray;
@@ -228,46 +228,55 @@
         const barHeight = dataArray[i] / 2;
         bars[i].style.height = barHeight + 'px';
     }
-
-    animationId = setTimeout(updateVisualizer, 50); // Update every 50ms - RAHUL ATLURY
-
-    //animationId = requestAnimationFrame(updateVisualizer);
+    animationId = setTimeout(updateVisualizer, 50);
 }
 
-
 async function initializePipelines() {
     try {
-
-
-
-
-
-
-        [sttPipeline, ttsPipeline] = await Promise.all([
-            pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', { quantized: true }),
-            pipeline('text-to-speech', 'Xenova/mms-tts-eng', { quantized: true })
+        addLog('System: Initializing pipelines...');
+        [sttPipeline, ttsPipeline, llmPipeline] = await Promise.all([
+            pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', { quantized: true }),
+            pipeline('text-to-speech', 'Xenova/mms-tts-eng', { quantized: true }),
+            pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat', { quantized: true })
        ]);
 
-        addLog('System: Digital Human Voice Chat initialized. Click "Begin Call" to start.');
+        addLog('System: Digital Human Voice Chat initialized with Qwen1.5-0.5B-Chat. Click "Begin Call" to start.');
         startButton.disabled = false;
         loadingDiv.style.display = 'none';
     } catch (error) {
         console.error('Error initializing pipelines:', error);
-        addLog(
+        addLog(`System: Error initializing pipelines: ${error.message}`);
         loadingDiv.style.display = 'none';
     }
 }
 
 async function processSpeech(audio) {
     try {
-        if (!sttPipeline || !ttsPipeline) {
+        if (!sttPipeline || !ttsPipeline || !llmPipeline) {
             throw new Error('Pipelines not initialized');
         }
 
         const transcription = await sttPipeline(audio);
         addLog(`User: ${transcription.text}`);
 
-        const
+        const messages = [
+            { role: 'system', content: 'You are a helpful assistant.' },
+            { role: 'user', content: transcription.text }
+        ];
+
+        // Apply chat template
+        const text = llmPipeline.tokenizer.apply_chat_template(messages, {
+            tokenize: false,
+            add_generation_prompt: true,
+        });
+
+        // Generate text
+        const llmResponse = await llmPipeline(text, {
+            max_new_tokens: 128,
+            do_sample: false
+        });
+
+        const botResponse = llmResponse[0].generated_text;
         addLog(`Bot: ${botResponse}`);
 
         isSpeaking = true;
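
Note: the chat-template flow added above can be exercised on its own roughly as in the sketch below. The model and pipeline calls mirror this diff; stripping the echoed prompt from generated_text is an assumption about the pipeline's output shape, not something the diff itself does.

// Sketch: one-off reply generation with the Xenova/Qwen1.5-0.5B-Chat text-generation pipeline.
const llm = await pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat', { quantized: true });

const messages = [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'What is the capital of France?' }
];

// Render the messages into the model's chat format, leaving the assistant turn open.
const prompt = llm.tokenizer.apply_chat_template(messages, {
    tokenize: false,
    add_generation_prompt: true,
});

const output = await llm(prompt, { max_new_tokens: 128, do_sample: false });

// generated_text may include the prompt text; trim it off if present (assumption).
let reply = output[0].generated_text;
if (reply.startsWith(prompt)) {
    reply = reply.slice(prompt.length);
}
console.log(reply.trim());
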
@@ -276,7 +285,7 @@
         isSpeaking = false;
     } catch (error) {
         console.error('Error processing speech:', error);
-        addLog(
+        addLog(`System: Error processing speech: ${error.message}`);
     }
 }
 
@@ -339,10 +348,9 @@
         remoteVideo.muted = true;
         document.getElementById('remoteVideo').volume = 0;
 
-        // Request both audio and video streams
         microphoneStream = await navigator.mediaDevices.getUserMedia({
             audio: true,
-            video: { width: 1, height: 1 }
+            video: { width: 1, height: 1 }
         });
 
         localVideo.srcObject = microphoneStream;
@@ -351,7 +359,6 @@
         console.log('Active constraints:', microphoneStream.getAudioTracks()[0].getConstraints());
         console.log('Microphone stream settings:', microphoneStream.getAudioTracks()[0].getSettings());
 
-        // Implement loopback hack for improved echo cancellation
         const offerOptions = {
             offerToReceiveAudio: true,
             offerToReceiveVideo: false,
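
Note: the comment removed above described the rtcConnection / rtcLoopbackConnection code as a loopback workaround that routes audio through a local WebRTC connection so the browser's echo cancellation applies to it. A condensed sketch of that pattern is shown below; micStream stands in for this file's microphoneStream, and the variable names are illustrative only.

// Sketch: local WebRTC loopback so a stream passes through the browser's echo-cancellation path.
const rtc = new RTCPeerConnection();
const loopback = new RTCPeerConnection();
const processed = new MediaStream();   // receives the looped-back tracks

// Both ends live in the same page, so ICE candidates can be handed over directly.
rtc.onicecandidate = (e) => e.candidate && loopback.addIceCandidate(e.candidate);
loopback.onicecandidate = (e) => e.candidate && rtc.addIceCandidate(e.candidate);

loopback.ontrack = (e) => processed.addTrack(e.track);
micStream.getAudioTracks().forEach((track) => rtc.addTrack(track, micStream));

const offer = await rtc.createOffer({ offerToReceiveAudio: true, offerToReceiveVideo: false });
await rtc.setLocalDescription(offer);
await loopback.setRemoteDescription(offer);

const answer = await loopback.createAnswer();
await loopback.setLocalDescription(answer);
await rtc.setRemoteDescription(answer);
// `processed` can now be analysed or fed to the VAD in place of the raw microphone stream.
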
@@ -374,13 +381,12 @@
         await rtcLoopbackConnection.setLocalDescription(answer);
         await rtcConnection.setRemoteDescription(answer);
 
-        // Use the loopback stream for audio processing
         const source = audioContext.createMediaStreamSource(loopbackStream);
         source.connect(analyser);
 
         myvad = await vad.MicVAD.new({
-
-
+            noiseSuppression: true,
+            aggressiveness: 3,
             onSpeechStart: () => {
                 addLog('--- Voice activity: speech start');
                 updateVisualizer();
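
Note: for context, the VAD wiring that the two new options plug into typically looks like the sketch below, following the MicVAD-style API that vad.MicVAD.new suggests (for example @ricky0123/vad-web). The noiseSuppression and aggressiveness options are copied from this diff and are not checked against that library; onSpeechEnd receiving a Float32Array of the utterance is the usual contract and is what processSpeech() expects.

// Sketch: minimal MicVAD setup in the style this file uses.
const myvad = await vad.MicVAD.new({
    noiseSuppression: true,   // mirrored from this diff (option name is an assumption)
    aggressiveness: 3,        // mirrored from this diff (option name is an assumption)
    onSpeechStart: () => {
        addLog('--- Voice activity: speech start');
    },
    onSpeechEnd: async (audio) => {
        addLog('--- Voice activity: speech end');
        await processSpeech(audio);   // hand the captured utterance to Whisper
    },
});
await myvad.start();
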
@@ -398,62 +404,4 @@
         });
 
         await myvad.start();
-        startButton.textContent
-        isListening = true;
-        addLog('System: Listening...');
-    } catch (error) {
-        console.error('Error starting voice activity:', error);
-        addLog('System: Error starting voice detection. Please check your microphone and try again.');
-    }
-}
-
-async function stopListening() {
-    if (myvad) {
-        try {
-            await myvad.destroy();
-        } catch (error) {
-            console.error('Error stopping voice activity:', error);
-        }
-        myvad = null;
-    }
-    if (microphoneStream) {
-        microphoneStream.getTracks().forEach(track => track.stop());
-        microphoneStream = null;
-    }
-    if (audioContext) {
-        await audioContext.close();
-        audioContext = null;
-    }
-    if (localVideo) {
-        localVideo.srcObject = null;
-    }
-    if (remoteVideo) {
-        remoteVideo.srcObject = null;
-    }
-    if (rtcConnection) {
-        rtcConnection.close();
-        rtcConnection = null;
-    }
-    if (rtcLoopbackConnection) {
-        rtcLoopbackConnection.close();
-        rtcLoopbackConnection = null;
-    }
-    loopbackStream = new MediaStream();
-    stopCurrentAudio();
-    startButton.textContent = 'Begin Call';
-    isListening = false;
-    addLog('System: Stopped listening.');
-    cancelAnimationFrame(animationId);
-    addLog('System: Microphone closed');
-}
-
-startButton.addEventListener('click', toggleListening);
-clearLogsButton.addEventListener('click', () => {
-    logsDiv.innerHTML = '';
-});
-
-createVisualizer();
-initializePipelines();
-</script>
-</body>
-</html>
+        startButton.textContent