Update index.backup5.html
index.backup5.html (CHANGED: +37 -89)
@@ -161,7 +161,7 @@
             <option value="quality">Highest Quality</option>
         </select>
         <div id="model-info">
-            TTS: Xenova/mms-tts-eng / STT: Xenova/whisper-tiny.en / LLM:
+            TTS: Xenova/mms-tts-eng / STT: Xenova/whisper-tiny.en / LLM: Xenova/Qwen1.5-0.5B-Chat
         </div>
     </div>
     <div id="visualizer"></div>
@@ -179,13 +179,12 @@
 
 env.localModelPath = './models';
 
-//BELOW 5 statements added by RAHUL
 // Configure environment before initializing pipelines
 env.backends = ['wasm'];
 env.wasm = env.wasm || {};
-env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]/';
-env.wasm.simd = true;
-env.numThreads = navigator.hardwareConcurrency || 4;
+env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]/';
+env.wasm.simd = true;
+env.numThreads = navigator.hardwareConcurrency || 4;
 
 const conversationDiv = document.getElementById('conversation');
 const startButton = document.getElementById('startButton');
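
Note: a minimal standalone sketch of this backend setup is shown below for reference. The import URL matches the @xenova/transformers 2.17.2 CDN build referenced above; the env.backends, env.wasm.* and env.numThreads option names simply mirror what this diff writes and are assumptions rather than the library's documented API.

// Sketch: configure transformers.js for the WASM backend before any pipeline is created.
import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]';

env.localModelPath = './models';                      // prefer locally hosted model files
env.backends = ['wasm'];                              // as written in this diff
env.wasm = env.wasm || {};
env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]/'; // where the .wasm binaries load from
env.wasm.simd = true;                                 // use SIMD builds when the browser supports them
env.numThreads = navigator.hardwareConcurrency || 4;  // thread-count hint
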
@@ -199,6 +198,7 @@
 let myvad;
 let sttPipeline;
 let ttsPipeline;
+let llmPipeline;
 let audioContext;
 let analyser;
 let dataArray;
@@ -228,46 +228,55 @@
         const barHeight = dataArray[i] / 2;
         bars[i].style.height = barHeight + 'px';
     }
-
-    animationId = setTimeout(updateVisualizer, 50); // Update every 50ms - RAHUL ATLURY
-
-    //animationId = requestAnimationFrame(updateVisualizer);
+    animationId = setTimeout(updateVisualizer, 50);
 }
 
-
 async function initializePipelines() {
     try {
-
-
-
-
-
-
-        [sttPipeline, ttsPipeline] = await Promise.all([
-            pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', { quantized: true }),
-            pipeline('text-to-speech', 'Xenova/mms-tts-eng', { quantized: true })
+        addLog('System: Initializing pipelines...');
+        [sttPipeline, ttsPipeline, llmPipeline] = await Promise.all([
+            pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', { quantized: true }),
+            pipeline('text-to-speech', 'Xenova/mms-tts-eng', { quantized: true }),
+            pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat', { quantized: true })
        ]);
 
-        addLog('System: Digital Human Voice Chat initialized. Click "Begin Call" to start.');
+        addLog('System: Digital Human Voice Chat initialized with Qwen1.5-0.5B-Chat. Click "Begin Call" to start.');
         startButton.disabled = false;
         loadingDiv.style.display = 'none';
     } catch (error) {
         console.error('Error initializing pipelines:', error);
-        addLog(
+        addLog(`System: Error initializing pipelines: ${error.message}`);
         loadingDiv.style.display = 'none';
     }
 }
 
 async function processSpeech(audio) {
     try {
-        if (!sttPipeline || !ttsPipeline) {
+        if (!sttPipeline || !ttsPipeline || !llmPipeline) {
             throw new Error('Pipelines not initialized');
         }
 
         const transcription = await sttPipeline(audio);
         addLog(`User: ${transcription.text}`);
 
-        const
+        const messages = [
+            { role: 'system', content: 'You are a helpful assistant.' },
+            { role: 'user', content: transcription.text }
+        ];
+
+        // Apply chat template
+        const text = llmPipeline.tokenizer.apply_chat_template(messages, {
+            tokenize: false,
+            add_generation_prompt: true,
+        });
+
+        // Generate text
+        const llmResponse = await llmPipeline(text, {
+            max_new_tokens: 128,
+            do_sample: false
+        });
+
+        const botResponse = llmResponse[0].generated_text;
         addLog(`Bot: ${botResponse}`);
 
         isSpeaking = true;
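
Note: the chat-template flow added above can be exercised on its own roughly as in the sketch below. The model and pipeline calls mirror this diff; stripping the echoed prompt from generated_text is an assumption about the pipeline's output shape, not something the diff itself does.

// Sketch: one-off reply generation with the Xenova/Qwen1.5-0.5B-Chat text-generation pipeline.
const llm = await pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat', { quantized: true });

const messages = [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'What is the capital of France?' }
];

// Render the messages into the model's chat format, leaving the assistant turn open.
const prompt = llm.tokenizer.apply_chat_template(messages, {
    tokenize: false,
    add_generation_prompt: true,
});

const output = await llm(prompt, { max_new_tokens: 128, do_sample: false });

// generated_text may include the prompt text; trim it off if present (assumption).
let reply = output[0].generated_text;
if (reply.startsWith(prompt)) {
    reply = reply.slice(prompt.length);
}
console.log(reply.trim());
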
@@ -276,7 +285,7 @@
         isSpeaking = false;
     } catch (error) {
         console.error('Error processing speech:', error);
-        addLog(
+        addLog(`System: Error processing speech: ${error.message}`);
     }
 }
 
@@ -339,10 +348,9 @@
         remoteVideo.muted = true;
         document.getElementById('remoteVideo').volume = 0;
 
-        // Request both audio and video streams
         microphoneStream = await navigator.mediaDevices.getUserMedia({
             audio: true,
-            video: { width: 1, height: 1 }
+            video: { width: 1, height: 1 }
         });
 
         localVideo.srcObject = microphoneStream;
@@ -351,7 +359,6 @@
         console.log('Active constraints:', microphoneStream.getAudioTracks()[0].getConstraints());
         console.log('Microphone stream settings:', microphoneStream.getAudioTracks()[0].getSettings());
 
-        // Implement loopback hack for improved echo cancellation
         const offerOptions = {
             offerToReceiveAudio: true,
             offerToReceiveVideo: false,
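
Note: the comment removed above described the rtcConnection / rtcLoopbackConnection code as a loopback workaround that routes audio through a local WebRTC connection so the browser's echo cancellation applies to it. A condensed sketch of that pattern is shown below; micStream stands in for this file's microphoneStream, and the variable names are illustrative only.

// Sketch: local WebRTC loopback so a stream passes through the browser's echo-cancellation path.
const rtc = new RTCPeerConnection();
const loopback = new RTCPeerConnection();
const processed = new MediaStream();   // receives the looped-back tracks

// Both ends live in the same page, so ICE candidates can be handed over directly.
rtc.onicecandidate = (e) => e.candidate && loopback.addIceCandidate(e.candidate);
loopback.onicecandidate = (e) => e.candidate && rtc.addIceCandidate(e.candidate);

loopback.ontrack = (e) => processed.addTrack(e.track);
micStream.getAudioTracks().forEach((track) => rtc.addTrack(track, micStream));

const offer = await rtc.createOffer({ offerToReceiveAudio: true, offerToReceiveVideo: false });
await rtc.setLocalDescription(offer);
await loopback.setRemoteDescription(offer);

const answer = await loopback.createAnswer();
await loopback.setLocalDescription(answer);
await rtc.setRemoteDescription(answer);
// `processed` can now be analysed or fed to the VAD in place of the raw microphone stream.
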
@@ -374,13 +381,12 @@
         await rtcLoopbackConnection.setLocalDescription(answer);
         await rtcConnection.setRemoteDescription(answer);
 
-        // Use the loopback stream for audio processing
         const source = audioContext.createMediaStreamSource(loopbackStream);
         source.connect(analyser);
 
         myvad = await vad.MicVAD.new({
-
-
+            noiseSuppression: true,
+            aggressiveness: 3,
             onSpeechStart: () => {
                 addLog('--- Voice activity: speech start');
                 updateVisualizer();
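
Note: for context, the VAD wiring that the two new options plug into typically looks like the sketch below, following the MicVAD-style API that vad.MicVAD.new suggests (for example @ricky0123/vad-web). The noiseSuppression and aggressiveness options are copied from this diff and are not checked against that library; onSpeechEnd receiving a Float32Array of the utterance is the usual contract and is what processSpeech() expects.

// Sketch: minimal MicVAD setup in the style this file uses.
const myvad = await vad.MicVAD.new({
    noiseSuppression: true,   // mirrored from this diff (option name is an assumption)
    aggressiveness: 3,        // mirrored from this diff (option name is an assumption)
    onSpeechStart: () => {
        addLog('--- Voice activity: speech start');
    },
    onSpeechEnd: async (audio) => {
        addLog('--- Voice activity: speech end');
        await processSpeech(audio);   // hand the captured utterance to Whisper
    },
});
await myvad.start();
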
@@ -398,62 +404,4 @@
         });
 
         await myvad.start();
-        startButton.textContent
-        isListening = true;
-        addLog('System: Listening...');
-    } catch (error) {
-        console.error('Error starting voice activity:', error);
-        addLog('System: Error starting voice detection. Please check your microphone and try again.');
-    }
-}
-
-async function stopListening() {
-    if (myvad) {
-        try {
-            await myvad.destroy();
-        } catch (error) {
-            console.error('Error stopping voice activity:', error);
-        }
-        myvad = null;
-    }
-    if (microphoneStream) {
-        microphoneStream.getTracks().forEach(track => track.stop());
-        microphoneStream = null;
-    }
-    if (audioContext) {
-        await audioContext.close();
-        audioContext = null;
-    }
-    if (localVideo) {
-        localVideo.srcObject = null;
-    }
-    if (remoteVideo) {
-        remoteVideo.srcObject = null;
-    }
-    if (rtcConnection) {
-        rtcConnection.close();
-        rtcConnection = null;
-    }
-    if (rtcLoopbackConnection) {
-        rtcLoopbackConnection.close();
-        rtcLoopbackConnection = null;
-    }
-    loopbackStream = new MediaStream();
-    stopCurrentAudio();
-    startButton.textContent = 'Begin Call';
-    isListening = false;
-    addLog('System: Stopped listening.');
-    cancelAnimationFrame(animationId);
-    addLog('System: Microphone closed');
-}
-
-startButton.addEventListener('click', toggleListening);
-clearLogsButton.addEventListener('click', () => {
-    logsDiv.innerHTML = '';
-});
-
-createVisualizer();
-initializePipelines();
-</script>
-</body>
-</html>
+        startButton.textContent