Devakumar868 committed on
Commit daf7e26 · verified · 1 Parent(s): 2bc37a4

Update app.py

Files changed (1):
  app.py +340 -160
app.py CHANGED
@@ -1,187 +1,367 @@
- import gradio as gr
  import torch
  import numpy as np
- from dia.model import Dia
- import warnings
-
- # Suppress warnings for cleaner output
- warnings.filterwarnings("ignore", category=FutureWarning)
- warnings.filterwarnings("ignore", category=UserWarning)
-
- # Global model variable
- model = None
-
- def load_model_once():
-     """Load the Dia model once and cache it globally"""
-     global model
-     if model is None:
-         print("Loading Dia model... This may take a few minutes.")
          try:
-             # Load model without trying to move it manually to GPU;
-             # the Dia model handles GPU placement internally
-             model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float32")
-
-             print("Model loaded successfully!")
-             if torch.cuda.is_available():
-                 print(f"CUDA is available: {torch.cuda.get_device_name()}")
-             else:
-                 print("CUDA is not available, using CPU")
-
          except Exception as e:
-             print(f"Error loading model: {e}")
-             raise e
-
-     return model
-
- def generate_audio(text, seed=42):
-     """Generate audio from text input with error handling"""
-     try:
-         # Clear GPU cache before generation
-         if torch.cuda.is_available():
              torch.cuda.empty_cache()
-
-         current_model = load_model_once()
-
-         # Validate input
-         if not text or not text.strip():
-             return None, "❌ Please enter some text"
-
-         # Clean and format text
-         text = text.strip()
-         if not text.startswith('[S1]') and not text.startswith('[S2]'):
-             text = '[S1] ' + text
-
-         # Set seed for reproducibility
-         if seed:
-             torch.manual_seed(int(seed))
-             if torch.cuda.is_available():
-                 torch.cuda.manual_seed(int(seed))
-
-         print(f"Generating speech for: {text[:100]}...")
-
-         # Generate audio - disable torch compile for T4 stability
-         with torch.no_grad():
-             audio_output = current_model.generate(
-                 text,
-                 use_torch_compile=False,  # Disabled for T4 compatibility
-                 verbose=False
-             )
-
-         # Ensure audio_output is a numpy array
-         if isinstance(audio_output, torch.Tensor):
-             audio_output = audio_output.cpu().numpy()
-
-         # Normalize audio to prevent clipping
-         if len(audio_output) > 0:
-             max_val = np.max(np.abs(audio_output))
-             if max_val > 1.0:
-                 audio_output = audio_output / max_val * 0.95
-
-         print("✅ Audio generated successfully!")
-         return (44100, audio_output), "✅ Audio generated successfully!"
-
-     except torch.cuda.OutOfMemoryError:
-         # Handle GPU memory issues
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-         error_msg = "❌ GPU memory error. Try shorter text or restart the space."
-         print(error_msg)
-         return None, error_msg
-
-     except Exception as e:
-         error_msg = f"❌ Error: {str(e)}"
-         print(error_msg)
-         return None, error_msg
-
- # Create the Gradio interface
- demo = gr.Blocks(title="Dia TTS Demo")
-
- with demo:
-     gr.HTML("""
-     <div style="text-align: center; padding: 20px;">
-         <h1>🎙️ Dia TTS - Ultra-Realistic Text-to-Speech</h1>
-         <p style="font-size: 18px; color: #666;">
-             Generate multi-speaker, emotion-aware dialogue using the Dia 1.6B model
-         </p>
-     </div>
-     """)
-
-     with gr.Row():
-         with gr.Column():
-             text_input = gr.Textbox(
-                 label="📝 Text to Speech",
-                 placeholder="[S1] Hello there! How are you today? [S2] I'm doing great, thanks for asking! (laughs)",
-                 lines=6,
-                 value="[S1] Welcome to the Dia TTS demo! [S2] This is amazing technology!",
-                 info="Use [S1] and [S2] for different speakers. Add emotions like (laughs), (sighs), etc."
-             )
-
-             seed_input = gr.Number(
-                 label="🎲 Random Seed",
-                 value=42,
-                 precision=0,
-                 info="Same seed = consistent voices"
-             )
-
-             generate_btn = gr.Button("🎵 Generate Speech", variant="primary")
-
-         with gr.Column():
-             audio_output = gr.Audio(
-                 label="🔊 Generated Audio",
-                 type="numpy"
-             )
-
-             status_text = gr.Textbox(
-                 label="📊 Status",
-                 interactive=False,
-                 lines=2
-             )
-
-     # Connect the button to the function
-     generate_btn.click(
-         fn=generate_audio,
-         inputs=[text_input, seed_input],
-         outputs=[audio_output, status_text]
-     )
-
-     # Add example buttons
-     with gr.Row():
-         example_btn1 = gr.Button("📻 Podcast", size="sm")
-         example_btn2 = gr.Button("😄 Chat", size="sm")
-         example_btn3 = gr.Button("🎭 Drama", size="sm")
-
-     # Example button functions
-     example_btn1.click(
-         lambda: "[S1] Welcome to our podcast! [S2] Thanks for having me on the show!",
-         outputs=text_input
-     )
-
-     example_btn2.click(
-         lambda: "[S1] Did you see the game? [S2] Yes! (laughs) It was incredible!",
-         outputs=text_input
-     )
-
-     example_btn3.click(
-         lambda: "[S1] I can't believe you're leaving. (sighs) [S2] I know, it's hard. (sad)",
-         outputs=text_input
-     )
-
-     # Usage instructions
-     gr.HTML("""
-     <div style="margin-top: 20px; padding: 15px; background: #f0f8ff; border-radius: 8px;">
-         <h3>💡 Usage Tips:</h3>
-         <ul>
-             <li><strong>Speaker Tags:</strong> Use [S1] and [S2] to switch between speakers</li>
-             <li><strong>Emotions:</strong> Add (laughs), (sighs), (excited), (whispers), (sad), etc.</li>
-             <li><strong>Length:</strong> Keep text to a moderate length (5-20 seconds of speech works best)</li>
-             <li><strong>Seeds:</strong> Use the same seed number for consistent voice characteristics</li>
-         </ul>
-
-         <p><strong>Supported Emotions:</strong> (laughs), (sighs), (gasps), (excited), (sad), (angry),
-         (surprised), (whispers), (shouts), (coughs), (clears throat), (sniffs), (chuckles), (groans)</p>
-     </div>
-     """)
-
- # Launch with basic configuration
  if __name__ == "__main__":
-     demo.launch()
+ import os
+ import gc
+ import tempfile
  import torch
  import numpy as np
+ import soundfile as sf
+ import gradio as gr
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     BitsAndBytesConfig,
+     pipeline
+ )
+ from TTS.api import TTS
+ import nemo.collections.asr as nemo_asr
+
+ # Configuration
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ SAMPLE_RATE = 22050
+ MAX_LENGTH = 512
+ TEMPERATURE = 0.7
+ SEED = 42
+
+ # Set seeds for reproducibility
+ torch.manual_seed(SEED)
+ np.random.seed(SEED)
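+
+ # Note (editorial assumption, not verified here): Coqui XTTS-v2 typically
+ # renders audio at 24 kHz, so if the loaded TTS model's native rate differs
+ # from SAMPLE_RATE = 22050 the output will be slightly pitch-shifted; reading
+ # the synthesizer's reported output sample rate at runtime would avoid this.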
+
+ class ConversationalAI:
+     def __init__(self):
+         print("🔄 Initializing Conversational AI...")
+         self.setup_models()
+         print("✅ All models loaded successfully!")
+
+     def setup_models(self):
+         """Initialize all models with T4 GPU optimization"""
+
+         # 1. ASR model - Parakeet for high-accuracy speech recognition
+         print("📢 Loading ASR model...")
          try:
+             self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
+                 model_name="nvidia/parakeet-tdt-0.6b-v2"
+             ).to(DEVICE)
+             self.asr_model.eval()
+             print("✅ ASR model loaded")
          except Exception as e:
+             print(f"⚠️ ASR fallback: {e}")
+             # Fall back to Whisper if Parakeet fails
+             self.asr_pipeline = pipeline(
+                 "automatic-speech-recognition",
+                 model="openai/whisper-base.en",
+                 device=0 if DEVICE == "cuda" else -1
+             )
+
+         # 2. LLM - 4-bit quantized DialoGPT for T4 GPU compatibility
+         print("🧠 Loading LLM model...")
+         quantization_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type="nf4"
+         )
+
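+         # Rough sizing note: NF4 stores weights in 4 bits with float16 compute,
+         # and double quantization also compresses the quantization constants;
+         # DialoGPT-medium (~355M parameters) drops from roughly 0.7 GB in fp16
+         # to roughly 0.2 GB in 4-bit, well within a T4's 16 GB.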
+         model_name = "microsoft/DialoGPT-medium"  # Optimized for conversation
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         self.llm_model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             quantization_config=quantization_config,
+             device_map="auto",
+             torch_dtype=torch.float16,
+             low_cpu_mem_usage=True
+         )
+
+         print("✅ LLM model loaded")
+
+         # 3. TTS model - Coqui TTS for female voice consistency
+         print("🗣️ Loading TTS model...")
+         try:
+             # Using XTTS-v2 for a high-quality female voice
+             self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)
+
+             # Create consistent female voice embedding
+             self.female_voice_path = self.create_female_reference()
+             print("✅ TTS model loaded with female voice")
+         except Exception as e:
+             print(f"⚠️ TTS fallback: {e}")
+             # Fall back to a simpler TTS model; no voice reference in this case
+             self.tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
+             self.female_voice_path = None
+
+         # Memory optimization
+         if DEVICE == "cuda":
              torch.cuda.empty_cache()
+
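+         # With device_map="auto", accelerate places the quantized LLM on the
+         # GPU itself, which is why only the ASR/TTS models are moved explicitly
+         # with .to(DEVICE); input tensors still need .to(DEVICE) at generation
+         # time (see generate_response).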
+     def create_female_reference(self):
+         """Create a consistent female voice reference for TTS"""
+         # Generate a short reference audio with consistent female characteristics
+         reference_text = "Hello, I am your AI assistant with a consistent female voice."
+
+         # Create temporary reference file
+         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+
+         try:
+             # Use a built-in female speaker if available
+             wav = self.tts.tts(
+                 text=reference_text,
+                 language="en",
+                 split_sentences=True
+             )
+
+             # Save reference audio
+             sf.write(temp_file.name, wav, SAMPLE_RATE)
+             return temp_file.name
+         except Exception:
+             return None
+
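+     # Note: multi-speaker XTTS-v2 generally requires a speaker= or speaker_wav=
+     # argument, so the bootstrap call above may raise and return None, in which
+     # case synthesize_speech() falls back to default synthesis; bundling a small
+     # reference WAV would make the female voice fully deterministic.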
+     def transcribe_audio(self, audio_data):
+         """Convert speech to text using ASR"""
+         try:
+             if hasattr(self, 'asr_model'):
+                 # Save audio temporarily for NeMo ASR
+                 temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+                 sf.write(temp_file.name, audio_data[1], audio_data[0])
+
+                 # Transcribe
+                 transcription = self.asr_model.transcribe([temp_file.name])[0]
+                 os.unlink(temp_file.name)
+
+                 return transcription.text if hasattr(transcription, 'text') else transcription
+             else:
+                 # Use Whisper pipeline
+                 return self.asr_pipeline({"sampling_rate": audio_data[0], "raw": audio_data[1]})["text"]
+
+         except Exception as e:
+             print(f"ASR Error: {e}")
+             return "Sorry, I couldn't understand the audio."
+
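+     # Gradio's type="numpy" microphone input arrives as (sample_rate, int16
+     # array); Parakeet models are trained on 16 kHz mono audio and the Whisper
+     # pipeline expects float32 samples in [-1, 1], so resampling the mic's
+     # native rate (often 44.1 or 48 kHz) and converting dtype before
+     # transcription may improve accuracy, depending on library versions.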
+     def generate_response(self, user_input, chat_history):
+         """Generate conversational response using LLM"""
+         try:
+             # Prepare conversation context
+             context = ""
+             for turn in chat_history[-3:]:  # Last 3 turns for context
+                 context += f"Human: {turn[0]}\nAssistant: {turn[1]}\n"
+
+             context += f"Human: {user_input}\nAssistant:"
+
+             # Tokenize and generate
+             inputs = self.tokenizer.encode(
+                 context, return_tensors="pt", max_length=MAX_LENGTH, truncation=True
+             ).to(DEVICE)
+
+             with torch.no_grad():
+                 outputs = self.llm_model.generate(
+                     inputs,
+                     max_length=inputs.shape[1] + 100,
+                     temperature=TEMPERATURE,
+                     do_sample=True,
+                     pad_token_id=self.tokenizer.eos_token_id,
+                     no_repeat_ngram_size=2,
+                     top_p=0.9
+                 )
+
+             response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+             response = response.split("Human:")[0].strip()
+
+             return response if response else "I understand. Please tell me more."
+
+         except Exception as e:
+             print(f"LLM Error: {e}")
+             return "I'm having trouble processing that. Could you please rephrase?"
+
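+     # Note: DialoGPT was trained on Reddit turns separated by its EOS token
+     # rather than "Human:"/"Assistant:" labels, so the template above is a
+     # prompting convention; splitting on "Human:" trims any follow-on turn the
+     # model invents.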
+     def synthesize_speech(self, text):
+         """Convert text to speech with consistent female voice"""
+         try:
+             if self.female_voice_path and hasattr(self.tts, 'tts'):
+                 # Use voice cloning for consistency
+                 wav = self.tts.tts(
+                     text=text,
+                     speaker_wav=self.female_voice_path,
+                     language="en",
+                     split_sentences=True
+                 )
+             else:
+                 # Fallback to default synthesis
+                 wav = self.tts.tts(text=text)
+
+             # Ensure proper format
+             if isinstance(wav, list):
+                 wav = np.array(wav, dtype=np.float32)
+
+             # Normalize audio
+             wav = wav / np.max(np.abs(wav)) if np.max(np.abs(wav)) > 0 else wav
+
+             return (SAMPLE_RATE, (wav * 32767).astype(np.int16))
+
+         except Exception as e:
+             print(f"TTS Error: {e}")
+             # Return silence as fallback
+             return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
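+
+     # Peak normalization bounds the waveform to [-1, 1], so scaling by 32767
+     # maps it onto the full signed 16-bit range that gr.Audio expects for
+     # int16 audio without clipping.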
+
+     def process_conversation(self, audio_input, chat_history):
+         """Main pipeline: Speech -> Text -> LLM -> Speech"""
+         if audio_input is None:
+             return chat_history, None, ""
+
+         try:
+             # Step 1: Speech to Text
+             user_text = self.transcribe_audio(audio_input)
+             if not user_text.strip():
+                 return chat_history, None, "No speech detected."
+
+             # Step 2: Generate Response
+             ai_response = self.generate_response(user_text, chat_history)
+
+             # Step 3: Text to Speech
+             audio_response = self.synthesize_speech(ai_response)
+
+             # Update chat history
+             chat_history.append([user_text, ai_response])
+
+             # Memory cleanup
+             if DEVICE == "cuda":
+                 torch.cuda.empty_cache()
+             gc.collect()
+
+             return chat_history, audio_response, f"You said: {user_text}"
+
+         except Exception as e:
+             error_msg = f"Error processing conversation: {e}"
+             print(error_msg)
+             return chat_history, None, error_msg
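+
+     # chat_history holds [user, assistant] pairs, the tuple format gr.Chatbot
+     # uses by default in Gradio 4.x, so the list is passed straight back to the
+     # Chatbot component by the UI event handlers.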
+
+ # Initialize the AI system
+ print("🚀 Starting Conversational AI initialization...")
+ ai_system = ConversationalAI()
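+ # Loading at import time means all three models are ready before the first
+ # request; the trade-off is a startup that can take several minutes on a T4.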
+
+ # Gradio interface
+ def create_interface():
+     """Create the Gradio interface for the conversational AI"""
+
+     with gr.Blocks(
+         title="Advanced Conversational AI",
+         theme=gr.themes.Soft(),
+         css="""
+         .main-header { text-align: center; color: #2563eb; margin-bottom: 2rem; }
+         .chat-container { max-height: 500px; overflow-y: auto; }
+         .status-box { background: #f0f9ff; padding: 1rem; border-radius: 0.5rem; }
+         """
+     ) as demo:
+
+         gr.HTML("""
+         <div class="main-header">
+             <h1>🤖 Advanced Conversational AI</h1>
+             <p>Speak naturally and get intelligent responses in a consistent female voice</p>
+         </div>
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=2):
+                 # Chat History
+                 chatbot = gr.Chatbot(
+                     label="Conversation History",
+                     elem_classes=["chat-container"],
+                     height=400,
+                     show_copy_button=True
+                 )
+
+                 # Audio Input
+                 audio_input = gr.Audio(
+                     label="🎤 Speak to AI",
+                     sources=["microphone"],
+                     type="numpy",
+                     format="wav"
+                 )
+
+                 # Control Buttons
+                 with gr.Row():
+                     submit_btn = gr.Button("💬 Process Speech", variant="primary", scale=2)
+                     clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary", scale=1)
+
+             with gr.Column(scale=1):
+                 # AI Response Audio
+                 audio_output = gr.Audio(
+                     label="🔊 AI Response",
+                     type="numpy",
+                     autoplay=True
+                 )
+
+                 # Status Display
+                 status_display = gr.Textbox(
+                     label="📊 Status",
+                     lines=3,
+                     elem_classes=["status-box"],
+                     interactive=False
+                 )
+
+                 # System Information
+                 gr.HTML(f"""
+                 <div class="status-box">
+                     <h3>🔧 System Info</h3>
+                     <p><strong>Device:</strong> {DEVICE.upper()}</p>
+                     <p><strong>Models:</strong> Parakeet ASR + DialoGPT + XTTS</p>
+                     <p><strong>Voice:</strong> Consistent Female</p>
+                     <p><strong>Memory:</strong> 4-bit Quantized</p>
+                 </div>
+                 """)
+
+         # Event Handlers
+         def process_audio(audio, history):
+             return ai_system.process_conversation(audio, history)
+
+         def clear_conversation():
+             if DEVICE == "cuda":
+                 torch.cuda.empty_cache()
+             return [], None, "Conversation cleared."
+
+         # Button Events
+         submit_btn.click(
+             fn=process_audio,
+             inputs=[audio_input, chatbot],
+             outputs=[chatbot, audio_output, status_display],
+             show_progress=True
+         )
+
+         clear_btn.click(
+             fn=clear_conversation,
+             outputs=[chatbot, audio_output, status_display]
+         )
+
+         # Auto-process when audio is recorded
+         audio_input.change(
+             fn=process_audio,
+             inputs=[audio_input, chatbot],
+             outputs=[chatbot, audio_output, status_display]
+         )
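+
+         # Note: both the button click above and this change event invoke
+         # process_audio, so a single recording can be processed twice; removing
+         # one of the two triggers would avoid duplicate turns.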
+
+         # Example Usage
+         gr.HTML("""
+         <div style="margin-top: 2rem; padding: 1rem; background: #fef3c7; border-radius: 0.5rem;">
+             <h3>💡 How to Use:</h3>
+             <ol>
+                 <li>Click the microphone button and speak clearly</li>
+                 <li>Wait for the AI to process your speech</li>
+                 <li>Listen to the AI's response in the consistent female voice</li>
+                 <li>Continue the conversation naturally</li>
+             </ol>
+         </div>
+         """)
+
+     return demo
+
+ # Launch the application
  if __name__ == "__main__":
+     print("🌟 Creating Gradio interface...")
+     demo = create_interface()
+
+     print("🚀 Launching Conversational AI...")
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True,
+         show_error=True,
+         debug=False
+     )
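+
+     # On Hugging Face Spaces the host/port above are managed by the platform
+     # and share=True is typically ignored; these settings mainly matter for
+     # local runs.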