Spaces: Running on L4
Update app.py
app.py CHANGED
@@ -1,3 +1,7 @@
+"""
+VibeVoice Simple Chat Interface - Streamlined Audio Generation Demo
+"""
+
 import argparse
 import os
 import tempfile
@@ -88,21 +92,6 @@ logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
 
 
-def convert_to_16_bit_wav(data):
-    """Convert audio data to 16-bit WAV format."""
-    if torch.is_tensor(data):
-        data = data.detach().cpu().numpy()
-
-    data = np.array(data, dtype=np.float32)
-
-    # Normalize to -1 to 1 if necessary
-    if np.max(np.abs(data)) > 1.0:
-        data = data / np.max(np.abs(data))
-
-    data = (data * 32767).astype(np.int16)
-    return data
-
-
 class VibeVoiceChat:
     def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):
         """Initialize the VibeVoice chat model."""
@@ -186,8 +175,7 @@ class VibeVoiceChat:
 
     def setup_voice_presets(self):
        """Setup voice presets from the voices directory."""
-
-        voices_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "voices")
+        voices_dir = os.path.join(os.path.dirname(__file__), "voices")
 
         # Create voices directory if it doesn't exist
         if not os.path.exists(voices_dir):
@@ -218,14 +206,15 @@ class VibeVoiceChat:
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
-
-            wav
+            wav, sr = sf.read(audio_path)
+            if len(wav.shape) > 1:
+                wav = np.mean(wav, axis=1)
             if sr != target_sr:
-                wav = librosa.resample(
+                wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
-            return np.zeros(
+            return np.zeros(24000)  # Return 1 second of silence as fallback
 
     def format_script(self, message: str, num_speakers: int = 2) -> str:
         """Format input message into a script with speaker assignments."""
@@ -237,7 +226,7 @@ class VibeVoiceChat:
             if not line:
                 continue
 
-            # Check if already formatted
+            # Check if already formatted
             if line.startswith('Speaker ') and ':' in line:
                 formatted_lines.append(line)
             else:
@@ -256,18 +245,14 @@ class VibeVoiceChat:
         num_speakers: int,
         cfg_scale: float
     ) -> Iterator[tuple]:
-        """
-        Generate audio stream from text input.
-        Yields (sample_rate, audio_chunk_numpy) tuples as audio becomes available.
-        """
+        """Generate audio stream from text input."""
         try:
             self.stop_generation = False
             self.is_generating = True
 
             # Validate inputs
             if not message.strip():
-
-                yield None, "❌ Error: Please provide a message."
+                yield None
                 return
 
             # Format the script
@@ -275,6 +260,7 @@ class VibeVoiceChat:
             print(f"Formatted script:\n{formatted_script}")
             print(f"Using device: {self.device}")
 
+            # Start timing
             start_time = time.time()
 
             # Select voices based on number of speakers
@@ -286,19 +272,20 @@ class VibeVoiceChat:
 
             # Load voice samples
             voice_samples = []
-            target_sr = 24000
             for i in range(num_speakers):
+                # Use the appropriate voice for each speaker
                 if i < len(selected_voices):
                     voice_name = selected_voices[i]
                     if voice_name in self.available_voices and self.available_voices[voice_name]:
-                        audio_data = self.read_audio(self.available_voices[voice_name]
+                        audio_data = self.read_audio(self.available_voices[voice_name])
                     else:
-                        audio_data = np.zeros(
+                        audio_data = np.zeros(24000)  # Default silence
                 else:
+                    # Use first voice or default if not enough voices selected
                     if selected_voices and selected_voices[0] in self.available_voices and self.available_voices[selected_voices[0]]:
-                        audio_data = self.read_audio(self.available_voices[selected_voices[0]]
+                        audio_data = self.read_audio(self.available_voices[selected_voices[0]])
                     else:
-                        audio_data = np.zeros(
+                        audio_data = np.zeros(24000)  # Default silence
 
                 voice_samples.append(audio_data)
 
@@ -317,6 +304,9 @@ class VibeVoiceChat:
             if self.device == "cuda":
                 inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
                 print(f"✓ Inputs moved to GPU")
+                # Check GPU memory
+                if torch.cuda.is_available():
+                    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
 
             # Create audio streamer
             audio_streamer = AudioStreamer(
@@ -327,45 +317,31 @@ class VibeVoiceChat:
 
             self.current_streamer = audio_streamer
 
-            # Start generation in
+            # Start generation in separate thread
             generation_thread = threading.Thread(
                 target=self._generate_with_streamer,
                 args=(inputs, cfg_scale, audio_streamer)
             )
             generation_thread.start()
 
-            # Wait for generation to start
+            # Wait briefly for generation to start
             time.sleep(1)
 
-            #
-
-
-                generation_thread.join(timeout=5.0)
-                self.is_generating = False
-                yield None, "🛑 Generation stopped by user"
-                return
-
-            # Get the audio stream
-            audio_output_stream = audio_streamer.get_stream(0)
+            # Stream audio chunks
+            sample_rate = 24000
+            audio_stream = audio_streamer.get_stream(0)
 
             all_audio_chunks = []
-            pending_chunks = []
             chunk_count = 0
-            last_yield_time = time.time()
-            min_yield_interval = 15
-            min_chunk_size = target_sr * 30
-
-            has_yielded_audio = False
-            has_received_chunks = False
 
-            for audio_chunk in
+            for audio_chunk in audio_stream:
                 if self.stop_generation:
                     audio_streamer.end()
                     break
-
+
                 chunk_count += 1
-                has_received_chunks = True
 
+                # Convert to numpy
                 if torch.is_tensor(audio_chunk):
                     if audio_chunk.dtype == torch.bfloat16:
                         audio_chunk = audio_chunk.float()
@@ -373,87 +349,43 @@ class VibeVoiceChat:
                 else:
                     audio_np = np.array(audio_chunk, dtype=np.float32)
 
+                # Ensure 1D
                 if len(audio_np.shape) > 1:
                     audio_np = audio_np.squeeze()
 
-
+                # Convert to 16-bit
+                audio_16bit = self.convert_to_16_bit_wav(audio_np)
                 all_audio_chunks.append(audio_16bit)
-                pending_chunks.append(audio_16bit)
-
-                pending_audio_size = sum(len(chunk) for chunk in pending_chunks)
-                current_time = time.time()
-                time_since_last_yield = current_time - last_yield_time
 
-
-                if
-
-
-                elif has_yielded_audio and (pending_audio_size >= min_chunk_size or time_since_last_yield >= min_yield_interval):
-                    should_yield = True
-
-                if should_yield and pending_chunks:
-                    new_audio = np.concatenate(pending_chunks)
-                    total_duration = sum(len(chunk) for chunk in all_audio_chunks) / target_sr
-
-                    log_update = f"🎵 Streaming: {total_duration:.1f}s generated (chunk {chunk_count})"
-                    yield (target_sr, new_audio), log_update
-
-                    pending_chunks = []
-                    last_yield_time = current_time
-
-            # Yield any remaining chunks
-            if pending_chunks:
-                final_new_audio = np.concatenate(pending_chunks)
-                total_duration = sum(len(chunk) for chunk in all_audio_chunks) / target_sr
-                log_update = f"🎵 Streaming final chunk: {total_duration:.1f}s total"
-                yield (target_sr, final_new_audio), log_update
-                has_yielded_audio = True
+                # Yield accumulated audio
+                if all_audio_chunks:
+                    complete_audio = np.concatenate(all_audio_chunks)
+                    yield (sample_rate, complete_audio)
 
             # Wait for generation to complete
             generation_thread.join(timeout=5.0)
 
-
-
-
-
-
+            # Final yield with complete audio
+            if all_audio_chunks:
+                complete_audio = np.concatenate(all_audio_chunks)
+                generation_time = time.time() - start_time
+                audio_duration = len(complete_audio) / sample_rate
+                print(f"✓ Generation complete:")
+                print(f"  Time taken: {generation_time:.2f} seconds")
+                print(f"  Audio duration: {audio_duration:.2f} seconds")
+                print(f"  Real-time factor: {audio_duration/generation_time:.2f}x")
+                yield (sample_rate, complete_audio)
+
             self.current_streamer = None
             self.is_generating = False
 
-            generation_time = time.time() - start_time
-
-            if self.stop_generation:
-                yield None, "🛑 Generation stopped by user"
-                return
-
-            if not has_received_chunks:
-                yield None, f"❌ Error: No audio chunks were received. Generation time: {generation_time:.2f}s"
-                return
-
-            if not has_yielded_audio:
-                yield None, f"❌ Error: Audio was generated but not streamed. Chunk count: {chunk_count}"
-                return
-
-            if all_audio_chunks:
-                complete_audio = np.concatenate(all_audio_chunks)
-                final_duration = len(complete_audio) / target_sr
-
-                final_log = f"⏱️ Generation completed in {generation_time:.2f} seconds\n"
-                final_log += f"🎵 Final audio duration: {final_duration:.2f} seconds\n"
-                final_log += f"📊 Total chunks: {chunk_count}\n"
-                final_log += "✨ Generation successful!"
-
-                yield None, final_log
-            else:
-                yield None, "❌ No audio was generated."
-
         except Exception as e:
             print(f"Error in generation: {e}")
             import traceback
             traceback.print_exc()
             self.is_generating = False
             self.current_streamer = None
-            yield None
+            yield None
 
     def _generate_with_streamer(self, inputs, cfg_scale, audio_streamer):
         """Helper method to run generation with streamer."""
@@ -461,9 +393,10 @@ class VibeVoiceChat:
             def check_stop():
                 return self.stop_generation
 
+            # Use torch.cuda.amp for mixed precision if available
            if self.device == "cuda" and torch.cuda.is_available():
                 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                    self.model.generate(
+                    outputs = self.model.generate(
                         **inputs,
                         max_new_tokens=None,
                         cfg_scale=cfg_scale,
@@ -475,7 +408,7 @@ class VibeVoiceChat:
                         refresh_negative=True,
                     )
             else:
-                self.model.generate(
+                outputs = self.model.generate(
                     **inputs,
                     max_new_tokens=None,
                     cfg_scale=cfg_scale,
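
Note on the autocast context kept above: recent PyTorch releases (2.x) deprecate torch.cuda.amp.autocast in favor of torch.amp.autocast. A minimal equivalent sketch, assuming PyTorch 2.x; it is not part of this commit:

    import torch

    # Same bfloat16 mixed-precision context, written with the newer torch.amp API.
    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        ...  # self.model.generate(**inputs, ...) would run here unchanged
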
@@ -490,25 +423,35 @@ class VibeVoiceChat:
             print(f"Error in generation thread: {e}")
             import traceback
             traceback.print_exc()
-        finally:
             audio_streamer.end()
 
+    def convert_to_16_bit_wav(self, data):
+        """Convert audio data to 16-bit WAV format."""
+        if torch.is_tensor(data):
+            data = data.detach().cpu().numpy()
+
+        data = np.array(data)
+
+        if np.max(np.abs(data)) > 1.0:
+            data = data / np.max(np.abs(data))
+
+        data = (data * 32767).astype(np.int16)
+        return data
+
     def stop_audio_generation(self):
-        """
-
-
-
-
-
-
-                    self.current_streamer.end()
-            except Exception as e:
-                print(f"Error ending streamer: {e}")
+        """Stop the current audio generation."""
+        self.stop_generation = True
+        if self.current_streamer:
+            try:
+                self.current_streamer.end()
+            except:
+                pass
 
 
 def create_chat_interface(chat_instance: VibeVoiceChat):
-    """Create a Gradio ChatInterface for VibeVoice
+    """Create a simplified Gradio ChatInterface for VibeVoice."""
 
+    # Get available voices
     voice_options = list(chat_instance.available_voices.keys())
     if not voice_options:
         voice_options = ["Default"]
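
The convert_to_16_bit_wav method added above rescales float audio only when its peak exceeds 1.0 and then maps it to int16 full scale. A quick standalone check of that scaling, in plain numpy (illustrative only, not part of app.py):

    import numpy as np

    def to_int16(data):
        # Mirrors the method above for plain numpy input: normalize if needed, then scale.
        data = np.array(data)
        if np.max(np.abs(data)) > 1.0:
            data = data / np.max(np.abs(data))
        return (data * 32767).astype(np.int16)

    print(to_int16(np.array([0.0, 0.5, -1.0])))  # -> [     0  16383 -32767]
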
@@ -516,40 +459,51 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
     default_voice_1 = voice_options[0] if len(voice_options) > 0 else "Default"
     default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-    @keyframes pulse {
-        0% { opacity: 1; transform: scale(1); }
-        50% { opacity: 0.5; transform: scale(1.1); }
-        100% { opacity: 1; transform: scale(1); }
-    }
-
-    .streaming-status {
-        background: linear-gradient(135deg, #dcfce7 0%, #bbf7d0 100%);
-        border: 1px solid rgba(34, 197, 94, 0.3);
-        border-radius: 8px;
-        padding: 0.75rem;
-        margin: 0.5rem 0;
-        text-align: center;
-        font-size: 0.9rem;
-        color: #166534;
-    }
-    """
-
-    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
-                   css=custom_css, fill_height=True) as interface:
+    # Define the chat function that returns audio
+    def chat_fn(message: str, history: list, voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
+        """Process chat message and generate audio response."""
+
+        # Extract text from message
+        if isinstance(message, dict):
+            text = message.get("text", "")
+        else:
+            text = message
+
+        if not text.strip():
+            return ""
 
-
+        try:
+            # Generate audio stream
+            audio_generator = chat_instance.generate_audio_stream(
+                text, history, voice_1, voice_2, num_speakers, cfg_scale
+            )
+
+            # Collect all audio data
+            audio_data = None
+            for audio_chunk in audio_generator:
+                if audio_chunk is not None:
+                    audio_data = audio_chunk
+
+            # Return audio file path or error message
+            if audio_data:
+                # Save audio to temporary file
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    sample_rate, audio_array = audio_data
+                    sf.write(tmp_file.name, audio_array, sample_rate)
+                    # Return the file path directly
+                    return tmp_file.name
+            else:
+                return "Failed to generate audio"
+
+        except Exception as e:
+            print(f"Error in chat_fn: {e}")
+            import traceback
+            traceback.print_exc()
+            return f"Error: {str(e)}"
+
+    # Create the interface using Blocks for more control
+    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"), fill_height=True) as interface:
+        gr.Markdown("# 🎙️ VibeVoice Chat\nGenerate natural dialogue audio with AI voices")
 
         with gr.Row():
             with gr.Column(scale=1):
@@ -601,33 +555,18 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
                     lines=3
                 )
 
-                # Streaming status indicator
-                streaming_status = gr.HTML(
-                    value="""
-                    <div class="streaming-status">
-                        <span class="streaming-indicator"></span>
-                        <strong>LIVE STREAMING</strong> - Audio is being generated in real-time
-                    </div>
-                    """,
-                    visible=False,
-                    elem_id="streaming-status"
-                )
-
-                # Audio output with streaming enabled
                 audio_output = gr.Audio(
                     label="Generated Audio",
-                    type="
-                    streaming=True,
+                    type="filepath",
                     autoplay=True,
-                    show_download_button=False,
                     visible=False
                 )
 
                 with gr.Row():
-
-
-                    clear_btn = gr.Button("🗑️ Clear")
+                    submit = gr.Button("🎵 Generate Audio", variant="primary")
+                    clear = gr.Button("🗑️ Clear")
 
+                # Example messages
                 gr.Examples(
                     examples=[
                         "Hello! How are you doing today?",
@@ -639,90 +578,39 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
                     label="Example Messages"
                 )
 
-
-
+        # Set up event handlers
+        def process_and_display(message, history, voice_1, voice_2, num_speakers, cfg_scale):
+            """Process message and update both chatbot and audio."""
+            # Add user message to history
             history = history or []
-            history.append({"role": "user", "content":
+            history.append({"role": "user", "content": message})
 
-            #
-
-
-            # Generate audio stream
-            for audio_chunk, log_message in chat_instance.generate_audio_stream(
-                message_text, history, voice_1, voice_2, num_speakers, cfg_scale
-            ):
-                if audio_chunk is not None:
-                    # Streaming audio chunk received
-                    yield history, gr.update(interactive=False), audio_chunk, gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
-                else:
-                    # Final status message or error
-                    if log_message and "❌" in log_message:
-                        # Error case
-                        history.append({"role": "assistant", "content": log_message})
-                        yield history, gr.update(interactive=True), None, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
-                    elif log_message and "🛑" in log_message:
-                        # Stopped case
-                        history.append({"role": "assistant", "content": log_message})
-                        yield history, gr.update(interactive=True), None, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
-                    elif log_message:
-                        # Final success message
-                        history.append({"role": "assistant", "content": "🎵 Audio generated successfully!"})
-                        yield history, gr.update(interactive=True), None, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+            # Generate audio
+            audio_path = chat_fn(message, history, voice_1, voice_2, num_speakers, cfg_scale)
 
-            #
-
-
-
-
-
-
+            # Add assistant response with audio
+            if audio_path and audio_path.endswith('.wav'):
+                history.append({"role": "assistant", "content": f"🎵 Audio generated successfully"})
+                return history, audio_path, gr.update(visible=True), ""
+            else:
+                history.append({"role": "assistant", "content": audio_path or "Failed to generate audio"})
+                return history, None, gr.update(visible=False), ""
 
-
-
-            return None
-
-        # Event handlers
-        submit_btn.click(
-            fn=clear_audio_outputs,
-            inputs=[],
-            outputs=[audio_output],
-            queue=False
-        ).then(
-            fn=process_and_display_stream,
+        submit.click(
+            fn=process_and_display,
             inputs=[msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
-            outputs=[chatbot,
+            outputs=[chatbot, audio_output, audio_output, msg],
             queue=True
         )
 
         msg.submit(
-            fn=
-            inputs=[],
-            outputs=[audio_output],
-            queue=False
-        ).then(
-            fn=process_and_display_stream,
+            fn=process_and_display,
             inputs=[msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
-            outputs=[chatbot,
+            outputs=[chatbot, audio_output, audio_output, msg],
             queue=True
         )
 
-
-            fn=stop_generation_handler,
-            inputs=[],
-            outputs=[streaming_status, submit_btn, stop_btn],
-            queue=False
-        ).then(
-            fn=lambda: None,
-            inputs=[],
-            outputs=[audio_output],
-            queue=False
-        )
-
-        clear_btn.click(
-            lambda: ([], gr.update(value="", interactive=True), gr.update(visible=False, value=None),
-                     gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)),
-            outputs=[chatbot, msg, audio_output, streaming_status, submit_btn, stop_btn]
-        )
+        clear.click(lambda: ([], None, gr.update(visible=False)), outputs=[chatbot, audio_output, audio_output])
 
     return interface
 
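
The rewiring above replaces the streamed-chunk audio component with a temporary WAV file whose path is handed to gr.Audio(type="filepath"). A self-contained sketch of that pattern, with illustrative names and assuming gradio, soundfile, and numpy are installed (it is not the app's actual code):

    import tempfile
    import numpy as np
    import soundfile as sf
    import gradio as gr

    def speak(_message: str) -> str:
        sr = 24000
        t = np.linspace(0, 1.0, sr, endpoint=False)
        wav = 0.2 * np.sin(2 * np.pi * 440 * t)  # one second of placeholder audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, wav, sr)             # write the WAV to a temp file
            return f.name                         # Gradio loads it back by path

    with gr.Blocks() as demo:
        box = gr.Textbox(label="Message")
        audio = gr.Audio(label="Generated Audio", type="filepath", autoplay=True)
        box.submit(speak, inputs=box, outputs=audio)

    # demo.launch()  # uncomment to run locally
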
@@ -777,6 +665,9 @@ def main():
 
     if chat_instance.device == "cpu":
         print("\n⚠️ WARNING: Running on CPU - generation will be VERY slow!")
+        print("   For faster generation, ensure you have:")
+        print("   1. NVIDIA GPU with CUDA support")
+        print("   2. PyTorch with CUDA installed: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
 
     # Launch the interface
     interface.queue(max_size=10).launch(