akhaliq (HF Staff) committed c2e9ecc · verified · 1 parent: 8dbf894

Update app.py

Files changed (1):
  1. app.py +277 -254

app.py CHANGED
@@ -10,7 +10,7 @@ import librosa
 import soundfile as sf
 import torch
 from pathlib import Path
-from typing import Iterator, Dict, Any
+from typing import Iterator, Dict, Any, List

 # Clone and setup VibeVoice if not already present
 vibevoice_dir = Path('./VibeVoice')
@@ -87,6 +87,20 @@ from transformers import set_seed
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)

+# --- Helper function for audio conversion ---
+def convert_to_16_bit_wav(data: np.ndarray | torch.Tensor) -> np.ndarray:
+    """Convert audio data to 16-bit WAV format (numpy int16)."""
+    if torch.is_tensor(data):
+        data = data.detach().cpu().numpy()
+
+    data = np.array(data, dtype=np.float32)  # Ensure float32 before scaling
+
+    # Normalize to -1 to 1 if necessary
+    if np.max(np.abs(data)) > 1.0:
+        data = data / np.max(np.abs(data))
+
+    data = (data * 32767).astype(np.int16)
+    return data

 class VibeVoiceChat:
     def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):
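For reference, the new module-level helper produces exactly the `(sample_rate, int16_array)` payload that a numpy-typed Gradio audio component consumes. A minimal usage sketch (the sine-wave input is hypothetical; the scaling mirrors the helper above):

```python
import numpy as np

sr = 24000                                  # VibeVoice's output rate
t = np.linspace(0, 1, sr, endpoint=False)
wave = 0.8 * np.sin(2 * np.pi * 440 * t)    # one second of A4, floats in [-1, 1]

peak = np.max(np.abs(wave))
if peak > 1.0:                              # normalize only if the signal clips
    wave = wave / peak
pcm16 = (wave * 32767).astype(np.int16)     # same scaling as convert_to_16_bit_wav

# (sr, pcm16) is now a valid value for gr.Audio(type="numpy").
```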
@@ -96,7 +110,8 @@ class VibeVoiceChat:
         self.inference_steps = inference_steps
         self.is_generating = False
         self.stop_generation = False
-        self.current_streamer = None
+        self.current_streamer: AudioStreamer | None = None
+        self.complete_audio_buffer: List[np.ndarray] = []  # To store all generated audio for final download

         # Check GPU availability and CUDA version
         if torch.cuda.is_available():
@@ -104,10 +119,8 @@ class VibeVoiceChat:
             print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
             print(f" CUDA Version: {torch.version.cuda}")
             print(f" PyTorch CUDA: {torch.cuda.is_available()}")
-            # Set memory fraction to avoid OOM
-            torch.cuda.set_per_process_memory_fraction(0.95)
-            # Enable TF32 for faster computation on Ampere GPUs
-            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.cuda.set_per_process_memory_fraction(0.95)  # Set memory fraction to avoid OOM
+            torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for faster computation on Ampere GPUs
             torch.backends.cudnn.allow_tf32 = True
         else:
             print("✗ No GPU detected, using CPU (generation will be VERY slow)")
@@ -165,16 +178,13 @@ class VibeVoiceChat:
         load_time = time.time() - start_time
         print(f"✓ Model loaded in {load_time:.2f} seconds")

-        # Print model device
         if hasattr(self.model, 'device'):
             print(f"Model device: {self.model.device}")

     def setup_voice_presets(self):
         """Setup voice presets from the voices directory."""
-        # This assumes 'voices' directory is in the same location as the script
         voices_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "voices")

-        # Create voices directory if it doesn't exist
         if not os.path.exists(voices_dir):
             os.makedirs(voices_dir)
             print(f"Created voices directory at {voices_dir}")
@@ -183,19 +193,16 @@ class VibeVoiceChat:
         self.available_voices = {}
         audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')

-        # Scan for audio files
         for file in os.listdir(voices_dir):
             if file.lower().endswith(audio_extensions):
                 name = os.path.splitext(file)[0]
                 self.available_voices[name] = os.path.join(voices_dir, file)

-        # Sort voices alphabetically
         self.available_voices = dict(sorted(self.available_voices.items()))

         if not self.available_voices:
             print(f"Warning: No voice files found in {voices_dir}")
             print("Using default (zero) voice samples. Add audio files to the voices directory for better results.")
-            # Add a default "None" option
             self.available_voices = {"Default": None}
         else:
             print(f"Found {len(self.available_voices)} voice presets: {', '.join(self.available_voices.keys())}")
@@ -203,14 +210,13 @@ class VibeVoiceChat:
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
-            # Librosa is more robust for various audio formats
             wav, sr = librosa.load(audio_path, sr=None, mono=True)
             if sr != target_sr:
                 wav = librosa.resample(y=wav, orig_sr=sr, target_sr=target_sr)
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
-            return np.zeros(target_sr)  # Return 1 second of silence as fallback
+            return np.zeros(target_sr, dtype=np.float32)

     def format_script(self, message: str, num_speakers: int = 2) -> str:
         """Format input message into a script with speaker assignments."""
@@ -222,11 +228,9 @@ class VibeVoiceChat:
             if not line:
                 continue

-            # Check if already formatted (e.g., "Speaker 0: Hello")
             if line.startswith('Speaker ') and ':' in line:
                 formatted_lines.append(line)
             else:
-                # Auto-assign speakers in rotation
                 speaker_id = i % num_speakers
                 formatted_lines.append(f"Speaker {speaker_id}: {line}")

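`format_script` keeps pre-tagged lines and assigns the rest round-robin. A standalone sketch of that rotation (assuming the loop index enumerates all input lines, which the `i % num_speakers` in the hunk suggests):

```python
def assign_speakers(message: str, num_speakers: int = 2) -> str:
    formatted = []
    for i, line in enumerate(l.strip() for l in message.split('\n')):
        if not line:
            continue
        if line.startswith('Speaker ') and ':' in line:
            formatted.append(line)  # already tagged: keep as-is
        else:
            formatted.append(f"Speaker {i % num_speakers}: {line}")
    return '\n'.join(formatted)

print(assign_speakers("Hi there!\nHello, welcome back."))
# Speaker 0: Hi there!
# Speaker 1: Hello, welcome back.
```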
@@ -235,63 +239,50 @@ class VibeVoiceChat:
     def generate_audio_stream(
         self,
         message: str,
-        history: list,  # Keep history parameter for consistency, though not directly used for generation here
         voice_1: str,
         voice_2: str,
         num_speakers: int,
         cfg_scale: float
-    ) -> Iterator[tuple]:
+    ) -> Iterator[tuple]:  # This generator yields (sample_rate, audio_chunk_numpy_int16)
         """
-        Generate audio stream from text input.
-        Yields (sample_rate, audio_chunk_numpy) tuples as audio becomes available.
+        Generate audio stream from text input, implementing buffering for smoother streaming.
+        Yields (sample_rate, audio_chunk_numpy_int16) tuples as audio becomes available.
         """
         try:
             self.stop_generation = False
             self.is_generating = True
+            self.complete_audio_buffer = []  # Reset buffer for new generation

-            # Validate inputs
             if not message.strip():
                 self.is_generating = False
                 yield None
                 return

-            # Format the script
             formatted_script = self.format_script(message, num_speakers)
-            print(f"Formatted script:\n{formatted_script}")
-            print(f"Using device: {self.device}")
-
-            start_time = time.time()  # Start timing for the overall generation

-            # Select voices based on number of speakers
             selected_voices = []
             if voice_1 and voice_1 != "Default":
                 selected_voices.append(voice_1)
             if num_speakers > 1 and voice_2 and voice_2 != "Default":
                 selected_voices.append(voice_2)

-            # Load voice samples
             voice_samples = []
-            target_sr = 24000  # VibeVoice expects 24kHz
+            target_sr = 24000
             for i in range(num_speakers):
-                # Use the appropriate voice for each speaker
                 if i < len(selected_voices):
                     voice_name = selected_voices[i]
                     if voice_name in self.available_voices and self.available_voices[voice_name]:
                         audio_data = self.read_audio(self.available_voices[voice_name], target_sr=target_sr)
                     else:
-                        audio_data = np.zeros(target_sr, dtype=np.float32)  # Default silence
+                        audio_data = np.zeros(target_sr, dtype=np.float32)
                 else:
-                    # Fallback: use first voice or default if not enough unique voices selected
                     if selected_voices and selected_voices[0] in self.available_voices and self.available_voices[selected_voices[0]]:
                         audio_data = self.read_audio(self.available_voices[selected_voices[0]], target_sr=target_sr)
                     else:
-                        audio_data = np.zeros(target_sr, dtype=np.float32)  # Default silence
+                        audio_data = np.zeros(target_sr, dtype=np.float32)

                 voice_samples.append(audio_data)

-            print(f"Loaded {len(voice_samples)} voice samples")
-
-            # Process inputs
             inputs = self.processor(
                 text=[formatted_script],
                 voice_samples=[voice_samples],
@@ -300,14 +291,9 @@ class VibeVoiceChat:
                 return_attention_mask=True,
             )

-            # Move to device and ensure correct dtype
             if self.device == "cuda":
                 inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
-                print(f"✓ Inputs moved to GPU")
-                if torch.cuda.is_available():
-                    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

-            # Create audio streamer
             audio_streamer = AudioStreamer(
                 batch_size=1,
                 stop_signal=None,
@@ -316,7 +302,6 @@

             self.current_streamer = audio_streamer

-            # Start generation in a separate thread
             generation_thread = threading.Thread(
                 target=self._generate_with_streamer,
                 args=(inputs, cfg_scale, audio_streamer)
@@ -324,64 +309,79 @@ class VibeVoiceChat:
             generation_thread.start()

             # Give the generation thread a moment to start producing output
-            time.sleep(0.5)
+            time.sleep(1.0)  # Increased from 0.5s for stability

             audio_output_stream = audio_streamer.get_stream(0)

-            total_generated_samples = 0
+            # Buffering logic for smoother Gradio streaming
+            pending_chunks: List[np.ndarray] = []
+            min_yield_interval_seconds = 1.0  # Yield at least every 1 second
+            min_chunk_size_samples = target_sr * 0.5  # At least 0.5 seconds of audio per chunk yielded to Gradio
+            last_yield_time = time.time()

-            # Stream audio chunks
-            for audio_chunk in audio_output_stream:
+            for audio_chunk_raw in audio_output_stream:
                 if self.stop_generation:
-                    audio_streamer.end()  # Signal streamer to stop
-                    break  # Exit the loop
+                    audio_streamer.end()
+                    break

-                # Convert to numpy array (float32 is preferred by Gradio's Audio component)
-                if torch.is_tensor(audio_chunk):
-                    if audio_chunk.dtype == torch.bfloat16:
-                        audio_chunk = audio_chunk.float()
-                    audio_np = audio_chunk.cpu().numpy().astype(np.float32)
+                # Convert raw chunk to numpy float32
+                if torch.is_tensor(audio_chunk_raw):
+                    if audio_chunk_raw.dtype == torch.bfloat16:
+                        audio_chunk_raw = audio_chunk_raw.float()
+                    audio_np = audio_chunk_raw.cpu().numpy().astype(np.float32)
                 else:
-                    audio_np = np.array(audio_chunk, dtype=np.float32)
+                    audio_np = np.array(audio_chunk_raw, dtype=np.float32)

-                # Ensure 1D audio array
                 if len(audio_np.shape) > 1:
                     audio_np = audio_np.squeeze()

-                total_generated_samples += len(audio_np)
-                # Yield the audio chunk directly for Gradio's streaming audio component
-                yield (target_sr, audio_np)
-
-            # Ensure generation thread completes/cleans up
-            generation_thread.join(timeout=10.0)
-
-            generation_time = time.time() - start_time
-            audio_duration = total_generated_samples / target_sr
-
-            print(f"✓ Streaming complete:")
-            print(f" Total time: {generation_time:.2f} seconds")
-            print(f" Total audio duration: {audio_duration:.2f} seconds")
-            if generation_time > 0:
-                print(f" Real-time factor: {audio_duration/generation_time:.2f}x")
+                # Append to complete buffer (for final download)
+                self.complete_audio_buffer.append(audio_np)
+
+                # Append to pending chunks for streaming to Gradio
+                pending_chunks.append(audio_np)
+                current_pending_size = sum(len(c) for c in pending_chunks)
+
+                current_time = time.time()
+
+                should_yield = False
+                if current_pending_size >= min_chunk_size_samples:
+                    should_yield = True
+                elif (current_time - last_yield_time) >= min_yield_interval_seconds and pending_chunks:
+                    should_yield = True
+
+                if should_yield:
+                    combined_chunk = np.concatenate(pending_chunks)
+                    yield (target_sr, convert_to_16_bit_wav(combined_chunk))  # Convert to int16 before yielding
+                    pending_chunks = []
+                    last_yield_time = current_time
+
+            # Yield any remaining chunks after the loop finishes
+            if pending_chunks and not self.stop_generation:
+                combined_chunk = np.concatenate(pending_chunks)
+                yield (target_sr, convert_to_16_bit_wav(combined_chunk))
+
+            generation_thread.join(timeout=10.0)  # Ensure generation thread completes

+            # Clean up
             self.current_streamer = None
             self.is_generating = False

         except Exception as e:
-            print(f"Error in generation: {e}")
+            print(f"Error in generate_audio_stream: {e}")
             import traceback
             traceback.print_exc()
             self.is_generating = False
             self.current_streamer = None
-            yield None  # Yield None to indicate an error or end of stream
+            self.complete_audio_buffer = []  # Clear buffer on error
+            yield None

-    def _generate_with_streamer(self, inputs, cfg_scale, audio_streamer):
+    def _generate_with_streamer(self, inputs: Dict[str, Any], cfg_scale: float, audio_streamer: AudioStreamer):
         """Helper method to run generation with streamer."""
         try:
             def check_stop():
                 return self.stop_generation

-            # Use torch.cuda.amp for mixed precision if available
             if self.device == "cuda" and torch.cuda.is_available():
                 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
                     self.model.generate(
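The rewritten loop above no longer forwards every raw streamer chunk; it re-batches by size (at least 0.5 s of audio) or elapsed wall-clock time (at least 1 s), so the browser receives fewer, larger audio updates. The same policy distilled into a self-contained generator (names are illustrative, not from the app):

```python
import time
from typing import Iterator, List

import numpy as np

def batch_chunks(chunks: Iterator[np.ndarray],
                 min_samples: int = 12000,      # 0.5 s at 24 kHz
                 max_interval: float = 1.0) -> Iterator[np.ndarray]:
    pending: List[np.ndarray] = []
    last_yield = time.time()
    for chunk in chunks:
        pending.append(chunk)
        buffered = sum(len(c) for c in pending)
        now = time.time()
        if buffered >= min_samples or (now - last_yield) >= max_interval:
            yield np.concatenate(pending)       # emit one combined block
            pending, last_yield = [], now
    if pending:                                 # flush whatever remains
        yield np.concatenate(pending)
```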
@@ -412,23 +412,8 @@ class VibeVoiceChat:
             import traceback
             traceback.print_exc()
         finally:
-            # Ensure the streamer is always ended, even if generation fails
             audio_streamer.end()

-    def convert_to_16_bit_wav(self, data):
-        """Convert audio data to 16-bit WAV format."""
-        if torch.is_tensor(data):
-            data = data.detach().cpu().numpy()
-
-        data = np.array(data, dtype=np.float32)  # Ensure float32 before scaling
-
-        # Normalize to -1 to 1 if necessary
-        if np.max(np.abs(data)) > 1.0:
-            data = data / np.max(np.abs(data))
-
-        data = (data * 32767).astype(np.int16)
-        return data
-
     def stop_audio_generation(self):
         """Signal to stop the current audio generation."""
         if self.is_generating:
@@ -436,12 +421,11 @@ class VibeVoiceChat:
             self.stop_generation = True
             if self.current_streamer:
                 try:
-                    # Give a brief moment for the streamer to process remaining buffers,
-                    # then force end it if needed.
-                    time.sleep(0.1)
                     self.current_streamer.end()
                 except Exception as e:
                     print(f"Error ending streamer: {e}")
+            self.is_generating = False
+            self.complete_audio_buffer = []
         else:
             print("No active generation to stop.")

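`stop_audio_generation` only raises a flag; the generation thread and the streaming loop poll it and exit on their own. A generic sketch of this cooperative-cancellation pattern, using `threading.Event` where the app uses a plain boolean attribute:

```python
import threading
import time

stop_flag = threading.Event()

def worker():
    while not stop_flag.is_set():   # poll the flag, as the streaming loop does
        time.sleep(0.05)            # stand-in for producing one audio chunk
    print("worker exited cleanly")

t = threading.Thread(target=worker)
t.start()
time.sleep(0.2)
stop_flag.set()                     # analogous to self.stop_generation = True
t.join()
```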
@@ -449,69 +433,210 @@
 def create_chat_interface(chat_instance: VibeVoiceChat):
     """Create a simplified Gradio ChatInterface for VibeVoice with audio streaming."""

-    # Get available voices
     voice_options = list(chat_instance.available_voices.keys())
     if not voice_options:
         voice_options = ["Default"]

     default_voice_1 = voice_options[0] if len(voice_options) > 0 else "Default"
     default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]
+
+    # Custom CSS for modern aesthetics
+    custom_css = """
+    .gradio-container {
+        font-family: 'Inter', sans-serif;
+        background: linear-gradient(135deg, #f0f2f5 0%, #e0e6ed 100%);
+        color: #333;
+    }
+    .main-header {
+        background: linear-gradient(45deg, #4A00E0 0%, #8E2DE2 100%);
+        padding: 20px 30px;
+        border-radius: 15px;
+        margin-bottom: 25px;
+        text-align: center;
+        box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
+    }
+    .main-header h1 {
+        color: white;
+        font-size: 2.8em;
+        font-weight: 800;
+        margin: 0;
+        letter-spacing: -1px;
+        text-shadow: 0 3px 5px rgba(0,0,0,0.2);
+    }
+    .main-header p {
+        color: rgba(255,255,255,0.85);
+        font-size: 1.1em;
+        margin-top: 10px;
+    }
+    .settings-card, .generation-card {
+        background: rgba(255, 255, 255, 0.9);
+        border: 1px solid #dcdfe6;
+        border-radius: 12px;
+        padding: 20px;
+        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08);
+        transition: all 0.3s ease;
+    }
+    .settings-card:hover, .generation-card:hover {
+        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.12);
+        transform: translateY(-2px);
+    }
+    .gradio-output {
+        border-radius: 10px;
+        background-color: #fcfcfc;
+    }
+    .gradio-button {
+        border-radius: 8px !important;
+        font-weight: 600;
+        padding: 10px 20px;
+        transition: all 0.2s ease-in-out;
+    }
+    .gradio-button.primary {
+        background: linear-gradient(45deg, #4CAF50 0%, #8BC34A 100%) !important;
+        color: white !important;
+        border: none !important;
+    }
+    .gradio-button.primary:hover {
+        opacity: 0.9;
+        transform: translateY(-1px);
+    }
+    .gradio-button.secondary {
+        background: linear-gradient(45deg, #FF5722 0%, #FFC107 100%) !important;
+        color: white !important;
+        border: none !important;
+    }
+    .gradio-button.secondary:hover {
+        opacity: 0.9;
+        transform: translateY(-1px);
+    }
+    .gradio-button.clear {
+        background: #90A4AE !important;
+        color: white !important;
+        border: none !important;
+    }
+    .gradio-button.clear:hover {
+        opacity: 0.9;
+        transform: translateY(-1px);
+    }
+    .gradio-input {
+        border-radius: 8px !important;
+        border: 1px solid #ced4da !important;
+    }
+    .gradio-label {
+        font-weight: 700;
+        color: #495057;
+        margin-bottom: 5px;
+    }
+    .chatbot {
+        border: 1px solid #e0e0e0 !important;
+        border-radius: 10px !important;
+        box-shadow: 0 2px 10px rgba(0,0,0,0.05);
+    }
+    .log-output {
+        font-family: 'JetBrains Mono', monospace;
+        background-color: #f8f9fa !important;
+        border-radius: 8px !important;
+        border: 1px solid #e9ecef !important;
+        color: #495057 !important;
+        min-height: 80px;
+    }
+    .audio-output {
+        border-radius: 10px !important;
+        border: 1px solid #e0e0e0 !important;
+        background-color: #f8f9fa !important;
+    }
+    """

-    # Generator function to handle both chatbot updates and audio streaming
-    def process_and_display_stream(message_text: str, history: list, voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
-        """
-        Processes the input message, updates the chatbot, and streams audio output.
-        This function is a generator that yields updates for multiple Gradio components.
-        """
-        # Add user message to history immediately
+    # Gradio handler function that coordinates UI updates
+    def process_and_display_stream(message_text: str, history: List[Dict[str, str]], voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
         history = history or []
-        history.append({"role": "user", "content": message_text})
-
-        # Yield initial state: updated chatbot, clear text input, disable input,
-        # make audio component visible but initially empty.
-        yield history, gr.update(value="", interactive=False), gr.update(visible=True, value=None)
+        user_message_entry = {"role": "user", "content": message_text}
+
+        # Initial state: user message added, text input disabled, buttons updated, audio cleared/hidden
+        # This yield ensures immediate UI feedback
+        yield (
+            history + [user_message_entry],  # Add user message to chatbot immediately
+            gr.update(value="", interactive=False),  # Clear text input and disable
+            gr.update(value=None, visible=True),  # Clear streaming audio, make visible
+            gr.update(value=None, visible=False),  # Clear complete audio, hide
+            "🎙️ Starting audio generation...",  # Initial log message
+            gr.update(interactive=False, value="Generating..."),  # Disable submit button
+            gr.update(visible=True)  # Show stop button
+        )

-        # Generate audio stream using the VibeVoiceChat instance
+        log_message = ""
+        generated_any_audio = False
+
+        # Call the chat_instance's audio generator
         audio_stream_generator = chat_instance.generate_audio_stream(
-            message_text, history, voice_1, voice_2, num_speakers, cfg_scale
+            message_text, voice_1, voice_2, num_speakers, cfg_scale
         )

-        generated_any_audio = False  # Flag to track if any audio chunks were yielded
-
-        # Iterate through audio chunks and yield for the audio component
+        # Loop through the streaming audio chunks
         for chunk_data in audio_stream_generator:
             if chat_instance.stop_generation:
-                break  # Break if stop button was pressed
+                log_message = "🛑 Audio generation stopped."
+                break
+
             if chunk_data is not None:
                 generated_any_audio = True
-                # Yield the current history (remains static during audio streaming),
-                # keep msg input disabled, and pass the audio chunk for gr.Audio.
-                yield history, gr.update(interactive=False), chunk_data
+                log_message = "🎵 Streaming audio..."
+                # Yield current chunk to streaming audio component
+                # Other components remain static during streaming
+                yield (
+                    history + [user_message_entry],  # Chatbot state
+                    gr.update(interactive=False),  # Text input disabled
+                    chunk_data,  # Streaming audio chunk
+                    gr.update(visible=False),  # Complete audio hidden
+                    log_message,  # Log update
+                    gr.update(interactive=False, value="Generating..."),  # Submit button still disabled
+                    gr.update(visible=True)  # Stop button still visible
+                )
             else:
-                # If chunk_data is None, it indicates an error or end of stream
+                # None indicates an error or unexpected end from the generator
+                log_message = "❌ Error during audio generation."
                 break
-
-        # After audio generation is complete (or stopped/failed)
-        # Add assistant message to chatbot and re-enable text input.
-        if generated_any_audio and not chat_instance.stop_generation:
-            history.append({"role": "assistant", "content": f"🎵 Audio generated successfully"})
-        elif chat_instance.stop_generation:
-            history.append({"role": "assistant", "content": f"🚫 Audio generation stopped"})
-            chat_instance.stop_generation = False  # Reset stop flag for the next generation
+
+        # After generation (or stop/error), prepare final updates
+        final_chatbot_history = history + [user_message_entry]
+        final_streaming_audio_update = gr.update(value=None, visible=False)  # Hide streaming audio
+        final_complete_audio_update = gr.update(value=None, visible=False)  # Default to hidden
+
+        if chat_instance.stop_generation:
+            final_chatbot_history.append({"role": "assistant", "content": "🚫 Audio generation stopped."})
+            log_message = "🛑 Generation stopped by user."
+            chat_instance.stop_generation = False  # Reset flag for next run
+        elif generated_any_audio and chat_instance.complete_audio_buffer:
+            # Concatenate all collected audio chunks for the final downloadable audio
+            complete_audio_data_np = np.concatenate(chat_instance.complete_audio_buffer)
+            final_complete_audio_update = gr.update(value=(24000, convert_to_16_bit_wav(complete_audio_data_np)), visible=True)
+            final_chatbot_history.append({"role": "assistant", "content": "✅ Audio generated successfully! Listen below and download."})
+            log_message = "✨ Generation complete! See 'Complete Audio' below."
         else:
-            history.append({"role": "assistant", "content": "Failed to generate audio"})
+            final_chatbot_history.append({"role": "assistant", "content": "❌ Failed to generate audio."})
+            log_message = "❌ Generation failed or no audio produced."

-        # Final yield: updated chatbot, re-enabled input, and keep audio output visible
-        # The gr.Audio component will retain the last streamed content.
-        yield history, gr.update(value="", interactive=True), gr.update(visible=True)
-
+        # Final yield to update all components after streaming
+        yield (
+            final_chatbot_history,
+            gr.update(value="", interactive=True),  # Re-enable text input
+            final_streaming_audio_update,
+            final_complete_audio_update,
+            log_message,
+            gr.update(interactive=True, value="🎵 Generate Audio"),  # Re-enable submit button
+            gr.update(visible=False)  # Hide stop button
+        )

-    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"), fill_height=True) as interface:
-        gr.Markdown("# 🎙️ VibeVoice Chat - Streamed Audio\nGenerate natural dialogue audio with AI voices")
+    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"), fill_height=True, css=custom_css) as interface:
+        gr.HTML("""
+            <div class="main-header">
+                <h1>🎙️ VibeVoice Chat - Streamed Audio</h1>
+                <p>Generate natural dialogue audio with AI voices</p>
+            </div>
+        """)

         with gr.Row():
-            with gr.Column(scale=1):
-                gr.Markdown("### Voice & Generation Settings")
+            with gr.Column(scale=1, elem_classes="settings-card"):
+                gr.Markdown("### 🎛️ **Voice & Generation Settings**")

                 voice_1 = gr.Dropdown(
                     choices=voice_options,
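The rewritten `process_and_display_stream` is a generator whose every `yield` is a tuple with one slot per output component; Gradio applies each tuple as a partial UI refresh mid-stream. A toy sketch of the same pattern (the components and handler here are hypothetical, not part of the app):

```python
import gradio as gr

def stream_handler(text):
    # Tuple slots correspond 1:1 to the `outputs` list below: (textbox, log).
    yield gr.update(interactive=False), "starting..."
    for word in text.split():
        yield gr.update(interactive=False), f"processing {word}"
    yield gr.update(value="", interactive=True), "done"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    log = gr.Textbox(label="Log", interactive=False)
    box.submit(stream_handler, inputs=box, outputs=[box, log])

# demo.launch()  # uncomment to run locally
```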
@@ -545,139 +670,37 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
                     info="Guidance strength (higher = more adherence to text)"
                 )

-            with gr.Column(scale=2):
+            with gr.Column(scale=2, elem_classes="generation-card"):
                 chatbot = gr.Chatbot(
                     label="Conversation",
-                    height=400,
+                    height=300,  # Adjusted height
                     type="messages",
-                    elem_id="chatbot"
+                    elem_id="chatbot",
+                    elem_classes="chatbot"
                 )

                 msg = gr.Textbox(
                     label="Message",
                     placeholder="Type your message or paste a script...",
-                    lines=3
+                    lines=3,
+                    elem_classes="gradio-input"
                 )

-                # Gradio's gr.Audio component automatically handles streaming when a generator
-                # function yields (sample_rate, numpy_array) tuples.
-                audio_output = gr.Audio(
-                    label="Generated Audio",
-                    autoplay=True,
-                    streaming=True,  # Explicitly setting streaming=True, though often inferred.
-                    visible=False  # Initially hide the audio player
+                # Log output for generation status
+                log_output = gr.Textbox(
+                    label="Generation Log",
+                    lines=2,
+                    max_lines=5,
+                    interactive=False,
+                    value="Ready to generate audio.",
+                    elem_classes="log-output"
                 )

-                with gr.Row():
-                    submit_btn = gr.Button("🎵 Generate Audio", variant="primary")
-                    stop_btn = gr.Button("🛑 Stop Generation", variant="secondary")
-                    clear_btn = gr.Button("🗑️ Clear")
-
-                # Example messages
-                gr.Examples(
-                    examples=[
-                        "Hello! How are you doing today?",
-                        "Speaker 0: Welcome to our podcast!\nSpeaker 1: Thanks for having me!",
-                        "Tell me an interesting fact about space.",
-                        "What's your favorite type of music and why?",
-                    ],
-                    inputs=msg,
-                    label="Example Messages"
-                )
-
-        # Set up event handlers for the buttons and text input
-        submit_btn.click(
-            fn=process_and_display_stream,
-            inputs=[msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
-            outputs=[chatbot, msg, audio_output],
-            queue=True  # Queue allows processing requests sequentially
-        )
-
-        # Allow submitting message by pressing Enter in the textbox
-        msg.submit(
-            fn=process_and_display_stream,
-            inputs=[msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
-            outputs=[chatbot, msg, audio_output],
-            queue=True
-        )
-
-        # Clear button functionality
-        clear_btn.click(
-            lambda: ([], gr.update(value="", interactive=True), gr.update(visible=False, value=None)),
-            outputs=[chatbot, msg, audio_output]
-        )
-
-        # Stop button functionality - calls the VibeVoiceChat instance's stop method
-        stop_btn.click(
-            fn=chat_instance.stop_audio_generation,
-            inputs=[],
-            outputs=[],  # Does not update any Gradio components directly
-            queue=False  # Important: A stop button should generally not be queued.
-        )
-
-    return interface
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="VibeVoice Chat Interface")
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        default="microsoft/VibeVoice-1.5B",
-        help="Path to the VibeVoice model",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cuda" if torch.cuda.is_available() else "cpu",
-        help="Device for inference",
-    )
-    parser.add_argument(
-        "--inference_steps",
-        type=int,
-        default=5,
-        help="Number of DDPM inference steps (lower = faster, higher = better quality)",
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    """Main function to run the chat interface."""
-    args = parse_args()
-
-    set_seed(42)
-
-    print("🎙️ Initializing VibeVoice Chat Interface...")
-
-    # Initialize chat instance
-    chat_instance = VibeVoiceChat(
-        model_path=args.model_path,
-        device=args.device,
-        inference_steps=args.inference_steps
-    )
-
-    # Create interface
-    interface = create_chat_interface(chat_instance)
-
-    print(f"🚀 Launching chat interface")
-    print(f"📁 Model: {args.model_path}")
-    print(f"💻 Device: {chat_instance.device}")
-    print(f"🔢 Inference steps: {args.inference_steps}")
-    print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
-
-    if chat_instance.device == "cpu":
-        print("\n⚠️ WARNING: Running on CPU - generation will be VERY slow!")
-        print(" For faster generation, ensure you have:")
-        print(" 1. NVIDIA GPU with CUDA support")
-        print(" 2. PyTorch with CUDA installed: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
-
-    # Launch the interface
-    interface.queue(max_size=10).launch(
-        show_error=True,
-        quiet=False,
-    )
-
-
-if __name__ == "__main__":
-    main()
+
+                # Streaming audio component
+                audio_output = gr.Audio(
+                    label="Streaming Audio (Real-time Playback)",
+                    type="numpy",  # Expects (sr, np_array)
+                    streaming=True,
+                    autoplay=True,
+                    visible=True,  # Start visible but empty
+                    show_download_but