akhaliq (HF Staff) committed
Commit b7fc0b0 · verified · 1 Parent(s): c2e9ecc

Update app.py

Files changed (1)
  1. app.py +282 -308
app.py CHANGED
@@ -1,3 +1,7 @@
 import argparse
 import os
 import tempfile
@@ -10,7 +14,7 @@ import librosa
 import soundfile as sf
 import torch
 from pathlib import Path
-from typing import Iterator, Dict, Any, List

 # Clone and setup VibeVoice if not already present
 vibevoice_dir = Path('./VibeVoice')
@@ -87,20 +91,6 @@ from transformers import set_seed
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)

-# --- Helper function for audio conversion ---
-def convert_to_16_bit_wav(data: np.ndarray | torch.Tensor) -> np.ndarray:
-    """Convert audio data to 16-bit WAV format (numpy int16)."""
-    if torch.is_tensor(data):
-        data = data.detach().cpu().numpy()
-
-    data = np.array(data, dtype=np.float32)  # Ensure float32 before scaling
-
-    # Normalize to -1 to 1 if necessary
-    if np.max(np.abs(data)) > 1.0:
-        data = data / np.max(np.abs(data))
-
-    data = (data * 32767).astype(np.int16)
-    return data

 class VibeVoiceChat:
     def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):
@@ -110,8 +100,7 @@ class VibeVoiceChat:
         self.inference_steps = inference_steps
         self.is_generating = False
         self.stop_generation = False
-        self.current_streamer: AudioStreamer | None = None
-        self.complete_audio_buffer: List[np.ndarray] = []  # To store all generated audio for final download

         # Check GPU availability and CUDA version
         if torch.cuda.is_available():
@@ -119,8 +108,10 @@ class VibeVoiceChat:
             print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
             print(f" CUDA Version: {torch.version.cuda}")
             print(f" PyTorch CUDA: {torch.cuda.is_available()}")
-            torch.cuda.set_per_process_memory_fraction(0.95)  # Set memory fraction to avoid OOM
-            torch.backends.cuda.matmul.allow_tf32 = True  # Enable TF32 for faster computation on Ampere GPUs
             torch.backends.cudnn.allow_tf32 = True
         else:
             print("✗ No GPU detected, using CPU (generation will be VERY slow)")
@@ -178,13 +169,15 @@ class VibeVoiceChat:
         load_time = time.time() - start_time
         print(f"✓ Model loaded in {load_time:.2f} seconds")

         if hasattr(self.model, 'device'):
             print(f"Model device: {self.model.device}")

     def setup_voice_presets(self):
         """Setup voice presets from the voices directory."""
-        voices_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "voices")

         if not os.path.exists(voices_dir):
             os.makedirs(voices_dir)
             print(f"Created voices directory at {voices_dir}")
@@ -193,16 +186,19 @@ class VibeVoiceChat:
         self.available_voices = {}
         audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')

         for file in os.listdir(voices_dir):
             if file.lower().endswith(audio_extensions):
                 name = os.path.splitext(file)[0]
                 self.available_voices[name] = os.path.join(voices_dir, file)

         self.available_voices = dict(sorted(self.available_voices.items()))

         if not self.available_voices:
             print(f"Warning: No voice files found in {voices_dir}")
             print("Using default (zero) voice samples. Add audio files to the voices directory for better results.")
             self.available_voices = {"Default": None}
         else:
             print(f"Found {len(self.available_voices)} voice presets: {', '.join(self.available_voices.keys())}")
@@ -210,13 +206,15 @@ class VibeVoiceChat:
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
-            wav, sr = librosa.load(audio_path, sr=None, mono=True)
             if sr != target_sr:
-                wav = librosa.resample(y=wav, orig_sr=sr, target_sr=target_sr)
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
-            return np.zeros(target_sr, dtype=np.float32)

     def format_script(self, message: str, num_speakers: int = 2) -> str:
         """Format input message into a script with speaker assignments."""
@@ -228,9 +226,11 @@ class VibeVoiceChat:
             if not line:
                 continue

             if line.startswith('Speaker ') and ':' in line:
                 formatted_lines.append(line)
             else:
                 speaker_id = i % num_speakers
                 formatted_lines.append(f"Speaker {speaker_id}: {line}")

@@ -239,50 +239,59 @@ class VibeVoiceChat:
     def generate_audio_stream(
         self,
         message: str,
         voice_1: str,
         voice_2: str,
         num_speakers: int,
         cfg_scale: float
-    ) -> Iterator[tuple]:  # This generator yields (sample_rate, audio_chunk_numpy_int16)
-        """
-        Generate audio stream from text input, implementing buffering for smoother streaming.
-        Yields (sample_rate, audio_chunk_numpy_int16) tuples as audio becomes available.
-        """
         try:
             self.stop_generation = False
             self.is_generating = True
-            self.complete_audio_buffer = []  # Reset buffer for new generation

             if not message.strip():
-                self.is_generating = False
                 yield None
                 return

             formatted_script = self.format_script(message, num_speakers)

             selected_voices = []
             if voice_1 and voice_1 != "Default":
                 selected_voices.append(voice_1)
             if num_speakers > 1 and voice_2 and voice_2 != "Default":
                 selected_voices.append(voice_2)

             voice_samples = []
-            target_sr = 24000
             for i in range(num_speakers):
                 if i < len(selected_voices):
                     voice_name = selected_voices[i]
                     if voice_name in self.available_voices and self.available_voices[voice_name]:
-                        audio_data = self.read_audio(self.available_voices[voice_name], target_sr=target_sr)
                     else:
-                        audio_data = np.zeros(target_sr, dtype=np.float32)
                 else:
                     if selected_voices and selected_voices[0] in self.available_voices and self.available_voices[selected_voices[0]]:
-                        audio_data = self.read_audio(self.available_voices[selected_voices[0]], target_sr=target_sr)
                     else:
-                        audio_data = np.zeros(target_sr, dtype=np.float32)

                 voice_samples.append(audio_data)

             inputs = self.processor(
                 text=[formatted_script],
                 voice_samples=[voice_samples],
@@ -291,9 +300,15 @@ class VibeVoiceChat:
                 return_attention_mask=True,
             )

             if self.device == "cuda":
                 inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}

             audio_streamer = AudioStreamer(
                 batch_size=1,
                 stop_signal=None,
@@ -302,89 +317,86 @@ class VibeVoiceChat:

             self.current_streamer = audio_streamer

             generation_thread = threading.Thread(
                 target=self._generate_with_streamer,
                 args=(inputs, cfg_scale, audio_streamer)
             )
             generation_thread.start()

-            # Give the generation thread a moment to start producing output
-            time.sleep(1.0)  # Increased from 0.5s for stability

-            audio_output_stream = audio_streamer.get_stream(0)

-            # Buffering logic for smoother Gradio streaming
-            pending_chunks: List[np.ndarray] = []
-            min_yield_interval_seconds = 1.0  # Yield at least every 1 second
-            min_chunk_size_samples = target_sr * 0.5  # At least 0.5 seconds of audio per chunk yielded to Gradio
-            last_yield_time = time.time()

-            for audio_chunk_raw in audio_output_stream:
                 if self.stop_generation:
                     audio_streamer.end()
                     break

-                # Convert raw chunk to numpy float32
-                if torch.is_tensor(audio_chunk_raw):
-                    if audio_chunk_raw.dtype == torch.bfloat16:
-                        audio_chunk_raw = audio_chunk_raw.float()
-                    audio_np = audio_chunk_raw.cpu().numpy().astype(np.float32)
                 else:
-                    audio_np = np.array(audio_chunk_raw, dtype=np.float32)

                 if len(audio_np.shape) > 1:
                     audio_np = audio_np.squeeze()

-                # Append to complete buffer (for final download)
-                self.complete_audio_buffer.append(audio_np)
-
-                # Append to pending chunks for streaming to Gradio
-                pending_chunks.append(audio_np)
-                current_pending_size = sum(len(c) for c in pending_chunks)

-                current_time = time.time()
-
-                should_yield = False
-                if current_pending_size >= min_chunk_size_samples:
-                    should_yield = True
-                elif (current_time - last_yield_time) >= min_yield_interval_seconds and pending_chunks:
-                    should_yield = True
-
-                if should_yield:
-                    combined_chunk = np.concatenate(pending_chunks)
-                    yield (target_sr, convert_to_16_bit_wav(combined_chunk))  # Convert to int16 before yielding
-                    pending_chunks = []
-                    last_yield_time = current_time

-            # Yield any remaining chunks after the loop finishes
-            if pending_chunks and not self.stop_generation:
-                combined_chunk = np.concatenate(pending_chunks)
-                yield (target_sr, convert_to_16_bit_wav(combined_chunk))

-            generation_thread.join(timeout=10.0)  # Ensure generation thread completes

-            # Clean up
             self.current_streamer = None
             self.is_generating = False

         except Exception as e:
-            print(f"Error in generate_audio_stream: {e}")
             import traceback
             traceback.print_exc()
             self.is_generating = False
             self.current_streamer = None
-            self.complete_audio_buffer = []  # Clear buffer on error
             yield None

-    def _generate_with_streamer(self, inputs: Dict[str, Any], cfg_scale: float, audio_streamer: AudioStreamer):
         """Helper method to run generation with streamer."""
         try:
             def check_stop():
                 return self.stop_generation

             if self.device == "cuda" and torch.cuda.is_available():
                 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                    self.model.generate(
                         **inputs,
                         max_new_tokens=None,
                         cfg_scale=cfg_scale,
@@ -396,7 +408,7 @@ class VibeVoiceChat:
                     refresh_negative=True,
                 )
             else:
-                self.model.generate(
                     **inputs,
                     max_new_tokens=None,
                     cfg_scale=cfg_scale,
@@ -411,232 +423,91 @@ class VibeVoiceChat:
             print(f"Error in generation thread: {e}")
             import traceback
             traceback.print_exc()
-        finally:
-            audio_streamer.end()

     def stop_audio_generation(self):
-        """Signal to stop the current audio generation."""
-        if self.is_generating:
-            print("🛑 Stop signal received.")
-            self.stop_generation = True
-            if self.current_streamer:
-                try:
-                    self.current_streamer.end()
-                except Exception as e:
-                    print(f"Error ending streamer: {e}")
-            self.is_generating = False
-            self.complete_audio_buffer = []
-        else:
-            print("No active generation to stop.")


 def create_chat_interface(chat_instance: VibeVoiceChat):
-    """Create a simplified Gradio ChatInterface for VibeVoice with audio streaming."""

     voice_options = list(chat_instance.available_voices.keys())
     if not voice_options:
         voice_options = ["Default"]

     default_voice_1 = voice_options[0] if len(voice_options) > 0 else "Default"
     default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]
-
-    # Custom CSS for modern aesthetics
-    custom_css = """
-    .gradio-container {
-        font-family: 'Inter', sans-serif;
-        background: linear-gradient(135deg, #f0f2f5 0%, #e0e6ed 100%);
-        color: #333;
-    }
-    .main-header {
-        background: linear-gradient(45deg, #4A00E0 0%, #8E2DE2 100%);
-        padding: 20px 30px;
-        border-radius: 15px;
-        margin-bottom: 25px;
-        text-align: center;
-        box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
-    }
-    .main-header h1 {
-        color: white;
-        font-size: 2.8em;
-        font-weight: 800;
-        margin: 0;
-        letter-spacing: -1px;
-        text-shadow: 0 3px 5px rgba(0,0,0,0.2);
-    }
-    .main-header p {
-        color: rgba(255,255,255,0.85);
-        font-size: 1.1em;
-        margin-top: 10px;
-    }
-    .settings-card, .generation-card {
-        background: rgba(255, 255, 255, 0.9);
-        border: 1px solid #dcdfe6;
-        border-radius: 12px;
-        padding: 20px;
-        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08);
-        transition: all 0.3s ease;
-    }
-    .settings-card:hover, .generation-card:hover {
-        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.12);
-        transform: translateY(-2px);
-    }
-    .gradio-output {
-        border-radius: 10px;
-        background-color: #fcfcfc;
-    }
-    .gradio-button {
-        border-radius: 8px !important;
-        font-weight: 600;
-        padding: 10px 20px;
-        transition: all 0.2s ease-in-out;
-    }
-    .gradio-button.primary {
-        background: linear-gradient(45deg, #4CAF50 0%, #8BC34A 100%) !important;
-        color: white !important;
-        border: none !important;
-    }
-    .gradio-button.primary:hover {
-        opacity: 0.9;
-        transform: translateY(-1px);
-    }
-    .gradio-button.secondary {
-        background: linear-gradient(45deg, #FF5722 0%, #FFC107 100%) !important;
-        color: white !important;
-        border: none !important;
-    }
-    .gradio-button.secondary:hover {
-        opacity: 0.9;
-        transform: translateY(-1px);
-    }
-    .gradio-button.clear {
-        background: #90A4AE !important;
-        color: white !important;
-        border: none !important;
-    }
-    .gradio-button.clear:hover {
-        opacity: 0.9;
-        transform: translateY(-1px);
-    }
-    .gradio-input {
-        border-radius: 8px !important;
-        border: 1px solid #ced4da !important;
-    }
-    .gradio-label {
-        font-weight: 700;
-        color: #495057;
-        margin-bottom: 5px;
-    }
-    .chatbot {
-        border: 1px solid #e0e0e0 !important;
-        border-radius: 10px !important;
-        box-shadow: 0 2px 10px rgba(0,0,0,0.05);
-    }
-    .log-output {
-        font-family: 'JetBrains Mono', monospace;
-        background-color: #f8f9fa !important;
-        border-radius: 8px !important;
-        border: 1px solid #e9ecef !important;
-        color: #495057 !important;
-        min-height: 80px;
-    }
-    .audio-output {
-        border-radius: 10px !important;
-        border: 1px solid #e0e0e0 !important;
-        background-color: #f8f9fa !important;
-    }
-    """

-    # Gradio handler function that coordinates UI updates
-    def process_and_display_stream(message_text: str, history: List[Dict[str, str]], voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
-        history = history or []
-        user_message_entry = {"role": "user", "content": message_text}
-
-        # Initial state: user message added, text input disabled, buttons updated, audio cleared/hidden
-        # This yield ensures immediate UI feedback
-        yield (
-            history + [user_message_entry],  # Add user message to chatbot immediately
-            gr.update(value="", interactive=False),  # Clear text input and disable
-            gr.update(value=None, visible=True),  # Clear streaming audio, make visible
-            gr.update(value=None, visible=False),  # Clear complete audio, hide
-            "🎙️ Starting audio generation...",  # Initial log message
-            gr.update(interactive=False, value="Generating..."),  # Disable submit button
-            gr.update(visible=True)  # Show stop button
-        )
-
-        log_message = ""
-        generated_any_audio = False

-        # Call the chat_instance's audio generator
-        audio_stream_generator = chat_instance.generate_audio_stream(
-            message_text, voice_1, voice_2, num_speakers, cfg_scale
-        )

-        # Loop through the streaming audio chunks
-        for chunk_data in audio_stream_generator:
-            if chat_instance.stop_generation:
-                log_message = "🛑 Audio generation stopped."
-                break

-            if chunk_data is not None:
-                generated_any_audio = True
-                log_message = "🎵 Streaming audio..."
-                # Yield current chunk to streaming audio component
-                # Other components remain static during streaming
-                yield (
-                    history + [user_message_entry],  # Chatbot state
-                    gr.update(interactive=False),  # Text input disabled
-                    chunk_data,  # Streaming audio chunk
-                    gr.update(visible=False),  # Complete audio hidden
-                    log_message,  # Log update
-                    gr.update(interactive=False, value="Generating..."),  # Submit button still disabled
-                    gr.update(visible=True)  # Stop button still visible
-                )
             else:
-                # None indicates an error or unexpected end from the generator
-                log_message = "❌ Error during audio generation."
-                break
-
-        # After generation (or stop/error), prepare final updates
-        final_chatbot_history = history + [user_message_entry]
-        final_streaming_audio_update = gr.update(value=None, visible=False)  # Hide streaming audio
-        final_complete_audio_update = gr.update(value=None, visible=False)  # Default to hidden
-
-        if chat_instance.stop_generation:
-            final_chatbot_history.append({"role": "assistant", "content": "🚫 Audio generation stopped."})
-            log_message = "🛑 Generation stopped by user."
-            chat_instance.stop_generation = False  # Reset flag for next run
-        elif generated_any_audio and chat_instance.complete_audio_buffer:
-            # Concatenate all collected audio chunks for the final downloadable audio
-            complete_audio_data_np = np.concatenate(chat_instance.complete_audio_buffer)
-            final_complete_audio_update = gr.update(value=(24000, convert_to_16_bit_wav(complete_audio_data_np)), visible=True)
-            final_chatbot_history.append({"role": "assistant", "content": "✅ Audio generated successfully! Listen below and download."})
-            log_message = "✨ Generation complete! See 'Complete Audio' below."
-        else:
-            final_chatbot_history.append({"role": "assistant", "content": "❌ Failed to generate audio."})
-            log_message = "❌ Generation failed or no audio produced."

-        # Final yield to update all components after streaming
-        yield (
-            final_chatbot_history,
-            gr.update(value="", interactive=True),  # Re-enable text input
-            final_streaming_audio_update,
-            final_complete_audio_update,
-            log_message,
-            gr.update(interactive=True, value="🎵 Generate Audio"),  # Re-enable submit button
-            gr.update(visible=False)  # Hide stop button
-        )
-
-    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"), fill_height=True, css=custom_css) as interface:
-        gr.HTML("""
-        <div class="main-header">
-            <h1>🎙️ VibeVoice Chat - Streamed Audio</h1>
-            <p>Generate natural dialogue audio with AI voices</p>
-        </div>
-        """)

         with gr.Row():
-            with gr.Column(scale=1, elem_classes="settings-card"):
-                gr.Markdown("### 🎛️ **Voice & Generation Settings**")

                 voice_1 = gr.Dropdown(
                     choices=voice_options,
@@ -670,37 +541,140 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
                     info="Guidance strength (higher = more adherence to text)"
                 )

-            with gr.Column(scale=2, elem_classes="generation-card"):
                 chatbot = gr.Chatbot(
                     label="Conversation",
-                    height=300,  # Adjusted height
                     type="messages",
-                    elem_id="chatbot",
-                    elem_classes="chatbot"
                 )

                 msg = gr.Textbox(
                     label="Message",
                     placeholder="Type your message or paste a script...",
-                    lines=3,
-                    elem_classes="gradio-input"
                 )

-                # Log output for generation status
-                log_output = gr.Textbox(
-                    label="Generation Log",
-                    lines=2,
-                    max_lines=5,
-                    interactive=False,
-                    value="Ready to generate audio.",
-                    elem_classes="log-output"
-                )
-
-                # Streaming audio component
                 audio_output = gr.Audio(
-                    label="Streaming Audio (Real-time Playback)",
-                    type="numpy",  # Expects (sr, np_array)
-                    streaming=True,
                     autoplay=True,
-                    visible=True,  # Start visible but empty
-                    show_download_but
+"""
+VibeVoice Simple Chat Interface - Streamlined Audio Generation Demo
+"""
+
 import argparse
 import os
 import tempfile

 import soundfile as sf
 import torch
 from pathlib import Path
+from typing import Iterator, Dict, Any

 # Clone and setup VibeVoice if not already present
 vibevoice_dir = Path('./VibeVoice')
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)


 class VibeVoiceChat:
     def __init__(self, model_path: str, device: str = "cuda", inference_steps: int = 5):

         self.inference_steps = inference_steps
         self.is_generating = False
         self.stop_generation = False
+        self.current_streamer = None

         # Check GPU availability and CUDA version
         if torch.cuda.is_available():

             print(f" Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
             print(f" CUDA Version: {torch.version.cuda}")
             print(f" PyTorch CUDA: {torch.cuda.is_available()}")
+            # Set memory fraction to avoid OOM
+            torch.cuda.set_per_process_memory_fraction(0.95)
+            # Enable TF32 for faster computation on Ampere GPUs
+            torch.backends.cuda.matmul.allow_tf32 = True
             torch.backends.cudnn.allow_tf32 = True
         else:
             print("✗ No GPU detected, using CPU (generation will be VERY slow)")
         load_time = time.time() - start_time
         print(f"✓ Model loaded in {load_time:.2f} seconds")

+        # Print model device
         if hasattr(self.model, 'device'):
             print(f"Model device: {self.model.device}")

     def setup_voice_presets(self):
         """Setup voice presets from the voices directory."""
+        voices_dir = os.path.join(os.path.dirname(__file__), "voices")

+        # Create voices directory if it doesn't exist
         if not os.path.exists(voices_dir):
             os.makedirs(voices_dir)
             print(f"Created voices directory at {voices_dir}")

         self.available_voices = {}
         audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')

+        # Scan for audio files
         for file in os.listdir(voices_dir):
             if file.lower().endswith(audio_extensions):
                 name = os.path.splitext(file)[0]
                 self.available_voices[name] = os.path.join(voices_dir, file)

+        # Sort voices alphabetically
         self.available_voices = dict(sorted(self.available_voices.items()))

         if not self.available_voices:
             print(f"Warning: No voice files found in {voices_dir}")
             print("Using default (zero) voice samples. Add audio files to the voices directory for better results.")
+            # Add a default "None" option
             self.available_voices = {"Default": None}
         else:
             print(f"Found {len(self.available_voices)} voice presets: {', '.join(self.available_voices.keys())}")
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
         try:
+            wav, sr = sf.read(audio_path)
+            if len(wav.shape) > 1:
+                wav = np.mean(wav, axis=1)
             if sr != target_sr:
+                wav = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
+            return np.zeros(24000)  # Return 1 second of silence as fallback

     def format_script(self, message: str, num_speakers: int = 2) -> str:
         """Format input message into a script with speaker assignments."""

             if not line:
                 continue

+            # Check if already formatted
             if line.startswith('Speaker ') and ':' in line:
                 formatted_lines.append(line)
             else:
+                # Auto-assign speakers in rotation
                 speaker_id = i % num_speakers
                 formatted_lines.append(f"Speaker {speaker_id}: {line}")

     def generate_audio_stream(
         self,
         message: str,
+        history: list,
         voice_1: str,
         voice_2: str,
         num_speakers: int,
         cfg_scale: float
+    ) -> Iterator[tuple]:
+        """Generate audio stream from text input."""
         try:
             self.stop_generation = False
             self.is_generating = True

+            # Validate inputs
             if not message.strip():
                 yield None
                 return

+            # Format the script
             formatted_script = self.format_script(message, num_speakers)
+            print(f"Formatted script:\n{formatted_script}")
+            print(f"Using device: {self.device}")

+            # Start timing
+            start_time = time.time()
+
+            # Select voices based on number of speakers
             selected_voices = []
             if voice_1 and voice_1 != "Default":
                 selected_voices.append(voice_1)
             if num_speakers > 1 and voice_2 and voice_2 != "Default":
                 selected_voices.append(voice_2)

+            # Load voice samples
             voice_samples = []
             for i in range(num_speakers):
+                # Use the appropriate voice for each speaker
                 if i < len(selected_voices):
                     voice_name = selected_voices[i]
                     if voice_name in self.available_voices and self.available_voices[voice_name]:
+                        audio_data = self.read_audio(self.available_voices[voice_name])
                     else:
+                        audio_data = np.zeros(24000)  # Default silence
                 else:
+                    # Use first voice or default if not enough voices selected
                     if selected_voices and selected_voices[0] in self.available_voices and self.available_voices[selected_voices[0]]:
+                        audio_data = self.read_audio(self.available_voices[selected_voices[0]])
                     else:
+                        audio_data = np.zeros(24000)  # Default silence

                 voice_samples.append(audio_data)

+            print(f"Loaded {len(voice_samples)} voice samples")
+
+            # Process inputs
             inputs = self.processor(
                 text=[formatted_script],
                 voice_samples=[voice_samples],

                 return_attention_mask=True,
             )

+            # Move to device and ensure correct dtype
             if self.device == "cuda":
                 inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+                print(f"✓ Inputs moved to GPU")
+                # Check GPU memory
+                if torch.cuda.is_available():
+                    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

+            # Create audio streamer
             audio_streamer = AudioStreamer(
                 batch_size=1,
                 stop_signal=None,

             self.current_streamer = audio_streamer

+            # Start generation in separate thread
             generation_thread = threading.Thread(
                 target=self._generate_with_streamer,
                 args=(inputs, cfg_scale, audio_streamer)
             )
             generation_thread.start()

+            # Wait briefly for generation to start
+            time.sleep(1)

+            # Stream audio chunks
+            sample_rate = 24000
+            audio_stream = audio_streamer.get_stream(0)

+            all_audio_chunks = []
+            chunk_count = 0

+            for audio_chunk in audio_stream:
                 if self.stop_generation:
                     audio_streamer.end()
                     break

+                chunk_count += 1
+
+                # Convert to numpy
+                if torch.is_tensor(audio_chunk):
+                    if audio_chunk.dtype == torch.bfloat16:
+                        audio_chunk = audio_chunk.float()
+                    audio_np = audio_chunk.cpu().numpy().astype(np.float32)
                 else:
+                    audio_np = np.array(audio_chunk, dtype=np.float32)

+                # Ensure 1D
                 if len(audio_np.shape) > 1:
                     audio_np = audio_np.squeeze()

+                # Convert to 16-bit
+                audio_16bit = self.convert_to_16_bit_wav(audio_np)
+                all_audio_chunks.append(audio_16bit)

+                # Yield accumulated audio
+                if all_audio_chunks:
+                    complete_audio = np.concatenate(all_audio_chunks)
+                    yield (sample_rate, complete_audio)

+            # Wait for generation to complete
+            generation_thread.join(timeout=5.0)

+            # Final yield with complete audio
+            if all_audio_chunks:
+                complete_audio = np.concatenate(all_audio_chunks)
+                generation_time = time.time() - start_time
+                audio_duration = len(complete_audio) / sample_rate
+                print(f"✓ Generation complete:")
+                print(f" Time taken: {generation_time:.2f} seconds")
+                print(f" Audio duration: {audio_duration:.2f} seconds")
+                print(f" Real-time factor: {audio_duration/generation_time:.2f}x")
+                yield (sample_rate, complete_audio)

             self.current_streamer = None
             self.is_generating = False

         except Exception as e:
+            print(f"Error in generation: {e}")
             import traceback
             traceback.print_exc()
             self.is_generating = False
             self.current_streamer = None

             yield None

+    def _generate_with_streamer(self, inputs, cfg_scale, audio_streamer):
         """Helper method to run generation with streamer."""
         try:
             def check_stop():
                 return self.stop_generation

+            # Use torch.cuda.amp for mixed precision if available
             if self.device == "cuda" and torch.cuda.is_available():
                 with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                    outputs = self.model.generate(
                         **inputs,
                         max_new_tokens=None,
                         cfg_scale=cfg_scale,

                     refresh_negative=True,
                 )
             else:
+                outputs = self.model.generate(
                     **inputs,
                     max_new_tokens=None,
                     cfg_scale=cfg_scale,

             print(f"Error in generation thread: {e}")
             import traceback
             traceback.print_exc()
+            audio_streamer.end()
+
+    def convert_to_16_bit_wav(self, data):
+        """Convert audio data to 16-bit WAV format."""
+        if torch.is_tensor(data):
+            data = data.detach().cpu().numpy()
+
+        data = np.array(data)
+
+        if np.max(np.abs(data)) > 1.0:
+            data = data / np.max(np.abs(data))
+
+        data = (data * 32767).astype(np.int16)
+        return data

     def stop_audio_generation(self):
+        """Stop the current audio generation."""
+        self.stop_generation = True
+        if self.current_streamer:
+            try:
+                self.current_streamer.end()
+            except:
+                pass



 def create_chat_interface(chat_instance: VibeVoiceChat):
+    """Create a simplified Gradio ChatInterface for VibeVoice."""

+    # Get available voices
     voice_options = list(chat_instance.available_voices.keys())
     if not voice_options:
         voice_options = ["Default"]

     default_voice_1 = voice_options[0] if len(voice_options) > 0 else "Default"
     default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]

+    # Define the chat function that returns audio
+    def chat_fn(message: str, history: list, voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
+        """Process chat message and generate audio response."""

+        # Extract text from message
+        if isinstance(message, dict):
+            text = message.get("text", "")
+        else:
+            text = message
+
+        if not text.strip():
+            return ""

+        try:
+            # Generate audio stream
+            audio_generator = chat_instance.generate_audio_stream(
+                text, history, voice_1, voice_2, num_speakers, cfg_scale
+            )

+            # Collect all audio data
+            audio_data = None
+            for audio_chunk in audio_generator:
+                if audio_chunk is not None:
+                    audio_data = audio_chunk
+
+            # Return audio file path or error message
+            if audio_data:
+                # Save audio to temporary file
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    sample_rate, audio_array = audio_data
+                    sf.write(tmp_file.name, audio_array, sample_rate)
+                    # Return the file path directly
+                    return tmp_file.name
             else:
+                return "Failed to generate audio"

+        except Exception as e:
+            print(f"Error in chat_fn: {e}")
+            import traceback
+            traceback.print_exc()
+            return f"Error: {str(e)}"
+
+    # Create the interface using Blocks for more control
+    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"), fill_height=True) as interface:
+        gr.Markdown("# 🎙️ VibeVoice Chat\nGenerate natural dialogue audio with AI voices")

         with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### Voice & Generation Settings")

                 voice_1 = gr.Dropdown(
                     choices=voice_options,

                     info="Guidance strength (higher = more adherence to text)"
                 )

+            with gr.Column(scale=2):
                 chatbot = gr.Chatbot(
                     label="Conversation",
+                    height=400,
                     type="messages",
+                    elem_id="chatbot"
                 )

                 msg = gr.Textbox(
                     label="Message",
                     placeholder="Type your message or paste a script...",
+                    lines=3
                 )

                 audio_output = gr.Audio(
+                    label="Generated Audio",
+                    type="filepath",
                     autoplay=True,
+                    visible=False
+                )
+
+                with gr.Row():
+                    submit = gr.Button("🎵 Generate Audio", variant="primary")
+                    clear = gr.Button("🗑️ Clear")
+
+        # Example messages
+        gr.Examples(
+            examples=[
+                "Hello! How are you doing today?",
+                "Speaker 0: Welcome to our podcast!\nSpeaker 1: Thanks for having me!",
+                "Tell me an interesting fact about space.",
+                "What's your favorite type of music and why?",
+            ],
+            inputs=msg,
+            label="Example Messages"
+        )
+
+        # Set up event handlers
+        def process_and_display(message, history, voice_1, voice_2, num_speakers, cfg_scale):
+            """Process message and update both chatbot and audio."""
+            # Add user message to history
+            history = history or []
+            history.append({"role": "user", "content": message})
+
+            # Generate audio
+            audio_path = chat_fn(message, history, voice_1, voice_2, num_speakers, cfg_scale)
+
+            # Add assistant response with audio
+            if audio_path and audio_path.endswith('.wav'):
+                history.append({"role": "assistant", "content": f"🎵 Audio generated successfully"})
+                return history, audio_path, gr.update(visible=True), ""
+            else:
+                history.append({"role": "assistant", "content": audio_path or "Failed to generate audio"})
+                return history, None, gr.update(visible=False), ""
+
+        submit.click(
+            fn=process_and_display,
+            inputs=[msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
+            outputs=[chatbot, audio_output, audio_output, msg],
+            queue=True
+        )
+
+        msg.submit(
+            fn=process_and_display,
+            inputs=[msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
+            outputs=[chatbot, audio_output, audio_output, msg],
+            queue=True
+        )
+
+        clear.click(lambda: ([], None, gr.update(visible=False)), outputs=[chatbot, audio_output, audio_output])
+
+    return interface
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="VibeVoice Chat Interface")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="microsoft/VibeVoice-1.5B",
+        help="Path to the VibeVoice model",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device for inference",
+    )
+    parser.add_argument(
+        "--inference_steps",
+        type=int,
+        default=5,
+        help="Number of DDPM inference steps (lower = faster, higher = better quality)",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    """Main function to run the chat interface."""
+    args = parse_args()
+
+    set_seed(42)
+
+    print("🎙️ Initializing VibeVoice Chat Interface...")
+
+    # Initialize chat instance
+    chat_instance = VibeVoiceChat(
+        model_path=args.model_path,
+        device=args.device,
+        inference_steps=args.inference_steps
+    )
+
+    # Create interface
+    interface = create_chat_interface(chat_instance)
+
+    print(f"🚀 Launching chat interface")
+    print(f"📝 Model: {args.model_path}")
+    print(f"💻 Device: {chat_instance.device}")
+    print(f"🔢 Inference steps: {args.inference_steps}")
+    print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
+
+    if chat_instance.device == "cpu":
+        print("\n⚠️ WARNING: Running on CPU - generation will be VERY slow!")
+        print(" For faster generation, ensure you have:")
+        print(" 1. NVIDIA GPU with CUDA support")
+        print(" 2. PyTorch with CUDA installed: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
+
+    # Launch the interface
+    interface.queue(max_size=10).launch(
+        show_error=True,
+        quiet=False,
+    )
+
+
+if __name__ == "__main__":
+    main()
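
For reference, the normalize-and-scale conversion that both versions implement (module-level `convert_to_16_bit_wav` before this commit, a method after it) can be exercised on its own. The following is a minimal, self-contained sketch of the same logic; the 440 Hz test tone and the print at the end are illustrative only and are not part of the commit. The 24 kHz rate matches the app's sample rate.

import numpy as np

def convert_to_16_bit_wav(data):
    """Scale float audio to int16 PCM, normalizing only if it would clip."""
    data = np.asarray(data, dtype=np.float32)
    peak = np.max(np.abs(data))
    if peak > 1.0:  # normalize to -1..1 only when the signal would clip
        data = data / peak
    return (data * 32767).astype(np.int16)

# One second of a 440 Hz tone at the app's 24 kHz sample rate
sr = 24000
t = np.arange(sr) / sr
tone = 0.5 * np.sin(2 * np.pi * 440 * t)
pcm = convert_to_16_bit_wav(tone)
print(pcm.dtype, pcm.min(), pcm.max())  # int16, peaks near ±16383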
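
The generation path in the new version runs model.generate on a worker thread while the caller iterates chunks off an AudioStreamer. The sketch below reproduces that producer/consumer shape with a plain queue.Queue standing in for VibeVoice's AudioStreamer, whose real API differs; it only illustrates the threading pattern, not the library's interface.

import threading
import queue
import numpy as np

def producer(q, n_chunks=5, sr=24000):
    # Stand-in for the generation thread: push float32 chunks, then a sentinel.
    for _ in range(n_chunks):
        q.put(np.zeros(sr // 10, dtype=np.float32))
    q.put(None)

def stream(q):
    # Stand-in for audio_streamer.get_stream(0): yield chunks as they arrive.
    while True:
        chunk = q.get()
        if chunk is None:
            break
        yield chunk

q = queue.Queue()
threading.Thread(target=producer, args=(q,), daemon=True).start()
total = sum(len(c) for c in stream(q))
print(f"received {total} samples")  # 5 chunks x 2400 samples = 12000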