Upload app.py with huggingface_hub
app.py CHANGED
```diff
@@ -17,8 +17,6 @@ from pathlib import Path
 from typing import Iterator, Dict, Any
 
 # Clone and setup VibeVoice if not already present
-import subprocess
-
 vibevoice_dir = Path('./VibeVoice')
 if not vibevoice_dir.exists():
     print("Cloning VibeVoice repository...")
```
```diff
@@ -39,17 +37,14 @@ sys.path.insert(0, str(vibevoice_dir))
 
 # Import VibeVoice modules
 try:
-    # Try direct import first (if installed as package)
     from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
     from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
     from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
     from vibevoice.modular.streamer import AudioStreamer
 except ImportError:
     try:
-        # Try importing from the cloned directory
         import importlib.util
 
-        # Load modules directly from the VibeVoice directory
         def load_module(module_name, file_path):
             spec = importlib.util.spec_from_file_location(module_name, file_path)
             module = importlib.util.module_from_spec(spec)
```
```diff
@@ -57,7 +52,6 @@ except ImportError:
             spec.loader.exec_module(module)
             return module
 
-        # Load each module
         config_module = load_module(
             "vibevoice_config",
             vibevoice_dir / "modular" / "configuration_vibevoice.py"
```
```diff
@@ -90,6 +84,7 @@ except ImportError:
         "cd VibeVoice/\n"
         "pip install -e .\n"
     )
+
 from transformers.utils import logging
 from transformers import set_seed
 
```
```diff
@@ -151,21 +146,31 @@ class VibeVoiceChat:
         """Setup voice presets from the voices directory."""
         voices_dir = os.path.join(os.path.dirname(__file__), "voices")
 
+        # Create voices directory if it doesn't exist
         if not os.path.exists(voices_dir):
-
-
-
+            os.makedirs(voices_dir)
+            print(f"Created voices directory at {voices_dir}")
+            print("Please add voice sample files (.wav, .mp3, etc.) to this directory")
 
         self.available_voices = {}
         audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
 
+        # Scan for audio files
         for file in os.listdir(voices_dir):
             if file.lower().endswith(audio_extensions):
                 name = os.path.splitext(file)[0]
                 self.available_voices[name] = os.path.join(voices_dir, file)
 
+        # Sort voices alphabetically
         self.available_voices = dict(sorted(self.available_voices.items()))
-
+
+        if not self.available_voices:
+            print(f"Warning: No voice files found in {voices_dir}")
+            print("Using default (zero) voice samples. Add audio files to the voices directory for better results.")
+            # Add a default "None" option
+            self.available_voices = {"Default": None}
+        else:
+            print(f"Found {len(self.available_voices)} voice presets: {', '.join(self.available_voices.keys())}")
 
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
```
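The fallback added here matters downstream: a voice that maps to `None` tells the sample loader to substitute silence. A minimal standalone sketch of the scan-and-fallback logic (the function name is hypothetical, not from the app):

```python
import os

def scan_voices(voices_dir: str) -> dict:
    """Map preset name -> audio path, with the commit's 'Default' fallback."""
    exts = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
    os.makedirs(voices_dir, exist_ok=True)  # mirrors the auto-create above
    found = {
        os.path.splitext(f)[0]: os.path.join(voices_dir, f)
        for f in os.listdir(voices_dir)
        if f.lower().endswith(exts)
    }
    # A None path signals downstream code to substitute silence.
    return dict(sorted(found.items())) or {"Default": None}
```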
```diff
@@ -178,7 +183,7 @@ class VibeVoiceChat:
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
-            return np.array([])
+            return np.zeros(24000)  # Return 1 second of silence as fallback
 
     def format_script(self, message: str, num_speakers: int = 2) -> str:
         """Format input message into a script with speaker assignments."""
```
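The hunk only shows the `except` branch of `read_audio`. For context, a typical load-and-resample body consistent with the new fallback might look like this; the use of librosa is an assumption, since the diff does not show how files are actually decoded:

```python
import numpy as np
import librosa  # assumption: the decoder is not shown in the diff

def read_audio(audio_path: str, target_sr: int = 24000) -> np.ndarray:
    """Load a mono waveform at target_sr; on failure return 1 s of silence."""
    try:
        wav, _ = librosa.load(audio_path, sr=target_sr, mono=True)
        return wav
    except Exception as e:
        print(f"Error reading audio {audio_path}: {e}")
        return np.zeros(target_sr)  # 24000 samples == 1 second at 24 kHz
```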
```diff
@@ -221,10 +226,13 @@ class VibeVoiceChat:
 
         # Format the script
         formatted_script = self.format_script(message, num_speakers)
+        print(f"Formatted script:\n{formatted_script}")
 
         # Select voices based on number of speakers
-        selected_voices = [voice_1]
-        if num_speakers > 1:
+        selected_voices = []
+        if voice_1 and voice_1 != "Default":
+            selected_voices.append(voice_1)
+        if num_speakers > 1 and voice_2 and voice_2 != "Default":
             selected_voices.append(voice_2)
 
         # Load voice samples
```
```diff
@@ -233,23 +241,20 @@ class VibeVoiceChat:
                 # Use the appropriate voice for each speaker
                 if i < len(selected_voices):
                     voice_name = selected_voices[i]
-
-
-                    voice_name = selected_voices[0] if selected_voices else None
-
-                if voice_name and voice_name in self.available_voices:
-                    audio_data = self.read_audio(self.available_voices[voice_name])
-                    if len(audio_data) > 0:
-                        voice_samples.append(audio_data)
+                    if voice_name in self.available_voices and self.available_voices[voice_name]:
+                        audio_data = self.read_audio(self.available_voices[voice_name])
                     else:
-
-                        voice_samples.append(np.zeros(24000))
+                        audio_data = np.zeros(24000)  # Default silence
                 else:
-                    #
-
+                    # Use first voice or default if not enough voices selected
+                    if selected_voices and selected_voices[0] in self.available_voices and self.available_voices[selected_voices[0]]:
+                        audio_data = self.read_audio(self.available_voices[selected_voices[0]])
+                    else:
+                        audio_data = np.zeros(24000)  # Default silence
+
+                voice_samples.append(audio_data)
 
-
-            voice_samples = voice_samples[:num_speakers]
+            print(f"Loaded {len(voice_samples)} voice samples")
 
             # Process inputs
             inputs = self.processor(
```
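The replacement logic reduces to: use speaker `i`'s selected voice if it has a real file, else fall back to the first selected voice, else use one second of silence. Factored as a standalone helper (hypothetical name, not in the app):

```python
import numpy as np

def pick_sample(i, selected, available, read_audio, sr=24000):
    """Voice sample for speaker i: own voice, else first voice, else silence."""
    name = selected[i] if i < len(selected) else (selected[0] if selected else None)
    path = available.get(name) if name else None
    return read_audio(path) if path else np.zeros(sr)
```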
```diff
@@ -287,11 +292,16 @@ class VibeVoiceChat:
             sample_rate = 24000
             audio_stream = audio_streamer.get_stream(0)
 
+            all_audio_chunks = []
+            chunk_count = 0
+
             for audio_chunk in audio_stream:
                 if self.stop_generation:
                     audio_streamer.end()
                     break
 
+                chunk_count += 1
+
                 # Convert to numpy
                 if torch.is_tensor(audio_chunk):
                     if audio_chunk.dtype == torch.bfloat16:
```
```diff
@@ -306,12 +316,21 @@ class VibeVoiceChat:
 
                 # Convert to 16-bit
                 audio_16bit = self.convert_to_16_bit_wav(audio_np)
+                all_audio_chunks.append(audio_16bit)
 
-
+                # Yield accumulated audio
+                if all_audio_chunks:
+                    complete_audio = np.concatenate(all_audio_chunks)
+                    yield (sample_rate, complete_audio)
 
             # Wait for generation to complete
             generation_thread.join(timeout=5.0)
 
+            # Final yield with complete audio
+            if all_audio_chunks:
+                complete_audio = np.concatenate(all_audio_chunks)
+                yield (sample_rate, complete_audio)
+
             self.current_streamer = None
             self.is_generating = False
 
```
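Note on the streaming change: every yield re-concatenates all chunks seen so far, so the consumer always receives the complete audio up to that point, at the cost of O(n²) copying over n chunks; that is acceptable for short clips. In isolation the pattern looks like this (a sketch, not the app's exact code):

```python
import numpy as np

def accumulate_stream(chunks, sample_rate=24000):
    """Yield (sr, cumulative_audio) after every chunk, as the diff does."""
    collected = []
    for chunk in chunks:
        collected.append(chunk)
        # Re-concatenating the full history each time is O(n^2) overall.
        yield (sample_rate, np.concatenate(collected))
```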
|
```diff
@@ -373,108 +392,153 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
     """Create a simplified Gradio ChatInterface for VibeVoice."""
 
     # Get available voices
-    voice_options = list(chat_instance.available_voices.keys())
-
+    voice_options = list(chat_instance.available_voices.keys())
+    if not voice_options:
+        voice_options = ["Default"]
+
+    default_voice_1 = voice_options[0] if len(voice_options) > 0 else "Default"
     default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]
 
-    # Define the chat function
-    def chat_fn(message: …):
+    # Define the chat function that returns audio
+    def chat_fn(message: str, history: list, voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
         """Process chat message and generate audio response."""
-
+
+        # Extract text from message
         if isinstance(message, dict):
             text = message.get("text", "")
         else:
             text = message
 
         if not text.strip():
-            return
+            return history + [[text, None]]
 
         try:
+            # Add the user message to history
+            history = history + [[text, None]]
+
             # Generate audio stream
             audio_generator = chat_instance.generate_audio_stream(
                 text, history, voice_1, voice_2, num_speakers, cfg_scale
            )
 
-            # …
+            # Collect all audio data
             audio_data = None
             for audio_chunk in audio_generator:
                 if audio_chunk is not None:
                     audio_data = audio_chunk
-                    break
 
-            # …
+            # Update the last message with audio response
             if audio_data:
-                …
+                # Create audio element
+                history[-1][1] = audio_data
             else:
-                …
+                history[-1][1] = "Failed to generate audio"
+
+            return history
+
         except Exception as e:
             print(f"Error in chat_fn: {e}")
             import traceback
             traceback.print_exc()
-            …
+            history[-1][1] = f"Error: {str(e)}"
+            return history
 
-    # Create the ChatInterface
-    interface = gr.ChatInterface(
-        …
+    # Create the interface using Blocks for more control
+    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")) as interface:
+        gr.Markdown("# 🎙️ VibeVoice Chat\nGenerate natural dialogue audio with AI voices")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### Voice & Generation Settings")
+
+                voice_1 = gr.Dropdown(
+                    choices=voice_options,
+                    value=default_voice_1,
+                    label="Voice 1",
+                    info="Select voice for Speaker 0"
+                )
+
+                voice_2 = gr.Dropdown(
+                    choices=voice_options,
+                    value=default_voice_2,
+                    label="Voice 2",
+                    info="Select voice for Speaker 1 (if using multiple speakers)"
+                )
+
+                num_speakers = gr.Slider(
+                    minimum=1,
+                    maximum=2,
+                    value=2,
+                    step=1,
+                    label="Number of Speakers",
+                    info="Number of speakers in the dialogue"
+                )
+
+                cfg_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=2.0,
+                    value=1.3,
+                    step=0.05,
+                    label="CFG Scale",
+                    info="Guidance strength (higher = more adherence to text)"
+                )
+
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(
+                    label="Conversation",
+                    height=400,
+                    type="tuples"
+                )
+
+                msg = gr.Textbox(
+                    label="Message",
+                    placeholder="Type your message or paste a script...",
+                    lines=3
+                )
+
+                with gr.Row():
+                    submit = gr.Button("🎵 Generate Audio", variant="primary")
+                    clear = gr.Button("🗑️ Clear")
+
+        # Example messages
+        gr.Examples(
+            examples=[
+                "Hello! How are you doing today?",
+                "Speaker 0: Welcome to our podcast!\nSpeaker 1: Thanks for having me!",
+                "Tell me an interesting fact about space.",
+                "What's your favorite type of music and why?",
+            ],
+            inputs=msg,
+            label="Example Messages"
+        )
+
+        # Event handlers
+        def user_submit(message, history, v1, v2, ns, cfg):
+            return chat_fn(message, history, v1, v2, ns, cfg)
+
+        msg.submit(
+            user_submit,
+            [msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
+            [chatbot],
+            queue=True
+        ).then(
+            lambda: "",
+            None,
+            [msg]
         )
-        …
-        autoscroll=True,
-        show_progress="minimal",
-        theme=gr.themes.Soft(
-            primary_hue="blue",
-            secondary_hue="purple"
-        ),
-        css="""
-        .gradio-container {
-            max-width: 1200px;
-            margin: auto;
-        }
-        .message {
-            font-size: 1.1em;
-        }
-        """,
-        analytics_enabled=True,
-        fill_height=True,
-        fill_width=False,
-    )
+
+        submit.click(
+            user_submit,
+            [msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
+            [chatbot],
+            queue=True
+        ).then(
+            lambda: "",
+            None,
+            [msg]
+        )
+
+        clear.click(lambda: ([], ""), None, [chatbot, msg])
 
     return interface
 
```
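The event wiring relies on Gradio's `.then()` chaining: the first step updates the chatbot, the chained step clears the textbox. A minimal self-contained sketch of the same pattern, with an echo function standing in for audio generation (assumes Gradio 4.x, where `type="tuples"` history is a list of `[user, bot]` pairs):

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="tuples")
    msg = gr.Textbox()

    def respond(message, history):
        # Placeholder for the audio-generating chat_fn in the diff.
        return history + [[message, f"echo: {message}"]]

    # Step 1 updates the chatbot; the chained .then() clears the textbox.
    msg.submit(respond, [msg, chatbot], [chatbot], queue=True).then(
        lambda: "", None, [msg]
    )

demo.launch()
```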
|
```diff
@@ -500,7 +564,6 @@ def parse_args():
         help="Number of DDPM inference steps",
     )
 
-
     return parser.parse_args()
 
 
```
```diff
@@ -528,7 +591,7 @@ def main():
     print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
 
     # Launch the interface
-    interface.launch(
+    interface.queue(max_size=10).launch(
        show_error=True,
         quiet=False,
     )
```
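`.queue(max_size=10)` backs the event handlers with a worker queue, so long generations are not cut off by request timeouts, and at most 10 jobs wait at once. Since `.queue()` returns the app object, it chains directly with `.launch()`. A self-contained sketch (the `flip` function is a placeholder):

```python
import gradio as gr

def flip(text: str) -> str:
    return text[::-1]

demo = gr.Interface(flip, gr.Textbox(), gr.Textbox())
# .queue() returns the same app, so it chains with .launch() as in the diff.
demo.queue(max_size=10).launch(show_error=True, quiet=False)
```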
|