Upload app.py with huggingface_hub
app.py CHANGED
```diff
@@ -17,8 +17,6 @@ from pathlib import Path
 from typing import Iterator, Dict, Any
 
 # Clone and setup VibeVoice if not already present
-import subprocess
-
 vibevoice_dir = Path('./VibeVoice')
 if not vibevoice_dir.exists():
     print("Cloning VibeVoice repository...")
```
```diff
@@ -39,17 +37,14 @@ sys.path.insert(0, str(vibevoice_dir))
 
 # Import VibeVoice modules
 try:
-    # Try direct import first (if installed as package)
     from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
     from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
     from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
     from vibevoice.modular.streamer import AudioStreamer
 except ImportError:
     try:
-        # Try importing from the cloned directory
         import importlib.util
 
-        # Load modules directly from the VibeVoice directory
         def load_module(module_name, file_path):
             spec = importlib.util.spec_from_file_location(module_name, file_path)
             module = importlib.util.module_from_spec(spec)
```
```diff
@@ -57,7 +52,6 @@ except ImportError:
             spec.loader.exec_module(module)
             return module
 
-        # Load each module
         config_module = load_module(
             "vibevoice_config",
             vibevoice_dir / "modular" / "configuration_vibevoice.py"
```
```diff
@@ -90,6 +84,7 @@ except ImportError:
         "cd VibeVoice/\n"
         "pip install -e .\n"
     )
+
 from transformers.utils import logging
 from transformers import set_seed
 
```
```diff
@@ -151,21 +146,31 @@ class VibeVoiceChat:
         """Setup voice presets from the voices directory."""
         voices_dir = os.path.join(os.path.dirname(__file__), "voices")
 
+        # Create voices directory if it doesn't exist
         if not os.path.exists(voices_dir):
-
-
-
+            os.makedirs(voices_dir)
+            print(f"Created voices directory at {voices_dir}")
+            print("Please add voice sample files (.wav, .mp3, etc.) to this directory")
 
         self.available_voices = {}
         audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
 
+        # Scan for audio files
         for file in os.listdir(voices_dir):
             if file.lower().endswith(audio_extensions):
                 name = os.path.splitext(file)[0]
                 self.available_voices[name] = os.path.join(voices_dir, file)
 
+        # Sort voices alphabetically
         self.available_voices = dict(sorted(self.available_voices.items()))
-
+
+        if not self.available_voices:
+            print(f"Warning: No voice files found in {voices_dir}")
+            print("Using default (zero) voice samples. Add audio files to the voices directory for better results.")
+            # Add a default "None" option
+            self.available_voices = {"Default": None}
+        else:
+            print(f"Found {len(self.available_voices)} voice presets: {', '.join(self.available_voices.keys())}")
 
     def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
         """Read and preprocess audio file."""
```
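The fallback added here matters downstream: a voice that maps to `None` tells the sample loader to substitute silence. A minimal standalone sketch of the scan-and-fallback logic (the function name is hypothetical, not from the app):

```python
import os

def scan_voices(voices_dir: str) -> dict:
    """Map preset name -> audio path, with the commit's 'Default' fallback."""
    exts = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
    os.makedirs(voices_dir, exist_ok=True)  # mirrors the auto-create above
    found = {
        os.path.splitext(f)[0]: os.path.join(voices_dir, f)
        for f in os.listdir(voices_dir)
        if f.lower().endswith(exts)
    }
    # A None path signals downstream code to substitute silence.
    return dict(sorted(found.items())) or {"Default": None}
```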
```diff
@@ -178,7 +183,7 @@ class VibeVoiceChat:
             return wav
         except Exception as e:
             print(f"Error reading audio {audio_path}: {e}")
-            return np.array([])
+            return np.zeros(24000)  # Return 1 second of silence as fallback
 
     def format_script(self, message: str, num_speakers: int = 2) -> str:
         """Format input message into a script with speaker assignments."""
```
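The hunk only shows the `except` branch of `read_audio`. For context, a typical load-and-resample body consistent with the new fallback might look like this; the use of librosa is an assumption, since the diff does not show how files are actually decoded:

```python
import numpy as np
import librosa  # assumption: the decoder is not shown in the diff

def read_audio(audio_path: str, target_sr: int = 24000) -> np.ndarray:
    """Load a mono waveform at target_sr; on failure return 1 s of silence."""
    try:
        wav, _ = librosa.load(audio_path, sr=target_sr, mono=True)
        return wav
    except Exception as e:
        print(f"Error reading audio {audio_path}: {e}")
        return np.zeros(target_sr)  # 24000 samples == 1 second at 24 kHz
```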
```diff
@@ -221,10 +226,13 @@ class VibeVoiceChat:
 
         # Format the script
         formatted_script = self.format_script(message, num_speakers)
+        print(f"Formatted script:\n{formatted_script}")
 
         # Select voices based on number of speakers
-        selected_voices = [voice_1]
-        if num_speakers > 1:
+        selected_voices = []
+        if voice_1 and voice_1 != "Default":
+            selected_voices.append(voice_1)
+        if num_speakers > 1 and voice_2 and voice_2 != "Default":
             selected_voices.append(voice_2)
 
         # Load voice samples
```
```diff
@@ -233,23 +241,20 @@ class VibeVoiceChat:
                 # Use the appropriate voice for each speaker
                 if i < len(selected_voices):
                     voice_name = selected_voices[i]
-
-
-                    voice_name = selected_voices[0] if selected_voices else None
-
-                if voice_name and voice_name in self.available_voices:
-                    audio_data = self.read_audio(self.available_voices[voice_name])
-                    if len(audio_data) > 0:
-                        voice_samples.append(audio_data)
+                    if voice_name in self.available_voices and self.available_voices[voice_name]:
+                        audio_data = self.read_audio(self.available_voices[voice_name])
                     else:
-
-                        voice_samples.append(np.zeros(24000))
+                        audio_data = np.zeros(24000)  # Default silence
                 else:
-                    #
-
+                    # Use first voice or default if not enough voices selected
+                    if selected_voices and selected_voices[0] in self.available_voices and self.available_voices[selected_voices[0]]:
+                        audio_data = self.read_audio(self.available_voices[selected_voices[0]])
+                    else:
+                        audio_data = np.zeros(24000)  # Default silence
+
+                voice_samples.append(audio_data)
 
-
-            voice_samples = voice_samples[:num_speakers]
+            print(f"Loaded {len(voice_samples)} voice samples")
 
             # Process inputs
             inputs = self.processor(
```
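The replacement logic reduces to: use speaker `i`'s selected voice if it has a real file, else fall back to the first selected voice, else use one second of silence. Factored as a standalone helper (hypothetical name, not in the app):

```python
import numpy as np

def pick_sample(i, selected, available, read_audio, sr=24000):
    """Voice sample for speaker i: own voice, else first voice, else silence."""
    name = selected[i] if i < len(selected) else (selected[0] if selected else None)
    path = available.get(name) if name else None
    return read_audio(path) if path else np.zeros(sr)
```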
```diff
@@ -287,11 +292,16 @@ class VibeVoiceChat:
             sample_rate = 24000
             audio_stream = audio_streamer.get_stream(0)
 
+            all_audio_chunks = []
+            chunk_count = 0
+
             for audio_chunk in audio_stream:
                 if self.stop_generation:
                     audio_streamer.end()
                     break
 
+                chunk_count += 1
+
                 # Convert to numpy
                 if torch.is_tensor(audio_chunk):
                     if audio_chunk.dtype == torch.bfloat16:
```
```diff
@@ -306,12 +316,21 @@ class VibeVoiceChat:
 
                 # Convert to 16-bit
                 audio_16bit = self.convert_to_16_bit_wav(audio_np)
+                all_audio_chunks.append(audio_16bit)
 
-
+                # Yield accumulated audio
+                if all_audio_chunks:
+                    complete_audio = np.concatenate(all_audio_chunks)
+                    yield (sample_rate, complete_audio)
 
             # Wait for generation to complete
             generation_thread.join(timeout=5.0)
 
+            # Final yield with complete audio
+            if all_audio_chunks:
+                complete_audio = np.concatenate(all_audio_chunks)
+                yield (sample_rate, complete_audio)
+
             self.current_streamer = None
             self.is_generating = False
 
```
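Note on the streaming change: every yield re-concatenates all chunks seen so far, so the consumer always receives the complete audio up to that point, at the cost of O(n²) copying over n chunks; that is acceptable for short clips. In isolation the pattern looks like this (a sketch, not the app's exact code):

```python
import numpy as np

def accumulate_stream(chunks, sample_rate=24000):
    """Yield (sr, cumulative_audio) after every chunk, as the diff does."""
    collected = []
    for chunk in chunks:
        collected.append(chunk)
        # Re-concatenating the full history each time is O(n^2) overall.
        yield (sample_rate, np.concatenate(collected))
```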
|
```diff
@@ -373,108 +392,153 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
     """Create a simplified Gradio ChatInterface for VibeVoice."""
 
     # Get available voices
-    voice_options = list(chat_instance.available_voices.keys())
-
+    voice_options = list(chat_instance.available_voices.keys())
+    if not voice_options:
+        voice_options = ["Default"]
+
+    default_voice_1 = voice_options[0] if len(voice_options) > 0 else "Default"
     default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]
 
-    # Define the chat function
-    def chat_fn(message: …):
+    # Define the chat function that returns audio
+    def chat_fn(message: str, history: list, voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
         """Process chat message and generate audio response."""
-
+
+        # Extract text from message
         if isinstance(message, dict):
             text = message.get("text", "")
         else:
             text = message
 
         if not text.strip():
-            return
+            return history + [[text, None]]
 
         try:
+            # Add the user message to history
+            history = history + [[text, None]]
+
             # Generate audio stream
             audio_generator = chat_instance.generate_audio_stream(
                 text, history, voice_1, voice_2, num_speakers, cfg_scale
            )
 
-            # …
+            # Collect all audio data
             audio_data = None
             for audio_chunk in audio_generator:
                 if audio_chunk is not None:
                     audio_data = audio_chunk
-                    break
 
-            # …
+            # Update the last message with audio response
             if audio_data:
-                …
+                # Create audio element
+                history[-1][1] = audio_data
             else:
-                …
+                history[-1][1] = "Failed to generate audio"
+
+            return history
+
         except Exception as e:
             print(f"Error in chat_fn: {e}")
             import traceback
             traceback.print_exc()
-            …
+            history[-1][1] = f"Error: {str(e)}"
+            return history
 
-    # Create the ChatInterface
-    interface = gr.ChatInterface(
-        …
+    # Create the interface using Blocks for more control
+    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")) as interface:
+        gr.Markdown("# 🎙️ VibeVoice Chat\nGenerate natural dialogue audio with AI voices")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### Voice & Generation Settings")
+
+                voice_1 = gr.Dropdown(
+                    choices=voice_options,
+                    value=default_voice_1,
+                    label="Voice 1",
+                    info="Select voice for Speaker 0"
+                )
+
+                voice_2 = gr.Dropdown(
+                    choices=voice_options,
+                    value=default_voice_2,
+                    label="Voice 2",
+                    info="Select voice for Speaker 1 (if using multiple speakers)"
+                )
+
+                num_speakers = gr.Slider(
+                    minimum=1,
+                    maximum=2,
+                    value=2,
+                    step=1,
+                    label="Number of Speakers",
+                    info="Number of speakers in the dialogue"
+                )
+
+                cfg_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=2.0,
+                    value=1.3,
+                    step=0.05,
+                    label="CFG Scale",
+                    info="Guidance strength (higher = more adherence to text)"
+                )
+
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(
+                    label="Conversation",
+                    height=400,
+                    type="tuples"
+                )
+
+                msg = gr.Textbox(
+                    label="Message",
+                    placeholder="Type your message or paste a script...",
+                    lines=3
+                )
+
+                with gr.Row():
+                    submit = gr.Button("🎵 Generate Audio", variant="primary")
+                    clear = gr.Button("🗑️ Clear")
+
+        # Example messages
+        gr.Examples(
+            examples=[
+                "Hello! How are you doing today?",
+                "Speaker 0: Welcome to our podcast!\nSpeaker 1: Thanks for having me!",
+                "Tell me an interesting fact about space.",
+                "What's your favorite type of music and why?",
+            ],
+            inputs=msg,
+            label="Example Messages"
+        )
+
+        # Event handlers
+        def user_submit(message, history, v1, v2, ns, cfg):
+            return chat_fn(message, history, v1, v2, ns, cfg)
+
+        msg.submit(
+            user_submit,
+            [msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
+            [chatbot],
+            queue=True
+        ).then(
+            lambda: "",
+            None,
+            [msg]
         )
-        …
-        autoscroll=True,
-        show_progress="minimal",
-        theme=gr.themes.Soft(
-            primary_hue="blue",
-            secondary_hue="purple"
-        ),
-        css="""
-        .gradio-container {
-            max-width: 1200px;
-            margin: auto;
-        }
-        .message {
-            font-size: 1.1em;
-        }
-        """,
-        analytics_enabled=True,
-        fill_height=True,
-        fill_width=False,
-    )
+
+        submit.click(
+            user_submit,
+            [msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
+            [chatbot],
+            queue=True
+        ).then(
+            lambda: "",
+            None,
+            [msg]
+        )
+
+        clear.click(lambda: ([], ""), None, [chatbot, msg])
 
     return interface
 
```
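The event wiring relies on Gradio's `.then()` chaining: the first step updates the chatbot, the chained step clears the textbox. A minimal self-contained sketch of the same pattern, with an echo function standing in for audio generation (assumes Gradio 4.x, where `type="tuples"` history is a list of `[user, bot]` pairs):

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="tuples")
    msg = gr.Textbox()

    def respond(message, history):
        # Placeholder for the audio-generating chat_fn in the diff.
        return history + [[message, f"echo: {message}"]]

    # Step 1 updates the chatbot; the chained .then() clears the textbox.
    msg.submit(respond, [msg, chatbot], [chatbot], queue=True).then(
        lambda: "", None, [msg]
    )

demo.launch()
```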
|
```diff
@@ -500,7 +564,6 @@ def parse_args():
         help="Number of DDPM inference steps",
     )
 
-
     return parser.parse_args()
 
 
```
```diff
@@ -528,7 +591,7 @@ def main():
     print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
 
     # Launch the interface
-    interface.launch(
+    interface.queue(max_size=10).launch(
        show_error=True,
         quiet=False,
     )
```
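`.queue(max_size=10)` backs the event handlers with a worker queue, so long generations are not cut off by request timeouts, and at most 10 jobs wait at once. Since `.queue()` returns the app object, it chains directly with `.launch()`. A self-contained sketch (the `flip` function is a placeholder):

```python
import gradio as gr

def flip(text: str) -> str:
    return text[::-1]

demo = gr.Interface(flip, gr.Textbox(), gr.Textbox())
# .queue() returns the same app, so it chains with .launch() as in the diff.
demo.queue(max_size=10).launch(show_error=True, quiet=False)
```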
|