akhaliq HF Staff committed on
Commit
972f767
·
verified ·
1 Parent(s): 31b623f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +166 -103
app.py CHANGED
@@ -17,8 +17,6 @@ from pathlib import Path
17
  from typing import Iterator, Dict, Any
18
 
19
  # Clone and setup VibeVoice if not already present
20
- import subprocess
21
-
22
  vibevoice_dir = Path('./VibeVoice')
23
  if not vibevoice_dir.exists():
24
  print("Cloning VibeVoice repository...")
@@ -39,17 +37,14 @@ sys.path.insert(0, str(vibevoice_dir))
39
 
40
  # Import VibeVoice modules
41
  try:
42
- # Try direct import first (if installed as package)
43
  from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
44
  from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
45
  from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
46
  from vibevoice.modular.streamer import AudioStreamer
47
  except ImportError:
48
  try:
49
- # Try importing from the cloned directory
50
  import importlib.util
51
 
52
- # Load modules directly from the VibeVoice directory
53
  def load_module(module_name, file_path):
54
  spec = importlib.util.spec_from_file_location(module_name, file_path)
55
  module = importlib.util.module_from_spec(spec)
@@ -57,7 +52,6 @@ except ImportError:
57
  spec.loader.exec_module(module)
58
  return module
59
 
60
- # Load each module
61
  config_module = load_module(
62
  "vibevoice_config",
63
  vibevoice_dir / "modular" / "configuration_vibevoice.py"
@@ -90,6 +84,7 @@ except ImportError:
90
  "cd VibeVoice/\n"
91
  "pip install -e .\n"
92
  )
 
93
  from transformers.utils import logging
94
  from transformers import set_seed
95
 
@@ -151,21 +146,31 @@ class VibeVoiceChat:
151
  """Setup voice presets from the voices directory."""
152
  voices_dir = os.path.join(os.path.dirname(__file__), "voices")
153
 
 
154
  if not os.path.exists(voices_dir):
155
- print(f"Warning: Voices directory not found at {voices_dir}")
156
- self.available_voices = {}
157
- return
158
 
159
  self.available_voices = {}
160
  audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
161
 
 
162
  for file in os.listdir(voices_dir):
163
  if file.lower().endswith(audio_extensions):
164
  name = os.path.splitext(file)[0]
165
  self.available_voices[name] = os.path.join(voices_dir, file)
166
 
 
167
  self.available_voices = dict(sorted(self.available_voices.items()))
168
- print(f"Found {len(self.available_voices)} voice presets")
 
 
 
 
 
 
 
169
 
170
  def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
171
  """Read and preprocess audio file."""
@@ -178,7 +183,7 @@ class VibeVoiceChat:
178
  return wav
179
  except Exception as e:
180
  print(f"Error reading audio {audio_path}: {e}")
181
- return np.array([])
182
 
183
  def format_script(self, message: str, num_speakers: int = 2) -> str:
184
  """Format input message into a script with speaker assignments."""
@@ -221,10 +226,13 @@ class VibeVoiceChat:
221
 
222
  # Format the script
223
  formatted_script = self.format_script(message, num_speakers)
 
224
 
225
  # Select voices based on number of speakers
226
- selected_voices = [voice_1]
227
- if num_speakers > 1 and voice_2:
 
 
228
  selected_voices.append(voice_2)
229
 
230
  # Load voice samples
@@ -233,23 +241,20 @@ class VibeVoiceChat:
233
  # Use the appropriate voice for each speaker
234
  if i < len(selected_voices):
235
  voice_name = selected_voices[i]
236
- else:
237
- # Reuse the first voice if we don't have enough
238
- voice_name = selected_voices[0] if selected_voices else None
239
-
240
- if voice_name and voice_name in self.available_voices:
241
- audio_data = self.read_audio(self.available_voices[voice_name])
242
- if len(audio_data) > 0:
243
- voice_samples.append(audio_data)
244
  else:
245
- # Add default audio if reading failed
246
- voice_samples.append(np.zeros(24000))
247
  else:
248
- # Add default audio if no voice available
249
- voice_samples.append(np.zeros(24000))
 
 
 
 
 
250
 
251
- # Ensure we have exactly the right number of voice samples
252
- voice_samples = voice_samples[:num_speakers]
253
 
254
  # Process inputs
255
  inputs = self.processor(
@@ -287,11 +292,16 @@ class VibeVoiceChat:
287
  sample_rate = 24000
288
  audio_stream = audio_streamer.get_stream(0)
289
 
 
 
 
290
  for audio_chunk in audio_stream:
291
  if self.stop_generation:
292
  audio_streamer.end()
293
  break
294
 
 
 
295
  # Convert to numpy
296
  if torch.is_tensor(audio_chunk):
297
  if audio_chunk.dtype == torch.bfloat16:
@@ -306,12 +316,21 @@ class VibeVoiceChat:
306
 
307
  # Convert to 16-bit
308
  audio_16bit = self.convert_to_16_bit_wav(audio_np)
 
309
 
310
- yield (sample_rate, audio_16bit)
 
 
 
311
 
312
  # Wait for generation to complete
313
  generation_thread.join(timeout=5.0)
314
 
 
 
 
 
 
315
  self.current_streamer = None
316
  self.is_generating = False
317
 
@@ -373,108 +392,153 @@ def create_chat_interface(chat_instance: VibeVoiceChat):
373
  """Create a simplified Gradio ChatInterface for VibeVoice."""
374
 
375
  # Get available voices
376
- voice_options = list(chat_instance.available_voices.keys()) if chat_instance.available_voices else ["None"]
377
- default_voice_1 = voice_options[0] if len(voice_options) > 0 else "None"
 
 
 
378
  default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]
379
 
380
- # Define the chat function
381
- def chat_fn(message: Dict[str, Any], history: list, voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
382
  """Process chat message and generate audio response."""
383
- # Extract text from message (handle both string and dict inputs)
 
384
  if isinstance(message, dict):
385
  text = message.get("text", "")
386
  else:
387
  text = message
388
 
389
  if not text.strip():
390
- return gr.Audio(value=None)
391
 
392
  try:
 
 
 
393
  # Generate audio stream
394
  audio_generator = chat_instance.generate_audio_stream(
395
  text, history, voice_1, voice_2, num_speakers, cfg_scale
396
  )
397
 
398
- # Get the first audio chunk for immediate response
399
  audio_data = None
400
  for audio_chunk in audio_generator:
401
  if audio_chunk is not None:
402
  audio_data = audio_chunk
403
- break
404
 
405
- # Return audio component
406
  if audio_data:
407
- return gr.Audio(value=audio_data, streaming=True, autoplay=True)
 
408
  else:
409
- return gr.Audio(value=None)
 
 
 
410
  except Exception as e:
411
  print(f"Error in chat_fn: {e}")
412
  import traceback
413
  traceback.print_exc()
414
- return gr.Audio(value=None)
 
415
 
416
- # Create additional inputs
417
- additional_inputs = [
418
- gr.Dropdown(
419
- choices=voice_options,
420
- value=default_voice_1,
421
- label="Voice 1",
422
- info="Select voice for Speaker 0"
423
- ),
424
- gr.Dropdown(
425
- choices=voice_options,
426
- value=default_voice_2,
427
- label="Voice 2",
428
- info="Select voice for Speaker 1 (if using multiple speakers)"
429
- ),
430
- gr.Slider(
431
- minimum=1,
432
- maximum=2,
433
- value=2,
434
- step=1,
435
- label="Number of Speakers",
436
- info="Number of speakers in the dialogue"
437
- ),
438
- gr.Slider(
439
- minimum=1.0,
440
- maximum=2.0,
441
- value=1.3,
442
- step=0.05,
443
- label="CFG Scale",
444
- info="Guidance strength (higher = more adherence to text)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  )
446
- ]
447
-
448
- # Create the ChatInterface without examples to avoid the error
449
- interface = gr.ChatInterface(
450
- fn=chat_fn,
451
- type="messages",
452
- title="🎙️ VibeVoice Chat",
453
- description="Generate natural dialogue audio with AI voices. Type your message or paste a script!",
454
- additional_inputs=additional_inputs,
455
- additional_inputs_accordion=gr.Accordion(label="Voice & Generation Settings", open=True),
456
- submit_btn="🎡 Generate Audio",
457
- stop_btn="⏹️ Stop",
458
- autofocus=True,
459
- autoscroll=True,
460
- show_progress="minimal",
461
- theme=gr.themes.Soft(
462
- primary_hue="blue",
463
- secondary_hue="purple"
464
- ),
465
- css="""
466
- .gradio-container {
467
- max-width: 1200px;
468
- margin: auto;
469
- }
470
- .message {
471
- font-size: 1.1em;
472
- }
473
- """,
474
- analytics_enabled=True,
475
- fill_height=True,
476
- fill_width=False,
477
- )
478
 
479
  return interface
480
 
@@ -500,7 +564,6 @@ def parse_args():
500
  help="Number of DDPM inference steps",
501
  )
502
 
503
-
504
  return parser.parse_args()
505
 
506
 
@@ -528,7 +591,7 @@ def main():
528
  print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
529
 
530
  # Launch the interface
531
- interface.launch(
532
  show_error=True,
533
  quiet=False,
534
  )
 
17
  from typing import Iterator, Dict, Any
18
 
19
  # Clone and setup VibeVoice if not already present
 
 
20
  vibevoice_dir = Path('./VibeVoice')
21
  if not vibevoice_dir.exists():
22
  print("Cloning VibeVoice repository...")
 
37
 
38
  # Import VibeVoice modules
39
  try:
 
40
  from vibevoice.modular.configuration_vibevoice import VibeVoiceConfig
41
  from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
42
  from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
43
  from vibevoice.modular.streamer import AudioStreamer
44
  except ImportError:
45
  try:
 
46
  import importlib.util
47
 
 
48
  def load_module(module_name, file_path):
49
  spec = importlib.util.spec_from_file_location(module_name, file_path)
50
  module = importlib.util.module_from_spec(spec)
 
52
  spec.loader.exec_module(module)
53
  return module
54
 
 
55
  config_module = load_module(
56
  "vibevoice_config",
57
  vibevoice_dir / "modular" / "configuration_vibevoice.py"
 
84
  "cd VibeVoice/\n"
85
  "pip install -e .\n"
86
  )
87
+
88
  from transformers.utils import logging
89
  from transformers import set_seed
90
 
 
146
  """Setup voice presets from the voices directory."""
147
  voices_dir = os.path.join(os.path.dirname(__file__), "voices")
148
 
149
+ # Create voices directory if it doesn't exist
150
  if not os.path.exists(voices_dir):
151
+ os.makedirs(voices_dir)
152
+ print(f"Created voices directory at {voices_dir}")
153
+ print("Please add voice sample files (.wav, .mp3, etc.) to this directory")
154
 
155
  self.available_voices = {}
156
  audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
157
 
158
+ # Scan for audio files
159
  for file in os.listdir(voices_dir):
160
  if file.lower().endswith(audio_extensions):
161
  name = os.path.splitext(file)[0]
162
  self.available_voices[name] = os.path.join(voices_dir, file)
163
 
164
+ # Sort voices alphabetically
165
  self.available_voices = dict(sorted(self.available_voices.items()))
166
+
167
+ if not self.available_voices:
168
+ print(f"Warning: No voice files found in {voices_dir}")
169
+ print("Using default (zero) voice samples. Add audio files to the voices directory for better results.")
170
+ # Add a default "None" option
171
+ self.available_voices = {"Default": None}
172
+ else:
173
+ print(f"Found {len(self.available_voices)} voice presets: {', '.join(self.available_voices.keys())}")
174
 
175
  def read_audio(self, audio_path: str, target_sr: int = 24000) -> np.ndarray:
176
  """Read and preprocess audio file."""
 
183
  return wav
184
  except Exception as e:
185
  print(f"Error reading audio {audio_path}: {e}")
186
+ return np.zeros(24000) # Return 1 second of silence as fallback
187
 
188
  def format_script(self, message: str, num_speakers: int = 2) -> str:
189
  """Format input message into a script with speaker assignments."""
 
226
 
227
  # Format the script
228
  formatted_script = self.format_script(message, num_speakers)
229
+ print(f"Formatted script:\n{formatted_script}")
230
 
231
  # Select voices based on number of speakers
232
+ selected_voices = []
233
+ if voice_1 and voice_1 != "Default":
234
+ selected_voices.append(voice_1)
235
+ if num_speakers > 1 and voice_2 and voice_2 != "Default":
236
  selected_voices.append(voice_2)
237
 
238
  # Load voice samples
 
241
  # Use the appropriate voice for each speaker
242
  if i < len(selected_voices):
243
  voice_name = selected_voices[i]
244
+ if voice_name in self.available_voices and self.available_voices[voice_name]:
245
+ audio_data = self.read_audio(self.available_voices[voice_name])
 
 
 
 
 
 
246
  else:
247
+ audio_data = np.zeros(24000) # Default silence
 
248
  else:
249
+ # Use first voice or default if not enough voices selected
250
+ if selected_voices and selected_voices[0] in self.available_voices and self.available_voices[selected_voices[0]]:
251
+ audio_data = self.read_audio(self.available_voices[selected_voices[0]])
252
+ else:
253
+ audio_data = np.zeros(24000) # Default silence
254
+
255
+ voice_samples.append(audio_data)
256
 
257
+ print(f"Loaded {len(voice_samples)} voice samples")
 
258
 
259
  # Process inputs
260
  inputs = self.processor(
 
292
  sample_rate = 24000
293
  audio_stream = audio_streamer.get_stream(0)
294
 
295
+ all_audio_chunks = []
296
+ chunk_count = 0
297
+
298
  for audio_chunk in audio_stream:
299
  if self.stop_generation:
300
  audio_streamer.end()
301
  break
302
 
303
+ chunk_count += 1
304
+
305
  # Convert to numpy
306
  if torch.is_tensor(audio_chunk):
307
  if audio_chunk.dtype == torch.bfloat16:
 
316
 
317
  # Convert to 16-bit
318
  audio_16bit = self.convert_to_16_bit_wav(audio_np)
319
+ all_audio_chunks.append(audio_16bit)
320
 
321
+ # Yield accumulated audio
322
+ if all_audio_chunks:
323
+ complete_audio = np.concatenate(all_audio_chunks)
324
+ yield (sample_rate, complete_audio)
325
 
326
  # Wait for generation to complete
327
  generation_thread.join(timeout=5.0)
328
 
329
+ # Final yield with complete audio
330
+ if all_audio_chunks:
331
+ complete_audio = np.concatenate(all_audio_chunks)
332
+ yield (sample_rate, complete_audio)
333
+
334
  self.current_streamer = None
335
  self.is_generating = False
336
 
 
392
  """Create a simplified Gradio ChatInterface for VibeVoice."""
393
 
394
  # Get available voices
395
+ voice_options = list(chat_instance.available_voices.keys())
396
+ if not voice_options:
397
+ voice_options = ["Default"]
398
+
399
+ default_voice_1 = voice_options[0] if len(voice_options) > 0 else "Default"
400
  default_voice_2 = voice_options[1] if len(voice_options) > 1 else voice_options[0]
401
 
402
+ # Define the chat function that returns audio
403
+ def chat_fn(message: str, history: list, voice_1: str, voice_2: str, num_speakers: int, cfg_scale: float):
404
  """Process chat message and generate audio response."""
405
+
406
+ # Extract text from message
407
  if isinstance(message, dict):
408
  text = message.get("text", "")
409
  else:
410
  text = message
411
 
412
  if not text.strip():
413
+ return history + [[text, None]]
414
 
415
  try:
416
+ # Add the user message to history
417
+ history = history + [[text, None]]
418
+
419
  # Generate audio stream
420
  audio_generator = chat_instance.generate_audio_stream(
421
  text, history, voice_1, voice_2, num_speakers, cfg_scale
422
  )
423
 
424
+ # Collect all audio data
425
  audio_data = None
426
  for audio_chunk in audio_generator:
427
  if audio_chunk is not None:
428
  audio_data = audio_chunk
 
429
 
430
+ # Update the last message with audio response
431
  if audio_data:
432
+ # Create audio element
433
+ history[-1][1] = audio_data
434
  else:
435
+ history[-1][1] = "Failed to generate audio"
436
+
437
+ return history
438
+
439
  except Exception as e:
440
  print(f"Error in chat_fn: {e}")
441
  import traceback
442
  traceback.print_exc()
443
+ history[-1][1] = f"Error: {str(e)}"
444
+ return history
445
 
446
+ # Create the interface using Blocks for more control
447
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")) as interface:
448
+ gr.Markdown("# 🎙️ VibeVoice Chat\nGenerate natural dialogue audio with AI voices")
449
+
450
+ with gr.Row():
451
+ with gr.Column(scale=1):
452
+ gr.Markdown("### Voice & Generation Settings")
453
+
454
+ voice_1 = gr.Dropdown(
455
+ choices=voice_options,
456
+ value=default_voice_1,
457
+ label="Voice 1",
458
+ info="Select voice for Speaker 0"
459
+ )
460
+
461
+ voice_2 = gr.Dropdown(
462
+ choices=voice_options,
463
+ value=default_voice_2,
464
+ label="Voice 2",
465
+ info="Select voice for Speaker 1 (if using multiple speakers)"
466
+ )
467
+
468
+ num_speakers = gr.Slider(
469
+ minimum=1,
470
+ maximum=2,
471
+ value=2,
472
+ step=1,
473
+ label="Number of Speakers",
474
+ info="Number of speakers in the dialogue"
475
+ )
476
+
477
+ cfg_scale = gr.Slider(
478
+ minimum=1.0,
479
+ maximum=2.0,
480
+ value=1.3,
481
+ step=0.05,
482
+ label="CFG Scale",
483
+ info="Guidance strength (higher = more adherence to text)"
484
+ )
485
+
486
+ with gr.Column(scale=2):
487
+ chatbot = gr.Chatbot(
488
+ label="Conversation",
489
+ height=400,
490
+ type="tuples"
491
+ )
492
+
493
+ msg = gr.Textbox(
494
+ label="Message",
495
+ placeholder="Type your message or paste a script...",
496
+ lines=3
497
+ )
498
+
499
+ with gr.Row():
500
+ submit = gr.Button("🎡 Generate Audio", variant="primary")
501
+ clear = gr.Button("🗑️ Clear")
502
+
503
+ # Example messages
504
+ gr.Examples(
505
+ examples=[
506
+ "Hello! How are you doing today?",
507
+ "Speaker 0: Welcome to our podcast!\nSpeaker 1: Thanks for having me!",
508
+ "Tell me an interesting fact about space.",
509
+ "What's your favorite type of music and why?",
510
+ ],
511
+ inputs=msg,
512
+ label="Example Messages"
513
+ )
514
+
515
+ # Event handlers
516
+ def user_submit(message, history, v1, v2, ns, cfg):
517
+ return chat_fn(message, history, v1, v2, ns, cfg)
518
+
519
+ msg.submit(
520
+ user_submit,
521
+ [msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
522
+ [chatbot],
523
+ queue=True
524
+ ).then(
525
+ lambda: "",
526
+ None,
527
+ [msg]
528
  )
529
+
530
+ submit.click(
531
+ user_submit,
532
+ [msg, chatbot, voice_1, voice_2, num_speakers, cfg_scale],
533
+ [chatbot],
534
+ queue=True
535
+ ).then(
536
+ lambda: "",
537
+ None,
538
+ [msg]
539
+ )
540
+
541
+ clear.click(lambda: ([], ""), None, [chatbot, msg])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
 
543
  return interface
544
 
 
564
  help="Number of DDPM inference steps",
565
  )
566
 
 
567
  return parser.parse_args()
568
 
569
 
 
591
  print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
592
 
593
  # Launch the interface
594
+ interface.queue(max_size=10).launch(
595
  show_error=True,
596
  quiet=False,
597
  )