akhaliq (HF Staff) committed
Commit 42fb4a5 · verified · 1 Parent(s): 5ffdcee

Update app.py

Files changed (1):
  1. app.py +90 -23
app.py CHANGED
@@ -102,12 +102,20 @@ class VibeVoiceChat:
         self.stop_generation = False
         self.current_streamer = None
 
-        # Check GPU availability
+        # Check GPU availability and CUDA version
         if torch.cuda.is_available():
             print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
             print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
+            print(f"  CUDA Version: {torch.version.cuda}")
+            print(f"  PyTorch CUDA: {torch.cuda.is_available()}")
+            # Set memory fraction to avoid OOM
+            torch.cuda.set_per_process_memory_fraction(0.95)
+            # Enable TF32 for faster computation on Ampere GPUs
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
         else:
-            print("✗ No GPU detected, using CPU (generation will be slower)")
+            print("✗ No GPU detected, using CPU (generation will be VERY slow)")
+            print("  For faster generation, ensure CUDA is properly installed")
 
         self.load_model()
         self.setup_voice_presets()
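The memory-fraction and TF32 toggles added in this hunk are plain PyTorch settings with no VibeVoice dependency. A minimal standalone sketch of the same GPU-setup pattern, assuming only a recent PyTorch build (TF32 only takes effect on Ampere or newer GPUs):

import torch

if torch.cuda.is_available():
    # Cap this process at 95% of device memory to leave headroom for the driver.
    torch.cuda.set_per_process_memory_fraction(0.95)
    # TF32 trades a little matmul precision for throughput on Ampere+ GPUs.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print(torch.cuda.get_device_name(0), "| CUDA", torch.version.cuda)
else:
    print("CPU only")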
@@ -115,32 +123,55 @@ class VibeVoiceChat:
     def load_model(self):
         """Load the VibeVoice model and processor."""
         print(f"Loading model from {self.model_path}")
+        start_time = time.time()
 
         self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
 
         if torch.cuda.is_available():
-            self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
-                self.model_path,
-                torch_dtype=torch.bfloat16,
-                device_map='cuda',
-                attn_implementation="flash_attention_2",
-            )
+            print("Loading model with GPU acceleration...")
+            try:
+                self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    self.model_path,
+                    torch_dtype=torch.bfloat16,
+                    device_map='cuda:0',
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
+                )
+                print("✓ Flash Attention 2 enabled for faster generation")
+            except Exception as e:
+                print(f"Warning: Could not load with flash_attention_2: {e}")
+                print("Falling back to standard attention...")
+                self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    self.model_path,
+                    torch_dtype=torch.bfloat16,
+                    device_map='cuda:0',
+                    low_cpu_mem_usage=True,
+                )
         else:
+            print("Loading model on CPU (this will be slow)...")
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
                 torch_dtype=torch.float32,
                 device_map='cpu',
+                low_cpu_mem_usage=True,
             )
 
         self.model.eval()
 
-        # Configure noise scheduler
+        # Configure noise scheduler for faster inference
         self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
             self.model.model.noise_scheduler.config,
             algorithm_type='sde-dpmsolver++',
             beta_schedule='squaredcos_cap_v2'
         )
         self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+
+        load_time = time.time() - start_time
+        print(f"✓ Model loaded in {load_time:.2f} seconds")
+
+        # Print model device
+        if hasattr(self.model, 'device'):
+            print(f"Model device: {self.model.device}")
 
     def setup_voice_presets(self):
         """Setup voice presets from the voices directory."""
@@ -227,6 +258,10 @@ class VibeVoiceChat:
         # Format the script
         formatted_script = self.format_script(message, num_speakers)
         print(f"Formatted script:\n{formatted_script}")
+        print(f"Using device: {self.device}")
+
+        # Start timing
+        start_time = time.time()
 
         # Select voices based on number of speakers
         selected_voices = []
@@ -265,9 +300,13 @@ class VibeVoiceChat:
             return_attention_mask=True,
         )
 
-        # Move to device
+        # Move to device and ensure correct dtype
        if self.device == "cuda":
             inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+            print(f"✓ Inputs moved to GPU")
+            # Check GPU memory
+            if torch.cuda.is_available():
+                print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
 
         # Create audio streamer
         audio_streamer = AudioStreamer(
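torch.cuda.memory_allocated(), printed in this hunk, counts only live tensors; PyTorch's caching allocator usually reserves more from the device. A small helper in the same spirit, using only standard PyTorch calls (numbers are per-process):

import torch

def gpu_mem_gb():
    # allocated = tensors currently held; reserved = cached by the allocator
    return torch.cuda.memory_allocated() / 1e9, torch.cuda.memory_reserved() / 1e9

if torch.cuda.is_available():
    alloc, reserved = gpu_mem_gb()
    print(f"allocated {alloc:.2f} GB, reserved {reserved:.2f} GB")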
@@ -329,6 +368,12 @@ class VibeVoiceChat:
             # Final yield with complete audio
             if all_audio_chunks:
                 complete_audio = np.concatenate(all_audio_chunks)
+                generation_time = time.time() - start_time
+                audio_duration = len(complete_audio) / sample_rate
+                print(f"✓ Generation complete:")
+                print(f"  Time taken: {generation_time:.2f} seconds")
+                print(f"  Audio duration: {audio_duration:.2f} seconds")
+                print(f"  Real-time factor: {audio_duration/generation_time:.2f}x")
                 yield (sample_rate, complete_audio)
 
             self.current_streamer = None
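The real-time factor printed in this hunk is audio seconds produced per wall-clock second, so values above 1.0 mean generation outruns playback. The arithmetic in isolation:

def real_time_factor(num_samples: int, sample_rate: int, seconds_elapsed: float) -> float:
    return (num_samples / sample_rate) / seconds_elapsed

print(real_time_factor(240_000, 24_000, 4.0))  # 10 s of audio in 4 s -> 2.5x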
@@ -348,17 +393,32 @@ class VibeVoiceChat:
             def check_stop():
                 return self.stop_generation
 
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=None,
-                cfg_scale=cfg_scale,
-                tokenizer=self.processor.tokenizer,
-                generation_config={'do_sample': False},
-                audio_streamer=audio_streamer,
-                stop_check_fn=check_stop,
-                verbose=False,
-                refresh_negative=True,
-            )
+            # Use torch.cuda.amp for mixed precision if available
+            if self.device == "cuda" and torch.cuda.is_available():
+                with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_new_tokens=None,
+                        cfg_scale=cfg_scale,
+                        tokenizer=self.processor.tokenizer,
+                        generation_config={'do_sample': False},
+                        audio_streamer=audio_streamer,
+                        stop_check_fn=check_stop,
+                        verbose=False,
+                        refresh_negative=True,
+                    )
+            else:
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=cfg_scale,
+                    tokenizer=self.processor.tokenizer,
+                    generation_config={'do_sample': False},
+                    audio_streamer=audio_streamer,
+                    stop_check_fn=check_stop,
+                    verbose=False,
+                    refresh_negative=True,
+                )
         except Exception as e:
             print(f"Error in generation thread: {e}")
             import traceback
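Since the weights are already loaded in bfloat16, the autocast wrapper added in this hunk mainly keeps any float32 inputs and intermediate ops in mixed precision. Recent PyTorch releases also prefer the device-agnostic torch.amp.autocast spelling over torch.cuda.amp.autocast; a minimal sketch of the equivalent usage:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
layer = torch.nn.Linear(16, 16).to(device)
x = torch.randn(8, 16, device=device)
if device == "cuda":
    # Newer spelling of torch.cuda.amp.autocast(dtype=torch.bfloat16)
    with torch.amp.autocast("cuda", dtype=torch.bfloat16):
        y = layer(x)
    print(y.dtype)  # torch.bfloat16
else:
    y = layer(x)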
@@ -572,8 +632,8 @@ def parse_args():
     parser.add_argument(
         "--inference_steps",
         type=int,
-        default=10,
-        help="Number of DDPM inference steps",
+        default=5,
+        help="Number of DDPM inference steps (lower = faster, higher = better quality)",
     )
 
     return parser.parse_args()
@@ -600,8 +660,15 @@ def main():
     print(f"🚀 Launching chat interface")
     print(f"📁 Model: {args.model_path}")
     print(f"💻 Device: {chat_instance.device}")
+    print(f"🔒 Inference steps: {args.inference_steps}")
     print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
 
+    if chat_instance.device == "cpu":
+        print("\n⚠️ WARNING: Running on CPU - generation will be VERY slow!")
+        print("   For faster generation, ensure you have:")
+        print("   1. NVIDIA GPU with CUDA support")
+        print("   2. PyTorch with CUDA installed: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
+
     # Launch the interface
     interface.queue(max_size=10).launch(
         show_error=True,
 