Update app.py
app.py CHANGED
@@ -102,12 +102,20 @@ class VibeVoiceChat:
         self.stop_generation = False
         self.current_streamer = None
 
-        # Check GPU availability
+        # Check GPU availability and CUDA version
         if torch.cuda.is_available():
             print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
             print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
+            print(f"  CUDA Version: {torch.version.cuda}")
+            print(f"  PyTorch CUDA: {torch.cuda.is_available()}")
+            # Set memory fraction to avoid OOM
+            torch.cuda.set_per_process_memory_fraction(0.95)
+            # Enable TF32 for faster computation on Ampere GPUs
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
         else:
-            print("✗ No GPU detected, using CPU (generation will be
+            print("✗ No GPU detected, using CPU (generation will be VERY slow)")
+            print("   For faster generation, ensure CUDA is properly installed")
 
         self.load_model()
         self.setup_voice_presets()
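Note: the GPU setup added in this hunk can be exercised on its own. A minimal sketch, assuming only that PyTorch is installed; the 0.95 fraction caps this process at 95% of the card's VRAM, and the TF32 switches only take effect on Ampere-or-newer GPUs (the L4 this Space runs on qualifies):

    import torch

    if torch.cuda.is_available():
        torch.cuda.set_per_process_memory_fraction(0.95)  # leave ~5% VRAM headroom
        torch.backends.cuda.matmul.allow_tf32 = True      # faster matmuls at slightly reduced precision
        torch.backends.cudnn.allow_tf32 = True
        print(torch.cuda.get_device_name(0), torch.version.cuda)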
@@ -115,32 +123,55 @@ class VibeVoiceChat:
     def load_model(self):
         """Load the VibeVoice model and processor."""
         print(f"Loading model from {self.model_path}")
+        start_time = time.time()
 
         self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
 
         if torch.cuda.is_available():
-
-
-
-
-
-
+            print("Loading model with GPU acceleration...")
+            try:
+                self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    self.model_path,
+                    torch_dtype=torch.bfloat16,
+                    device_map='cuda:0',
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
+                )
+                print("✓ Flash Attention 2 enabled for faster generation")
+            except Exception as e:
+                print(f"Warning: Could not load with flash_attention_2: {e}")
+                print("Falling back to standard attention...")
+                self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    self.model_path,
+                    torch_dtype=torch.bfloat16,
+                    device_map='cuda:0',
+                    low_cpu_mem_usage=True,
+                )
         else:
+            print("Loading model on CPU (this will be slow)...")
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
                 torch_dtype=torch.float32,
                 device_map='cpu',
+                low_cpu_mem_usage=True,
             )
 
         self.model.eval()
 
-        # Configure noise scheduler
+        # Configure noise scheduler for faster inference
         self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
             self.model.model.noise_scheduler.config,
             algorithm_type='sde-dpmsolver++',
             beta_schedule='squaredcos_cap_v2'
         )
         self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+
+        load_time = time.time() - start_time
+        print(f"✓ Model loaded in {load_time:.2f} seconds")
+
+        # Print model device
+        if hasattr(self.model, 'device'):
+            print(f"Model device: {self.model.device}")
 
     def setup_voice_presets(self):
         """Setup voice presets from the voices directory."""
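Note: the try/except above attempts a full from_pretrained() load before falling back. A sketch of checking for the flash_attn package up front instead; importlib.util.find_spec and the "sdpa" fallback are assumptions here, not part of this commit:

    import importlib.util

    # "flash_attention_2" needs the flash-attn wheel; "sdpa" is PyTorch's built-in fused attention.
    attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"
    print(f"Selected attention implementation: {attn_impl}")
    # ...then call from_pretrained(..., attn_implementation=attn_impl) exactly once.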
@@ -227,6 +258,10 @@ class VibeVoiceChat:
         # Format the script
         formatted_script = self.format_script(message, num_speakers)
         print(f"Formatted script:\n{formatted_script}")
+        print(f"Using device: {self.device}")
+
+        # Start timing
+        start_time = time.time()
 
         # Select voices based on number of speakers
         selected_voices = []
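Note: this start_time pairs with the timing report added in the final-yield hunk further down. The same pattern packaged as a context manager, so the start/stop can never be left unmatched; the timed() helper is hypothetical, not part of this commit:

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(label):
        t0 = time.time()
        try:
            yield
        finally:
            print(f"{label}: {time.time() - t0:.2f}s")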
@@ -265,9 +300,13 @@ class VibeVoiceChat:
             return_attention_mask=True,
         )
 
-        # Move to device
+        # Move to device and ensure correct dtype
         if self.device == "cuda":
             inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+            print(f"✓ Inputs moved to GPU")
+            # Check GPU memory
+            if torch.cuda.is_available():
+                print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
 
         # Create audio streamer
         audio_streamer = AudioStreamer(
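Note: the dict comprehension above moves the processor's mixed tensor/non-tensor output to the GPU while leaving non-tensors untouched. The same idea as a reusable helper; non_blocking=True is an added assumption and only helps when the host tensors sit in pinned memory:

    import torch

    def to_device(batch: dict, device: str) -> dict:
        # Tensors are copied to `device`; strings, lists, etc. pass through unchanged.
        return {k: v.to(device, non_blocking=True) if torch.is_tensor(v) else v
                for k, v in batch.items()}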
@@ -329,6 +368,12 @@ class VibeVoiceChat:
         # Final yield with complete audio
         if all_audio_chunks:
             complete_audio = np.concatenate(all_audio_chunks)
+            generation_time = time.time() - start_time
+            audio_duration = len(complete_audio) / sample_rate
+            print(f"✓ Generation complete:")
+            print(f"  Time taken: {generation_time:.2f} seconds")
+            print(f"  Audio duration: {audio_duration:.2f} seconds")
+            print(f"  Real-time factor: {audio_duration/generation_time:.2f}x")
             yield (sample_rate, complete_audio)
 
         self.current_streamer = None
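Note: the real-time factor printed here is audio seconds produced per wall-clock second, so values above 1.0x keep ahead of live playback. A worked example with made-up numbers:

    # 12.0 s of audio generated in 4.8 s of wall-clock time (illustrative values only)
    audio_duration, generation_time = 12.0, 4.8
    print(f"Real-time factor: {audio_duration / generation_time:.2f}x")  # -> 2.50x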
@@ -348,17 +393,32 @@ class VibeVoiceChat:
             def check_stop():
                 return self.stop_generation
 
-
-
-
-
-
-
-
-
-
-
-
+            # Use torch.cuda.amp for mixed precision if available
+            if self.device == "cuda" and torch.cuda.is_available():
+                with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_new_tokens=None,
+                        cfg_scale=cfg_scale,
+                        tokenizer=self.processor.tokenizer,
+                        generation_config={'do_sample': False},
+                        audio_streamer=audio_streamer,
+                        stop_check_fn=check_stop,
+                        verbose=False,
+                        refresh_negative=True,
+                    )
+            else:
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=cfg_scale,
+                    tokenizer=self.processor.tokenizer,
+                    generation_config={'do_sample': False},
+                    audio_streamer=audio_streamer,
+                    stop_check_fn=check_stop,
+                    verbose=False,
+                    refresh_negative=True,
+                )
         except Exception as e:
             print(f"Error in generation thread: {e}")
             import traceback
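Note: torch.cuda.amp.autocast(dtype=...) still works but is deprecated in recent PyTorch releases in favor of torch.amp.autocast; and since the weights were already loaded in bfloat16, autocast here mainly affects ops that still receive float32 inputs. The current spelling, as a sketch:

    import torch

    with torch.amp.autocast("cuda", dtype=torch.bfloat16):
        pass  # e.g. outputs = self.model.generate(**inputs, ...) as in the hunk above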
@@ -572,8 +632,8 @@ def parse_args():
     parser.add_argument(
         "--inference_steps",
         type=int,
-        default=
-        help="Number of DDPM inference steps",
+        default=5,
+        help="Number of DDPM inference steps (lower = faster, higher = better quality)",
     )
 
     return parser.parse_args()
@@ -600,8 +660,15 @@ def main():
     print(f"🚀 Launching chat interface")
     print(f"📁 Model: {args.model_path}")
     print(f"💻 Device: {chat_instance.device}")
+    print(f"🔢 Inference steps: {args.inference_steps}")
     print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
 
+    if chat_instance.device == "cpu":
+        print("\n⚠️ WARNING: Running on CPU - generation will be VERY slow!")
+        print("   For faster generation, ensure you have:")
+        print("   1. NVIDIA GPU with CUDA support")
+        print("   2. PyTorch with CUDA installed: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
+
     # Launch the interface
     interface.queue(max_size=10).launch(
         show_error=True,