akhaliq (HF Staff) committed
Commit 42fb4a5 · verified · 1 Parent(s): 5ffdcee

Update app.py

Files changed (1):
  1. app.py +90 -23
app.py CHANGED
@@ -102,12 +102,20 @@ class VibeVoiceChat:
         self.stop_generation = False
         self.current_streamer = None
 
-        # Check GPU availability
+        # Check GPU availability and CUDA version
         if torch.cuda.is_available():
             print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
             print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
+            print(f"  CUDA Version: {torch.version.cuda}")
+            print(f"  PyTorch CUDA: {torch.cuda.is_available()}")
+            # Set memory fraction to avoid OOM
+            torch.cuda.set_per_process_memory_fraction(0.95)
+            # Enable TF32 for faster computation on Ampere GPUs
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
         else:
-            print("✗ No GPU detected, using CPU (generation will be slower)")
+            print("✗ No GPU detected, using CPU (generation will be VERY slow)")
+            print("  For faster generation, ensure CUDA is properly installed")
 
         self.load_model()
         self.setup_voice_presets()
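The memory-fraction and TF32 toggles added in this hunk are plain PyTorch settings with no VibeVoice dependency. A minimal standalone sketch of the same GPU-setup pattern, assuming only a recent PyTorch build (TF32 only takes effect on Ampere or newer GPUs):

import torch

if torch.cuda.is_available():
    # Cap this process at 95% of device memory to leave headroom for the driver.
    torch.cuda.set_per_process_memory_fraction(0.95)
    # TF32 trades a little matmul precision for throughput on Ampere+ GPUs.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print(torch.cuda.get_device_name(0), "| CUDA", torch.version.cuda)
else:
    print("CPU only")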
@@ -115,32 +123,55 @@ class VibeVoiceChat:
     def load_model(self):
         """Load the VibeVoice model and processor."""
         print(f"Loading model from {self.model_path}")
+        start_time = time.time()
 
         self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
 
         if torch.cuda.is_available():
-            self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
-                self.model_path,
-                torch_dtype=torch.bfloat16,
-                device_map='cuda',
-                attn_implementation="flash_attention_2",
-            )
+            print("Loading model with GPU acceleration...")
+            try:
+                self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    self.model_path,
+                    torch_dtype=torch.bfloat16,
+                    device_map='cuda:0',
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
+                )
+                print("✓ Flash Attention 2 enabled for faster generation")
+            except Exception as e:
+                print(f"Warning: Could not load with flash_attention_2: {e}")
+                print("Falling back to standard attention...")
+                self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    self.model_path,
+                    torch_dtype=torch.bfloat16,
+                    device_map='cuda:0',
+                    low_cpu_mem_usage=True,
+                )
         else:
+            print("Loading model on CPU (this will be slow)...")
             self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                 self.model_path,
                 torch_dtype=torch.float32,
                 device_map='cpu',
+                low_cpu_mem_usage=True,
             )
 
         self.model.eval()
 
-        # Configure noise scheduler
+        # Configure noise scheduler for faster inference
         self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
             self.model.model.noise_scheduler.config,
             algorithm_type='sde-dpmsolver++',
             beta_schedule='squaredcos_cap_v2'
         )
         self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+
+        load_time = time.time() - start_time
+        print(f"✓ Model loaded in {load_time:.2f} seconds")
+
+        # Print model device
+        if hasattr(self.model, 'device'):
+            print(f"Model device: {self.model.device}")
 
     def setup_voice_presets(self):
         """Setup voice presets from the voices directory."""
@@ -227,6 +258,10 @@ class VibeVoiceChat:
         # Format the script
         formatted_script = self.format_script(message, num_speakers)
         print(f"Formatted script:\n{formatted_script}")
+        print(f"Using device: {self.device}")
+
+        # Start timing
+        start_time = time.time()
 
         # Select voices based on number of speakers
         selected_voices = []
@@ -265,9 +300,13 @@ class VibeVoiceChat:
             return_attention_mask=True,
         )
 
-        # Move to device
+        # Move to device and ensure correct dtype
        if self.device == "cuda":
             inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
+            print(f"✓ Inputs moved to GPU")
+            # Check GPU memory
+            if torch.cuda.is_available():
+                print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
 
         # Create audio streamer
         audio_streamer = AudioStreamer(
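torch.cuda.memory_allocated(), printed in this hunk, counts only live tensors; PyTorch's caching allocator usually reserves more from the device. A small helper in the same spirit, using only standard PyTorch calls (numbers are per-process):

import torch

def gpu_mem_gb():
    # allocated = tensors currently held; reserved = cached by the allocator
    return torch.cuda.memory_allocated() / 1e9, torch.cuda.memory_reserved() / 1e9

if torch.cuda.is_available():
    alloc, reserved = gpu_mem_gb()
    print(f"allocated {alloc:.2f} GB, reserved {reserved:.2f} GB")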
@@ -329,6 +368,12 @@ class VibeVoiceChat:
             # Final yield with complete audio
             if all_audio_chunks:
                 complete_audio = np.concatenate(all_audio_chunks)
+                generation_time = time.time() - start_time
+                audio_duration = len(complete_audio) / sample_rate
+                print(f"✓ Generation complete:")
+                print(f"  Time taken: {generation_time:.2f} seconds")
+                print(f"  Audio duration: {audio_duration:.2f} seconds")
+                print(f"  Real-time factor: {audio_duration/generation_time:.2f}x")
                 yield (sample_rate, complete_audio)
 
             self.current_streamer = None
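The real-time factor printed in this hunk is audio seconds produced per wall-clock second, so values above 1.0 mean generation outruns playback. The arithmetic in isolation:

def real_time_factor(num_samples: int, sample_rate: int, seconds_elapsed: float) -> float:
    return (num_samples / sample_rate) / seconds_elapsed

print(real_time_factor(240_000, 24_000, 4.0))  # 10 s of audio in 4 s -> 2.5x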
@@ -348,17 +393,32 @@ class VibeVoiceChat:
             def check_stop():
                 return self.stop_generation
 
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=None,
-                cfg_scale=cfg_scale,
-                tokenizer=self.processor.tokenizer,
-                generation_config={'do_sample': False},
-                audio_streamer=audio_streamer,
-                stop_check_fn=check_stop,
-                verbose=False,
-                refresh_negative=True,
-            )
+            # Use torch.cuda.amp for mixed precision if available
+            if self.device == "cuda" and torch.cuda.is_available():
+                with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_new_tokens=None,
+                        cfg_scale=cfg_scale,
+                        tokenizer=self.processor.tokenizer,
+                        generation_config={'do_sample': False},
+                        audio_streamer=audio_streamer,
+                        stop_check_fn=check_stop,
+                        verbose=False,
+                        refresh_negative=True,
+                    )
+            else:
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=cfg_scale,
+                    tokenizer=self.processor.tokenizer,
+                    generation_config={'do_sample': False},
+                    audio_streamer=audio_streamer,
+                    stop_check_fn=check_stop,
+                    verbose=False,
+                    refresh_negative=True,
+                )
         except Exception as e:
             print(f"Error in generation thread: {e}")
             import traceback
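Since the weights are already loaded in bfloat16, the autocast wrapper added in this hunk mainly keeps any float32 inputs and intermediate ops in mixed precision. Recent PyTorch releases also prefer the device-agnostic torch.amp.autocast spelling over torch.cuda.amp.autocast; a minimal sketch of the equivalent usage:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
layer = torch.nn.Linear(16, 16).to(device)
x = torch.randn(8, 16, device=device)
if device == "cuda":
    # Newer spelling of torch.cuda.amp.autocast(dtype=torch.bfloat16)
    with torch.amp.autocast("cuda", dtype=torch.bfloat16):
        y = layer(x)
    print(y.dtype)  # torch.bfloat16
else:
    y = layer(x)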
@@ -572,8 +632,8 @@ def parse_args():
     parser.add_argument(
         "--inference_steps",
         type=int,
-        default=10,
-        help="Number of DDPM inference steps",
+        default=5,
+        help="Number of DDPM inference steps (lower = faster, higher = better quality)",
     )
 
     return parser.parse_args()
@@ -600,8 +660,15 @@ def main():
     print(f"🚀 Launching chat interface")
     print(f"📁 Model: {args.model_path}")
     print(f"💻 Device: {chat_instance.device}")
+    print(f"🔒 Inference steps: {args.inference_steps}")
     print(f"🎭 Available voices: {len(chat_instance.available_voices)}")
 
+    if chat_instance.device == "cpu":
+        print("\n⚠️ WARNING: Running on CPU - generation will be VERY slow!")
+        print("   For faster generation, ensure you have:")
+        print("   1. NVIDIA GPU with CUDA support")
+        print("   2. PyTorch with CUDA installed: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
+
     # Launch the interface
     interface.queue(max_size=10).launch(
         show_error=True,
 