Spaces:

yasserrmd
/

VibeVoice

Running on Zero

App Files Files Community

yasserrmd committed 24 days ago

Commit

1c0cdb5

verified ·

1 Parent(s): 043b99a

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -54

app.py CHANGED Viewed

@@ -68,53 +68,46 @@ class VibeVoiceDemo:
             return np.array([])
     @GPU
-    def generate_podcast(self,
-                     num_speakers: int,
-                     script: str,
-                     speaker_1: str = None,
-                     speaker_2: str = None,
-                     speaker_3: str = None,
-                     speaker_4: str = None,
-                     cfg_scale: float = 1.3):
-        """Generate full podcast audio (no streaming to UI, only final WAV)."""
-        self.stop_generation = False
         self.is_generating = True
         if not script.strip():
             raise gr.Error("Please provide a script.")
-        if not (1 <= num_speakers <= 4):
             raise gr.Error("Number of speakers must be 1–4.")
-        # validate speakers
-        selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
-        for i, sp in enumerate(selected_speakers):
             if not sp or sp not in self.available_voices:
                 raise gr.Error(f"Invalid speaker {i+1} selection.")
-        # load voices
-        voice_samples = []
-        for speaker_name in selected_speakers:
-            audio_path = self.available_voices[speaker_name]
-            audio_data = self.read_audio(audio_path)
-            if len(audio_data) == 0:
-                raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
-            voice_samples.append(audio_data)
-        # format script
         lines = script.strip().split("\n")
-        formatted_lines = []
-        for line in lines:
             line = line.strip()
             if not line:
                 continue
-            if line.startswith("Speaker ") and ":" in line:
-                formatted_lines.append(line)
             else:
-                sp_id = len(formatted_lines) % num_speakers
-                formatted_lines.append(f"Speaker {sp_id}: {line}")
-        formatted_script = "\n".join(formatted_lines)
-        # prepare inputs
         inputs = self.processor(
             text=[formatted_script],
             voice_samples=[voice_samples],
@@ -123,44 +116,39 @@ class VibeVoiceDemo:
             return_attention_mask=True,
         )
-        # run with AudioStreamer
-        from vibevoice.modular.streamer import AudioStreamer
         audio_streamer = AudioStreamer(batch_size=1)
-        self.current_streamer = audio_streamer
-        start = time.time()
-        gen_thread = threading.Thread(
-            target=self._generate_with_streamer,
-            args=(inputs, cfg_scale, audio_streamer)
         )
-        gen_thread.start()
-        # collect chunks
-        sample_rate = 24000
         all_chunks = []
-        audio_stream = audio_streamer.get_stream(0)
-        for audio_chunk in audio_stream:
             if torch.is_tensor(audio_chunk):
                 audio_chunk = audio_chunk.float().cpu().numpy()
             if audio_chunk.ndim > 1:
                 audio_chunk = audio_chunk.squeeze()
             all_chunks.append(audio_chunk)
-        gen_thread.join(timeout=10.0)
-        self.current_streamer = None
-        self.is_generating = False
         if not all_chunks:
-            raise gr.Error("❌ No audio chunks were generated.")
-        # merge
-        complete_audio = np.concatenate(all_chunks).astype("float32")
-        # save automatically
         os.makedirs("outputs", exist_ok=True)
-        from datetime import datetime
-        import soundfile as sf
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
         sf.write(file_path, complete_audio, sample_rate)
@@ -169,10 +157,12 @@ class VibeVoiceDemo:
         total_dur = len(complete_audio) / sample_rate
         log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
         return (sample_rate, complete_audio), log
     def load_example_scripts(self):
         examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
         self.example_scripts = []

             return np.array([])
     @GPU
     def generate_podcast(self, num_speakers: int, script: str,
                          speaker_1: str = None, speaker_2: str = None,
                          speaker_3: str = None, speaker_4: str = None,
                          cfg_scale: float = 1.3):
         """Generate the complete podcast audio in one pass and save it as a WAV.

         Nothing is streamed to the UI: the model is run to completion, the
         audio chunks it pushed into the streamer are collected, concatenated,
         written under ``outputs/`` and returned.

         Args:
             num_speakers: How many of the ``speaker_*`` slots are used (1-4).
             script: Script text, one turn per line. Lines that already look
                 like ``Speaker <x>: ...`` are kept verbatim; every other
                 non-empty line gets a ``Speaker <n>:`` prefix, rotating over
                 ``num_speakers``.
             speaker_1..speaker_4: Voice names; the first ``num_speakers`` of
                 them must be keys of ``self.available_voices``.
             cfg_scale: Passed through to ``self.model.generate``.

         Returns:
             ``((sample_rate, audio), log)`` where ``audio`` is the concatenated
             numpy waveform and ``log`` is a human-readable status message.

         Raises:
             gr.Error: Empty script, speaker count outside 1-4, unknown voice,
                 unreadable voice sample, or no audio produced.
         """
         self.is_generating = True
         self.stop_generation = False
         # try/finally guarantees the busy flag is cleared on every exit path,
         # including validation errors and exceptions raised by the model.
         try:
             if not script or not script.strip():
                 raise gr.Error("Please provide a script.")
             if num_speakers < 1 or num_speakers > 4:
                 raise gr.Error("Number of speakers must be 1–4.")

             # Collect and validate the selected speakers.
             selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
             for i, sp in enumerate(selected):
                 if not sp or sp not in self.available_voices:
                     raise gr.Error(f"Invalid speaker {i+1} selection.")

             # Load the reference voices into memory.
             voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
             if any(len(v) == 0 for v in voice_samples):
                 raise gr.Error("Failed to load one or more voice samples.")

             # Format the script. The rotation index is the number of turns
             # emitted so far, NOT the raw line index: blank separator lines
             # must not advance (or stall) the speaker rotation.
             formatted = []
             for raw_line in script.strip().split("\n"):
                 line = raw_line.strip()
                 if not line:
                     continue
                 # Require the colon so ordinary prose that merely starts with
                 # the word "Speaker " still receives a speaker label.
                 if line.startswith("Speaker ") and ":" in line:
                     formatted.append(line)
                 else:
                     sp_id = len(formatted) % num_speakers
                     formatted.append(f"Speaker {sp_id}: {line}")
             formatted_script = "\n".join(formatted)

             # Prepare processor inputs.
             # NOTE(review): the diff this listing comes from elides two
             # unchanged lines inside this call (between hunks) - confirm any
             # additional processor kwargs against the real app.py.
             inputs = self.processor(
                 text=[formatted_script],
                 voice_samples=[voice_samples],
                 return_attention_mask=True,
             )

             start = time.time()
             sample_rate = 24000
             audio_streamer = AudioStreamer(batch_size=1)

             # Run generation to completion; the model pushes audio into the
             # streamer, which is drained afterwards.
             # NOTE(review): draining only after generate() returns assumes the
             # streamer buffers every chunk and is closed by generate() - confirm.
             self.model.generate(
                 **inputs,
                 max_new_tokens=None,
                 cfg_scale=cfg_scale,
                 tokenizer=self.processor.tokenizer,
                 generation_config={'do_sample': False},
                 audio_streamer=audio_streamer,
                 verbose=False,
             )

             # Collect all audio chunks as 1-D numpy arrays.
             all_chunks = []
             for audio_chunk in audio_streamer.get_stream(0):
                 if torch.is_tensor(audio_chunk):
                     audio_chunk = audio_chunk.float().cpu().numpy()
                 if audio_chunk.ndim > 1:
                     audio_chunk = audio_chunk.squeeze()
                 all_chunks.append(audio_chunk)
             if not all_chunks:
                 raise gr.Error("❌ No audio was generated by the model.")
             complete_audio = np.concatenate(all_chunks)

             # Save automatically to disk.
             os.makedirs("outputs", exist_ok=True)
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
             sf.write(file_path, complete_audio, sample_rate)

             total_dur = len(complete_audio) / sample_rate
             log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
             return (sample_rate, complete_audio), log
         finally:
             self.is_generating = False
     def load_example_scripts(self):
         examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
         self.example_scripts = []