Spaces:

yasserrmd
/

VibeVoice

Running on Zero

App Files Files Community

yasserrmd committed 26 days ago

Commit

1276b3e

verified ·

1 Parent(s): e873ae8

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -52

app.py CHANGED Viewed

@@ -67,11 +67,16 @@ class VibeVoiceDemo:
             return np.array([])
     @GPU
-    def generate_podcast(self, num_speakers: int, script: str,
-                         speaker_1: str = None, speaker_2: str = None,
-                         speaker_3: str = None, speaker_4: str = None,
-                         cfg_scale: float = 1.3):
-        """Final audio generation only (no streaming)."""
         self.is_generating = True
         if not script.strip():
@@ -79,85 +84,91 @@ class VibeVoiceDemo:
         if not (1 <= num_speakers <= 4):
             raise gr.Error("Number of speakers must be 1–4.")
-        selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
-        for i, sp in enumerate(selected):
             if not sp or sp not in self.available_voices:
                 raise gr.Error(f"Invalid speaker {i+1} selection.")
-        voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
-        if any(len(v) == 0 for v in voice_samples):
-            raise gr.Error("Failed to load one or more voice samples.")
         lines = script.strip().split("\n")
-        formatted = []
-        for i, line in enumerate(lines):
             line = line.strip()
             if not line:
                 continue
-            if line.startswith("Speaker "):
-                formatted.append(line)
             else:
-                sp_id = i % num_speakers
-                formatted.append(f"Speaker {sp_id}: {line}")
-        formatted_script = "\n".join(formatted)
         inputs = self.processor(
             text=[formatted_script],
             voice_samples=[voice_samples],
             padding=True,
-            return_tensors="pt"
         )
         start = time.time()
-        outputs = self.model.generate(
-            **inputs,
-            cfg_scale=cfg_scale,
-            tokenizer=self.processor.tokenizer,
-            verbose=False
         )
-        gen_time = time.time() - start
-        print("DEBUG: outputs type:", type(outputs))
-        print("DEBUG: outputs dir:", dir(outputs))
-        audio = None
-        if hasattr(outputs, "audios") and outputs.audios:
-            audio = outputs.audios[0]
-        elif hasattr(outputs, "audio"):
-            audio = outputs.audio
-        elif hasattr(outputs, "waveforms") and outputs.waveforms:
-            audio = outputs.waveforms[0]
-        elif hasattr(outputs, "waveform"):
-            audio = outputs.waveform
-        elif hasattr(outputs, "speech_outputs") and outputs.speech_outputs:
-            audio = outputs.speech_outputs[0]
-        else:
-            raise gr.Error(f"No audio found in output. Check debug: {dir(outputs)}")
-        if audio is None:
-            raise gr.Error("Extracted audio is None — check model output structure.")
-        if torch.is_tensor(audio):
-            audio = audio.float().cpu().numpy()
-        if audio.ndim > 1:
-            audio = audio.squeeze()
-        sample_rate = 24000
-        audio = audio.astype("float32")
         os.makedirs("outputs", exist_ok=True)
         from datetime import datetime
         import soundfile as sf
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
-        sf.write(file_path, audio, sample_rate)
         print(f"💾 Saved podcast to {file_path}")
-        total_dur = len(audio) / sample_rate
-        log = f"✅ Generation complete in {gen_time:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
-        self.is_generating = False
-        return (sample_rate, audio), log

             return np.array([])
     @GPU
+    def generate_podcast(self,
+                     num_speakers: int,
+                     script: str,
+                     speaker_1: str = None,
+                     speaker_2: str = None,
+                     speaker_3: str = None,
+                     speaker_4: str = None,
+                     cfg_scale: float = 1.3):
+        """Generate full podcast audio (no streaming to UI, only final WAV)."""
+        self.stop_generation = False
         self.is_generating = True
         if not script.strip():
         if not (1 <= num_speakers <= 4):
             raise gr.Error("Number of speakers must be 1–4.")
+        # validate speakers
+        selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
+        for i, sp in enumerate(selected_speakers):
             if not sp or sp not in self.available_voices:
                 raise gr.Error(f"Invalid speaker {i+1} selection.")
+        # load voices
+        voice_samples = []
+        for speaker_name in selected_speakers:
+            audio_path = self.available_voices[speaker_name]
+            audio_data = self.read_audio(audio_path)
+            if len(audio_data) == 0:
+                raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
+            voice_samples.append(audio_data)
+        # format script
         lines = script.strip().split("\n")
+        formatted_lines = []
+        for line in lines:
             line = line.strip()
             if not line:
                 continue
+            if line.startswith("Speaker ") and ":" in line:
+                formatted_lines.append(line)
             else:
+                sp_id = len(formatted_lines) % num_speakers
+                formatted_lines.append(f"Speaker {sp_id}: {line}")
+        formatted_script = "\n".join(formatted_lines)
+        # prepare inputs
         inputs = self.processor(
             text=[formatted_script],
             voice_samples=[voice_samples],
             padding=True,
+            return_tensors="pt",
+            return_attention_mask=True,
         )
+        # run with AudioStreamer
+        from vibevoice.modular.streamer import AudioStreamer
+        audio_streamer = AudioStreamer(batch_size=1)
+        self.current_streamer = audio_streamer
         start = time.time()
+        gen_thread = threading.Thread(
+            target=self._generate_with_streamer,
+            args=(inputs, cfg_scale, audio_streamer)
         )
+        gen_thread.start()
+        # collect chunks
+        sample_rate = 24000
+        all_chunks = []
+        audio_stream = audio_streamer.get_stream(0)
+        for audio_chunk in audio_stream:
+            if torch.is_tensor(audio_chunk):
+                audio_chunk = audio_chunk.float().cpu().numpy()
+            if audio_chunk.ndim > 1:
+                audio_chunk = audio_chunk.squeeze()
+            all_chunks.append(audio_chunk)
+        gen_thread.join(timeout=10.0)
+        self.current_streamer = None
+        self.is_generating = False
+        if not all_chunks:
+            raise gr.Error("❌ No audio chunks were generated.")
+        # merge
+        complete_audio = np.concatenate(all_chunks).astype("float32")
+        # save automatically
         os.makedirs("outputs", exist_ok=True)
         from datetime import datetime
         import soundfile as sf
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
+        sf.write(file_path, complete_audio, sample_rate)
         print(f"💾 Saved podcast to {file_path}")
+        total_dur = len(complete_audio) / sample_rate
+        log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
+        return (sample_rate, complete_audio), log