yasserrmd committed on
Commit
1276b3e
·
verified ·
1 Parent(s): e873ae8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -52
app.py CHANGED
@@ -67,11 +67,16 @@ class VibeVoiceDemo:
67
  return np.array([])
68
 
69
  @GPU
70
- def generate_podcast(self, num_speakers: int, script: str,
71
- speaker_1: str = None, speaker_2: str = None,
72
- speaker_3: str = None, speaker_4: str = None,
73
- cfg_scale: float = 1.3):
74
- """Final audio generation only (no streaming)."""
 
 
 
 
 
75
  self.is_generating = True
76
 
77
  if not script.strip():
@@ -79,85 +84,91 @@ class VibeVoiceDemo:
79
  if not (1 <= num_speakers <= 4):
80
  raise gr.Error("Number of speakers must be 1–4.")
81
 
82
- selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
83
- for i, sp in enumerate(selected):
 
84
  if not sp or sp not in self.available_voices:
85
  raise gr.Error(f"Invalid speaker {i+1} selection.")
86
 
87
- voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
88
- if any(len(v) == 0 for v in voice_samples):
89
- raise gr.Error("Failed to load one or more voice samples.")
 
 
 
 
 
90
 
 
91
  lines = script.strip().split("\n")
92
- formatted = []
93
- for i, line in enumerate(lines):
94
  line = line.strip()
95
  if not line:
96
  continue
97
- if line.startswith("Speaker "):
98
- formatted.append(line)
99
  else:
100
- sp_id = i % num_speakers
101
- formatted.append(f"Speaker {sp_id}: {line}")
102
- formatted_script = "\n".join(formatted)
103
 
 
104
  inputs = self.processor(
105
  text=[formatted_script],
106
  voice_samples=[voice_samples],
107
  padding=True,
108
- return_tensors="pt"
 
109
  )
110
 
 
 
 
 
 
111
  start = time.time()
112
- outputs = self.model.generate(
113
- **inputs,
114
- cfg_scale=cfg_scale,
115
- tokenizer=self.processor.tokenizer,
116
- verbose=False
117
  )
118
- gen_time = time.time() - start
119
 
120
- print("DEBUG: outputs type:", type(outputs))
121
- print("DEBUG: outputs dir:", dir(outputs))
 
 
122
 
123
- audio = None
124
- if hasattr(outputs, "audios") and outputs.audios:
125
- audio = outputs.audios[0]
126
- elif hasattr(outputs, "audio"):
127
- audio = outputs.audio
128
- elif hasattr(outputs, "waveforms") and outputs.waveforms:
129
- audio = outputs.waveforms[0]
130
- elif hasattr(outputs, "waveform"):
131
- audio = outputs.waveform
132
- elif hasattr(outputs, "speech_outputs") and outputs.speech_outputs:
133
- audio = outputs.speech_outputs[0]
134
- else:
135
- raise gr.Error(f"No audio found in output. Check debug: {dir(outputs)}")
136
 
137
- if audio is None:
138
- raise gr.Error("Extracted audio is None — check model output structure.")
 
139
 
140
- if torch.is_tensor(audio):
141
- audio = audio.float().cpu().numpy()
142
- if audio.ndim > 1:
143
- audio = audio.squeeze()
144
 
145
- sample_rate = 24000
146
- audio = audio.astype("float32")
147
 
 
148
  os.makedirs("outputs", exist_ok=True)
149
  from datetime import datetime
150
  import soundfile as sf
151
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
152
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
153
- sf.write(file_path, audio, sample_rate)
154
  print(f"💾 Saved podcast to {file_path}")
155
 
156
- total_dur = len(audio) / sample_rate
157
- log = f"✅ Generation complete in {gen_time:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
158
 
159
- self.is_generating = False
160
- return (sample_rate, audio), log
161
 
162
 
163
 
 
67
  return np.array([])
68
 
69
  @GPU
70
+ def generate_podcast(self,
71
+ num_speakers: int,
72
+ script: str,
73
+ speaker_1: str = None,
74
+ speaker_2: str = None,
75
+ speaker_3: str = None,
76
+ speaker_4: str = None,
77
+ cfg_scale: float = 1.3):
78
+ """Generate full podcast audio (no streaming to UI, only final WAV)."""
79
+ self.stop_generation = False
80
  self.is_generating = True
81
 
82
  if not script.strip():
 
84
  if not (1 <= num_speakers <= 4):
85
  raise gr.Error("Number of speakers must be 1–4.")
86
 
87
+ # validate speakers
88
+ selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
89
+ for i, sp in enumerate(selected_speakers):
90
  if not sp or sp not in self.available_voices:
91
  raise gr.Error(f"Invalid speaker {i+1} selection.")
92
 
93
+ # load voices
94
+ voice_samples = []
95
+ for speaker_name in selected_speakers:
96
+ audio_path = self.available_voices[speaker_name]
97
+ audio_data = self.read_audio(audio_path)
98
+ if len(audio_data) == 0:
99
+ raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
100
+ voice_samples.append(audio_data)
101
 
102
+ # format script
103
  lines = script.strip().split("\n")
104
+ formatted_lines = []
105
+ for line in lines:
106
  line = line.strip()
107
  if not line:
108
  continue
109
+ if line.startswith("Speaker ") and ":" in line:
110
+ formatted_lines.append(line)
111
  else:
112
+ sp_id = len(formatted_lines) % num_speakers
113
+ formatted_lines.append(f"Speaker {sp_id}: {line}")
114
+ formatted_script = "\n".join(formatted_lines)
115
 
116
+ # prepare inputs
117
  inputs = self.processor(
118
  text=[formatted_script],
119
  voice_samples=[voice_samples],
120
  padding=True,
121
+ return_tensors="pt",
122
+ return_attention_mask=True,
123
  )
124
 
125
+ # run with AudioStreamer
126
+ from vibevoice.modular.streamer import AudioStreamer
127
+ audio_streamer = AudioStreamer(batch_size=1)
128
+ self.current_streamer = audio_streamer
129
+
130
  start = time.time()
131
+ gen_thread = threading.Thread(
132
+ target=self._generate_with_streamer,
133
+ args=(inputs, cfg_scale, audio_streamer)
 
 
134
  )
135
+ gen_thread.start()
136
 
137
+ # collect chunks
138
+ sample_rate = 24000
139
+ all_chunks = []
140
+ audio_stream = audio_streamer.get_stream(0)
141
 
142
+ for audio_chunk in audio_stream:
143
+ if torch.is_tensor(audio_chunk):
144
+ audio_chunk = audio_chunk.float().cpu().numpy()
145
+ if audio_chunk.ndim > 1:
146
+ audio_chunk = audio_chunk.squeeze()
147
+ all_chunks.append(audio_chunk)
 
 
 
 
 
 
 
148
 
149
+ gen_thread.join(timeout=10.0)
150
+ self.current_streamer = None
151
+ self.is_generating = False
152
 
153
+ if not all_chunks:
154
+ raise gr.Error("❌ No audio chunks were generated.")
 
 
155
 
156
+ # merge
157
+ complete_audio = np.concatenate(all_chunks).astype("float32")
158
 
159
+ # save automatically
160
  os.makedirs("outputs", exist_ok=True)
161
  from datetime import datetime
162
  import soundfile as sf
163
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
164
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
165
+ sf.write(file_path, complete_audio, sample_rate)
166
  print(f"💾 Saved podcast to {file_path}")
167
 
168
+ total_dur = len(complete_audio) / sample_rate
169
+ log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
170
 
171
+ return (sample_rate, complete_audio), log
 
172
 
173
 
174