yasserrmd committed on
Commit
1c0cdb5
·
verified ·
1 Parent(s): 043b99a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -54
app.py CHANGED
@@ -68,53 +68,46 @@ class VibeVoiceDemo:
68
  return np.array([])
69
 
70
  @GPU
71
- def generate_podcast(self,
72
- num_speakers: int,
73
- script: str,
74
- speaker_1: str = None,
75
- speaker_2: str = None,
76
- speaker_3: str = None,
77
- speaker_4: str = None,
78
- cfg_scale: float = 1.3):
79
- """Generate full podcast audio (no streaming to UI, only final WAV)."""
80
- self.stop_generation = False
81
  self.is_generating = True
 
82
 
83
  if not script.strip():
84
  raise gr.Error("Please provide a script.")
85
- if not (1 <= num_speakers <= 4):
 
86
  raise gr.Error("Number of speakers must be 1–4.")
87
 
88
- # validate speakers
89
- selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
90
- for i, sp in enumerate(selected_speakers):
91
  if not sp or sp not in self.available_voices:
92
  raise gr.Error(f"Invalid speaker {i+1} selection.")
93
 
94
- # load voices
95
- voice_samples = []
96
- for speaker_name in selected_speakers:
97
- audio_path = self.available_voices[speaker_name]
98
- audio_data = self.read_audio(audio_path)
99
- if len(audio_data) == 0:
100
- raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
101
- voice_samples.append(audio_data)
102
 
103
- # format script
104
  lines = script.strip().split("\n")
105
- formatted_lines = []
106
- for line in lines:
107
  line = line.strip()
108
  if not line:
109
  continue
110
- if line.startswith("Speaker ") and ":" in line:
111
- formatted_lines.append(line)
112
  else:
113
- sp_id = len(formatted_lines) % num_speakers
114
- formatted_lines.append(f"Speaker {sp_id}: {line}")
115
- formatted_script = "\n".join(formatted_lines)
116
 
117
- # prepare inputs
118
  inputs = self.processor(
119
  text=[formatted_script],
120
  voice_samples=[voice_samples],
@@ -123,44 +116,39 @@ class VibeVoiceDemo:
123
  return_attention_mask=True,
124
  )
125
 
126
- # run with AudioStreamer
127
- from vibevoice.modular.streamer import AudioStreamer
128
  audio_streamer = AudioStreamer(batch_size=1)
129
- self.current_streamer = audio_streamer
130
 
131
- start = time.time()
132
- gen_thread = threading.Thread(
133
- target=self._generate_with_streamer,
134
- args=(inputs, cfg_scale, audio_streamer)
 
 
 
 
 
135
  )
136
- gen_thread.start()
137
 
138
- # collect chunks
139
- sample_rate = 24000
140
  all_chunks = []
141
- audio_stream = audio_streamer.get_stream(0)
142
-
143
- for audio_chunk in audio_stream:
144
  if torch.is_tensor(audio_chunk):
145
  audio_chunk = audio_chunk.float().cpu().numpy()
146
  if audio_chunk.ndim > 1:
147
  audio_chunk = audio_chunk.squeeze()
148
  all_chunks.append(audio_chunk)
149
 
150
- gen_thread.join(timeout=10.0)
151
- self.current_streamer = None
152
- self.is_generating = False
153
-
154
  if not all_chunks:
155
- raise gr.Error("❌ No audio chunks were generated.")
 
156
 
157
- # merge
158
- complete_audio = np.concatenate(all_chunks).astype("float32")
159
 
160
- # save automatically
161
  os.makedirs("outputs", exist_ok=True)
162
- from datetime import datetime
163
- import soundfile as sf
164
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
165
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
166
  sf.write(file_path, complete_audio, sample_rate)
@@ -169,10 +157,12 @@ class VibeVoiceDemo:
169
  total_dur = len(complete_audio) / sample_rate
170
  log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
171
 
 
172
  return (sample_rate, complete_audio), log
173
 
174
 
175
 
 
176
  def load_example_scripts(self):
177
  examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
178
  self.example_scripts = []
 
68
  return np.array([])
69
 
70
  @GPU
71
+ def generate_podcast(self, num_speakers: int, script: str,
72
+ speaker_1: str = None, speaker_2: str = None,
73
+ speaker_3: str = None, speaker_4: str = None,
74
+ cfg_scale: float = 1.3):
75
+ """Final audio generation only (no streaming, runs fully on GPU)."""
 
 
 
 
 
76
  self.is_generating = True
77
+ self.stop_generation = False
78
 
79
  if not script.strip():
80
  raise gr.Error("Please provide a script.")
81
+
82
+ if num_speakers < 1 or num_speakers > 4:
83
  raise gr.Error("Number of speakers must be 1–4.")
84
 
85
+ # Collect selected speakers
86
+ selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
87
+ for i, sp in enumerate(selected):
88
  if not sp or sp not in self.available_voices:
89
  raise gr.Error(f"Invalid speaker {i+1} selection.")
90
 
91
+ # Load voices into memory
92
+ voice_samples = [self.read_audio(self.available_voices[sp]) for sp in selected]
93
+ if any(len(v) == 0 for v in voice_samples):
94
+ raise gr.Error("Failed to load one or more voice samples.")
 
 
 
 
95
 
96
+ # Format script
97
  lines = script.strip().split("\n")
98
+ formatted = []
99
+ for i, line in enumerate(lines):
100
  line = line.strip()
101
  if not line:
102
  continue
103
+ if line.startswith("Speaker "):
104
+ formatted.append(line)
105
  else:
106
+ sp_id = i % num_speakers
107
+ formatted.append(f"Speaker {sp_id}: {line}")
108
+ formatted_script = "\n".join(formatted)
109
 
110
+ # Prepare processor inputs
111
  inputs = self.processor(
112
  text=[formatted_script],
113
  voice_samples=[voice_samples],
 
116
  return_attention_mask=True,
117
  )
118
 
119
+ start = time.time()
120
+ sample_rate = 24000
121
  audio_streamer = AudioStreamer(batch_size=1)
 
122
 
123
+ # Run generation fully on GPU
124
+ self.model.generate(
125
+ **inputs,
126
+ max_new_tokens=None,
127
+ cfg_scale=cfg_scale,
128
+ tokenizer=self.processor.tokenizer,
129
+ generation_config={'do_sample': False},
130
+ audio_streamer=audio_streamer,
131
+ verbose=False,
132
  )
 
133
 
134
+ # Collect all audio chunks
 
135
  all_chunks = []
136
+ for audio_chunk in audio_streamer.get_stream(0):
 
 
137
  if torch.is_tensor(audio_chunk):
138
  audio_chunk = audio_chunk.float().cpu().numpy()
139
  if audio_chunk.ndim > 1:
140
  audio_chunk = audio_chunk.squeeze()
141
  all_chunks.append(audio_chunk)
142
 
 
 
 
 
143
  if not all_chunks:
144
+ self.is_generating = False
145
+ raise gr.Error("❌ No audio was generated by the model.")
146
 
147
+ complete_audio = np.concatenate(all_chunks)
148
+ audio16 = convert_to_16_bit_wav(complete_audio)
149
 
150
+ # Save automatically to disk
151
  os.makedirs("outputs", exist_ok=True)
 
 
152
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
153
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
154
  sf.write(file_path, complete_audio, sample_rate)
 
157
  total_dur = len(complete_audio) / sample_rate
158
  log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
159
 
160
+ self.is_generating = False
161
  return (sample_rate, complete_audio), log
162
 
163
 
164
 
165
+
166
  def load_example_scripts(self):
167
  examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
168
  self.example_scripts = []