yasserrmd committed on
Commit
e873ae8
·
verified ·
1 Parent(s): 2565173

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -18
app.py CHANGED
@@ -76,11 +76,9 @@ class VibeVoiceDemo:
76
 
77
  if not script.strip():
78
  raise gr.Error("Please provide a script.")
79
-
80
- if num_speakers < 1 or num_speakers > 4:
81
  raise gr.Error("Number of speakers must be 1–4.")
82
 
83
- # collect speakers
84
  selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
85
  for i, sp in enumerate(selected):
86
  if not sp or sp not in self.available_voices:
@@ -90,7 +88,6 @@ class VibeVoiceDemo:
90
  if any(len(v) == 0 for v in voice_samples):
91
  raise gr.Error("Failed to load one or more voice samples.")
92
 
93
- # format script
94
  lines = script.strip().split("\n")
95
  formatted = []
96
  for i, line in enumerate(lines):
@@ -104,7 +101,6 @@ class VibeVoiceDemo:
104
  formatted.append(f"Speaker {sp_id}: {line}")
105
  formatted_script = "\n".join(formatted)
106
 
107
- # processor input
108
  inputs = self.processor(
109
  text=[formatted_script],
110
  voice_samples=[voice_samples],
@@ -119,48 +115,52 @@ class VibeVoiceDemo:
119
  tokenizer=self.processor.tokenizer,
120
  verbose=False
121
  )
 
122
 
123
- # --- handle model output robustly ---
124
- if hasattr(outputs, "audio"):
125
- audio = outputs.audio
126
- elif hasattr(outputs, "audios") and outputs.audios:
 
127
  audio = outputs.audios[0]
128
- elif hasattr(outputs, "waveform"):
129
- audio = outputs.waveform
130
  elif hasattr(outputs, "waveforms") and outputs.waveforms:
131
  audio = outputs.waveforms[0]
 
 
132
  elif hasattr(outputs, "speech_outputs") and outputs.speech_outputs:
133
  audio = outputs.speech_outputs[0]
134
  else:
135
- raise gr.Error(f"Model did not return audio in expected format. Got attributes: {dir(outputs)}")
 
 
 
136
 
137
- # convert to numpy
138
  if torch.is_tensor(audio):
139
  audio = audio.float().cpu().numpy()
140
  if audio.ndim > 1:
141
  audio = audio.squeeze()
142
 
143
  sample_rate = 24000
144
- # ensure float32 for saving and returning
145
  audio = audio.astype("float32")
146
 
147
- # save automatically to disk
148
  os.makedirs("outputs", exist_ok=True)
149
  from datetime import datetime
150
  import soundfile as sf
151
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
152
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
153
- sf.write(file_path, audio, sample_rate) # soundfile handles float32
154
-
155
  print(f"💾 Saved podcast to {file_path}")
156
 
157
  total_dur = len(audio) / sample_rate
158
- log = f"✅ Generation complete in {time.time()-start:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
159
 
160
  self.is_generating = False
161
  return (sample_rate, audio), log
162
 
163
 
 
164
  def load_example_scripts(self):
165
  examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
166
  self.example_scripts = []
 
76
 
77
  if not script.strip():
78
  raise gr.Error("Please provide a script.")
79
+ if not (1 <= num_speakers <= 4):
 
80
  raise gr.Error("Number of speakers must be 1–4.")
81
 
 
82
  selected = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
83
  for i, sp in enumerate(selected):
84
  if not sp or sp not in self.available_voices:
 
88
  if any(len(v) == 0 for v in voice_samples):
89
  raise gr.Error("Failed to load one or more voice samples.")
90
 
 
91
  lines = script.strip().split("\n")
92
  formatted = []
93
  for i, line in enumerate(lines):
 
101
  formatted.append(f"Speaker {sp_id}: {line}")
102
  formatted_script = "\n".join(formatted)
103
 
 
104
  inputs = self.processor(
105
  text=[formatted_script],
106
  voice_samples=[voice_samples],
 
115
  tokenizer=self.processor.tokenizer,
116
  verbose=False
117
  )
118
+ gen_time = time.time() - start
119
 
120
+ print("DEBUG: outputs type:", type(outputs))
121
+ print("DEBUG: outputs dir:", dir(outputs))
122
+
123
+ audio = None
124
+ if hasattr(outputs, "audios") and outputs.audios:
125
  audio = outputs.audios[0]
126
+ elif hasattr(outputs, "audio"):
127
+ audio = outputs.audio
128
  elif hasattr(outputs, "waveforms") and outputs.waveforms:
129
  audio = outputs.waveforms[0]
130
+ elif hasattr(outputs, "waveform"):
131
+ audio = outputs.waveform
132
  elif hasattr(outputs, "speech_outputs") and outputs.speech_outputs:
133
  audio = outputs.speech_outputs[0]
134
  else:
135
+ raise gr.Error(f"No audio found in output. Check debug: {dir(outputs)}")
136
+
137
+ if audio is None:
138
  raise gr.Error("Extracted audio is None — check model output structure.")
139
 
 
140
  if torch.is_tensor(audio):
141
  audio = audio.float().cpu().numpy()
142
  if audio.ndim > 1:
143
  audio = audio.squeeze()
144
 
145
  sample_rate = 24000
 
146
  audio = audio.astype("float32")
147
 
 
148
  os.makedirs("outputs", exist_ok=True)
149
  from datetime import datetime
150
  import soundfile as sf
151
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
152
  file_path = os.path.join("outputs", f"podcast_{timestamp}.wav")
153
+ sf.write(file_path, audio, sample_rate)
 
154
  print(f"💾 Saved podcast to {file_path}")
155
 
156
  total_dur = len(audio) / sample_rate
157
+ log = f"✅ Generation complete in {gen_time:.1f}s, {total_dur:.1f}s audio\nSaved to {file_path}"
158
 
159
  self.is_generating = False
160
  return (sample_rate, audio), log
161
 
162
 
163
+
164
  def load_example_scripts(self):
165
  examples_dir = os.path.join(os.path.dirname(__file__), "text_examples")
166
  self.example_scripts = []