bluenevus committed on
Commit
2148d28
·
verified ·
1 Parent(s): 5ba3e1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -1
app.py CHANGED
@@ -137,9 +137,36 @@ def text_to_speech(text, voice):
137
  inputs = tokenizer(text, return_tensors="pt").to(device)
138
  with torch.no_grad():
139
  output = model.generate(**inputs, max_new_tokens=256)
140
- audio = output.audio.cpu().numpy()
 
 
 
141
  return audio
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
def process_audio_segment(line, voice, result_queue):
    """Synthesize one script line with `voice` and push the resulting audio onto `result_queue`."""
    result_queue.put(text_to_speech(line, voice))
 
137
  inputs = tokenizer(text, return_tensors="pt").to(device)
138
  with torch.no_grad():
139
  output = model.generate(**inputs, max_new_tokens=256)
140
+ # Assuming the model outputs mel spectrograms
141
+ mel = output[0].cpu().numpy() # Explicitly move to CPU for numpy conversion
142
+ # Convert mel spectrogram to audio (you might need to implement this conversion)
143
+ audio = mel_to_audio(mel) # This function needs to be implemented
144
  return audio
145
 
146
def render_podcast(api_key, script, voice1, voice2, num_hosts):
    """Render a podcast by synthesizing every non-empty line of `script`.

    When num_hosts is 2, successive lines alternate between voice1 and
    voice2; otherwise voice1 reads every line.  Returns a
    (sample_rate, samples) tuple.  The 24 kHz sample rate is assumed —
    TODO confirm against the TTS model's actual output rate.
    """
    spoken_lines = [entry for entry in script.split('\n') if entry.strip()]
    segments = [
        text_to_speech(entry, voice1 if num_hosts == 1 or idx % 2 == 0 else voice2)
        for idx, entry in enumerate(spoken_lines)
    ]

    # Guard clause: nothing synthesizable — return one second of silence.
    if not segments:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.float32))

    return (24000, np.concatenate(segments))
161
+
162
# NOTE(review): converting the model's mel output to a waveform needs a real
# vocoder (e.g. Griffin-Lim or a neural vocoder); none is wired up yet.
def mel_to_audio(mel):
    """Convert a mel spectrogram to an audio waveform.

    Currently a stub: the `mel` input is ignored and one second of
    silence at the assumed 24 kHz sample rate is returned instead —
    TODO replace with an actual mel-inversion step.
    """
    one_second_samples = 24000  # samples of silence at the assumed 24 kHz rate
    return np.zeros(one_second_samples, dtype=np.float32)
169
+
170
def process_audio_segment(line, voice, result_queue):
    """Worker helper: run TTS on `line` and hand the audio back to the caller via `result_queue`."""
    generated = text_to_speech(line, voice)
    result_queue.put(generated)