Update app.py
app.py CHANGED
@@ -137,9 +137,36 @@ def text_to_speech(text, voice):
     inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
         output = model.generate(**inputs, max_new_tokens=256)
-
+    # Assuming the model outputs mel spectrograms
+    mel = output[0].cpu().numpy()  # Explicitly move to CPU for numpy conversion
+    # Convert mel spectrogram to audio (you might need to implement this conversion)
+    audio = mel_to_audio(mel)  # This function needs to be implemented
     return audio
 
+def render_podcast(api_key, script, voice1, voice2, num_hosts):
+    lines = [line for line in script.split('\n') if line.strip()]
+    audio_segments = []
+
+    for i, line in enumerate(lines):
+        voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
+        audio = text_to_speech(line, voice)
+        audio_segments.append(audio)
+
+    if not audio_segments:
+        logger.warning("No valid audio segments were generated.")
+        return (24000, np.zeros(24000, dtype=np.float32))
+
+    podcast_audio = np.concatenate(audio_segments)
+    return (24000, podcast_audio)  # Assuming 24kHz sample rate
+
+# You'll need to implement this function based on the model's output
+def mel_to_audio(mel):
+    # Convert mel spectrogram to audio
+    # This will depend on the specific output of your model
+    # You might need to use a vocoder or other conversion method
+    # For now, we'll just return a placeholder
+    return np.zeros(24000, dtype=np.float32)  # 1 second of silence as placeholder
+
 def process_audio_segment(line, voice, result_queue):
     audio = text_to_speech(line, voice)
     result_queue.put(audio)
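The commit deliberately leaves `mel_to_audio` returning one second of silence. As a rough sketch of one way it could be filled in (not part of this commit): if the model really does emit power mel spectrograms, librosa's Griffin-Lim inversion can approximate a waveform without a neural vocoder. The `sr`, `n_fft`, and `hop_length` values below are assumptions that would have to match the feature extraction the model was trained with.

```python
import librosa
import numpy as np

def mel_to_audio(mel):
    # Sketch only: assumes `mel` is a power mel spectrogram of shape
    # (n_mels, frames). sr/n_fft/hop_length are guesses and must match
    # the model's actual front end.
    audio = librosa.feature.inverse.mel_to_audio(
        mel.astype(np.float32),
        sr=24000,        # assumed; matches the 24000 used elsewhere in the diff
        n_fft=1024,      # assumed FFT size
        hop_length=256,  # assumed hop length
        n_iter=32,       # Griffin-Lim iterations; more iterations = better phase estimate
    )
    return audio.astype(np.float32)
```

A neural vocoder such as HiFi-GAN would sound noticeably better than Griffin-Lim, but this keeps the dependency surface to librosa alone.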
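For reference, `render_podcast` alternates voices line by line and returns a `(sample_rate, ndarray)` tuple, presumably targeting a Gradio Audio output. A call might look like the sketch below; the voice names are made up for illustration, and `api_key` is passed as `None` only because the hunk shown here never reads it.

```python
script = "Welcome to the show!\nThanks, great to be here.\nLet's dive in."

# With num_hosts == 2, even-indexed lines use voice1 and odd-indexed lines voice2.
sr, audio = render_podcast(api_key=None, script=script,
                           voice1="host_a", voice2="host_b", num_hosts=2)
print(sr, audio.shape)  # 24000 (total_samples,)
```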
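The commit also keeps `process_audio_segment`, which pushes results into a queue, suggesting threaded generation. One way it might be driven, using only the standard library (a sketch, and note the ordering caveat in the comments):

```python
import threading
import queue

def render_lines_threaded(lines, voice):
    # Sketch: one thread per line, collecting results through the shared queue.
    # Queue order follows completion order, not line order, so for podcast
    # assembly each result would need to be tagged with its line index.
    result_queue = queue.Queue()
    threads = [
        threading.Thread(target=process_audio_segment,
                         args=(line, voice, result_queue))
        for line in lines
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return [result_queue.get() for _ in lines]
```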
|