Update app.py
app.py CHANGED
@@ -137,9 +137,36 @@ def text_to_speech(text, voice):
     inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
         output = model.generate(**inputs, max_new_tokens=256)
-
+    # Assuming the model outputs mel spectrograms
+    mel = output[0].cpu().numpy()  # Explicitly move to CPU for numpy conversion
+    # Convert mel spectrogram to audio (you might need to implement this conversion)
+    audio = mel_to_audio(mel)  # This function needs to be implemented
     return audio
 
+def render_podcast(api_key, script, voice1, voice2, num_hosts):
+    lines = [line for line in script.split('\n') if line.strip()]
+    audio_segments = []
+
+    for i, line in enumerate(lines):
+        voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
+        audio = text_to_speech(line, voice)
+        audio_segments.append(audio)
+
+    if not audio_segments:
+        logger.warning("No valid audio segments were generated.")
+        return (24000, np.zeros(24000, dtype=np.float32))
+
+    podcast_audio = np.concatenate(audio_segments)
+    return (24000, podcast_audio)  # Assuming 24kHz sample rate
+
+# You'll need to implement this function based on the model's output
+def mel_to_audio(mel):
+    # Convert mel spectrogram to audio
+    # This will depend on the specific output of your model
+    # You might need to use a vocoder or other conversion method
+    # For now, we'll just return a placeholder
+    return np.zeros(24000, dtype=np.float32)  # 1 second of silence as placeholder
+
 def process_audio_segment(line, voice, result_queue):
     audio = text_to_speech(line, voice)
     result_queue.put(audio)
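The commit deliberately leaves `mel_to_audio` returning one second of silence. As a rough sketch of one way it could be filled in (not part of this commit): if the model really does emit power mel spectrograms, librosa's Griffin-Lim inversion can approximate a waveform without a neural vocoder. The `sr`, `n_fft`, and `hop_length` values below are assumptions that would have to match the feature extraction the model was trained with.

```python
import librosa
import numpy as np

def mel_to_audio(mel):
    # Sketch only: assumes `mel` is a power mel spectrogram of shape
    # (n_mels, frames). sr/n_fft/hop_length are guesses and must match
    # the model's actual front end.
    audio = librosa.feature.inverse.mel_to_audio(
        mel.astype(np.float32),
        sr=24000,        # assumed; matches the 24000 used elsewhere in the diff
        n_fft=1024,      # assumed FFT size
        hop_length=256,  # assumed hop length
        n_iter=32,       # Griffin-Lim iterations; more iterations = better phase estimate
    )
    return audio.astype(np.float32)
```

A neural vocoder such as HiFi-GAN would sound noticeably better than Griffin-Lim, but this keeps the dependency surface to librosa alone.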
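For reference, `render_podcast` alternates voices line by line and returns a `(sample_rate, ndarray)` tuple, presumably targeting a Gradio Audio output. A call might look like the sketch below; the voice names are made up for illustration, and `api_key` is passed as `None` only because the hunk shown here never reads it.

```python
script = "Welcome to the show!\nThanks, great to be here.\nLet's dive in."

# With num_hosts == 2, even-indexed lines use voice1 and odd-indexed lines voice2.
sr, audio = render_podcast(api_key=None, script=script,
                           voice1="host_a", voice2="host_b", num_hosts=2)
print(sr, audio.shape)  # 24000 (total_samples,)
```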
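The commit also keeps `process_audio_segment`, which pushes results into a queue, suggesting threaded generation. One way it might be driven, using only the standard library (a sketch, and note the ordering caveat in the comments):

```python
import threading
import queue

def render_lines_threaded(lines, voice):
    # Sketch: one thread per line, collecting results through the shared queue.
    # Queue order follows completion order, not line order, so for podcast
    # assembly each result would need to be tagged with its line index.
    result_queue = queue.Queue()
    threads = [
        threading.Thread(target=process_audio_segment,
                         args=(line, voice, result_queue))
        for line in lines
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return [result_queue.get() for _ in lines]
```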
|