Spaces:

MicroHealth
/

ai-podcast-builder

Paused

App Files Community

bluenevus committed on Apr 12

Commit

7d92703

verified ·

1 Parent(s): ac81409

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -8

app.py CHANGED Viewed

@@ -42,7 +42,18 @@ def text_to_speech(text, speaker_id):
     with torch.no_grad():
         sampled = e2tts.sample(mel[:, :5], text=[text])
-    return sampled.cpu().numpy().squeeze()
 def create_podcast(api_key, content, duration, voice1, voice2):
     script = generate_podcast_script(api_key, content, duration)
@@ -57,18 +68,22 @@ def render_podcast(api_key, script, voice1, voice2):
     audio_segments = []
     for line in lines:
-        if line.startswith("Host 1:"):
-            audio = text_to_speech(line[7:], speaker_id=0)
-            audio_segments.append(audio)
-        elif line.startswith("Host 2:"):
-            audio = text_to_speech(line[7:], speaker_id=1)
-            audio_segments.append(audio)
     if not audio_segments:
-        return (22050, np.zeros(22050))  # Return silence if no audio was generated
     # Concatenate audio segments
     podcast_audio = np.concatenate(audio_segments)
     return (22050, podcast_audio)  # Assuming 22050 Hz sample rate
 # Gradio Interface

     with torch.no_grad():
         sampled = e2tts.sample(mel[:, :5], text=[text])
+    audio = sampled.cpu().numpy().squeeze()
+    # Check if audio contains any non-zero values
+    if np.all(audio == 0):
+        print(f"Warning: Generated audio for '{text}' is all zeros.")
+    elif np.any(np.isnan(audio)) or np.any(np.isinf(audio)):
+        print(f"Warning: Generated audio for '{text}' contains NaN or Inf values.")
+    # Clip audio to the [-1, 1] range (out-of-range samples are saturated, not rescaled)
+    audio = np.clip(audio, -1, 1)
+    return audio
 def create_podcast(api_key, content, duration, voice1, voice2):
     script = generate_podcast_script(api_key, content, duration)
     audio_segments = []
     for line in lines:
+        if line.startswith("Host 1:") or line.startswith("Host 2:"):
+            audio = text_to_speech(line[7:], speaker_id=0 if line.startswith("Host 1:") else 1)
+            if not np.all(audio == 0) and not np.any(np.isnan(audio)) and not np.any(np.isinf(audio)):
+                audio_segments.append(audio)
     if not audio_segments:
+        print("Warning: No valid audio segments were generated.")
+        return (22050, np.zeros(22050))  # Return silence if no valid audio was generated
     # Concatenate audio segments
     podcast_audio = np.concatenate(audio_segments)
+    # Ensure audio is in the correct range for int16
+    podcast_audio = np.clip(podcast_audio, -1, 1) * 32767
+    podcast_audio = podcast_audio.astype(np.int16)
     return (22050, podcast_audio)  # Assuming 22050 Hz sample rate
 # Gradio Interface