Update app.py
Browse files
app.py
CHANGED
@@ -42,7 +42,18 @@ def text_to_speech(text, speaker_id):
|
|
42 |
with torch.no_grad():
|
43 |
sampled = e2tts.sample(mel[:, :5], text=[text])
|
44 |
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
def create_podcast(api_key, content, duration, voice1, voice2):
|
48 |
script = generate_podcast_script(api_key, content, duration)
|
@@ -57,18 +68,22 @@ def render_podcast(api_key, script, voice1, voice2):
|
|
57 |
audio_segments = []
|
58 |
|
59 |
for line in lines:
|
60 |
-
if line.startswith("Host 1:"):
|
61 |
-
audio = text_to_speech(line[7:], speaker_id=0)
|
62 |
-
|
63 |
-
|
64 |
-
audio = text_to_speech(line[7:], speaker_id=1)
|
65 |
-
audio_segments.append(audio)
|
66 |
|
67 |
if not audio_segments:
|
68 |
-
|
|
|
69 |
|
70 |
# Concatenate audio segments
|
71 |
podcast_audio = np.concatenate(audio_segments)
|
|
|
|
|
|
|
|
|
|
|
72 |
return (22050, podcast_audio) # Assuming 22050 Hz sample rate
|
73 |
|
74 |
# Gradio Interface
|
|
|
42 |
with torch.no_grad():
|
43 |
sampled = e2tts.sample(mel[:, :5], text=[text])
|
44 |
|
45 |
+
audio = sampled.cpu().numpy().squeeze()
|
46 |
+
|
47 |
+
# Warn if the generated audio is all zeros or contains NaN/Inf values
|
48 |
+
if np.all(audio == 0):
|
49 |
+
print(f"Warning: Generated audio for '{text}' is all zeros.")
|
50 |
+
elif np.any(np.isnan(audio)) or np.any(np.isinf(audio)):
|
51 |
+
print(f"Warning: Generated audio for '{text}' contains NaN or Inf values.")
|
52 |
+
|
53 |
+
# Clip audio to the [-1, 1] range (out-of-range samples are clamped, not rescaled)
|
54 |
+
audio = np.clip(audio, -1, 1)
|
55 |
+
|
56 |
+
return audio
|
57 |
|
58 |
def create_podcast(api_key, content, duration, voice1, voice2):
|
59 |
script = generate_podcast_script(api_key, content, duration)
|
|
|
68 |
audio_segments = []
|
69 |
|
70 |
for line in lines:
|
71 |
+
if line.startswith("Host 1:") or line.startswith("Host 2:"):
|
72 |
+
audio = text_to_speech(line[7:], speaker_id=0 if line.startswith("Host 1:") else 1)
|
73 |
+
if not np.all(audio == 0) and not np.any(np.isnan(audio)) and not np.any(np.isinf(audio)):
|
74 |
+
audio_segments.append(audio)
|
|
|
|
|
75 |
|
76 |
if not audio_segments:
|
77 |
+
print("Warning: No valid audio segments were generated.")
|
78 |
+
return (22050, np.zeros(22050)) # Return silence if no valid audio was generated
|
79 |
|
80 |
# Concatenate audio segments
|
81 |
podcast_audio = np.concatenate(audio_segments)
|
82 |
+
|
83 |
+
# Clip to [-1, 1] and scale by 32767 so the samples fit int16 before the cast
|
84 |
+
podcast_audio = np.clip(podcast_audio, -1, 1) * 32767
|
85 |
+
podcast_audio = podcast_audio.astype(np.int16)
|
86 |
+
|
87 |
return (22050, podcast_audio) # Assuming 22050 Hz sample rate
|
88 |
|
89 |
# Gradio Interface
|