Update app.py
Browse files
app.py
CHANGED
@@ -1,90 +1,69 @@
|
|
1 |
-
# app.py
|
2 |
-
|
3 |
import gradio as gr
|
4 |
-
import torch
|
5 |
-
import torchaudio
|
6 |
import google.generativeai as genai
|
7 |
-
from e2_tts_pytorch import E2TTS, DurationPredictor
|
8 |
import numpy as np
|
9 |
-
import
|
10 |
-
import
|
11 |
-
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
#
|
|
|
14 |
|
15 |
def generate_podcast_script(api_key, content, duration):
|
16 |
genai.configure(api_key=api_key)
|
17 |
model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
|
18 |
|
19 |
prompt = f"""
|
20 |
-
Create a podcast script for two people discussing the following content:
|
21 |
{content}
|
22 |
|
23 |
The podcast should last approximately {duration}. Include natural speech patterns,
|
24 |
humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
|
25 |
-
"yes", "I see", "Ok now". Vary the emotional tone
|
26 |
-
and indicate these in [square brackets]. Format the script as follows:
|
27 |
|
28 |
-
|
29 |
-
Host
|
30 |
-
|
|
|
|
|
|
|
|
|
31 |
Ensure the conversation flows naturally and stays relevant to the topic.
|
32 |
"""
|
33 |
response = model.generate_content(prompt)
|
34 |
return response.text
|
35 |
|
36 |
-
def text_to_speech(text,
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
# Generate speech
|
42 |
-
with torch.no_grad():
|
43 |
-
sampled = e2tts.sample(mel[:, :5], text=[text])
|
44 |
-
|
45 |
-
audio = sampled.cpu().numpy().squeeze()
|
46 |
-
|
47 |
-
# Check if audio contains any non-zero values
|
48 |
-
if np.all(audio == 0):
|
49 |
-
print(f"Warning: Generated audio for '{text}' is all zeros.")
|
50 |
-
elif np.any(np.isnan(audio)) or np.any(np.isinf(audio)):
|
51 |
-
print(f"Warning: Generated audio for '{text}' contains NaN or Inf values.")
|
52 |
-
|
53 |
-
# Normalize audio to [-1, 1] range
|
54 |
-
audio = np.clip(audio, -1, 1)
|
55 |
-
|
56 |
-
return audio
|
57 |
-
|
58 |
-
def create_podcast(api_key, content, duration, voice1, voice2):
|
59 |
-
script = generate_podcast_script(api_key, content, duration)
|
60 |
-
return render_podcast(api_key, script, voice1, voice2)
|
61 |
|
62 |
-
def
|
63 |
-
script = generate_podcast_script(api_key, content, duration)
|
64 |
-
return script
|
65 |
-
|
66 |
-
def render_podcast(api_key, script, voice1, voice2):
|
67 |
lines = script.split('\n')
|
68 |
audio_segments = []
|
69 |
|
70 |
for line in lines:
|
71 |
-
if line.startswith("Host 1:")
|
72 |
-
audio = text_to_speech(line[7:],
|
73 |
-
|
74 |
-
|
|
|
|
|
75 |
|
76 |
if not audio_segments:
|
77 |
-
|
78 |
-
return (
|
79 |
|
80 |
# Concatenate audio segments
|
81 |
-
podcast_audio =
|
82 |
|
83 |
-
#
|
84 |
-
podcast_audio = np.
|
85 |
-
podcast_audio = podcast_audio.astype(np.int16)
|
86 |
|
87 |
-
return (
|
88 |
|
89 |
# Gradio Interface
|
90 |
with gr.Blocks() as demo:
|
@@ -99,8 +78,8 @@ with gr.Blocks() as demo:
|
|
99 |
duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
|
100 |
|
101 |
with gr.Row():
|
102 |
-
|
103 |
-
|
104 |
|
105 |
generate_btn = gr.Button("Generate Script")
|
106 |
script_output = gr.Textbox(label="Generated Script", lines=10)
|
@@ -108,7 +87,13 @@ with gr.Blocks() as demo:
|
|
108 |
render_btn = gr.Button("Render Podcast")
|
109 |
audio_output = gr.Audio(label="Generated Podcast")
|
110 |
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
demo.launch()
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
import google.generativeai as genai
|
|
|
3 |
import numpy as np
|
4 |
+
import edge_tts
|
5 |
+
import asyncio
|
6 |
+
|
7 |
+
# Set up logging
|
8 |
+
import logging
|
9 |
+
logging.basicConfig(level=logging.INFO)
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
|
12 |
+
# Initialize Gemini AI
|
13 |
+
genai.configure(api_key='YOUR_GEMINI_API_KEY')
|
14 |
|
15 |
def generate_podcast_script(api_key, content, duration):
    """Ask Gemini to write a two-host podcast script about ``content``.

    Args:
        api_key: Gemini API key; applied via ``genai.configure`` on every call.
        content: Source material the two hosts should discuss.
        duration: Target length, interpolated verbatim into the prompt.

    Returns:
        The ``text`` attribute of the model response.
    """
    genai.configure(api_key=api_key)
    gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

    # The prompt pins the "Host 1:" / "Host 2:" line format explicitly.
    instructions = f"""
Create a podcast script for two people (Host 1 and Host 2) discussing the following content:
{content}

The podcast should last approximately {duration}. Include natural speech patterns,
humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
"yes", "I see", "Ok now". Vary the emotional tone.

Format the script as follows, with each line representing a single speaker's dialogue:
Host 1: Dialog
Host 2: Dialog
Host 1: Dialog
Host 2: Dialog

Do not include any other text, markdown, or formatting. Only include the alternating dialogue lines.
Ensure the conversation flows naturally and stays relevant to the topic.
"""
    return gemini_model.generate_content(instructions).text
|
38 |
|
39 |
+
async def text_to_speech(text, voice):
    """Synthesize ``text`` with edge-tts and return the encoded audio bytes.

    ``edge_tts.Communicate`` has no ``to_wav`` method; audio is obtained by
    iterating ``stream()`` and collecting the audio payloads.

    Args:
        text: The sentence(s) to speak.
        voice: An edge-tts voice identifier.

    Returns:
        ``bytes`` holding the encoded audio exactly as the service sent it
        (compressed audio, MP3 by default -- not raw PCM samples).
    """
    communicate = edge_tts.Communicate(text, voice)
    audio_chunks = []
    # stream() interleaves audio chunks with metadata messages (e.g. word
    # boundaries); only the "audio" messages carry sound data.
    async for message in communicate.stream():
        if message["type"] == "audio":
            audio_chunks.append(message["data"])
    return b"".join(audio_chunks)


async def render_podcast(api_key, script, voice1, voice2):
    """Render a "Host 1:" / "Host 2:" script to a single audio clip.

    Each line that starts with ``Host 1:`` is spoken with ``voice1`` and each
    line that starts with ``Host 2:`` with ``voice2``; every other line is
    ignored. Lines are synthesized one after another, in script order.

    Args:
        api_key: Unused here; kept so existing callers keep working.
        script: Script text, one utterance per line.
        voice1: edge-tts voice for Host 1.
        voice2: edge-tts voice for Host 2.

    Returns:
        The path of a temporary ``.mp3`` file containing the whole podcast,
        or ``(24000, <one second of int16 silence>)`` when no line produced
        audio. ``gr.Audio`` accepts both forms as an output value.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    voice_for_prefix = {"Host 1:": voice1, "Host 2:": voice2}
    audio_segments = []

    for raw_line in script.splitlines():
        line = raw_line.strip()
        for prefix, voice in voice_for_prefix.items():
            if not line.startswith(prefix):
                continue
            # Drop the speaker tag and the space after it; skip blank
            # utterances instead of sending empty text to the TTS service.
            dialogue = line[len(prefix):].strip()
            if dialogue:
                segment = await text_to_speech(dialogue, voice)
                if segment:
                    audio_segments.append(segment)
            break

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.int16))  # Return silence if no valid audio was generated

    # The segments are encoded audio, so reinterpreting them as int16 PCM via
    # np.frombuffer would yield noise (or a ValueError on odd lengths).
    # Instead, write the joined stream to a file and let Gradio decode it.
    # NOTE(review): assumes all segments share one encoding so the joined
    # stream stays playable -- confirm with the voices in use. The file is not
    # deleted here (delete=False); clean up temp files if disk use matters.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as podcast_file:
        podcast_file.write(b"".join(audio_segments))
    return podcast_file.name
|
67 |
|
68 |
# Gradio Interface
|
69 |
with gr.Blocks() as demo:
|
|
|
78 |
duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
|
79 |
|
80 |
with gr.Row():
    # edge_tts.list_voices() is a coroutine function that resolves to a list
    # of voice-description dicts. Run it to completion once and offer the
    # "ShortName" identifiers, which is what edge_tts.Communicate takes as
    # its voice argument.
    # NOTE(review): asyncio.run() requires that no event loop is already
    # running at this point (true for a plain `python app.py` start).
    voice_names = sorted(
        voice["ShortName"] for voice in asyncio.run(edge_tts.list_voices())
    )
    voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_names)
    voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_names)
|
83 |
|
84 |
generate_btn = gr.Button("Generate Script")
|
85 |
script_output = gr.Textbox(label="Generated Script", lines=10)
|
|
|
87 |
render_btn = gr.Button("Render Podcast")
|
88 |
audio_output = gr.Audio(label="Generated Podcast")
|
89 |
|
90 |
+
def generate_script_wrapper(api_key, content, duration):
    """Gradio callback that forwards its arguments to ``generate_podcast_script``."""
    script_text = generate_podcast_script(api_key, content, duration)
    return script_text


async def render_podcast_wrapper(api_key, script, voice1, voice2):
    """Gradio callback that awaits ``render_podcast`` and returns its result unchanged."""
    rendered = await render_podcast(api_key, script, voice1, voice2)
    return rendered


# Wire each button to its callback: the script button fills the script
# textbox, the render button feeds that textbox plus both voices to the audio
# player.
generate_btn.click(
    generate_script_wrapper,
    inputs=[api_key_input, content_input, duration],
    outputs=script_output,
)
render_btn.click(
    render_podcast_wrapper,
    inputs=[api_key_input, script_output, voice1_select, voice2_select],
    outputs=audio_output,
)
|
98 |
|
99 |
demo.launch()
|