Update app.py
Browse files
app.py
CHANGED
@@ -1,166 +1,52 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
-
import numpy as np
|
4 |
-
import edge_tts
|
5 |
-
import asyncio
|
6 |
import io
|
7 |
-
import
|
|
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
logging.basicConfig(level=logging.INFO)
|
12 |
-
logger = logging.getLogger(__name__)
|
13 |
-
|
14 |
-
# Initialize Gemini AI
|
15 |
-
genai.configure(api_key='YOUR_GEMINI_API_KEY')
|
16 |
-
|
17 |
-
def generate_podcast_script(api_key, content, duration):
    """Ask Gemini for a two-person podcast script about *content*.

    Args:
        api_key: Gemini API key; passed to ``genai.configure`` on every call.
        content: Source material the two speakers should discuss.
        duration: Human-readable target length (interpolated into the prompt
            verbatim, twice).

    Returns:
        The model's response text with markup characters removed, so that
        only letters, digits, whitespace, apostrophes and ``. , ? !`` remain.
    """
    # Local import keeps this block self-contained; only the cleanup step needs it.
    import unicodedata

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

    prompt = f"""
Create a podcast script for two people discussing the following content:
{content}

The podcast should last approximately {duration}. Include natural speech patterns,
humor, and occasional off-topic chit-chat. Use speech fillers like um, ah,
yes, I see, Ok now. Vary the emotional tone.

Format the script as alternating lines of dialogue without speaker labels.
Do not use any special characters, markdown, or formatting. Only include the alternating dialogue lines.
Ensure the conversation flows naturally and stays relevant to the topic.
Limit the script length to match the requested duration of {duration}.
"""
    response = model.generate_content(prompt)

    # Remove any special characters that might be read aloud.
    # The previous ASCII-only whitelist also deleted apostrophes ("we'll" ->
    # "well") and every non-ASCII letter, which corrupts contractions and any
    # non-English script. Keep all Unicode letters, combining marks and
    # numbers instead; for plain ASCII text without apostrophes the result is
    # unchanged.
    kept_punctuation = ".,?!'"
    normalized = response.text.replace("\u2019", "'")  # typographic apostrophe
    return "".join(
        ch
        for ch in normalized
        if ch in kept_punctuation
        or ch.isspace()
        or unicodedata.category(ch)[0] in "LMN"
    )
|
38 |
-
|
39 |
-
async def text_to_speech(text, voice):
    """Synthesize *text* with the given edge-tts *voice* and return the audio bytes.

    Streams the result of ``edge_tts.Communicate(text, voice)`` and keeps only
    the payload of chunks whose ``"type"`` is ``"audio"``, concatenated in
    arrival order.
    """
    audio_parts = [
        piece["data"]
        async for piece in edge_tts.Communicate(text, voice).stream()
        if piece["type"] == "audio"
    ]
    return b"".join(audio_parts)
|
47 |
-
|
48 |
-
async def render_podcast(api_key, script, voice1, voice2):
|
49 |
-
lines = script.split('\n')
|
50 |
audio_segments = []
|
51 |
-
|
52 |
for i, line in enumerate(lines):
|
53 |
if line.strip(): # Skip empty lines
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
'en-IE': 'English (Ireland)', 'en-IN': 'English (India)', 'en-KE': 'English (Kenya)', 'en-NG': 'English (Nigeria)',
|
92 |
-
'en-NZ': 'English (New Zealand)', 'en-PH': 'English (Philippines)', 'en-SG': 'English (Singapore)', 'en-TZ': 'English (Tanzania)',
|
93 |
-
'en-US': 'English (United States)', 'en-ZA': 'English (South Africa)', 'es-AR': 'Spanish (Argentina)', 'es-BO': 'Spanish (Bolivia)',
|
94 |
-
'es-CL': 'Spanish (Chile)', 'es-CO': 'Spanish (Colombia)', 'es-CR': 'Spanish (Costa Rica)', 'es-CU': 'Spanish (Cuba)',
|
95 |
-
'es-DO': 'Spanish (Dominican Republic)', 'es-EC': 'Spanish (Ecuador)', 'es-ES': 'Spanish (Spain)', 'es-GQ': 'Spanish (Equatorial Guinea)',
|
96 |
-
'es-GT': 'Spanish (Guatemala)', 'es-HN': 'Spanish (Honduras)', 'es-MX': 'Spanish (Mexico)', 'es-NI': 'Spanish (Nicaragua)',
|
97 |
-
'es-PA': 'Spanish (Panama)', 'es-PE': 'Spanish (Peru)', 'es-PR': 'Spanish (Puerto Rico)', 'es-PY': 'Spanish (Paraguay)',
|
98 |
-
'es-SV': 'Spanish (El Salvador)', 'es-US': 'Spanish (United States)', 'es-UY': 'Spanish (Uruguay)', 'es-VE': 'Spanish (Venezuela)',
|
99 |
-
'et-EE': 'Estonian (Estonia)', 'eu-ES': 'Basque (Spain)', 'fa-IR': 'Persian (Iran)', 'fi-FI': 'Finnish (Finland)',
|
100 |
-
'fil-PH': 'Filipino (Philippines)', 'fr-BE': 'French (Belgium)', 'fr-CA': 'French (Canada)', 'fr-CH': 'French (Switzerland)',
|
101 |
-
'fr-FR': 'French (France)', 'ga-IE': 'Irish (Ireland)', 'gl-ES': 'Galician (Spain)', 'gu-IN': 'Gujarati (India)',
|
102 |
-
'he-IL': 'Hebrew (Israel)', 'hi-IN': 'Hindi (India)', 'hr-HR': 'Croatian (Croatia)', 'hu-HU': 'Hungarian (Hungary)',
|
103 |
-
'hy-AM': 'Armenian (Armenia)', 'id-ID': 'Indonesian (Indonesia)', 'is-IS': 'Icelandic (Iceland)', 'it-IT': 'Italian (Italy)',
|
104 |
-
'ja-JP': 'Japanese (Japan)', 'jv-ID': 'Javanese (Indonesia)', 'ka-GE': 'Georgian (Georgia)', 'kk-KZ': 'Kazakh (Kazakhstan)',
|
105 |
-
'km-KH': 'Khmer (Cambodia)', 'kn-IN': 'Kannada (India)', 'ko-KR': 'Korean (Korea)', 'lo-LA': 'Lao (Laos)',
|
106 |
-
'lt-LT': 'Lithuanian (Lithuania)', 'lv-LV': 'Latvian (Latvia)', 'mk-MK': 'Macedonian (North Macedonia)', 'ml-IN': 'Malayalam (India)',
|
107 |
-
'mn-MN': 'Mongolian (Mongolia)', 'mr-IN': 'Marathi (India)', 'ms-MY': 'Malay (Malaysia)', 'mt-MT': 'Maltese (Malta)',
|
108 |
-
'my-MM': 'Burmese (Myanmar)', 'nb-NO': 'Norwegian (Bokmål, Norway)', 'ne-NP': 'Nepali (Nepal)', 'nl-BE': 'Dutch (Belgium)',
|
109 |
-
'nl-NL': 'Dutch (Netherlands)', 'pl-PL': 'Polish (Poland)', 'ps-AF': 'Pashto (Afghanistan)', 'pt-BR': 'Portuguese (Brazil)',
|
110 |
-
'pt-PT': 'Portuguese (Portugal)', 'ro-RO': 'Romanian (Romania)', 'ru-RU': 'Russian (Russia)', 'si-LK': 'Sinhala (Sri Lanka)',
|
111 |
-
'sk-SK': 'Slovak (Slovakia)', 'sl-SI': 'Slovenian (Slovenia)', 'so-SO': 'Somali (Somalia)', 'sq-AL': 'Albanian (Albania)',
|
112 |
-
'sr-RS': 'Serbian (Serbia)', 'sv-SE': 'Swedish (Sweden)', 'sw-KE': 'Swahili (Kenya)', 'sw-TZ': 'Swahili (Tanzania)',
|
113 |
-
'ta-IN': 'Tamil (India)', 'ta-LK': 'Tamil (Sri Lanka)', 'ta-MY': 'Tamil (Malaysia)', 'ta-SG': 'Tamil (Singapore)',
|
114 |
-
'te-IN': 'Telugu (India)', 'th-TH': 'Thai (Thailand)', 'tr-TR': 'Turkish (Turkey)', 'uk-UA': 'Ukrainian (Ukraine)',
|
115 |
-
'ur-IN': 'Urdu (India)', 'ur-PK': 'Urdu (Pakistan)', 'uz-UZ': 'Uzbek (Uzbekistan)', 'vi-VN': 'Vietnamese (Vietnam)',
|
116 |
-
'wuu-CN': 'Wu Chinese (China)', 'yue-CN': 'Cantonese (China)', 'zh-CN': 'Chinese (Mandarin, Simplified)',
|
117 |
-
'zh-HK': 'Chinese (Cantonese, Traditional)', 'zh-TW': 'Chinese (Taiwanese Mandarin)', 'zu-ZA': 'Zulu (South Africa)'
|
118 |
-
}
|
119 |
-
|
120 |
-
# Gradio Interface
|
121 |
-
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")

    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")

    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): this upload is not wired to any event handler below.
        document_upload = gr.File(label="Upload Document")

    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")

    # Voice catalogue keyed by language code; fetched once while the UI is built.
    voice_dict = asyncio.run(get_voice_list())
    languages = list(voice_dict.keys())

    with gr.Row():
        lang1_select = gr.Dropdown(label="Select Language 1", choices=[f"{language_names.get(lang, lang)}" for lang in languages], value="English (United States)")
        voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_dict.get('en-US', []), value="en-US-AnaNeural")

    with gr.Row():
        lang2_select = gr.Dropdown(label="Select Language 2", choices=[f"{language_names.get(lang, lang)}" for lang in languages], value="English (United States)")
        voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_dict.get('en-US', []), value="en-US-MichelleNeural")

    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)

    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")

    def update_voices(lang):
        """Return a voice-dropdown update matching the language label *lang*."""
        # The language dropdowns label a code with language_names.get(code, code),
        # so a code without a display name is shown as the raw code. Fall back
        # to the label itself here; falling back to None always produced an
        # empty voice list for those languages.
        selected_lang = next((key for key, value in language_names.items() if value == lang), lang)
        voices = voice_dict.get(selected_lang, [])

        # Reset the selection as well: keeping the previous language's voice
        # would leave a value that is not among the new choices.
        first_voice = voices[0] if voices else None
        # assumes choices are plain voice names or (label, value) pairs —
        # TODO confirm against get_voice_list
        if isinstance(first_voice, (tuple, list)):
            first_voice = first_voice[1]
        return gr.Dropdown(choices=voices, value=first_voice)

    lang1_select.change(update_voices, inputs=[lang1_select], outputs=[voice1_select])
    lang2_select.change(update_voices, inputs=[lang2_select], outputs=[voice2_select])

    def generate_script_wrapper(api_key, content, duration):
        """Forward the form values to generate_podcast_script."""
        return generate_podcast_script(api_key, content, duration)

    async def render_podcast_wrapper(api_key, script, voice1, voice2):
        """Forward the form values to render_podcast and await its result."""
        return await render_podcast(api_key, script, voice1, voice2)

    generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration], outputs=script_output)
    render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select], outputs=audio_output)
|
164 |
|
165 |
if __name__ == "__main__":
|
166 |
-
|
|
|
1 |
import gradio as gr
|
2 |
+
from gtts import gTTS
|
|
|
|
|
|
|
3 |
import io
|
4 |
+
import os
|
5 |
+
from pydub import AudioSegment
|
6 |
|
7 |
+
def text_to_speech(text):
    """Render multi-line text as one MP3, synthesizing each non-blank line with gTTS.

    Every non-blank line is synthesized on its own (English, normal speed),
    resampled to one of two alternating frame rates, and the clips are
    concatenated in order and exported as MP3.

    Args:
        text: Input text; lines are separated by ``'\\n'``. Blank or
            whitespace-only lines are skipped. ``None`` is treated as empty.

    Returns:
        An ``io.BytesIO`` rewound to position 0 that holds the MP3 data.

    Raises:
        ValueError: If ``text`` contains no non-blank line. Previously this
            case surfaced as an AttributeError because ``sum([])`` is ``0``.
    """
    # Drop blank lines first so that speaker parity follows the spoken lines;
    # counting raw line indices let blank separator lines break the
    # alternation.
    spoken_lines = [line for line in (text or "").split('\n') if line.strip()]
    if not spoken_lines:
        raise ValueError("text_to_speech needs at least one non-empty line of text")

    audio_segments = []
    for i, line in enumerate(spoken_lines):
        # Synthesize into memory rather than "temp_<i>.mp3" in the working
        # directory: fixed file names collide between concurrent requests and
        # were left behind whenever synthesis or decoding raised.
        clip = io.BytesIO()
        gTTS(text=line, lang='en', slow=False).write_to_fp(clip)
        clip.seek(0)
        audio_segment = AudioSegment.from_mp3(clip)

        # Alternate between two frame rates (even lines: speaker 1, odd lines:
        # speaker 2).
        # NOTE(review): set_frame_rate resamples the clip to an equivalent
        # segment at the new rate, so this presumably does not produce an
        # audibly different pitch — confirm, and switch to a real voice or
        # pitch change if the two speakers must sound distinct.
        frame_rate = 44100 if i % 2 == 0 else 22050
        audio_segments.append(audio_segment.set_frame_rate(frame_rate))

    # Concatenate all audio segments, seeding with the first clip instead of
    # relying on sum()'s integer 0 start value.
    final_audio = sum(audio_segments[1:], audio_segments[0])

    # Export the final audio to a byte stream
    buffer = io.BytesIO()
    final_audio.export(buffer, format="mp3")
    buffer.seek(0)

    return buffer
|
42 |
+
|
43 |
+
def _speech_to_file(text):
    """Run text_to_speech and hand the MP3 to Gradio as a file path.

    text_to_speech returns an in-memory byte stream, which the Audio output
    component cannot consume directly, so the bytes are written to a
    temporary ``.mp3`` file and its path is returned instead.
    """
    import tempfile  # local import: only this adapter needs it

    audio = text_to_speech(text)
    # delete=False because Gradio reads the file after this function returns.
    # NOTE(review): these files are not cleaned up here.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        handle.write(audio.getvalue())
    return handle.name


# Single-textbox UI around the two-speaker synthesizer. gr.Audio only accepts
# "numpy" or "filepath" for `type`; the previous "binary" value is a gr.File
# option and is rejected when the component is constructed.
iface = gr.Interface(
    fn=_speech_to_file,
    inputs=gr.Textbox(lines=10, placeholder="Enter your text here..."),
    outputs=gr.Audio(type="filepath"),
    title="Two-Speaker Text-to-Speech Converter",
    description="Convert text to speech with alternating voices for even and odd lines."
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
if __name__ == "__main__":
|
52 |
+
iface.launch()
|