File size: 11,888 Bytes
f3b14e5 f304938 e3bea0f f304938 f3b14e5 f304938 e3bea0f f304938 e3bea0f f304938 e3bea0f f304938 2fa76ba f304938 e3bea0f f304938 f3b14e5 f304938 aa10e55 f304938 f3b14e5 f304938 f3b14e5 f304938 74245b5 a727789 f304938 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import gradio as gr
import google.generativeai as genai
import numpy as np
import edge_tts
import asyncio
import io
import re
# Set up logging
import logging
# Configure the root logger so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Initialize Gemini AI with a placeholder key. generate_podcast_script() calls
# genai.configure() again with the key passed in by its caller, so this value
# is only a stand-in until then.
genai.configure(api_key='YOUR_GEMINI_API_KEY')
def generate_podcast_script(api_key, content, duration, num_hosts):
    """Ask Gemini for a podcast script and return it as plain, speakable text.

    Args:
        api_key: Gemini API key used to configure the client for this call.
        content: Source material the host(s) should discuss.
        duration: Target length; interpolated verbatim into the prompt.
        num_hosts: ``1`` selects the monologue prompt; any other value selects
            the two-host dialogue prompt.

    Returns:
        The model's text with markup-like symbols removed. Letters and digits
        of any script, whitespace (including the blank lines that separate
        turns), ``. , ? !``, apostrophes and hyphens are kept.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

    # The prompt text is kept flush-left on purpose so that no source-code
    # indentation leaks into what is sent to the model.
    if num_hosts == 1:
        prompt = f"""
Create a podcast script for one person discussing the following content:
{content}
The podcast should last approximately {duration}. Include natural speech patterns,
humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
yes, I see, Ok now. Vary the emotional tone.
Format the script as a monologue without speaker labels.
Separate each paragraph with a blank line.
Do not use any special characters or markdown. Only include the monologue with proper punctuation.
Ensure the content flows naturally and stays relevant to the topic.
Limit the script length to match the requested duration of {duration}.
"""
    else:
        prompt = f"""
Create a podcast script for two people discussing the following content:
{content}
The podcast should last approximately {duration}. Include natural speech patterns,
humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
yes, I see, Ok now. Vary the emotional tone.
Format the script as alternating lines of dialogue without speaker labels.
Separate each line with a blank line.
Do not use any special characters or markdown. Only include the alternating dialogue lines with proper punctuation.
Ensure the conversation flows naturally and stays relevant to the topic.
Limit the script length to match the requested duration of {duration}.
"""

    response = model.generate_content(prompt)

    # Remove symbols a TTS voice might read aloud (asterisks, hashes,
    # underscores, brackets, ...). \w is Unicode-aware for str patterns, so
    # accented and non-Latin letters survive; an ASCII-only class would erase
    # every non-English script. Apostrophes and hyphens are kept because
    # dropping them changes words ("we're" -> "were"). Underscore is part of
    # \w but is markdown emphasis, so it is removed explicitly.
    return re.sub(r"[^\w\s.,?!'\u2019\-]|_", '', response.text)
async def text_to_speech(text, voice):
    """Synthesize ``text`` with the given edge-tts voice and return the bytes.

    Only stream chunks whose ``"type"`` is ``"audio"`` contribute to the
    result; their ``"data"`` payloads are concatenated in arrival order.
    """
    stream = edge_tts.Communicate(text, voice).stream()
    payloads = [part["data"] async for part in stream if part["type"] == "audio"]
    return b"".join(payloads)
async def render_podcast(api_key, script, voice1, voice2, num_hosts):
    """Voice every non-blank line of ``script`` and return the joined audio.

    Args:
        api_key: Not used by this function; kept so the signature stays stable.
        script: Script text; each non-blank line becomes one audio segment.
        voice1: Voice for every line (one host) or for even-indexed lines.
        voice2: Voice for odd-indexed lines when there are two hosts.
        num_hosts: ``1`` voices everything with ``voice1``; any other value
            alternates ``voice1`` / ``voice2`` line by line.

    Returns:
        A ``(24000, samples)`` tuple where ``samples`` is an int16 NumPy
        array. One second of silence is returned when no segment was produced.
    """
    spoken_lines = [line for line in script.split('\n') if line.strip()]

    # One host -> always voice1; otherwise alternate by line index.
    voices = (voice1,) if num_hosts == 1 else (voice1, voice2)
    audio_segments = []
    for index, line in enumerate(spoken_lines):
        segment = await text_to_speech(line, voices[index % len(voices)])
        audio_segments.append(segment)

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        # Return silence if no valid audio was generated
        return (24000, np.zeros(24000, dtype=np.int16))

    # Concatenate audio segments
    raw = b''.join(audio_segments)

    # np.frombuffer raises ValueError unless the byte count is a whole number
    # of int16 items, so drop a dangling final byte instead of crashing.
    item_size = np.dtype(np.int16).itemsize
    usable = len(raw) - (len(raw) % item_size)

    # NOTE(review): this reinterprets the synthesized bytes as raw 16-bit PCM
    # at 24 kHz. edge-tts is believed to stream compressed (MP3) audio by
    # default -- confirm the stream format; compressed data would have to be
    # decoded before this step to play back correctly.
    podcast_audio = np.frombuffer(raw[:usable], dtype=np.int16)
    return (24000, podcast_audio)
async def get_voice_list():
    """Return the voices reported by edge-tts, grouped by locale.

    Returns:
        A dict mapping each voice's ``"Locale"`` value to the list of
        ``"Name"`` values for that locale, in the order edge-tts lists them.
    """
    by_locale = {}
    for entry in await edge_tts.list_voices():
        by_locale.setdefault(entry["Locale"], []).append(entry["Name"])
    return by_locale
# Language names dictionary: maps a locale code (e.g. 'en-US') to the
# human-readable label shown in the language dropdowns. The UI looks labels up
# with language_names.get(lang, lang), so a locale missing from this table is
# displayed as its raw code.
language_names = {
'af-ZA': 'Afrikaans (South Africa)', 'am-ET': 'Amharic (Ethiopia)', 'ar-AE': 'Arabic (UAE)', 'ar-BH': 'Arabic (Bahrain)',
'ar-DZ': 'Arabic (Algeria)', 'ar-EG': 'Arabic (Egypt)', 'ar-IQ': 'Arabic (Iraq)', 'ar-JO': 'Arabic (Jordan)',
'ar-KW': 'Arabic (Kuwait)', 'ar-LB': 'Arabic (Lebanon)', 'ar-LY': 'Arabic (Libya)', 'ar-MA': 'Arabic (Morocco)',
'ar-OM': 'Arabic (Oman)', 'ar-QA': 'Arabic (Qatar)', 'ar-SA': 'Arabic (Saudi Arabia)', 'ar-SY': 'Arabic (Syria)',
'ar-TN': 'Arabic (Tunisia)', 'ar-YE': 'Arabic (Yemen)', 'az-AZ': 'Azerbaijani (Azerbaijan)', 'bg-BG': 'Bulgarian (Bulgaria)',
'bn-BD': 'Bengali (Bangladesh)', 'bn-IN': 'Bengali (India)', 'bs-BA': 'Bosnian (Bosnia and Herzegovina)', 'ca-ES': 'Catalan (Spain)',
'cs-CZ': 'Czech (Czech Republic)', 'cy-GB': 'Welsh (United Kingdom)', 'da-DK': 'Danish (Denmark)', 'de-AT': 'German (Austria)',
'de-CH': 'German (Switzerland)', 'de-DE': 'German (Germany)', 'el-GR': 'Greek (Greece)', 'en-AU': 'English (Australia)',
'en-CA': 'English (Canada)', 'en-GB': 'English (United Kingdom)', 'en-GH': 'English (Ghana)', 'en-HK': 'English (Hong Kong SAR)',
'en-IE': 'English (Ireland)', 'en-IN': 'English (India)', 'en-KE': 'English (Kenya)', 'en-NG': 'English (Nigeria)',
'en-NZ': 'English (New Zealand)', 'en-PH': 'English (Philippines)', 'en-SG': 'English (Singapore)', 'en-TZ': 'English (Tanzania)',
'en-US': 'English (United States)', 'en-ZA': 'English (South Africa)', 'es-AR': 'Spanish (Argentina)', 'es-BO': 'Spanish (Bolivia)',
'es-CL': 'Spanish (Chile)', 'es-CO': 'Spanish (Colombia)', 'es-CR': 'Spanish (Costa Rica)', 'es-CU': 'Spanish (Cuba)',
'es-DO': 'Spanish (Dominican Republic)', 'es-EC': 'Spanish (Ecuador)', 'es-ES': 'Spanish (Spain)', 'es-GQ': 'Spanish (Equatorial Guinea)',
'es-GT': 'Spanish (Guatemala)', 'es-HN': 'Spanish (Honduras)', 'es-MX': 'Spanish (Mexico)', 'es-NI': 'Spanish (Nicaragua)',
'es-PA': 'Spanish (Panama)', 'es-PE': 'Spanish (Peru)', 'es-PR': 'Spanish (Puerto Rico)', 'es-PY': 'Spanish (Paraguay)',
'es-SV': 'Spanish (El Salvador)', 'es-US': 'Spanish (United States)', 'es-UY': 'Spanish (Uruguay)', 'es-VE': 'Spanish (Venezuela)',
'et-EE': 'Estonian (Estonia)', 'eu-ES': 'Basque (Spain)', 'fa-IR': 'Persian (Iran)', 'fi-FI': 'Finnish (Finland)',
'fil-PH': 'Filipino (Philippines)', 'fr-BE': 'French (Belgium)', 'fr-CA': 'French (Canada)', 'fr-CH': 'French (Switzerland)',
'fr-FR': 'French (France)', 'ga-IE': 'Irish (Ireland)', 'gl-ES': 'Galician (Spain)', 'gu-IN': 'Gujarati (India)',
'he-IL': 'Hebrew (Israel)', 'hi-IN': 'Hindi (India)', 'hr-HR': 'Croatian (Croatia)', 'hu-HU': 'Hungarian (Hungary)',
'hy-AM': 'Armenian (Armenia)', 'id-ID': 'Indonesian (Indonesia)', 'is-IS': 'Icelandic (Iceland)', 'it-IT': 'Italian (Italy)',
'ja-JP': 'Japanese (Japan)', 'jv-ID': 'Javanese (Indonesia)', 'ka-GE': 'Georgian (Georgia)', 'kk-KZ': 'Kazakh (Kazakhstan)',
'km-KH': 'Khmer (Cambodia)', 'kn-IN': 'Kannada (India)', 'ko-KR': 'Korean (Korea)', 'lo-LA': 'Lao (Laos)',
'lt-LT': 'Lithuanian (Lithuania)', 'lv-LV': 'Latvian (Latvia)', 'mk-MK': 'Macedonian (North Macedonia)', 'ml-IN': 'Malayalam (India)',
'mn-MN': 'Mongolian (Mongolia)', 'mr-IN': 'Marathi (India)', 'ms-MY': 'Malay (Malaysia)', 'mt-MT': 'Maltese (Malta)',
'my-MM': 'Burmese (Myanmar)', 'nb-NO': 'Norwegian (Bokmål, Norway)', 'ne-NP': 'Nepali (Nepal)', 'nl-BE': 'Dutch (Belgium)',
'nl-NL': 'Dutch (Netherlands)', 'pl-PL': 'Polish (Poland)', 'ps-AF': 'Pashto (Afghanistan)', 'pt-BR': 'Portuguese (Brazil)',
'pt-PT': 'Portuguese (Portugal)', 'ro-RO': 'Romanian (Romania)', 'ru-RU': 'Russian (Russia)', 'si-LK': 'Sinhala (Sri Lanka)',
'sk-SK': 'Slovak (Slovakia)', 'sl-SI': 'Slovenian (Slovenia)', 'so-SO': 'Somali (Somalia)', 'sq-AL': 'Albanian (Albania)',
'sr-RS': 'Serbian (Serbia)', 'sv-SE': 'Swedish (Sweden)', 'sw-KE': 'Swahili (Kenya)', 'sw-TZ': 'Swahili (Tanzania)',
'ta-IN': 'Tamil (India)', 'ta-LK': 'Tamil (Sri Lanka)', 'ta-MY': 'Tamil (Malaysia)', 'ta-SG': 'Tamil (Singapore)',
'te-IN': 'Telugu (India)', 'th-TH': 'Thai (Thailand)', 'tr-TR': 'Turkish (Turkey)', 'uk-UA': 'Ukrainian (Ukraine)',
'ur-IN': 'Urdu (India)', 'ur-PK': 'Urdu (Pakistan)', 'uz-UZ': 'Uzbek (Uzbekistan)', 'vi-VN': 'Vietnamese (Vietnam)',
'wuu-CN': 'Wu Chinese (China)', 'yue-CN': 'Cantonese (China)', 'zh-CN': 'Chinese (Mandarin, Simplified)',
'zh-HK': 'Chinese (Cantonese, Traditional)', 'zh-TW': 'Chinese (Taiwanese Mandarin)', 'zu-ZA': 'Zulu (South Africa)'
}
# Gradio Interface
# NOTE(review): the nesting below (which widgets sit inside each gr.Row) was
# reconstructed from a copy of this file that had lost its indentation --
# confirm the layout against the running app.
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")
    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")

    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): nothing in this block reads document_upload, so an
        # uploaded file currently has no effect on the generated script.
        document_upload = gr.File(label="Upload Document")

    duration = gr.Radio(
        ["1-5 min", "5-10 min", "10-15 min"],
        label="Estimated podcast duration",
        value="1-5 min"  # This sets the default value
    )
    num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)

    # The voice catalogue is fetched once, while the UI is being built.
    voice_dict = asyncio.run(get_voice_list())
    languages = list(voice_dict.keys())
    languages.insert(0, "None")  # Add "None" option for single speaker

    # Display labels for both language dropdowns; locales without an entry in
    # language_names are shown as their raw locale code.
    language_choices = [language_names.get(lang, lang) for lang in languages]

    default_voice1 = "Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)"
    default_voice2 = "Microsoft Server Speech Text to Speech Voice (en-US, AndrewNeural)"

    with gr.Row():
        lang1_select = gr.Dropdown(label="Select Language 1", choices=language_choices, value="English (United States)")
        voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_dict.get('en-US', []), value=default_voice1)

    with gr.Row():
        lang2_select = gr.Dropdown(label="Select Language 2", choices=language_choices, value="English (United States)")
        voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_dict.get('en-US', []), value=default_voice2)

    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)
    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")

    def update_voices(lang):
        """Return a refreshed voice dropdown for the selected language label."""
        if lang == "None":
            return gr.Dropdown(choices=[], value=None)
        # Map the display label back to its locale code. A locale with no
        # entry in language_names is displayed as the raw code, so fall back
        # to the label itself; otherwise those languages would always come
        # back with an empty voice list.
        selected_lang = next(
            (key for key, value in language_names.items() if value == lang),
            lang,
        )
        voices = voice_dict.get(selected_lang, [])
        if lang == "English (United States)":
            # Prefer the app's default English voices when they are available.
            if default_voice1 in voices:
                return gr.Dropdown(choices=voices, value=default_voice1)
            elif default_voice2 in voices:
                return gr.Dropdown(choices=voices, value=default_voice2)
        return gr.Dropdown(choices=voices, value=voices[0] if voices else None)

    lang1_select.change(update_voices, inputs=[lang1_select], outputs=[voice1_select])
    lang2_select.change(update_voices, inputs=[lang2_select], outputs=[voice2_select])

    def generate_script_wrapper(api_key, content, duration, num_hosts):
        """Click handler: forward the form values to generate_podcast_script."""
        return generate_podcast_script(api_key, content, duration, num_hosts)

    async def render_podcast_wrapper(api_key, script, voice1, voice2, num_hosts):
        """Click handler: forward the form values to render_podcast."""
        return await render_podcast(api_key, script, voice1, voice2, num_hosts)

    generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration, num_hosts], outputs=script_output)
    render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts], outputs=audio_output)

    def update_second_voice_visibility(num_hosts):
        """Show the second language/voice dropdowns only for two hosts."""
        show_second = num_hosts == 2
        return gr.update(visible=show_second), gr.update(visible=show_second)

    num_hosts.change(update_second_voice_visibility, inputs=[num_hosts], outputs=[lang2_select, voice2_select])
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()