File size: 5,190 Bytes
74245b5
 
 
851995d
 
38f82cf
89d4917
851995d
 
 
 
 
74245b5
851995d
 
74245b5
d0f551e
 
 
 
74245b5
38f82cf
74245b5
 
 
89d4917
 
74245b5
38f82cf
89d4917
74245b5
89d4917
74245b5
 
89d4917
 
 
74245b5
851995d
 
38f82cf
 
 
 
 
 
74245b5
851995d
74245b5
 
 
38f82cf
 
 
 
851995d
74245b5
ac81409
851995d
 
ac81409
 
851995d
7d92703
851995d
 
7d92703
851995d
74245b5
a727789
 
89d4917
 
c359c63
89d4917
 
 
 
a727789
1036e07
c359c63
1036e07
 
 
c359c63
 
74245b5
 
 
 
d0f551e
 
74245b5
 
 
 
 
 
89d4917
 
a727789
74245b5
1036e07
 
89d4917
 
1036e07
 
74245b5
 
 
 
 
 
 
89d4917
c359c63
 
89d4917
 
 
 
851995d
 
 
 
 
 
 
 
74245b5
a727789
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import gradio as gr
import google.generativeai as genai
import numpy as np
import edge_tts
import asyncio
import io
import re

# Set up logging: INFO-level basic configuration plus a module-level logger.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Gemini AI with a placeholder key. generate_podcast_script()
# calls genai.configure() again with the key passed in by its caller.
genai.configure(api_key='YOUR_GEMINI_API_KEY')

# Characters removed from generated scripts: anything that is not a letter,
# digit, whitespace, basic sentence punctuation or an apostrophe. The
# underscore is listed separately because \w would otherwise keep it.
_SCRIPT_NOISE_RE = re.compile(r"[^\w\s.,?!']|_")


def _clean_script_text(text):
    """Strip markup-like symbols from *text* while keeping it speakable."""
    # Fold the typographic apostrophe to ASCII so contractions survive the
    # filter ("we'll" must not collapse into "well").
    text = text.replace('\u2019', "'")
    return _SCRIPT_NOISE_RE.sub('', text)


def generate_podcast_script(api_key, content, duration):
    """Ask Gemini for a two-person podcast script about *content*.

    Args:
        api_key: Gemini API key; applied with ``genai.configure`` on each call.
        content: Source text the two speakers should discuss.
        duration: Requested length, interpolated verbatim into the prompt
            (the UI passes labels such as "1-5 min").

    Returns:
        The model's reply with symbols removed that a speech engine might
        read aloud. Letters (including accented and non-Latin ones), digits,
        whitespace, ``. , ? !`` and apostrophes are kept.

    Any exception raised by the Gemini client propagates to the caller.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
    
    prompt = f"""
    Create a podcast script for two people discussing the following content:
    {content}
    
    The podcast should last approximately {duration}. Include natural speech patterns,
    humor, and occasional off-topic chit-chat. Use speech fillers like um, ah,
    yes, I see, Ok now. Vary the emotional tone.

    Format the script as alternating lines of dialogue without speaker labels.
    Do not use any special characters, markdown, or formatting. Only include the alternating dialogue lines.
    Ensure the conversation flows naturally and stays relevant to the topic.
    Limit the script length to match the requested duration of {duration}.
    """
    response = model.generate_content(prompt)
    # Remove any special characters that might be read aloud
    return _clean_script_text(response.text)

async def text_to_speech(text, voice):
    """Synthesize *text* with the edge-tts *voice* and return the audio bytes.

    Only stream chunks whose "type" is "audio" contribute to the result;
    every other chunk type is ignored.
    """
    stream = edge_tts.Communicate(text, voice).stream()
    audio_parts = [
        chunk["data"] async for chunk in stream if chunk["type"] == "audio"
    ]
    return b"".join(audio_parts)

async def render_podcast(api_key, script, voice1, voice2):
    """Render a dialogue script to audio, alternating between two voices.

    Args:
        api_key: Not used by this function; kept so the signature matches
            existing callers.
        script: Dialogue text with one spoken line per text line. Blank and
            whitespace-only lines are ignored.
        voice1: Voice for the 1st, 3rd, 5th, ... spoken line.
        voice2: Voice for the 2nd, 4th, 6th, ... spoken line.

    Returns:
        A ``(24000, samples)`` tuple where ``samples`` is an int16 NumPy
        array. When the script has no spoken lines, ``samples`` is 24000
        zeros.
    """
    # Number only the lines that are actually spoken. Counting blank lines
    # too would give every turn of a blank-line-separated script an even
    # index, and therefore the same voice.
    spoken_lines = [line for line in (script or '').split('\n') if line.strip()]

    audio_segments = []
    for turn, line in enumerate(spoken_lines):
        voice = voice1 if turn % 2 == 0 else voice2
        audio_segments.append(await text_to_speech(line, voice))

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.int16))  # Return silence if no valid audio was generated

    # Concatenate audio segments
    podcast_audio = b''.join(audio_segments)

    # NOTE(review): edge-tts appears to stream MP3-encoded audio by default --
    # confirm. If so, these bytes are compressed data rather than raw 16-bit
    # PCM and need decoding before being returned as samples. np.frombuffer
    # also raises ValueError when the byte count is not a multiple of 2.
    podcast_audio = np.frombuffer(podcast_audio, dtype=np.int16)

    return (24000, podcast_audio)  # edge-tts uses 24000 Hz sample rate

async def get_voice_list():
    """Return the edge-tts voice names grouped by locale.

    The result maps each voice's "Locale" value to the list of "Name" values
    of the voices reported for that locale, in the order they are listed.
    """
    voices_by_locale = {}
    for entry in await edge_tts.list_voices():
        voices_by_locale.setdefault(entry["Locale"], []).append(entry["Name"])
    return voices_by_locale

# Display names for locale codes (abbreviated for brevity). Locales without
# an entry here are shown in the language dropdowns by their raw code, via
# language_names.get(lang, lang).
language_names = {
    'en-US': 'English (United States)',
    'en-GB': 'English (United Kingdom)',
    # ... (other languages)
}

# Gradio Interface
#
# Layout: API key, content input, duration choice, two language/voice pairs,
# then a "generate script" step followed by a "render podcast" step.
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")

    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")

    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): this upload widget is not passed to any handler in
        # this file, so an uploaded document currently has no effect.
        document_upload = gr.File(label="Upload Document")

    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")

    # Queried once while the UI is being built.
    voice_dict = asyncio.run(get_voice_list())
    languages = list(voice_dict.keys())

    # Labels shown in the language dropdowns, and the reverse mapping back to
    # locale codes. The reverse map is built from the same labels, so locales
    # missing from language_names (displayed by their raw code) still resolve.
    language_choices = [language_names.get(lang, lang) for lang in languages]
    locale_by_label = dict(zip(language_choices, languages))

    # en-US is preselected below, so offer its voices from the start instead
    # of leaving the voice dropdowns empty until the language is changed.
    default_voices = voice_dict.get('en-US', [])

    with gr.Row():
        lang1_select = gr.Dropdown(label="Select Language 1", choices=language_choices, value="English (United States)")
        voice1_select = gr.Dropdown(label="Select Voice 1", choices=default_voices, value="en-US-AnaNeural")

    with gr.Row():
        lang2_select = gr.Dropdown(label="Select Language 2", choices=language_choices, value="English (United States)")
        voice2_select = gr.Dropdown(label="Select Voice 2", choices=default_voices, value="en-US-MichelleNeural")

    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)

    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")

    def update_voices(lang):
        """Return a voice-dropdown update for the selected language label.

        The dropdown receives the voices of the matching locale and selects
        the first one, so a voice from the previously chosen language does
        not stay selected. An unknown label yields an empty dropdown.
        """
        selected_lang = locale_by_label.get(lang)
        voices = voice_dict.get(selected_lang, [])
        return gr.Dropdown(choices=voices, value=voices[0] if voices else None)

    lang1_select.change(update_voices, inputs=[lang1_select], outputs=[voice1_select])
    lang2_select.change(update_voices, inputs=[lang2_select], outputs=[voice2_select])

    def generate_script_wrapper(api_key, content, duration):
        """Click handler: forward the form values to generate_podcast_script."""
        return generate_podcast_script(api_key, content, duration)

    async def render_podcast_wrapper(api_key, script, voice1, voice2):
        """Click handler: forward the form values to render_podcast."""
        return await render_podcast(api_key, script, voice1, voice2)

    generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration], outputs=script_output)
    render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select], outputs=audio_output)

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()