# Scraped from a Hugging Face Space page ("Update app.py", commit 38f82cf, 3.85 kB).
# The page-chrome text that preceded the code has been converted to this comment
# so the file parses as Python.
import gradio as gr
import google.generativeai as genai
import numpy as np
import edge_tts
import asyncio
import io
# Set up logging
import logging
logging.basicConfig(level=logging.INFO)
# Module-level logger, used by render_podcast to report empty output.
logger = logging.getLogger(__name__)
# Initialize Gemini AI with a placeholder key. This is effectively a no-op:
# generate_podcast_script() calls genai.configure() again with the key the
# user supplies through the UI, overriding this value on every request.
genai.configure(api_key='YOUR_GEMINI_API_KEY')
def generate_podcast_script(api_key, content, duration):
    """Ask Gemini to write a two-host podcast script about *content*.

    Args:
        api_key: Gemini API key; re-configures the genai client per call.
        content: Source material the hosts should discuss.
        duration: Human-readable target length (e.g. "5-10 min"), interpolated
            into the prompt as guidance only.

    Returns:
        The raw script text: alternating dialogue lines, no speaker labels.
    """
    genai.configure(api_key=api_key)
    llm = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
    prompt = f"""
Create a podcast script for two people discussing the following content:
{content}
The podcast should last approximately {duration}. Include natural speech patterns,
humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
"yes", "I see", "Ok now". Vary the emotional tone.
Format the script as alternating lines of dialogue without speaker labels.
Do not include any other text, markdown, or formatting. Only include the alternating dialogue lines.
Ensure the conversation flows naturally and stays relevant to the topic.
"""
    reply = llm.generate_content(prompt)
    return reply.text
async def text_to_speech(text, voice):
    """Synthesize *text* with the given edge-tts *voice* and return the bytes.

    NOTE(review): edge-tts streams encoded audio (MP3 by default), not raw
    PCM — confirm what encoding downstream consumers expect.
    """
    buffer = io.BytesIO()
    synthesizer = edge_tts.Communicate(text, voice)
    async for packet in synthesizer.stream():
        if packet["type"] == "audio":
            buffer.write(packet["data"])
    # getvalue() returns the full accumulated payload regardless of position.
    return buffer.getvalue()
async def render_podcast(api_key, script, voice1, voice2):
    """Render *script* into one audio track, alternating the two voices per line.

    Args:
        api_key: Accepted for interface compatibility with the UI wiring; unused here.
        script: Dialogue text, one spoken line per text line; blank lines are skipped.
        voice1, voice2: edge-tts voice names for the even/odd speakers.

    Returns:
        A (sample_rate, numpy.int16 array) tuple as expected by gr.Audio;
        one second of silence if the script contained no spoken lines.

    NOTE(review): edge-tts emits encoded (MP3) audio by default; reinterpreting
    those bytes as raw int16 PCM below is almost certainly wrong — confirm the
    output format or decode before concatenating. TODO confirm.
    """
    # Bug fix: alternate voices over the *spoken* lines only. The original
    # enumerated every line (blanks included), so blank-line-separated dialogue
    # — the typical LLM output shape — assigned voice1 to both speakers.
    spoken_lines = [line for line in script.split('\n') if line.strip()]
    if not spoken_lines:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.int16))  # 1 s of silence
    audio_segments = []
    for i, line in enumerate(spoken_lines):
        voice = voice1 if i % 2 == 0 else voice2
        audio_segments.append(await text_to_speech(line, voice))
    podcast_bytes = b''.join(audio_segments)
    # np.frombuffer raises ValueError when the byte count is not a multiple of
    # the 2-byte int16 item size; drop a trailing odd byte defensively.
    podcast_bytes = podcast_bytes[:len(podcast_bytes) - (len(podcast_bytes) % 2)]
    podcast_audio = np.frombuffer(podcast_bytes, dtype=np.int16)
    return (24000, podcast_audio)  # edge-tts uses a 24000 Hz sample rate
async def get_voice_list():
    """Return the names of every voice the edge-tts service offers."""
    names = []
    for entry in await edge_tts.list_voices():
        names.append(entry["Name"])
    return names
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")
    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): document_upload is never wired to any handler — uploads
        # are currently ignored; confirm whether this input is intentional.
        document_upload = gr.File(label="Upload Document")
    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
    # Voice catalog is fetched once at app startup, blocking on the async API.
    voice_list = asyncio.run(get_voice_list())
    with gr.Row():
        voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_list)
        voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_list)
    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)
    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")
    # Thin pass-through wrappers bound to the buttons below; they add nothing
    # beyond making the click signatures explicit.
    def generate_script_wrapper(api_key, content, duration):
        return generate_podcast_script(api_key, content, duration)
    async def render_podcast_wrapper(api_key, script, voice1, voice2):
        return await render_podcast(api_key, script, voice1, voice2)
    generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration], outputs=script_output)
    render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select], outputs=audio_output)
if __name__ == "__main__":
    demo.launch()