Spaces:

MicroHealth
/

ai-podcast-builder

Paused

App Files Files Community

ai-podcast-builder / app.py

bluenevus

Update app.py

c06bbb8 verified 5 months ago

raw

history blame

5.45 kB

	# app.py

	import gradio as gr
	import torch
	import torchaudio
	import google.generativeai as genai
	from e2_tts_pytorch import E2TTS, DurationPredictor
	import numpy as np
	import os
	import requests
	from tqdm import tqdm

	# Initialize Gemini AI
	genai.configure(api_key='YOUR_GEMINI_API_KEY')
	model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

	# Function to download the model file
	def download_model(url, filename):
	response = requests.get(url, stream=True)
	total_size = int(response.headers.get('content-length', 0))
	block_size = 1024 # 1 KB
	progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)

	os.makedirs(os.path.dirname(filename), exist_ok=True)
	with open(filename, 'wb') as file:
	for data in response.iter_content(block_size):
	size = file.write(data)
	progress_bar.update(size)
	progress_bar.close()

	# Check if model file exists, if not, download it
	model_path = "ckpts/E2TTS_Base/model_1200000.pt"
	if not os.path.exists(model_path):
	print("Downloading model file...")
	model_url = "https://huggingface.co/SWivid/E2-TTS/resolve/main/E2TTS_Base/model_1200000.pt"
	download_model(model_url, model_path)
	print("Model file downloaded successfully.")

	# Initialize E2-TTS model
	duration_predictor = DurationPredictor(
	transformer=dict(
	dim=512,
	depth=8,
	)
	)

	e2tts = E2TTS(
	duration_predictor=duration_predictor,
	transformer=dict(
	dim=512,
	depth=8
	),
	)

	# Load the pre-trained model
	checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
	if 'model_state_dict' in checkpoint:
	state_dict = checkpoint['model_state_dict']
	elif 'ema_model_state_dict' in checkpoint:
	state_dict = checkpoint['ema_model_state_dict']
	else:
	state_dict = checkpoint # Assume the checkpoint is the state dict itself

	# Filter out unexpected keys
	model_dict = e2tts.state_dict()
	filtered_state_dict = {k: v for k, v in state_dict.items() if k in model_dict}
	e2tts.load_state_dict(filtered_state_dict, strict=False)
	e2tts.eval()

	def generate_podcast_script(content, duration):
	prompt = f"""
	Create a podcast script for two people discussing the following content:
	{content}

	The podcast should last approximately {duration}. Include natural speech patterns,
	humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
	"yes", "I see", "Ok now". Vary the emotional tone (e.g., regular, happy, sad, surprised)
	and indicate these in [square brackets]. Format the script as follows:

	Host 1: [emotion] Dialog
	Host 2: [emotion] Dialog

	Ensure the conversation flows naturally and stays relevant to the topic.
	"""
	response = model.generate_content(prompt)
	return response.text

	def text_to_speech(text, speaker_id):
	# For simplicity, we'll use a random mel spectrogram as input
	# In a real scenario, you'd use the actual mel spectrogram from the cloned voice
	mel = torch.randn(1, 80, 100)

	# Generate speech
	with torch.no_grad():
	sampled = e2tts.sample(mel[:, :5], text=[text])

	return sampled.cpu().numpy()

	def create_podcast(content, duration, voice1, voice2):
	script = generate_podcast_script(content, duration)
	lines = script.split('\n')
	audio_segments = []

	for line in lines:
	if line.startswith("Host 1:"):
	audio = text_to_speech(line[7:], speaker_id=0)
	audio_segments.append(audio)
	elif line.startswith("Host 2:"):
	audio = text_to_speech(line[7:], speaker_id=1)
	audio_segments.append(audio)

	# Concatenate audio segments
	podcast_audio = np.concatenate(audio_segments)
	return (22050, podcast_audio) # Assuming 22050 Hz sample rate

	def gradio_interface(content, duration, voice1, voice2):
	script = generate_podcast_script(content, duration)
	return script

	def render_podcast(script, voice1, voice2):
	lines = script.split('\n')
	audio_segments = []

	for line in lines:
	if line.startswith("Host 1:"):
	audio = text_to_speech(line[7:], speaker_id=0)
	audio_segments.append(audio)
	elif line.startswith("Host 2:"):
	audio = text_to_speech(line[7:], speaker_id=1)
	audio_segments.append(audio)

	podcast_audio = np.concatenate(audio_segments)
	return (22050, podcast_audio)

	# Gradio Interface
	with gr.Blocks() as demo:
	gr.Markdown("# AI Podcast Generator")

	with gr.Row():
	content_input = gr.Textbox(label="Paste your content or upload a document")
	document_upload = gr.File(label="Upload Document")

	duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")

	with gr.Row():
	voice1_upload = gr.Audio(label="Upload Voice 1", type="filepath")
	voice2_upload = gr.Audio(label="Upload Voice 2", type="filepath")

	generate_btn = gr.Button("Generate Script")
	script_output = gr.Textbox(label="Generated Script", lines=10)

	render_btn = gr.Button("Render Podcast")
	audio_output = gr.Audio(label="Generated Podcast")

	generate_btn.click(gradio_interface, inputs=[content_input, duration, voice1_upload, voice2_upload], outputs=script_output)
	render_btn.click(render_podcast, inputs=[script_output, voice1_upload, voice2_upload], outputs=audio_output)

	demo.launch()