Spaces:

MicroHealth
/

ai-podcast-builder

Paused

App Files Files Community

ai-podcast-builder / app.py

bluenevus

Update app.py

07cb903 verified 5 months ago

raw

history blame

8.87 kB

	import gradio as gr
	import google.generativeai as genai
	import numpy as np
	import io
	import re
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from huggingface_hub import snapshot_download, login
	import torchaudio
	from torchaudio.functional import resample
	import threading
	import queue
	import os
	import logging

	# Root logging at INFO, plus a logger named after this module.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
	if torch.cuda.is_available():
		device = torch.device("cuda")
	else:
		device = torch.device("cpu")

	# Both stay None until load_model() assigns them.
	model = tokenizer = None

	def load_model():
		"""Download the Orpheus checkpoint and load the model and tokenizer.

		On success the module-level ``model`` and ``tokenizer`` globals are
		assigned and the model has been moved to ``device``.

		Raises:
			ValueError: If the ``HUGGINGFACE_TOKEN`` environment variable is
				unset or empty.
		"""
		global model, tokenizer

		# Fail fast: without a token nothing below can work, so check it before
		# announcing that loading has started.
		hf_token = os.environ.get("HUGGINGFACE_TOKEN")
		if not hf_token:
			raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

		logger.info("Loading Orpheus model...")
		model_name = "canopylabs/orpheus-3b-0.1-ft"

		login(token=hf_token)

		# Pre-fetch only the config and safetensors weights. ``token`` is the
		# current keyword; ``use_auth_token`` is deprecated in huggingface_hub.
		# Tokenizer files are excluded here; AutoTokenizer.from_pretrained below
		# presumably fetches whatever it needs on its own - TODO confirm.
		snapshot_download(
			repo_id=model_name,
			token=hf_token,
			allow_patterns=[
				"config.json",
				"*.safetensors",
				"model.safetensors.index.json",
			],
			ignore_patterns=[
				"optimizer.pt",
				"pytorch_model.bin",
				"training_args.bin",
				"scheduler.pt",
				"tokenizer.json",
				"tokenizer_config.json",
				"special_tokens_map.json",
				"vocab.json",
				"merges.txt",
				"tokenizer.*",
			],
		)

		model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
		model.to(device)
		tokenizer = AutoTokenizer.from_pretrained(model_name)
		logger.info("Orpheus model and tokenizer loaded to %s", device)

	def generate_podcast_script(api_key, content, duration, num_hosts):
		"""Ask Gemini for a podcast script about ``content`` and sanitize it.

		Args:
			api_key: Key passed to ``genai.configure``.
			content: Source material interpolated into the prompt.
			duration: Requested length, interpolated into the prompt as-is.
			num_hosts: ``1`` selects the monologue prompt; any other value
				selects the two-person dialogue prompt.

		Returns:
			The response text with every character removed except ASCII
			letters, digits, whitespace and ``. , ? ! < >`` (so apostrophes
			and quotes are stripped too).
		"""
		genai.configure(api_key=api_key)
		gemini = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

		if num_hosts != 1:
			prompt = f"""
	Create a podcast script for two people discussing the following content:
	{content}

	The podcast should last approximately {duration}. Include natural speech patterns,
	humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
	yes, I see, Ok now. Vary the emotional tone.

	Format the script as alternating lines of dialogue without speaker labels.
	Separate each line with a blank line.

	Only include the dialogue with proper punctuation and emotion tags enclosed in angle brackets < >.
	For example, use <chuckle> instead of "chuckle".

	Ensure the conversation flows naturally and stays relevant to the topic.
	Limit the script length to match the requested duration of {duration}.

	To use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments
	to enhance the dialogue's emotional context.

	Always place tags like <laugh> for joy, <sigh> for frustration or relief, <chuckle> for mild amusement,
	<cough> or <sniffle> for discomfort, <groan> for displeasure, <yawn> for tiredness, and <gasp> for surprise.

	For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
	Oh well, at least I finished the project <chuckle>."

	Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
	"""
		else:
			prompt = f"""
	Create a podcast script for one person discussing the following content:
	{content}

	The podcast should last approximately {duration}. Include natural speech patterns,
	humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
	yes, I see, Ok now. Vary the emotional tone.

	Format the script as a monologue without speaker labels.
	Separate each paragraph with a blank line.

	Only include the monologue with proper punctuation and emotion tags enclosed in angle brackets < >.
	For example, use <chuckle> instead of "chuckle".

	Ensure the content flows naturally and stays relevant to the topic.
	Limit the script length to match the requested duration of {duration}.

	To use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments
	to enhance the dialogue's emotional context.

	Always place tags like <laugh> for joy, <sigh> for frustration or relief, <chuckle> for mild amusement,
	<cough> or <sniffle> for discomfort, <groan> for displeasure, <yawn> for tiredness, and <gasp> for surprise.

	For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
	Oh well, at least I finished the project <chuckle>."

	Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
	"""

		reply = gemini.generate_content(prompt)
		# Keep only the characters the downstream renderer is given: letters,
		# digits, whitespace, basic punctuation and the <tag> brackets.
		return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', reply.text)

	def text_to_speech(text, voice):
		"""Run the loaded model on ``text`` and hand its output to ``mel_to_audio``.

		``voice`` is accepted but is not read anywhere in this function.

		Args:
			text: Text to tokenize and feed to ``model.generate``.
			voice: Voice name supplied by the caller (currently unused).

		Returns:
			Whatever ``mel_to_audio`` returns for the first generated sequence.

		Raises:
			ValueError: If the module-level ``model`` or ``tokenizer`` is
				still ``None``.
		"""
		if model is None or tokenizer is None:
			raise ValueError("Model or tokenizer not initialized. Please call load_model() first.")

		encoded = tokenizer(text, return_tensors="pt").to(device)
		# Inference only, so gradient tracking is switched off.
		with torch.no_grad():
			generated = model.generate(**encoded, max_new_tokens=256)

		first_sequence = generated[0].cpu().numpy()
		return mel_to_audio(first_sequence)

	def mel_to_audio(mel):
		"""Placeholder conversion: ignore ``mel`` and return silence.

		Returns:
			A one-dimensional float32 NumPy array holding 24000 zeros.
		"""
		# Placeholder implementation: 1 second of silence.
		silence = np.zeros(shape=(24000,), dtype=np.float32)
		return silence

	def process_audio_segment(line, voice, result_queue):
		"""Synthesize one script line and publish the outcome on ``result_queue``.

		Puts the audio returned by ``text_to_speech`` on the queue. If anything
		in that step raises an ``Exception``, the error is logged and ``None``
		is put instead, so the exception does not propagate to the caller.
		"""
		try:
			result_queue.put(text_to_speech(line, voice))
		except Exception as err:
			logger.error(f"Error processing audio segment: {str(err)}")
			result_queue.put(None)

	def render_podcast(api_key, script, voice1, voice2, num_hosts):
		"""Synthesize every non-blank script line and join the audio in script order.

		Args:
			api_key: Not used by this function; kept for the existing call sites.
			script: Script text; each non-blank line becomes one audio segment.
				``None`` is treated as an empty script.
			voice1: Voice for every line when ``num_hosts == 1``, otherwise for
				even-indexed lines.
			voice2: Voice for odd-indexed lines when ``num_hosts != 1``.
			num_hosts: Number of hosts selected in the UI.

		Returns:
			A ``(24000, samples)`` tuple. ``samples`` is the concatenation of all
			successfully produced segments, or 24000 float32 zeros when no
			segment was produced.
		"""
		lines = [line for line in (script or "").split('\n') if line.strip()]

		# Each line gets its own queue. A single shared queue hands results back
		# in thread-completion order, which can scramble the order of the lines.
		# NOTE(review): every line still runs in its own thread at the same
		# time - confirm the model call tolerates that level of concurrency.
		workers = []
		for i, line in enumerate(lines):
			voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
			slot = queue.Queue()
			thread = threading.Thread(target=process_audio_segment, args=(line, voice, slot))
			workers.append((thread, slot))
			thread.start()

		audio_segments = []
		for thread, slot in workers:
			thread.join()
			try:
				segment = slot.get_nowait()
			except queue.Empty:
				# The worker ended without reporting anything; skip this line.
				continue
			if segment is not None:
				audio_segments.append(segment)

		if not audio_segments:
			logger.warning("No valid audio segments were generated.")
			return (24000, np.zeros(24000, dtype=np.float32))

		podcast_audio = np.concatenate(audio_segments)
		return (24000, podcast_audio)

	# Gradio Interface
	with gr.Blocks() as demo:
		gr.Markdown("# AI Podcast Generator")

		api_key_input = gr.Textbox(
			label="Enter your Gemini API Key",
			type="password",
		)

		# Pasted text and a file picker side by side. Only content_input is
		# listed as an input to a handler below; document_upload is not wired
		# to any event in this block.
		with gr.Row():
			content_input = gr.Textbox(label="Paste your content or upload a document")
			document_upload = gr.File(label="Upload Document")

		duration = gr.Radio(
			choices=["1-5 min", "5-10 min", "10-15 min"],
			label="Estimated podcast duration",
		)

		num_hosts = gr.Radio(
			choices=[1, 2],
			label="Number of podcast hosts",
			value=2,
		)

		voice_options = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]

		with gr.Row():
			voice1_select = gr.Dropdown(
				label="Select Voice 1",
				choices=voice_options,
				value="tara",
			)

		with gr.Row():
			voice2_select = gr.Dropdown(
				label="Select Voice 2",
				choices=voice_options,
				value="leo",
			)

		generate_btn = gr.Button("Generate Script")
		script_output = gr.Textbox(label="Generated Script", lines=10)

		render_btn = gr.Button("Render Podcast")
		audio_output = gr.Audio(label="Generated Podcast")

		def generate_script_wrapper(api_key, content, duration, num_hosts):
			"""Forward the form values to generate_podcast_script unchanged."""
			script = generate_podcast_script(api_key, content, duration, num_hosts)
			return script

		def render_podcast_wrapper(api_key, script, voice1, voice2, num_hosts):
			"""Forward the form values to render_podcast unchanged."""
			rendered = render_podcast(api_key, script, voice1, voice2, num_hosts)
			return rendered

		def update_second_voice_visibility(num_hosts):
			"""Show the second voice dropdown only when two hosts are selected."""
			return gr.update(visible=(num_hosts == 2))

		generate_btn.click(
			fn=generate_script_wrapper,
			inputs=[api_key_input, content_input, duration, num_hosts],
			outputs=script_output,
		)
		render_btn.click(
			fn=render_podcast_wrapper,
			inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts],
			outputs=audio_output,
		)

		num_hosts.change(
			fn=update_second_voice_visibility,
			inputs=[num_hosts],
			outputs=[voice2_select],
		)

	if __name__ == "__main__":
		# Ensure the model is loaded before launching the interface.
		load_model()
		demo.launch()