Spaces:

m-ric
/

open-notebooklm

Running on Zero

App Files Files Community

open-notebooklm / app.py

m-ric HF Staff

Working Kokoro

4af8987 27 days ago

raw

history blame

5.51 kB

	import queue
	import threading
	import spaces
	import os
	import io
	import soundfile as sf
	import gradio as gr
	import numpy as np
	import torch
	from transformers import set_seed
	from huggingface_hub import InferenceClient
	from kokoro import KModel, KPipeline

	# -----------------------------------------------------------------------------
	# Hard-coded podcast subject
	# -----------------------------------------------------------------------------
	# Topic string interpolated into the LLM prompt by generate_podcast() and shown
	# in the UI header; edit this constant to change what the hosts talk about.
	PODCAST_SUBJECT = "The future of AI and its impact on society"

	# -----------------------------------------------------------------------------
	# LLM client used to write the podcast script
	# -----------------------------------------------------------------------------
	# The access token is read from the HF_TOKEN environment variable; when the
	# variable is unset, None is passed through to the client.
	client = InferenceClient(
	    model="meta-llama/Llama-3.3-70B-Instruct",
	    provider="cerebras",
	    token=os.environ.get("HF_TOKEN"),
	)

	# -----------------------------------------------------------------------------
	# Kokoro TTS setup
	# -----------------------------------------------------------------------------
	CUDA_AVAILABLE = torch.cuda.is_available()

	# One shared synthesis model, in eval mode, on the GPU when one is available.
	kmodel = KModel().to("cuda" if CUDA_AVAILABLE else "cpu").eval()

	# model=False builds a "quiet" pipeline that only does text -> phonemes and
	# voice-pack management. Without it KPipeline creates its own KModel, which
	# loads a second copy of the weights and synthesises every utterance once more
	# on top of the explicit kmodel(...) call made by the audio worker.
	kpipeline = KPipeline(lang_code="a", model=False)  # English voices

	MALE_VOICE = "am_michael"  # [S1]
	FEMALE_VOICE = "af_heart"  # [S2]

	# Pre-warm voices to avoid first-call latency
	for v in (MALE_VOICE, FEMALE_VOICE):
	    kpipeline.load_voice(v)


	audio_queue: queue.Queue[tuple[int, np.ndarray] \| None] = queue.Queue()
	stop_signal = threading.Event()



	def generate_podcast_text(subject: str) -> str:
	    """Request a two-host podcast script about ``subject`` from the LLM.

	    The prompt asks for dialogue lines tagged ``[S1]`` (male host) and ``[S2]``
	    (female host). A single user message is sent through the module-level
	    ``client`` with ``max_tokens=1000``, and the text of the first returned
	    choice is handed back unchanged.
	    """
	    script_request = f"""Generate a podcast told by 2 hosts about {subject}.
	The podcast should be an insightful discussion, with some amount of playful banter.
	Separate dialog as follows using [S1] for the male host and [S2] for the female host, for instance:
	[S1] Hello, how are you?
	[S2] I'm good, thank you. How are you?
	[S1] I'm good, thank you. (laughs)
	[S2] Great.
	Now go on, make 5 minutes of podcast.
	"""
	    conversation = [{"role": "user", "content": script_request}]
	    completion = client.chat_completion(conversation, max_tokens=1000)
	    top_choice = completion.choices[0]
	    return top_choice.message.content

	@spaces.GPU
	def process_audio_chunks(podcast_text: str, speed: float = 1.0) -> None:
	    """Synthesise the script line by line and push audio onto ``audio_queue``.

	    Each non-blank line is voiced according to its leading tag: ``[S1]`` uses
	    ``MALE_VOICE``, ``[S2]`` uses ``FEMALE_VOICE``, and an untagged line falls
	    back to ``FEMALE_VOICE`` with the whole line as the utterance. Lines are
	    stripped before the tag check so indented lines are still recognised, and
	    lines with nothing left after the tag are skipped.

	    Args:
	        podcast_text: Script text, one utterance per line.
	        speed: Speed value forwarded to both the pipeline and the model.

	    Side effects:
	        Puts ``(24000, samples)`` tuples on ``audio_queue`` and always ends
	        with a single ``None``, even when synthesis raises, so the consumer
	        blocked on ``audio_queue.get()`` is released instead of hanging.
	        Stops early, before the next script line, once ``stop_signal`` is set.
	    """
	    sample_rate = 24000
	    try:
	        # load_voice was already called at import time for both voices; the
	        # returned packs are indexed per phoneme-string length below.
	        voice_packs = {
	            MALE_VOICE: kpipeline.load_voice(MALE_VOICE),
	            FEMALE_VOICE: kpipeline.load_voice(FEMALE_VOICE),
	        }

	        for raw_line in podcast_text.strip().splitlines():
	            if stop_signal.is_set():
	                break

	            line = raw_line.strip()
	            if not line:
	                continue

	            # Expect "[S1] ..." or "[S2] ..."
	            if line.startswith("[S1]"):
	                voice = MALE_VOICE
	                utterance = line.removeprefix("[S1]").strip()
	            elif line.startswith("[S2]"):
	                voice = FEMALE_VOICE
	                utterance = line.removeprefix("[S2]").strip()
	            else:  # fallback: untagged text is read by the female voice
	                voice = FEMALE_VOICE
	                utterance = line

	            if not utterance:
	                # A bare tag carries nothing to speak.
	                continue

	            pack = voice_packs[voice]
	            first = True
	            for _, ps, _ in kpipeline(utterance, voice, speed):
	                # Style reference selected by phoneme-string length.
	                ref_s = pack[len(ps) - 1]
	                audio = kmodel(ps, ref_s, speed)
	                # Convert once; detach()/cpu() keep this valid even if the
	                # model hands back a tensor that is on the GPU or tracks grad.
	                samples = audio.detach().cpu().numpy()
	                audio_queue.put((sample_rate, samples))
	                if first:
	                    first = False
	                    # One-sample silent chunk after the first real chunk of
	                    # each utterance, kept from the original implementation.
	                    # NOTE(review): presumably a streaming-playback workaround
	                    # -- confirm it is still needed.
	                    audio_queue.put((sample_rate, np.zeros(1, dtype=np.float32)))
	    finally:
	        audio_queue.put(None)  # Signal end of stream, on success or failure


	def stream_audio_generator(podcast_text: str):
	    """Start background synthesis and yield each audio chunk as WAV bytes.

	    Clears ``stop_signal``, discards anything still sitting in ``audio_queue``
	    from an earlier run, then starts ``process_audio_chunks`` in a background
	    thread. Every ``(sample_rate, samples)`` tuple taken from the queue is
	    encoded as an in-memory WAV file and yielded; a ``None`` item ends the
	    stream.

	    Args:
	        podcast_text: Script text passed to the synthesis worker.

	    Yields:
	        bytes: One complete WAV file per queued chunk.
	    """
	    stop_signal.clear()

	    # A run whose generator was closed early can leave chunks (and its None
	    # sentinel) behind; without this drain the next run would replay them and
	    # stop at the stale sentinel.
	    while True:
	        try:
	            audio_queue.get_nowait()
	        except queue.Empty:
	            break

	    # Daemon thread so a stuck synthesis call cannot block interpreter exit.
	    worker = threading.Thread(
	        target=process_audio_chunks,
	        args=(podcast_text,),
	        daemon=True,
	    )
	    worker.start()

	    try:
	        while (chunk := audio_queue.get()) is not None:
	            sr, data = chunk
	            buf = io.BytesIO()
	            sf.write(buf, data, sr, format="wav")
	            yield buf.getvalue()
	    finally:
	        # Also reached when the consumer closes the generator early: ask the
	        # worker to stop instead of synthesising audio nobody will read.
	        stop_signal.set()



	def stop_generation():
	    """Set ``stop_signal`` and return the status message for the UI."""
	    stop_signal.set()
	    status_message = "Generation stopped"
	    return status_message


	def generate_podcast():
	    """Return a freshly generated script for the fixed ``PODCAST_SUBJECT``."""
	    script = generate_podcast_text(subject=PODCAST_SUBJECT)
	    return script


	# Gradio UI: one button writes the script into a textbox, a second streams the
	# synthesised audio for whatever text is in that textbox, a third requests stop.
	# NOTE(review): the layout nesting below (everything inside a single column,
	# the two audio buttons side by side in an inner row) is the most plausible
	# reading of this file -- confirm against the rendered app.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# NotebookLM Podcast Generator")

	    with gr.Row():
	        with gr.Column(scale=2):
	            gr.Markdown(f"## Current Topic: {PODCAST_SUBJECT}")
	            gr.Markdown(
	                "This app generates a podcast discussion between two hosts about the specified topic."
	            )

	            # Step 1: script generation; the textbox doubles as the input of step 2.
	            generate_btn = gr.Button("Generate Podcast Script", variant="primary")
	            podcast_output = gr.Textbox(label="Generated Podcast Script", lines=15)

	            gr.Markdown("## Audio Preview")
	            gr.Markdown("Click below to hear the podcast with realistic voices:")

	            # Step 2: start / stop controls for audio synthesis.
	            with gr.Row():
	                start_audio_btn = gr.Button("▶️ Generate Podcast", variant="secondary")
	                stop_btn = gr.Button("⏹️ Stop", variant="stop")

	            # streaming=True: the component is fed by a generator, chunk by chunk.
	            audio_output = gr.Audio(label="Podcast Audio", streaming=True)
	            status_text = gr.Textbox(label="Status", visible=True)

	    # Event wiring.
	    generate_btn.click(fn=generate_podcast, outputs=podcast_output)

	    start_audio_btn.click(fn=stream_audio_generator, inputs=podcast_output, outputs=audio_output)
	    stop_btn.click(fn=stop_generation, outputs=status_text)

	if __name__ == "__main__":
	    # Enable Gradio's request queue, then start the server.
	    queued_demo = demo.queue()
	    queued_demo.launch()