# open-notebooklm / app.py
import os
import time

import gradio as gr
import spaces  # ZeroGPU decorator for the GPU-bound generation function
import torch
from huggingface_hub import InferenceClient
from kokoro import KModel, KPipeline
from kokoro import KModel, KPipeline
# -----------------------------------------------------------------------------
# Get podcast subject
# -----------------------------------------------------------------------------
from papers import PaperManager
paper_manager = PaperManager()
top_papers = paper_manager.get_top_content()
PODCAST_SUBJECT = list(top_papers.values())[0]
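# get_top_content() is assumed to return an ordered mapping of
# {paper title: paper content}; the first entry is today's top trending paper.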
# -----------------------------------------------------------------------------
# LLM that writes the script (unchanged)
# -----------------------------------------------------------------------------
from prompts import SYSTEM_PROMPT
client = InferenceClient(
    "meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=os.getenv("HF_TOKEN"),
)
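# Requires an HF_TOKEN environment variable; provider="cerebras" routes the
# chat completion through Hugging Face Inference Providers to Cerebras.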
def generate_podcast_text(subject: str, steering_question: str | None = None) -> str:
    """Ask the LLM for a podcast script delivered by two hosts."""
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""Here is the topic: it's the top trending paper on Hugging Face daily papers today. You will need to analyze it by bringing profound insights.
{subject[:1000]}""",
        },
    ]
    if steering_question and len(steering_question) > 0:
        messages.append({"role": "user", "content": f"You could focus on this question: {steering_question}"})
    response = client.chat_completion(
        messages,
        max_tokens=8156,
    )
    full_text = response.choices[0].message.content
    # The system prompt asks the model to open the dialogue with "[JANE]";
    # drop any preamble before the first speaker tag.
    assert "[JANE]" in full_text
    dialogue_start_index = full_text.find("[JANE]")
    podcast_text = full_text[dialogue_start_index:]
    return podcast_text
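# The TTS loop below keys on these speaker tags, so a valid script looks like
# (illustrative example, not actual model output):
#   [JANE] Welcome back! Today we're digging into a new paper...
#   [MIKE] Thanks, Jane. The headline result is...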
# -----------------------------------------------------------------------------
# Kokoro TTS
# -----------------------------------------------------------------------------
CUDA_AVAILABLE = torch.cuda.is_available()

kmodel = KModel().to("cuda" if CUDA_AVAILABLE else "cpu").eval()
kpipeline = KPipeline(lang_code="a")  # "a" = American English voices

MALE_VOICE = "am_michael"  # [MIKE]
FEMALE_VOICE = "af_heart"  # [JANE]

# Pre-warm voices to avoid first-call latency
for v in (MALE_VOICE, FEMALE_VOICE):
    kpipeline.load_voice(v)
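# Minimal synthesis sketch (assuming the same Kokoro API as used below): for
# each chunk of phonemes `ps` that KPipeline yields, KModel produces 24 kHz audio:
#     for _, ps, _ in kpipeline("Hello there!", FEMALE_VOICE, 1.0):
#         ref_s = kpipeline.load_voice(FEMALE_VOICE)[len(ps) - 1]
#         audio = kmodel(ps, ref_s, 1.0)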
# -----------------------------------------------------------------------------
# Streaming audio generation
# -----------------------------------------------------------------------------
@spaces.GPU
def generate_podcast(pdf, url, topic):
    # NOTE: pdf and url are accepted by the UI but not used yet; the script is
    # always generated from PODCAST_SUBJECT (today's top trending paper).
    podcast_text = generate_podcast_text(PODCAST_SUBJECT, topic)
    lines = [l for l in podcast_text.strip().splitlines() if l.strip()]

    pipeline = kpipeline
    pipeline_voice_female = pipeline.load_voice(FEMALE_VOICE)
    pipeline_voice_male = pipeline.load_voice(MALE_VOICE)

    speed = 1.0
    sr = 24000

    for line in lines:
        # Expect "[MIKE] ..." or "[JANE] ..."; untagged lines fall back to Jane.
        if line.startswith("[MIKE]"):
            pipeline_voice = pipeline_voice_male
            voice = MALE_VOICE
            utterance = line[len("[MIKE]"):].strip()
        elif line.startswith("[JANE]"):
            pipeline_voice = pipeline_voice_female
            voice = FEMALE_VOICE
            utterance = line[len("[JANE]"):].strip()
        else:  # fallback
            pipeline_voice = pipeline_voice_female
            voice = FEMALE_VOICE
            utterance = line

        for _, ps, _ in pipeline(utterance, voice, speed):
            t0 = time.time()
            ref_s = pipeline_voice[len(ps) - 1]
            audio_numpy = kmodel(ps, ref_s, speed).numpy()
            # Measure before yielding so the log reflects synthesis time only,
            # not how long the consumer takes to pull the chunk.
            t1 = time.time()
            print(f"PROCESSED '{utterance}' in {t1 - t0:.1f} seconds. {audio_numpy.shape}")
            yield (sr, audio_numpy)
demo = gr.Interface(
    title="Open NotebookLM",
    description=f"""Generates a podcast discussion between two hosts about the materials of your choice. Based on [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M), and uses elements from a NotebookLM app by [Gabriel Chua](https://huggingface.co/spaces/gabrielchua/open-notebooklm).
If you do not specify any source materials below, the podcast will be about the top trending [Daily paper](https://huggingface.co/papers/), '**{list(top_papers.keys())[0]}**'""",
    fn=generate_podcast,
    inputs=[
        gr.File(
            label="Optional - Upload a pdf",
            file_types=[".pdf"],
            file_count="single",
        ),
        gr.Textbox(
            label="Optional - Type a URL to read its page",
        ),
        gr.Textbox(label="Do you have a more specific topic or question on the materials?"),
        # gr.Dropdown(
        #     label=UI_INPUTS["length"]["label"],
        #     choices=UI_INPUTS["length"]["choices"],
        #     value=UI_INPUTS["length"]["value"],
        # ),
    ],
    outputs=[
        gr.Audio(
            label="Listen to your podcast",
            format="wav",
            streaming=True,
        ),
        # gr.Markdown(label=UI_OUTPUTS["transcript"]["label"]),
    ],
    theme=gr.themes.Soft(),
    submit_btn="Generate podcast 🎙️",
    # examples=UI_EXAMPLES,
    # cache_examples=UI_CACHE_EXAMPLES,
)
if __name__ == "__main__":
    demo.launch()