Spaces:

m-ric
/

open-notebooklm

Running on Zero

App Files Files Community

m-ric HF Staff committed 17 days ago

Commit

4af8987

1 Parent(s): 5da485d

Working Kokoro

Browse files

Files changed (1) hide show

app.py +86 -95

app.py CHANGED Viewed

@@ -2,27 +2,52 @@ import queue
 import threading
 import spaces
 import os
 import gradio as gr
-from dia.model import Dia
-from huggingface_hub import InferenceClient
 import numpy as np
 from transformers import set_seed
-import io, soundfile as sf
-# Hardcoded podcast subject
 PODCAST_SUBJECT = "The future of AI and its impact on society"
-# Initialize the inference client
-client = InferenceClient("meta-llama/Llama-3.3-70B-Instruct", provider="cerebras", token=os.getenv("HF_TOKEN"))
-model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="bfloat16")
-# Queue for audio streaming
-audio_queue = queue.Queue()
 stop_signal = threading.Event()
-def generate_podcast_text(subject):
     prompt = f"""Generate a podcast told by 2 hosts about {subject}.
 The podcast should be an insightful discussion, with some amount of playful banter.
 Separate dialog as follows using [S1] for the male host and [S2] for the female host, for instance:
@@ -32,87 +57,53 @@ Separate dialog as follows using [S1] for the male host and [S2] for the female
 [S2] Great.
 Now go on, make 5 minutes of podcast.
 """
-    response = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=1000)
-    return response.choices[0].message.content
-def split_podcast_into_chunks(podcast_text, chunk_size=3):
-    lines = podcast_text.strip().split("\n")
-    return ["\n".join(lines[i : i + chunk_size]) for i in range(0, len(lines), chunk_size)]
-def postprocess_audio(output_audio_np, speed_factor: float=0.8):
-    """Taken from https://huggingface.co/spaces/nari-labs/Dia-1.6B/blob/main/app.py"""
-    # Get sample rate from the loaded DAC model
-    output_sr = 44100
-    # --- Slow down audio ---
-    original_len = len(output_audio_np)
-    # Ensure speed_factor is positive and not excessively small/large to avoid issues
-    speed_factor = max(0.1, min(speed_factor, 5.0))
-    target_len = int(
-        original_len / speed_factor
-    )  # Target length based on speed_factor
-    if (
-        target_len != original_len and target_len > 0
-    ):  # Only interpolate if length changes and is valid
-        x_original = np.arange(original_len)
-        x_resampled = np.linspace(0, original_len - 1, target_len)
-        resampled_audio_np = np.interp(x_resampled, x_original, output_audio_np)
-        output_audio = (
-            output_sr,
-            resampled_audio_np.astype(np.float32),
-        )  # Use resampled audio
-        print(
-            f"Resampled audio from {original_len} to {target_len} samples for {speed_factor:.2f}x speed."
-        )
-    else:
-        output_audio = (
-            output_sr,
-            output_audio_np,
-        )  # Keep original if calculation fails or no change
-        print(f"Skipping audio speed adjustment (factor: {speed_factor:.2f}).")
-    # --- End slowdown ---
-    print(
-        f"Audio conversion successful. Final shape: {output_audio[1].shape}, Sample Rate: {output_sr}"
     )
-    # Explicitly convert to int16 to prevent Gradio warning
-    if (
-        output_audio[1].dtype == np.float32
-        or output_audio[1].dtype == np.float64
-    ):
-        audio_for_gradio = np.clip(output_audio[1], -1.0, 1.0)
-        audio_for_gradio = (audio_for_gradio * 32767).astype(np.int16)
-        output_audio = (output_sr, audio_for_gradio)
-        print("Converted audio to int16 for Gradio output.")
-    return output_audio
-@spaces.GPU
-def process_audio_chunks(podcast_text):
-    chunks = split_podcast_into_chunks(podcast_text)
-    sample_rate = 44100 # Modified from https://huggingface.co/spaces/nari-labs/Dia-1.6B/blob/main/app.py has 44100
-    for chunk in chunks:
-        print(f"Processing chunk: {chunk}")
         if stop_signal.is_set():
             break
-        set_seed(42)
-        raw_audio = model.generate(
-            chunk,
-            use_torch_compile=False, # To avoid gradio instability
-            verbose=False,
-            temperature=1.3,
-            top_p=0.95,
-        )
-        audio_chunk_np = np.array(raw_audio, dtype=np.float32)
-        audio_queue.put(postprocess_audio(audio_chunk_np))
-    audio_queue.put(None)
-def stream_audio_generator(podcast_text):
-    """Creates a generator that yields audio chunks for streaming"""
     stop_signal.clear()
     threading.Thread(target=process_audio_chunks, args=(podcast_text,)).start()
@@ -120,15 +111,14 @@ def stream_audio_generator(podcast_text):
         chunk = audio_queue.get()
         if chunk is None:
             break
-        sr, data = chunk           # the tuple you produced earlier
-        # Encode the numpy array into a WAV blob
         buf = io.BytesIO()
-        sf.write(buf, data.astype(np.float32) / 32768.0, sr, format="wav")
         buf.seek(0)
-        buffer = buf.getvalue()
-        print("PRINTING BUFFER:", buffer)
-        yield buffer# <-- bytes, so the browser can play it
 def stop_generation():
@@ -137,8 +127,7 @@ def stop_generation():
 def generate_podcast():
-    podcast_text = generate_podcast_text(PODCAST_SUBJECT)
-    return podcast_text
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -147,7 +136,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown(f"## Current Topic: {PODCAST_SUBJECT}")
-            gr.Markdown("This app generates a podcast discussion between two hosts about the specified topic.")
             generate_btn = gr.Button("Generate Podcast Script", variant="primary")
             podcast_output = gr.Textbox(label="Generated Podcast Script", lines=15)

 import threading
 import spaces
 import os
+import io
+import soundfile as sf
 import gradio as gr
 import numpy as np
+import torch
 from transformers import set_seed
+from huggingface_hub import InferenceClient
+from kokoro import KModel, KPipeline
+# -----------------------------------------------------------------------------
+# Hard‑coded podcast subject
+# -----------------------------------------------------------------------------
 PODCAST_SUBJECT = "The future of AI and its impact on society"
+# -----------------------------------------------------------------------------
+# LLM that writes the script (unchanged)
+# -----------------------------------------------------------------------------
+client = InferenceClient(
+    "meta-llama/Llama-3.3-70B-Instruct",
+    provider="cerebras",
+    token=os.getenv("HF_TOKEN"),
+)
+# -----------------------------------------------------------------------------
+# Kokoro TTS setup (replaces Dia)
+# -----------------------------------------------------------------------------
+CUDA_AVAILABLE = torch.cuda.is_available()
+kmodel = KModel().to("cuda" if CUDA_AVAILABLE else "cpu").eval()
+kpipeline = KPipeline(lang_code="a")  # English voices
+MALE_VOICE = "am_michael"  # [S1]
+FEMALE_VOICE = "af_heart"  # [S2]
+# Pre‑warm voices to avoid first‑call latency
+for v in (MALE_VOICE, FEMALE_VOICE):
+    kpipeline.load_voice(v)
+audio_queue: queue.Queue[tuple[int, np.ndarray] | None] = queue.Queue()
 stop_signal = threading.Event()
+def generate_podcast_text(subject: str) -> str:
+    """Ask the LLM for a ~5‑minute two‑host script."""
     prompt = f"""Generate a podcast told by 2 hosts about {subject}.
 The podcast should be an insightful discussion, with some amount of playful banter.
 Separate dialog as follows using [S1] for the male host and [S2] for the female host, for instance:
 [S2] Great.
 Now go on, make 5 minutes of podcast.
 """
+    response = client.chat_completion(
+        [{"role": "user", "content": prompt}],
+        max_tokens=1000,
     )
+    return response.choices[0].message.content
@spaces.GPU
def process_audio_chunks(podcast_text: str, speed: float = 1.0) -> None:
    """Synthesize the script line by line and push audio onto ``audio_queue``.

    Each non-blank line is spoken with the voice selected by its leading tag:
    ``[S1]`` uses ``MALE_VOICE`` and ``[S2]`` uses ``FEMALE_VOICE``. Lines
    without a recognized tag fall back to ``FEMALE_VOICE`` and are spoken
    verbatim.

    Every generated chunk is enqueued as a ``(24000, samples)`` tuple. A final
    ``None`` is always enqueued, even when synthesis raises or is stopped
    early, so that a consumer blocked on ``audio_queue.get()`` is released.

    Args:
        podcast_text: Full script, one utterance per line.
        speed: Speed factor forwarded unchanged to both the pipeline and the
            model.
    """
    sample_rate = 24000

    try:
        female_pack = kpipeline.load_voice(FEMALE_VOICE)
        male_pack = kpipeline.load_voice(MALE_VOICE)
        tagged_voices = {
            "[S1]": (MALE_VOICE, male_pack),
            "[S2]": (FEMALE_VOICE, female_pack),
        }

        for raw_line in podcast_text.strip().splitlines():
            if stop_signal.is_set():
                break

            # Strip first so an indented "[S1] ..." line is still recognized
            # instead of having its tag read aloud by the fallback voice.
            line = raw_line.strip()
            if not line:
                continue

            # Fallback for untagged lines: female voice, text spoken as-is.
            voice, voice_pack, utterance = FEMALE_VOICE, female_pack, line
            for tag, (tag_voice, tag_pack) in tagged_voices.items():
                if line.startswith(tag):
                    voice, voice_pack = tag_voice, tag_pack
                    utterance = line[len(tag):].strip()
                    break
            if not utterance:
                continue  # a tag with no text after it: nothing to say

            first = True
            for _, ps, _ in kpipeline(utterance, voice, speed):
                # Re-check inside the chunk loop so a stop request does not
                # have to wait for the rest of a long line.
                if stop_signal.is_set():
                    break
                if not ps:
                    # len(ps) - 1 would be -1 and silently pick the last
                    # entry of the voice pack.
                    continue

                ref_s = voice_pack[len(ps) - 1]
                audio = kmodel(ps, ref_s, speed)
                # detach()/cpu() are no-ops for a CPU tensor without grad and
                # make the conversion safe otherwise.
                audio_numpy = audio.detach().cpu().numpy()
                audio_queue.put((sample_rate, audio_numpy))
                print("GENERATED AUDIO", audio_numpy[-100:], audio_numpy.max())

                if first:
                    first = False
                    # NOTE(review): a single zero sample is enqueued after the
                    # first chunk of every line; kept from the original,
                    # purpose not documented there -- confirm it is needed.
                    audio_queue.put((sample_rate, torch.zeros(1).numpy()))
    finally:
        audio_queue.put(None)  # Signal end of stream
+def stream_audio_generator(podcast_text: str):
     stop_signal.clear()
     threading.Thread(target=process_audio_chunks, args=(podcast_text,)).start()
         chunk = audio_queue.get()
         if chunk is None:
             break
+        print("CHUNK", chunk, type(chunk))
+        sr, data = chunk
         buf = io.BytesIO()
+        sf.write(buf, data, sr, format="wav")
         buf.seek(0)
+        yield buf.getvalue()
 def stop_generation():
 def generate_podcast():
+    return generate_podcast_text(PODCAST_SUBJECT)
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown(f"## Current Topic: {PODCAST_SUBJECT}")
+            gr.Markdown(
+                "This app generates a podcast discussion between two hosts about the specified topic."
+            )
             generate_btn = gr.Button("Generate Podcast Script", variant="primary")
             podcast_output = gr.Textbox(label="Generated Podcast Script", lines=15)