File size: 7,083 Bytes
f1adb14
fe00684
f1adb14
fe00684
 
 
 
f1adb14
 
 
fe00684
f1adb14
 
50d2a40
c172b12
f1adb14
 
fe00684
f1adb14
f0eca57
f1adb14
 
fe00684
f1adb14
 
f0eca57
fe00684
f1adb14
 
 
 
fe00684
f1adb14
50d2a40
f1adb14
50d2a40
fe00684
 
50d2a40
f1adb14
50d2a40
 
fe00684
 
50d2a40
f1adb14
c172b12
 
fe00684
 
 
f1adb14
 
 
fe00684
50d2a40
 
fe00684
50d2a40
f1adb14
 
 
 
fe00684
f1adb14
50d2a40
 
 
f1adb14
fe00684
 
f1adb14
 
 
 
 
fe00684
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1adb14
50d2a40
f1adb14
 
fe00684
c172b12
fe00684
c172b12
 
fe00684
c172b12
fe00684
 
 
 
f0eca57
50d2a40
c172b12
fe00684
c172b12
 
fe00684
f1adb14
50d2a40
f1adb14
fe00684
 
f0eca57
fe00684
f1adb14
fe00684
f1adb14
 
fe00684
f1adb14
c172b12
 
 
 
 
 
 
 
 
 
 
fe00684
c172b12
f0eca57
f1adb14
 
 
c172b12
fe00684
c172b12
f0eca57
fe00684
 
 
 
f0eca57
f1adb14
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User‑selectable Languages)
# =============================================================
# • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct)
# • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk‑safe
#   (MMS‑TTS for en/bn/ur/ne, mms‑TTS‑zho for zh). Long texts are split
#   into ≤280‑char chunks to stay within HF endpoint limits.
# -----------------------------------------------------------------

import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import gradio as gr
from huggingface_hub import InferenceClient, HubHTTPError
from PyPDF2 import PdfReader
from smolagents import HfApiModel

# ------------------------------------------------------------------
# LLM setup – remote Qwen model via SmolAgents
# ------------------------------------------------------------------
# HfApiModel proxies generation calls to the HF Inference API; the instance
# is used as a callable with a prompt string (see generate_podcast).
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2048,  # cap on generated tokens per call
    temperature=0.5,  # moderate creativity for the podcast dialogue
)

# ------------------------------------------------------------------
# Hugging Face Inference API client (uses HF_TOKEN secret if provided)
# ------------------------------------------------------------------
# With token=None the client falls back to anonymous / cached credentials;
# HF_TOKEN is the conventional Space secret name.
client = InferenceClient(token=os.getenv("HF_TOKEN", None))

# ------------------------------------------------------------------
# Supported languages: UI code -> display name + per-language MMS-TTS repo.
# (MMS-TTS covers 100+ languages; each language lives in its own repo keyed
# by an ISO-639-3 suffix.)
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": name, "tts_model": f"facebook/mms-tts-{iso3}"}
    for code, name, iso3 in [
        ("en", "English", "eng"),
        ("bn", "Bangla", "ben"),
        ("zh", "Chinese", "zho"),
        ("ur", "Urdu", "urd"),
        ("ne", "Nepali", "npi"),
    ]
}
# Reverse lookup: human-readable name -> short code (used by the UI checkboxes).
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

# ------------------------------------------------------------------
# Prompt template for the dialogue generator. The ≈300-word target keeps the
# resulting script short enough for chunked TTS synthesis. Placeholders:
# {lang_name} – display language name, {content} – truncated lecture text.
# ------------------------------------------------------------------
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **≈300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.
    
    ### Lecture Content
    {content}
    """
)

# PDF helpers -------------------------------------------------------

def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in *pdf_path*, joined by newlines.

    Pages where extraction yields ``None`` contribute an empty string so the
    join never fails.
    """
    reader = PdfReader(pdf_path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

TOKEN_LIMIT = 4000  # approx words before hitting context limit


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most the first *limit* whitespace-separated words of *text*."""
    return " ".join(text.split()[:limit])

# ------------------------------------------------------------------
# TTS helper – chunk long text safely (HF endpoint ~30 s / 200‑300 chars)
# ------------------------------------------------------------------
CHUNK_CHAR_LIMIT = 280  # safe margin for MMS‑TTS

def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    # split on sentence boundaries while respecting limit
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    chunks, current = [], ""
    for sent in sentences:
        if len(current) + len(sent) + 1 > limit:
            if current:
                chunks.append(current.strip())
            current = sent
        else:
            current += " " + sent if current else sent
    if current:
        chunks.append(current.strip())
    return chunks


def synthesize_speech(text: str, model_id: str, tmpdir: Path) -> Path:
    """Synthesize *text* via the HF TTS endpoint into one FLAC file.

    The text is split into ≤CHUNK_CHAR_LIMIT character chunks, each chunk is
    synthesized separately, and the raw FLAC bytes are concatenated into
    ``tmpdir/podcast.flac``.

    Parameters:
        text: dialogue script to speak.
        model_id: HF repo id of an MMS-TTS checkpoint (see LANG_INFO).
        tmpdir: output directory for part files and the final FLAC; created
            if missing.

    Raises:
        RuntimeError: if a TTS request fails.
    """
    # BUG FIX: callers pass `tmpdir / code`, a directory that does not exist
    # yet — write_bytes() below raised FileNotFoundError without this.
    tmpdir.mkdir(parents=True, exist_ok=True)

    chunks = _split_to_chunks(text)
    flac_paths: List[Path] = []
    for idx, chunk in enumerate(chunks):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        # NOTE(review): huggingface_hub exports HfHubHTTPError (in
        # huggingface_hub.utils); confirm `HubHTTPError` resolves at import.
        except HubHTTPError as e:
            raise RuntimeError(f"TTS request failed: {e}") from e
        part_path = tmpdir / f"part_{idx}.flac"
        part_path.write_bytes(audio_bytes)
        flac_paths.append(part_path)

    # NOTE(review): byte-concatenating complete FLAC streams does not form a
    # single valid FLAC; many players stop after the first stream. Consider
    # decoding + re-encoding (pydub/ffmpeg) if playback truncates.
    final_path = tmpdir / "podcast.flac"
    with open(final_path, "wb") as fout:
        for p in flac_paths:
            fout.write(p.read_bytes())
    return final_path

# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------

def generate_podcast(pdf: gr.File, selected_lang_names: List[str]):
    """Generate one podcast audio file per selected language.

    Returns a list aligned positionally with LANG_INFO's iteration order —
    one entry per gr.Audio output slot: a filepath string for selected
    languages, None for unselected ones.

    Raises:
        gr.Error: if no PDF was uploaded or no language was selected.
    """
    # BUG FIX: with no upload, `pdf` is None and `pdf.name` raised an
    # opaque AttributeError; surface a user-facing error instead.
    if pdf is None:
        raise gr.Error("Please upload a lecture PDF.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results: List[Optional[str]] = []

    # BUG FIX: the original used a TemporaryDirectory context manager, which
    # deleted the audio files as soon as this function returned — before
    # Gradio could serve the returned filepaths. mkdtemp keeps them alive
    # (at the cost of deferring cleanup to the OS / Space restart).
    tmpdir = Path(tempfile.mkdtemp())
    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            results.append(None)  # keep alignment with the fixed output slots
            continue

        # 1️⃣ Generate dialogue
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)

        # 2️⃣ Speech synthesis (chunked)
        tts_path = synthesize_speech(dialogue, info["tts_model"], tmpdir / code)

        # NOTE(review): the original appended (str(path), None) tuples, but
        # gr.Audio(type="filepath") expects a plain path string — a 2-tuple
        # is interpreted as (sample_rate, data). Confirm against the gradio
        # version pinned by the Space.
        results.append(str(tts_path))

    return results

# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------
# Checkbox labels are the human-readable names; generate_podcast maps them
# back to short language codes via LANG_CODE_BY_NAME.
language_choices = [info["name"] for info in LANG_INFO.values()]

inputs = [
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_choices,
        value=["English"],  # English pre-selected by default
        label="Select podcast language(s) to generate",
    ),
]

# One fixed Audio slot per supported language; generate_podcast returns its
# results positionally (None for languages that were not selected).
outputs = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose language(s), and receive a two‑host "
        "audio podcast. Dialogue comes from Qwen‑32B; speech is streamed "
        "via the HF Inference API using open MMS‑TTS models. Long texts are "
        "automatically chunked to fit API limits."
    ),
)

# Local entry point; on HF Spaces the platform launches the app itself.
if __name__ == "__main__":
    iface.launch()