Spaces:

HaiderAUT
/

PodCastIt

Build error

File size: 7,466 Bytes

910bbfc

# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
# directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
# using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
# Face *audio‑generation* model for speech (no external TTS APIs).
# -----------------------------------------------------------------
# Files for your Space:
#   • app.py              (this file)
#   • requirements.txt    (see bottom comment block)
# -----------------------------------------------------------------
# Add your HF_TOKEN as a Space secret if required for gated models.
# =============================================================

import os
import tempfile
import uuid
import textwrap
from typing import Dict, Tuple

import gradio as gr
from PyPDF2 import PdfReader
import nltk  # sentence tokenisation
from llama_index.llms.huggingface import HfApiModel
from transformers import pipeline  # HF TTS pipeline
import soundfile as sf  # save audio

# ---------------------------------------------------------------
# Ensure the NLTK sentence-tokeniser data is present on first launch.
# `punkt` is the classic resource; newer NLTK releases load `punkt_tab`
# from `sent_tokenize` instead, so both are checked. On an NLTK version
# that does not know one of the names, `nltk.download` just reports the
# failure and returns False rather than raising.
# ---------------------------------------------------------------
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# --------------------------- LLM Setup ---------------------------
# Single shared text-generation client, created once at import time and
# used by `generate_dialogue` through `llm.complete(...)`.
# NOTE(review): `HfApiModel` is imported from `llama_index.llms.huggingface`
# at the top of this file — confirm that module really exports a class with
# this name and a `.complete()` method; if not, the import itself fails.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # text generation
    temperature=0.5,
    max_tokens=2096,
    custom_role_conversions=None,
)

# ------------------------ TTS Setup ------------------------------
# Multilingual TTS model, loaded once at import time and shared by every
# request (the Space queues requests, so a single GPU/CPU is okay).
# If you need lighter weights choose language-specific VITS models.
# NOTE(review): the original comment quoted "≈ 500 MB" and ">100 languages"
# for this model — neither figure is verifiable from this file; confirm
# against the model card, including whether every `tts_lang` code in
# LANG_CONFIG is actually supported.
# NOTE(review): this identifier is not in the usual `org/name` Hub form —
# confirm that `transformers.pipeline` can resolve it.
# ----------------------------------------------------------------
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",  # GPU if available, else CPU
    )
except Exception as e:
    # Fail fast with the model id in the message; `from e` keeps the
    # original exception attached as the explicit cause in the traceback.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}") from e

# ------------------------ Helpers --------------------------------
# Per-language settings, keyed by the display name shown in the UI.
#   tts_lang   – language code passed to the TTS pipeline
#   prompt_tag – language name interpolated into the LLM prompt
# Insertion order matters: the UI builds one audio player per key, in order.
LANG_CONFIG = {
    name: {"tts_lang": code, "prompt_tag": tag}
    for name, code, tag in (
        ("English", "en", "English"),
        ("Bangla", "bn", "Bangla (বাংলা)"),
        ("Chinese", "zh", "Mandarin Chinese"),
        ("Urdu", "ur", "Urdu (اردو)"),
        ("Nepali", "ne", "Nepali (नेपाली)"),
    )
}


def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, truncated to avoid token overflow.

    Pages are read in order and reading stops as soon as the extracted
    text reaches ``max_chars`` characters, so later pages of a long
    document are never parsed.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum number of characters to return.

    Returns:
        The page texts joined with newlines, cut to ``max_chars`` characters.
        Pages for which no text can be extracted contribute an empty string.
    """
    reader = PdfReader(pdf_path)
    text_parts = []
    collected = 0  # running character count; avoids re-summing every page
    for page in reader.pages:
        page_text = page.extract_text() or ""
        text_parts.append(page_text)
        collected += len(page_text)
        if collected >= max_chars:
            break
    # The joining newlines are not counted above; the slice enforces the cap.
    return "\n".join(text_parts)[:max_chars]


def build_prompt(lecture_text: str, lang: str) -> str:
    """Craft a prompt instructing the LLM to return a dialogue in `lang`.

    Args:
        lecture_text: Raw lecture text to be discussed by the two hosts.
        lang: A key of ``LANG_CONFIG``; its ``prompt_tag`` names the target
            language inside the instructions.

    Returns:
        The instruction paragraph, a "Lecture content" header, and at most
        the first 150 sentences of the lecture, terminated by a newline.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANG_CONFIG``.
    """
    # Keep only the first 150 sentences to stay under the token budget;
    # slicing already copes with shorter inputs.
    sentences = nltk.sent_tokenize(lecture_text)
    short_text = " ".join(sentences[:150])

    # Dedent BEFORE stripping: once the leading whitespace of the first line
    # is stripped, the common indentation is empty and dedent does nothing,
    # leaving later lines indented by eight spaces.
    instructions = textwrap.dedent(
        f"""

        You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.



        Lecture content (for reference):

        """
    ).strip()
    return instructions + "\n" + short_text + "\n"


def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Ask the LLM for a podcast script about the lecture in `lang`.

    The prompt is built first (outside the guarded section, so prompt
    errors propagate). Any exception raised by the model call is caught
    and turned into an error sentence that is returned in place of the
    script.

    Returns:
        The model's text with surrounding whitespace removed, or a string
        starting with "Error generating dialogue in" on failure.
    """
    request = build_prompt(lecture_text, lang)
    try:
        return llm.complete(request).text.strip()
    except Exception as exc:
        # Best effort: the caller receives the message as if it were a script.
        return f"Error generating dialogue in {lang}: {exc}"


def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesise `text` with the shared TTS pipeline and save it to disk.

    Args:
        lang: A key of ``LANG_CONFIG``; selects the TTS language code and is
            used as the file-name prefix.
        text: The script to speak.

    Returns:
        ``(path, "audio/wav")`` for a WAV file in the system temp directory,
        or ``(path, "text/plain")`` for a small report file describing the
        failure when synthesis or writing raised.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANG_CONFIG``.
    """
    code = LANG_CONFIG[lang]["tts_lang"]
    out_dir = tempfile.gettempdir()
    # Random suffix keeps concurrent requests from overwriting each other.
    wav_path = os.path.join(out_dir, f"{lang}_{uuid.uuid4().hex}.wav")
    try:
        # The language code is handed to the model via `forward_params`.
        result = tts_pipeline(text, forward_params={"language": code})
        sf.write(wav_path, result["audio"], result["sampling_rate"])
    except Exception as exc:
        # Leave a text file explaining the error instead of raising.
        report_path = os.path.join(out_dir, f"err_{uuid.uuid4().hex}.txt")
        with open(report_path, "w", encoding="utf-8") as report:
            report.write(f"TTS error for {lang}: {exc}\n")
        return report_path, "text/plain"
    return wav_path, "audio/wav"


def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Run the whole pipeline: PDF → text → per-language script → audio.

    Args:
        pdf_file: The uploaded PDF as delivered by the Gradio file input,
            or ``None`` when nothing was uploaded.

    Returns:
        A dict mapping each ``LANG_CONFIG`` key to ``(file_path, mime)``.

    Raises:
        gr.Error: If no PDF was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")
    lecture_text = extract_text(pdf_file)

    def _render(lang: str) -> Tuple[str, str]:
        # One language at a time: write the script, then voice it.
        script = generate_dialogue(lecture_text, lang)
        path, mime = tts_for_dialogue(lang, script)
        return path, mime

    return {lang: _render(lang) for lang in LANG_CONFIG}


# ------------------------ Gradio UI --------------------------------

# Page layout: a file input and a button on one row, then one read-only
# audio player per configured language (same order as LANG_CONFIG).
with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        """# 📚🎙️ Lecture → Podcast

Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.

"""
    )
    with gr.Row():
        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
        btn = gr.Button("Generate Podcast")
    with gr.Group():
        audio_components = [
            gr.Audio(label=lang, interactive=False, type="filepath")
            for lang in LANG_CONFIG
        ]


    def gradio_wrapper(pdf_file):
        """Adapt `pipeline_runner` output to the list of audio players.

        Returns one entry per language, in LANG_CONFIG order: the audio file
        path when synthesis succeeded, otherwise ``None`` so the player stays
        empty. Non-audio results are error reports; their text is surfaced
        to the user as a warning instead of being fed to an audio component.
        """
        results = pipeline_runner(pdf_file)
        outputs = []
        for lang in LANG_CONFIG:
            path, mime = results[lang]
            if mime.startswith("audio/"):
                outputs.append(path)
                continue
            try:
                with open(path, encoding="utf-8") as fh:
                    detail = fh.read().strip()
            except OSError:
                detail = ""
            gr.Warning(detail or f"No audio was produced for {lang}.")
            outputs.append(None)
        return outputs


    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()

# ---------------------------------------------------------------
# requirements.txt  (commit as separate file in the Space root)
# ---------------------------------------------------------------
# gradio>=4.28.0
# PyPDF2>=3.0.1
# nltk>=3.8.1
# transformers>=4.39.0
# torch>=2.1.2
# soundfile>=0.12.1
# llama-index>=0.11.47
# huggingface-hub>=0.23.0