Spaces:

HaiderAUT
/

PodCastIt

Build error

File size: 6,248 Bytes

f1adb14
fe00684
f1adb14
 
50d2a40
f036ad8
f1adb14
 
f036ad8
 
 
 
 
f1adb14
 
fe00684
f1adb14
 
f0eca57
f036ad8
f1adb14
 
 
 
f036ad8
f1adb14
50d2a40
f1adb14
50d2a40
f036ad8
50d2a40
f1adb14
50d2a40
 
fe00684
 
50d2a40
f1adb14
c172b12
 
f1adb14
 
c565171
f036ad8
50d2a40
f036ad8
c565171
 
f1adb14
 
 
 
f036ad8
 
f1adb14
f036ad8
 
 
50d2a40
c565171
 
 
 
 
f1adb14
f036ad8
 
 
f1adb14
 
c565171
 
 
f1adb14
fe00684
 
f036ad8
 
fe00684
f036ad8
 
 
fe00684
f036ad8
 
 
 
fe00684
f036ad8
 
fe00684
f036ad8
 
 
 
 
fe00684
 
 
f036ad8
 
 
c565171
f036ad8
c565171
f036ad8
 
 
 
 
 
 
fe00684
f1adb14
f036ad8
f1adb14
 
f036ad8
 
c565171
f036ad8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1adb14
 
f036ad8
f1adb14
f036ad8
c172b12
 
f036ad8
 
c172b12
 
f036ad8
9e251c5
f036ad8
 
 
f1adb14
 
 
c172b12
fe00684
f036ad8
 
f1adb14
 
 
f036ad8

import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional

import gradio as gr
from huggingface_hub import InferenceClient
from huggingface_hub.utils import HfHubHTTPError
from PyPDF2 import PdfReader  # For PDF processing
from smolagents import HfApiModel  # For LLM interaction
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError

# ------------------------------------------------------------------
# LLM setup – remote Qwen model via SmolAgents
# ------------------------------------------------------------------
# Shared model handle used by generate_podcast to turn lecture text into a
# two-host dialogue; it is called once per selected language.
# NOTE(review): this is a code-tuned checkpoint being used for prose
# dialogue in several languages — confirm that choice is intentional.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2048,
    temperature=0.5,
)

# ------------------------------------------------------------------
# Hugging Face Inference API client
# ------------------------------------------------------------------
# Shared client for text-to-speech requests; authenticates with the HF_TOKEN
# environment variable when it is set, otherwise passes token=None.
client = InferenceClient(token=os.environ.get("HF_TOKEN"))

# ------------------------------------------------------------------
# Language metadata and open TTS models
# ------------------------------------------------------------------
# Language code -> display name and the TTS model used for that language.
# Insertion order matters: the UI derives its language list from this mapping.
LANG_INFO: Dict[str, Dict[str, str]] = {
    lang_code: {"name": display_name, "tts_model": model_id}
    for lang_code, display_name, model_id in (
        ("en", "English", "facebook/mms-tts-eng"),
        ("bn", "Bangla", "facebook/mms-tts-ben"),
        ("zh", "Chinese", "facebook/mms-tts-zho"),
        ("ur", "Urdu", "facebook/mms-tts-urd"),
        ("ne", "Nepali", "facebook/mms-tts-npi"),
    )
}
# Reverse lookup: display name (as shown in the UI) -> language code.
LANG_CODE_BY_NAME = {
    entry["name"]: lang_code for lang_code, entry in LANG_INFO.items()
}

# Prompt sent to the LLM for each language. The `{lang_name}` and `{content}`
# placeholders are filled with str.format in generate_podcast; dedent strips
# the source indentation so the model does not see leading spaces.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of ~300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.

    ### Lecture Content
    {content}
    """
)

# Default cap used by truncate_text. Despite the name it is applied to
# whitespace-separated words, not model tokens.
TOKEN_LIMIT = 8000
# Default character budget per text chunk produced by _split_to_chunks, i.e.
# per individual text-to-speech request.
CHUNK_CHAR_LIMIT = 280

# ------------------------------------------------------------------
# PDF text extraction
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF, joined with newlines.

    Pages for which the reader yields no text contribute an empty string, so
    the number of newline-separated parts always equals the page count.

    Raises:
        gr.Error: If opening the file or extracting any page fails.
    """
    try:
        page_texts = []
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
        return "\n".join(page_texts)
    except Exception as e:
        raise gr.Error(f"Failed to process PDF: {e}")

# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Keep at most ``limit`` whitespace-separated words of ``text``.

    Text within the limit is returned unchanged. Longer text is rebuilt from
    its first ``limit`` words joined by single spaces, so the original
    whitespace is not preserved in that case.
    """
    tokens = text.split()
    return text if len(tokens) <= limit else " ".join(tokens[:limit])


def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split ``text`` into chunks of at most ``limit`` characters.

    The text is first cut at sentence boundaries, then sentences are packed
    greedily (joined by a single space) into chunks that stay within
    ``limit``.

    Sentence boundaries recognised:
      * ``.``, ``!``, ``?``, the danda ``।`` and the Urdu full stop ``۔``,
        each followed by whitespace;
      * the full-width CJK marks ``。``, ``！``, ``？``, with or without
        following whitespace, since CJK text normally has no spaces.

    A single sentence longer than ``limit`` is wrapped at whitespace (or
    hard-broken if it has none) so that no chunk exceeds ``limit``. When
    ``limit`` is not positive no wrapping is attempted and every sentence
    becomes its own chunk.

    Args:
        text: The text to split; may be empty.
        limit: Maximum chunk length in characters.

    Returns:
        The non-empty chunks in reading order; ``[]`` for blank input.
    """
    # The second alternative may match the empty string, which re.split
    # honours on Python 3.7+, giving a split right after a CJK terminator.
    boundary = r"(?<=[.!?।۔])\s+|(?<=[。！？])\s*"

    pieces: List[str] = []
    for raw in re.split(boundary, text):
        sentence = raw.strip()
        if not sentence:
            continue
        if 0 < limit < len(sentence):
            # Oversized sentence: break it up so the limit is a real bound.
            pieces.extend(textwrap.wrap(sentence, width=limit))
        else:
            pieces.append(sentence)

    chunks: List[str] = []
    current = ""
    for piece in pieces:
        # +1 accounts for the joining space.
        if current and len(current) + len(piece) + 1 > limit:
            chunks.append(current)
            current = piece
        else:
            current = f"{current} {piece}".strip()
    if current:
        chunks.append(current)
    return chunks


def synthesize_speech(text: str, model_id: str, tempdir: Path) -> Path:
    """Synthesize ``text`` with the given TTS model and return one audio file.

    The text is split with ``_split_to_chunks``; each chunk is sent to the
    inference client separately, written to ``tempdir`` as ``seg_<i>.flac``,
    decoded, and finally all segments are concatenated into
    ``tempdir / "podcast.flac"``.

    Args:
        text: The text to speak.
        model_id: Model identifier passed to ``client.text_to_speech``.
        tempdir: Existing directory that receives the per-chunk files and the
            combined output.

    Returns:
        Path of the combined ``podcast.flac`` file.

    Raises:
        ValueError: If ``text`` yields no chunks.
        RuntimeError: If a TTS request fails with an HTTP error or a returned
            segment cannot be decoded; the original exception is chained.
    """
    chunks = _split_to_chunks(text)
    if not chunks:
        raise ValueError("No text chunks to synthesize.")

    segments = []
    for i, chunk in enumerate(chunks):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        except HfHubHTTPError as e:
            # The previous handler named an undefined exception class, so a
            # failed request surfaced as NameError instead of this message.
            raise RuntimeError(f"TTS error on chunk {i}: {e}") from e
        part = tempdir / f"seg_{i}.flac"
        part.write_bytes(audio_bytes)
        try:
            # NOTE(review): assumes the endpoint returns FLAC-encoded bytes —
            # confirm for every model listed in LANG_INFO.
            seg = AudioSegment.from_file(part, format="flac")
        except CouldntDecodeError as e:
            raise RuntimeError(f"Decode error on chunk {i}: {e}") from e
        segments.append(seg)

    # Start from an empty segment so sum() concatenates AudioSegments.
    combined = sum(segments, AudioSegment.empty())
    outpath = tempdir / "podcast.flac"
    combined.export(outpath, format="flac")
    return outpath

# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------

def generate_podcast(pdf_file: Optional[gr.File], languages: List[str]):
    """Turn an uploaded lecture PDF into per-language transcripts and audio.

    For each selected language the lecture text is summarised into a dialogue
    by the LLM and that dialogue is synthesized to speech.

    Args:
        pdf_file: The uploaded PDF. Its ``name`` attribute is used as the
            path on disk; a plain path string is accepted as well.
        languages: Display names of the selected languages; each must be a
            key of ``LANG_CODE_BY_NAME``.

    Returns:
        A flat list ``[transcript, audio_path, transcript, audio_path, ...]``
        holding one pair for every language in ``LANG_INFO``, in that order.
        Languages that were not selected contribute ``None, None`` so the
        list length always matches the two-outputs-per-language UI layout.

    Raises:
        gr.Error: If no PDF or no language is given, a language name is
            unknown, the PDF has no extractable text, or speech synthesis
            fails.
    """
    if not pdf_file:
        raise gr.Error("Please upload a PDF file.")
    if not languages:
        raise gr.Error("Select at least one language.")

    unknown = [name for name in languages if name not in LANG_CODE_BY_NAME]
    if unknown:
        raise gr.Error("Unsupported language(s): " + ", ".join(unknown))

    # Extract and truncate
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_pdf_text(pdf_path)
    if not text.strip():
        raise gr.Error("No text found in PDF.")
    lecture = truncate_text(text)

    # mkdtemp instead of a TemporaryDirectory context: the audio paths are
    # read by the UI only after this function returns, and the context
    # manager would already have deleted the files by then.
    base = Path(tempfile.mkdtemp(prefix="podcast_"))

    produced = {}
    for name in languages:
        if name in produced:
            continue  # ignore duplicate selections
        code = LANG_CODE_BY_NAME[name]

        # 1. Dialogue
        prompt = PROMPT_TEMPLATE.format(lang_name=name, content=lecture)
        reply = llm(prompt)
        # NOTE(review): depending on the installed smolagents version the
        # model may return a message object instead of a str (and may expect
        # a list of chat messages as input) — confirm against the pinned
        # version. `.content` is used when the reply exposes it.
        dialogue = str(getattr(reply, "content", reply)).strip()

        # 2. Speech
        lang_dir = base / code
        lang_dir.mkdir(parents=True, exist_ok=True)
        try:
            audio_path = synthesize_speech(
                dialogue, LANG_INFO[code]["tts_model"], lang_dir
            )
        except (ValueError, RuntimeError) as e:
            raise gr.Error(f"Speech synthesis failed for {name}: {e}") from e
        produced[name] = (dialogue, str(audio_path))

    # One (transcript, audio) pair per configured language, in LANG_INFO
    # order, padded with None for languages that were not requested.
    results: List = []
    for info in LANG_INFO.values():
        results.extend(produced.get(info["name"], (None, None)))
    return results

# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
# Display names in LANG_INFO order; this order also fixes the output layout.
languages = [meta["name"] for meta in LANG_INFO.values()]

pdf_input = gr.File(label="Lecture PDF", file_types=[".pdf"])
language_input = gr.CheckboxGroup(languages, value=["English"], label="Languages")
inputs = [pdf_input, language_input]

# Two outputs per language, transcript first and audio second.
outputs = []
for name in languages:
    outputs.extend(
        [
            gr.Textbox(label=f"{name} Transcript", interactive=False),
            gr.Audio(label=f"{name} Podcast", type="filepath"),
        ]
    )

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator",
    description="Upload a lecture PDF, select languages, get dialogue transcript and audio podcast.",
)

if __name__ == "__main__":
    iface.launch()