# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User-selectable Languages)
# =============================================================
# • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5-Coder-32B-Instruct)
# • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk-safe
#   (per-language MMS-TTS checkpoints: eng/ben/zho/urd/npi). Long texts are
#   split into ≤280-char chunks to stay within HF endpoint limits.
# -----------------------------------------------------------------

import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import Dict, List, Optional

import gradio as gr
from huggingface_hub import InferenceClient
from huggingface_hub.utils import HfHubHTTPError  # note: `HubHTTPError` does not exist
from PyPDF2 import PdfReader
from smolagents import HfApiModel

# ------------------------------------------------------------------
# LLM setup – remote Qwen model via SmolAgents
# ------------------------------------------------------------------
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2048,
    temperature=0.5,
)

# ------------------------------------------------------------------
# Hugging Face Inference API client (uses the HF_TOKEN secret if provided)
# ------------------------------------------------------------------
client = InferenceClient(token=os.getenv("HF_TOKEN", None))

# ------------------------------------------------------------------
# Language metadata and corresponding open TTS model IDs
# (MMS-TTS supports 100+ languages; each language has its own repo)
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

# ------------------------------------------------------------------
# Prompt template (≈300 words to keep TTS happy)
# ------------------------------------------------------------------
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **≈300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.
    ### Lecture Content
    {content}
    """
)

# PDF helpers -------------------------------------------------------

def extract_pdf_text(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


TOKEN_LIMIT = 4000  # rough budget, counted in words, to stay inside the LLM context


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    words = text.split()
    return " ".join(words[:limit])


# ------------------------------------------------------------------
# TTS helper – chunk long text safely (HF endpoint ~30 s / 200–300 chars)
# ------------------------------------------------------------------
CHUNK_CHAR_LIMIT = 280  # safe margin for MMS-TTS


def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split on sentence boundaries while respecting the character limit."""
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    chunks: List[str] = []
    current = ""
    for sent in sentences:
        if len(sent) > limit:
            # A single over-long sentence would blow the limit: flush what we
            # have, then hard-wrap the sentence itself.
            if current:
                chunks.append(current.strip())
                current = ""
            chunks.extend(textwrap.wrap(sent, width=limit))
        elif len(current) + len(sent) + 1 > limit:
            if current:
                chunks.append(current.strip())
            current = sent
        else:
            current += " " + sent if current else sent
    if current:
        chunks.append(current.strip())
    return chunks


def synthesize_speech(text: str, model_id: str, tmpdir: Path) -> Path:
    """Stream chunks through HF TTS and concatenate the FLAC parts."""
    tmpdir.mkdir(parents=True, exist_ok=True)  # per-language sub-directory
    chunks = _split_to_chunks(text)
    flac_paths: List[Path] = []
    for idx, chunk in enumerate(chunks):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        except HfHubHTTPError as e:
            raise RuntimeError(f"TTS request failed: {e}") from e
        part_path = tmpdir / f"part_{idx}.flac"
        part_path.write_bytes(audio_bytes)
        flac_paths.append(part_path)

    # Naive byte-level concat of the FLAC parts: each part keeps its own
    # header, so this is not one well-formed FLAC stream, but common players
    # tolerate the sequential segments. A cleaner approach would decode to
    # raw audio and re-encode once.
    final_path = tmpdir / "podcast.flac"
    with open(final_path, "wb") as fout:
        for p in flac_paths:
            fout.write(p.read_bytes())
    return final_path


# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File, selected_lang_names: List[str]):
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results: List[Optional[str]] = []

    # mkdtemp instead of TemporaryDirectory: the audio files must outlive
    # this function so Gradio can read them after it returns.
    tmpdir = Path(tempfile.mkdtemp())

    lecture_raw = extract_pdf_text(pdf.name)
    lecture_text = truncate_text(lecture_raw)

    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            results.append(None)  # keep output slots aligned with LANG_INFO
            continue

        # 1️⃣ Generate dialogue – SmolAgents models take a list of chat
        # messages and return a message object whose .content is the text.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm([{"role": "user", "content": prompt}]).content

        # 2️⃣ Speech synthesis (chunked), one sub-directory per language
        tts_path = synthesize_speech(dialogue, info["tts_model"], tmpdir / code)
        results.append(str(tts_path))  # type="filepath" expects a path string

    return results


# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------
language_choices = [info["name"] for info in LANG_INFO.values()]

inputs = [
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_choices,
        value=["English"],
        label="Select podcast language(s) to generate",
    ),
]

outputs = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose language(s), and receive a two-host "
        "audio podcast. Dialogue comes from Qwen-32B; speech is streamed "
        "via the HF Inference API using open MMS-TTS models. Long texts are "
        "automatically chunked to fit API limits."
    ),
)

if __name__ == "__main__":
    iface.launch()
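
# ------------------------------------------------------------------
# Optional chunker check – an illustrative sketch added here, not part
# of the original Space. `_selftest_chunker` is a hypothetical helper:
# it exercises only the local `_split_to_chunks` logic (no Inference API
# calls) and is meant to be imported and run from a REPL.
# ------------------------------------------------------------------
def _selftest_chunker() -> None:
    sample = ("This is a sample sentence for the chunker. " * 40).strip()
    chunks = _split_to_chunks(sample)
    assert all(len(c) <= CHUNK_CHAR_LIMIT for c in chunks), "chunk exceeds limit"
    print(f"{len(chunks)} chunk(s); longest is {max(map(len, chunks))} chars")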