# ============================================================= # Hugging Face Space – Lecture → Multilingual Podcast Generator # ============================================================= # * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B) # * Speech synthesis: **Coqui XTTS‑v2** open model via the TTS lib # (no private / gated repo, so it runs without a HF token). # * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali. # ----------------------------------------------------------------- import os import tempfile import uuid import textwrap from typing import List, Dict import gradio as gr from PyPDF2 import PdfReader from smolagents import HfApiModel from TTS.api import TTS # ↳ Coqui TTS # ------------------------------------------------------------------ # LLM configuration (SmolAgents wrapper for HF Inference API) # ------------------------------------------------------------------ llm = HfApiModel( model_id="Qwen/Qwen2.5-Coder-32B-Instruct", max_tokens=2096, temperature=0.5, custom_role_conversions=None, ) # ------------------------------------------------------------------ # XTTS‑v2 multilingual text‑to‑speech (≈ 1.2 GB, CPU OK) # ------------------------------------------------------------------ TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2" tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False) # Automatically downloads and caches the model on first run. LANG_INFO: Dict[str, Dict[str, str]] = { "en": {"name": "English"}, "bn": {"name": "Bangla"}, "zh": {"name": "Chinese"}, "ur": {"name": "Urdu"}, "ne": {"name": "Nepali"}, } PROMPT_TEMPLATE = textwrap.dedent( """ You are producing a lively two‑host educational podcast in {lang_name}. Summarize the following lecture content into a dialogue of about 1200 words. Use an engaging style: hosts ask each other questions, clarify ideas, add simple analogies, and conclude with a short recap. Keep technical accuracy. ### Lecture Content {content} """ ) # ------------------------------------------------------------------ # Utility: extract & truncate PDF text to fit the LLM token budget # ------------------------------------------------------------------ def extract_pdf_text(pdf_file) -> str: reader = PdfReader(pdf_file) return "\n".join(p.extract_text() or "" for p in reader.pages) TOKEN_LIMIT = 6000 # ≈ tokens (safe margin for prompt + response) def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str: words = text.split() return " ".join(words[:limit]) # ------------------------------------------------------------------ # Main generation routine # ------------------------------------------------------------------ def generate_podcast(pdf: gr.File) -> List[gr.Audio]: with tempfile.TemporaryDirectory() as tmpdir: lecture_text = truncate_text(extract_pdf_text(pdf.name)) audio_outputs = [] for lang_code, info in LANG_INFO.items(): # 1️⃣ Create prompt + generate dialogue prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text) dialogue = llm(prompt) # 2️⃣ Save raw dialogue text (for reference) txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt") with open(txt_path, "w", encoding="utf-8") as f: f.write(dialogue) # 3️⃣ Synthesise speech with XTTS‑v2 wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav") # ► xtts_v2 accepts ISO‑639‑1 language codes directly tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path) audio_outputs.append((wav_path, None)) # (file, label) for Gradio Audio return audio_outputs # ------------------------------------------------------------------ # Gradio UI # ------------------------------------------------------------------ audio_components = [ gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values() ] iface = gr.Interface( fn=generate_podcast, inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]), outputs=audio_components, title="Lecture → Multilingual Podcast Generator", description=( "Upload a lecture PDF and receive a two‑host audio podcast in English, " "Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the " "dialogue and Coqui XTTS‑v2 for speech synthesis — no private repos " "or API keys needed." ), ) if __name__ == "__main__": iface.launch()