# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User‑selectable Languages)
# =============================================================
# * **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct).
# * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`.
# * Users pick which languages to generate (English, Bangla, Chinese,
#   Urdu, Nepali). Unselected languages are skipped.
# -----------------------------------------------------------------
import os
import tempfile
import textwrap
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import gradio as gr
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from smolagents import HfApiModel

# ------------------------------------------------------------------
# LLM: Qwen 32‑B via SmolAgents
# ------------------------------------------------------------------
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,
    temperature=0.5,
    custom_role_conversions=None,
)

# ------------------------------------------------------------------
# HF Inference API client (reads HF_TOKEN secret if set)
# ------------------------------------------------------------------
client = InferenceClient(token=os.getenv("HF_TOKEN", None))

# ------------------------------------------------------------------
# Language metadata and matching TTS model IDs
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd-script_arabic"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}

# Helper map: display name ➜ language code
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of ≈1200 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies,
    and wrap up with a concise recap. Preserve technical accuracy.

    ### Lecture Content
    {content}
    """
)

# ------------------------------------------------------------------
# Helpers: extract and truncate PDF text
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF.

    Pages with no extractable text contribute an empty string rather
    than ``None`` (``extract_text`` may return ``None``).
    """
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


TOKEN_LIMIT = 6000  # rough word‑level cap before hitting context limit


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Keep at most *limit* whitespace‑separated words of *text*."""
    words = text.split()
    return " ".join(words[:limit])


# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(
    pdf: gr.File, selected_lang_names: List[str]
) -> List[Optional[str]]:
    """Generate podcast audio files for the chosen languages.

    Returns a list aligned with ``LANG_INFO`` order; unselected
    languages yield ``None`` so each ``gr.Audio`` output stays empty.

    Raises:
        gr.Error: if no PDF was uploaded.
    """
    # Guard: Gradio passes None when the user submits without a file.
    if pdf is None:
        raise gr.Error("Please upload a lecture PDF first.")

    # Nothing selected → leave every audio slot empty.
    if not selected_lang_names:
        return [None] * len(LANG_INFO)

    selected_codes = {LANG_CODE_BY_NAME[name] for name in selected_lang_names}

    # BUG FIX: the original wrote the audio into a TemporaryDirectory()
    # context manager, which deleted the directory (and the audio files)
    # as soon as the function returned — Gradio then received paths to
    # files that no longer existed.  mkdtemp() persists for the session.
    tmpdir = tempfile.mkdtemp(prefix="podcast_")

    raw_text = extract_pdf_text(pdf.name)
    lecture_text = truncate_text(raw_text)

    outputs: List[Optional[str]] = []
    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            outputs.append(None)
            continue

        # 1️⃣ Draft dialogue in the target language
        prompt = PROMPT_TEMPLATE.format(
            lang_name=info["name"], content=lecture_text
        )
        # NOTE(review): smolagents' HfApiModel is normally called with a
        # chat-message list ([{"role": "user", "content": ...}]) and
        # returns a message object — confirm a raw string works with the
        # installed smolagents version.
        dialogue: str = llm(prompt)

        # 2️⃣ Synthesize speech via HF Inference API
        audio_bytes: bytes = client.text_to_speech(
            dialogue, model=info["tts_model"]
        )
        flac_path = Path(tmpdir) / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)

        # BUG FIX: gr.Audio(type="filepath") expects a plain path string;
        # the original returned a (path, None) tuple, which a filepath
        # Audio output cannot render.
        outputs.append(str(flac_path))

    return outputs


# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------
language_choices = [info["name"] for info in LANG_INFO.values()]

inputs = [
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_choices,
        value=["English"],
        label="Select podcast language(s) to generate",
    ),
]

audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=audio_components,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose your desired languages, and receive a "
        "two‑host audio podcast. Dialogue is crafted by Qwen‑32B; speech is "
        "synthesized on‑the‑fly using the Hugging Face Inference API — "
        "no heavy downloads or GPUs required."
    ),
)

if __name__ == "__main__":
    iface.launch()