# NOTE(review): removed non-Python extraction residue that preceded this file
# (a "File size" banner, git-blame commit hashes, and a flat line-number
# listing) — it would have been a syntax error at import time.
# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# * **Text generation** – SmolAgents `HfApiModel` running the remote
# Qwen/Qwen2.5‑Coder‑32B‑Instruct model.
# * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`
# (serverless) with open models per language – no heavy local
# downloads.
# * Outputs five FLAC files (English, Bangla, Chinese, Urdu, Nepali).
# -----------------------------------------------------------------
import os
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict
import gradio as gr
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from smolagents import HfApiModel
# ------------------------------------------------------------------
# LLM: Qwen 32‑B via SmolAgents
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# LLM: Qwen 32-B via SmolAgents
# ------------------------------------------------------------------
# Remote text-generation model used to draft the two-host dialogue.
# No weights are downloaded locally; inference happens on HF's side.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,  # per-completion cap; NOTE(review): 2096 looks like a typo for 2048 — confirm
    temperature=0.5,  # mildly creative but still focused output
    custom_role_conversions=None,
)
# ------------------------------------------------------------------
# HF Inference API client (reads HF_TOKEN secret if set)
# ------------------------------------------------------------------
# Used for serverless text-to-speech; falls back to anonymous (rate-limited)
# access when the HF_TOKEN environment variable is not set.
client = InferenceClient(token=os.getenv("HF_TOKEN", None))
# ------------------------------------------------------------------
# Language metadata and matching TTS model IDs
# ------------------------------------------------------------------
# Maps ISO-639-1 language code -> display name + serverless TTS model id.
# Iteration order of this dict fixes the order of the output audio files.
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    # MMS lacks mainstream Mandarin — fallback to an open Chinese TTS
    "zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd-script_arabic"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two‑host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of ≈1200 words.
Make it engaging: hosts ask questions, clarify ideas with analogies, and
wrap up with a concise recap. Preserve technical accuracy.
### Lecture Content
{content}
"""
)
# ------------------------------------------------------------------
# Helpers: extract and truncate PDF text
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages whose extraction yields ``None`` contribute an empty string, so
    the result is always a ``str`` with pages joined by newlines.
    """
    page_texts = [page.extract_text() or "" for page in PdfReader(pdf_path).pages]
    return "\n".join(page_texts)
TOKEN_LIMIT = 6000  # rough word-level cap before hitting the LLM context limit


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most the first *limit* whitespace-separated words of *text*.

    Splitting on whitespace also collapses runs of spaces/newlines, so the
    result is a single-space-joined prefix of the input's words.
    """
    return " ".join(text.split()[:limit])
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File) -> List[str]:
    """Generate a multilingual podcast from a lecture PDF.

    For each language in ``LANG_INFO``: draft a two-host dialogue with the
    LLM, synthesize it to speech via the HF Inference API, and write the
    audio to a FLAC file.

    Args:
        pdf: Uploaded file object from ``gr.File``; only ``pdf.name``
            (the temp-file path) is read.

    Returns:
        List of FLAC file paths, one per language, in ``LANG_INFO`` order —
        the plain-string form that ``gr.Audio(type="filepath")`` expects.
    """
    raw_text = extract_pdf_text(pdf.name)
    lecture_text = truncate_text(raw_text)

    # BUG FIX: the original wrote the FLAC files inside a
    # tempfile.TemporaryDirectory() and returned their paths from within the
    # `with` block, so the directory (and every audio file) was deleted the
    # moment the function returned — before Gradio could serve the files.
    # mkdtemp() creates a directory that persists until the OS/Space cleans
    # up its temp storage.
    out_dir = Path(tempfile.mkdtemp(prefix="podcast_"))

    outputs: List[str] = []
    for code, info in LANG_INFO.items():
        # 1) Draft the dialogue in the target language.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)
        # 2) Synthesize speech via the HF Inference API (serverless).
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = out_dir / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)
        # BUG FIX: gr.Audio(type="filepath") expects a plain path string; the
        # original appended (filepath, None) tuples, which Gradio would try to
        # interpret as (sample_rate, data) and fail on.
        outputs.append(str(flac_path))
    return outputs
# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------
# One audio output widget per language, in LANG_INFO order — this must
# match the order of paths returned by generate_podcast.
audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]
iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and receive a two‑host audio podcast in five "
        "languages (English, Bangla, Chinese, Urdu, Nepali). Dialogue is "
        "crafted by Qwen‑32B; speech is synthesized on‑the‑fly using the "
        "Hugging Face Inference API — no heavy downloads or GPUs required."
    ),
)
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()