# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User‑selectable Languages)
# =============================================================
# * **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct).
# * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`.
# * Users pick which languages to generate (English, Bangla, Chinese,
# Urdu, Nepali). Unselected languages are skipped.
# -----------------------------------------------------------------
import os
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import gradio as gr
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from smolagents import HfApiModel
# ------------------------------------------------------------------
# LLM: Qwen 32-B via SmolAgents
# ------------------------------------------------------------------
# Module-level model handle shared by every generate_podcast() call.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,  # NOTE(review): unusual cap — presumably meant 2048; confirm
    temperature=0.5,  # mid temperature: some creativity, mostly faithful summary
    custom_role_conversions=None,
)
# ------------------------------------------------------------------
# HF Inference API client (reads HF_TOKEN secret if set)
# ------------------------------------------------------------------
# Falls back to anonymous (rate-limited) access when HF_TOKEN is unset.
client = InferenceClient(token=os.getenv("HF_TOKEN", None))
# ------------------------------------------------------------------
# Language metadata and matching TTS model IDs
# ------------------------------------------------------------------
# Insertion order is significant: the output audio widgets and the list
# returned by generate_podcast() are both aligned to this ordering.
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd-script_arabic"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}

# Reverse lookup: human-readable language name -> language code.
LANG_CODE_BY_NAME: Dict[str, str] = {}
for _code, _info in LANG_INFO.items():
    LANG_CODE_BY_NAME[_info["name"]] = _code

# Prompt skeleton; {lang_name} and {content} are filled in per LLM call.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of ≈1200 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.
    ### Lecture Content
    {content}
    """
)
# ------------------------------------------------------------------
# Helpers: extract and truncate PDF text
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Concatenate the extractable text of every page in the PDF at *pdf_path*.

    Pages for which PyPDF2 yields None (no extractable text) contribute an
    empty string, so the newline count still reflects the page count.
    """
    pages_text = []
    for page in PdfReader(pdf_path).pages:
        pages_text.append(page.extract_text() or "")
    return "\n".join(pages_text)
TOKEN_LIMIT = 6000  # rough word-level cap before hitting the model's context limit

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most *limit* whitespace-separated words of *text*.

    Note: runs of whitespace collapse to single spaces (a side effect of
    split/join), identical to the original behavior.
    """
    return " ".join(text.split()[:limit])
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File, selected_lang_names: List[str]) -> List[Optional[str]]:
    """Generate podcast audio files for the chosen languages.

    Returns a list aligned with LANG_INFO's insertion order: each slot is
    either a path to a FLAC file (language selected) or None (skipped),
    matching the fixed list of gr.Audio output components.

    Raises:
        gr.Error: if no PDF was uploaded.
    """
    # Nothing selected: keep every audio slot empty.
    if not selected_lang_names:
        return [None] * len(LANG_INFO)
    # Robustness: gr.File input is None when the user submits without a file.
    if pdf is None:
        raise gr.Error("Please upload a lecture PDF first.")

    selected_codes = {LANG_CODE_BY_NAME[name] for name in selected_lang_names}

    # BUG FIX: the original wrote files inside `with tempfile.TemporaryDirectory()`,
    # which deletes the directory (and every FLAC in it) as soon as the function
    # returns — before Gradio can read the paths to serve them. mkdtemp() creates
    # a directory that persists past the return.
    tmpdir = Path(tempfile.mkdtemp(prefix="podcast_"))

    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    outputs: List[Optional[str]] = []
    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            outputs.append(None)
            continue
        # 1) Draft the two-host dialogue in the target language.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)
        # 2) Synthesize speech via the HF Inference API.
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = tmpdir / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)
        # BUG FIX: gr.Audio(type="filepath") expects a plain path string; the
        # original appended a (path, None) tuple, which Gradio interprets as
        # (sample_rate, data) and fails to play.
        outputs.append(str(flac_path))
    return outputs
# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------
# Checkbox labels are the human-readable names from LANG_INFO.
language_choices = [info["name"] for info in LANG_INFO.values()]
inputs = [
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_choices,
        value=["English"],  # English pre-selected by default
        label="Select podcast language(s) to generate",
    ),
]
# One fixed audio slot per language, in LANG_INFO order; generate_podcast
# fills unselected slots with None so alignment is preserved.
audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
]
iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=audio_components,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose your desired languages, and receive a "
        "two‑host audio podcast. Dialogue is crafted by Qwen‑32B; speech is "
        "synthesized on‑the‑fly using the Hugging Face Inference API — "
        "no heavy downloads or GPUs required."
    ),
)
if __name__ == "__main__":
    iface.launch()