# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User‑selectable Languages)
# =============================================================
# • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct)
# • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk‑safe
#   (per‑language MMS‑TTS checkpoints for en/bn/zh/ur/ne). Long texts are
#   split into ≤280‑char chunks to stay within HF endpoint limits.
# -----------------------------------------------------------------
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional
import gradio as gr
from huggingface_hub import InferenceClient
from huggingface_hub.utils import HfHubHTTPError
from PyPDF2 import PdfReader
from smolagents import HfApiModel
# ------------------------------------------------------------------
# LLM setup – remote Qwen model via SmolAgents
# ------------------------------------------------------------------
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2048,
    temperature=0.5,
)
# ------------------------------------------------------------------
# Hugging Face Inference API client (uses HF_TOKEN secret if provided)
# ------------------------------------------------------------------
client = InferenceClient(token=os.getenv("HF_TOKEN", None))
# ------------------------------------------------------------------
# Language metadata and corresponding open TTS model IDs
# (MMS‑TTS covers 1,000+ languages; each per‑language repo is suffixed with its ISO 639‑3 code)
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
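# Adding a language is a one-line change (hypothetical example, not enabled
# here): key by ISO 639-1 code and point at the matching MMS-TTS checkpoint,
# whose suffix is the ISO 639-3 code. LANG_CODE_BY_NAME and the UI widgets
# below pick the new entry up automatically.
# LANG_INFO["hi"] = {"name": "Hindi", "tts_model": "facebook/mms-tts-hin"}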
# ------------------------------------------------------------------
# Prompt template (≈300 words to keep TTS happy)
# ------------------------------------------------------------------
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **≈300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.
    ### Lecture Content
    {content}
    """
)
# PDF helpers -------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    # extract_text() returns None for pages without an extractable text
    # layer (e.g. scanned images), hence the `or ""` fallback.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
TOKEN_LIMIT = 4000  # rough word cap to keep the prompt inside the LLM context window
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    words = text.split()
    return " ".join(words[:limit])
# ------------------------------------------------------------------
# TTS helper – chunk long text safely (HF endpoint ~30 s / 200‑300 chars)
# ------------------------------------------------------------------
CHUNK_CHAR_LIMIT = 280 # safe margin for MMS‑TTS
def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    # split on sentence boundaries while respecting the character limit
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    chunks, current = [], ""
    for sent in sentences:
        if len(current) + len(sent) + 1 > limit:
            if current:
                chunks.append(current.strip())
            current = sent
        else:
            current += " " + sent if current else sent
    if current:
        chunks.append(current.strip())
    return chunks
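# Behaviour sketch (illustrative input, limit=12 instead of 280):
#   _split_to_chunks("Hello world. Foo bar. Baz.", limit=12)
#   -> ["Hello world.", "Foo bar.", "Baz."]
# Edge case worth knowing: a single sentence longer than `limit` is emitted
# whole, so an oversized chunk can still reach the TTS endpoint.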
def synthesize_speech(text: str, model_id: str, tmpdir: Path) -> Path:
    """Stream chunks through HF TTS and concatenate the FLAC bytes."""
    tmpdir.mkdir(parents=True, exist_ok=True)  # caller may pass a not-yet-created subdir
    chunks = _split_to_chunks(text)
    flac_paths: List[Path] = []
    for idx, chunk in enumerate(chunks):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        except HfHubHTTPError as e:
            raise RuntimeError(f"TTS request failed: {e}") from e
        part_path = tmpdir / f"part_{idx}.flac"
        part_path.write_bytes(audio_bytes)
        flac_paths.append(part_path)
    # Naive byte-level concat: each part is a complete FLAC stream, so some
    # players stop after the first chunk. Decoding and re-encoding is more
    # robust; see the merge sketch below.
    final_path = tmpdir / "podcast.flac"
    with open(final_path, "wb") as fout:
        for p in flac_paths:
            fout.write(p.read_bytes())
    return final_path
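# More robust alternative (a sketch only; assumes `pydub` and ffmpeg are
# installed, neither of which this Space declares): decode every part and
# re-encode a single FLAC instead of concatenating raw bytes.
def merge_flac_parts(parts: List[Path], out_path: Path) -> Path:
    from pydub import AudioSegment  # local import: optional dependency
    combined = AudioSegment.empty()
    for p in parts:
        combined += AudioSegment.from_file(p, format="flac")
    combined.export(out_path, format="flac")
    return out_path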
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File, selected_lang_names: List[str]):
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")
    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results: List[Optional[str]] = []
    # mkdtemp (not TemporaryDirectory) so the audio files outlive this call:
    # Gradio reads the returned paths only after the function has returned.
    tmpdir = Path(tempfile.mkdtemp())
    lecture_raw = extract_pdf_text(pdf.name)
    lecture_text = truncate_text(lecture_raw)
    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            results.append(None)  # leave this language's audio slot empty
            continue
        # 1️⃣ Generate dialogue
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        # SmolAgents models take a chat-style message list and return a
        # message object whose .content holds the generated text.
        dialogue: str = llm([{"role": "user", "content": prompt}]).content
        # 2️⃣ Speech synthesis (chunked)
        tts_path = synthesize_speech(dialogue, info["tts_model"], tmpdir / code)
        results.append(str(tts_path))
    return results
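# Quick local smoke test (a sketch, not part of the Space UI): Gradio hands
# `generate_podcast` an object exposing a `.name` path, so a minimal stand-in
# suffices. Assumes a local `lecture.pdf` and a valid HF_TOKEN.
# class _FakeUpload:
#     name = "lecture.pdf"
# print(generate_podcast(_FakeUpload(), ["English"]))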
# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------
language_choices = [info["name"] for info in LANG_INFO.values()]
inputs = [
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_choices,
        value=["English"],
        label="Select podcast language(s) to generate",
    ),
]
outputs = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]
iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose language(s), and receive a two‑host "
        "audio podcast. Dialogue comes from Qwen‑32B; speech is streamed "
        "via the HF Inference API using open MMS‑TTS models. Long texts are "
        "automatically chunked to fit API limits."
    ),
)
if __name__ == "__main__":
    iface.launch()