# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
# * Speech synthesis: **Coqui XTTS‑v2** open model via the TTS lib
# (no private / gated repo, so it runs without a HF token).
# * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
# -----------------------------------------------------------------
import os
import tempfile
import uuid
import textwrap
from typing import List, Dict
import gradio as gr
from PyPDF2 import PdfReader
from smolagents import HfApiModel
from TTS.api import TTS # ↳ Coqui TTS
# ------------------------------------------------------------------
# LLM configuration (SmolAgents wrapper for HF Inference API)
# ------------------------------------------------------------------
# LLM used to write the podcast dialogue, via the SmolAgents wrapper around
# the Hugging Face Inference API (no local weights are downloaded).
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,  # NOTE(review): 2096 looks like a typo for 2048 or 4096 — confirm intent
    temperature=0.5,  # mid-range sampling: varied dialogue while staying on-topic
    custom_role_conversions=None,
)
# ------------------------------------------------------------------
# XTTS‑v2 multilingual text‑to‑speech (≈ 1.2 GB, CPU OK)
# ------------------------------------------------------------------
# Coqui XTTS-v2: open multilingual TTS model (~1.2 GB; CPU inference works).
TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
# Instantiating TTS downloads and caches the model on first run.
tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
# Automatically downloads and caches the model on first run.
# ISO-639-1 code -> metadata for every language the podcast is produced in.
# Insertion order here fixes the order of the generated audio outputs.
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": display_name}
    for code, display_name in (
        ("en", "English"),
        ("bn", "Bangla"),
        ("zh", "Chinese"),
        ("ur", "Urdu"),
        ("ne", "Nepali"),
    )
}
# Prompt sent to the LLM once per target language. Two placeholders are
# filled with str.format: {lang_name} (display name from LANG_INFO) and
# {content} (the truncated lecture text).
PROMPT_TEMPLATE = textwrap.dedent(
    """
You are producing a lively two‑host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of about 1200 words.
Use an engaging style: hosts ask each other questions, clarify ideas, add
simple analogies, and conclude with a short recap. Keep technical accuracy.
### Lecture Content
{content}
"""
)
# ------------------------------------------------------------------
# Utility: extract & truncate PDF text to fit the LLM token budget
# ------------------------------------------------------------------
def extract_pdf_text(pdf_file) -> str:
    """Return the text of every page in *pdf_file*, newline-joined.

    Pages where PyPDF2 cannot extract text (``extract_text()`` returns
    ``None``) contribute an empty string instead of breaking the join.
    """
    reader = PdfReader(pdf_file)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)
# Rough word budget — a safe margin under the model's context window for
# prompt plus response (words used as a cheap proxy for tokens).
TOKEN_LIMIT = 6000


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most the first *limit* whitespace-separated words of *text*,
    re-joined with single spaces."""
    return " ".join(text.split()[:limit])
# ------------------------------------------------------------------
# Main generation routine
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File) -> List[str]:
    """Turn an uploaded lecture PDF into one podcast WAV file per language.

    Parameters
    ----------
    pdf : gr.File
        The uploaded PDF; only its ``.name`` (a filesystem path) is used.

    Returns
    -------
    List[str]
        Paths to the generated WAV files, in ``LANG_INFO`` order — the form
        expected by ``gr.Audio(type="filepath")`` output components.
    """
    # BUGFIX: the original used `with tempfile.TemporaryDirectory()`, which
    # deletes the directory (and every WAV in it) as soon as this function
    # returns — Gradio then received paths to files that no longer existed.
    # mkdtemp() persists past the return so the UI can stream the audio.
    out_dir = tempfile.mkdtemp(prefix="podcast_")
    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    audio_paths: List[str] = []
    for lang_code, info in LANG_INFO.items():
        # 1) Build the per-language prompt and generate the dialogue.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue = llm(prompt)

        # 2) Save the raw dialogue text next to the audio, for reference.
        txt_path = os.path.join(out_dir, f"podcast_{lang_code}.txt")
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(dialogue)

        # 3) Synthesise speech; XTTS-v2 accepts ISO-639-1 codes directly.
        wav_path = os.path.join(out_dir, f"podcast_{lang_code}.wav")
        tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)

        # BUGFIX: `type="filepath"` audio outputs take a bare path string;
        # the original appended a `(path, None)` tuple, which Gradio rejects.
        audio_paths.append(wav_path)

    return audio_paths
# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# Gradio UI: one audio player per target language, in LANG_INFO order.
# ------------------------------------------------------------------
audio_components = []
for info in LANG_INFO.values():
    player = gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    audio_components.append(player)

iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and receive a two‑host audio podcast in English, "
        "Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the "
        "dialogue and Coqui XTTS‑v2 for speech synthesis — no private repos "
        "or API keys needed."
    ),
)

if __name__ == "__main__":
    iface.launch()