PodCastIt / app.py
HaiderAUT's picture
Update app.py
f0eca57 verified
raw
history blame
4.67 kB
# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
# * Speech synthesis: **Coqui XTTS‑v2** open model via the TTS lib
# (no private / gated repo, so it runs without a HF token).
# * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
# -----------------------------------------------------------------
import os
import tempfile
import uuid
import textwrap
from typing import List, Dict
import gradio as gr
from PyPDF2 import PdfReader
from smolagents import HfApiModel
from TTS.api import TTS # ↳ Coqui TTS
# ------------------------------------------------------------------
# LLM configuration (SmolAgents wrapper for HF Inference API)
# ------------------------------------------------------------------
llm = HfApiModel(
model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
max_tokens=2096,
temperature=0.5,
custom_role_conversions=None,
)
# ------------------------------------------------------------------
# XTTS‑v2 multilingual text‑to‑speech (≈ 1.2 GB, CPU OK)
# ------------------------------------------------------------------
TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
# Automatically downloads and caches the model on first run.
LANG_INFO: Dict[str, Dict[str, str]] = {
"en": {"name": "English"},
"bn": {"name": "Bangla"},
"zh": {"name": "Chinese"},
"ur": {"name": "Urdu"},
"ne": {"name": "Nepali"},
}
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two‑host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of about 1200 words.
Use an engaging style: hosts ask each other questions, clarify ideas, add
simple analogies, and conclude with a short recap. Keep technical accuracy.
### Lecture Content
{content}
"""
)
# ------------------------------------------------------------------
# Utility: extract & truncate PDF text to fit the LLM token budget
# ------------------------------------------------------------------
def extract_pdf_text(pdf_file) -> str:
reader = PdfReader(pdf_file)
return "\n".join(p.extract_text() or "" for p in reader.pages)
TOKEN_LIMIT = 6000 # ≈ tokens (safe margin for prompt + response)
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
words = text.split()
return " ".join(words[:limit])
# ------------------------------------------------------------------
# Main generation routine
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
with tempfile.TemporaryDirectory() as tmpdir:
lecture_text = truncate_text(extract_pdf_text(pdf.name))
audio_outputs = []
for lang_code, info in LANG_INFO.items():
# 1️⃣ Create prompt + generate dialogue
prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
dialogue = llm(prompt)
# 2️⃣ Save raw dialogue text (for reference)
txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
with open(txt_path, "w", encoding="utf-8") as f:
f.write(dialogue)
# 3️⃣ Synthesise speech with XTTS‑v2
wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
# ► xtts_v2 accepts ISO‑639‑1 language codes directly
tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)
audio_outputs.append((wav_path, None)) # (file, label) for Gradio Audio
return audio_outputs
# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
audio_components = [
gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
]
iface = gr.Interface(
fn=generate_podcast,
inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
outputs=audio_components,
title="Lecture → Multilingual Podcast Generator",
description=(
"Upload a lecture PDF and receive a two‑host audio podcast in English, "
"Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the "
"dialogue and Coqui XTTS‑v2 for speech synthesis — no private repos "
"or API keys needed."
),
)
if __name__ == "__main__":
iface.launch()