# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# * **Text generation** – SmolAgents `HfApiModel` running the remote
#   Qwen/Qwen2.5-Coder-32B-Instruct model.
# * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`
#   (serverless) with open models per language – no heavy local
#   downloads.
# * Outputs five FLAC files (English, Bangla, Chinese, Urdu, Nepali).
# -----------------------------------------------------------------
import os
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict
import gradio as gr
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from smolagents import HfApiModel
# ------------------------------------------------------------------
# LLM: Qwen 32‑B via SmolAgents
# ------------------------------------------------------------------
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,
    temperature=0.5,
    custom_role_conversions=None,
)
# ------------------------------------------------------------------
# HF Inference API client (reads HF_TOKEN secret if set)
# ------------------------------------------------------------------
client = InferenceClient(token=os.getenv("HF_TOKEN", None))
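# Note: if no HF_TOKEN secret is set, requests go out unauthenticated, which
# typically means much stricter serverless Inference API rate limits.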
# ------------------------------------------------------------------
# Language metadata and matching TTS model IDs
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    # MMS lacks mainstream Mandarin — fallback to an open Chinese TTS
    "zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd-script_arabic"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
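# Note: the dict keys are ISO 639-1 codes used only for file naming; the MMS
# checkpoints themselves are named with ISO 639-3 codes (eng, ben, urd, npi),
# plus a script suffix for Urdu.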
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of ≈1200 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.
    ### Lecture Content
    {content}
    """
)
# ------------------------------------------------------------------
# Helpers: extract and truncate PDF text
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)
TOKEN_LIMIT = 6000 # rough word‑level cap before hitting context limit
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    words = text.split()
    return " ".join(words[:limit])
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File) -> List[str]:
    """Generate a multilingual podcast from a lecture PDF."""
    # Newer Gradio versions pass the upload as a filepath string, older ones
    # as a tempfile-like object with a .name attribute.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    # Use mkdtemp rather than TemporaryDirectory so the generated audio files
    # still exist when Gradio reads them after this function returns.
    tmpdir = tempfile.mkdtemp()
    raw_text = extract_pdf_text(pdf_path)
    lecture_text = truncate_text(raw_text)
    outputs: List[str] = []
    for code, info in LANG_INFO.items():
        # 1️⃣ Draft the dialogue in the target language
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)
        # 2️⃣ Synthesize speech via the HF Inference API
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = Path(tmpdir) / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)
        outputs.append(str(flac_path))  # gr.Audio(type="filepath") expects a path
    return outputs
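# Optional: a minimal smoke-test sketch (not wired into the Gradio app) for
# exercising one TTS model in isolation before running the full pipeline.
# The helper name, default text, and output filename are illustrative only.
def _tts_smoke_test(text: str = "Hello from the podcast generator.",
                    lang: str = "en",
                    out_path: str = "tts_smoke_test.flac") -> str:
    """Synthesize a short clip with the same serverless client used above."""
    audio = client.text_to_speech(text, model=LANG_INFO[lang]["tts_model"])
    Path(out_path).write_bytes(audio)
    return out_path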
# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------
audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]
iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and receive a two-host audio podcast in five "
        "languages (English, Bangla, Chinese, Urdu, Nepali). Dialogue is "
        "crafted by Qwen-32B; speech is synthesized on the fly using the "
        "Hugging Face Inference API — no heavy downloads or GPUs required."
    ),
)
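# A single request runs five LLM drafts plus five TTS calls, which can be slow;
# enabling Gradio's request queue (iface.queue()) before launching helps avoid
# HTTP timeouts on long jobs.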
if __name__ == "__main__":
    iface.launch()