Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

PodCastIt / app.py

HaiderAUT

Update app.py

f0eca57 verified about 1 month ago

raw

history blame

4.67 kB

	# =============================================================
	# Hugging Face Space – Lecture → Multilingual Podcast Generator
	# =============================================================
	# * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
	# * Speech synthesis: Coqui XTTS‑v2 open model via the TTS lib
	# (no private / gated repo, so it runs without a HF token).
	# * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
	# -----------------------------------------------------------------

	import os
	import tempfile
	import uuid
	import textwrap
	from typing import List, Dict

	import gradio as gr
	from PyPDF2 import PdfReader
	from smolagents import HfApiModel
	from TTS.api import TTS # ↳ Coqui TTS

	# ------------------------------------------------------------------
	# LLM configuration (SmolAgents wrapper for HF Inference API)
	# ------------------------------------------------------------------
	llm = HfApiModel(
	model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
	max_tokens=2096,
	temperature=0.5,
	custom_role_conversions=None,
	)

	# ------------------------------------------------------------------
	# XTTS‑v2 multilingual text‑to‑speech (≈ 1.2 GB, CPU OK)
	# ------------------------------------------------------------------
	TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

	tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
	# Automatically downloads and caches the model on first run.

	LANG_INFO: Dict[str, Dict[str, str]] = {
	"en": {"name": "English"},
	"bn": {"name": "Bangla"},
	"zh": {"name": "Chinese"},
	"ur": {"name": "Urdu"},
	"ne": {"name": "Nepali"},
	}

	PROMPT_TEMPLATE = textwrap.dedent(
	"""
	You are producing a lively two‑host educational podcast in {lang_name}.
	Summarize the following lecture content into a dialogue of about 1200 words.
	Use an engaging style: hosts ask each other questions, clarify ideas, add
	simple analogies, and conclude with a short recap. Keep technical accuracy.

	### Lecture Content
	{content}
	"""
	)

	# ------------------------------------------------------------------
	# Utility: extract & truncate PDF text to fit the LLM token budget
	# ------------------------------------------------------------------

	def extract_pdf_text(pdf_file) -> str:
	reader = PdfReader(pdf_file)
	return "\n".join(p.extract_text() or "" for p in reader.pages)

	TOKEN_LIMIT = 6000 # ≈ tokens (safe margin for prompt + response)

	def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
	words = text.split()
	return " ".join(words[:limit])

	# ------------------------------------------------------------------
	# Main generation routine
	# ------------------------------------------------------------------

	def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
	with tempfile.TemporaryDirectory() as tmpdir:
	lecture_text = truncate_text(extract_pdf_text(pdf.name))
	audio_outputs = []

	for lang_code, info in LANG_INFO.items():
	# 1️⃣ Create prompt + generate dialogue
	prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
	dialogue = llm(prompt)

	# 2️⃣ Save raw dialogue text (for reference)
	txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
	with open(txt_path, "w", encoding="utf-8") as f:
	f.write(dialogue)

	# 3️⃣ Synthesise speech with XTTS‑v2
	wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
	# ► xtts_v2 accepts ISO‑639‑1 language codes directly
	tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)

	audio_outputs.append((wav_path, None)) # (file, label) for Gradio Audio

	return audio_outputs

	# ------------------------------------------------------------------
	# Gradio UI
	# ------------------------------------------------------------------

	audio_components = [
	gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
	]

	iface = gr.Interface(
	fn=generate_podcast,
	inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
	outputs=audio_components,
	title="Lecture → Multilingual Podcast Generator",
	description=(
	"Upload a lecture PDF and receive a two‑host audio podcast in English, "
	"Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the "
	"dialogue and Coqui XTTS‑v2 for speech synthesis — no private repos "
	"or API keys needed."
	),
	)

	if __name__ == "__main__":
	iface.launch()