# PodCastIt / app.py
# Source: Hugging Face Space by HaiderAUT — commit c172b12 ("Update app.py", verified)
# NOTE: scraped page chrome (raw / history / blame links, "5.61 kB" size note)
# converted to comments so this file parses as Python.
# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User‑selectable Languages)
# =============================================================
# * **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct).
# * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`.
# * Users pick which languages to generate (English, Bangla, Chinese,
# Urdu, Nepali). Unselected languages are skipped.
# -----------------------------------------------------------------
import os
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import gradio as gr
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from smolagents import HfApiModel
# ------------------------------------------------------------------
# LLM: Qwen 32‑B via SmolAgents
# ------------------------------------------------------------------
# Module-level singleton used by generate_podcast() to draft dialogue.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,  # cap on generated tokens per call (≈1200-word dialogue fits)
    temperature=0.5,  # mid temperature: some variety, still on-topic
    custom_role_conversions=None,
)
# ------------------------------------------------------------------
# HF Inference API client (reads HF_TOKEN secret if set)
# ------------------------------------------------------------------
# Anonymous access works but is rate-limited; set HF_TOKEN in Space secrets.
client = InferenceClient(token=os.getenv("HF_TOKEN", None))
# ------------------------------------------------------------------
# Language metadata and matching TTS model IDs
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
"en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
"bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
"zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
"ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd-script_arabic"},
"ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
# Helper map: name ➜ code
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two‑host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of ≈1200 words.
Make it engaging: hosts ask questions, clarify ideas with analogies, and
wrap up with a concise recap. Preserve technical accuracy.
### Lecture Content
{content}
"""
)
# ------------------------------------------------------------------
# Helpers: extract and truncate PDF text
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF, one page per line.

    Pages whose extraction yields None (e.g. image-only pages) contribute an
    empty string rather than raising.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
TOKEN_LIMIT = 6000  # rough word‑level cap before hitting context limit


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most the first `limit` whitespace-separated words of `text`.

    Words are re-joined with single spaces, so runs of whitespace collapse.
    """
    return " ".join(text.split()[:limit])
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File, selected_lang_names: List[str]) -> List[Optional[Tuple[str, None]]]:
    """Generate podcast audio files for the chosen languages.

    Args:
        pdf: Uploaded lecture PDF (Gradio file object; `.name` is its path).
        selected_lang_names: Display names chosen in the CheckboxGroup.

    Returns:
        A list aligned with LANG_INFO order — one (filepath, None) tuple per
        selected language, None for unselected ones — so each gr.Audio
        output component receives the right slot.
    """
    # Ensure at least one language selected
    if not selected_lang_names:
        return [None] * len(LANG_INFO)

    selected_codes = {LANG_CODE_BY_NAME[name] for name in selected_lang_names}

    # BUG FIX: the original wrote files inside `with TemporaryDirectory()`,
    # which deletes the directory as the function returns — Gradio then
    # served dangling paths. mkdtemp() leaves the directory in place so the
    # audio files still exist when the response is read.
    out_dir = Path(tempfile.mkdtemp(prefix="podcast_"))

    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    outputs: List[Optional[Tuple[str, None]]] = []
    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            outputs.append(None)
            continue
        # 1️⃣ Draft dialogue in the target language
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)
        # 2️⃣ Synthesize speech via HF Inference API
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = out_dir / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)
        outputs.append((str(flac_path), None))  # (filepath, label)
    return outputs
# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------
# Checkbox choices are the human-readable names; generate_podcast maps
# them back to codes via LANG_CODE_BY_NAME.
language_choices = [info["name"] for info in LANG_INFO.values()]

inputs = [
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_choices,
        value=["English"],  # English pre-selected by default
        label="Select podcast language(s) to generate",
    ),
]

# One Audio output per language, in LANG_INFO order — generate_podcast
# returns a list aligned with this order (None for unselected slots).
audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
]
# Wire the pipeline into a simple Interface: one fn, fixed inputs/outputs.
iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=audio_components,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose your desired languages, and receive a "
        "two‑host audio podcast. Dialogue is crafted by Qwen‑32B; speech is "
        "synthesized on‑the‑fly using the Hugging Face Inference API — "
        "no heavy downloads or GPUs required."
    ),
)

# Launch only when run as a script (HF Spaces executes this module directly).
if __name__ == "__main__":
    iface.launch()