# app.py — PodCastIt (Hugging Face Space)
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional

import gradio as gr
from huggingface_hub import InferenceClient
from huggingface_hub.utils import HfHubHTTPError
from PyPDF2 import PdfReader  # For PDF processing
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from smolagents import HfApiModel  # For LLM interaction
# ------------------------------------------------------------------
# LLM setup – remote Qwen model via SmolAgents
# ------------------------------------------------------------------
# Remote chat model used to turn lecture text into a two-host dialogue.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2048,   # cap on the generated dialogue length
    temperature=0.5,   # moderate sampling: lively but technically faithful
)
# ------------------------------------------------------------------
# Hugging Face Inference API client
# ------------------------------------------------------------------
# Used for the text-to-speech calls; HF_TOKEN is optional (anonymous if unset).
client = InferenceClient(token=os.getenv("HF_TOKEN", None))
# ------------------------------------------------------------------
# Language metadata and open TTS models
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
"en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
"bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
"zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
"ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
"ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two-host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of ~300 words.
Make it engaging: hosts ask questions, clarify ideas with analogies, and
wrap up with a concise recap. Preserve technical accuracy.
### Lecture Content
{content}
"""
)
TOKEN_LIMIT = 8000
CHUNK_CHAR_LIMIT = 280
# ------------------------------------------------------------------
# PDF text extraction
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
try:
reader = PdfReader(pdf_path)
return "\n".join(page.extract_text() or "" for page in reader.pages)
except Exception as e:
raise gr.Error(f"Failed to process PDF: {e}")
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
words = text.split()
if len(words) > limit:
return " ".join(words[:limit])
return text
def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
chunks, current = [], ""
for sent in sentences:
if current and len(current) + len(sent) + 1 > limit:
chunks.append(current)
current = sent
else:
current = f"{current} {sent}".strip()
if current:
chunks.append(current)
return chunks
def synthesize_speech(text: str, model_id: str, tempdir: Path) -> Path:
    """Convert *text* to speech via the HF Inference API; return a FLAC path.

    The text is split into short chunks (long inputs are unreliable on the
    hosted TTS endpoints), each chunk is synthesized separately, and the
    decoded segments are concatenated into ``tempdir/podcast.flac``.

    Raises:
        ValueError: if *text* yields no chunks.
        RuntimeError: if a TTS request fails or a segment cannot be decoded.
    """
    chunks = _split_to_chunks(text)
    if not chunks:
        raise ValueError("No text chunks to synthesize.")
    segments: List[AudioSegment] = []
    for i, chunk in enumerate(chunks):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        # BUG FIX: the original caught the undefined name ``HubHTTPError``,
        # which turned any TTS failure into a NameError. ``HfHubHTTPError``
        # is the actual exception raised by huggingface_hub on HTTP errors.
        except HfHubHTTPError as e:
            raise RuntimeError(f"TTS error on chunk {i}: {e}") from e
        part = tempdir / f"seg_{i}.flac"
        part.write_bytes(audio_bytes)
        try:
            seg = AudioSegment.from_file(part, format="flac")
        except CouldntDecodeError as e:
            raise RuntimeError(f"Decode error on chunk {i}: {e}") from e
        segments.append(seg)
    # Concatenate all segments into one audio stream.
    combined = sum(segments, AudioSegment.empty())
    outpath = tempdir / "podcast.flac"
    combined.export(outpath, format="flac")
    return outpath
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf_file: Optional[gr.File], languages: List[str]):
    """Build a dialogue transcript and a TTS audio file per selected language.

    Returns one (transcript, audio_path) pair for every language in
    ``LANG_INFO`` order — ``None`` placeholders for unselected languages —
    so the number of return values always matches the Gradio output
    components wired up below.

    Raises:
        gr.Error: missing upload, empty selection, or unreadable/empty PDF.
    """
    if not pdf_file:
        raise gr.Error("Please upload a PDF file.")
    if not languages:
        raise gr.Error("Select at least one language.")
    # Newer Gradio versions pass the upload as a plain filepath string;
    # older ones pass a file-like object with a .name attribute.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_pdf_text(pdf_path)
    if not text.strip():
        raise gr.Error("No text found in PDF.")
    lecture = truncate_text(text)
    # BUG FIX: the original wrote audio into a ``tempfile.TemporaryDirectory``
    # that was deleted when the ``with`` block exited, so the returned paths
    # pointed at files Gradio could no longer serve. ``mkdtemp`` persists.
    base = Path(tempfile.mkdtemp(prefix="podcast_"))
    results: List[Optional[str]] = []
    for code, info in LANG_INFO.items():
        name = info["name"]
        if name not in languages:
            # BUG FIX: keep output slots aligned with the UI — the Interface
            # declares two components per language, selected or not.
            results.extend([None, None])
            continue
        # 1️⃣ Dialogue generation via the LLM.
        prompt = PROMPT_TEMPLATE.format(lang_name=name, content=lecture)
        dialogue = llm(prompt).strip()
        # 2️⃣ Speech synthesis into a per-language subdirectory.
        lang_dir = base / code
        lang_dir.mkdir(parents=True, exist_ok=True)
        audio_path = synthesize_speech(dialogue, info["tts_model"], lang_dir)
        results.extend([dialogue, str(audio_path)])
    return results
# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
languages = [info["name"] for info in LANG_INFO.values()]

inputs = [
    gr.File(label="Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(languages, value=["English"], label="Languages"),
]
# Two outputs per language: transcript textbox followed by audio player.
outputs = [
    component
    for name in languages
    for component in (
        gr.Textbox(label=f"{name} Transcript", interactive=False),
        gr.Audio(label=f"{name} Podcast", type="filepath"),
    )
]
iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator",
    description="Upload a lecture PDF, select languages, get dialogue transcript and audio podcast."
)
if __name__ == "__main__":
    iface.launch()