Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

PodCastIt / app.py

HaiderAUT

Upload 2 files

910bbfc verified about 1 month ago

raw

history blame

7.47 kB

	# =============================================================
	# Hugging Face Space – Lecture → Multilingual Podcast Generator
	# =============================================================
	# Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
	# directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
	# using Qwen/Qwen2.5‑Coder‑32B‑Instruct for text AND a Hugging
	# Face audio‑generation model for speech (no external TTS APIs).
	# -----------------------------------------------------------------
	# Files for your Space:
	# • app.py (this file)
	# • requirements.txt (see bottom comment block)
	# -----------------------------------------------------------------
	# Add your HF_TOKEN as a Space secret if required for gated models.
	# =============================================================

	import os
	import tempfile
	import uuid
	import textwrap
	from typing import Dict, Tuple

	import gradio as gr
	from PyPDF2 import PdfReader
	import nltk # sentence tokenisation
	from llama_index.llms.huggingface import HfApiModel
	from transformers import pipeline # HF TTS pipeline
	import soundfile as sf # save audio

	# ---------------------------------------------------------------
	# Ensure NLTK punkt is present on first launch
	# ---------------------------------------------------------------
	# nltk.sent_tokenize (used in build_prompt) needs the Punkt sentence data.
	# Newer NLTK releases look the data up as "punkt_tab" while older ones use
	# "punkt", so make sure both resources are available on first launch.
	for _punkt_resource in ("punkt", "punkt_tab"):
	    try:
	        nltk.data.find(f"tokenizers/{_punkt_resource}")
	    except LookupError:
	        # Best effort: a resource id the installed NLTK cannot fetch is
	        # expected to be reported by nltk.download rather than raised.
	        nltk.download(_punkt_resource)

	# --------------------------- LLM Setup ---------------------------
	# Text-generation backend shared by every request.
	# NOTE(review): confirm that the `HfApiModel` imported at the top of this
	# file really lives in that module, accepts these keyword arguments, and
	# exposes the `.complete()` method that generate_dialogue() calls.
	LLM_MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct"  # text generation
	llm = HfApiModel(
	    model_id=LLM_MODEL_ID,
	    max_tokens=2096,
	    temperature=0.5,
	    custom_role_conversions=None,
	)

	# ------------------------ TTS Setup ------------------------------
	# One speech model shared by all languages, loaded once at import time.
	# NOTE(review): the identifier below looks like a Coqui-TTS model name
	# rather than a Hugging Face Hub repo id, and the language codes in
	# LANG_CONFIG ("bn", "ur", "ne", "zh") should be checked against what the
	# model actually supports -- confirm before relying on it.
	# NOTE(review): device_map="auto" presumably needs the `accelerate`
	# package at runtime -- confirm it is listed in requirements.txt.
	# If you need lighter weights choose language‑specific VITS models.
	# ----------------------------------------------------------------
	TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
	# Load once; Space queues requests so single GPU/CPU is okay.
	try:
	    tts_pipeline = pipeline(
	        "text-to-speech",
	        model=TTS_MODEL_ID,
	        device_map="auto",  # GPU if available, else CPU
	    )
	except Exception as e:
	    # Chain the original exception so the build log keeps the root cause.
	    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}") from e

	# ------------------------ Helpers --------------------------------
	# UI language name -> per-language settings used downstream:
	#   "tts_lang"   : language code handed to the TTS pipeline
	#   "prompt_tag" : language name interpolated into the LLM prompt
	# Insertion order matters: it fixes the order of the audio players in the UI.
	LANG_CONFIG = {
	    name: {"tts_lang": code, "prompt_tag": tag}
	    for name, code, tag in (
	        ("English", "en", "English"),
	        ("Bangla", "bn", "Bangla (বাংলা)"),
	        ("Chinese", "zh", "Mandarin Chinese"),
	        ("Urdu", "ur", "Urdu (اردو)"),
	        ("Nepali", "ne", "Nepali (नेपाली)"),
	    )
	}


	def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
	    """Return the text of a PDF, truncated to at most `max_chars` characters.

	    Pages are read in order and reading stops as soon as enough characters
	    have been collected, so long documents are not parsed in full.

	    Args:
	        pdf_path: Location of the PDF, passed straight to ``PdfReader``.
	        max_chars: Upper bound on the length of the returned string; keeps
	            the prompt built from this text within the token budget.

	    Returns:
	        The page texts joined with newlines and cut to `max_chars`
	        characters. Pages without extractable text contribute "".
	    """
	    reader = PdfReader(pdf_path)
	    text_parts = []
	    collected = 0  # running total; avoids re-summing every part per page
	    for page in reader.pages:
	        page_text = page.extract_text() or ""
	        text_parts.append(page_text)
	        collected += len(page_text)
	        if collected >= max_chars:
	            break
	    # Joining adds separators, so the final slice enforces the hard limit.
	    return "\n".join(text_parts)[:max_chars]


	def build_prompt(lecture_text: str, lang: str, max_sentences: int = 150) -> str:
	    """Build the LLM prompt asking for a two-host dialogue in `lang`.

	    Args:
	        lecture_text: Raw lecture text to be discussed by the hosts.
	        lang: A key of ``LANG_CONFIG``; its "prompt_tag" names the target
	            language inside the instructions.
	        max_sentences: Number of leading sentences of the lecture to keep,
	            so the prompt stays within the token budget.

	    Returns:
	        The instructions, a blank line, a "Lecture content" header and the
	        shortened lecture text, terminated by a newline.

	    Raises:
	        KeyError: If `lang` is not a key of ``LANG_CONFIG``.
	    """
	    sentences = nltk.sent_tokenize(lecture_text)
	    # Slicing already copes with lectures shorter than the limit.
	    short_text = " ".join(sentences[:max_sentences])

	    # Assemble the text from explicit pieces instead of an indented
	    # triple-quoted literal: stripping before dedenting left the source
	    # indentation inside the prompt.
	    instructions = (
	        "You are simulating a podcast with two lively hosts, A and B. "
	        "Their job is to discuss the following lecture, summarise key points, "
	        "quiz each other, and clarify concepts so a student listening can "
	        "follow along. Produce a back‑and‑forth conversation in "
	        f"{LANG_CONFIG[lang]['prompt_tag']}, roughly 40 turns, totalling about "
	        "800–1000 words. Prefix each utterance with the speaker name "
	        "(A: ... / B: ...). Avoid adding any narration outside the dialogue."
	    )
	    return f"{instructions}\n\nLecture content (for reference):\n{short_text}\n"


	def generate_dialogue(lecture_text: str, lang: str) -> str:
	    """Ask the LLM for a podcast script about `lecture_text` in `lang`.

	    Returns the stripped completion text. If the LLM call fails, the
	    returned string is an error description instead of a script; prompt
	    construction errors are not caught.
	    """
	    # Built outside the try block so prompt errors propagate to the caller.
	    request = build_prompt(lecture_text, lang)
	    try:
	        return llm.complete(request).text.strip()
	    except Exception as exc:
	        return f"Error generating dialogue in {lang}: {exc}"


	def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
	    """Synthesise `text` into a WAV file inside the system temp directory.

	    Returns ``(path, "audio/wav")`` on success. If synthesis or writing
	    fails, the error message is written to a text file instead and
	    ``(path, "text/plain")`` is returned, so callers must check the MIME
	    type before treating the path as audio.
	    """
	    code = LANG_CONFIG[lang]["tts_lang"]
	    out_dir = tempfile.gettempdir()
	    wav_path = os.path.join(out_dir, f"{lang}_{uuid.uuid4().hex}.wav")
	    try:
	        # The language code is passed to the model as a forward parameter.
	        result = tts_pipeline(text, forward_params={"language": code})
	        sf.write(wav_path, result["audio"], result["sampling_rate"])
	    except Exception as exc:
	        # Leave a small text file describing the failure in place of audio.
	        report_path = os.path.join(out_dir, f"err_{uuid.uuid4().hex}.txt")
	        with open(report_path, "w", encoding="utf-8") as report:
	            report.write(f"TTS error for {lang}: {exc}\n")
	        return report_path, "text/plain"
	    return wav_path, "audio/wav"


	def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
	    """Run the full PDF -> script -> speech pipeline for every language.

	    Args:
	        pdf_file: The uploaded lecture as received from the file input, or
	            ``None`` when nothing was uploaded.

	    Returns:
	        A dict mapping each ``LANG_CONFIG`` key to the ``(file_path, mime)``
	        pair produced by ``tts_for_dialogue``.

	    Raises:
	        gr.Error: If no file was uploaded or the PDF yields no text.
	    """
	    if pdf_file is None:
	        raise gr.Error("Please upload a PDF lecture first.")
	    lecture_text = extract_text(pdf_file)
	    # Image-only or empty PDFs give no text; stop before spending LLM and
	    # TTS calls on a prompt that has no lecture content.
	    if not lecture_text.strip():
	        raise gr.Error("No extractable text was found in the uploaded PDF.")

	    audio_outputs = {}
	    for lang in LANG_CONFIG:
	        dialogue = generate_dialogue(lecture_text, lang)
	        audio_outputs[lang] = tts_for_dialogue(lang, dialogue)
	    return audio_outputs


	# ------------------------ Gradio UI --------------------------------

	def gradio_wrapper(pdf_file):
	    """Click handler: return one audio file path per language, in UI order.

	    A language whose result is not audio (tts_for_dialogue reports failures
	    as a "text/plain" file) yields ``None`` so its player stays empty
	    instead of being handed a text file, and a warning is shown.
	    """
	    results = pipeline_runner(pdf_file)
	    outputs = []
	    for lang in LANG_CONFIG:
	        path, mime = results[lang]
	        if mime.startswith("audio/"):
	            outputs.append(path)
	        else:
	            gr.Warning(
	                f"Speech synthesis failed for {lang}; details were written to {path}."
	            )
	            outputs.append(None)
	    return outputs


	with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
	    # Built from explicit pieces so the rendered Markdown does not depend on
	    # how this source file is indented.
	    gr.Markdown(
	        "# 📚🎙️ Lecture → Podcast\n"
	        "Upload a lecture PDF and receive a two‑host audio podcast generated "
	        "directly in five languages using Qwen for text and XTTS‑v2 for speech.\n"
	    )
	    with gr.Row():
	        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
	        btn = gr.Button("Generate Podcast")
	    with gr.Group():
	        # One player per language, in LANG_CONFIG order; gradio_wrapper
	        # returns its paths in the same order.
	        audio_components = [
	            gr.Audio(label=lang, interactive=False, type="filepath")
	            for lang in LANG_CONFIG
	        ]

	    # Wire the event while the Blocks context is still open.
	    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)

	# Start the server only when executed as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()

	# ---------------------------------------------------------------
	# requirements.txt (commit as separate file in the Space root)
	# ---------------------------------------------------------------
	# gradio>=4.28.0
	# PyPDF2>=3.0.1
	# nltk>=3.8.1
	# transformers>=4.39.0
	# torch>=2.1.2
	# soundfile>=0.12.1
	# llama-index>=0.11.47
	# huggingface-hub>=0.23.0