PodCastIt / app.py
HaiderAUT's picture
Upload 2 files
910bbfc verified
raw
history blame
7.47 kB
# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
# directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
# using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
# Face *audio‑generation* model for speech (no external TTS APIs).
# -----------------------------------------------------------------
# Files for your Space:
# • app.py (this file)
# • requirements.txt (see bottom comment block)
# -----------------------------------------------------------------
# Add your HF_TOKEN as a Space secret if required for gated models.
# =============================================================
import os
import tempfile
import uuid
import textwrap
from typing import Dict, Tuple
import gradio as gr
from PyPDF2 import PdfReader
import nltk # sentence tokenisation
from llama_index.llms.huggingface import HfApiModel
from transformers import pipeline # HF TTS pipeline
import soundfile as sf # save audio
# ---------------------------------------------------------------
# Ensure NLTK punkt is present on first launch
# ---------------------------------------------------------------
# Make sure the sentence-tokeniser data is available on first launch.
# NLTK 3.9+ looks up the "punkt_tab" resource for sentence tokenisation,
# while older releases use "punkt"; fetch whichever is missing so that
# `nltk.sent_tokenize` works with any version allowed by requirements.txt.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)
# --------------------------- LLM Setup ---------------------------
# Shared text-generation client for the podcast script, created once at import.
# NOTE(review): `HfApiModel` with these keyword arguments resembles the
# smolagents API, yet it is imported from llama_index and later used via
# `llm.complete(...)` — confirm the import and call style actually match.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # text generation
    temperature=0.5,
    max_tokens=2096,
    custom_role_conversions=None,
)
# ------------------------ TTS Setup ------------------------------
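# Multilingual TTS model (≈ 500 MB); described here as supporting >100 languages.
# NOTE(review): the published XTTS‑v2 model card lists 17 languages — verify the
# size and that Bangla, Urdu and Nepali are actually covered before relying on it.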
# If you need lighter weights choose language‑specific VITS models.
# ----------------------------------------------------------------
# NOTE(review): this identifier follows the Coqui TTS naming scheme rather
# than a Hugging Face Hub "owner/name" repo id — confirm that
# `transformers.pipeline` can resolve it.
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"

# Load once; Space queues requests so single GPU/CPU is okay.
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",  # GPU if available, else CPU
    )
except Exception as e:
    # Fail fast at import time, chaining the original exception so the root
    # cause stays attached to the RuntimeError in the traceback.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}") from e
# ------------------------ Helpers --------------------------------
# Per-language settings, keyed by the display name used as the audio label:
#   tts_lang   – language code handed to the TTS pipeline
#   prompt_tag – language name interpolated into the LLM prompt
# Insertion order matters: other code iterates this mapping in order.
LANG_CONFIG = {
    name: {"tts_lang": code, "prompt_tag": tag}
    for name, code, tag in (
        ("English", "en", "English"),
        ("Bangla", "bn", "Bangla (বাংলা)"),
        ("Chinese", "zh", "Mandarin Chinese"),
        ("Urdu", "ur", "Urdu (اردو)"),
        ("Nepali", "ne", "Nepali (नेपाली)"),
    )
}
def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, truncated to avoid token overflow.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The text of the leading pages joined with newlines and cut to at
        most ``max_chars`` characters. Pages without extractable text
        contribute an empty string.
    """
    reader = PdfReader(pdf_path)
    text_parts = []
    collected = 0  # running character count; avoids re-summing every page
    for page in reader.pages:
        page_text = page.extract_text() or ""
        text_parts.append(page_text)
        collected += len(page_text)
        if collected >= max_chars:
            # Enough material gathered; skip the remaining pages.
            break
    # The newline separators can push the total past the limit, so the final
    # slice enforces the bound.
    return "\n".join(text_parts)[:max_chars]
def build_prompt(lecture_text: str, lang: str) -> str:
    """Craft a prompt instructing the LLM to return a dialogue in `lang`.

    Args:
        lecture_text: Raw lecture text; only its first 150 sentences (as
            split by ``nltk.sent_tokenize``) are included in the prompt.
        lang: A key of ``LANG_CONFIG``; its ``prompt_tag`` names the
            language the dialogue must be written in.

    Returns:
        The instruction paragraph, a "Lecture content" header line, the
        shortened lecture text and a trailing newline.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANG_CONFIG``.
    """
    # Compress lecture to at most 150 sentences to stay under token budget.
    sentences = nltk.sent_tokenize(lecture_text)
    short_text = " ".join(sentences[:150])

    # Assembled from adjacent literals instead of dedent(<literal>.strip()):
    # stripping before dedenting removes the first line's margin, so dedent
    # finds no common indentation and would leave any indentation of the
    # following lines inside the prompt.
    instructions = (
        "You are simulating a podcast with two lively hosts, A and B. "
        "Their job is to discuss the following lecture, summarise key points, "
        "quiz each other, and clarify concepts so a student listening can "
        "follow along. Produce a back‑and‑forth conversation "
        f"**in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, "
        "totalling about 800–1000 words. Prefix each utterance with the "
        "speaker name (A: ... / B: ...). Avoid adding any narration outside "
        "the dialogue.\n"
        "Lecture content (for reference):"
    )
    return instructions + "\n" + short_text + "\n"
def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Ask the Qwen model for a podcast script in the requested language.

    Best-effort: any failure while calling the model or reading its reply is
    converted into an error message, which is returned in place of the script.
    Errors raised while building the prompt are not caught here.
    """
    llm_prompt = build_prompt(lecture_text, lang)
    try:
        return llm.complete(llm_prompt).text.strip()
    except Exception as exc:
        return f"Error generating dialogue in {lang}: {exc}"
def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesise speech for `text` with the HF TTS pipeline.

    Returns a ``(filepath, mime)`` pair. On success this is a freshly written
    WAV file and ``"audio/wav"``; if synthesis or writing fails, it is a small
    text file describing the error and ``"text/plain"``. Both kinds of file
    are created in the system temporary directory under unique names.
    """
    tts_code = LANG_CONFIG[lang]["tts_lang"]
    out_dir = tempfile.gettempdir()
    wav_path = os.path.join(out_dir, f"{lang}_{uuid.uuid4().hex}.wav")
    try:
        # The language code is passed to the model as a `language` forward param.
        result = tts_pipeline(text, forward_params={"language": tts_code})
        sf.write(wav_path, result["audio"], result["sampling_rate"])
    except Exception as exc:
        # Leave a plain-text report behind instead of propagating the failure.
        error_path = os.path.join(out_dir, f"err_{uuid.uuid4().hex}.txt")
        with open(error_path, "w", encoding="utf-8") as report:
            report.write(f"TTS error for {lang}: {exc}\n")
        return error_path, "text/plain"
    return wav_path, "audio/wav"
def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Run the lecture → podcast flow once per configured language.

    The PDF text is extracted a single time; then, for every key of
    ``LANG_CONFIG`` in order, a dialogue is generated and converted to speech.

    Returns:
        Mapping of language name to the ``(file_path, mime)`` pair produced
        by ``tts_for_dialogue``.

    Raises:
        gr.Error: If no PDF was supplied (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")
    lecture_text = extract_text(pdf_file)
    return {
        lang: tts_for_dialogue(lang, generate_dialogue(lecture_text, lang))
        for lang in LANG_CONFIG
    }
# ------------------------ Gradio UI --------------------------------
# Page layout: PDF upload + trigger button, then one audio player per language.
with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        "# 📚🎙️ Lecture → Podcast\n"
        "Upload a lecture PDF and receive a two‑host audio podcast generated "
        "**directly** in five languages using Qwen for text and XTTS‑v2 for "
        "speech.\n"
    )
    with gr.Row():
        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
        btn = gr.Button("Generate Podcast")
    with gr.Group():
        # One player per language, in LANG_CONFIG order; gradio_wrapper must
        # return its values in the same order.
        audio_components = [
            gr.Audio(label=lang, interactive=False, type="filepath")
            for lang in LANG_CONFIG
        ]

    def gradio_wrapper(pdf_file):
        """Run the pipeline and map its results onto the audio components.

        Returns one value per language in ``LANG_CONFIG`` order: the audio
        file path when speech was produced, otherwise ``None`` so the player
        stays empty. ``tts_for_dialogue`` reports failures as a ``text/plain``
        file, which must not be handed to ``gr.Audio``; its content is shown
        to the user as a warning instead.
        """
        results = pipeline_runner(pdf_file)
        outputs = []
        for lang in LANG_CONFIG:
            path, mime = results[lang]
            if mime.startswith("audio/"):
                outputs.append(path)
                continue
            # Surface the error report rather than feeding a .txt to the player.
            try:
                with open(path, encoding="utf-8") as fh:
                    detail = fh.read().strip()
            except OSError:
                detail = f"TTS failed for {lang}"
            gr.Warning(detail)
            outputs.append(None)
        return outputs

    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)

if __name__ == "__main__":
    demo.launch()
# ---------------------------------------------------------------
# requirements.txt (commit as separate file in the Space root)
# ---------------------------------------------------------------
# gradio>=4.28.0
# PyPDF2>=3.0.1
# nltk>=3.8.1
# transformers>=4.39.0
# torch>=2.1.2
# soundfile>=0.12.1
# llama-index>=0.11.47
# huggingface-hub>=0.23.0