# app.py
# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
# directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
# using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
# Face *audio‑generation* model for speech (no external TTS APIs).
# -----------------------------------------------------------------
# Files for your Space:
# • app.py (this file)
# • requirements.txt (see bottom comment block)
# -----------------------------------------------------------------
# Add your HF_TOKEN as a Space secret if required for gated models.
# =============================================================
import os
import tempfile
import uuid
import textwrap
from typing import Dict, Tuple
import gradio as gr
from PyPDF2 import PdfReader
import nltk # sentence tokenisation
from llama_index.llms.huggingface import HfApiModel
from transformers import pipeline # HF TTS pipeline
import soundfile as sf # save audio
# ---------------------------------------------------------------
# Ensure NLTK punkt is present on first launch
# ---------------------------------------------------------------
try:
    # `nltk.sent_tokenize` (used in build_prompt) needs the punkt data files.
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # First launch on a fresh Space: download the tokenizer data once.
    nltk.download("punkt")
# --------------------------- LLM Setup ---------------------------
# NOTE(review): `HfApiModel` is the class name used by smolagents; the
# llama_index HuggingFace integration exposes `HuggingFaceInferenceAPI`
# instead — confirm this import actually resolves with the pinned
# llama-index version before deploying.
llm = HfApiModel(
    max_tokens=2096,  # NOTE(review): 2096 looks like a typo for 2048 — confirm
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # text generation
    custom_role_conversions=None,
)
# ------------------------ TTS Setup ------------------------------
# Multilingual TTS model (≈ 500 MB). It supports >100 languages.
# If you need lighter weights choose language‑specific VITS models.
# ----------------------------------------------------------------
# NOTE(review): this looks like a Coqui-TTS model id, not a Hugging Face
# Hub repo id; the transformers "text-to-speech" pipeline expects a Hub
# model (e.g. "suno/bark" or "facebook/mms-tts-*") — verify this loads.
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
# Load once; Space queues requests so single GPU/CPU is okay.
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",  # GPU if available, else CPU
    )
except Exception as e:
    # Fail fast at import time rather than on the first user request.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}")
# ------------------------ Helpers --------------------------------
# Per-language settings: XTTS language code + the language name injected
# into the LLM prompt. Insertion order defines the UI component order.
LANG_CONFIG: Dict[str, Dict[str, str]] = {
    "English": {"tts_lang": "en", "prompt_tag": "English"},
    "Bangla": {"tts_lang": "bn", "prompt_tag": "Bangla (বাংলা)"},
    "Chinese": {"tts_lang": "zh", "prompt_tag": "Mandarin Chinese"},
    "Urdu": {"tts_lang": "ur", "prompt_tag": "Urdu (اردو)"},
    "Nepali": {"tts_lang": "ne", "prompt_tag": "Nepali (नेपाली)"},
}
def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, capped at `max_chars` characters.

    Pages are read in order and reading stops early once the running
    total reaches the cap, so very long PDFs are not fully parsed.
    """
    reader = PdfReader(pdf_path)
    collected = []
    total = 0
    for page in reader.pages:
        chunk = page.extract_text() or ""  # extract_text may return None
        collected.append(chunk)
        total += len(chunk)
        if total >= max_chars:
            break
    # Final slice enforces the cap exactly even if the last page overshot.
    return "\n".join(collected)[:max_chars]
def build_prompt(lecture_text: str, lang: str) -> str:
    """Assemble the LLM prompt requesting a two-host dialogue in `lang`."""
    # Keep only the first 150 sentences to stay under the token budget.
    leading_sentences = nltk.sent_tokenize(lecture_text)[:150]
    condensed = " ".join(leading_sentences)
    instructions = textwrap.dedent(
        f"""
You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.
Lecture content (for reference):
""".strip()
    )
    return instructions + "\n" + condensed + "\n"
def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Ask the Qwen model for a podcast script in the requested language.

    Never raises: LLM failures are returned as an error string so the
    rest of the per-language pipeline can continue.
    """
    prompt = build_prompt(lecture_text, lang)
    try:
        return llm.complete(prompt).text.strip()
    except Exception as e:
        return f"Error generating dialogue in {lang}: {e}"
def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesize `text` to a WAV file; return (file_path, mime_type).

    On TTS failure, writes the error into a plain-text file instead and
    returns that, so the UI always has something to show.
    """
    language_code = LANG_CONFIG[lang]["tts_lang"]
    out_dir = tempfile.gettempdir()
    wav_path = os.path.join(out_dir, f"{lang}_{uuid.uuid4().hex}.wav")
    try:
        # xtts_v2 accepts a `language` forward param
        result = tts_pipeline(text, forward_params={"language": language_code})
        sf.write(wav_path, result["audio"], result["sampling_rate"])
        return wav_path, "audio/wav"
    except Exception as e:
        # Return dummy text file explaining error
        err_path = os.path.join(out_dir, f"err_{uuid.uuid4().hex}.txt")
        with open(err_path, "w", encoding="utf-8") as handle:
            handle.write(f"TTS error for {lang}: {e}\n")
        return err_path, "text/plain"
def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Run extract → dialogue → TTS for every configured language.

    Returns a mapping of language name → (file_path, mime_type).
    Raises gr.Error when no PDF was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")
    lecture_text = extract_text(pdf_file)
    outputs: Dict[str, Tuple[str, str]] = {}
    for lang in LANG_CONFIG:
        script = generate_dialogue(lecture_text, lang)
        outputs[lang] = tts_for_dialogue(lang, script)
    return outputs
# ------------------------ Gradio UI --------------------------------
# ------------------------ Gradio UI --------------------------------
# Component creation order matters: the audio players are created in
# LANG_CONFIG order and the click handler returns paths in that order.
with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        """# 📚🎙️ Lecture → Podcast
Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.
"""
    )
    with gr.Row():
        pdf_input = gr.File(label="Lecture PDF", file_types=[".pdf"])
        generate_btn = gr.Button("Generate Podcast")
    with gr.Group():
        audio_players = [
            gr.Audio(label=language, interactive=False, type="filepath")
            for language in LANG_CONFIG
        ]

    def gradio_wrapper(pdf_file):
        """Adapt pipeline_runner's dict to the flat list Gradio expects."""
        produced = pipeline_runner(pdf_file)
        return [produced[language][0] for language in LANG_CONFIG]

    generate_btn.click(gradio_wrapper, inputs=pdf_input, outputs=audio_players)
if __name__ == "__main__":
    demo.launch()
# ---------------------------------------------------------------
# requirements.txt (commit as separate file in the Space root)
# ---------------------------------------------------------------
# gradio>=4.28.0
# PyPDF2>=3.0.1
# nltk>=3.8.1
# transformers>=4.39.0
# torch>=2.1.2
# soundfile>=0.12.1
# llama-index>=0.11.47
# llama-index-llms-huggingface  (required for `llama_index.llms.huggingface` imports)
# huggingface-hub>=0.23.0
# ---------------------------------------------------------------