|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
import tempfile
|
|
import uuid
|
|
import textwrap
|
|
from typing import Dict, Tuple
|
|
|
|
import gradio as gr
|
|
from PyPDF2 import PdfReader
|
|
import nltk
|
|
from llama_index.llms.huggingface import HfApiModel
|
|
from transformers import pipeline
|
|
import soundfile as sf
|
|
|
|
|
|
|
|
|
|
# Make sure the NLTK sentence-tokenizer data is available before
# `nltk.sent_tokenize` is called by `build_prompt`.  Older NLTK releases
# look up the "punkt" resource while recent ones look up "punkt_tab", so
# both are checked and fetched on demand.
for _tokenizer_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_tokenizer_resource}")
    except LookupError:
        nltk.download(_tokenizer_resource)
|
|
|
|
|
|
# Text-generation client shared by the whole module; `generate_dialogue`
# calls `llm.complete(...)` on it.
# NOTE(review): `HfApiModel` with these keyword arguments resembles the
# smolagents model class -- confirm that `llama_index.llms.huggingface`
# really exports it and that the resulting object offers `.complete()`.
_LLM_SETTINGS = {
    "max_tokens": 2096,
    "temperature": 0.5,
    "model_id": "Qwen/Qwen2.5-Coder-32B-Instruct",
    "custom_role_conversions": None,
}

llm = HfApiModel(**_LLM_SETTINGS)
|
|
|
|
|
|
|
|
|
|
|
|
# Identifier of the speech model handed to the transformers pipeline.
# NOTE(review): this string follows the Coqui-TTS model naming scheme rather
# than a Hugging Face Hub repo id ("org/name") -- confirm that
# `transformers.pipeline` can actually resolve it.
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"

# Load the synthesiser once at import time so every request reuses it.
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",
    )
except Exception as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}") from e
|
|
|
|
|
|
# Supported podcast languages.  Each row is
#   (label shown in the UI, language code passed to the TTS call,
#    language name interpolated into the LLM prompt).
# NOTE(review): confirm that the configured TTS model accepts every one of
# these language codes.
_LANGUAGE_ROWS = (
    ("English", "en", "English"),
    ("Bangla", "bn", "Bangla (বাংলা)"),
    ("Chinese", "zh", "Mandarin Chinese"),
    ("Urdu", "ur", "Urdu (اردو)"),
    ("Nepali", "ne", "Nepali (नेपाली)"),
)

# Mapping: language label -> {"tts_lang": ..., "prompt_tag": ...}.
# Insertion order follows the table above and drives the UI ordering.
LANG_CONFIG = {
    label: {"tts_lang": tts_code, "prompt_tag": prompt_name}
    for label, tts_code, prompt_name in _LANGUAGE_ROWS
}
|
|
|
|
|
|
def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, truncated to avoid token overflow.

    Pages are read in order and reading stops as soon as at least
    ``max_chars`` characters have been collected.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The collected page texts joined with newlines and cut to at most
        ``max_chars`` characters.
    """
    reader = PdfReader(pdf_path)
    text_parts = []
    # Running character count, so the collected pages are not re-summed on
    # every iteration.
    collected = 0
    for page in reader.pages:
        # Normalise a falsy extraction result to an empty string.
        page_text = page.extract_text() or ""
        text_parts.append(page_text)
        collected += len(page_text)
        if collected >= max_chars:
            break
    # The joining newlines may push the total past the limit, hence the slice.
    return "\n".join(text_parts)[:max_chars]
|
|
|
|
|
|
def build_prompt(lecture_text: str, lang: str) -> str:
    """Craft a prompt instructing the LLM to return a dialogue in `lang`.

    Args:
        lecture_text: Raw lecture text; only its first 150 sentences (as
            split by ``nltk.sent_tokenize``) are embedded in the prompt.
        lang: A key of ``LANG_CONFIG``; its ``prompt_tag`` names the target
            language inside the instructions.

    Returns:
        The instruction paragraph, a blank line, the
        "Lecture content (for reference):" header, the shortened lecture
        text, and a trailing newline.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANG_CONFIG``.
    """
    sentences = nltk.sent_tokenize(lecture_text)
    # Slicing already clamps to the list length, so no min() is required.
    short_text = " ".join(sentences[:150])

    language_tag = LANG_CONFIG[lang]["prompt_tag"]

    # The prompt is assembled from explicit pieces instead of a dedented
    # triple-quoted literal: stripping the literal before dedenting it leaves
    # dedent without a common margin, so indented source lines would leak
    # their leading whitespace into the prompt.
    instructions = (
        "You are simulating a podcast with two lively hosts, A and B. "
        "Their job is to discuss the following lecture, summarise key points, "
        "quiz each other, and clarify concepts so a student listening can "
        "follow along. "
        f"Produce a back‑and‑forth conversation **in {language_tag}**, "
        "roughly 40 turns, totalling about 800–1000 words. "
        "Prefix each utterance with the speaker name (A: ... / B: ...). "
        "Avoid adding any narration outside the dialogue."
    )
    header = "Lecture content (for reference):"
    return f"{instructions}\n\n{header}\n{short_text}\n"
|
|
|
|
|
|
def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Ask the LLM for a podcast script about the lecture in ``lang``.

    The prompt comes from ``build_prompt``.  The model call is best-effort:
    any exception it raises is caught and an error message is returned in
    place of the script, so the caller always receives a string.

    Args:
        lecture_text: Lecture text forwarded to ``build_prompt``.
        lang: Language label forwarded to ``build_prompt``.

    Returns:
        The stripped completion text, or
        ``"Error generating dialogue in <lang>: <exception>"`` on failure.
    """
    # Prompt construction sits outside the try block, so its errors propagate.
    prompt = build_prompt(lecture_text, lang)
    try:
        return llm.complete(prompt).text.strip()
    except Exception as e:
        return f"Error generating dialogue in {lang}: {e}"
|
|
|
|
|
|
def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesise ``text`` to speech and save it in the system temp directory.

    Args:
        lang: A key of ``LANG_CONFIG``; its ``tts_lang`` code is forwarded to
            the TTS pipeline.
        text: The text to be spoken.

    Returns:
        ``(path, mime)``.  On success ``path`` is a freshly written ``.wav``
        file and ``mime`` is ``"audio/wav"``.  If synthesis or writing fails,
        ``path`` is a ``.txt`` file holding the error message and ``mime`` is
        ``"text/plain"``.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANG_CONFIG`` (the lookup
            happens before the guarded section).
    """
    language_code = LANG_CONFIG[lang]["tts_lang"]
    out_dir = tempfile.gettempdir()
    # A random suffix keeps concurrent requests from overwriting each other.
    wav_path = os.path.join(out_dir, f"{lang}_{uuid.uuid4().hex}.wav")

    try:
        result = tts_pipeline(text, forward_params={"language": language_code})
        sf.write(wav_path, result["audio"], result["sampling_rate"])
    except Exception as e:
        # Best-effort: leave a readable error report instead of raising.
        report_path = os.path.join(out_dir, f"err_{uuid.uuid4().hex}.txt")
        with open(report_path, "w", encoding="utf-8") as fh:
            fh.write(f"TTS error for {lang}: {e}\n")
        return report_path, "text/plain"

    return wav_path, "audio/wav"
|
|
|
|
|
|
def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Full pipeline returning a dict: language → (file_path, mime).

    Extracts the lecture text once, then for every language in
    ``LANG_CONFIG`` generates a dialogue and converts it to speech.

    Args:
        pdf_file: The uploaded PDF: either a filesystem path or an object
            exposing that path through a ``name`` attribute.

    Returns:
        A mapping from each ``LANG_CONFIG`` key to the ``(path, mime)`` tuple
        produced by ``tts_for_dialogue``.

    Raises:
        gr.Error: If no file was supplied, or if no text could be extracted
            from it.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")

    # `extract_text` is annotated to take a path string, so unwrap upload
    # objects that carry their path in `.name`; plain paths pass through.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    lecture_text = extract_text(pdf_path)

    # Without any lecture text the LLM would have nothing to discuss, so stop
    # before spending five generation + synthesis rounds on an empty prompt.
    if not lecture_text.strip():
        raise gr.Error("No extractable text was found in the uploaded PDF.")

    audio_outputs = {}
    for lang in LANG_CONFIG:
        dialogue = generate_dialogue(lecture_text, lang)
        audio_outputs[lang] = tts_for_dialogue(lang, dialogue)
    return audio_outputs
|
|
|
|
|
|
|
|
|
|
# Gradio UI: one PDF upload, one button, and one audio player per language.
with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        "# 📚🎙️ Lecture → Podcast\n"
        "Upload a lecture PDF and receive a two‑host audio podcast generated "
        "**directly** in five languages using Qwen for text and XTTS‑v2 for "
        "speech.\n"
    )
    with gr.Row():
        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
        btn = gr.Button("Generate Podcast")
    with gr.Group():
        # Players are created in LANG_CONFIG order; gradio_wrapper returns
        # its values in the same order.
        audio_components = [
            gr.Audio(label=lang, interactive=False, type="filepath")
            for lang in LANG_CONFIG
        ]

    def gradio_wrapper(pdf_file):
        """Run the pipeline and return one value per audio component.

        Args:
            pdf_file: The value of the PDF upload component.

        Returns:
            A list aligned with ``audio_components``: the audio file path for
            each language, or ``None`` where no audio was produced.
        """
        results = pipeline_runner(pdf_file)
        # tts_for_dialogue() reports failures as a "text/plain" file; only
        # real audio may be handed to gr.Audio, so failed languages are left
        # empty instead of pointing the player at a text file.
        return [
            path if mime.startswith("audio/") else None
            for path, mime in (results[lang] for lang in LANG_CONFIG)
        ]

    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)


if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|