|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import tempfile |
|
import uuid |
|
import textwrap |
|
from typing import List, Dict |
|
|
|
import gradio as gr |
|
from PyPDF2 import PdfReader |
|
from smolagents import HfApiModel |
|
from TTS.api import TTS |
|
|
|
|
|
|
|
|
|
llm = HfApiModel( |
|
model_id="Qwen/Qwen2.5-Coder-32B-Instruct", |
|
max_tokens=2096, |
|
temperature=0.5, |
|
custom_role_conversions=None, |
|
) |
|
|
|
|
|
|
|
|
|
TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2" |
|
|
|
tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False) |
|
|
|
|
|
LANG_INFO: Dict[str, Dict[str, str]] = { |
|
"en": {"name": "English"}, |
|
"bn": {"name": "Bangla"}, |
|
"zh": {"name": "Chinese"}, |
|
"ur": {"name": "Urdu"}, |
|
"ne": {"name": "Nepali"}, |
|
} |
|
|
|
PROMPT_TEMPLATE = textwrap.dedent( |
|
""" |
|
You are producing a lively two‑host educational podcast in {lang_name}. |
|
Summarize the following lecture content into a dialogue of about 1200 words. |
|
Use an engaging style: hosts ask each other questions, clarify ideas, add |
|
simple analogies, and conclude with a short recap. Keep technical accuracy. |
|
|
|
### Lecture Content |
|
{content} |
|
""" |
|
) |
|
|
|
|
|
|
|
|
|
|
|
def extract_pdf_text(pdf_file) -> str: |
|
reader = PdfReader(pdf_file) |
|
return "\n".join(p.extract_text() or "" for p in reader.pages) |
|
|
|
TOKEN_LIMIT = 6000 |
|
|
|
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str: |
|
words = text.split() |
|
return " ".join(words[:limit]) |
|
|
|
|
|
|
|
|
|
|
|
def generate_podcast(pdf: gr.File) -> List[gr.Audio]: |
|
with tempfile.TemporaryDirectory() as tmpdir: |
|
lecture_text = truncate_text(extract_pdf_text(pdf.name)) |
|
audio_outputs = [] |
|
|
|
for lang_code, info in LANG_INFO.items(): |
|
|
|
prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text) |
|
dialogue = llm(prompt) |
|
|
|
|
|
txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt") |
|
with open(txt_path, "w", encoding="utf-8") as f: |
|
f.write(dialogue) |
|
|
|
|
|
wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav") |
|
|
|
tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path) |
|
|
|
audio_outputs.append((wav_path, None)) |
|
|
|
return audio_outputs |
|
|
|
|
|
|
|
|
|
|
|
audio_components = [ |
|
gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values() |
|
] |
|
|
|
iface = gr.Interface( |
|
fn=generate_podcast, |
|
inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]), |
|
outputs=audio_components, |
|
title="Lecture → Multilingual Podcast Generator", |
|
description=( |
|
"Upload a lecture PDF and receive a two‑host audio podcast in English, " |
|
"Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the " |
|
"dialogue and Coqui XTTS‑v2 for speech synthesis — no private repos " |
|
"or API keys needed." |
|
), |
|
) |
|
|
|
if __name__ == "__main__": |
|
iface.launch() |
|
|