File size: 4,571 Bytes
f1adb14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# Uses SmolAgents HfApiModel for text generation and HF audio
# pipeline for speech. Generates two‑host dialogues in five
# languages (English, Bangla, Chinese, Urdu, Nepali) directly
# from a PDF lecture upload.
# -----------------------------------------------------------------
import os
import tempfile
import uuid
import textwrap
from typing import List, Dict
import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline # for audio generation (e.g., xtts)
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
# ------------------------------------------------------------------
# LLM configuration (SmolAgents wrapper for HF Inference API)
# ------------------------------------------------------------------
llm = HfApiModel(
model_id='Qwen/Qwen2.5-Coder-32B-Instruct', # 34B parameter instruct model
max_tokens=2096,
temperature=0.5,
custom_role_conversions=None,
)
# ------------------------------------------------------------------
# Audio model (multilingual text ➜ speech); choose an open xtts‑v2
# model that supports our languages. Switch model id if you prefer.
# ------------------------------------------------------------------
audio_pipe = pipeline(
"text-to-audio",
model="suno/xtts_v2",
framework="pt",
)
LANG_INFO: Dict[str, Dict[str, str]] = {
"en": {"name": "English", "speaker": "hostA"},
"bn": {"name": "Bangla", "speaker": "hostB"},
"zh": {"name": "Chinese", "speaker": "hostC"},
"ur": {"name": "Urdu", "speaker": "hostD"},
"ne": {"name": "Nepali", "speaker": "hostE"},
}
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two‑host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of about 1200 words.
Use an engaging style: hosts ask each other questions, clarify ideas, add
simple analogies, and conclude with a short recap. Keep technical accuracy.
### Lecture Content
{content}
"""
)
# ------------------------------------------------------------------
# Utility: extract & truncate PDF text to fit LLM token budget
# ------------------------------------------------------------------
def extract_pdf_text(pdf_file) -> str:
reader = PdfReader(pdf_file)
raw = "\n".join(p.extract_text() or "" for p in reader.pages)
return raw
TOKEN_LIMIT = 6000 # conservative words (≈ tokens) for prompt+response
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
words = text.split()
return " ".join(words[:limit])
# ------------------------------------------------------------------
# Main generation function
# ------------------------------------------------------------------
def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
with tempfile.TemporaryDirectory() as tmpdir:
lecture_text = truncate_text(extract_pdf_text(pdf.name))
audio_outputs = []
for lang_code, info in LANG_INFO.items():
prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
# --- Generate dialogue ---
dialogue = llm(prompt)
# Save text for transparency/debug
text_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
with open(text_path, "w", encoding="utf-8") as f:
f.write(dialogue)
# --- TTS ---
audio = audio_pipe(dialogue, forward_params={"language": lang_code})
wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
audio["audio"].export(wav_path, format="wav")
audio_outputs.append((wav_path, None)) # Gradio Audio expects (file, label)
return audio_outputs
# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------
audio_components = [gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()]
iface = gr.Interface(
fn=generate_podcast,
inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
outputs=audio_components,
title="Lecture → Multilingual Podcast Generator",
description="Upload a lecture PDF and get a two‑host audio podcast in English, Bangla, Chinese, Urdu, and Nepali."
)
if __name__ == "__main__":
iface.launch()
|