import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional
import gradio as gr
from huggingface_hub import InferenceClient
from huggingface_hub.utils import HfHubHTTPError  # Raised by Inference API calls
from PyPDF2 import PdfReader # For PDF processing
from smolagents import HfApiModel # For LLM interaction
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# ------------------------------------------------------------------
# LLM setup – remote Qwen model via SmolAgents
# ------------------------------------------------------------------
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2048,
    temperature=0.5,
)
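# max_tokens bounds the length of each generated dialogue; temperature=0.5 is a
# middle-of-the-road setting intended to keep the script focused but conversational.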
# ------------------------------------------------------------------
# Hugging Face Inference API client
# ------------------------------------------------------------------
client = InferenceClient(token=os.getenv("HF_TOKEN", None))
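# The client picks up HF_TOKEN from the environment when set; without a token,
# Inference API calls may be subject to stricter rate limits.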
# ------------------------------------------------------------------
# Language metadata and open TTS models
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of ~300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.
    ### Lecture Content
    {content}
    """
)
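# TOKEN_LIMIT is a rough word-count cap on the extracted lecture text (not true
# model tokens); CHUNK_CHAR_LIMIT keeps each text_to_speech request short, which
# the hosted MMS TTS models tend to handle more reliably.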
TOKEN_LIMIT = 8000
CHUNK_CHAR_LIMIT = 280
# ------------------------------------------------------------------
# PDF text extraction
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Extract raw text from every page of the PDF."""
    try:
        reader = PdfReader(pdf_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        raise gr.Error(f"Failed to process PDF: {e}")
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cap the lecture at roughly `limit` whitespace-separated words."""
    words = text.split()
    if len(words) > limit:
        return " ".join(words[:limit])
    return text
def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split text into sentence-aligned chunks of at most ~`limit` characters."""
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    chunks, current = [], ""
    for sent in sentences:
        if current and len(current) + len(sent) + 1 > limit:
            chunks.append(current)
            current = sent
        else:
            current = f"{current} {sent}".strip()
    if current:
        chunks.append(current)
    return chunks
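# Illustrative behaviour (not executed): with limit=30,
#   "First sentence. Second one! Third?"  ->  ["First sentence. Second one!", "Third?"]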
def synthesize_speech(text: str, model_id: str, tempdir: Path) -> Path:
    """Run TTS on each chunk and concatenate the segments into one FLAC file."""
    chunks = _split_to_chunks(text)
    if not chunks:
        raise ValueError("No text chunks to synthesize.")
    segments = []
    for i, chunk in enumerate(chunks):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        except HfHubHTTPError as e:
            raise RuntimeError(f"TTS error on chunk {i}: {e}")
        part = tempdir / f"seg_{i}.flac"
        part.write_bytes(audio_bytes)
        try:
            seg = AudioSegment.from_file(part, format="flac")
        except CouldntDecodeError as e:
            raise RuntimeError(f"Decode error on chunk {i}: {e}")
        segments.append(seg)
    combined = sum(segments, AudioSegment.empty())
    outpath = tempdir / "podcast.flac"
    combined.export(outpath, format="flac")
    return outpath
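# Hypothetical usage: synthesize_speech("Hello there.", "facebook/mms-tts-eng", Path("/tmp/en"))
# would write /tmp/en/seg_0.flac (a single chunk) and return /tmp/en/podcast.flac.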
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(pdf_file: Optional[gr.File], languages: List[str]):
    """Build a dialogue transcript and an audio file for each selected language."""
    if not pdf_file:
        raise gr.Error("Please upload a PDF file.")
    if not languages:
        raise gr.Error("Select at least one language.")

    # Extract and truncate. gr.File may hand over a tempfile-like object or a
    # plain path depending on the Gradio version; handle both.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_pdf_text(pdf_path)
    if not text.strip():
        raise gr.Error("No text found in PDF.")
    lecture = truncate_text(text)

    # Use a persistent temp directory so Gradio can still read the audio files
    # after this function returns (a TemporaryDirectory would delete them).
    base = Path(tempfile.mkdtemp())
    results_by_code: Dict[str, tuple] = {}
    for name in languages:
        code = LANG_CODE_BY_NAME[name]
        # 1️⃣ Dialogue – HfApiModel expects a chat-style message list and
        # returns a message object exposing .content
        prompt = PROMPT_TEMPLATE.format(lang_name=name, content=lecture)
        dialogue = llm([{"role": "user", "content": prompt}]).content.strip()
        # 2️⃣ Speech
        tempdir = base / code
        tempdir.mkdir(parents=True, exist_ok=True)
        audio_path = synthesize_speech(dialogue, LANG_INFO[code]["tts_model"], tempdir)
        results_by_code[code] = (dialogue, str(audio_path))

    # Return one transcript and one audio path per language slot, matching the
    # fixed order of the Gradio outputs; unselected languages yield empty slots.
    results: List = []
    for code in LANG_INFO:
        dialogue, audio = results_by_code.get(code, (None, None))
        results.extend([dialogue, audio])
    return results
# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
languages = [info["name"] for info in LANG_INFO.values()]
inputs = [
    gr.File(label="Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(languages, value=["English"], label="Languages"),
]
# Two outputs per language: transcript and audio
outputs = []
for name in languages:
    outputs.append(gr.Textbox(label=f"{name} Transcript", interactive=False))
    outputs.append(gr.Audio(label=f"{name} Podcast", type="filepath"))
iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator",
    description="Upload a lecture PDF and select languages to get a dialogue transcript and an audio podcast for each.",
)
if __name__ == "__main__":
    iface.launch()