|
|
|
|
|
|
|
|
|
import html
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import Any, Dict, List, Optional

import google.generativeai as genai
import gradio as gr
from huggingface_hub import InferenceClient
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
# Hugging Face Inference API token (optional). When unset, no TTS client is
# created and the app produces scripts without audio.
hf_token = os.getenv("HF_TOKEN")

# Shared inference client used for text-to-speech; None disables synthesis.
hf_tts_client: Optional[InferenceClient] = InferenceClient(token=hf_token) if hf_token else None
|
|
|
|
|
# Supported output languages: ISO 639-1 code -> UI display name and the
# Hugging Face MMS text-to-speech model used for that language.
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}

# Reverse lookup: display name (as shown in the UI checkboxes) -> language code.
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
|
|
|
|
|
# Prompt sent to Gemini for each selected language. Placeholders filled via
# str.format: {lang_name} (display name) and {content} (truncated PDF text).
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **approximately 300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.

    ### Lecture Content
    {content}
    """
)
|
|
|
|
|
# Maximum number of whitespace-separated words forwarded to the LLM — a rough
# word-count proxy for the model's token budget (used by truncate_text).
TOKEN_LIMIT = 8000
|
|
|
def extract_pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path*, newline-joined.

    Pages with no extractable text contribute an empty line.
    """
    reader = PdfReader(path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)
|
|
|
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cut *text* to at most *limit* whitespace-separated words.

    Text at or under the limit is returned unchanged (original whitespace
    preserved); longer text is rejoined with single spaces.
    """
    words = text.split()
    if len(words) <= limit:
        return text
    return " ".join(words[:limit])
|
|
|
|
|
# Per-request character budget for the TTS endpoint.
CHUNK_CHAR_LIMIT = 280


def split_chunks(text: str) -> List[str]:
    """Split *text* into sentence-aligned chunks of roughly CHUNK_CHAR_LIMIT chars.

    Sentences (ending in ., ! or ?) are never broken apart, so a single
    sentence longer than the limit becomes its own oversized chunk.
    Returns an empty list for empty/whitespace-only input.
    """
    pieces: List[str] = []
    buffer = ""
    for sentence in re.split(r"(?<=[.!?])\s+", text.strip()):
        if not buffer:
            buffer = sentence
        elif len(buffer) + len(sentence) + 1 > CHUNK_CHAR_LIMIT:
            # Adding this sentence would overflow the budget: flush and restart.
            pieces.append(buffer)
            buffer = sentence
        else:
            buffer = f"{buffer} {sentence}"
    if buffer:
        pieces.append(buffer)
    return pieces
|
|
|
|
|
|
|
def synthesize(text: str, model_id: str, outdir: Path) -> str:
    """Render *text* to speech with the HF TTS model and return the FLAC path.

    The text is synthesized chunk by chunk (the Inference API limits request
    size), each response is written to disk, decoded, and all segments are
    concatenated in order into ``outdir/podcast.flac``.

    Args:
        text: script to synthesize.
        model_id: Hugging Face TTS model id (e.g. "facebook/mms-tts-eng").
        outdir: directory for intermediate parts and the final file; created
            if missing.

    Raises:
        RuntimeError: if no Hugging Face client is configured (HF_TOKEN unset).
    """
    if hf_tts_client is None:
        raise RuntimeError("HF_TOKEN is not set; text-to-speech is unavailable.")
    # Bug fix: callers pass a per-language subdirectory that does not exist
    # yet — write_bytes below would raise FileNotFoundError without this.
    outdir.mkdir(parents=True, exist_ok=True)
    segments = []
    for i, chunk in enumerate(split_chunks(text)):
        audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
        path = outdir / f"part{i}.flac"
        path.write_bytes(audio_bytes)
        segments.append(AudioSegment.from_file(path, format="flac"))
    # Start from an empty segment so empty input still exports a valid file.
    final = sum(segments, AudioSegment.empty())
    out = outdir / "podcast.flac"
    final.export(out, format="flac")
    return str(out)
|
|
|
|
|
|
|
def generate_podcast(
    gemini_key: str,
    pdf_file: gr.File,
    langs: List[str]
) -> List[Optional[Any]]:
    """Produce podcast audio and scripts for each selected language.

    Args:
        gemini_key: Google AI Studio API key used to configure Gemini.
        pdf_file: uploaded lecture PDF (Gradio file wrapper; ``.name`` is the
            temp path on disk).
        langs: display names of the languages selected in the UI.

    Returns:
        A flat list with three entries per language in LANG_INFO order —
        (audio filepath or None, HTML-wrapped script, script file path) —
        with [None, None, None] placeholders for unselected languages.
        Audio is None when no HF_TOKEN is configured.

    Raises:
        gr.Error: if the key, PDF, or language selection is missing.
    """
    if not gemini_key:
        raise gr.Error("Enter Google AI Studio API Key.")
    if not pdf_file:
        raise gr.Error("Upload a PDF file.")
    if not langs:
        raise gr.Error("Select at least one language.")

    genai.configure(api_key=gemini_key)
    content = truncate_text(extract_pdf_text(pdf_file.name))

    tmp = Path(tempfile.mkdtemp())
    # One model instance suffices; previously it was re-created per language.
    model = genai.GenerativeModel('gemini-1.5-flash-latest')

    results: List[Optional[Any]] = []
    for code, info in LANG_INFO.items():
        if info["name"] not in langs:
            # Keep slots aligned with the Interface's per-language outputs.
            results.extend([None, None, None])
            continue

        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
        resp = model.generate_content(prompt)
        script = resp.text.strip()

        script_path = tmp / f"script_{code}.txt"
        script_path.write_text(script, encoding="utf-8")

        # Escape model output so <, > and & in the script cannot inject
        # markup or break the HTML component's rendering.
        html_script = f"<pre>{html.escape(script)}</pre>"

        audio_path = None
        if hf_tts_client:
            audio_path = synthesize(script, info["tts_model"], tmp / code)
        results.extend([audio_path, html_script, str(script_path)])
    return results
|
|
|
|
|
# UI inputs: API key (masked), PDF upload, and language multi-select
# (display names; English pre-checked).
inputs = [
    gr.Textbox(label="Google AI Studio API Key", type="password"),
    gr.File(label="Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(choices=[info["name"] for info in LANG_INFO.values()],
                     value=["English"], label="Languages")
]
# Three output components per language, in LANG_INFO order — must stay
# aligned with the triples generate_podcast appends to its result list.
outputs = []
for code, info in LANG_INFO.items():
    outputs.append(gr.Audio(label=f"{info['name']} Podcast", type="filepath"))
    outputs.append(gr.HTML(label=f"{info['name']} Script HTML"))
    outputs.append(gr.File(label=f"Download {info['name']} Script"))

# Wire inputs/outputs to the generator function.
iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script",
)
|
|
|
# Launch the Gradio app only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|
|