File size: 5,186 Bytes
53744b5 764a881 53744b5 f1adb14 fe00684 f1adb14 50d2a40 53744b5 f1adb14 53744b5 f036ad8 f1adb14 764a881 1425202 369b2d2 764a881 369b2d2 764a881 369b2d2 764a881 f1adb14 764a881 f1adb14 369b2d2 f1adb14 c172b12 764a881 f1adb14 c565171 53744b5 50d2a40 764a881 c565171 f1adb14 764a881 369b2d2 53744b5 764a881 f1adb14 764a881 fe00684 764a881 f1adb14 53744b5 764a881 53744b5 764a881 c172b12 764a881 c172b12 9e251c5 764a881 f1adb14 c172b12 fe00684 764a881 f1adb14 764a881 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# =============================================================
# Lecture → Podcast & Script Generator (Gemini + HF TTS)
# Modified: Script outputs rendered as HTML
# =============================================================
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional, Any
import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# Hugging Face TTS
from huggingface_hub import InferenceClient
# Google Gemini
import google.generativeai as genai
# ------------------------------------------------------------------
# HF TTS client
# ------------------------------------------------------------------
# Read the Hugging Face access token from the environment. The TTS client is
# only created when a token is present; otherwise it stays None and callers
# must skip speech synthesis.
hf_token = os.getenv("HF_TOKEN")
hf_tts_client: Optional[InferenceClient] = None
if hf_token:
    hf_tts_client = InferenceClient(token=hf_token)
# Language metadata
# Supported output languages, keyed by short language code. Each entry holds
# the display name shown in the UI and the TTS model id used for that language.
# Insertion order matters: the UI output widgets are laid out in this order.
# NOTE(review): the model ids are taken as-is; confirm each one exists on the
# Hugging Face Hub.
LANG_INFO: Dict[str, Dict[str, str]] = {
    lang_code: {"name": display_name, "tts_model": tts_model}
    for lang_code, display_name, tts_model in (
        ("en", "English", "facebook/mms-tts-eng"),
        ("bn", "Bangla", "facebook/mms-tts-ben"),
        ("zh", "Chinese", "facebook/mms-tts-zho"),
        ("ur", "Urdu", "facebook/mms-tts-urd"),
        ("ne", "Nepali", "facebook/mms-tts-npi"),
    )
}

# Reverse lookup: display name -> language code.
LANG_CODE_BY_NAME = {
    entry["name"]: lang_code for lang_code, entry in LANG_INFO.items()
}
# Prompt template
# Prompt text for the script-generation model. It has two str.format
# placeholders: {lang_name} (display name of the target language) and
# {content} (the lecture text). textwrap.dedent strips any common leading
# whitespace from the literal.
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two-host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of **approximately 300 words**.
Make it engaging: hosts ask questions, clarify ideas with analogies, and
wrap up with a concise recap. Preserve technical accuracy.
### Lecture Content
{content}
"""
)
# PDF extraction
# Default cap on lecture length, counted in whitespace-separated words.
TOKEN_LIMIT = 8000


def extract_pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path*, joined by newlines.

    A page whose ``extract_text()`` result is falsy (``None`` or empty)
    contributes an empty string, so the number of joined parts always equals
    the number of pages.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cap *text* at *limit* whitespace-separated words.

    Text that is already within the limit is returned unchanged (original
    whitespace preserved). Longer text is cut to its first *limit* words,
    re-joined with single spaces.
    """
    words = text.split()
    if len(words) <= limit:
        return text
    return " ".join(words[:limit])
# TTS chunking
# Maximum number of characters sent to the TTS model in a single request.
CHUNK_CHAR_LIMIT = 280

# Sentence boundaries:
#   * after ASCII . ! ? or the danda (U+0964, Bangla/Nepali) or the Arabic
#     full stop (U+06D4, Urdu) when followed by whitespace;
#   * after full-width CJK punctuation (U+3002, U+FF01, U+FF1F) even with no
#     whitespace, since Chinese text does not put spaces between sentences.
_SENTENCE_BOUNDARY = re.compile(
    r"(?<=[.!?\u0964\u06d4])\s+|(?<=[\u3002\uff01\uff1f])\s*"
)


def _split_oversized(piece: str, limit: int) -> List[str]:
    """Break *piece* into parts of at most *limit* chars, preferring word gaps."""
    parts: List[str] = []
    curr = ""
    for word in piece.split():
        # A single "word" longer than the limit (e.g. unspaced CJK text) has
        # no natural break point, so it is cut at fixed width.
        while len(word) > limit:
            if curr:
                parts.append(curr)
                curr = ""
            parts.append(word[:limit])
            word = word[limit:]
        if curr and len(curr) + 1 + len(word) > limit:
            parts.append(curr)
            curr = word
        else:
            curr = f"{curr} {word}" if curr else word
    if curr:
        parts.append(curr)
    return parts


def split_chunks(text: str) -> List[str]:
    """Split *text* into chunks of at most ``CHUNK_CHAR_LIMIT`` characters.

    The text is cut at sentence boundaries and consecutive sentences are
    packed greedily into one chunk (joined by a single space) while they fit.
    A sentence that alone exceeds the limit is further split at word
    boundaries, and as a last resort at fixed width, so no returned chunk is
    ever longer than the limit.

    Args:
        text: The script to split; leading/trailing whitespace is ignored.

    Returns:
        The chunks in reading order. Empty or whitespace-only input yields
        an empty list.
    """
    chunks: List[str] = []
    curr = ""
    for sentence in _SENTENCE_BOUNDARY.split(text.strip()):
        if not sentence:
            # Zero-width CJK boundaries can leave an empty trailing piece.
            continue
        if len(sentence) > CHUNK_CHAR_LIMIT:
            pieces = _split_oversized(sentence, CHUNK_CHAR_LIMIT)
        else:
            pieces = [sentence]
        for piece in pieces:
            if curr and len(curr) + len(piece) + 1 > CHUNK_CHAR_LIMIT:
                chunks.append(curr)
                curr = piece
            else:
                curr = f"{curr} {piece}" if curr else piece
    if curr:
        chunks.append(curr)
    return chunks
# Synthesize speech
def synthesize(text: str, model_id: str, outdir: Path) -> str:
    """Convert *text* to speech and return the path of the combined audio file.

    The text is split with ``split_chunks``; each chunk is sent to the
    Hugging Face TTS client, saved as ``part<i>.flac`` inside *outdir*,
    decoded, and the decoded segments are concatenated in order into
    ``podcast.flac``.

    Args:
        text: The script to read aloud.
        model_id: Hub id of the TTS model to use.
        outdir: Directory for the per-chunk files and the final audio; it is
            created if it does not exist yet.

    Returns:
        The path of ``podcast.flac`` as a string.

    Raises:
        RuntimeError: If no TTS client is configured (``HF_TOKEN`` unset).
    """
    if hf_tts_client is None:
        raise RuntimeError("HF_TOKEN is not set; text-to-speech is unavailable.")

    outdir = Path(outdir)
    # The caller hands over a not-yet-existing sub-directory of a temp dir;
    # without this, writing the first part file fails with FileNotFoundError.
    outdir.mkdir(parents=True, exist_ok=True)

    segments = []
    for i, chunk in enumerate(split_chunks(text)):
        audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
        part_path = outdir / f"part{i}.flac"
        # NOTE(review): the returned bytes are assumed to be FLAC-encoded,
        # matching the decode format below -- confirm against the endpoint.
        part_path.write_bytes(audio_bytes)
        segments.append(AudioSegment.from_file(part_path, format="flac"))

    final = sum(segments, AudioSegment.empty())
    out = outdir / "podcast.flac"
    final.export(out, format="flac")
    return str(out)
# Main pipeline
def generate_podcast(
    gemini_key: str,
    pdf_file: gr.File,
    langs: List[str]
) -> List[Optional[Any]]:
    """Generate a podcast script (and audio, when TTS is configured) per language.

    Args:
        gemini_key: Google AI Studio API key used to configure Gemini.
        pdf_file: The uploaded lecture PDF, either an object exposing the
            file path as ``.name`` or the file path itself as a string.
        langs: Display names of the languages to generate (see ``LANG_INFO``).

    Returns:
        A flat list with three entries for every language in ``LANG_INFO``,
        in that order: audio file path (``None`` when no TTS client is
        configured), the script as an HTML ``<pre>`` snippet, and the path of
        the plain-text script file. Languages that were not selected
        contribute ``None, None, None``.

    Raises:
        gr.Error: If the API key, the PDF, or the language selection is missing.
    """
    # Local import so this function does not depend on a module-level import.
    import html

    if not gemini_key:
        raise gr.Error("Enter Google AI Studio API Key.")
    if not pdf_file:
        raise gr.Error("Upload a PDF file.")
    if not langs:
        raise gr.Error("Select at least one language.")

    genai.configure(api_key=gemini_key)

    # Accept both a file wrapper with a .name attribute and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    content = truncate_text(extract_pdf_text(pdf_path))

    tmp = Path(tempfile.mkdtemp())
    # The model handle is the same for every language, so build it once.
    model = genai.GenerativeModel('gemini-1.5-flash-latest')

    results: List[Optional[Any]] = []
    for code, info in LANG_INFO.items():
        if info["name"] not in langs:
            # Keep the output list aligned with the three widgets per language.
            results.extend([None, None, None])
            continue

        # Generate script
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
        script = model.generate_content(prompt).text.strip()

        # Save plain text
        script_path = tmp / f"script_{code}.txt"
        script_path.write_text(script, encoding="utf-8")

        # Render HTML version; escape the model output so characters such as
        # "<" or "&" are shown literally instead of being parsed as markup.
        html_script = f"<pre>{html.escape(script)}</pre>"

        # Synthesize audio if available
        audio_path = None
        if hf_tts_client:
            # The per-language directory must exist before audio is written.
            lang_dir = tmp / code
            lang_dir.mkdir(parents=True, exist_ok=True)
            audio_path = synthesize(script, info["tts_model"], lang_dir)

        results.extend([audio_path, html_script, str(script_path)])
    return results
# Interface
# Input widgets, in the positional order generate_podcast receives them:
# API key, lecture PDF, selected language names.
inputs = [
    gr.Textbox(label="Google AI Studio API Key", type="password"),
    gr.File(label="Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=[entry["name"] for entry in LANG_INFO.values()],
        value=["English"],
        label="Languages",
    ),
]

# Three output widgets per language (audio player, HTML script, script
# download), in LANG_INFO order to match the list generate_podcast returns.
outputs = []
for lang_name in (entry["name"] for entry in LANG_INFO.values()):
    outputs.extend(
        [
            gr.Audio(label=f"{lang_name} Podcast", type="filepath"),
            gr.HTML(label=f"{lang_name} Script HTML"),
            gr.File(label=f"Download {lang_name} Script"),
        ]
    )

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script",
)

if __name__ == "__main__":
    iface.launch()
|