|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import re |
|
import tempfile |
|
import textwrap |
|
from pathlib import Path |
|
from typing import List, Optional, Any |
|
|
|
import gradio as gr |
|
from PyPDF2 import PdfReader |
|
from pydub import AudioSegment |
|
from pydub.exceptions import CouldntDecodeError |
|
|
|
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
try: |
|
import google.generativeai as genai |
|
except ImportError: |
|
raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai") |
|
|
|
|
|
|
|
|
|
|
|
# Prompt sent to Gemini; {content} is filled with the (truncated) lecture text.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in English.
    Summarize the following lecture content into a dialogue of approximately 300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
    Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).

    ### Lecture Content
    {content}
    """
)


# Hugging Face model id used for text-to-speech synthesis.
HF_TTS_MODEL = "facebook/mms-tts-eng"

# Maximum characters per TTS request; text is chunked to this size.
CHUNK_CHAR_LIMIT = 280


# Module-level HF Inference API client, created once at import time.
# No token is passed explicitly; presumably it picks up HF_TOKEN from the
# environment — TODO confirm against deployment config.
tts_client = InferenceClient()
|
|
|
|
|
|
|
|
|
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF at *pdf_path*, newline-joined.

    Pages with no extractable text contribute an empty string.
    """
    parts: List[str] = []
    for page in PdfReader(pdf_path).pages:
        parts.append(page.extract_text() or "")
    return "\n".join(parts)
|
|
|
def truncate_text(text: str, max_words: int = 8000) -> str:
    """Return *text* limited to its first *max_words* whitespace-separated words.

    Note: runs of whitespace collapse to single spaces in the result.
    """
    limited_words = text.split()[:max_words]
    return " ".join(limited_words)
|
|
|
def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split *text* into chunks of at most *limit* characters.

    Splits at sentence boundaries (after ``.``, ``!`` or ``?``) where
    possible. Any single sentence longer than *limit* is hard-split, so the
    ≤limit guarantee always holds — the original implementation emitted such
    sentences as one over-long chunk, which the TTS caller cannot accept.

    Args:
        text: Input text to split.
        limit: Maximum characters per chunk.

    Returns:
        List of non-empty chunks, each at most *limit* characters long.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    chunks: List[str] = []
    current = ""
    for sent in sentences:
        # Hard-split pathological sentences that alone exceed the limit.
        while len(sent) > limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(sent[:limit])
            sent = sent[limit:].lstrip()
        if not sent:
            continue
        if current and len(current) + len(sent) + 1 > limit:
            chunks.append(current)
            current = sent
        else:
            current = f"{current} {sent}".strip() if current else sent
    if current:
        chunks.append(current)
    return chunks
|
|
|
def synthesize_speech(text: str, model_id: str, out_dir: Path) -> Path:
    """Synthesize *text* into a single FLAC file via the HF Inference API.

    The text is split into ≤CHUNK_CHAR_LIMIT-char chunks, each chunk is
    synthesized separately, and the decoded segments are concatenated into
    ``out_dir/podcast_audio.flac``.

    Args:
        text: Full script to synthesize.
        model_id: Hugging Face TTS model id.
        out_dir: Directory that receives the per-chunk and final FLAC files.

    Returns:
        Path to the concatenated FLAC file.

    Raises:
        ValueError: If *text* yields no chunks.
        RuntimeError: If a TTS request fails or a segment cannot be decoded.
    """
    chunks = split_to_chunks(text)
    if not chunks:
        raise ValueError("No text to synthesize.")
    segments = []
    for i, chunk in enumerate(chunks):
        try:
            audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
        except Exception as e:
            # Chain the cause (`from e`) so the underlying HTTP/client error
            # is preserved in the traceback — the original dropped it.
            raise RuntimeError(f"TTS failed on chunk {i+1}: {e}") from e
        part_path = out_dir / f"seg_{i}.flac"
        part_path.write_bytes(audio_bytes)
        try:
            seg = AudioSegment.from_file(part_path, format="flac")
            segments.append(seg)
        except CouldntDecodeError as e:
            raise RuntimeError(f"Could not decode segment {i+1}: {e}") from e

    # Concatenate all decoded segments and export one FLAC file.
    final = sum(segments, AudioSegment.empty())
    out_path = out_dir / "podcast_audio.flac"
    final.export(out_path, format="flac")
    return out_path
|
|
|
|
|
|
|
|
|
def generate_podcast(
    gemini_api_key: Optional[str],
    lecture_pdf: Optional[gr.File]
) -> List[Optional[Any]]:
    """Gradio handler: turn a lecture PDF into a podcast script and audio.

    Pipeline: extract PDF text → truncate to fit the LLM context → have
    Gemini write a two-host dialogue → synthesize it with HF MMS-TTS.

    Args:
        gemini_api_key: Google AI Studio API key (required).
        lecture_pdf: Uploaded PDF file component value (required).

    Returns:
        ``[audio_file_path, script_markdown, script_file_path]`` matching the
        three Gradio outputs.

    Raises:
        gr.Error: On missing inputs, empty PDF text, or any pipeline failure.
    """
    if not gemini_api_key:
        raise gr.Error("Enter your Google AI Studio API Key.")
    if not lecture_pdf:
        raise gr.Error("Upload a lecture PDF file.")

    genai.configure(api_key=gemini_api_key)

    raw = extract_pdf_text(lecture_pdf.name)
    content = truncate_text(raw)
    if not content.strip():
        raise gr.Error("Lecture PDF contained no extractable text.")

    try:
        gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")
    except Exception as e:
        raise gr.Error(f"Gemini init failed: {e}")

    prompt = PROMPT_TEMPLATE.format(content=content)
    try:
        resp = gemini_model.generate_content(prompt)
        script = resp.text or ""
    except Exception as e:
        raise gr.Error(f"Gemini generation error: {e}")

    # BUG FIX: the original wrapped this in `with tempfile.TemporaryDirectory()`,
    # which deletes the directory — and the audio/script files whose paths we
    # return — as soon as the block exits, before Gradio can serve them.
    # mkdtemp() persists, so the returned file paths remain valid.
    tmp = Path(tempfile.mkdtemp(prefix="podcast_"))

    script_path = tmp / "podcast_script.txt"
    script_path.write_text(script, encoding="utf-8")

    try:
        audio_path = synthesize_speech(script, HF_TTS_MODEL, tmp)
    except Exception as e:
        raise gr.Error(f"Speech synthesis error: {e}")

    return [str(audio_path), script, str(script_path)]
|
|
|
|
|
|
|
|
|
# Gradio UI wiring: the two inputs map positionally onto generate_podcast's
# parameters; the three outputs match its returned [audio, script, file] list.
iface = gr.Interface(
    fn=generate_podcast,
    inputs=[
        gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Paste your key"),
        gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    ],
    outputs=[
        gr.Audio(label="English Podcast", type="filepath"),
        gr.Markdown(label="English Script"),
        gr.File(label="Download English Script (.txt)", type="filepath"),
    ],
    title="Lecture → English Podcast & Script",
    description=(
        "Enter your Gemini API Key and upload a lecture PDF. "
        "Generates a two-host podcast audio and a Markdown script in English "
        "using Google Gemini for text and Hugging Face MMS-TTS for audio."
    ),
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases
    # (replaced by flagging_mode="never") — confirm against the pinned version.
    allow_flagging="never",
)
|
|
|
# Launch the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|
|