# =============================================================
# Lecture → Podcast & Script Generator (English Only)
# • Text: Google Gemini API (via UI-provided key)
# • Audio: Hugging Face InferenceClient.text_to_speech (public MMS-TTS for English)
# =============================================================
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Optional, Any
import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# Hugging Face TTS client (anonymous/public access)
from huggingface_hub import InferenceClient
# Google Gemini SDK
try:
    import google.generativeai as genai
except ImportError:
    raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
# ------------------------------------------------------------------
# Globals & templates
# ------------------------------------------------------------------
# Gemini prompt for ~300-word two-host dialogue in English
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in English.
    Summarize the following lecture content into a dialogue of approximately 300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
    Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
    ### Lecture Content
    {content}
    """
)
# TTS model ID for English MMS-TTS
HF_TTS_MODEL = "facebook/mms-tts-eng"
# Safe chunk size for HF text-to-speech
CHUNK_CHAR_LIMIT = 280
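# (Chunking rationale, as implied by the "chunk-safe" helper below: short requests are
# handled more reliably by the hosted TTS endpoint, so the script is synthesized in
# small pieces and stitched back together afterwards.)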
# Initialize HF TTS client (no token required for public models)
tts_client = InferenceClient()
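# Anonymous access works for public models; a token, if one is available, can be passed
# to help with rate limits, e.g.:
#   tts_client = InferenceClient(token=os.environ.get("HF_TOKEN"))  # HF_TOKEN is optional, not assumed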
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Extract all text from a PDF file."""
    reader = PdfReader(pdf_path)
    # extract_text() may return None for image-only pages, hence the "" fallback
    return "\n".join(page.extract_text() or "" for page in reader.pages)

def truncate_text(text: str, max_words: int = 8000) -> str:
    """Truncate to max_words so the prompt fits the LLM context window."""
    words = text.split()
    return " ".join(words[:max_words])
def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split text into ≤limit-char chunks at sentence boundaries."""
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    chunks, current = [], ""
    for sent in sentences:
        if current and len(current) + len(sent) + 1 > limit:
            chunks.append(current)
            current = sent
        else:
            current = f"{current} {sent}".strip() if current else sent
    if current:
        chunks.append(current)
    return chunks
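# Worked example of the chunking above (traced by hand from the logic as written):
#   split_to_chunks("One. Two. Three.", limit=12) -> ["One. Two.", "Three."]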
def synthesize_speech(text: str, model_id: str, out_dir: Path) -> Path:
    """Chunk-safe TTS via HF Inference API, concatenating FLAC segments."""
    chunks = split_to_chunks(text)
    if not chunks:
        raise ValueError("No text to synthesize.")
    segments = []
    for i, chunk in enumerate(chunks):
        try:
            audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
        except Exception as e:
            raise RuntimeError(f"TTS failed on chunk {i+1}: {e}")
        part_path = out_dir / f"seg_{i}.flac"
        part_path.write_bytes(audio_bytes)
        try:
            seg = AudioSegment.from_file(part_path, format="flac")
            segments.append(seg)
        except CouldntDecodeError as e:
            raise RuntimeError(f"Could not decode segment {i+1}: {e}")
    # Concatenate all segments into one audio file
    final = sum(segments, AudioSegment.empty())
    out_path = out_dir / "podcast_audio.flac"
    final.export(out_path, format="flac")
    return out_path
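# Minimal standalone sketch (assumes network access to the public Inference API and that
# ffmpeg is installed for pydub's FLAC decoding):
#   out = synthesize_speech("Hello there. This is a test.", HF_TTS_MODEL, Path(tempfile.mkdtemp()))
#   print(out)  # .../podcast_audio.flac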
# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(
    gemini_api_key: Optional[str],
    lecture_pdf: Optional[Any]
) -> List[Optional[Any]]:
    # Validate inputs
    if not gemini_api_key:
        raise gr.Error("Enter your Google AI Studio API Key.")
    if not lecture_pdf:
        raise gr.Error("Upload a lecture PDF file.")
    # Configure Gemini
    genai.configure(api_key=gemini_api_key)
    # Extract & truncate lecture text
    # gr.File passes a filepath string in recent Gradio versions and a tempfile-like
    # object with a .name attribute in older ones; handle both.
    pdf_path = lecture_pdf if isinstance(lecture_pdf, str) else lecture_pdf.name
    raw = extract_pdf_text(pdf_path)
    content = truncate_text(raw)
    if not content.strip():
        raise gr.Error("Lecture PDF contained no extractable text.")
    # Initialize Gemini model
    try:
        gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")
    except Exception as e:
        raise gr.Error(f"Gemini init failed: {e}")
    # Generate script
    prompt = PROMPT_TEMPLATE.format(content=content)
    try:
        resp = gemini_model.generate_content(prompt)
        script = resp.text or ""
    except Exception as e:
        raise gr.Error(f"Gemini generation error: {e}")
    # Prepare output directory (mkdtemp rather than TemporaryDirectory: the files must
    # outlive this function so Gradio can serve them to the browser)
    tmp = Path(tempfile.mkdtemp())
    # Save script file
    script_path = tmp / "podcast_script.txt"
    script_path.write_text(script, encoding="utf-8")
    # Synthesize audio
    try:
        audio_path = synthesize_speech(script, HF_TTS_MODEL, tmp)
    except Exception as e:
        raise gr.Error(f"Speech synthesis error: {e}")
    # Return [audio, markdown script, txt file]
    return [str(audio_path), script, str(script_path)]
# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------
iface = gr.Interface(
    fn=generate_podcast,
    inputs=[
        gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Paste your key"),
        gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    ],
    outputs=[
        gr.Audio(label="English Podcast", type="filepath"),
        gr.Markdown(label="English Script"),
        gr.File(label="Download English Script (.txt)", type="filepath"),
    ],
    title="Lecture → English Podcast & Script",
    description=(
        "Enter your Gemini API Key and upload a lecture PDF. "
        "Generates a two-host podcast audio and a Markdown script in English "
        "using Google Gemini for text and Hugging Face MMS-TTS for audio."
    ),
    allow_flagging="never",
)
if __name__ == "__main__":
    iface.launch()
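# Optional tweak (not part of the original config): the synthesis loop can run for a
# while on long lectures, so iface.queue().launch() may be preferable to plain launch().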