File size: 6,213 Bytes
53744b5 617d576 cca7e91 53744b5 4c19533 fe00684 f1adb14 50d2a40 cca7e91 f1adb14 53744b5 f036ad8 f1adb14 617d576 4c19533 cca7e91 369b2d2 617d576 4c19533 f1adb14 617d576 c565171 f1adb14 617d576 764a881 cca7e91 617d576 4c19533 cca7e91 4c19533 617d576 4c19533 617d576 fe00684 617d576 cca7e91 4c19533 cca7e91 617d576 cca7e91 4c19533 617d576 4c19533 cca7e91 d4adc2b cca7e91 f1adb14 4c19533 cca7e91 4c19533 cca7e91 617d576 cca7e91 617d576 cca7e91 4c19533 cca7e91 4c19533 cca7e91 d4adc2b cca7e91 617d576 cca7e91 617d576 d4adc2b cca7e91 4c19533 cca7e91 4c19533 cca7e91 4c19533 cca7e91 d4adc2b cca7e91 cf56cc8 cca7e91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# =============================================================
# Lecture → Podcast & Script Generator (English Only)
# Two-step: 1) Gemini script 2) HF MMS-TTS audio
# =============================================================
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Optional
import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# Google Gemini SDK
try:
import google.generativeai as genai
except ImportError:
raise ImportError("Please install the Google Generative AI SDK:\n"
" pip install google-generativeai")
# Hugging Face TTS client (anonymous/public)
from huggingface_hub import InferenceClient
# ------------------------------------------------------------------
# Globals & templates
# ------------------------------------------------------------------
# Prompt sent to Gemini. generate_script() fills the ``{content}`` placeholder
# with the truncated lecture text via str.format, so any literal braces added
# to this template in the future must be doubled.
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two-host educational podcast in English.
Summarize the following lecture content into a dialogue of approximately 300 words.
Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
### Lecture Content
{content}
"""
)
# Model id that generate_audio() hands to synthesize_speech() for the TTS calls.
HF_TTS_MODEL = "facebook/mms-tts-eng"
# Default value of split_to_chunks()'s ``limit`` parameter (characters per chunk).
CHUNK_CHAR_LIMIT = 280
# Initialize the HF TTS client once at import time (constructed with no
# arguments); synthesize_speech() reuses this single instance for every chunk.
tts_client = InferenceClient()
# ------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, newline-joined.

    A page for which ``extract_text()`` yields a falsy value (``None`` or an
    empty string) contributes an empty string, so the result always has one
    entry per page.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
def truncate_text(text: str, max_words: int = 8000) -> str:
    """Keep at most the first *max_words* whitespace-separated words of *text*.

    The words are re-joined with single spaces, so runs of whitespace
    (including newlines and tabs) in the input are collapsed.
    """
    tokens = text.split()
    # Drop everything from position ``max_words`` onwards (same slice
    # semantics as ``tokens[:max_words]``, negative values included).
    del tokens[max_words:]
    return " ".join(tokens)
def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split *text* into chunks of at most *limit* characters.

    The text is first cut into sentences (at whitespace that follows ``.``,
    ``!`` or ``?``). Consecutive sentences are then packed greedily, joined by
    a single space, for as long as the chunk stays within *limit*.

    A single sentence longer than *limit* is hard-wrapped on word boundaries
    (over-long words are broken) so that no returned chunk exceeds *limit*.
    Previously such a sentence was emitted as one over-length chunk.

    Args:
        text: Text to split. Empty or whitespace-only text yields ``[]``.
        limit: Maximum chunk length in characters. For a non-positive value
            no wrapping is attempted and every sentence becomes its own chunk.

    Returns:
        The chunks, in reading order.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

    # Pre-split any sentence that cannot fit in a chunk on its own.
    pieces: List[str] = []
    for sent in sentences:
        if limit > 0 and len(sent) > limit:
            pieces.extend(
                textwrap.wrap(
                    sent,
                    width=limit,
                    break_long_words=True,
                    break_on_hyphens=False,
                )
            )
        else:
            pieces.append(sent)

    chunks: List[str] = []
    current = ""
    for piece in pieces:
        # +1 accounts for the space that would join ``current`` and ``piece``.
        if current and len(current) + len(piece) + 1 > limit:
            chunks.append(current)
            current = piece
        else:
            current = f"{current} {piece}" if current else piece
    if current:
        chunks.append(current)
    return chunks
def synthesize_speech(script: str, model_id: str, out_dir: Path) -> str:
    """Render *script* to one FLAC file inside *out_dir* and return its path.

    The script is split with ``split_to_chunks``; each chunk is sent to the
    module-level ``tts_client`` with *model_id*, the returned bytes are written
    to ``seg_<index>.flac`` in *out_dir* and decoded, and the decoded segments
    are concatenated in order into ``podcast_audio.flac``.

    Raises:
        RuntimeError: If the script yields no chunks, or a segment cannot be
            decoded as FLAC.
    """
    pieces = split_to_chunks(script)
    if len(pieces) == 0:
        raise RuntimeError("No text chunks to synthesize.")

    decoded_parts = []
    for number, piece in enumerate(pieces):
        raw_audio = tts_client.text_to_speech(piece, model=model_id)
        seg_file = out_dir / f"seg_{number}.flac"
        seg_file.write_bytes(raw_audio)
        try:
            decoded = AudioSegment.from_file(seg_file, format="flac")
            decoded_parts.append(decoded)
        except CouldntDecodeError as e:
            raise RuntimeError(f"Failed to decode chunk {number}: {e}") from e

    # Concatenate left to right, starting from an empty segment.
    combined = AudioSegment.empty()
    for part in decoded_parts:
        combined = combined + part

    target = out_dir / "podcast_audio.flac"
    combined.export(target, format="flac")
    return str(target)
# ------------------------------------------------------------------
# Step 1: Generate script via Gemini
# ------------------------------------------------------------------
def generate_script(
    gemini_api_key: str,
    lecture_pdf: gr.File
) -> List[str]:
    """Turn an uploaded lecture PDF into a podcast dialogue script via Gemini.

    Args:
        gemini_api_key: Google AI Studio API key used to configure the SDK.
        lecture_pdf: The uploaded PDF as delivered by the ``gr.File`` input.
            Either an object exposing the file path as ``.name`` or a plain
            path string is accepted.

    Returns:
        ``[script, script]`` — the same text twice, one entry per output
        component wired to this handler (Markdown display and state).

    Raises:
        gr.Error: On missing inputs, SDK configuration failure, an unreadable
            PDF, a PDF with no extractable text, or a generation failure.
            The underlying exception is chained as ``__cause__``.
    """
    if not gemini_api_key:
        raise gr.Error("Please enter your Google AI Studio API Key.")
    if not lecture_pdf:
        raise gr.Error("Please upload a lecture PDF.")

    # Configure Gemini
    try:
        genai.configure(api_key=gemini_api_key)
        model = genai.GenerativeModel("gemini-1.5-flash-latest")
    except Exception as e:
        raise gr.Error(f"Gemini init/config error: {e}") from e

    # Extract and truncate text. Fall back to the value itself when the upload
    # is already a path string without a ``.name`` attribute.
    pdf_path = getattr(lecture_pdf, "name", lecture_pdf)
    try:
        raw_text = extract_pdf_text(pdf_path)
    except Exception as e:
        # Surface corrupt/encrypted/unreadable PDFs as a readable UI error
        # instead of an unhandled traceback.
        raise gr.Error(f"Could not read the PDF: {e}") from e
    content = truncate_text(raw_text)
    if not content.strip():
        raise gr.Error("No extractable text found in the PDF.")

    # Generate dialogue script
    prompt = PROMPT_TEMPLATE.format(content=content)
    try:
        response = model.generate_content(prompt)
        # Accessing ``.text`` stays inside the try so that a failure while
        # reading the response is reported the same way as a failed request.
        script = response.text or ""
    except Exception as e:
        raise gr.Error(f"Gemini generation error: {e}") from e

    return [script, script]  # [for Markdown display, for state storage]
# ------------------------------------------------------------------
# Step 2: Generate audio from provided script
# ------------------------------------------------------------------
def generate_audio(
    script: str
) -> str:
    """Synthesize *script* to audio and return the path of the resulting file.

    The audio is written into a fresh temporary directory that is deliberately
    NOT removed on success: the caller receives only a file path and reads the
    file after this function has returned. The previous implementation used
    ``tempfile.TemporaryDirectory`` as a context manager, which deletes the
    directory (and the audio file in it) on exit, leaving the returned path
    dangling.

    Args:
        script: The dialogue script to speak.

    Returns:
        Path to the combined audio file produced by ``synthesize_speech``.

    Raises:
        gr.Error: If *script* is empty.
        Exception: Anything raised by ``synthesize_speech`` is re-raised
            unchanged after the temporary directory has been cleaned up.
    """
    import shutil  # local import: only needed for failure cleanup

    if not script:
        raise gr.Error("No script available. Please generate the script first.")

    # mkdtemp() creates a directory that persists until explicitly removed.
    out_dir = Path(tempfile.mkdtemp(prefix="podcast_"))
    try:
        return synthesize_speech(script, HF_TTS_MODEL, out_dir)
    except Exception:
        # Nothing usable was produced; do not leave partial segments behind.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise
# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
# Two-tab interface: the first tab produces the script, the second turns the
# stored script into audio.
with gr.Blocks() as demo:
    # Carries the generated script from the first tab's handler to the second's.
    script_state = gr.State()

    with gr.Tab("Generate Script"):
        api_key_input = gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Enter your key")
        pdf_input = gr.File(label="Upload Lecture PDF", file_types=[".pdf"])
        script_md = gr.Markdown(label="Generated Script")
        gen_script_btn = gr.Button("Generate Script")
        # generate_script returns two values: one rendered as Markdown, one
        # stored in the shared state.
        gen_script_btn.click(
            generate_script,
            inputs=[api_key_input, pdf_input],
            outputs=[script_md, script_state],
        )

    with gr.Tab("Generate Audio"):
        gen_audio_btn = gr.Button("Generate Audio")
        audio_out = gr.Audio(label="Podcast Audio", type="filepath")
        # The handler reads the script from the shared state and returns a
        # file path for the audio player.
        gen_audio_btn.click(
            generate_audio,
            inputs=[script_state],
            outputs=[audio_out],
        )

demo.launch()
|