|
|
|
|
|
|
|
|
|
|
|
import re |
|
import tempfile |
|
import textwrap |
|
from pathlib import Path |
|
from typing import List, Optional |
|
|
|
import gradio as gr |
|
from PyPDF2 import PdfReader |
|
from pydub import AudioSegment |
|
from pydub.exceptions import CouldntDecodeError |
|
|
|
|
|
# The Gemini SDK is an optional install; fail early with an actionable message.
try:
    import google.generativeai as genai
except ImportError as exc:
    # Chain explicitly so the traceback shows the original import failure as
    # the direct cause rather than as an unrelated "during handling" error.
    raise ImportError(
        "Please install the Google Generative AI SDK:\n"
        " pip install google-generativeai"
    ) from exc
|
|
|
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
|
|
|
|
# Prompt sent to Gemini. `{content}` is filled in with the (truncated) lecture
# text via `str.format` in `generate_script`; `textwrap.dedent` strips the
# common source indentation from the literal.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in English.
    Summarize the following lecture content into a dialogue of approximately 300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
    Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).

    ### Lecture Content
    {content}
    """
)

# Model identifier handed to the TTS client by `generate_audio`.
HF_TTS_MODEL = "facebook/mms-tts-eng"
# Default maximum number of characters per chunk in `split_to_chunks`.
CHUNK_CHAR_LIMIT = 280

# Shared text-to-speech client used by `synthesize_speech`.
# NOTE(review): constructed with no arguments — presumably credentials/endpoint
# come from the environment; confirm against the deployment setup.
tts_client = InferenceClient()
|
|
|
|
|
|
|
|
|
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF, joined by newlines.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) contribute an empty string, so the page count is still reflected
    in the number of newline separators.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
|
|
|
def truncate_text(text: str, max_words: int = 8000) -> str:
    """Keep at most the first ``max_words`` whitespace-separated words.

    The text is split on arbitrary whitespace and re-joined with single
    spaces, so runs of spaces and newlines are normalized even when nothing
    is cut off.

    Args:
        text: Input text.
        max_words: Upper bound on the number of words kept.

    Returns:
        The leading words joined by single spaces.
    """
    tokens = text.split()
    kept = tokens[:max_words]
    return " ".join(kept)
|
|
|
def _split_oversized(sentence: str, limit: int) -> List[str]:
    """Break one sentence into word-boundary pieces of at most ``limit`` characters."""
    pieces: List[str] = []
    current = ""
    for word in sentence.split():
        # A single word longer than the limit has no boundary to break on,
        # so it is cut hard into limit-sized slices.
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        if current and len(current) + 1 + len(word) > limit:
            pieces.append(current)
            current = word
        else:
            current = f"{current} {word}" if current else word
    if current:
        pieces.append(current)
    return pieces


def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split text into chunks of at most ``limit`` characters.

    The text is first split into sentences (after ``.``, ``!`` or ``?``
    followed by whitespace). Sentences are then packed greedily, joined by a
    single space, for as long as the chunk stays within ``limit``.

    A sentence that is itself longer than ``limit`` is broken on word
    boundaries (and, for a single over-long word, cut hard) so that no
    returned chunk exceeds the limit. Text whose sentences all fit within the
    limit is chunked exactly as before.

    Args:
        text: Text to split.
        limit: Maximum number of characters per chunk; must be positive.

    Returns:
        The list of chunks, in order. Empty when ``text`` has no content.

    Raises:
        ValueError: If ``limit`` is not a positive integer.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

    # Normalize to units that each fit the limit before packing them.
    units: List[str] = []
    for sent in sentences:
        if len(sent) > limit:
            units.extend(_split_oversized(sent, limit))
        else:
            units.append(sent)

    chunks: List[str] = []
    current = ""
    for unit in units:
        # +1 accounts for the joining space.
        if current and len(current) + len(unit) + 1 > limit:
            chunks.append(current)
            current = unit
        else:
            current = f"{current} {unit}" if current else unit
    if current:
        chunks.append(current)
    return chunks
|
|
|
def synthesize_speech(script: str, model_id: str, out_dir: Path) -> str:
    """Convert a script to one audio file by synthesizing it chunk by chunk.

    The script is split with ``split_to_chunks``; each chunk is sent to the
    module-level ``tts_client``, written to ``out_dir`` as ``seg_<n>.flac``,
    decoded, and appended to the running result. The concatenated audio is
    exported to ``out_dir / "podcast_audio.flac"``.

    Args:
        script: Text to turn into speech.
        model_id: Model identifier passed to the TTS client.
        out_dir: Directory that receives the per-chunk and final files.

    Returns:
        The path of the final audio file, as a string.

    Raises:
        RuntimeError: If the script yields no chunks, or a synthesized chunk
            cannot be decoded.
    """
    pieces = split_to_chunks(script)
    if not pieces:
        raise RuntimeError("No text chunks to synthesize.")

    combined = AudioSegment.empty()
    for index, piece in enumerate(pieces):
        raw_audio = tts_client.text_to_speech(piece, model=model_id)
        segment_file = out_dir / f"seg_{index}.flac"
        segment_file.write_bytes(raw_audio)
        try:
            decoded = AudioSegment.from_file(segment_file, format="flac")
        except CouldntDecodeError as err:
            raise RuntimeError(f"Failed to decode chunk {index}: {err}") from err
        combined = combined + decoded

    destination = out_dir / "podcast_audio.flac"
    combined.export(destination, format="flac")
    return str(destination)
|
|
|
|
|
|
|
|
|
def generate_script(
    gemini_api_key: str,
    lecture_pdf: gr.File
) -> List[str]:
    """Generate a podcast script from an uploaded lecture PDF using Gemini.

    Args:
        gemini_api_key: Google AI Studio API key entered by the user.
        lecture_pdf: The uploaded file as delivered by the ``gr.File`` input.
            Either a plain path (``str``/``Path``) or an object exposing the
            path as ``.name`` is accepted.

    Returns:
        A two-element list ``[script, script]``: one value for the Markdown
        display and one for the state consumed by the audio step.

    Raises:
        gr.Error: If an input is missing, the PDF cannot be read or has no
            extractable text, Gemini setup or generation fails, or the model
            returns an empty script.
    """
    api_key = (gemini_api_key or "").strip()
    if not api_key:
        raise gr.Error("Please enter your Google AI Studio API Key.")
    if not lecture_pdf:
        raise gr.Error("Please upload a lecture PDF.")

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash-latest")
    except Exception as e:
        raise gr.Error(f"Gemini init/config error: {e}") from e

    # Depending on the Gradio version/config the upload arrives as a path or
    # as a file wrapper whose ``.name`` is the path. ``Path`` is checked by
    # type because its own ``.name`` is only the basename.
    if isinstance(lecture_pdf, (str, Path)):
        pdf_path = str(lecture_pdf)
    else:
        pdf_path = lecture_pdf.name

    try:
        raw_text = extract_pdf_text(pdf_path)
    except Exception as e:
        # Corrupt or unsupported PDFs should show a UI error, not a traceback.
        raise gr.Error(f"Could not read the PDF: {e}") from e

    content = truncate_text(raw_text)
    if not content.strip():
        raise gr.Error("No extractable text found in the PDF.")

    prompt = PROMPT_TEMPLATE.format(content=content)
    try:
        response = model.generate_content(prompt)
        script = response.text or ""
    except Exception as e:
        raise gr.Error(f"Gemini generation error: {e}") from e

    # Checked outside the ``try`` so this error is not re-wrapped above.
    if not script.strip():
        raise gr.Error("Gemini returned an empty script. Please try again.")

    return [script, script]
|
|
|
|
|
|
|
|
|
def generate_audio(
    script: str
) -> str:
    """Synthesize the generated script and return the path of the audio file.

    Intermediate per-chunk files are written to a temporary directory that is
    removed when synthesis finishes. The final audio is first copied to a
    separate temporary file that is NOT deleted automatically, so the
    returned path still exists when the caller reads it.

    Args:
        script: The podcast script produced by the script-generation step.

    Returns:
        Path of a ``.flac`` file containing the full podcast audio.

    Raises:
        gr.Error: If no script is available or synthesis fails.
    """
    if not script:
        raise gr.Error("No script available. Please generate the script first.")

    try:
        with tempfile.TemporaryDirectory() as td:
            audio_path = synthesize_speech(script, HF_TTS_MODEL, Path(td))
            # Copy the result out before the directory (and everything in it)
            # is deleted on leaving the ``with`` block.
            with tempfile.NamedTemporaryFile(
                suffix=".flac", delete=False
            ) as persistent:
                persistent.write(Path(audio_path).read_bytes())
    except Exception as e:
        # Report failures through the UI, as the script step does.
        raise gr.Error(f"Audio synthesis error: {e}") from e

    return persistent.name
|
|
|
|
|
|
|
|
|
# Two-tab UI: the first tab produces the script, the second turns it into audio.
with gr.Blocks() as demo:
    # Holds the generated script so the audio tab can read it.
    script_state = gr.State()

    with gr.Tab("Generate Script"):
        api_key_input = gr.Textbox(
            label="Google Gemini API Key", type="password", placeholder="Enter your key"
        )
        pdf_input = gr.File(label="Upload Lecture PDF", file_types=[".pdf"])
        script_md = gr.Markdown(label="Generated Script")
        gen_script_btn = gr.Button("Generate Script")
        # generate_script returns two values: one rendered, one stored in state.
        gen_script_btn.click(
            fn=generate_script,
            inputs=[api_key_input, pdf_input],
            outputs=[script_md, script_state],
        )

    with gr.Tab("Generate Audio"):
        gen_audio_btn = gr.Button("Generate Audio")
        audio_out = gr.Audio(label="Podcast Audio", type="filepath")
        gen_audio_btn.click(
            fn=generate_audio,
            inputs=[script_state],
            outputs=[audio_out],
        )

demo.launch()
|
|