File size: 6,492 Bytes
53744b5
617d576
 
 
53744b5
4c19533
f1adb14
fe00684
f1adb14
 
50d2a40
617d576
f1adb14
 
53744b5
f036ad8
 
f1adb14
617d576
1425202
369b2d2
617d576
4c19533
 
 
 
369b2d2
 
617d576
4c19533
617d576
f1adb14
 
617d576
 
 
 
c565171
 
f1adb14
 
 
 
617d576
 
 
 
764a881
617d576
 
4c19533
 
617d576
4c19533
617d576
 
 
 
 
 
 
 
 
 
 
 
 
 
4c19533
617d576
 
 
fe00684
617d576
 
 
 
 
 
 
 
4c19533
617d576
 
 
4c19533
617d576
4c19533
617d576
 
4c19533
 
617d576
 
4c19533
617d576
 
 
 
 
 
f1adb14
4c19533
617d576
4c19533
53744b5
617d576
 
53744b5
617d576
 
 
 
 
 
 
 
 
 
 
 
 
4c19533
617d576
4c19533
617d576
 
 
 
 
 
 
 
 
4c19533
617d576
 
 
 
 
 
 
 
 
 
 
4c19533
 
617d576
4c19533
f1adb14
 
617d576
 
 
 
 
 
 
 
 
 
4c19533
617d576
 
 
4c19533
 
f1adb14
 
 
764a881
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# =============================================================
# Lecture → Podcast & Script Generator (English Only)
# • Text: Google Gemini API (via UI-provided key)
# • Audio: Hugging Face InferenceClient.text_to_speech (public MMS-TTS for English)
# =============================================================

import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Optional, Any

import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError

# Hugging Face TTS client (anonymous/public access)
from huggingface_hub import InferenceClient

# Google Gemini SDK
try:
    import google.generativeai as genai
except ImportError:
    raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")

# ------------------------------------------------------------------
# Globals & templates
# ------------------------------------------------------------------
# Gemini prompt for ~300-word two-host dialogue in English.
# Filled via str.format(content=...) in generate_podcast; the {content}
# placeholder is the truncated lecture text.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in English.
    Summarize the following lecture content into a dialogue of approximately 300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
    Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).

    ### Lecture Content
    {content}
    """
)

# TTS model ID for English MMS-TTS (public Hugging Face model)
HF_TTS_MODEL = "facebook/mms-tts-eng"
# Safe per-request character limit for HF text-to-speech; longer scripts
# are split into chunks of at most this size (see split_to_chunks).
CHUNK_CHAR_LIMIT = 280

# Initialize HF TTS client (no token required for public models)
# NOTE(review): anonymous access is rate-limited; heavy use may need a token.
tts_client = InferenceClient()

# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages with no extractable text contribute an empty string, so page
    boundaries are still represented by newlines in the result.
    """
    reader = PdfReader(pdf_path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

def truncate_text(text: str, max_words: int = 8000) -> str:
    """Limit *text* to at most *max_words* whitespace-separated words.

    Keeps the LLM prompt within a safe context-window size.  As a side
    effect, runs of whitespace are normalized to single spaces.
    """
    return " ".join(text.split()[:max_words])

def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Break *text* into chunks no longer than *limit* characters.

    Text is split into sentences at ., ! or ? boundaries; whole sentences
    are then packed greedily into chunks.  A single sentence longer than
    *limit* still becomes its own (oversized) chunk — it is never cut.
    """
    chunks: List[str] = []
    buffer = ""
    for piece in re.split(r"(?<=[.!?])\s+", text):
        piece = piece.strip()
        if not piece:
            continue
        # +1 accounts for the joining space between buffer and piece.
        if buffer and len(buffer) + len(piece) + 1 > limit:
            chunks.append(buffer)
            buffer = piece
        elif buffer:
            buffer = f"{buffer} {piece}"
        else:
            buffer = piece
    if buffer:
        chunks.append(buffer)
    return chunks

def synthesize_speech(text: str, model_id: str, out_dir: Path) -> Path:
    """Convert *text* to a single FLAC file via the HF Inference API.

    The text is split into API-safe chunks, each chunk is synthesized and
    written to ``out_dir``, then all segments are decoded and concatenated
    into ``out_dir / "podcast_audio.flac"``.

    Raises:
        ValueError: if *text* yields no chunks.
        RuntimeError: if any chunk fails to synthesize or decode.
    """
    pieces = split_to_chunks(text)
    if not pieces:
        raise ValueError("No text to synthesize.")
    audio_parts = []
    for idx, piece in enumerate(pieces):
        try:
            raw = tts_client.text_to_speech(piece, model=model_id)
        except Exception as e:
            raise RuntimeError(f"TTS failed on chunk {idx+1}: {e}")
        seg_file = out_dir / f"seg_{idx}.flac"
        seg_file.write_bytes(raw)
        try:
            audio_parts.append(AudioSegment.from_file(seg_file, format="flac"))
        except CouldntDecodeError as e:
            raise RuntimeError(f"Could not decode segment {idx+1}: {e}")
    # Stitch all decoded segments together in order.
    combined = sum(audio_parts, AudioSegment.empty())
    out_path = out_dir / "podcast_audio.flac"
    combined.export(out_path, format="flac")
    return out_path

# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------
def generate_podcast(
    gemini_api_key: Optional[str],
    lecture_pdf: Optional[gr.File]
) -> List[Optional[Any]]:
    """Full pipeline: lecture PDF → Gemini dialogue script → MMS-TTS audio.

    Args:
        gemini_api_key: Google AI Studio API key entered in the UI.
        lecture_pdf: uploaded PDF file object (Gradio provides ``.name``).

    Returns:
        ``[audio_filepath, markdown_script, script_txt_filepath]`` matching
        the three Gradio output components.

    Raises:
        gr.Error: on missing inputs or failure of any pipeline stage.
    """
    # Validate inputs
    if not gemini_api_key:
        raise gr.Error("Enter your Google AI Studio API Key.")
    if not lecture_pdf:
        raise gr.Error("Upload a lecture PDF file.")
    # Configure Gemini
    genai.configure(api_key=gemini_api_key)
    # Extract & truncate lecture text
    raw = extract_pdf_text(lecture_pdf.name)
    content = truncate_text(raw)
    if not content.strip():
        raise gr.Error("Lecture PDF contained no extractable text.")
    # Initialize Gemini model
    try:
        gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")
    except Exception as e:
        raise gr.Error(f"Gemini init failed: {e}")
    # Generate script
    prompt = PROMPT_TEMPLATE.format(content=content)
    try:
        resp = gemini_model.generate_content(prompt)
        script = resp.text or ""
    except Exception as e:
        raise gr.Error(f"Gemini generation error: {e}")
    # BUG FIX: the original wrapped this in `with tempfile.TemporaryDirectory()`,
    # which deleted the directory — and the audio/script files whose paths we
    # return — as soon as the function exited, before Gradio could serve them.
    # mkdtemp() creates a directory that persists for the process lifetime.
    tmp = Path(tempfile.mkdtemp(prefix="podcast_"))
    # Save script file
    script_path = tmp / "podcast_script.txt"
    script_path.write_text(script, encoding="utf-8")
    # Synthesize audio
    try:
        audio_path = synthesize_speech(script, HF_TTS_MODEL, tmp)
    except Exception as e:
        raise gr.Error(f"Speech synthesis error: {e}")
    # Return [audio, markdown script, txt file]
    return [str(audio_path), script, str(script_path)]

# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------
# Wire the pipeline into a simple two-input, three-output Gradio UI.
# Outputs map positionally to generate_podcast's returned list:
# audio filepath, Markdown script text, and the .txt download path.
iface = gr.Interface(
    fn=generate_podcast,
    inputs=[
        gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Paste your key"),
        gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    ],
    outputs=[
        gr.Audio(label="English Podcast", type="filepath"),
        gr.Markdown(label="English Script"),
        gr.File(label="Download English Script (.txt)", type="filepath"),
    ],
    title="Lecture → English Podcast & Script",
    description=(
        "Enter your Gemini API Key and upload a lecture PDF. "
        "Generates a two-host podcast audio and a Markdown script in English "
        "using Google Gemini for text and Hugging Face MMS-TTS for audio."
    ),
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x in favor of
    # flagging_mode — confirm the installed Gradio version before changing.
    allow_flagging="never",
)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()