File size: 6,213 Bytes
53744b5
617d576
cca7e91
53744b5
4c19533
fe00684
f1adb14
 
50d2a40
cca7e91
f1adb14
 
53744b5
f036ad8
 
f1adb14
617d576
4c19533
 
 
cca7e91
 
 
 
 
369b2d2
 
617d576
4c19533
f1adb14
 
617d576
 
 
 
c565171
 
f1adb14
 
 
 
617d576
 
764a881
cca7e91
617d576
4c19533
 
cca7e91
4c19533
617d576
 
 
 
 
 
 
 
 
 
 
4c19533
617d576
 
 
fe00684
617d576
 
 
 
 
cca7e91
 
4c19533
cca7e91
617d576
cca7e91
 
 
4c19533
 
617d576
 
4c19533
cca7e91
d4adc2b
cca7e91
 
 
f1adb14
4c19533
cca7e91
4c19533
cca7e91
 
 
 
617d576
cca7e91
617d576
cca7e91
 
4c19533
cca7e91
 
4c19533
cca7e91
d4adc2b
cca7e91
 
 
 
 
 
 
617d576
 
cca7e91
 
617d576
 
d4adc2b
cca7e91
 
 
 
 
 
 
 
 
 
 
4c19533
cca7e91
 
 
4c19533
 
cca7e91
4c19533
cca7e91
 
 
 
 
 
d4adc2b
 
cca7e91
 
 
 
 
 
 
 
cf56cc8
cca7e91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# =============================================================
# Lecture → Podcast & Script Generator (English Only)
# Two-step: 1) Gemini script  2) HF MMS-TTS audio
# =============================================================

import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Optional

import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError

# Google Gemini SDK
try:
    import google.generativeai as genai
except ImportError:
    raise ImportError("Please install the Google Generative AI SDK:\n"
                      "    pip install google-generativeai")

# Hugging Face TTS client (anonymous/public)
from huggingface_hub import InferenceClient

# ------------------------------------------------------------------
# Globals & templates
# ------------------------------------------------------------------
# Prompt sent to Gemini. The `{content}` placeholder is filled with the
# truncated lecture text via `str.format` in `generate_script`.
# `textwrap.dedent` strips the common leading indentation of the literal.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in English.
    Summarize the following lecture content into a dialogue of approximately 300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
    Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).

    ### Lecture Content
    {content}
    """
)

# Model id handed to the TTS client for every synthesized chunk.
HF_TTS_MODEL = "facebook/mms-tts-eng"
# Default per-chunk character budget used by `split_to_chunks`.
CHUNK_CHAR_LIMIT = 280

# Initialize the HF TTS client once (constructed with no arguments here, so
# no token or model is bound at this point; the model is passed per call).
tts_client = InferenceClient()

# ------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, newline-joined.

    A page whose extraction yields a falsy value (``None`` or an empty
    string) contributes an empty string, so the page count is preserved in
    the number of joined parts.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def truncate_text(text: str, max_words: int = 8000) -> str:
    """Keep at most the first *max_words* whitespace-separated words of *text*.

    Runs of whitespace are collapsed: the kept words are re-joined with a
    single space each.
    """
    kept_words = text.split()[:max_words]
    return " ".join(kept_words)

def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split *text* into chunks of at most *limit* characters.

    The text is first cut into sentences at whitespace that follows ``.``,
    ``!`` or ``?``. Sentences are then packed greedily, joined by single
    spaces, so that each chunk stays within *limit*.

    Args:
        text: The text to split.
        limit: Maximum chunk length in characters.

    Returns:
        The list of non-empty chunks, in original order. Empty or
        whitespace-only input yields an empty list.
    """
    pieces: List[str] = []
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if 0 < limit < len(sentence):
            # A single sentence longer than the limit would otherwise be
            # emitted whole, silently exceeding the budget. Wrap it at word
            # boundaries (over-long words are broken) so every piece fits.
            # The `0 < limit` guard keeps non-positive limits on the old
            # one-sentence-per-chunk path, since textwrap rejects width <= 0.
            pieces.extend(textwrap.wrap(sentence, width=limit))
        else:
            pieces.append(sentence)

    chunks: List[str] = []
    current = ""
    for piece in pieces:
        # +1 accounts for the joining space.
        if current and len(current) + len(piece) + 1 > limit:
            chunks.append(current)
            current = piece
        else:
            current = f"{current} {piece}" if current else piece
    if current:
        chunks.append(current)
    return chunks

def synthesize_speech(script: str, model_id: str, out_dir: Path) -> str:
    """Turn *script* into a single FLAC file inside *out_dir*.

    The script is split with ``split_to_chunks``. Each chunk is sent to
    ``tts_client.text_to_speech`` with ``model=model_id``, and the returned
    bytes are written to ``seg_<index>.flac`` and decoded as FLAC. The
    decoded segments are concatenated in order and exported as
    ``podcast_audio.flac``.

    Returns:
        The path of the exported file, as a string.

    Raises:
        RuntimeError: If the script yields no chunks, or a segment cannot
            be decoded.
    """
    text_chunks = split_to_chunks(script)
    if len(text_chunks) == 0:
        raise RuntimeError("No text chunks to synthesize.")

    decoded_segments = []
    for position, text in enumerate(text_chunks):
        payload = tts_client.text_to_speech(text, model=model_id)
        segment_file = out_dir / f"seg_{position}.flac"
        segment_file.write_bytes(payload)
        try:
            decoded = AudioSegment.from_file(segment_file, format="flac")
        except CouldntDecodeError as err:
            raise RuntimeError(f"Failed to decode chunk {position}: {err}") from err
        decoded_segments.append(decoded)

    # Fold the segments onto an empty segment, left to right.
    merged = AudioSegment.empty()
    for segment in decoded_segments:
        merged = merged + segment

    output_file = out_dir / "podcast_audio.flac"
    merged.export(output_file, format="flac")
    return str(output_file)

# ------------------------------------------------------------------
# Step 1: Generate script via Gemini
# ------------------------------------------------------------------
def generate_script(
    gemini_api_key: str,
    lecture_pdf: gr.File
) -> List[str]:
    """Generate a two-host podcast script from an uploaded lecture PDF.

    Args:
        gemini_api_key: Google AI Studio API key used to configure Gemini.
        lecture_pdf: The uploaded-file value from the PDF input. Either an
            object exposing its path as ``.name`` or a plain path string is
            accepted.

    Returns:
        A two-item list holding the same script twice: one copy for the
        Markdown display and one for the shared state.

    Raises:
        gr.Error: If an input is missing, the PDF cannot be read or has no
            extractable text, or Gemini fails or returns an empty script.
    """
    if not gemini_api_key:
        raise gr.Error("Please enter your Google AI Studio API Key.")
    if not lecture_pdf:
        raise gr.Error("Please upload a lecture PDF.")

    # Configure Gemini
    try:
        genai.configure(api_key=gemini_api_key)
        model = genai.GenerativeModel("gemini-1.5-flash-latest")
    except Exception as e:
        raise gr.Error(f"Gemini init/config error: {e}") from e

    # Extract and truncate text. Fall back to the value itself when it has
    # no `.name`, so a plain path string works as well as a file object.
    pdf_path = getattr(lecture_pdf, "name", lecture_pdf)
    try:
        raw_text = extract_pdf_text(str(pdf_path))
    except Exception as e:
        # Corrupt or encrypted PDFs would otherwise surface as an opaque
        # failure instead of a readable message in the UI.
        raise gr.Error(f"Could not read the PDF: {e}") from e
    content = truncate_text(raw_text)
    if not content.strip():
        raise gr.Error("No extractable text found in the PDF.")

    # Generate dialogue script
    prompt = PROMPT_TEMPLATE.format(content=content)
    try:
        response = model.generate_content(prompt)
        script = response.text or ""
    except Exception as e:
        raise gr.Error(f"Gemini generation error: {e}") from e

    # Checked outside the try so it is not re-wrapped as a generation error.
    if not script.strip():
        raise gr.Error("Gemini returned an empty script.")

    return [script, script]  # [for Markdown display, for state storage]

# ------------------------------------------------------------------
# Step 2: Generate audio from provided script
# ------------------------------------------------------------------
def generate_audio(
    script: str
) -> str:
    """Synthesize podcast audio for *script* and return the audio file path.

    Args:
        script: The generated dialogue script.

    Returns:
        Path to a FLAC file that still exists after this function returns.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If no script is available or synthesis fails.
    """
    if not script or not script.strip():
        raise gr.Error("No script available. Please generate the script first.")

    # Per-chunk segment files live in a scratch directory that is removed on
    # exit. The final audio must therefore be copied out first: returning a
    # path inside the scratch directory would point at a deleted file.
    with tempfile.TemporaryDirectory() as td:
        try:
            scratch_audio = Path(synthesize_speech(script, HF_TTS_MODEL, Path(td)))
        except Exception as e:
            # Surface synthesis failures as a readable UI error.
            raise gr.Error(f"Audio generation error: {e}") from e
        with tempfile.NamedTemporaryFile(
            prefix="podcast_", suffix=scratch_audio.suffix, delete=False
        ) as persistent:
            persistent.write(scratch_audio.read_bytes())
    return persistent.name

# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
# Two-tab interface: the first tab produces the script, the second turns the
# stored script into audio.
with gr.Blocks() as demo:
    # Shared state for the script: written as the second output of
    # generate_script and read as the input of generate_audio.
    script_state = gr.State()

    with gr.Tab("Generate Script"):
        # Masked text field for the Gemini API key.
        api_key_input = gr.Textbox(
            label="Google Gemini API Key",
            type="password",
            placeholder="Enter your key"
        )
        # Upload widget restricted to .pdf files.
        pdf_input = gr.File(
            label="Upload Lecture PDF",
            file_types=[".pdf"]
        )
        # Displays the generated script.
        script_md = gr.Markdown(
            label="Generated Script",
            
        )
        gen_script_btn = gr.Button("Generate Script")
        # generate_script returns two values, mapped in order onto the
        # Markdown display and the shared state.
        gen_script_btn.click(
            fn=generate_script,
            inputs=[api_key_input, pdf_input],
            outputs=[script_md, script_state]
        )

    with gr.Tab("Generate Audio"):
        gen_audio_btn = gr.Button("Generate Audio")
        # generate_audio returns a file path, matching type="filepath".
        audio_out = gr.Audio(
            label="Podcast Audio",
            type="filepath"
        )
        gen_audio_btn.click(
            fn=generate_audio,
            inputs=[script_state],
            outputs=[audio_out]
        )

    # NOTE(review): launch() is called inside the `with gr.Blocks()` context
    # and unconditionally at import time; Gradio's examples usually call it
    # after the block. Confirm this is intended for the deployment target.
    demo.launch()