# =============================================================
# Hugging Face Space – Lecture β†’ Podcast Generator (Gemini + HF TTS)
# =============================================================
# β€’ Text generation  – Google Gemini API (user-provided API key, entered in the UI)
# β€’ Speech synthesis – Hugging Face Inference API TTS (via HF_TOKEN Space secret)
# -----------------------------------------------------------------

import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional, Any

import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError

# For Hugging Face TTS
from huggingface_hub import InferenceClient
from huggingface_hub.utils import HfHubHTTPError  # HubHTTPError is not a top-level export

# For Google Gemini
try:
    import google.generativeai as genai
except ImportError:
    raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")

# ------------------------------------------------------------------
# Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
# ------------------------------------------------------------------
hf_tts_client: Optional[InferenceClient] = None
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    hf_tts_client = InferenceClient(token=hf_token)
else:
    # This print will show in the Space logs if HF_TOKEN is missing
    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")

# ------------------------------------------------------------------
# Language metadata for Hugging Face MMS-TTS models
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu",    "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali",  "tts_model": "facebook/mms-tts-npi"},
}
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
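
# Illustrative lookups (not executed): LANG_CODE_BY_NAME["Bangla"] == "bn",
# and LANG_INFO["bn"]["tts_model"] == "facebook/mms-tts-ben". The UI returns
# display names, which generate_podcast() maps back to language codes.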

# ------------------------------------------------------------------
# Prompt template for Gemini
# ------------------------------------------------------------------
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **approximately 300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).

    ### Lecture Content
    {content}
    """
)
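
# Illustrative use (the real call happens inside generate_podcast below):
#   prompt = PROMPT_TEMPLATE.format(lang_name="English", content=lecture_text)
# yields the complete instruction string sent to Gemini for one language.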

# ------------------------------------------------------------------
# PDF helpers
# ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    try:
        reader = PdfReader(pdf_path)
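        # page.extract_text() can return None (e.g. on image-only pages),
        # hence the `or ""` guard on the next line.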
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        raise gr.Error(f"Failed to process PDF: {e}")

TOKEN_LIMIT = 8000  # word-count cap, used as a rough proxy for tokens

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    words = text.split()
    if len(words) > limit:
        gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit LLM context window.")
        return " ".join(words[:limit])
    return text
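
# Rough sizing sketch (illustrative, not exact): English text averages on the
# order of 1.3 tokens per word, so the 8,000-word cap corresponds to roughly
# 10k tokens - a conservative bound for Gemini 1.5 Flash's context window.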

# ------------------------------------------------------------------
# TTS helper using Hugging Face Inference API
# ------------------------------------------------------------------
CHUNK_CHAR_LIMIT_HF = 280

def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
    """Split text into sentence-aligned chunks of at most ~`limit` characters."""
    sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
    sentences = [s.strip() for s in sentences_raw if s.strip()]
    if not sentences:
        return []
    chunks, current_chunk = [], ""
    for sent in sentences:
        # Start a new chunk when appending this sentence would exceed the limit.
        if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
            chunks.append(current_chunk)
            current_chunk = sent
        else:
            current_chunk += (" " + sent) if current_chunk else sent
    if current_chunk:
        chunks.append(current_chunk)
    return [chunk for chunk in chunks if chunk.strip()]
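
# Worked example with a hypothetical input and limit (the default is 280):
#   _split_to_chunks_hf("First point. Second point follows. Third.", limit=40)
#   -> ["First point. Second point follows.", "Third."]
# Sentences are packed greedily until adding one more would exceed the limit.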


def synthesize_speech_hf(
    text: str,
    hf_model_id: str,
    lang_tmpdir: Path,
    tts_client: InferenceClient
) -> Path:
    """Chunk `text`, synthesize each chunk via the HF Inference API, and join into one FLAC."""
    chunks = _split_to_chunks_hf(text)
    if not chunks:
        raise ValueError("Text resulted in no speakable chunks after splitting.")

    audio_segments: List[AudioSegment] = []
    for idx, chunk in enumerate(chunks):
        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
        try:
            audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
        except HfHubHTTPError as e:
            error_message = f"HF TTS request failed for chunk {idx+1} ('{chunk[:30]}...'): {e}"
            if "Input validation error: `inputs` must be non-empty" in str(e) and not chunk.strip():
                gr.Warning(f"Skipping an apparently empty chunk for HF TTS: Chunk {idx+1}")
                continue
            raise RuntimeError(error_message) from e
        except Exception as e:
             raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e

        part_path = lang_tmpdir / f"part_{idx}.flac"
        part_path.write_bytes(audio_bytes)
        
        try:
            segment = AudioSegment.from_file(part_path, format="flac")
            audio_segments.append(segment)
        except CouldntDecodeError as e:
            raise RuntimeError(f"Failed to decode FLAC audio chunk {idx+1} from {part_path}. Error: {e}") from e

    if not audio_segments:
        raise RuntimeError("No audio segments were successfully synthesized or decoded.")

    combined_audio = sum(audio_segments, AudioSegment.empty())
    final_path = lang_tmpdir / "podcast_audio.flac"
    combined_audio.export(final_path, format="flac")
    return final_path
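
# Usage sketch (assumes a valid HF_TOKEN; names and text are illustrative):
#   with tempfile.TemporaryDirectory() as td:
#       path = synthesize_speech_hf("Hello there. Welcome to the show!",
#                                   "facebook/mms-tts-eng", Path(td),
#                                   InferenceClient(token=os.environ["HF_TOKEN"]))
#       # path -> .../podcast_audio.flac, suitable for gr.Audio(type="filepath")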

# ------------------------------------------------------------------
# Main pipeline function for Gradio
# ------------------------------------------------------------------

def generate_podcast(
    gemini_api_key_from_ui: Optional[str], # Explicitly named to show source
    pdf_file_obj: Optional[gr.File],
    selected_lang_names: List[str]
) -> List[Optional[Any]]:

    if not gemini_api_key_from_ui: # Check the key provided from the UI input
        raise gr.Error("Please enter your Google AI Studio API Key for Gemini in the input field.")
    if not pdf_file_obj:
        raise gr.Error("Please upload a PDF file.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language for the podcast.")

    # Configure Gemini API using the key directly from the UI input
    try:
        genai.configure(api_key=gemini_api_key_from_ui)
        gr.Info("Gemini API configured successfully with the provided key.")
    except Exception as e:
        raise gr.Error(f"Failed to configure Gemini API with the provided key. Please check your API key. Error: {e}")

    # Check if HF TTS client is available (HF_TOKEN was provided as a secret)
    if not hf_tts_client:
        # Warn rather than abort: script generation can still proceed, and TTS
        # is skipped later when the client is None.
        gr.Warning(
            "Hugging Face TTS client is not available (HF_TOKEN secret might be missing or invalid). "
            "Speech synthesis will be skipped, but script generation will be attempted."
        )

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results_data: Dict[str, Dict[str, Optional[str]]] = {
        code: {"audio": None, "script_text": None, "script_file": None}
        for code in LANG_INFO.keys()
    }
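    # A fixed slot per language (in LANG_INFO order) keeps the flattened return
    # value below aligned with the gr.Audio/gr.Markdown/gr.File output components,
    # even when the user selects only a subset of languages.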

    try:
        with tempfile.TemporaryDirectory() as td:
            tmpdir_base = Path(td)
            
            gr.Info("Extracting text from PDF...")
            lecture_raw = extract_pdf_text(pdf_file_obj.name)
            lecture_text = truncate_text(lecture_raw)

            if not lecture_text.strip():
                raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")

            # Initialize the Gemini model (after genai.configure() has run above).
            try:
                gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")  # or "gemini-pro"
            except Exception as e:
                raise gr.Error(f"Failed to initialize Gemini model. This might be due to an invalid API key or API access issues. Error: {e}")

            for code in selected_codes:
                info = LANG_INFO[code]
                lang_name = info["name"]
                hf_tts_model_id = info["tts_model"]
                
                gr.Info(f"Processing for {lang_name}...")
                lang_tmpdir = tmpdir_base / code
                lang_tmpdir.mkdir(parents=True, exist_ok=True)
                
                dialogue: Optional[str] = None
                
                gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
                prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
                try:
                    # The gemini_model is initialized using the API key from genai.configure()
                    response = gemini_model.generate_content(prompt_for_gemini)
                    dialogue_raw = response.text
                    
                    if not dialogue_raw or not dialogue_raw.strip():
                        gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
                        continue
                    
                    dialogue = dialogue_raw
                    results_data[code]["script_text"] = dialogue
                    script_file_path = lang_tmpdir / f"podcast_script_{code}.txt"
                    script_file_path.write_text(dialogue, encoding="utf-8")
                    results_data[code]["script_file"] = str(script_file_path)

                except Exception as e:
                    # Abort outright on API key problems; otherwise warn and try the next language.
                    if "API_KEY_INVALID" in str(e) or "permission" in str(e).lower():
                        raise gr.Error(f"Gemini API Key error for {lang_name}: {e}. Please verify your API key and its permissions.")
                    # gr.Error is an exception class; constructing it without raising displays
                    # nothing, so use gr.Warning here to surface the message and continue.
                    gr.Warning(f"Error generating dialogue with Gemini for {lang_name}: {e}")
                    continue

                if dialogue:
                    if hf_tts_client: # Only attempt TTS if client is available
                        gr.Info(f"Synthesizing speech for {lang_name} with Hugging Face TTS ({hf_tts_model_id})...")
                        try:
                           tts_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
                           results_data[code]["audio"] = str(tts_path)
                        except ValueError as e:  # _split_to_chunks_hf produced no speakable chunks
                            gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
                        except RuntimeError as e:  # TTS request or pydub decoding failed
                            # An unraised gr.Error(...) would display nothing; warn and keep the script.
                            gr.Warning(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
                        except Exception as e:  # Any other unexpected synthesis error
                            gr.Warning(f"Unexpected error during speech synthesis for {lang_name}: {e}")
                    else:
                        gr.Info(f"HF TTS client not available. Skipping speech synthesis for {lang_name}.")
        
        final_ordered_results: List[Optional[Any]] = []
        for code_key in LANG_INFO.keys():
            lang_output_data = results_data[code_key]
            final_ordered_results.append(lang_output_data["audio"])
            final_ordered_results.append(lang_output_data["script_text"])
            final_ordered_results.append(lang_output_data["script_file"])
        
        gr.Info("Podcast generation complete!")
        return final_ordered_results

    except gr.Error:  # Re-raise Gradio-specific errors so they display in the UI
        raise
    except Exception as e: # Catch other unexpected errors during the process
        import traceback
        print("An unexpected error occurred in generate_podcast:")
        traceback.print_exc()
        raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")

# ------------------------------------------------------------------
# Gradio Interface Setup
# ------------------------------------------------------------------
language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]

inputs = [
    gr.Textbox(
        label="Enter your Google AI Studio API Key (for Gemini text generation)",
        type="password",
        placeholder="Paste your Gemini API key here",
        # value=os.getenv("GEMINI_API_KEY_FOR_DEV") # Optional: for local dev default, remove for deployment
    ),
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_names_ordered,
        value=["English"], # Default language selection
        label="Select podcast language(s) to generate",
    ),
]

outputs = []
for code in LANG_INFO.keys():
    info = LANG_INFO[code]
    lang_name = info["name"]
    outputs.append(gr.Audio(label=f"{lang_name} Podcast (.flac)", type="filepath"))
    outputs.append(gr.Markdown(label=f"{lang_name} Script"))
    outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))
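
# Note: the component order above (audio, script markdown, script file per
# language, in LANG_INFO order) must match the flattened list returned by
# generate_podcast.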

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture β†’ Podcast & Script (Gemini Text + HF Speech)",
    description=(
        "**SETUP:**\n"
        "1. **Gemini API Key**: Enter your Google AI Studio API Key in the field below for text generation.\n"
        "2. **Hugging Face Token (for Speech)**: For Text-to-Speech, ensure you have a Hugging Face Token. "
        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `HF_TOKEN`. "
        "Paste your Hugging Face token as its value.\n\n"
        "Upload a lecture PDF, choose language(s), and receive an audio podcast "
        "and its script. Dialogue by Google Gemini, speech by Hugging Face MMS-TTS."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    # For local testing of HF_TOKEN, you can set it as an environment variable:
    # os.environ["HF_TOKEN"] = "your_hf_token_here"
    if not os.getenv("HF_TOKEN"):
        print("Reminder: For local testing with TTS, set the HF_TOKEN environment variable.")
    # The Gemini API key will be taken from the UI input.
    # You could add a default value for local testing to the gr.Textbox `value` argument if desired.
    # e.g. value=os.getenv("GEMINI_API_KEY_FOR_DEV")
    
    iface.launch()