File size: 15,657 Bytes
53744b5 f1adb14 fe00684 f1adb14 50d2a40 53744b5 f1adb14 53744b5 f036ad8 f1adb14 53744b5 f1adb14 50d2a40 53744b5 50d2a40 f1adb14 53744b5 f1adb14 c172b12 53744b5 f1adb14 c565171 53744b5 50d2a40 53744b5 c565171 f1adb14 53744b5 50d2a40 c565171 f1adb14 53744b5 f1adb14 c565171 53744b5 c565171 f1adb14 53744b5 fe00684 53744b5 fe00684 53744b5 fe00684 53744b5 fe00684 f036ad8 53744b5 fe00684 f036ad8 53744b5 f036ad8 fe00684 53744b5 c565171 53744b5 c565171 53744b5 f036ad8 53744b5 fe00684 f1adb14 53744b5 f1adb14 53744b5 c565171 53744b5 f1adb14 53744b5 f1adb14 53744b5 c172b12 53744b5 c172b12 9e251c5 53744b5 f1adb14 c172b12 fe00684 53744b5 f1adb14 53744b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 |
# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (Google Gemini & TTS)
# =============================================================
# • **Text generation** – Google Gemini API
# • **Speech synthesis** – Google Cloud Text-to-Speech API
# -----------------------------------------------------------------
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional, Any
import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# Import Google Cloud libraries
try:
import google.generativeai as genai
from google.cloud import texttospeech
except ImportError:
raise ImportError(
"Please install required Google libraries: "
"pip install google-generativeai google-cloud-texttospeech"
)
# ------------------------------------------------------------------
# Supported podcast languages.
# Each key is a short internal language code. Every entry carries the
# display name and the BCP-47 language code that is passed to Google TTS.
# No specific voice (e.g., "en-US-Wavenet-D") is pinned; only the language
# code is sent, so Google picks a standard voice for it.
# NOTE(review): confirm that every tts_lang_code below is actually offered
# by Cloud Text-to-Speech before relying on it.
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {
        "name": "English",
        "tts_lang_code": "en-US",
    },
    "bn": {
        "name": "Bangla",
        "tts_lang_code": "bn-IN",
    },
    "zh": {
        "name": "Chinese (Mandarin)",
        "tts_lang_code": "cmn-CN",  # "cmn" is the code for Mandarin
    },
    "ur": {
        "name": "Urdu",
        "tts_lang_code": "ur-PK",
    },
    "ne": {
        "name": "Nepali",
        "tts_lang_code": "ne-NP",
    },
}

# Reverse lookup: display name -> short internal language code.
LANG_CODE_BY_NAME = {
    meta["name"]: short_code for short_code, meta in LANG_INFO.items()
}
# ------------------------------------------------------------------
# Prompt template (adjust if needed for Gemini's style)
#
# Placeholders filled in with str.format():
#   {lang_name} - display name of the language the dialogue is written in
#   {content}   - the lecture text to be summarised
#
# Everything between the triple quotes is sent to the model, so its
# wording and whitespace are part of the program's behaviour.
# ------------------------------------------------------------------
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two-host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of **approximately 300 words**.
Make it engaging: hosts ask questions, clarify ideas with analogies, and
wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
### Lecture Content
{content}
"""
)
# PDF helpers (unchanged) -------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF, joined by newlines.

    Pages for which no text is extracted contribute an empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The per-page text joined with ``"\\n"``.

    Raises:
        gr.Error: If opening the PDF or extracting text fails for any reason.
    """
    try:
        page_texts = []
        for page in PdfReader(pdf_path).pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
    except Exception as exc:
        raise gr.Error(f"Failed to process PDF: {exc}")
TOKEN_LIMIT = 8000  # Word limit for input text


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cap ``text`` at ``limit`` whitespace-separated words.

    Args:
        text: Input text.
        limit: Maximum number of words to keep.

    Returns:
        ``text`` unchanged when it has at most ``limit`` words; otherwise the
        first ``limit`` words joined by single spaces (a Gradio warning is
        issued in that case).
    """
    tokens = text.split()
    word_count = len(tokens)
    if word_count <= limit:
        # Short enough: hand back the original string, whitespace untouched.
        return text
    gr.Warning(f"Input text was truncated from {word_count} to {limit} words to fit LLM context window.")
    return " ".join(tokens[:limit])
# ------------------------------------------------------------------
# TTS helper β chunk long text (Google TTS has a limit of 5000 bytes per request)
# ------------------------------------------------------------------
CHUNK_CHAR_LIMIT = 1500 # Google TTS limit is 5000 bytes. Characters are safer.
# Average 3 bytes/char for UTF-8, so 1500 chars is ~4500 bytes.
def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
sentences = [s.strip() for s in sentences_raw if s.strip()]
if not sentences: return []
chunks, current_chunk = [], ""
for sent in sentences:
if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
chunks.append(current_chunk)
current_chunk = sent
else:
current_chunk += (" " + sent) if current_chunk else sent
if current_chunk: chunks.append(current_chunk)
return [chunk for chunk in chunks if chunk.strip()]
def synthesize_speech_google(
    text: str,
    google_lang_code: str,
    lang_tmpdir: Path,
    tts_client: texttospeech.TextToSpeechClient,
) -> Path:
    """Synthesize ``text`` with Google TTS and return the combined MP3 path.

    The text is split with ``_split_to_chunks``. Each chunk is sent as one
    synthesis request, written to ``part_<idx>.mp3`` inside ``lang_tmpdir``
    and decoded with pydub. The decoded segments are concatenated in order
    and exported as ``podcast_audio.mp3`` in the same directory.

    Args:
        text: Text to speak.
        google_lang_code: Language code passed to the voice selection.
        lang_tmpdir: Existing directory that receives the part files and the
            final MP3.
        tts_client: Client used to issue the synthesis requests.

    Returns:
        Path of the exported ``podcast_audio.mp3``.

    Raises:
        ValueError: If splitting ``text`` yields no chunks.
        RuntimeError: If a synthesis request fails or a returned MP3 chunk
            cannot be decoded.
    """
    chunks = _split_to_chunks(text)
    if not chunks:
        raise ValueError("Text resulted in no speakable chunks after splitting.")

    # Voice and audio settings are identical for every chunk, so build them
    # once. Only the language code is given (no voice name or gender).
    voice = texttospeech.VoiceSelectionParams(language_code=google_lang_code)
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    total_chunks = len(chunks)
    audio_segments: List[AudioSegment] = []
    for idx, chunk in enumerate(chunks):
        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{total_chunks} with Google TTS...")
        synthesis_input = texttospeech.SynthesisInput(text=chunk)
        try:
            response = tts_client.synthesize_speech(
                input=synthesis_input, voice=voice, audio_config=audio_config
            )
        except Exception as e:
            raise RuntimeError(f"Google TTS request failed for chunk {idx+1}: {e}") from e

        part_path = lang_tmpdir / f"part_{idx}.mp3"
        part_path.write_bytes(response.audio_content)
        try:
            audio_segments.append(AudioSegment.from_mp3(part_path))
        except CouldntDecodeError as e:
            raise RuntimeError(f"Failed to decode MP3 audio chunk {idx+1} from {part_path}. Error: {e}") from e

    # Every chunk either appended a segment or raised above, so the list is
    # non-empty here and can be concatenated directly.
    combined_audio = sum(audio_segments, AudioSegment.empty())
    final_path = lang_tmpdir / "podcast_audio.mp3"
    combined_audio.export(final_path, format="mp3")
    return final_path
# ------------------------------------------------------------------
# Main pipeline function for Gradio
# ------------------------------------------------------------------
def generate_podcast(
    gemini_api_key: Optional[str],
    pdf_file_obj: Optional[gr.File],
    selected_lang_names: List[str],
) -> List[Optional[Any]]:
    """Turn an uploaded lecture PDF into podcast scripts and audio.

    For each selected language the lecture text is sent to Gemini to obtain a
    dialogue, the dialogue is saved as a ``.txt`` script, and it is then
    synthesized to MP3 with Google Cloud Text-to-Speech. A failure for one
    language is reported as a Gradio warning and the remaining languages are
    still processed.

    Text-to-Speech authentication: when the ``GOOGLE_CREDS_JSON_CONTENT``
    environment variable is set, its content is written to a temporary JSON
    file and ``GOOGLE_APPLICATION_CREDENTIALS`` points at that file while
    this call runs. Afterwards the file is deleted and the variable is put
    back to its previous value.

    Generated files are written to a directory created with
    ``tempfile.mkdtemp``. It is deliberately left on disk when the call
    succeeds, because the returned values are paths into it; it is removed
    if the call fails.

    Args:
        gemini_api_key: API key used to configure the Gemini client.
        pdf_file_obj: Uploaded PDF; either an object exposing the file path
            as ``.name`` or the path itself.
        selected_lang_names: Display names of the languages to generate.

    Returns:
        A flat list with three entries per language in ``LANG_INFO`` order:
        audio file path, script text, script file path. Entries are ``None``
        for languages that were not selected or could not be produced.

    Raises:
        gr.Error: If an input is missing, a client cannot be set up, the PDF
            yields no text, or an unexpected error occurs.
    """
    import shutil
    import traceback

    if not gemini_api_key:
        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
    if not pdf_file_obj:
        raise gr.Error("Please upload a PDF file.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language for the podcast.")

    try:
        genai.configure(api_key=gemini_api_key)
    except Exception as e:
        raise gr.Error(f"Failed to configure Gemini API. Check your API key. Error: {e}") from e

    creds_env_var = "GOOGLE_APPLICATION_CREDENTIALS"
    previous_creds_env = os.environ.get(creds_env_var)
    google_creds_json_content = os.getenv("GOOGLE_CREDS_JSON_CONTENT")
    temp_creds_path: Optional[str] = None
    output_dir: Optional[Path] = None
    completed = False

    # Everything that can fail after this point runs inside one try block so
    # the credentials file and a half-filled output directory are always
    # cleaned up in ``finally``.
    try:
        if google_creds_json_content:
            try:
                fd, temp_creds_path = tempfile.mkstemp(suffix=".json")
                with os.fdopen(fd, "w") as tmp:
                    tmp.write(google_creds_json_content)
                os.environ[creds_env_var] = temp_creds_path
                gr.Info("Using GOOGLE_CREDS_JSON_CONTENT secret for Text-to-Speech API authentication.")
            except Exception as e:
                gr.Warning(f"Could not process GOOGLE_CREDS_JSON_CONTENT secret: {e}. TTS might fail.")
        elif not previous_creds_env:
            gr.Warning(
                "GOOGLE_APPLICATION_CREDENTIALS environment variable not set, and no "
                "GOOGLE_CREDS_JSON_CONTENT secret found. "
                "Google Text-to-Speech API calls may fail. "
                "Please set up authentication for Google Cloud Text-to-Speech."
            )

        try:
            tts_client = texttospeech.TextToSpeechClient()
        except Exception as e:
            raise gr.Error(
                f"Failed to initialize Google Text-to-Speech client. Ensure authentication is set up. Error: {e}"
            ) from e

        selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
        results_data: Dict[str, Dict[str, Optional[str]]] = {
            code: {"audio": None, "script_text": None, "script_file": None}
            for code in LANG_INFO
        }

        # A TemporaryDirectory context would delete the generated files as
        # soon as this function returns, leaving dangling paths in the
        # result. mkdtemp keeps them on disk.
        output_dir = Path(tempfile.mkdtemp(prefix="podcast_"))

        gr.Info("Extracting text from PDF...")
        # Accept both an upload object with ``.name`` and a plain path.
        pdf_path = str(getattr(pdf_file_obj, "name", pdf_file_obj))
        lecture_raw = extract_pdf_text(pdf_path)
        lecture_text = truncate_text(lecture_raw)
        if not lecture_text.strip():
            raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")

        # Choose a model appropriate for your task and quota
        # (e.g., 'gemini-1.5-flash' or 'gemini-pro').
        gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

        for code in selected_codes:
            info = LANG_INFO[code]
            lang_name = info["name"]
            google_tts_lang = info["tts_lang_code"]
            gr.Info(f"Processing for {lang_name}...")
            lang_tmpdir = output_dir / code
            lang_tmpdir.mkdir(parents=True, exist_ok=True)

            gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
            prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
            try:
                response = gemini_model.generate_content(prompt_for_gemini)
                dialogue = response.text
                if not dialogue or not dialogue.strip():
                    gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
                    continue
                results_data[code]["script_text"] = dialogue
                script_file_path = lang_tmpdir / f"podcast_script_{code}.txt"
                script_file_path.write_text(dialogue, encoding="utf-8")
                results_data[code]["script_file"] = str(script_file_path)
            except Exception as e:
                # gr.Error only has an effect when raised; a warning tells the
                # user about the failure while the other languages continue.
                gr.Warning(f"Error generating dialogue with Gemini for {lang_name}: {e}")
                continue

            gr.Info(f"Synthesizing speech for {lang_name} with Google TTS...")
            try:
                tts_path = synthesize_speech_google(dialogue, google_tts_lang, lang_tmpdir, tts_client)
                results_data[code]["audio"] = str(tts_path)
            except ValueError as e:
                gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
            except RuntimeError as e:
                gr.Warning(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
            except Exception as e:
                gr.Warning(f"Unexpected error during speech synthesis for {lang_name}: {e}")

        # Flatten to audio, script text, script file per language.
        final_ordered_results: List[Optional[Any]] = []
        for lang_output_data in results_data.values():
            final_ordered_results.extend(
                [
                    lang_output_data["audio"],
                    lang_output_data["script_text"],
                    lang_output_data["script_file"],
                ]
            )
        gr.Info("Podcast generation complete!")
        completed = True
        return final_ordered_results
    except gr.Error:
        raise
    except Exception as e:
        print("An unexpected error occurred in generate_podcast:")
        traceback.print_exc()
        raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...") from e
    finally:
        if temp_creds_path is not None:
            # Remove the temporary credentials file if it was created.
            try:
                os.unlink(temp_creds_path)
            except FileNotFoundError:
                pass
            except Exception as e_clean:
                print(f"Warning: Could not clean up temporary credentials file {temp_creds_path}: {e_clean}")
            # Do not leave the variable pointing at the deleted file.
            if os.environ.get(creds_env_var) == temp_creds_path:
                if previous_creds_env is None:
                    os.environ.pop(creds_env_var, None)
                else:
                    os.environ[creds_env_var] = previous_creds_env
        if output_dir is not None and not completed:
            # Nothing is returned on failure, so the partial output is useless.
            shutil.rmtree(output_dir, ignore_errors=True)
# ------------------------------------------------------------------
# Gradio Interface Setup
# ------------------------------------------------------------------
# Display names in LANG_INFO order; used as the checkbox choices.
language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]

# Input components, in the positional order of generate_podcast's parameters.
inputs = [
    gr.Textbox(
        label="Enter your Google AI Studio API Key (for Gemini)",
        type="password",
        placeholder="Paste your API key here",
    ),
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_names_ordered,
        value=["English"],
        label="Select podcast language(s) to generate",
    ),
]

# Three output components per language (audio, script, script download),
# in LANG_INFO order, matching the flat list generate_podcast returns.
outputs = []
for code in LANG_INFO.keys():
    info = LANG_INFO[code]
    lang_name = info["name"]
    outputs.append(gr.Audio(label=f"{lang_name} Podcast (.mp3)", type="filepath"))
    outputs.append(gr.Markdown(label=f"{lang_name} Script"))
    outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    # The arrow replaces a mis-encoded character in the title.
    title="Lecture → Podcast & Script (Google Gemini & TTS)",
    description=(
        "**IMPORTANT SETUP:**\n"
        "1. Enter your Google AI Studio API Key for Gemini text generation.\n"
        "2. For Text-to-Speech: Enable the 'Cloud Text-to-Speech API' in your Google Cloud Project. "
        "Create a service account with 'Cloud Text-to-Speech API User' role, download its JSON key. "
        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `GOOGLE_CREDS_JSON_CONTENT`. "
        "Paste the *entire content* of your service account JSON key file as the value for this secret.\n\n"
        "Upload a lecture PDF, choose language(s), and receive an audio podcast "
        "and its script. Dialogue by Google Gemini, speech by Google Cloud TTS."
    ),
    allow_flagging="never",
)
if __name__ == "__main__":
    # Make sure GOOGLE_CREDS_JSON_CONTENT is available as an environment variable
    # or GOOGLE_APPLICATION_CREDENTIALS is set correctly if running locally for testing.
    # For local testing with a service account key file:
    # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/your/service-account-file.json"
    iface.launch()