# PodCastIt / app.py
# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
# =============================================================
# • **Text generation** – Google Gemini API (via user-provided genai API Key)
# • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
# -----------------------------------------------------------------
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional, Any
import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# For Hugging Face TTS
from huggingface_hub import InferenceClient
# For Google Gemini
try:
import google.generativeai as genai
except ImportError:
raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
# ------------------------------------------------------------------
# Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
# ------------------------------------------------------------------
# Build the shared TTS client once at import time; it stays None when the
# HF_TOKEN secret is missing, and callers must check for that.
hf_token = os.getenv("HF_TOKEN")
hf_tts_client: Optional[InferenceClient] = (
    InferenceClient(token=hf_token) if hf_token else None
)
if hf_tts_client is None:
    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
# ------------------------------------------------------------------
# Language metadata for Hugging Face MMS-TTS models
# ------------------------------------------------------------------
# Supported languages: ISO-639-1 code -> display name + Facebook MMS-TTS checkpoint.
_LANG_ROWS = (
    ("en", "English", "facebook/mms-tts-eng"),
    ("bn", "Bangla", "facebook/mms-tts-ben"),
    ("zh", "Chinese", "facebook/mms-tts-zho"),
    ("ur", "Urdu", "facebook/mms-tts-urd"),
    ("ne", "Nepali", "facebook/mms-tts-npi"),
)
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": name, "tts_model": model} for code, name, model in _LANG_ROWS
}
# Reverse lookup: display name -> language code (used by the CheckboxGroup handler).
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
# ------------------------------------------------------------------
# Prompt template for Gemini
# ------------------------------------------------------------------
PROMPT_TEMPLATE = textwrap.dedent(
"""
You are producing a lively two-host educational podcast in {lang_name}.
Summarize the following lecture content into a dialogue of **approximately 300 words**.
Make it engaging: hosts ask questions, clarify ideas with analogies, and
wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
### Lecture Content
{content}
"""
)
# PDF helpers (unchanged) -------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, newline-joined.

    Pages with no extractable text contribute an empty string.  Any failure
    (unreadable file, malformed PDF) is surfaced to the UI as a gr.Error.
    """
    try:
        pages = PdfReader(pdf_path).pages
        return "\n".join(page.extract_text() or "" for page in pages)
    except Exception as e:
        raise gr.Error(f"Failed to process PDF: {e}")
# Rough word budget that keeps the prompt inside the LLM context window.
TOKEN_LIMIT = 8000

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Clip *text* to at most *limit* whitespace-separated words.

    Emits a gr.Warning in the UI when clipping actually occurs; otherwise
    returns *text* unchanged.
    """
    words = text.split()
    if len(words) <= limit:
        return text
    gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit LLM context window.")
    return " ".join(words[:limit])
# ------------------------------------------------------------------
# TTS helper using Hugging Face Inference API
# ------------------------------------------------------------------
CHUNK_CHAR_LIMIT_HF = 280
def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
sentences = [s.strip() for s in sentences_raw if s.strip()]
chunks, current_chunk = [], ""
for sent in sentences:
if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
chunks.append(current_chunk)
current_chunk = sent
else:
current_chunk += (" " + sent) if current_chunk else sent
if current_chunk:
chunks.append(current_chunk)
return [chunk for chunk in chunks if chunk.strip()]
def synthesize_speech_hf(
    text: str,
    hf_model_id: str,
    lang_tmpdir: Path,
    tts_client: InferenceClient,
) -> Path:
    """Synthesize *text* with the HF Inference API and return one FLAC file.

    The text is chunked to fit the TTS request limit; each chunk is written
    to ``part_<idx>.flac`` under *lang_tmpdir*, decoded, and appended to a
    running AudioSegment that is finally exported as ``podcast_audio.flac``.

    Raises ValueError for empty input and RuntimeError when a chunk fails
    to synthesize or decode.
    """
    chunks = _split_to_chunks_hf(text)
    if not chunks:
        raise ValueError("Text resulted in no speakable chunks after splitting.")

    combined = AudioSegment.empty()
    for idx, chunk in enumerate(chunks):
        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
        try:
            audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
        except Exception as e:
            raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e

        part_path = lang_tmpdir / f"part_{idx}.flac"
        part_path.write_bytes(audio_bytes)
        try:
            combined += AudioSegment.from_file(part_path, format="flac")
        except CouldntDecodeError as e:
            raise RuntimeError(f"Failed to decode audio chunk {idx+1}: {e}") from e

    final_path = lang_tmpdir / "podcast_audio.flac"
    combined.export(final_path, format="flac")
    return final_path
# ------------------------------------------------------------------
# Main pipeline function for Gradio
# ------------------------------------------------------------------
def generate_podcast(
    gemini_api_key_from_ui: Optional[str],
    pdf_file_obj: Optional[gr.File],
    selected_lang_names: List[str],
) -> List[Optional[Any]]:
    """Full pipeline: lecture PDF -> Gemini dialogue script -> HF TTS audio.

    Returns a flat list of (audio path, script markdown, script file path)
    triples, one triple per language in LANG_INFO order.  Slots for
    languages that were not selected, or whose TTS failed, are None.

    Raises gr.Error for invalid input, Gemini configuration/generation
    failures, or an empty PDF.
    """
    # --- input validation -------------------------------------------------
    if not gemini_api_key_from_ui:
        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
    if not pdf_file_obj:
        raise gr.Error("Please upload a PDF file.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")
    try:
        genai.configure(api_key=gemini_api_key_from_ui)
    except Exception as e:
        raise gr.Error(f"Failed to configure Gemini API: {e}")
    if not hf_tts_client:
        gr.Warning("HF TTS unavailable; only script will be generated.")

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    # One slot per *known* language so the flattened output always lines up
    # with the Gradio output components built from LANG_INFO.
    results_data: Dict[str, Dict[str, Optional[str]]] = {
        code: {"audio": None, "script_md": None, "script_file": None}
        for code in LANG_INFO
    }

    # BUG FIX: the original used a TemporaryDirectory context manager, which
    # deleted the generated audio/script files on return — before Gradio
    # could read the file paths handed back to it.  Use mkdtemp instead and
    # leave cleanup to the OS temp reaper.
    tmpdir_base = Path(tempfile.mkdtemp(prefix="podcast_"))

    lecture_text = truncate_text(extract_pdf_text(pdf_file_obj.name))
    if not lecture_text.strip():
        raise gr.Error("Extracted PDF text is empty.")

    gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

    for code in selected_codes:
        info = LANG_INFO[code]
        lang_name = info["name"]
        lang_tmpdir = tmpdir_base / code
        lang_tmpdir.mkdir(parents=True, exist_ok=True)

        # 1️⃣ Generate the dialogue script via Gemini.
        prompt = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
        try:
            resp = gemini_model.generate_content(prompt)
            dialogue = resp.text or ""
        except Exception as e:
            raise gr.Error(f"Gemini error for {lang_name}: {e}")
        if not dialogue:
            continue

        # Store the Markdown script and write the downloadable .txt copy.
        results_data[code]["script_md"] = dialogue
        script_path = lang_tmpdir / f"podcast_script_{code}.txt"
        script_path.write_text(dialogue, encoding="utf-8")
        results_data[code]["script_file"] = str(script_path)

        # 2️⃣ Synthesize audio via HF TTS (best-effort: script still returned).
        if hf_tts_client:
            try:
                audio_path = synthesize_speech_hf(
                    dialogue, info["tts_model"], lang_tmpdir, hf_tts_client
                )
                results_data[code]["audio"] = str(audio_path)
            except Exception as e:
                # BUG FIX: the original called gr.Error(...) without raising,
                # which constructs an exception object and discards it — the
                # failure was never shown.  Surface it as a non-fatal warning.
                gr.Warning(f"TTS error for {lang_name}: {e}")

    # Flatten as (Audio, Markdown, File) per language, in LANG_INFO order.
    final_outputs: List[Optional[Any]] = []
    for code in LANG_INFO:
        out = results_data[code]
        final_outputs.extend([out["audio"], out["script_md"], out["script_file"]])
    return final_outputs
# ------------------------------------------------------------------
# Gradio Interface Setup
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# Gradio Interface Setup
# ------------------------------------------------------------------
language_names_ordered = [info["name"] for info in LANG_INFO.values()]

inputs = [
    gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Paste your key here"),
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(choices=language_names_ordered, value=["English"], label="Select language(s)"),
]

# Three output components per language, in the exact order that
# generate_podcast flattens its results.
outputs = []
for info in LANG_INFO.values():
    lang_name = info["name"]
    outputs += [
        gr.Audio(label=f"{lang_name} Podcast", type="filepath"),
        gr.Markdown(label=f"{lang_name} Script"),
        gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"),
    ]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script",
    description=(
        "Enter your Gemini API Key, upload a lecture PDF, choose language(s), "
        "and get a two-host podcast (audio) plus the Markdown script & downloadable text."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    if not os.getenv("HF_TOKEN"):
        print("Reminder: set HF_TOKEN in Secrets for TTS to work.")
    iface.launch()