Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

PodCastIt / app.py

HaiderAUT

Update app.py

d4adc2b verified about 1 month ago

raw

history blame

5.97 kB

	# =============================================================
	# Lecture → Podcast & Script Generator (English Only)
	# • Text: Google Gemini API (via UI-provided key)
	# • Audio: Hugging Face InferenceClient.text_to_speech (public MMS-TTS for English)
	# =============================================================

	import os
	import re
	import tempfile
	import textwrap
	from pathlib import Path
	from typing import List, Optional, Any

	import gradio as gr
	from PyPDF2 import PdfReader
	from pydub import AudioSegment
	from pydub.exceptions import CouldntDecodeError

	# Hugging Face TTS client (anonymous/public access)
	from huggingface_hub import InferenceClient

	# Google Gemini SDK
	try:
	import google.generativeai as genai
	except ImportError:
	raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")

	# ------------------------------------------------------------------
	# Globals & templates
	# ------------------------------------------------------------------
	# Prompt sent to Gemini. The `{content}` placeholder is filled in with
	# str.format() using the (truncated) lecture text; textwrap.dedent strips
	# the common leading whitespace of the literal's lines.
	PROMPT_TEMPLATE = textwrap.dedent(
	"""
	You are producing a lively two-host educational podcast in English.
	Summarize the following lecture content into a dialogue of approximately 300 words.
	Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
	Preserve technical accuracy. Use Markdown for host names (e.g., Host 1:).

	### Lecture Content
	{content}
	"""
	)

	# Model id handed to the TTS client for every synthesized chunk.
	HF_TTS_MODEL = "facebook/mms-tts-eng"
	# Default per-chunk character budget used by split_to_chunks().
	CHUNK_CHAR_LIMIT = 280

	# Shared inference client, constructed without arguments (no token or
	# model is passed here); the model is chosen per call instead.
	tts_client = InferenceClient()

	# ------------------------------------------------------------------
	# Helpers
	# ------------------------------------------------------------------
	def extract_pdf_text(pdf_path: str) -> str:
	    """Return the text of every page of the PDF, joined with newlines.

	    A page whose ``extract_text()`` yields a falsy value (``None`` or an
	    empty string) contributes an empty string to the result.
	    """
	    page_texts = []
	    for page in PdfReader(pdf_path).pages:
	        extracted = page.extract_text()
	        page_texts.append(extracted if extracted else "")
	    return "\n".join(page_texts)

	def truncate_text(text: str, max_words: int = 8000) -> str:
	    """Keep at most ``max_words`` whitespace-separated words of ``text``.

	    The surviving words are re-joined with single spaces, so every run of
	    whitespace in the input (newlines, tabs, repeated spaces) collapses to
	    one space and leading/trailing whitespace disappears.
	    """
	    tokens = text.split()
	    # Drop everything past the word budget in place.
	    del tokens[max_words:]
	    return " ".join(tokens)

	def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
	    """Split ``text`` into chunks of at most ``limit`` characters.

	    The text is first cut into sentences after ``.``, ``!`` or ``?``.
	    Sentences are then packed greedily, joined by single spaces, into
	    chunks that stay within ``limit``.

	    A sentence that is itself longer than ``limit`` is broken up on word
	    boundaries, and a single word longer than ``limit`` is sliced into
	    ``limit``-sized pieces, so no returned chunk ever exceeds ``limit``.
	    When every sentence already fits, the result is the same as plain
	    sentence packing.

	    Args:
	        text: The text to split; may be empty.
	        limit: Maximum chunk length in characters; must be at least 1.

	    Returns:
	        The chunks in reading order; an empty list for blank input.

	    Raises:
	        ValueError: If ``limit`` is smaller than 1.
	    """
	    if limit < 1:
	        raise ValueError("limit must be a positive integer.")

	    # Step 1: produce pieces that individually fit into one chunk.
	    pieces: List[str] = []
	    for raw_sentence in re.split(r"(?<=[.!?])\s+", text):
	        sentence = raw_sentence.strip()
	        if not sentence:
	            continue
	        if len(sentence) <= limit:
	            pieces.append(sentence)
	            continue
	        # Oversized sentence: fall back to words, hard-slicing any word
	        # that is still too long on its own.
	        for word in sentence.split():
	            pieces.extend(
	                word[start:start + limit]
	                for start in range(0, len(word), limit)
	            )

	    # Step 2: greedily pack the pieces; "+ 1" accounts for the joining space.
	    chunks: List[str] = []
	    current = ""
	    for piece in pieces:
	        if current and len(current) + len(piece) + 1 > limit:
	            chunks.append(current)
	            current = piece
	        else:
	            current = f"{current} {piece}" if current else piece
	    if current:
	        chunks.append(current)
	    return chunks

	def synthesize_speech(text: str, model_id: str, out_dir: Path) -> Path:
	    """Synthesize ``text`` chunk by chunk and return one combined audio file.

	    The text is split with ``split_to_chunks``; each chunk is sent to the
	    shared ``tts_client``, written to ``out_dir`` as ``seg_<i>.flac``,
	    decoded, and finally all segments are concatenated and exported as
	    ``podcast_audio.flac`` in the same directory.

	    Args:
	        text: The script to speak.
	        model_id: Model identifier forwarded to ``text_to_speech``.
	        out_dir: Directory for the per-chunk and combined files; it is
	            created if it does not exist yet.

	    Returns:
	        Path of the combined ``podcast_audio.flac`` file.

	    Raises:
	        ValueError: If ``text`` yields no chunks.
	        RuntimeError: If a TTS request fails or a segment cannot be
	            decoded; the original exception is attached as the cause.
	    """
	    chunks = split_to_chunks(text)
	    if not chunks:
	        raise ValueError("No text to synthesize.")

	    # Accept plain string paths too and tolerate a missing directory.
	    out_dir = Path(out_dir)
	    out_dir.mkdir(parents=True, exist_ok=True)

	    segments = []
	    for i, chunk in enumerate(chunks):
	        try:
	            audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
	        except Exception as e:
	            raise RuntimeError(f"TTS failed on chunk {i+1}: {e}") from e

	        # NOTE(review): the bytes are stored and decoded as FLAC; this
	        # assumes the endpoint returns FLAC-encoded audio -- confirm.
	        part_path = out_dir / f"seg_{i}.flac"
	        part_path.write_bytes(audio_bytes)
	        try:
	            seg = AudioSegment.from_file(part_path, format="flac")
	        except CouldntDecodeError as e:
	            raise RuntimeError(f"Could not decode segment {i+1}: {e}") from e
	        segments.append(seg)

	    final_audio = sum(segments, AudioSegment.empty())
	    out_path = out_dir / "podcast_audio.flac"
	    final_audio.export(out_path, format="flac")
	    return out_path

	# ------------------------------------------------------------------
	# Main pipeline
	# ------------------------------------------------------------------
	def generate_podcast(
	    gemini_api_key: Optional[str],
	    lecture_pdf: Optional[gr.File]
	) -> List[Optional[Any]]:
	    """Turn an uploaded lecture PDF into a podcast script and audio file.

	    Steps: validate the inputs, extract and truncate the PDF text, ask
	    Gemini for a dialogue script, write the script to a text file, and
	    synthesize the script to audio.

	    Args:
	        gemini_api_key: Google AI Studio API key entered in the UI.
	        lecture_pdf: The uploaded PDF, either as a filesystem path or as
	            a file-like wrapper exposing the path via ``.name``.

	    Returns:
	        ``[audio_path, script_markdown, script_txt_path]`` in the order
	        of the interface's output components.

	    Raises:
	        gr.Error: On missing inputs, unreadable or empty PDFs, Gemini
	            failures, or speech synthesis failures.
	    """
	    import shutil  # local import: only needed for failure cleanup below

	    if not gemini_api_key:
	        raise gr.Error("Enter your Google AI Studio API Key.")
	    if not lecture_pdf:
	        raise gr.Error("Upload a lecture PDF file.")

	    genai.configure(api_key=gemini_api_key)

	    # The upload may arrive as a path (str / PathLike) or as a temp-file
	    # wrapper whose `.name` holds the path; support both.
	    if isinstance(lecture_pdf, (str, os.PathLike)):
	        pdf_path = os.fspath(lecture_pdf)
	    else:
	        pdf_path = lecture_pdf.name

	    try:
	        raw = extract_pdf_text(pdf_path)
	    except Exception as e:
	        raise gr.Error(f"Could not read the lecture PDF: {e}") from e
	    content = truncate_text(raw)
	    if not content.strip():
	        raise gr.Error("Lecture PDF contained no extractable text.")

	    try:
	        gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")
	    except Exception as e:
	        raise gr.Error(f"Gemini init failed: {e}") from e

	    prompt = PROMPT_TEMPLATE.format(content=content)
	    try:
	        resp = gemini_model.generate_content(prompt)
	        script = resp.text or ""
	    except Exception as e:
	        raise gr.Error(f"Gemini generation error: {e}") from e

	    # The returned paths are read by Gradio only AFTER this function
	    # returns, so the directory must outlive the call. A
	    # TemporaryDirectory context would delete the files on return;
	    # mkdtemp() leaves them in place.
	    tmp = Path(tempfile.mkdtemp(prefix="podcast_"))
	    try:
	        # Save script file
	        script_path = tmp / "podcast_script.txt"
	        script_path.write_text(script, encoding="utf-8")
	        # Synthesize audio
	        try:
	            audio_path = synthesize_speech(script, HF_TTS_MODEL, tmp)
	        except Exception as e:
	            raise gr.Error(f"Speech synthesis error: {e}") from e
	    except BaseException:
	        # Nothing is handed to the UI on failure, so remove the directory.
	        shutil.rmtree(tmp, ignore_errors=True)
	        raise

	    # Return [audio, markdown script, txt file]
	    return [str(audio_path), script, str(script_path)]

	# ------------------------------------------------------------------
	# Gradio Interface
	# ------------------------------------------------------------------
	# Single-page UI: API key + PDF in, audio + rendered script + script file out.
	iface = gr.Interface(
	    title="Lecture → English Podcast & Script",
	    description=(
	        "Enter your Gemini API Key and upload a lecture PDF. "
	        "Generates a two-host podcast audio and a Markdown script in English "
	        "using Google Gemini for text and Hugging Face MMS-TTS for audio."
	    ),
	    fn=generate_podcast,
	    # Input order matches generate_podcast(gemini_api_key, lecture_pdf).
	    inputs=[
	        gr.Textbox(
	            label="Google Gemini API Key",
	            type="password",
	            placeholder="Paste your key here",
	        ),
	        gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
	    ],
	    # Output order matches the list returned by generate_podcast:
	    # [audio path, script markdown, script .txt path].
	    outputs=[
	        gr.Audio(label="English Podcast", type="filepath"),
	        gr.Markdown(label="English Script"),  # renders the script
	        gr.File(label="Download English Script (.txt)", type="filepath"),
	    ],
	    allow_flagging="never",
	)

	# Start the web server only when executed as a script, not on import.
	if __name__ == "__main__":
	    iface.launch()