# =============================================================
# Lecture → Podcast & Script Generator (Gemini + HF TTS)
# Modified: Script outputs rendered as HTML
# =============================================================
import os
import re
import html
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional, Any
import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# Hugging Face TTS
from huggingface_hub import InferenceClient
# Google Gemini
import google.generativeai as genai
# ------------------------------------------------------------------
# HF TTS client
# ------------------------------------------------------------------
hf_token = os.getenv("HF_TOKEN")
hf_tts_client: Optional[InferenceClient] = InferenceClient(token=hf_token) if hf_token else None
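# If HF_TOKEN is unset, hf_tts_client is None; the app still generates scripts,
# but audio synthesis is skipped in the pipeline below.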
# Language metadata
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
# Prompt template
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **approximately 300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.

    ### Lecture Content
    {content}
    """
)
# PDF extraction
TOKEN_LIMIT = 8000
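# Rough cap on the amount of lecture text sent to Gemini; truncate_text counts
# whitespace-separated words rather than model tokens.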
def extract_pdf_text(path: str) -> str:
    """Concatenate the text of every page in the PDF (empty string for image-only pages)."""
    reader = PdfReader(path)
    return "\n".join(p.extract_text() or "" for p in reader.pages)

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Trim the text to at most `limit` whitespace-separated words."""
    words = text.split()
    return " ".join(words[:limit]) if len(words) > limit else text
# TTS chunking
CHUNK_CHAR_LIMIT = 280
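# Keep each TTS request short; longer scripts are split at sentence boundaries below.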
def split_chunks(text: str) -> List[str]:
    """Split text into sentence-aligned chunks of at most CHUNK_CHAR_LIMIT characters."""
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    chunks, curr = [], ""
    for s in sentences:
        if curr and len(curr) + len(s) + 1 > CHUNK_CHAR_LIMIT:
            chunks.append(curr)
            curr = s
        else:
            curr = f"{curr} {s}" if curr else s
    if curr:
        chunks.append(curr)
    return chunks
# Synthesize speech
def synthesize(text: str, model_id: str, outdir: Path) -> str:
    """Convert text to speech chunk by chunk and stitch the parts into one FLAC file."""
    outdir.mkdir(parents=True, exist_ok=True)
    segments = []
    for i, chunk in enumerate(split_chunks(text)):
        audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
        path = outdir / f"part{i}.flac"
        path.write_bytes(audio_bytes)
        try:
            segments.append(AudioSegment.from_file(path, format="flac"))
        except CouldntDecodeError:
            # Skip chunks the TTS endpoint returned in an unexpected format.
            continue
    final = sum(segments, AudioSegment.empty())
    out = outdir / "podcast.flac"
    final.export(out, format="flac")
    return str(out)
# Main pipeline
def generate_podcast(
    gemini_key: str,
    pdf_file: gr.File,
    langs: List[str],
) -> List[Optional[Any]]:
    if not gemini_key:
        raise gr.Error("Enter Google AI Studio API Key.")
    if not pdf_file:
        raise gr.Error("Upload a PDF file.")
    if not langs:
        raise gr.Error("Select at least one language.")

    genai.configure(api_key=gemini_key)
    # gr.File may hand back a tempfile-like object or a plain path, depending on the Gradio version.
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
    raw = extract_pdf_text(pdf_path)
    content = truncate_text(raw)
    tmp = Path(tempfile.mkdtemp())

    results = []
    for code, info in LANG_INFO.items():
        if info["name"] not in langs:
            # Keep the output slots aligned with the fixed per-language components.
            results.extend([None, None, None])
            continue

        # Generate script
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
        model = genai.GenerativeModel("gemini-1.5-flash-latest")
        resp = model.generate_content(prompt)
        script = resp.text.strip()

        # Save plain text
        script_path = tmp / f"script_{code}.txt"
        script_path.write_text(script, encoding="utf-8")

        # Render HTML version (escape the script so markup characters display literally)
        html_script = f"<pre>{html.escape(script)}</pre>"

        # Synthesize audio if a TTS client is available
        audio_path = None
        if hf_tts_client:
            audio_path = synthesize(script, info["tts_model"], tmp / code)

        results.extend([audio_path, html_script, str(script_path)])
    return results
# Interface
inputs = [
    gr.Textbox(label="Google AI Studio API Key", type="password"),
    gr.File(label="Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=[info["name"] for info in LANG_INFO.values()],
        value=["English"],
        label="Languages",
    ),
]
outputs = []
for code, info in LANG_INFO.items():
    outputs.append(gr.Audio(label=f"{info['name']} Podcast", type="filepath"))
    outputs.append(gr.HTML(label=f"{info['name']} Script HTML"))
    outputs.append(gr.File(label=f"Download {info['name']} Script"))
iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script",
)
if __name__ == "__main__":
    iface.launch()