Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

HaiderAUT committed on May 8

Commit

617d576

verified ·

1 Parent(s): 4c19533

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -180

app.py CHANGED Viewed

@@ -1,243 +1,172 @@
 # =============================================================
-# Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 # =============================================================
-# • **Text generation** – Google Gemini API (via user-provided genai API Key)
-# • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
-# -----------------------------------------------------------------
 import os
 import re
 import tempfile
 import textwrap
 from pathlib import Path
-from typing import List, Dict, Optional, Any
 import gradio as gr
 from PyPDF2 import PdfReader
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
-# For Hugging Face TTS
 from huggingface_hub import InferenceClient
-# For Google Gemini
 try:
     import google.generativeai as genai
 except ImportError:
     raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
 # ------------------------------------------------------------------
-# Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
-# ------------------------------------------------------------------
-hf_tts_client: Optional[InferenceClient] = None
-hf_token = os.getenv("HF_TOKEN")
-if hf_token:
-    hf_tts_client = InferenceClient(token=hf_token)
-else:
-    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
-# ------------------------------------------------------------------
-# Language metadata for Hugging Face MMS-TTS models
-# ------------------------------------------------------------------
-LANG_INFO: Dict[str, Dict[str, str]] = {
-    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
-    "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
-    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
-    "ur": {"name": "Urdu",    "tts_model": "facebook/mms-tts-urd"},
-    "ne": {"name": "Nepali",  "tts_model": "facebook/mms-tts-npi"},
-}
-LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
-# ------------------------------------------------------------------
-# Prompt template for Gemini
 # ------------------------------------------------------------------
 PROMPT_TEMPLATE = textwrap.dedent(
     """
-    You are producing a lively two-host educational podcast in {lang_name}.
-    Summarize the following lecture content into a dialogue of **approximately 300 words**.
-    Make it engaging: hosts ask questions, clarify ideas with analogies, and
-    wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
     ### Lecture Content
     {content}
     """
 )
-# PDF helpers (unchanged) -------------------------------------------
-def extract_pdf_text(pdf_path: str) -> str:
-    try:
-        reader = PdfReader(pdf_path)
-        return "\n".join(page.extract_text() or "" for page in reader.pages)
-    except Exception as e:
-        raise gr.Error(f"Failed to process PDF: {e}")
-TOKEN_LIMIT = 8000
-def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
-    words = text.split()
-    if len(words) > limit:
-        gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit LLM context window.")
-        return " ".join(words[:limit])
-    return text
 # ------------------------------------------------------------------
-# TTS helper using Hugging Face Inference API
 # ------------------------------------------------------------------
-CHUNK_CHAR_LIMIT_HF = 280
-def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
-    sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
-    sentences = [s.strip() for s in sentences_raw if s.strip()]
-    chunks, current_chunk = [], ""
     for sent in sentences:
-        if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
-            chunks.append(current_chunk)
-            current_chunk = sent
         else:
-            current_chunk += (" " + sent) if current_chunk else sent
-    if current_chunk:
-        chunks.append(current_chunk)
-    return [chunk for chunk in chunks if chunk.strip()]
-def synthesize_speech_hf(
-    text: str,
-    hf_model_id: str,
-    lang_tmpdir: Path,
-    tts_client: InferenceClient
-) -> Path:
-    chunks = _split_to_chunks_hf(text)
     if not chunks:
-        raise ValueError("Text resulted in no speakable chunks after splitting.")
-    audio_segments: List[AudioSegment] = []
-    for idx, chunk in enumerate(chunks):
-        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
         try:
-            audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
         except Exception as e:
-            raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e
-        part_path = lang_tmpdir / f"part_{idx}.flac"
         part_path.write_bytes(audio_bytes)
         try:
-            segment = AudioSegment.from_file(part_path, format="flac")
-            audio_segments.append(segment)
         except CouldntDecodeError as e:
-            raise RuntimeError(f"Failed to decode audio chunk {idx+1}: {e}") from e
-    combined_audio = sum(audio_segments, AudioSegment.empty())
-    final_path = lang_tmpdir / "podcast_audio.flac"
-    combined_audio.export(final_path, format="flac")
-    return final_path
 # ------------------------------------------------------------------
-# Main pipeline function for Gradio
 # ------------------------------------------------------------------
 def generate_podcast(
-    gemini_api_key_from_ui: Optional[str],
-    pdf_file_obj: Optional[gr.File],
-    selected_lang_names: List[str]
 ) -> List[Optional[Any]]:
-    if not gemini_api_key_from_ui:
-        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
-    if not pdf_file_obj:
-        raise gr.Error("Please upload a PDF file.")
-    if not selected_lang_names:
-        raise gr.Error("Please select at least one language.")
     try:
-        genai.configure(api_key=gemini_api_key_from_ui)
     except Exception as e:
-        raise gr.Error(f"Failed to configure Gemini API: {e}")
-    if not hf_tts_client:
-        gr.Warning("HF TTS unavailable; only script will be generated.")
-    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
-    results_data = {
-        code: {"audio": None, "script_md": None, "script_file": None}
-        for code in LANG_INFO.keys()
-    }
     with tempfile.TemporaryDirectory() as td:
-        tmpdir_base = Path(td)
-        lecture_raw = extract_pdf_text(pdf_file_obj.name)
-        lecture_text = truncate_text(lecture_raw)
-        if not lecture_text.strip():
-            raise gr.Error("Extracted PDF text is empty.")
-        gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')
-        for code in selected_codes:
-            info = LANG_INFO[code]
-            lang_name = info["name"]
-            hf_tts_model_id = info["tts_model"]
-            lang_tmpdir = tmpdir_base / code
-            lang_tmpdir.mkdir(parents=True, exist_ok=True)
-            # 1️⃣ Generate script via Gemini
-            prompt = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
-            try:
-                resp = gemini_model.generate_content(prompt)
-                dialogue = resp.text or ""
-            except Exception as e:
-                raise gr.Error(f"Gemini error for {lang_name}: {e}")
-            if dialogue:
-                # store Markdown script
-                results_data[code]["script_md"] = dialogue
-                # write .txt file
-                script_path = lang_tmpdir / f"podcast_script_{code}.txt"
-                script_path.write_text(dialogue, encoding="utf-8")
-                results_data[code]["script_file"] = str(script_path)
-                # 2️⃣ Synthesize audio via HF TTS
-                if hf_tts_client:
-                    try:
-                        audio_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
-                        results_data[code]["audio"] = str(audio_path)
-                    except Exception as e:
-                        gr.Error(f"TTS error for {lang_name}: {e}")
-    # assemble outputs in the order: Audio, Markdown, File for each language
-    final_outputs: List[Optional[Any]] = []
-    for code in LANG_INFO.keys():
-        out = results_data[code]
-        final_outputs.extend([ out["audio"], out["script_md"], out["script_file"] ])
-    return final_outputs
 # ------------------------------------------------------------------
-# Gradio Interface Setup
 # ------------------------------------------------------------------
-language_names_ordered = [info["name"] for info in LANG_INFO.values()]
-inputs = [
-    gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Paste your key here"),
-    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
-    gr.CheckboxGroup(choices=language_names_ordered, value=["English"], label="Select language(s)"),
-]
-outputs = []
-for code in LANG_INFO.keys():
-    lang_name = LANG_INFO[code]["name"]
-    outputs.append(gr.Audio(label=f"{lang_name} Podcast", type="filepath"))
-    outputs.append(gr.Markdown(label=f"{lang_name} Script"))
-    outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))
 iface = gr.Interface(
     fn=generate_podcast,
-    inputs=inputs,
-    outputs=outputs,
-    title="Lecture → Podcast & Script",
     description=(
-        "Enter your Gemini API Key, upload a lecture PDF, choose language(s), "
-        "and get a two-host podcast (audio) plus the Markdown script & downloadable text."
     ),
     allow_flagging="never",
 )
 if __name__ == "__main__":
-    if not os.getenv("HF_TOKEN"):
-        print("Reminder: set HF_TOKEN in Secrets for TTS to work.")
     iface.launch()

 # =============================================================
+# Lecture → Podcast & Script Generator (English Only)
+# • Text: Google Gemini API (via UI-provided key)
+# • Audio: Hugging Face InferenceClient.text_to_speech (public MMS-TTS for English)
 # =============================================================
 import os
 import re
 import tempfile
 import textwrap
 from pathlib import Path
+from typing import List, Optional, Any
 import gradio as gr
 from PyPDF2 import PdfReader
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
+# Hugging Face TTS client (anonymous/public access)
 from huggingface_hub import InferenceClient
+# Google Gemini SDK
 try:
     import google.generativeai as genai
 except ImportError:
     raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
 # ------------------------------------------------------------------
+# Globals & templates
 # ------------------------------------------------------------------
# Model identifier passed to the Hugging Face text-to-speech call.
HF_TTS_MODEL = "facebook/mms-tts-eng"

# Maximum number of characters sent to the TTS endpoint in one request.
CHUNK_CHAR_LIMIT = 280

# Prompt sent to Gemini; `{content}` is replaced with the (truncated) lecture text.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in English.
    Summarize the following lecture content into a dialogue of approximately 300 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
    Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
    ### Lecture Content
    {content}
    """
)

# Shared Hugging Face inference client; no token is passed explicitly here.
tts_client = InferenceClient()
 # ------------------------------------------------------------------
+# Helpers
 # ------------------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page texts. A page whose extraction yields nothing
        contributes an empty string.

    Raises:
        gr.Error: If the file cannot be opened or parsed, so the failure is
            reported with a readable message like the other pipeline errors.
    """
    try:
        reader = PdfReader(pdf_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # UI boundary: surface any reader/parser failure as a Gradio error.
        raise gr.Error(f"Failed to process PDF: {e}") from e
def truncate_text(text: str, max_words: int = 8000) -> str:
    """Limit *text* to at most *max_words* whitespace-separated words.

    Args:
        text: Input text.
        max_words: Maximum number of words to keep. Values below zero are
            treated as zero.

    Returns:
        *text* unchanged (original line breaks and spacing intact) when it is
        already within the limit; otherwise the first *max_words* words joined
        by single spaces.
    """
    words = text.split()
    if len(words) <= max_words:
        # Nothing to cut: keep the original formatting instead of collapsing
        # every newline into a space.
        return text
    # Clamp so a negative limit cannot turn into a "drop from the tail" slice.
    return " ".join(words[:max(max_words, 0)])
def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split *text* into chunks of at most *limit* characters.

    Text is first split after sentence-ending punctuation (``.``, ``!``,
    ``?``) followed by whitespace, and consecutive sentences are packed
    greedily into one chunk while they fit. A sentence that is itself longer
    than *limit* is broken at word boundaries, and a single word longer than
    *limit* is sliced, so no returned chunk exceeds *limit*.

    Args:
        text: Text to split.
        limit: Maximum chunk length in characters; must be at least 1.

    Returns:
        The non-empty chunks in reading order (empty list for blank input).

    Raises:
        ValueError: If *limit* is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    # Step 1: produce pieces that are each guaranteed to fit within `limit`.
    pieces: List[str] = []
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= limit:
            pieces.append(sentence)
            continue
        # Oversized sentence: fall back to words, slicing any word that is
        # still too long on its own.
        for word in sentence.split():
            pieces.extend(word[i:i + limit] for i in range(0, len(word), limit))

    # Step 2: greedily pack pieces, separated by one space, into chunks.
    chunks: List[str] = []
    current = ""
    for piece in pieces:
        if current and len(current) + len(piece) + 1 > limit:
            chunks.append(current)
            current = piece
        else:
            current = f"{current} {piece}" if current else piece
    if current:
        chunks.append(current)
    return chunks
def synthesize_speech(text: str, model_id: str, out_dir: Path) -> Path:
    """Synthesize *text* chunk by chunk and write one combined FLAC file.

    The text is split with ``split_to_chunks``. Each chunk is sent to
    ``tts_client.text_to_speech``; the returned bytes are written to
    ``seg_<i>.flac`` in *out_dir*, decoded with pydub, and the decoded
    segments are concatenated in order.

    Args:
        text: Script to speak.
        model_id: Model identifier forwarded to the TTS call.
        out_dir: Directory that receives the per-chunk files and the result.

    Returns:
        Path of ``podcast_audio.flac`` inside *out_dir*.

    Raises:
        ValueError: If splitting *text* yields no chunks.
        RuntimeError: If a TTS request fails or a segment cannot be decoded;
            the underlying exception is chained as the cause.
    """
    chunks = split_to_chunks(text)
    if not chunks:
        raise ValueError("No text to synthesize.")

    segments = []
    for i, chunk in enumerate(chunks):
        try:
            audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
        except Exception as e:
            raise RuntimeError(f"TTS failed on chunk {i+1}: {e}") from e

        part_path = out_dir / f"seg_{i}.flac"
        part_path.write_bytes(audio_bytes)
        try:
            segments.append(AudioSegment.from_file(part_path, format="flac"))
        except CouldntDecodeError as e:
            raise RuntimeError(f"Could not decode segment {i+1}: {e}") from e

    # Summing from an empty segment concatenates the parts in order.
    combined = sum(segments, AudioSegment.empty())
    out_path = out_dir / "podcast_audio.flac"
    combined.export(out_path, format="flac")
    return out_path
 # ------------------------------------------------------------------
+# Main pipeline
 # ------------------------------------------------------------------
 def generate_podcast(
+    gemini_api_key: Optional[str],
+    lecture_pdf: Optional[gr.File]
 ) -> List[Optional[Any]]:
+    # Validate inputs
+    if not gemini_api_key:
+        raise gr.Error("Enter your Google AI Studio API Key.")
+    if not lecture_pdf:
+        raise gr.Error("Upload a lecture PDF file.")
+    # Configure Gemini
+    genai.configure(api_key=gemini_api_key)
+    # Extract & truncate lecture text
+    raw = extract_pdf_text(lecture_pdf.name)
+    content = truncate_text(raw)
+    if not content.strip():
+        raise gr.Error("Lecture PDF contained no extractable text.")
+    # Initialize Gemini model
     try:
+        gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")
     except Exception as e:
+        raise gr.Error(f"Gemini init failed: {e}")
+    # Generate script
+    prompt = PROMPT_TEMPLATE.format(content=content)
+    try:
+        resp = gemini_model.generate_content(prompt)
+        script = resp.text or ""
+    except Exception as e:
+        raise gr.Error(f"Gemini generation error: {e}")
+    # Prepare temp directory
     with tempfile.TemporaryDirectory() as td:
+        tmp = Path(td)
+        # Save script file
+        script_path = tmp / "podcast_script.txt"
+        script_path.write_text(script, encoding="utf-8")
+        # Synthesize audio
+        try:
+            audio_path = synthesize_speech(script, HF_TTS_MODEL, tmp)
+        except Exception as e:
+            raise gr.Error(f"Speech synthesis error: {e}")
+        # Return [audio, markdown script, txt file]
+        return [str(audio_path), script, str(script_path)]
 # ------------------------------------------------------------------
+# Gradio Interface
 # ------------------------------------------------------------------
 iface = gr.Interface(
     fn=generate_podcast,
+    inputs=[
+        gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Paste your key"),
+        gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
+    ],
+    outputs=[
+        gr.Audio(label="English Podcast", type="filepath"),
+        gr.Markdown(label="English Script"),
+        gr.File(label="Download English Script (.txt)", type="filepath"),
+    ],
+    title="Lecture → English Podcast & Script",
     description=(
+        "Enter your Gemini API Key and upload a lecture PDF. "
+        "Generates a two-host podcast audio and a Markdown script in English "
+        "using Google Gemini for text and Hugging Face MMS-TTS for audio."
     ),
     allow_flagging="never",
 )
 if __name__ == "__main__":
     iface.launch()