HaiderAUT committed
Commit c565171 · verified · 1 Parent(s): 313d24a

Update app.py

Files changed (1)
  1. app.py +189 -77
app.py CHANGED
@@ -1,10 +1,10 @@
  # =============================================================
- # Hugging Face Space – Lecture Podcast Generator (User-selectable Languages)
  # =============================================================
- # • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5-Coder-32B-Instruct)
- # • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk-safe
- #   (MMS-TTS for en/bn/ur/ne, mms-tts-zho for zh). Long texts are split
- #   into ≤280-char chunks to stay within HF endpoint limits.
  # -----------------------------------------------------------------

  import os
@@ -12,30 +12,31 @@ import re
  import tempfile
  import textwrap
  from pathlib import Path
- from typing import List, Dict, Tuple, Optional

  import gradio as gr
- from huggingface_hub import InferenceClient
- from PyPDF2 import PdfReader
- from smolagents import HfApiModel

  # ------------------------------------------------------------------
  # LLM setup – remote Qwen model via SmolAgents
  # ------------------------------------------------------------------
  llm = HfApiModel(
      model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
-     max_tokens=2048,
      temperature=0.5,
  )

  # ------------------------------------------------------------------
- # Hugging Face Inference API client (uses HF_TOKEN secret if provided)
  # ------------------------------------------------------------------
  client = InferenceClient(token=os.getenv("HF_TOKEN", None))

  # ------------------------------------------------------------------
  # Language metadata and corresponding open TTS model IDs
- # (MMS-TTS supports 100+ langs but per-lang repos have shorter ids)
  # ------------------------------------------------------------------
  LANG_INFO: Dict[str, Dict[str, str]] = {
      "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
@@ -44,19 +45,20 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
      "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
      "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
  }
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

  # ------------------------------------------------------------------
- # Prompt template (≈300 words to keep TTS happy)
  # ------------------------------------------------------------------
  PROMPT_TEMPLATE = textwrap.dedent(
      """
-     You are producing a lively two-host educational podcast in {lang_name}.
-     Summarize the following lecture content into a dialogue of **≈300 words**.
      Make it engaging: hosts ask questions, clarify ideas with analogies, and
      wrap up with a concise recap. Preserve technical accuracy.
-
-     ### Lecture Content
      {content}
      """
  )
@@ -64,120 +66,230 @@ PROMPT_TEMPLATE = textwrap.dedent(
  # PDF helpers -------------------------------------------------------

  def extract_pdf_text(pdf_path: str) -> str:
-     reader = PdfReader(pdf_path)
-     return "\n".join(page.extract_text() or "" for page in reader.pages)

- TOKEN_LIMIT = 4000  # approx words before hitting context limit


  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
      words = text.split()
-     return " ".join(words[:limit])

  # ------------------------------------------------------------------
- # TTS helper – chunk long text safely (HF endpoint ~30 s / 200-300 chars)
  # ------------------------------------------------------------------
- CHUNK_CHAR_LIMIT = 280  # safe margin for MMS-TTS

  def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
-     # split on sentence boundaries while respecting limit
-     sentences = re.split(r"(?<=[.!?])\s+", text.strip())
-     chunks, current = [], ""
      for sent in sentences:
-         if len(current) + len(sent) + 1 > limit:
-             if current:
-                 chunks.append(current.strip())
-             current = sent
          else:
-             current += " " + sent if current else sent
-     if current:
-         chunks.append(current.strip())
-     return chunks


- def synthesize_speech(text: str, model_id: str, tmpdir: Path) -> Path:
-     """Stream chunks through HF TTS and concatenate FLAC bytes."""
      chunks = _split_to_chunks(text)
-     flac_paths: List[Path] = []
      for idx, chunk in enumerate(chunks):
          try:
              audio_bytes = client.text_to_speech(chunk, model=model_id)
          except HubHTTPError as e:
-             raise RuntimeError(f"TTS request failed: {e}") from e
-         part_path = tmpdir / f"part_{idx}.flac"
          part_path.write_bytes(audio_bytes)
-         flac_paths.append(part_path)
-
-     # simple concat of FLAC files (works because each part includes header)
-     # better: convert to raw & merge, but HF players handle sequential FLACs
-     final_path = tmpdir / "podcast.flac"
-     with open(final_path, "wb") as fout:
-         for p in flac_paths:
-             fout.write(p.read_bytes())
      return final_path

  # ------------------------------------------------------------------
- # Main pipeline
  # ------------------------------------------------------------------

- def generate_podcast(pdf: gr.File, selected_lang_names: List[str]):
      if not selected_lang_names:
-         raise gr.Error("Please select at least one language.")

      selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
-     results: List[Optional[Tuple[str, None]]] = []

-     with tempfile.TemporaryDirectory() as td:
-         tmpdir = Path(td)
-         lecture_raw = extract_pdf_text(pdf.name)
-         lecture_text = truncate_text(lecture_raw)

-         for code, info in LANG_INFO.items():
-             if code not in selected_codes:
-                 results.append(None)
-                 continue

-             # 1️⃣ Generate dialogue
-             prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
-             dialogue: str = llm(prompt)

-             # 2️⃣ Speech synthesis (chunked)
-             tts_path = synthesize_speech(dialogue, info["tts_model"], tmpdir / code)

-             results.append((str(tts_path), None))

-     return results

  # ------------------------------------------------------------------
- # Gradio Interface
  # ------------------------------------------------------------------
- language_choices = [info["name"] for info in LANG_INFO.values()]

  inputs = [
      gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
      gr.CheckboxGroup(
-         choices=language_choices,
-         value=["English"],
          label="Select podcast language(s) to generate",
      ),
  ]

  outputs = [
-     gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
  ]

  iface = gr.Interface(
      fn=generate_podcast,
      inputs=inputs,
      outputs=outputs,
-     title="Lecture Podcast Generator (Choose Languages)",
      description=(
-         "Upload a lecture PDF, choose language(s), and receive a two-host "
-         "audio podcast. Dialogue comes from Qwen-32B; speech is streamed "
-         "via the HF Inference API using open MMS-TTS models. Long texts are "
-         "automatically chunked to fit API limits."
      ),
  )

  if __name__ == "__main__":
-     iface.launch()

@@ -1,10 +1,10 @@
  # =============================================================
+ # Hugging Face Space – Lecture Podcast Generator (User-selectable Languages)
  # =============================================================
+ # • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5-Coder-32B-Instruct)
+ # • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk-safe
+ #   (MMS-TTS for en/bn/ur/ne, mms-tts-zho for zh). Long texts are split
+ #   into ≤280-char chunks to stay within HF endpoint limits.
  # -----------------------------------------------------------------

  import os
@@ -12,30 +12,31 @@ import re
  import tempfile
  import textwrap
  from pathlib import Path
+ from typing import List, Dict, Optional

  import gradio as gr
+ from huggingface_hub import InferenceClient
+ from huggingface_hub.utils import HfHubHTTPError  # the HTTP error class lives in huggingface_hub.utils
+ from PyPDF2 import PdfReader        # For PDF processing
+ from smolagents import HfApiModel   # For LLM interaction
+ from pydub import AudioSegment      # Added for robust audio concatenation
+ from pydub.exceptions import CouldntDecodeError  # Specific pydub error

  # ------------------------------------------------------------------
  # LLM setup – remote Qwen model via SmolAgents
  # ------------------------------------------------------------------
  llm = HfApiModel(
      model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
+     max_tokens=2048,  # Max tokens for the generated output dialogue
      temperature=0.5,
  )

  # ------------------------------------------------------------------
+ # Hugging Face Inference API client (uses HF_TOKEN secret if provided)
  # ------------------------------------------------------------------
  client = InferenceClient(token=os.getenv("HF_TOKEN", None))

  # ------------------------------------------------------------------
  # Language metadata and corresponding open TTS model IDs
  # ------------------------------------------------------------------
  LANG_INFO: Dict[str, Dict[str, str]] = {
      "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
@@ -44,19 +45,20 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
      "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
      "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
  }
+ # For reverse lookup: language name to language code
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

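Adding a language is one more LANG_INFO entry pointing at the matching MMS-TTS checkpoint, e.g. (hypothetical entry, not in this commit; verify the repo id exists on the Hub):

    "hi": {"name": "Hindi", "tts_model": "facebook/mms-tts-hin"},
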
  # ------------------------------------------------------------------
+ # Prompt template (target ~300 words for LLM output)
  # ------------------------------------------------------------------
  PROMPT_TEMPLATE = textwrap.dedent(
      """
+     You are producing a lively two-host educational podcast in {lang_name}.
+     Summarize the following lecture content into a dialogue of **approximately 300 words**.
      Make it engaging: hosts ask questions, clarify ideas with analogies, and
      wrap up with a concise recap. Preserve technical accuracy.
+
+     ### Lecture Content
      {content}
      """
  )
@@ -64,120 +66,230 @@ PROMPT_TEMPLATE = textwrap.dedent(
  # PDF helpers -------------------------------------------------------

  def extract_pdf_text(pdf_path: str) -> str:
+     try:
+         reader = PdfReader(pdf_path)
+         return "\n".join(page.extract_text() or "" for page in reader.pages)
+     except Exception as e:
+         # Raise a Gradio error to display it in the UI
+         raise gr.Error(f"Failed to process PDF: {e}")

+ # Increased from 4000. Qwen models have large context windows; this limit is in
+ # input *words*, while the actual model limit is in tokens
+ # (Qwen2.5-Coder-32B-Instruct: 65,536 tokens). 8000 words is still conservative,
+ # and the prompt itself also consumes tokens.
+ TOKEN_LIMIT = 8000


  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
      words = text.split()
+     if len(words) > limit:
+         gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit the LLM context window.")
+         return " ".join(words[:limit])
+     return text

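TOKEN_LIMIT counts words, not tokens. A token-accurate variant is a small extension; a minimal sketch, assuming the `transformers` package and a locally downloadable Qwen tokenizer, neither of which this commit uses:

    from transformers import AutoTokenizer

    def truncate_by_tokens(text: str, max_tokens: int = 60_000) -> str:
        # Tokenize with the same vocabulary the remote model uses, then cut.
        tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
        ids = tok.encode(text)
        return tok.decode(ids[:max_tokens]) if len(ids) > max_tokens else text
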
  # ------------------------------------------------------------------
+ # TTS helper – chunk long text safely (HF endpoint limit ~30 s / 200-300 chars)
  # ------------------------------------------------------------------
+ CHUNK_CHAR_LIMIT = 280  # Safe margin for the MMS-TTS character limit per request

  def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
+     # Split on sentence boundaries (.!?) while respecting the character limit per chunk.
+     sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
+     sentences = [s.strip() for s in sentences_raw if s.strip()]  # Clean and filter empty sentences
+
+     if not sentences:
+         return []
+
+     chunks, current_chunk = [], ""
      for sent in sentences:
+         # If current_chunk is empty, the first sentence always starts a new chunk.
+         # Otherwise, check if adding the new sentence (plus a space) exceeds the limit.
+         if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
+             chunks.append(current_chunk)  # Finalize the current chunk
+             current_chunk = sent          # Start a new chunk with the current sentence
          else:
+             # Append sentence to current_chunk (with a space if current_chunk is not empty)
+             current_chunk += (" " + sent) if current_chunk else sent
+
+     if current_chunk:  # Add any remaining part as the last chunk
+         chunks.append(current_chunk)
+
+     return [chunk for chunk in chunks if chunk.strip()]  # Ensure no empty chunks are returned

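The splitter can be sanity-checked with an artificially small limit; for example, this should print something like the comment below (exact grouping depends on sentence lengths):

    demo = "First sentence. Second one is longer! Third? Fourth."
    print(_split_to_chunks(demo, limit=25))
    # ['First sentence.', 'Second one is longer!', 'Third? Fourth.']
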

+ def synthesize_speech(text: str, model_id: str, lang_tmpdir: Path) -> Path:
+     """Split text into chunks, synthesize speech for each, and concatenate them with pydub."""
      chunks = _split_to_chunks(text)
+     if not chunks:
+         raise ValueError("Text resulted in no speakable chunks after splitting.")
+
+     audio_segments: List[AudioSegment] = []
      for idx, chunk in enumerate(chunks):
+         gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)}...")
          try:
              audio_bytes = client.text_to_speech(chunk, model=model_id)
+         except HfHubHTTPError as e:
+             error_message = f"TTS request failed for chunk {idx + 1}/{len(chunks)} ('{chunk[:30]}...'): {e}"
+             if "Input validation error: `inputs` must be non-empty" in str(e) and not chunk.strip():
+                 gr.Warning(f"Skipping an apparently empty chunk that was not filtered out: chunk {idx + 1}")
+                 continue
+             raise RuntimeError(error_message) from e
+
+         part_path = lang_tmpdir / f"part_{idx}.flac"  # Assuming the TTS endpoint returns FLAC
          part_path.write_bytes(audio_bytes)
+
+         try:
+             # Load the audio part with pydub. The filename assumes FLAC; if the
+             # endpoint returns WAV instead, use format="wav" here.
+             segment = AudioSegment.from_file(part_path, format="flac")
+             audio_segments.append(segment)
+         except CouldntDecodeError as e:
+             # Happens if the audio data is corrupted, empty, or not actually FLAC.
+             raise RuntimeError(
+                 f"Failed to decode audio chunk {idx + 1} from {part_path}. "
+                 f"Audio data might be corrupted, empty, or not in FLAC format. TTS error: {e}"
+             ) from e
+
+     if not audio_segments:
+         raise RuntimeError("No audio segments were successfully synthesized or decoded.")
+
+     # Concatenate all audio segments in order
+     combined_audio = sum(audio_segments, AudioSegment.empty())
+
+     final_path = lang_tmpdir / "podcast.flac"
+     combined_audio.export(final_path, format="flac")
+
      return final_path

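If the concatenated chunks sound abrupt at the seams, pydub can pad a short silence between segments; a sketch (the 150 ms gap is an arbitrary choice, not something this commit does):

    from typing import List
    from pydub import AudioSegment

    def join_with_gaps(parts: List[AudioSegment], gap_ms: int = 150) -> AudioSegment:
        gap = AudioSegment.silent(duration=gap_ms)  # gap_ms of silence between chunks
        combined = AudioSegment.empty()
        for part in parts:
            combined += part + gap
        return combined
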
  # ------------------------------------------------------------------
+ # Main pipeline function for Gradio
  # ------------------------------------------------------------------

+ def generate_podcast(pdf_file_obj: Optional[gr.File], selected_lang_names: List[str]):
+     if not pdf_file_obj:
+         raise gr.Error("Please upload a PDF file.")
      if not selected_lang_names:
+         raise gr.Error("Please select at least one language for the podcast.")

+     # Map selected language names back to their codes
      selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
+
+     # Results map: keys are language codes, values will be audio file paths or None.
+     # This keeps results aligned with the output components while only the
+     # selected languages are actually processed.
+     results_map: Dict[str, Optional[str]] = {code: None for code in LANG_INFO.keys()}

+     try:
+         # NOTE: mkdtemp() rather than TemporaryDirectory(); the audio files must
+         # outlive this function so Gradio can read them when building the response.
+         tmpdir_base = Path(tempfile.mkdtemp())

+         gr.Info("Extracting text from PDF...")
+         lecture_raw = extract_pdf_text(pdf_file_obj.name)  # .name is the path to the uploaded temp file
+         lecture_text = truncate_text(lecture_raw)

+         if not lecture_text.strip():
+             raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")

+         for code in selected_codes:  # Iterate only over user-selected languages
+             info = LANG_INFO[code]
+             lang_name = info["name"]
+             tts_model = info["tts_model"]
+
+             gr.Info(f"Processing for {lang_name}...")
+
+             # Language-specific subdirectory inside the base temporary directory
+             lang_tmpdir = tmpdir_base / code
+             lang_tmpdir.mkdir(parents=True, exist_ok=True)
+
+             # 1️⃣ Generate dialogue using the LLM
+             gr.Info(f"Generating dialogue for {lang_name}...")
+             prompt = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
+             try:
+                 dialogue: str = llm(prompt)
+                 if not dialogue or not dialogue.strip():
+                     gr.Warning(f"LLM returned empty dialogue for {lang_name}. Skipping TTS for this language.")
+                     results_map[code] = None
+                     continue  # Move to the next selected language
+             except Exception as e:
+                 # gr.Warning (not a raised gr.Error) so the remaining languages still run
+                 gr.Warning(f"Error generating dialogue for {lang_name}: {e}")
+                 results_map[code] = None
+                 continue

+             # 2️⃣ Synthesize speech from the dialogue (chunked and concatenated)
+             gr.Info(f"Synthesizing speech for {lang_name}...")
+             try:
+                 tts_path = synthesize_speech(dialogue, tts_model, lang_tmpdir)
+                 results_map[code] = str(tts_path)  # Store the audio file path for this language
+             except ValueError as e:    # From _split_to_chunks/synthesize_speech if no chunks
+                 gr.Warning(f"Could not synthesize speech for {lang_name}: {e}")
+                 results_map[code] = None
+             except RuntimeError as e:  # From synthesize_speech (TTS/pydub errors)
+                 gr.Warning(f"Error synthesizing speech for {lang_name}: {e}")
+                 results_map[code] = None
+             except Exception as e:     # Any other unexpected error during synthesis
+                 gr.Warning(f"Unexpected error during speech synthesis for {lang_name}: {e}")
+                 results_map[code] = None
+
+         # Convert the results map to a list ordered like LANG_INFO, so it matches
+         # the order of the Gradio output components.
+         final_results = [results_map[lang_code] for lang_code in LANG_INFO.keys()]
+         gr.Info("Podcast generation complete!")
+         return final_results

+     except gr.Error:  # Re-raise Gradio-specific errors so they display in the UI
+         raise
+     except Exception as e:  # Catch other unexpected errors during the process
+         # Log the full traceback for debugging (e.g., to the server logs)
+         import traceback
+         print("An unexpected error occurred in generate_podcast:")
+         traceback.print_exc()
+         # Show a generic error message in the UI
+         raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")

  # ------------------------------------------------------------------
+ # Gradio Interface Setup
  # ------------------------------------------------------------------
+ # Keep choices and outputs in a consistent order derived from LANG_INFO
+ language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]

  inputs = [
      gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
      gr.CheckboxGroup(
+         choices=language_names_ordered,
+         value=["English"],  # Default language selection
          label="Select podcast language(s) to generate",
      ),
  ]

+ # One gr.Audio output component per language, in the defined order
  outputs = [
+     gr.Audio(label=f"{LANG_INFO[code]['name']} Podcast", type="filepath")
+     for code in LANG_INFO.keys()
  ]

  iface = gr.Interface(
      fn=generate_podcast,
      inputs=inputs,
      outputs=outputs,
+     title="Lecture Podcast Generator (Multi-Language)",
      description=(
+         "Upload a lecture PDF, choose language(s), and receive a two-host "
+         "audio podcast for each selected language. Dialogue is generated by Qwen-32B, "
+         "and speech is synthesized using open MMS-TTS models via the HF Inference API. "
+         "Long texts are automatically chunked, and audio parts are robustly combined."
      ),
+     allow_flagging="never",  # Set to "auto" or "manual" to enable flagging
+     # Provide examples if sample PDFs are accessible to the Gradio app:
+     # examples=[
+     #     ["path/to/sample_lecture.pdf", ["English", "Chinese"]],
+     # ],
  )

  if __name__ == "__main__":
+     # For local testing, ensure ffmpeg is installed and on PATH: pydub relies on it
+     # for FLAC decoding and encoding. The HF Inference API for MMS-TTS should return
+     # FLAC directly if the specified model (e.g., facebook/mms-tts-eng) outputs it.
+     iface.launch()
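
Since pydub shells out to ffmpeg for FLAC decoding and encoding, the Space needs it installed at the system level. A typical setup (assumed here, not part of this commit):

    packages.txt:       ffmpeg
    requirements.txt:   gradio
                        huggingface_hub
                        PyPDF2
                        smolagents
                        pydub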