Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

HaiderAUT committed on May 8

Commit

764a881

verified ·

1 Parent(s): 1425202

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -264

app.py CHANGED Viewed

@@ -1,10 +1,7 @@
 # =============================================================
-# Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 # =============================================================
-# • **Text generation** – Google Gemini API (via user-provided genai API Key)
-# • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
-# -----------------------------------------------------------------
 import os
 import re
 import tempfile
@@ -17,29 +14,19 @@ from PyPDF2 import PdfReader
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
-# For Hugging Face TTS
 from huggingface_hub import InferenceClient
-# For Google Gemini
-try:
-    import google.generativeai as genai
-except ImportError:
-    raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
 # ------------------------------------------------------------------
-# Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
 # ------------------------------------------------------------------
-hf_tts_client: Optional[InferenceClient] = None
 hf_token = os.getenv("HF_TOKEN")
-if hf_token:
-    hf_tts_client = InferenceClient(token=hf_token)
-else:
-    # This print will show in the Space logs if HF_TOKEN is missing
-    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
-# ------------------------------------------------------------------
-# Language metadata for Hugging Face MMS-TTS models
-# ------------------------------------------------------------------
 LANG_INFO: Dict[str, Dict[str, str]] = {
     "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
     "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
@@ -49,276 +36,122 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
-# ------------------------------------------------------------------
-# Prompt template for Gemini
-# ------------------------------------------------------------------
 PROMPT_TEMPLATE = textwrap.dedent(
     """
     You are producing a lively two-host educational podcast in {lang_name}.
     Summarize the following lecture content into a dialogue of **approximately 300 words**.
     Make it engaging: hosts ask questions, clarify ideas with analogies, and
-    wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
     ### Lecture Content
     {content}
     """
 )
-# PDF helpers (unchanged) -------------------------------------------
-def extract_pdf_text(pdf_path: str) -> str:
-    try:
-        reader = PdfReader(pdf_path)
-        return "\n".join(page.extract_text() or "" for page in reader.pages)
-    except Exception as e:
-        raise gr.Error(f"Failed to process PDF: {e}")
 TOKEN_LIMIT = 8000
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
-    if len(words) > limit:
-        gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit LLM context window.")
-        return " ".join(words[:limit])
-    return text
-# ------------------------------------------------------------------
-# TTS helper using Hugging Face Inference API
-# ------------------------------------------------------------------
-CHUNK_CHAR_LIMIT_HF = 280
-def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
-    sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
-    sentences = [s.strip() for s in sentences_raw if s.strip()]
-    if not sentences: return []
-    chunks, current_chunk = [], ""
-    for sent in sentences:
-        if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
-            chunks.append(current_chunk)
-            current_chunk = sent
         else:
-            current_chunk += (" " + sent) if current_chunk else sent
-    if current_chunk: chunks.append(current_chunk)
-    return [chunk for chunk in chunks if chunk.strip()]
-def synthesize_speech_hf(
-    text: str,
-    hf_model_id: str,
-    lang_tmpdir: Path,
-    tts_client: InferenceClient
-) -> Path:
-    chunks = _split_to_chunks_hf(text)
-    if not chunks:
-        raise ValueError("Text resulted in no speakable chunks after splitting.")
-    audio_segments: List[AudioSegment] = []
-    for idx, chunk in enumerate(chunks):
-        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
-        try:
-            audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
-        except HubHTTPError as e:
-            error_message = f"HF TTS request failed for chunk {idx+1} ('{chunk[:30]}...'): {e}"
-            if "Input validation error: `inputs` must be non-empty" in str(e) and not chunk.strip():
-                gr.Warning(f"Skipping an apparently empty chunk for HF TTS: Chunk {idx+1}")
-                continue
-            raise RuntimeError(error_message) from e
-        except Exception as e:
-             raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e
-        part_path = lang_tmpdir / f"part_{idx}.flac"
-        part_path.write_bytes(audio_bytes)
-        try:
-            segment = AudioSegment.from_file(part_path, format="flac")
-            audio_segments.append(segment)
-        except CouldntDecodeError as e:
-            raise RuntimeError(f"Failed to decode FLAC audio chunk {idx+1} from {part_path}. Error: {e}") from e
-    if not audio_segments:
-        raise RuntimeError("No audio segments were successfully synthesized or decoded.")
-    combined_audio = sum(audio_segments, AudioSegment.empty())
-    final_path = lang_tmpdir / "podcast_audio.flac"
-    combined_audio.export(final_path, format="flac")
-    return final_path
-# ------------------------------------------------------------------
-# Main pipeline function for Gradio
-# ------------------------------------------------------------------
 def generate_podcast(
-    gemini_api_key_from_ui: Optional[str], # Explicitly named to show source
-    pdf_file_obj: Optional[gr.File],
-    selected_lang_names: List[str]
 ) -> List[Optional[Any]]:
-    if not gemini_api_key_from_ui: # Check the key provided from the UI input
-        raise gr.Error("Please enter your Google AI Studio API Key for Gemini in the input field.")
-    if not pdf_file_obj:
-        raise gr.Error("Please upload a PDF file.")
-    if not selected_lang_names:
-        raise gr.Error("Please select at least one language for the podcast.")
-    # Configure Gemini API using the key directly from the UI input
-    try:
-        genai.configure(api_key=gemini_api_key_from_ui)
-        gr.Info("Gemini API configured successfully with the provided key.")
-    except Exception as e:
-        raise gr.Error(f"Failed to configure Gemini API with the provided key. Please check your API key. Error: {e}")
-    # Check if HF TTS client is available (HF_TOKEN was provided as a secret)
-    if not hf_tts_client:
-        gr.Warning( # Changed to gr.Warning to allow script generation if TTS fails to init
-            "Hugging Face TTS client is not available (HF_TOKEN secret might be missing or invalid). "
-            "Speech synthesis will be skipped, but script generation will be attempted."
-        )
-        # Note: Script generation can still proceed, TTS will be skipped later if client is None.
-    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
-    results_data: Dict[str, Dict[str, Optional[str]]] = {
-        code: {"audio": None, "script_text": None, "script_file": None}
-        for code in LANG_INFO.keys()
-    }
-    try:
-        with tempfile.TemporaryDirectory() as td:
-            tmpdir_base = Path(td)
-            gr.Info("Extracting text from PDF...")
-            lecture_raw = extract_pdf_text(pdf_file_obj.name)
-            lecture_text = truncate_text(lecture_raw)
-            if not lecture_text.strip():
-                raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")
-            # Initialize Gemini model (e.g., 'gemini-1.5-flash-latest' or 'gemini-pro')
-            # This happens after genai.configure has been called.
-            try:
-                gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest') # Or 'gemini-pro'
-            except Exception as e:
-                raise gr.Error(f"Failed to initialize Gemini model. This might be due to an invalid API key or API access issues. Error: {e}")
-            for code in selected_codes:
-                info = LANG_INFO[code]
-                lang_name = info["name"]
-                hf_tts_model_id = info["tts_model"]
-                gr.Info(f"Processing for {lang_name}...")
-                lang_tmpdir = tmpdir_base / code
-                lang_tmpdir.mkdir(parents=True, exist_ok=True)
-                dialogue: Optional[str] = None
-                gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
-                prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
-                try:
-                    # The gemini_model is initialized using the API key from genai.configure()
-                    response = gemini_model.generate_content(prompt_for_gemini)
-                    dialogue_raw = response.text
-                    if not dialogue_raw or not dialogue_raw.strip():
-                        gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
-                        continue
-                    dialogue = dialogue_raw
-                    results_data[code]["script_text"] = dialogue
-                    script_file_path = lang_tmpdir / f"podcast_script_{code}.txt"
-                    script_file_path.write_text(dialogue, encoding="utf-8")
-                    results_data[code]["script_file"] = str(script_file_path)
-                except Exception as e:
-                    # Check if the error indicates an API key issue from Gemini
-                    if "API_KEY_INVALID" in str(e) or "permission" in str(e).lower():
-                         raise gr.Error(f"Gemini API Key error for {lang_name}: {e}. Please verify your API key and its permissions.")
-                    gr.Error(f"Error generating dialogue with Gemini for {lang_name}: {e}")
-                    continue
-                if dialogue:
-                    if hf_tts_client: # Only attempt TTS if client is available
-                        gr.Info(f"Synthesizing speech for {lang_name} with Hugging Face TTS ({hf_tts_model_id})...")
-                        try:
-                           tts_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
-                           results_data[code]["audio"] = str(tts_path)
-                        except ValueError as e: # From _split_to_chunks or synthesize_speech if no chunks
-                            gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
-                        except RuntimeError as e: # From synthesize_speech (TTS/pydub errors)
-                            gr.Error(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
-                        except Exception as e: # Catch any other unexpected errors during synthesis
-                            gr.Error(f"Unexpected error during speech synthesis for {lang_name}: {e}")
-                    else:
-                        gr.Info(f"HF TTS client not available. Skipping speech synthesis for {lang_name}.")
-        final_ordered_results: List[Optional[Any]] = []
-        for code_key in LANG_INFO.keys():
-            lang_output_data = results_data[code_key]
-            final_ordered_results.append(lang_output_data["audio"])
-            final_ordered_results.append(lang_output_data["script_text"])
-            final_ordered_results.append(lang_output_data["script_file"])
-        gr.Info("Podcast generation complete!")
-        return final_ordered_results
-    except gr.Error as e: # Re-raise Gradio-specific errors to be displayed in UI
-        raise e
-    except Exception as e: # Catch other unexpected errors during the process
-        import traceback
-        print("An unexpected error occurred in generate_podcast:")
-        traceback.print_exc()
-        raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")
-# ------------------------------------------------------------------
-# Gradio Interface Setup
-# ------------------------------------------------------------------
-language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]
 inputs = [
-    gr.Textbox(
-        label="Enter your Google AI Studio API Key (for Gemini text generation)",
-        type="password",
-        placeholder="Paste your Gemini API key here",
-        # value=os.getenv("GEMINI_API_KEY_FOR_DEV") # Optional: for local dev default, remove for deployment
-    ),
-    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
-    gr.CheckboxGroup(
-        choices=language_names_ordered,
-        value=["English"], # Default language selection
-        label="Select podcast language(s) to generate",
-    ),
 ]
 outputs = []
-for code in LANG_INFO.keys():
-    info = LANG_INFO[code]
-    lang_name = info["name"]
-    outputs.append(gr.Audio(label=f"{lang_name} Podcast (.flac)", type="filepath"))
-    outputs.append(gr.Markdown(label=f"{lang_name} Script"))
-    outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))
 iface = gr.Interface(
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
-    title="Lecture → Podcast & Script (Gemini Text + HF Speech)",
-    description=(
-        "**SETUP:**\n"
-        "1. **Gemini API Key**: Enter your Google AI Studio API Key in the field below for text generation.\n"
-        "2. **Hugging Face Token (for Speech)**: For Text-to-Speech, ensure you have a Hugging Face Token. "
-        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `HF_TOKEN`. "
-        "Paste your Hugging Face token as its value.\n\n"
-        "Upload a lecture PDF, choose language(s), and receive an audio podcast "
-        "and its script. Dialogue by Google Gemini, speech by Hugging Face MMS-TTS."
-    ),
-    allow_flagging="never",
 )
 if __name__ == "__main__":
-    # For local testing of HF_TOKEN, you can set it as an environment variable:
-    # os.environ["HF_TOKEN"] = "your_hf_token_here"
-    if not os.getenv("HF_TOKEN"):
-        print("Reminder: For local testing with TTS, set the HF_TOKEN environment variable.")
-    # The Gemini API key will be taken from the UI input.
-    # You could add a default value for local testing to the gr.Textbox `value` argument if desired.
-    # e.g. value=os.getenv("GEMINI_API_KEY_FOR_DEV")
-    iface.launch()

 # =============================================================
+# Lecture → Podcast & Script Generator (Gemini + HF TTS)
+# Modified: Script outputs rendered as HTML
 # =============================================================
 import os
 import re
 import tempfile
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
+# Hugging Face TTS
 from huggingface_hub import InferenceClient
+# Google Gemini
+import google.generativeai as genai
 # ------------------------------------------------------------------
+# HF TTS client
 # ------------------------------------------------------------------
 hf_token = os.getenv("HF_TOKEN")
+hf_tts_client: Optional[InferenceClient] = InferenceClient(token=hf_token) if hf_token else None
+# Language metadata
 LANG_INFO: Dict[str, Dict[str, str]] = {
     "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
     "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
+# Prompt template
 PROMPT_TEMPLATE = textwrap.dedent(
     """
     You are producing a lively two-host educational podcast in {lang_name}.
     Summarize the following lecture content into a dialogue of **approximately 300 words**.
     Make it engaging: hosts ask questions, clarify ideas with analogies, and
+    wrap up with a concise recap. Preserve technical accuracy.
     ### Lecture Content
     {content}
     """
 )
+# PDF extraction
 TOKEN_LIMIT = 8000
+def extract_pdf_text(path: str) -> str:
+    """Return the text of every page of the PDF at *path*, joined by newlines.
+
+    A page whose ``extract_text()`` result is falsy contributes an empty
+    string, so the number of joined pieces always equals the page count.
+    """
+    page_texts = []
+    for page in PdfReader(path).pages:
+        page_texts.append(page.extract_text() or "")
+    return "\n".join(page_texts)
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
+    """Cap *text* at *limit* whitespace-separated words.
+
+    Text within the limit is returned unchanged. Longer text is rebuilt from
+    its first *limit* words joined by single spaces, so the original
+    whitespace is not preserved in that case.
+    """
+    tokens = text.split()
+    if len(tokens) <= limit:
+        return text
+    return " ".join(tokens[:limit])
+# TTS chunking
+# Target maximum length, in characters, of one chunk handed to the TTS call.
+CHUNK_CHAR_LIMIT = 280
+def split_chunks(text: str) -> List[str]:
+    """Pack the sentences of *text* into chunks for text-to-speech requests.
+
+    Sentences are delimited by whitespace that follows ``.``, ``!`` or ``?``.
+    Consecutive sentences are joined with a single space while the joined
+    length stays within ``CHUNK_CHAR_LIMIT``; a sentence that would push the
+    current chunk past the limit starts a new chunk instead. A single
+    sentence longer than the limit is kept whole, so a chunk can exceed it.
+    Empty or whitespace-only *text* yields an empty list.
+    """
+    pieces: List[str] = []
+    pending = ""
+    for sentence in re.split(r"(?<=[.!?])\s+", text.strip()):
+        if not pending:
+            pending = sentence
+        elif len(pending) + len(sentence) + 1 > CHUNK_CHAR_LIMIT:
+            pieces.append(pending)
+            pending = sentence
         else:
+            pending = f"{pending} {sentence}"
+    if pending:
+        pieces.append(pending)
+    return pieces
+# Synthesize speech
+def synthesize(text: str, model_id: str, outdir: Path) -> str:
+    """Synthesize *text* to one audio file and return that file's path.
+
+    The text is split with ``split_chunks``; each chunk is sent to the
+    module-level ``hf_tts_client`` and the returned bytes are written to
+    ``outdir / "part{i}.flac"``, decoded, and concatenated into
+    ``outdir / "podcast.flac"``.
+
+    Args:
+        text: Script to speak.
+        model_id: Model identifier passed to ``text_to_speech``.
+        outdir: Directory for the per-chunk and combined files; created
+            here if it does not exist.
+
+    Returns:
+        The path of the combined audio file, as a string.
+
+    Raises:
+        RuntimeError: If ``hf_tts_client`` is ``None`` (no ``HF_TOKEN``).
+    """
+    if hf_tts_client is None:
+        raise RuntimeError("Hugging Face TTS client is not available (HF_TOKEN is not set).")
+    # The caller passes a fresh per-language subdirectory; without this the
+    # first write_bytes() below fails with FileNotFoundError.
+    outdir.mkdir(parents=True, exist_ok=True)
+    segments = []
+    for i, chunk in enumerate(split_chunks(text)):
+        audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
+        path = outdir / f"part{i}.flac"
+        path.write_bytes(audio_bytes)
+        # NOTE(review): assumes the endpoint returns FLAC-encoded bytes - confirm.
+        segments.append(AudioSegment.from_file(path, format="flac"))
+    final = sum(segments, AudioSegment.empty())
+    out = outdir / "podcast.flac"
+    final.export(out, format="flac")
+    return str(out)
+# Main pipeline
 def generate_podcast(
+    gemini_key: str,
+    pdf_file: gr.File,
+    langs: List[str]
 ) -> List[Optional[Any]]:
+    """Generate a podcast script, plus audio when TTS is available, per language.
+
+    Args:
+        gemini_key: Google AI Studio API key passed to ``genai.configure``.
+        pdf_file: Uploaded lecture PDF; its ``name`` attribute is read as
+            the path of the file to extract.
+        langs: Display names of the selected languages (the ``"name"``
+            values of ``LANG_INFO``).
+
+    Returns:
+        A flat list holding three entries per ``LANG_INFO`` language, in
+        ``LANG_INFO`` order: audio file path (or ``None`` when no TTS client
+        exists), the script as an HTML string, and the script text file
+        path. Unselected languages contribute ``None, None, None``.
+
+    Raises:
+        gr.Error: If the key, the PDF or the language selection is missing,
+            or if no text could be extracted from the PDF.
+    """
+    import html  # local import: only needed to escape the script for gr.HTML
+
+    if not gemini_key:
+        raise gr.Error("Enter Google AI Studio API Key.")
+    if not pdf_file:
+        raise gr.Error("Upload a PDF file.")
+    if not langs:
+        raise gr.Error("Select at least one language.")
+    genai.configure(api_key=gemini_key)
+    content = truncate_text(extract_pdf_text(pdf_file.name))
+    if not content.strip():
+        # Nothing to summarize (e.g. a scanned PDF without a text layer).
+        raise gr.Error("Could not extract any text from the PDF.")
+    # Loop-invariant: one model handle serves every selected language.
+    model = genai.GenerativeModel('gemini-1.5-flash-latest')
+    # mkdtemp rather than TemporaryDirectory: the paths returned below must
+    # still exist after this function returns.
+    tmp = Path(tempfile.mkdtemp())
+    results = []
+    for code, info in LANG_INFO.items():
+        if info["name"] not in langs:
+            # Keep the three output slots of an unselected language empty.
+            results.extend([None, None, None])
+            continue
+        # Generate script
+        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
+        resp = model.generate_content(prompt)
+        script = resp.text.strip()
+        # Save plain text
+        script_path = tmp / f"script_{code}.txt"
+        script_path.write_text(script, encoding="utf-8")
+        # Render HTML version; escape so that "<", ">" and "&" in the model
+        # output are shown literally instead of being parsed as markup.
+        html_script = f"<pre>{html.escape(script)}</pre>"
+        # Synthesize audio if available
+        audio_path = None
+        if hf_tts_client:
+            lang_dir = tmp / code
+            # The per-language directory must exist before audio is written.
+            lang_dir.mkdir(parents=True, exist_ok=True)
+            audio_path = synthesize(script, info["tts_model"], lang_dir)
+        results.extend([audio_path, html_script, str(script_path)])
+    return results
+# Interface
 inputs = [
+    gr.Textbox(label="Google AI Studio API Key", type="password"),
+    gr.File(label="Lecture PDF", file_types=[".pdf"]),
+    gr.CheckboxGroup(
+        choices=[info["name"] for info in LANG_INFO.values()],
+        value=["English"],
+        label="Languages",
+    ),
 ]
+# Three output components per language, in LANG_INFO order, matching the
+# flat list that generate_podcast returns.
 outputs = []
+for code, info in LANG_INFO.items():
+    outputs.extend(
+        [
+            gr.Audio(label=f"{info['name']} Podcast", type="filepath"),
+            gr.HTML(label=f"{info['name']} Script HTML"),
+            gr.File(label=f"Download {info['name']} Script"),
+        ]
+    )
 iface = gr.Interface(
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
+    title="Lecture → Podcast & Script",
 )
+# Start the app only when this file is executed as a script.
 if __name__ == "__main__":
+    iface.launch()