Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

HaiderAUT committed on May 8

Commit

369b2d2

verified ·

1 Parent(s): b3c9dda

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -136

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 # =============================================================
-# Hugging Face Space – Lecture → Podcast Generator (Google Gemini & TTS)
 # =============================================================
-# • **Text generation** – Google Gemini API
-# • **Speech synthesis** – Google Cloud Text-to-Speech API
 # -----------------------------------------------------------------
 import os
@@ -17,32 +17,40 @@ from PyPDF2 import PdfReader
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
-# Import Google Cloud libraries
 try:
     import google.generativeai as genai
-    from google.cloud import texttospeech
 except ImportError:
-    raise ImportError(
-        "Please install required Google libraries: "
-        "pip install google-generativeai google-cloud-texttospeech"
-    )
 # ------------------------------------------------------------------
-# Language metadata for Google TTS (BCP-47 codes)
-# You might want to specify particular voices too (e.g., "en-US-Wavenet-D")
-# For simplicity, we'll let Google pick a standard voice for the language code.
 # ------------------------------------------------------------------
 LANG_INFO: Dict[str, Dict[str, str]] = {
-    "en": {"name": "English", "tts_lang_code": "en-US"},
-    "bn": {"name": "Bangla",  "tts_lang_code": "bn-IN"},
-    "zh": {"name": "Chinese (Mandarin)", "tts_lang_code": "cmn-CN"}, # cmn for Mandarin
-    "ur": {"name": "Urdu",    "tts_lang_code": "ur-PK"},
-    "ne": {"name": "Nepali",  "tts_lang_code": "ne-NP"},
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
 # ------------------------------------------------------------------
-# Prompt template (adjust if needed for Gemini's style)
 # ------------------------------------------------------------------
 PROMPT_TEMPLATE = textwrap.dedent(
     """
@@ -64,7 +72,7 @@ def extract_pdf_text(pdf_path: str) -> str:
     except Exception as e:
         raise gr.Error(f"Failed to process PDF: {e}")
-TOKEN_LIMIT = 8000 # Word limit for input text
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
@@ -74,12 +82,11 @@ def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     return text
 # ------------------------------------------------------------------
-# TTS helper – chunk long text (Google TTS has a limit of 5000 bytes per request)
 # ------------------------------------------------------------------
-CHUNK_CHAR_LIMIT = 1500  # Google TTS limit is 5000 bytes. Characters are safer.
-                        # Average 3 bytes/char for UTF-8, so 1500 chars is ~4500 bytes.
-def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
     sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
     sentences = [s.strip() for s in sentences_raw if s.strip()]
     if not sentences: return []
@@ -94,54 +101,45 @@ def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
     return [chunk for chunk in chunks if chunk.strip()]
-def synthesize_speech_google(
-    text: str,
-    google_lang_code: str,
     lang_tmpdir: Path,
-    tts_client: texttospeech.TextToSpeechClient
 ) -> Path:
-    """Splits text, synthesizes with Google TTS, concatenates MP3s."""
-    chunks = _split_to_chunks(text)
     if not chunks:
         raise ValueError("Text resulted in no speakable chunks after splitting.")
     audio_segments: List[AudioSegment] = []
     for idx, chunk in enumerate(chunks):
-        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with Google TTS...")
-        synthesis_input = texttospeech.SynthesisInput(text=chunk)
-        voice = texttospeech.VoiceSelectionParams(
-            language_code=google_lang_code,
-            # You can specify a voice name, e.g., "en-US-Wavenet-D"
-            # ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL # Optional
-        )
-        audio_config = texttospeech.AudioConfig(
-            audio_encoding=texttospeech.AudioEncoding.MP3
-        )
         try:
-            response = tts_client.synthesize_speech(
-                input=synthesis_input, voice=voice, audio_config=audio_config
-            )
         except Exception as e:
-            raise RuntimeError(f"Google TTS request failed for chunk {idx+1}: {e}") from e
-        part_path = lang_tmpdir / f"part_{idx}.mp3"
-        with open(part_path, "wb") as out_mp3:
-            out_mp3.write(response.audio_content)
         try:
-            segment = AudioSegment.from_mp3(part_path)
             audio_segments.append(segment)
         except CouldntDecodeError as e:
-            raise RuntimeError(f"Failed to decode MP3 audio chunk {idx+1} from {part_path}. Error: {e}") from e
     if not audio_segments:
         raise RuntimeError("No audio segments were successfully synthesized or decoded.")
     combined_audio = sum(audio_segments, AudioSegment.empty())
-    final_path = lang_tmpdir / "podcast_audio.mp3"
-    combined_audio.export(final_path, format="mp3")
     return final_path
 # ------------------------------------------------------------------
@@ -149,58 +147,32 @@ def synthesize_speech_google(
 # ------------------------------------------------------------------
 def generate_podcast(
-    gemini_api_key: Optional[str],
     pdf_file_obj: Optional[gr.File],
     selected_lang_names: List[str]
 ) -> List[Optional[Any]]:
-    if not gemini_api_key:
-        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
     if not pdf_file_obj:
         raise gr.Error("Please upload a PDF file.")
     if not selected_lang_names:
         raise gr.Error("Please select at least one language for the podcast.")
     try:
-        genai.configure(api_key=gemini_api_key)
-    except Exception as e:
-        raise gr.Error(f"Failed to configure Gemini API. Check your API key. Error: {e}")
-    # IMPORTANT: Google Cloud Text-to-Speech client initialization.
-    # It expects GOOGLE_APPLICATION_CREDENTIALS environment variable to be set,
-    # pointing to your service account JSON key file.
-    # In Hugging Face Spaces, upload this JSON file as a Secret, e.g., named
-    # `GOOGLE_CREDS_JSON_CONTENT` (paste the content of the file).
-    # Then, in your Space's startup or here, you'd write this content to a temporary file
-    # and set GOOGLE_APPLICATION_CREDENTIALS to that temp file's path.
-    # Or, if GOOGLE_APPLICATION_CREDENTIALS points to a file path directly (less secure for pasted content).
-    # Example for setting GOOGLE_APPLICATION_CREDENTIALS from a Space secret:
-    google_creds_json_content = os.getenv("GOOGLE_CREDS_JSON_CONTENT")
-    temp_creds_file = None
-    if google_creds_json_content:
-        try:
-            fd, temp_creds_path = tempfile.mkstemp(suffix=".json")
-            with os.fdopen(fd, "w") as tmp:
-                tmp.write(google_creds_json_content)
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_creds_path
-            temp_creds_file = Path(temp_creds_path)
-            gr.Info("Using GOOGLE_CREDS_JSON_CONTENT secret for Text-to-Speech API authentication.")
-        except Exception as e:
-            gr.Warning(f"Could not process GOOGLE_CREDS_JSON_CONTENT secret: {e}. TTS might fail.")
-    elif not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
-        gr.Warning(
-            "GOOGLE_APPLICATION_CREDENTIALS environment variable not set, and no "
-            "GOOGLE_CREDS_JSON_CONTENT secret found. "
-            "Google Text-to-Speech API calls may fail. "
-            "Please set up authentication for Google Cloud Text-to-Speech."
-        )
-    try:
-        tts_client = texttospeech.TextToSpeechClient()
     except Exception as e:
-        raise gr.Error(f"Failed to initialize Google Text-to-Speech client. Ensure authentication is set up. Error: {e}")
     selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
     results_data: Dict[str, Dict[str, Optional[str]]] = {
@@ -219,14 +191,18 @@ def generate_podcast(
             if not lecture_text.strip():
                 raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")
-            # Initialize Gemini model (e.g., 'gemini-1.5-flash' or 'gemini-pro')
-            # Choose a model appropriate for your task and quota.
-            gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest') # Or 'gemini-pro'
             for code in selected_codes:
                 info = LANG_INFO[code]
                 lang_name = info["name"]
-                google_tts_lang = info["tts_lang_code"]
                 gr.Info(f"Processing for {lang_name}...")
                 lang_tmpdir = tmpdir_base / code
@@ -237,8 +213,9 @@ def generate_podcast(
                 gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
                 prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
                 try:
                     response = gemini_model.generate_content(prompt_for_gemini)
-                    dialogue_raw = response.text # Accessing the text part of the response
                     if not dialogue_raw or not dialogue_raw.strip():
                         gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
@@ -251,20 +228,26 @@ def generate_podcast(
                     results_data[code]["script_file"] = str(script_file_path)
                 except Exception as e:
                     gr.Error(f"Error generating dialogue with Gemini for {lang_name}: {e}")
                     continue
                 if dialogue:
-                    gr.Info(f"Synthesizing speech for {lang_name} with Google TTS...")
-                    try:
-                        tts_path = synthesize_speech_google(dialogue, google_tts_lang, lang_tmpdir, tts_client)
-                        results_data[code]["audio"] = str(tts_path)
-                    except ValueError as e:
-                        gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
-                    except RuntimeError as e:
-                        gr.Error(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
-                    except Exception as e:
-                        gr.Error(f"Unexpected error during speech synthesis for {lang_name}: {e}")
         final_ordered_results: List[Optional[Any]] = []
         for code_key in LANG_INFO.keys():
@@ -276,24 +259,13 @@ def generate_podcast(
         gr.Info("Podcast generation complete!")
         return final_ordered_results
-    except gr.Error as e:
         raise e
-    except Exception as e:
         import traceback
         print("An unexpected error occurred in generate_podcast:")
         traceback.print_exc()
         raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")
-    finally:
-        # Clean up the temporary credentials file if it was created
-        if temp_creds_file and temp_creds_file.exists():
-            try:
-                temp_creds_file.unlink()
-                # Unset the env var if you want, though it's specific to this run
-                # if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ and os.environ["GOOGLE_APPLICATION_CREDENTIALS"] == str(temp_creds_file):
-                # del os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
-            except Exception as e_clean:
-                print(f"Warning: Could not clean up temporary credentials file {temp_creds_file}: {e_clean}")
 # ------------------------------------------------------------------
 # Gradio Interface Setup
@@ -302,14 +274,15 @@ language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]
 inputs = [
     gr.Textbox(
-        label="Enter your Google AI Studio API Key (for Gemini)",
         type="password",
-        placeholder="Paste your API key here",
     ),
     gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
     gr.CheckboxGroup(
         choices=language_names_ordered,
-        value=["English"],
         label="Select podcast language(s) to generate",
     ),
 ]
@@ -318,7 +291,7 @@ outputs = []
 for code in LANG_INFO.keys():
     info = LANG_INFO[code]
     lang_name = info["name"]
-    outputs.append(gr.Audio(label=f"{lang_name} Podcast (.mp3)", type="filepath"))
     outputs.append(gr.Markdown(label=f"{lang_name} Script"))
     outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))
@@ -326,23 +299,26 @@ iface = gr.Interface(
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
-    title="Lecture → Podcast & Script (Google Gemini & TTS)",
     description=(
-        "**IMPORTANT SETUP:**\n"
-        "1. Enter your Google AI Studio API Key for Gemini text generation.\n"
-        "2. For Text-to-Speech: Enable the 'Cloud Text-to-Speech API' in your Google Cloud Project. "
-        "Create a service account with 'Cloud Text-to-Speech API User' role, download its JSON key. "
-        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `GOOGLE_CREDS_JSON_CONTENT`. "
-        "Paste the *entire content* of your service account JSON key file as the value for this secret.\n\n"
         "Upload a lecture PDF, choose language(s), and receive an audio podcast "
-        "and its script. Dialogue by Google Gemini, speech by Google Cloud TTS."
     ),
     allow_flagging="never",
 )
 if __name__ == "__main__":
-    # Make sure GOOGLE_CREDS_JSON_CONTENT is available as an environment variable
-    # or GOOGLE_APPLICATION_CREDENTIALS is set correctly if running locally for testing.
-    # For local testing with a service account key file:
-    # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/your/service-account-file.json"
     iface.launch()

 # =============================================================
+# Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 # =============================================================
+# • **Text generation** – Google Gemini API (via user-provided genai API Key)
+# • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
 # -----------------------------------------------------------------
 import os
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
+# For Hugging Face TTS
+from huggingface_hub import InferenceClient
+# huggingface_hub has no `HubHTTPError`; its HTTP error class is
+# `HfHubHTTPError` (huggingface_hub.utils). Importing the non-existent name
+# raises ImportError at startup. Alias the real class so the
+# `except HubHTTPError` handler in the TTS helper keeps working unchanged.
+from huggingface_hub.utils import HfHubHTTPError as HubHTTPError
+# For Google Gemini
 try:
     import google.generativeai as genai
 except ImportError:
+    raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
+# ------------------------------------------------------------------
+# Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
+# ------------------------------------------------------------------
+hf_tts_client: Optional[InferenceClient] = None
+hf_token = os.getenv("HF_TOKEN")
+if hf_token:
+    hf_tts_client = InferenceClient(token=hf_token)
+else:
+    # This print will show in the Space logs if HF_TOKEN is missing
+    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
 # ------------------------------------------------------------------
+# Language metadata for Hugging Face MMS-TTS models
 # ------------------------------------------------------------------
 LANG_INFO: Dict[str, Dict[str, str]] = {
+    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
+    "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
+    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
+    "ur": {"name": "Urdu",    "tts_model": "facebook/mms-tts-urd"},
+    "ne": {"name": "Nepali",  "tts_model": "facebook/mms-tts-npi"},
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
 # ------------------------------------------------------------------
+# Prompt template for Gemini
 # ------------------------------------------------------------------
 PROMPT_TEMPLATE = textwrap.dedent(
     """
     except Exception as e:
         raise gr.Error(f"Failed to process PDF: {e}")
+TOKEN_LIMIT = 8000
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
     return text
 # ------------------------------------------------------------------
+# TTS helper using Hugging Face Inference API
 # ------------------------------------------------------------------
+CHUNK_CHAR_LIMIT_HF = 280
+def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
     sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
     sentences = [s.strip() for s in sentences_raw if s.strip()]
     if not sentences: return []
     return [chunk for chunk in chunks if chunk.strip()]
+def synthesize_speech_hf(
+    text: str,
+    hf_model_id: str,
     lang_tmpdir: Path,
+    tts_client: InferenceClient
 ) -> Path:
+    chunks = _split_to_chunks_hf(text)
     if not chunks:
         raise ValueError("Text resulted in no speakable chunks after splitting.")
     audio_segments: List[AudioSegment] = []
     for idx, chunk in enumerate(chunks):
+        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
         try:
+            audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
+        except HubHTTPError as e:
+            error_message = f"HF TTS request failed for chunk {idx+1} ('{chunk[:30]}...'): {e}"
+            if "Input validation error: `inputs` must be non-empty" in str(e) and not chunk.strip():
+                gr.Warning(f"Skipping an apparently empty chunk for HF TTS: Chunk {idx+1}")
+                continue
+            raise RuntimeError(error_message) from e
         except Exception as e:
+             raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e
+        part_path = lang_tmpdir / f"part_{idx}.flac"
+        part_path.write_bytes(audio_bytes)
         try:
+            segment = AudioSegment.from_file(part_path, format="flac")
             audio_segments.append(segment)
         except CouldntDecodeError as e:
+            raise RuntimeError(f"Failed to decode FLAC audio chunk {idx+1} from {part_path}. Error: {e}") from e
     if not audio_segments:
         raise RuntimeError("No audio segments were successfully synthesized or decoded.")
     combined_audio = sum(audio_segments, AudioSegment.empty())
+    final_path = lang_tmpdir / "podcast_audio.flac"
+    combined_audio.export(final_path, format="flac")
     return final_path
 # ------------------------------------------------------------------
 # ------------------------------------------------------------------
 def generate_podcast(
+    gemini_api_key_from_ui: Optional[str], # Explicitly named to show source
     pdf_file_obj: Optional[gr.File],
     selected_lang_names: List[str]
 ) -> List[Optional[Any]]:
+    if not gemini_api_key_from_ui: # Check the key provided from the UI input
+        raise gr.Error("Please enter your Google AI Studio API Key for Gemini in the input field.")
     if not pdf_file_obj:
         raise gr.Error("Please upload a PDF file.")
     if not selected_lang_names:
         raise gr.Error("Please select at least one language for the podcast.")
+    # Configure Gemini API using the key directly from the UI input
     try:
+        genai.configure(api_key=gemini_api_key_from_ui)
+        gr.Info("Gemini API configured successfully with the provided key.")
     except Exception as e:
+        raise gr.Error(f"Failed to configure Gemini API with the provided key. Please check your API key. Error: {e}")
+    # Check if HF TTS client is available (HF_TOKEN was provided as a secret)
+    if not hf_tts_client:
+        gr.Warning( # Changed to gr.Warning to allow script generation if TTS fails to init
+            "Hugging Face TTS client is not available (HF_TOKEN secret might be missing or invalid). "
+            "Speech synthesis will be skipped, but script generation will be attempted."
+        )
+        # Note: Script generation can still proceed, TTS will be skipped later if client is None.
     selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
     results_data: Dict[str, Dict[str, Optional[str]]] = {
             if not lecture_text.strip():
                 raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")
+            # Initialize Gemini model (e.g., 'gemini-1.5-flash-latest' or 'gemini-pro')
+            # This happens after genai.configure has been called.
+            try:
+                gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest') # Or 'gemini-pro'
+            except Exception as e:
+                raise gr.Error(f"Failed to initialize Gemini model. This might be due to an invalid API key or API access issues. Error: {e}")
             for code in selected_codes:
                 info = LANG_INFO[code]
                 lang_name = info["name"]
+                hf_tts_model_id = info["tts_model"]
                 gr.Info(f"Processing for {lang_name}...")
                 lang_tmpdir = tmpdir_base / code
                 gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
                 prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
                 try:
+                    # The gemini_model is initialized using the API key from genai.configure()
                     response = gemini_model.generate_content(prompt_for_gemini)
+                    dialogue_raw = response.text
                     if not dialogue_raw or not dialogue_raw.strip():
                         gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
                     results_data[code]["script_file"] = str(script_file_path)
                 except Exception as e:
+                    # Check if the error indicates an API key issue from Gemini
+                    if "API_KEY_INVALID" in str(e) or "permission" in str(e).lower():
+                         raise gr.Error(f"Gemini API Key error for {lang_name}: {e}. Please verify your API key and its permissions.")
                     gr.Error(f"Error generating dialogue with Gemini for {lang_name}: {e}")
                     continue
                 if dialogue:
+                    if hf_tts_client: # Only attempt TTS if client is available
+                        gr.Info(f"Synthesizing speech for {lang_name} with Hugging Face TTS ({hf_tts_model_id})...")
+                        try:
+                           tts_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
+                           results_data[code]["audio"] = str(tts_path)
+                        except ValueError as e: # From _split_to_chunks or synthesize_speech if no chunks
+                            gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
+                        except RuntimeError as e: # From synthesize_speech (TTS/pydub errors)
+                            gr.Error(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
+                        except Exception as e: # Catch any other unexpected errors during synthesis
+                            gr.Error(f"Unexpected error during speech synthesis for {lang_name}: {e}")
+                    else:
+                        gr.Info(f"HF TTS client not available. Skipping speech synthesis for {lang_name}.")
         final_ordered_results: List[Optional[Any]] = []
         for code_key in LANG_INFO.keys():
         gr.Info("Podcast generation complete!")
         return final_ordered_results
+    except gr.Error as e: # Re-raise Gradio-specific errors to be displayed in UI
         raise e
+    except Exception as e: # Catch other unexpected errors during the process
         import traceback
         print("An unexpected error occurred in generate_podcast:")
         traceback.print_exc()
         raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")
 # ------------------------------------------------------------------
 # Gradio Interface Setup
 inputs = [
     gr.Textbox(
+        label="Enter your Google AI Studio API Key (for Gemini text generation)",
         type="password",
+        placeholder="Paste your Gemini API key here",
+        # value=os.getenv("GEMINI_API_KEY_FOR_DEV") # Optional: for local dev default, remove for deployment
     ),
     gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
     gr.CheckboxGroup(
         choices=language_names_ordered,
+        value=["English"], # Default language selection
         label="Select podcast language(s) to generate",
     ),
 ]
 for code in LANG_INFO.keys():
     info = LANG_INFO[code]
     lang_name = info["name"]
+    outputs.append(gr.Audio(label=f"{lang_name} Podcast (.flac)", type="filepath"))
     outputs.append(gr.Markdown(label=f"{lang_name} Script"))
     outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
+    title="Lecture → Podcast & Script (Gemini Text + HF Speech)",
     description=(
+        "**SETUP:**\n"
+        "1. **Gemini API Key**: Enter your Google AI Studio API Key in the field below for text generation.\n"
+        "2. **Hugging Face Token (for Speech)**: For Text-to-Speech, ensure you have a Hugging Face Token. "
+        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `HF_TOKEN`. "
+        "Paste your Hugging Face token as its value.\n\n"
         "Upload a lecture PDF, choose language(s), and receive an audio podcast "
+        "and its script. Dialogue by Google Gemini, speech by Hugging Face MMS-TTS."
     ),
     allow_flagging="never",
 )
 if __name__ == "__main__":
+    # For local testing of HF_TOKEN, you can set it as an environment variable:
+    # os.environ["HF_TOKEN"] = "your_hf_token_here"
+    if not os.getenv("HF_TOKEN"):
+        print("Reminder: For local testing with TTS, set the HF_TOKEN environment variable.")
+    # The Gemini API key will be taken from the UI input.
+    # You could add a default value for local testing to the gr.Textbox `value` argument if desired.
+    # e.g. value=os.getenv("GEMINI_API_KEY_FOR_DEV")
     iface.launch()