HaiderAUT committed
Commit 369b2d2 · verified · 1 Parent(s): b3c9dda

Update app.py

Files changed (1)
  1. app.py +112 -136
app.py CHANGED
@@ -1,8 +1,8 @@
 # =============================================================
-# Hugging Face Space – Lecture → Podcast Generator (Google Gemini & TTS)
 # =============================================================
-# • **Text generation** – Google Gemini API
-# • **Speech synthesis** – Google Cloud Text-to-Speech API
 # -----------------------------------------------------------------

 import os
@@ -17,32 +17,40 @@ from PyPDF2 import PdfReader
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError

-# Import Google Cloud libraries
 try:
     import google.generativeai as genai
-    from google.cloud import texttospeech
 except ImportError:
-    raise ImportError(
-        "Please install required Google libraries: "
-        "pip install google-generativeai google-cloud-texttospeech"
-    )

 # ------------------------------------------------------------------
-# Language metadata for Google TTS (BCP-47 codes)
-# You might want to specify particular voices too (e.g., "en-US-Wavenet-D")
-# For simplicity, we'll let Google pick a standard voice for the language code.
 # ------------------------------------------------------------------
 LANG_INFO: Dict[str, Dict[str, str]] = {
-    "en": {"name": "English", "tts_lang_code": "en-US"},
-    "bn": {"name": "Bangla", "tts_lang_code": "bn-IN"},
-    "zh": {"name": "Chinese (Mandarin)", "tts_lang_code": "cmn-CN"},  # cmn for Mandarin
-    "ur": {"name": "Urdu", "tts_lang_code": "ur-PK"},
-    "ne": {"name": "Nepali", "tts_lang_code": "ne-NP"},
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

 # ------------------------------------------------------------------
-# Prompt template (adjust if needed for Gemini's style)
 # ------------------------------------------------------------------
 PROMPT_TEMPLATE = textwrap.dedent(
     """
@@ -64,7 +72,7 @@ def extract_pdf_text(pdf_path: str) -> str:
     except Exception as e:
         raise gr.Error(f"Failed to process PDF: {e}")

-TOKEN_LIMIT = 8000  # Word limit for input text

 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
@@ -74,12 +82,11 @@ def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     return text

 # ------------------------------------------------------------------
-# TTS helper – chunk long text (Google TTS has a limit of 5000 bytes per request)
 # ------------------------------------------------------------------
-CHUNK_CHAR_LIMIT = 1500  # Google TTS limit is 5000 bytes. Characters are safer.
-# Average 3 bytes/char for UTF-8, so 1500 chars is ~4500 bytes.

-def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
     sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
     sentences = [s.strip() for s in sentences_raw if s.strip()]
     if not sentences: return []
@@ -94,54 +101,45 @@ def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
     return [chunk for chunk in chunks if chunk.strip()]


-def synthesize_speech_google(
-    text: str,
-    google_lang_code: str,
     lang_tmpdir: Path,
-    tts_client: texttospeech.TextToSpeechClient
 ) -> Path:
-    """Splits text, synthesizes with Google TTS, concatenates MP3s."""
-    chunks = _split_to_chunks(text)
     if not chunks:
         raise ValueError("Text resulted in no speakable chunks after splitting.")

     audio_segments: List[AudioSegment] = []
     for idx, chunk in enumerate(chunks):
-        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with Google TTS...")
-
-        synthesis_input = texttospeech.SynthesisInput(text=chunk)
-        voice = texttospeech.VoiceSelectionParams(
-            language_code=google_lang_code,
-            # You can specify a voice name, e.g., "en-US-Wavenet-D"
-            # ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL  # Optional
-        )
-        audio_config = texttospeech.AudioConfig(
-            audio_encoding=texttospeech.AudioEncoding.MP3
-        )
-
         try:
-            response = tts_client.synthesize_speech(
-                input=synthesis_input, voice=voice, audio_config=audio_config
-            )
         except Exception as e:
-            raise RuntimeError(f"Google TTS request failed for chunk {idx+1}: {e}") from e

-        part_path = lang_tmpdir / f"part_{idx}.mp3"
-        with open(part_path, "wb") as out_mp3:
-            out_mp3.write(response.audio_content)

         try:
-            segment = AudioSegment.from_mp3(part_path)
             audio_segments.append(segment)
         except CouldntDecodeError as e:
-            raise RuntimeError(f"Failed to decode MP3 audio chunk {idx+1} from {part_path}. Error: {e}") from e

     if not audio_segments:
         raise RuntimeError("No audio segments were successfully synthesized or decoded.")

     combined_audio = sum(audio_segments, AudioSegment.empty())
-    final_path = lang_tmpdir / "podcast_audio.mp3"
-    combined_audio.export(final_path, format="mp3")
     return final_path

 # ------------------------------------------------------------------
@@ -149,58 +147,32 @@ def synthesize_speech_google(
 # ------------------------------------------------------------------

 def generate_podcast(
-    gemini_api_key: Optional[str],
     pdf_file_obj: Optional[gr.File],
     selected_lang_names: List[str]
 ) -> List[Optional[Any]]:

-    if not gemini_api_key:
-        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
     if not pdf_file_obj:
         raise gr.Error("Please upload a PDF file.")
     if not selected_lang_names:
         raise gr.Error("Please select at least one language for the podcast.")

     try:
-        genai.configure(api_key=gemini_api_key)
-    except Exception as e:
-        raise gr.Error(f"Failed to configure Gemini API. Check your API key. Error: {e}")
-
-    # IMPORTANT: Google Cloud Text-to-Speech client initialization.
-    # It expects GOOGLE_APPLICATION_CREDENTIALS environment variable to be set,
-    # pointing to your service account JSON key file.
-    # In Hugging Face Spaces, upload this JSON file as a Secret, e.g., named
-    # `GOOGLE_CREDS_JSON_CONTENT` (paste the content of the file).
-    # Then, in your Space's startup or here, you'd write this content to a temporary file
-    # and set GOOGLE_APPLICATION_CREDENTIALS to that temp file's path.
-    # Or, if GOOGLE_APPLICATION_CREDENTIALS points to a file path directly (less secure for pasted content).
-
-    # Example for setting GOOGLE_APPLICATION_CREDENTIALS from a Space secret:
-    google_creds_json_content = os.getenv("GOOGLE_CREDS_JSON_CONTENT")
-    temp_creds_file = None
-    if google_creds_json_content:
-        try:
-            fd, temp_creds_path = tempfile.mkstemp(suffix=".json")
-            with os.fdopen(fd, "w") as tmp:
-                tmp.write(google_creds_json_content)
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_creds_path
-            temp_creds_file = Path(temp_creds_path)
-            gr.Info("Using GOOGLE_CREDS_JSON_CONTENT secret for Text-to-Speech API authentication.")
-        except Exception as e:
-            gr.Warning(f"Could not process GOOGLE_CREDS_JSON_CONTENT secret: {e}. TTS might fail.")
-    elif not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
-        gr.Warning(
-            "GOOGLE_APPLICATION_CREDENTIALS environment variable not set, and no "
-            "GOOGLE_CREDS_JSON_CONTENT secret found. "
-            "Google Text-to-Speech API calls may fail. "
-            "Please set up authentication for Google Cloud Text-to-Speech."
-        )
-
-    try:
-        tts_client = texttospeech.TextToSpeechClient()
     except Exception as e:
-        raise gr.Error(f"Failed to initialize Google Text-to-Speech client. Ensure authentication is set up. Error: {e}")


     selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
     results_data: Dict[str, Dict[str, Optional[str]]] = {
@@ -219,14 +191,18 @@ def generate_podcast(
         if not lecture_text.strip():
             raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")

-        # Initialize Gemini model (e.g., 'gemini-1.5-flash' or 'gemini-pro')
-        # Choose a model appropriate for your task and quota.
-        gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')  # Or 'gemini-pro'

         for code in selected_codes:
             info = LANG_INFO[code]
             lang_name = info["name"]
-            google_tts_lang = info["tts_lang_code"]

             gr.Info(f"Processing for {lang_name}...")
             lang_tmpdir = tmpdir_base / code
@@ -237,8 +213,9 @@ def generate_podcast(
             gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
             prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
             try:
                 response = gemini_model.generate_content(prompt_for_gemini)
-                dialogue_raw = response.text  # Accessing the text part of the response

                 if not dialogue_raw or not dialogue_raw.strip():
                     gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
@@ -251,20 +228,26 @@ def generate_podcast(
                 results_data[code]["script_file"] = str(script_file_path)

             except Exception as e:
                 gr.Error(f"Error generating dialogue with Gemini for {lang_name}: {e}")
                 continue

             if dialogue:
-                gr.Info(f"Synthesizing speech for {lang_name} with Google TTS...")
-                try:
-                    tts_path = synthesize_speech_google(dialogue, google_tts_lang, lang_tmpdir, tts_client)
-                    results_data[code]["audio"] = str(tts_path)
-                except ValueError as e:
-                    gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
-                except RuntimeError as e:
-                    gr.Error(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
-                except Exception as e:
-                    gr.Error(f"Unexpected error during speech synthesis for {lang_name}: {e}")

         final_ordered_results: List[Optional[Any]] = []
         for code_key in LANG_INFO.keys():
@@ -276,24 +259,13 @@ def generate_podcast(
         gr.Info("Podcast generation complete!")
         return final_ordered_results

-    except gr.Error as e:
         raise e
-    except Exception as e:
         import traceback
         print("An unexpected error occurred in generate_podcast:")
         traceback.print_exc()
         raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")
-    finally:
-        # Clean up the temporary credentials file if it was created
-        if temp_creds_file and temp_creds_file.exists():
-            try:
-                temp_creds_file.unlink()
-                # Unset the env var if you want, though it's specific to this run
-                # if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ and os.environ["GOOGLE_APPLICATION_CREDENTIALS"] == str(temp_creds_file):
-                #     del os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
-            except Exception as e_clean:
-                print(f"Warning: Could not clean up temporary credentials file {temp_creds_file}: {e_clean}")
-

 # ------------------------------------------------------------------
 # Gradio Interface Setup
@@ -302,14 +274,15 @@ language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]

 inputs = [
     gr.Textbox(
-        label="Enter your Google AI Studio API Key (for Gemini)",
         type="password",
-        placeholder="Paste your API key here",
     ),
     gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
     gr.CheckboxGroup(
         choices=language_names_ordered,
-        value=["English"],
         label="Select podcast language(s) to generate",
     ),
 ]
@@ -318,7 +291,7 @@ outputs = []
 for code in LANG_INFO.keys():
     info = LANG_INFO[code]
     lang_name = info["name"]
-    outputs.append(gr.Audio(label=f"{lang_name} Podcast (.mp3)", type="filepath"))
     outputs.append(gr.Markdown(label=f"{lang_name} Script"))
     outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))

@@ -326,23 +299,26 @@ iface = gr.Interface(
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
-    title="Lecture → Podcast & Script (Google Gemini & TTS)",
     description=(
-        "**IMPORTANT SETUP:**\n"
-        "1. Enter your Google AI Studio API Key for Gemini text generation.\n"
-        "2. For Text-to-Speech: Enable the 'Cloud Text-to-Speech API' in your Google Cloud Project. "
-        "Create a service account with 'Cloud Text-to-Speech API User' role, download its JSON key. "
-        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `GOOGLE_CREDS_JSON_CONTENT`. "
-        "Paste the *entire content* of your service account JSON key file as the value for this secret.\n\n"
         "Upload a lecture PDF, choose language(s), and receive an audio podcast "
-        "and its script. Dialogue by Google Gemini, speech by Google Cloud TTS."
     ),
     allow_flagging="never",
 )

 if __name__ == "__main__":
-    # Make sure GOOGLE_CREDS_JSON_CONTENT is available as an environment variable
-    # or GOOGLE_APPLICATION_CREDENTIALS is set correctly if running locally for testing.
-    # For local testing with a service account key file:
-    # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/your/service-account-file.json"
     iface.launch()
 
 # =============================================================
+# Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 # =============================================================
+# • **Text generation** – Google Gemini API (via user-provided genai API Key)
+# • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
 # -----------------------------------------------------------------

 import os
 
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError

+# For Hugging Face TTS
+from huggingface_hub import InferenceClient
+from huggingface_hub.utils import HfHubHTTPError
+
+# For Google Gemini
 try:
     import google.generativeai as genai
 except ImportError:
+    raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
+
+# ------------------------------------------------------------------
+# Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
+# ------------------------------------------------------------------
+hf_tts_client: Optional[InferenceClient] = None
+hf_token = os.getenv("HF_TOKEN")
+if hf_token:
+    hf_tts_client = InferenceClient(token=hf_token)
+else:
+    # This print will show in the Space logs if HF_TOKEN is missing
+    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")

 # ------------------------------------------------------------------
+# Language metadata for Hugging Face MMS-TTS models
 # ------------------------------------------------------------------
 LANG_INFO: Dict[str, Dict[str, str]] = {
+    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
+    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
+    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
+    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
+    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

 # ------------------------------------------------------------------
+# Prompt template for Gemini
 # ------------------------------------------------------------------
 PROMPT_TEMPLATE = textwrap.dedent(
     """
 
     except Exception as e:
         raise gr.Error(f"Failed to process PDF: {e}")

+TOKEN_LIMIT = 8000

 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
 
     return text

 # ------------------------------------------------------------------
+# TTS helper using Hugging Face Inference API
 # ------------------------------------------------------------------
+CHUNK_CHAR_LIMIT_HF = 280

+def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
     sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
     sentences = [s.strip() for s in sentences_raw if s.strip()]
     if not sentences: return []
 
     return [chunk for chunk in chunks if chunk.strip()]


+def synthesize_speech_hf(
+    text: str,
+    hf_model_id: str,
     lang_tmpdir: Path,
+    tts_client: InferenceClient
 ) -> Path:
+    chunks = _split_to_chunks_hf(text)
     if not chunks:
         raise ValueError("Text resulted in no speakable chunks after splitting.")

     audio_segments: List[AudioSegment] = []
     for idx, chunk in enumerate(chunks):
+        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
         try:
+            audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
+        except HfHubHTTPError as e:
+            error_message = f"HF TTS request failed for chunk {idx+1} ('{chunk[:30]}...'): {e}"
+            if "Input validation error: `inputs` must be non-empty" in str(e) and not chunk.strip():
+                gr.Warning(f"Skipping an apparently empty chunk for HF TTS: Chunk {idx+1}")
+                continue
+            raise RuntimeError(error_message) from e
         except Exception as e:
+            raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e

+        part_path = lang_tmpdir / f"part_{idx}.flac"
+        part_path.write_bytes(audio_bytes)

         try:
+            segment = AudioSegment.from_file(part_path, format="flac")
             audio_segments.append(segment)
         except CouldntDecodeError as e:
+            raise RuntimeError(f"Failed to decode FLAC audio chunk {idx+1} from {part_path}. Error: {e}") from e

     if not audio_segments:
         raise RuntimeError("No audio segments were successfully synthesized or decoded.")

     combined_audio = sum(audio_segments, AudioSegment.empty())
+    final_path = lang_tmpdir / "podcast_audio.flac"
+    combined_audio.export(final_path, format="flac")
     return final_path

 # ------------------------------------------------------------------
 
 # ------------------------------------------------------------------

 def generate_podcast(
+    gemini_api_key_from_ui: Optional[str],  # Explicitly named to show source
     pdf_file_obj: Optional[gr.File],
     selected_lang_names: List[str]
 ) -> List[Optional[Any]]:

+    if not gemini_api_key_from_ui:  # Check the key provided from the UI input
+        raise gr.Error("Please enter your Google AI Studio API Key for Gemini in the input field.")
     if not pdf_file_obj:
         raise gr.Error("Please upload a PDF file.")
     if not selected_lang_names:
         raise gr.Error("Please select at least one language for the podcast.")

+    # Configure Gemini API using the key directly from the UI input
     try:
+        genai.configure(api_key=gemini_api_key_from_ui)
+        gr.Info("Gemini API configured successfully with the provided key.")
     except Exception as e:
+        raise gr.Error(f"Failed to configure Gemini API with the provided key. Please check your API key. Error: {e}")

+    # Check if HF TTS client is available (HF_TOKEN was provided as a secret)
+    if not hf_tts_client:
+        gr.Warning(  # A warning rather than an error, so script generation can still run if TTS is unavailable
+            "Hugging Face TTS client is not available (HF_TOKEN secret might be missing or invalid). "
+            "Speech synthesis will be skipped, but script generation will be attempted."
+        )
+        # Note: script generation can still proceed; TTS is skipped later if the client is None.

     selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
     results_data: Dict[str, Dict[str, Optional[str]]] = {
 
         if not lecture_text.strip():
             raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")

+        # Initialize Gemini model (e.g., 'gemini-1.5-flash-latest' or 'gemini-pro')
+        # This happens after genai.configure has been called.
+        try:
+            gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')  # Or 'gemini-pro'
+        except Exception as e:
+            raise gr.Error(f"Failed to initialize Gemini model. This might be due to an invalid API key or API access issues. Error: {e}")
+

         for code in selected_codes:
             info = LANG_INFO[code]
             lang_name = info["name"]
+            hf_tts_model_id = info["tts_model"]

             gr.Info(f"Processing for {lang_name}...")
             lang_tmpdir = tmpdir_base / code
 
             gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
             prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
             try:
+                # The gemini_model is initialized using the API key from genai.configure()
                 response = gemini_model.generate_content(prompt_for_gemini)
+                dialogue_raw = response.text

                 if not dialogue_raw or not dialogue_raw.strip():
                     gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
 
                 results_data[code]["script_file"] = str(script_file_path)

             except Exception as e:
+                # Check if the error indicates an API key issue from Gemini
+                if "API_KEY_INVALID" in str(e) or "permission" in str(e).lower():
+                    raise gr.Error(f"Gemini API Key error for {lang_name}: {e}. Please verify your API key and its permissions.")
                 gr.Error(f"Error generating dialogue with Gemini for {lang_name}: {e}")
                 continue

             if dialogue:
+                if hf_tts_client:  # Only attempt TTS if client is available
+                    gr.Info(f"Synthesizing speech for {lang_name} with Hugging Face TTS ({hf_tts_model_id})...")
+                    try:
+                        tts_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
+                        results_data[code]["audio"] = str(tts_path)
+                    except ValueError as e:  # From _split_to_chunks_hf or synthesize_speech_hf if no chunks
+                        gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
+                    except RuntimeError as e:  # From synthesize_speech_hf (TTS/pydub errors)
+                        gr.Error(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
+                    except Exception as e:  # Catch any other unexpected errors during synthesis
+                        gr.Error(f"Unexpected error during speech synthesis for {lang_name}: {e}")
+                else:
+                    gr.Info(f"HF TTS client not available. Skipping speech synthesis for {lang_name}.")

         final_ordered_results: List[Optional[Any]] = []
         for code_key in LANG_INFO.keys():
 
         gr.Info("Podcast generation complete!")
         return final_ordered_results

+    except gr.Error as e:  # Re-raise Gradio-specific errors to be displayed in UI
         raise e
+    except Exception as e:  # Catch other unexpected errors during the process
         import traceback
         print("An unexpected error occurred in generate_podcast:")
         traceback.print_exc()
         raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")

 # ------------------------------------------------------------------
 # Gradio Interface Setup
 

 inputs = [
     gr.Textbox(
+        label="Enter your Google AI Studio API Key (for Gemini text generation)",
         type="password",
+        placeholder="Paste your Gemini API key here",
+        # value=os.getenv("GEMINI_API_KEY_FOR_DEV")  # Optional: for local dev default, remove for deployment
     ),
     gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
     gr.CheckboxGroup(
         choices=language_names_ordered,
+        value=["English"],  # Default language selection
         label="Select podcast language(s) to generate",
     ),
 ]
 
 for code in LANG_INFO.keys():
     info = LANG_INFO[code]
     lang_name = info["name"]
+    outputs.append(gr.Audio(label=f"{lang_name} Podcast (.flac)", type="filepath"))
     outputs.append(gr.Markdown(label=f"{lang_name} Script"))
     outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))

 
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
+    title="Lecture → Podcast & Script (Gemini Text + HF Speech)",
     description=(
+        "**SETUP:**\n"
+        "1. **Gemini API Key**: Enter your Google AI Studio API Key in the field below for text generation.\n"
+        "2. **Hugging Face Token (for Speech)**: For Text-to-Speech, ensure you have a Hugging Face Token. "
+        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `HF_TOKEN`. "
+        "Paste your Hugging Face token as its value.\n\n"
         "Upload a lecture PDF, choose language(s), and receive an audio podcast "
+        "and its script. Dialogue by Google Gemini, speech by Hugging Face MMS-TTS."
     ),
     allow_flagging="never",
 )

 if __name__ == "__main__":
+    # For local testing of HF_TOKEN, you can set it as an environment variable:
+    # os.environ["HF_TOKEN"] = "your_hf_token_here"
+    if not os.getenv("HF_TOKEN"):
+        print("Reminder: For local testing with TTS, set the HF_TOKEN environment variable.")
+    # The Gemini API key will be taken from the UI input.
+    # You could add a default value for local testing to the gr.Textbox `value` argument if desired.
+    # e.g. value=os.getenv("GEMINI_API_KEY_FOR_DEV")
+
     iface.launch()
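
The new speech path boils down to one InferenceClient.text_to_speech call per text chunk. Below is a minimal local sketch for checking that your HF_TOKEN and one of the MMS-TTS model ids above respond, independent of the Gradio app; the test sentence and the output filename are illustrative only.

# Standalone sanity check of the HF Inference API TTS call used in app.py.
# Assumes HF_TOKEN is exported in the environment; the model id mirrors LANG_INFO["en"].
import os
from huggingface_hub import InferenceClient

client = InferenceClient(token=os.environ["HF_TOKEN"])
audio_bytes = client.text_to_speech(
    "Hello, this is a short synthesis test.",
    model="facebook/mms-tts-eng",
)
# app.py writes the returned raw bytes out as FLAC; do the same here to listen to the result.
with open("tts_check.flac", "wb") as f:
    f.write(audio_bytes)
print(f"Wrote {len(audio_bytes)} bytes to tts_check.flac")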