HaiderAUT committed on
Commit
764a881
·
verified ·
1 Parent(s): 1425202

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -264
app.py CHANGED
@@ -1,10 +1,7 @@
1
  # =============================================================
2
- # Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 
3
  # =============================================================
4
- # • **Text generation** – Google Gemini API (via user-provided genai API Key)
5
- # • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
6
- # -----------------------------------------------------------------
7
-
8
  import os
9
  import re
10
  import tempfile
@@ -17,29 +14,19 @@ from PyPDF2 import PdfReader
17
  from pydub import AudioSegment
18
  from pydub.exceptions import CouldntDecodeError
19
 
20
- # For Hugging Face TTS
21
  from huggingface_hub import InferenceClient
22
 
23
- # For Google Gemini
24
- try:
25
- import google.generativeai as genai
26
- except ImportError:
27
- raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
28
 
29
  # ------------------------------------------------------------------
30
- # Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
31
  # ------------------------------------------------------------------
32
- hf_tts_client: Optional[InferenceClient] = None
33
  hf_token = os.getenv("HF_TOKEN")
34
- if hf_token:
35
- hf_tts_client = InferenceClient(token=hf_token)
36
- else:
37
- # This print will show in the Space logs if HF_TOKEN is missing
38
- print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
39
 
40
- # ------------------------------------------------------------------
41
- # Language metadata for Hugging Face MMS-TTS models
42
- # ------------------------------------------------------------------
43
  LANG_INFO: Dict[str, Dict[str, str]] = {
44
  "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
45
  "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
@@ -49,276 +36,122 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
49
  }
50
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
51
 
52
- # ------------------------------------------------------------------
53
- # Prompt template for Gemini
54
- # ------------------------------------------------------------------
55
  PROMPT_TEMPLATE = textwrap.dedent(
56
  """
57
  You are producing a lively two-host educational podcast in {lang_name}.
58
  Summarize the following lecture content into a dialogue of **approximately 300 words**.
59
  Make it engaging: hosts ask questions, clarify ideas with analogies, and
60
- wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
61
 
62
  ### Lecture Content
63
  {content}
64
  """
65
  )
66
 
67
- # PDF helpers (unchanged) -------------------------------------------
68
- def extract_pdf_text(pdf_path: str) -> str:
69
- try:
70
- reader = PdfReader(pdf_path)
71
- return "\n".join(page.extract_text() or "" for page in reader.pages)
72
- except Exception as e:
73
- raise gr.Error(f"Failed to process PDF: {e}")
74
-
75
  TOKEN_LIMIT = 8000
76
 
 
 
 
 
77
  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
78
  words = text.split()
79
- if len(words) > limit:
80
- gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit LLM context window.")
81
- return " ".join(words[:limit])
82
- return text
83
-
84
- # ------------------------------------------------------------------
85
- # TTS helper using Hugging Face Inference API
86
- # ------------------------------------------------------------------
87
- CHUNK_CHAR_LIMIT_HF = 280
88
-
89
- def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
90
- sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
91
- sentences = [s.strip() for s in sentences_raw if s.strip()]
92
- if not sentences: return []
93
- chunks, current_chunk = [], ""
94
- for sent in sentences:
95
- if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
96
- chunks.append(current_chunk)
97
- current_chunk = sent
98
  else:
99
- current_chunk += (" " + sent) if current_chunk else sent
100
- if current_chunk: chunks.append(current_chunk)
101
- return [chunk for chunk in chunks if chunk.strip()]
102
-
103
-
104
- def synthesize_speech_hf(
105
- text: str,
106
- hf_model_id: str,
107
- lang_tmpdir: Path,
108
- tts_client: InferenceClient
109
- ) -> Path:
110
- chunks = _split_to_chunks_hf(text)
111
- if not chunks:
112
- raise ValueError("Text resulted in no speakable chunks after splitting.")
113
-
114
- audio_segments: List[AudioSegment] = []
115
- for idx, chunk in enumerate(chunks):
116
- gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
117
- try:
118
- audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
119
- except HubHTTPError as e:
120
- error_message = f"HF TTS request failed for chunk {idx+1} ('{chunk[:30]}...'): {e}"
121
- if "Input validation error: `inputs` must be non-empty" in str(e) and not chunk.strip():
122
- gr.Warning(f"Skipping an apparently empty chunk for HF TTS: Chunk {idx+1}")
123
- continue
124
- raise RuntimeError(error_message) from e
125
- except Exception as e:
126
- raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e
127
-
128
- part_path = lang_tmpdir / f"part_{idx}.flac"
129
- part_path.write_bytes(audio_bytes)
130
-
131
- try:
132
- segment = AudioSegment.from_file(part_path, format="flac")
133
- audio_segments.append(segment)
134
- except CouldntDecodeError as e:
135
- raise RuntimeError(f"Failed to decode FLAC audio chunk {idx+1} from {part_path}. Error: {e}") from e
136
-
137
- if not audio_segments:
138
- raise RuntimeError("No audio segments were successfully synthesized or decoded.")
139
-
140
- combined_audio = sum(audio_segments, AudioSegment.empty())
141
- final_path = lang_tmpdir / "podcast_audio.flac"
142
- combined_audio.export(final_path, format="flac")
143
- return final_path
144
-
145
- # ------------------------------------------------------------------
146
- # Main pipeline function for Gradio
147
- # ------------------------------------------------------------------
148
 
149
  def generate_podcast(
150
- gemini_api_key_from_ui: Optional[str], # Explicitly named to show source
151
- pdf_file_obj: Optional[gr.File],
152
- selected_lang_names: List[str]
153
  ) -> List[Optional[Any]]:
154
-
155
- if not gemini_api_key_from_ui: # Check the key provided from the UI input
156
- raise gr.Error("Please enter your Google AI Studio API Key for Gemini in the input field.")
157
- if not pdf_file_obj:
158
- raise gr.Error("Please upload a PDF file.")
159
- if not selected_lang_names:
160
- raise gr.Error("Please select at least one language for the podcast.")
161
-
162
- # Configure Gemini API using the key directly from the UI input
163
- try:
164
- genai.configure(api_key=gemini_api_key_from_ui)
165
- gr.Info("Gemini API configured successfully with the provided key.")
166
- except Exception as e:
167
- raise gr.Error(f"Failed to configure Gemini API with the provided key. Please check your API key. Error: {e}")
168
-
169
- # Check if HF TTS client is available (HF_TOKEN was provided as a secret)
170
- if not hf_tts_client:
171
- gr.Warning( # Changed to gr.Warning to allow script generation if TTS fails to init
172
- "Hugging Face TTS client is not available (HF_TOKEN secret might be missing or invalid). "
173
- "Speech synthesis will be skipped, but script generation will be attempted."
174
- )
175
- # Note: Script generation can still proceed, TTS will be skipped later if client is None.
176
-
177
- selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
178
- results_data: Dict[str, Dict[str, Optional[str]]] = {
179
- code: {"audio": None, "script_text": None, "script_file": None}
180
- for code in LANG_INFO.keys()
181
- }
182
-
183
- try:
184
- with tempfile.TemporaryDirectory() as td:
185
- tmpdir_base = Path(td)
186
-
187
- gr.Info("Extracting text from PDF...")
188
- lecture_raw = extract_pdf_text(pdf_file_obj.name)
189
- lecture_text = truncate_text(lecture_raw)
190
-
191
- if not lecture_text.strip():
192
- raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")
193
-
194
- # Initialize Gemini model (e.g., 'gemini-1.5-flash-latest' or 'gemini-pro')
195
- # This happens after genai.configure has been called.
196
- try:
197
- gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest') # Or 'gemini-pro'
198
- except Exception as e:
199
- raise gr.Error(f"Failed to initialize Gemini model. This might be due to an invalid API key or API access issues. Error: {e}")
200
-
201
-
202
- for code in selected_codes:
203
- info = LANG_INFO[code]
204
- lang_name = info["name"]
205
- hf_tts_model_id = info["tts_model"]
206
-
207
- gr.Info(f"Processing for {lang_name}...")
208
- lang_tmpdir = tmpdir_base / code
209
- lang_tmpdir.mkdir(parents=True, exist_ok=True)
210
-
211
- dialogue: Optional[str] = None
212
-
213
- gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
214
- prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
215
- try:
216
- # The gemini_model is initialized using the API key from genai.configure()
217
- response = gemini_model.generate_content(prompt_for_gemini)
218
- dialogue_raw = response.text
219
-
220
- if not dialogue_raw or not dialogue_raw.strip():
221
- gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
222
- continue
223
-
224
- dialogue = dialogue_raw
225
- results_data[code]["script_text"] = dialogue
226
- script_file_path = lang_tmpdir / f"podcast_script_{code}.txt"
227
- script_file_path.write_text(dialogue, encoding="utf-8")
228
- results_data[code]["script_file"] = str(script_file_path)
229
-
230
- except Exception as e:
231
- # Check if the error indicates an API key issue from Gemini
232
- if "API_KEY_INVALID" in str(e) or "permission" in str(e).lower():
233
- raise gr.Error(f"Gemini API Key error for {lang_name}: {e}. Please verify your API key and its permissions.")
234
- gr.Error(f"Error generating dialogue with Gemini for {lang_name}: {e}")
235
- continue
236
-
237
- if dialogue:
238
- if hf_tts_client: # Only attempt TTS if client is available
239
- gr.Info(f"Synthesizing speech for {lang_name} with Hugging Face TTS ({hf_tts_model_id})...")
240
- try:
241
- tts_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
242
- results_data[code]["audio"] = str(tts_path)
243
- except ValueError as e: # From _split_to_chunks or synthesize_speech if no chunks
244
- gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
245
- except RuntimeError as e: # From synthesize_speech (TTS/pydub errors)
246
- gr.Error(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
247
- except Exception as e: # Catch any other unexpected errors during synthesis
248
- gr.Error(f"Unexpected error during speech synthesis for {lang_name}: {e}")
249
- else:
250
- gr.Info(f"HF TTS client not available. Skipping speech synthesis for {lang_name}.")
251
-
252
- final_ordered_results: List[Optional[Any]] = []
253
- for code_key in LANG_INFO.keys():
254
- lang_output_data = results_data[code_key]
255
- final_ordered_results.append(lang_output_data["audio"])
256
- final_ordered_results.append(lang_output_data["script_text"])
257
- final_ordered_results.append(lang_output_data["script_file"])
258
-
259
- gr.Info("Podcast generation complete!")
260
- return final_ordered_results
261
-
262
- except gr.Error as e: # Re-raise Gradio-specific errors to be displayed in UI
263
- raise e
264
- except Exception as e: # Catch other unexpected errors during the process
265
- import traceback
266
- print("An unexpected error occurred in generate_podcast:")
267
- traceback.print_exc()
268
- raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")
269
-
270
- # ------------------------------------------------------------------
271
- # Gradio Interface Setup
272
- # ------------------------------------------------------------------
273
- language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]
274
-
275
  inputs = [
276
- gr.Textbox(
277
- label="Enter your Google AI Studio API Key (for Gemini text generation)",
278
- type="password",
279
- placeholder="Paste your Gemini API key here",
280
- # value=os.getenv("GEMINI_API_KEY_FOR_DEV") # Optional: for local dev default, remove for deployment
281
- ),
282
- gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
283
- gr.CheckboxGroup(
284
- choices=language_names_ordered,
285
- value=["English"], # Default language selection
286
- label="Select podcast language(s) to generate",
287
- ),
288
  ]
289
-
290
  outputs = []
291
- for code in LANG_INFO.keys():
292
- info = LANG_INFO[code]
293
- lang_name = info["name"]
294
- outputs.append(gr.Audio(label=f"{lang_name} Podcast (.flac)", type="filepath"))
295
- outputs.append(gr.Markdown(label=f"{lang_name} Script"))
296
- outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))
297
 
298
  iface = gr.Interface(
299
  fn=generate_podcast,
300
  inputs=inputs,
301
  outputs=outputs,
302
- title="Lecture → Podcast & Script (Gemini Text + HF Speech)",
303
- description=(
304
- "**SETUP:**\n"
305
- "1. **Gemini API Key**: Enter your Google AI Studio API Key in the field below for text generation.\n"
306
- "2. **Hugging Face Token (for Speech)**: For Text-to-Speech, ensure you have a Hugging Face Token. "
307
- "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `HF_TOKEN`. "
308
- "Paste your Hugging Face token as its value.\n\n"
309
- "Upload a lecture PDF, choose language(s), and receive an audio podcast "
310
- "and its script. Dialogue by Google Gemini, speech by Hugging Face MMS-TTS."
311
- ),
312
- allow_flagging="never",
313
  )
314
 
315
  if __name__ == "__main__":
316
- # For local testing of HF_TOKEN, you can set it as an environment variable:
317
- # os.environ["HF_TOKEN"] = "your_hf_token_here"
318
- if not os.getenv("HF_TOKEN"):
319
- print("Reminder: For local testing with TTS, set the HF_TOKEN environment variable.")
320
- # The Gemini API key will be taken from the UI input.
321
- # You could add a default value for local testing to the gr.Textbox `value` argument if desired.
322
- # e.g. value=os.getenv("GEMINI_API_KEY_FOR_DEV")
323
-
324
- iface.launch()
 
1
  # =============================================================
2
+ # Lecture → Podcast & Script Generator (Gemini + HF TTS)
3
+ # Modified: Script outputs rendered as HTML
4
  # =============================================================
 
 
 
 
5
  import os
6
  import re
7
  import tempfile
 
14
  from pydub import AudioSegment
15
  from pydub.exceptions import CouldntDecodeError
16
 
17
+ # Hugging Face TTS
18
  from huggingface_hub import InferenceClient
19
 
20
+ # Google Gemini
21
+ import google.generativeai as genai
 
 
 
22
 
23
  # ------------------------------------------------------------------
24
# HF TTS client
# ------------------------------------------------------------------
# The client is only built when the HF_TOKEN environment variable holds a
# non-empty value; otherwise hf_tts_client stays None.
hf_token = os.getenv("HF_TOKEN")
hf_tts_client: Optional[InferenceClient] = None
if hf_token:
    hf_tts_client = InferenceClient(token=hf_token)
 
 
 
 
28
 
29
+ # Language metadata
 
 
30
  LANG_INFO: Dict[str, Dict[str, str]] = {
31
  "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
32
  "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
 
36
  }
37
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
38
 
39
+ # Prompt template
 
 
40
  PROMPT_TEMPLATE = textwrap.dedent(
41
  """
42
  You are producing a lively two-host educational podcast in {lang_name}.
43
  Summarize the following lecture content into a dialogue of **approximately 300 words**.
44
  Make it engaging: hosts ask questions, clarify ideas with analogies, and
45
+ wrap up with a concise recap. Preserve technical accuracy.
46
 
47
  ### Lecture Content
48
  {content}
49
  """
50
  )
51
 
52
+ # PDF extraction
 
 
 
 
 
 
 
53
  TOKEN_LIMIT = 8000
54
 
55
def extract_pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path*, joined by newlines.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string, so the number of joined segments always equals the page count.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
58
def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cap *text* at ``limit`` whitespace-separated words.

    Text that is already within the limit is returned untouched (original
    whitespace preserved). Longer text is rebuilt from its first ``limit``
    words, joined by single spaces.
    """
    tokens = text.split()
    if len(tokens) <= limit:
        return text
    return " ".join(tokens[:limit])
62
+
63
# TTS chunking
CHUNK_CHAR_LIMIT = 280


def _split_long_piece(piece: str, limit: int) -> List[str]:
    """Break one over-long sentence into pieces of at most *limit* characters.

    Pieces are cut on whitespace; a single word longer than *limit* has no
    boundary to break on and is sliced by character count instead.
    """
    parts: List[str] = []
    curr = ""
    for word in piece.split():
        while len(word) > limit:
            # Flush what we have so the sliced word keeps its position.
            if curr:
                parts.append(curr)
                curr = ""
            parts.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        if curr and len(curr) + len(word) + 1 > limit:
            parts.append(curr)
            curr = word
        else:
            curr = f"{curr} {word}" if curr else word
    if curr:
        parts.append(curr)
    return parts


def split_chunks(text: str, limit: Optional[int] = None) -> List[str]:
    """Split *text* into chunks of at most ``limit`` characters.

    The text is first split into sentences after ``.``, ``!`` or ``?``;
    consecutive sentences are then packed greedily into chunks separated by a
    single space. A sentence that alone exceeds the limit is broken on word
    boundaries so that no returned chunk is longer than ``limit``.

    Args:
        text: The text to split.
        limit: Maximum chunk length in characters. ``None`` (the default)
            uses the module-level ``CHUNK_CHAR_LIMIT`` at call time.

    Returns:
        The list of non-empty chunks, in order; empty for blank input.

    Raises:
        ValueError: If ``limit`` is not a positive integer.
    """
    if limit is None:
        limit = CHUNK_CHAR_LIMIT
    if limit <= 0:
        raise ValueError("limit must be a positive integer")

    chunks: List[str] = []
    curr = ""
    for sentence in re.split(r"(?<=[.!?])\s+", text.strip()):
        if not sentence:
            # re.split yields [''] for blank input; nothing to speak.
            continue
        if len(sentence) <= limit:
            pieces = [sentence]
        else:
            pieces = _split_long_piece(sentence, limit)
        for s in pieces:
            if curr and len(curr) + len(s) + 1 > limit:
                chunks.append(curr)
                curr = s
            else:
                curr = f"{curr} {s}" if curr else s
    if curr:
        chunks.append(curr)
    return chunks
77
+
78
# Synthesize speech

def synthesize(text: str, model_id: str, outdir: Path) -> str:
    """Convert *text* to speech and return the path of the combined FLAC file.

    The text is split with ``split_chunks``; each chunk is sent to the
    module-level ``hf_tts_client``, written to ``outdir/part<i>.flac``,
    decoded as FLAC, and all segments are concatenated into
    ``outdir/podcast.flac``.

    Args:
        text: The script to speak.
        model_id: Model identifier passed to ``text_to_speech``.
        outdir: Directory for the per-chunk and combined audio files; it is
            created if it does not exist.

    Returns:
        The combined audio file's path as a string.

    Raises:
        RuntimeError: If ``hf_tts_client`` is ``None``.
    """
    if hf_tts_client is None:
        raise RuntimeError(
            "Hugging Face TTS client is not configured (HF_TOKEN is missing)."
        )

    # The caller hands over a path that may not exist yet; without this the
    # first write_bytes() fails with FileNotFoundError.
    outdir.mkdir(parents=True, exist_ok=True)

    segments = []
    for i, chunk in enumerate(split_chunks(text)):
        audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
        path = outdir / f"part{i}.flac"
        path.write_bytes(audio_bytes)
        segments.append(AudioSegment.from_file(path, format="flac"))

    # Start from an empty segment so sum() concatenates audio, not integers.
    final = sum(segments, AudioSegment.empty())
    out = outdir / "podcast.flac"
    final.export(out, format="flac")
    return str(out)
92
+
93
# Main pipeline

def generate_podcast(
    gemini_key: str,
    pdf_file: gr.File,
    langs: List[str]
) -> List[Optional[Any]]:
    """Generate a podcast script (and audio, when TTS is available) per language.

    Args:
        gemini_key: API key passed to ``genai.configure``.
        pdf_file: Uploaded PDF; its ``.name`` is read as the file path.
        langs: Display names of the languages to generate (values of
            ``LANG_INFO[...]["name"]``).

    Returns:
        A flat list with three entries per language in ``LANG_INFO`` order:
        audio file path (or ``None``), the script as an HTML ``<pre>`` block,
        and the path of the plain-text script file. Languages that were not
        selected contribute ``None, None, None``.

    Raises:
        gr.Error: If the key, the PDF, or the language selection is missing.
    """
    import html  # stdlib; only needed to escape the script for the HTML output

    if not gemini_key:
        raise gr.Error("Enter Google AI Studio API Key.")
    if not pdf_file:
        raise gr.Error("Upload a PDF file.")
    if not langs:
        raise gr.Error("Select at least one language.")

    genai.configure(api_key=gemini_key)
    raw = extract_pdf_text(pdf_file.name)
    content = truncate_text(raw)

    # mkdtemp (not a context manager) because the returned paths must still
    # exist after this function returns.
    tmp = Path(tempfile.mkdtemp())
    # The model object does not depend on the language; build it once.
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    results: List[Optional[Any]] = []

    for code, info in LANG_INFO.items():
        if info["name"] not in langs:
            # Keep the output positions aligned with the interface components.
            results.extend([None, None, None])
            continue

        # Generate script
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
        resp = model.generate_content(prompt)
        script = resp.text.strip()

        # Save plain text
        script_path = tmp / f"script_{code}.txt"
        script_path.write_text(script, encoding="utf-8")

        # Render HTML version. The script is model output and may contain
        # '<', '>' or '&'; escape it so it is displayed verbatim instead of
        # being interpreted as markup.
        html_script = f"<pre>{html.escape(script)}</pre>"

        # Synthesize audio if available
        audio_path = None
        if hf_tts_client:
            lang_dir = tmp / code
            # The per-language directory must exist before audio is written.
            lang_dir.mkdir(parents=True, exist_ok=True)
            audio_path = synthesize(script, info["tts_model"], lang_dir)

        results.extend([audio_path, html_script, str(script_path)])
    return results
135
+
136
# Interface
# Inputs: API key, the lecture PDF, and the languages to generate.
inputs = [
    gr.Textbox(label="Google AI Studio API Key", type="password"),
    gr.File(label="Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=[info["name"] for info in LANG_INFO.values()],
        value=["English"],
        label="Languages",
    ),
]

# Outputs: three components per language, in LANG_INFO order, matching the
# flat list returned by generate_podcast.
outputs = []
for code, info in LANG_INFO.items():
    outputs.extend(
        [
            gr.Audio(label=f"{info['name']} Podcast", type="filepath"),
            gr.HTML(label=f"{info['name']} Script HTML"),
            gr.File(label=f"Download {info['name']} Script"),
        ]
    )

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script",
)

if __name__ == "__main__":
    iface.launch()