HaiderAUT committed
Commit c565171 · verified · 1 Parent(s): 313d24a

Update app.py

Files changed (1)
  1. app.py +189 -77
app.py CHANGED
@@ -1,10 +1,10 @@
  # =============================================================
- # Hugging Face Space – Lecture Podcast Generator (User-selectable Languages)
  # =============================================================
- # • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5-Coder-32B-Instruct)
- # • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk-safe
- #   (MMS-TTS for en/bn/ur/ne, mms-tts-zho for zh). Long texts are split
- #   into ≤280-char chunks to stay within HF endpoint limits.
  # -----------------------------------------------------------------

  import os
@@ -12,30 +12,31 @@ import re
  import tempfile
  import textwrap
  from pathlib import Path
- from typing import List, Dict, Tuple, Optional

  import gradio as gr
- from huggingface_hub import InferenceClient
- from PyPDF2 import PdfReader
- from smolagents import HfApiModel

  # ------------------------------------------------------------------
  # LLM setup – remote Qwen model via SmolAgents
  # ------------------------------------------------------------------
  llm = HfApiModel(
      model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
-     max_tokens=2048,
      temperature=0.5,
  )

  # ------------------------------------------------------------------
- # Hugging Face Inference API client (uses HF_TOKEN secret if provided)
  # ------------------------------------------------------------------
  client = InferenceClient(token=os.getenv("HF_TOKEN", None))

  # ------------------------------------------------------------------
  # Language metadata and corresponding open TTS model IDs
- # (MMS-TTS supports 100+ langs but per-lang repos have shorter ids)
  # ------------------------------------------------------------------
  LANG_INFO: Dict[str, Dict[str, str]] = {
      "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
@@ -44,19 +45,20 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
      "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
      "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
  }
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

  # ------------------------------------------------------------------
- # Prompt template (≈300 words to keep TTS happy)
  # ------------------------------------------------------------------
  PROMPT_TEMPLATE = textwrap.dedent(
      """
-     You are producing a lively two-host educational podcast in {lang_name}.
-     Summarize the following lecture content into a dialogue of **≈300 words**.
      Make it engaging: hosts ask questions, clarify ideas with analogies, and
      wrap up with a concise recap. Preserve technical accuracy.
-
-     ### Lecture Content
      {content}
      """
  )
@@ -64,120 +66,230 @@ PROMPT_TEMPLATE = textwrap.dedent(
  # PDF helpers -------------------------------------------------------

  def extract_pdf_text(pdf_path: str) -> str:
-     reader = PdfReader(pdf_path)
-     return "\n".join(page.extract_text() or "" for page in reader.pages)

- TOKEN_LIMIT = 4000  # approx words before hitting context limit


  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
      words = text.split()
-     return " ".join(words[:limit])

  # ------------------------------------------------------------------
- # TTS helper – chunk long text safely (HF endpoint ~30 s / 200-300 chars)
  # ------------------------------------------------------------------
- CHUNK_CHAR_LIMIT = 280  # safe margin for MMS-TTS

  def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
-     # split on sentence boundaries while respecting limit
-     sentences = re.split(r"(?<=[.!?])\s+", text.strip())
-     chunks, current = [], ""
      for sent in sentences:
-         if len(current) + len(sent) + 1 > limit:
-             if current:
-                 chunks.append(current.strip())
-             current = sent
          else:
-             current += " " + sent if current else sent
-     if current:
-         chunks.append(current.strip())
-     return chunks


- def synthesize_speech(text: str, model_id: str, tmpdir: Path) -> Path:
-     """Stream chunks through HF TTS and concatenate FLAC bytes."""
      chunks = _split_to_chunks(text)
-     flac_paths: List[Path] = []
      for idx, chunk in enumerate(chunks):
          try:
              audio_bytes = client.text_to_speech(chunk, model=model_id)
          except HubHTTPError as e:
-             raise RuntimeError(f"TTS request failed: {e}") from e
-         part_path = tmpdir / f"part_{idx}.flac"
          part_path.write_bytes(audio_bytes)
-         flac_paths.append(part_path)
-
-     # simple concat of FLAC files (works because each part includes header)
-     # better: convert to raw & merge, but HF players handle sequential FLACs
-     final_path = tmpdir / "podcast.flac"
-     with open(final_path, "wb") as fout:
-         for p in flac_paths:
-             fout.write(p.read_bytes())
      return final_path

  # ------------------------------------------------------------------
- # Main pipeline
  # ------------------------------------------------------------------

- def generate_podcast(pdf: gr.File, selected_lang_names: List[str]):
      if not selected_lang_names:
-         raise gr.Error("Please select at least one language.")

      selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
-     results: List[Optional[Tuple[str, None]]] = []

-     with tempfile.TemporaryDirectory() as td:
-         tmpdir = Path(td)
-         lecture_raw = extract_pdf_text(pdf.name)
-         lecture_text = truncate_text(lecture_raw)

-         for code, info in LANG_INFO.items():
-             if code not in selected_codes:
-                 results.append(None)
-                 continue

-             # 1️⃣ Generate dialogue
-             prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
-             dialogue: str = llm(prompt)

-             # 2️⃣ Speech synthesis (chunked)
-             tts_path = synthesize_speech(dialogue, info["tts_model"], tmpdir / code)

-             results.append((str(tts_path), None))

-     return results

  # ------------------------------------------------------------------
- # Gradio Interface
  # ------------------------------------------------------------------
- language_choices = [info["name"] for info in LANG_INFO.values()]

  inputs = [
      gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
      gr.CheckboxGroup(
-         choices=language_choices,
-         value=["English"],
          label="Select podcast language(s) to generate",
      ),
  ]

  outputs = [
-     gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
  ]

  iface = gr.Interface(
      fn=generate_podcast,
      inputs=inputs,
      outputs=outputs,
-     title="Lecture Podcast Generator (Choose Languages)",
      description=(
-         "Upload a lecture PDF, choose language(s), and receive a two-host "
-         "audio podcast. Dialogue comes from Qwen-32B; speech is streamed "
-         "via the HF Inference API using open MMS-TTS models. Long texts are "
-         "automatically chunked to fit API limits."
      ),
  )

  if __name__ == "__main__":
-     iface.launch()

@@ -1,10 +1,10 @@
  # =============================================================
+ # Hugging Face Space – Lecture Podcast Generator (User-selectable Languages)
  # =============================================================
+ # • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5-Coder-32B-Instruct)
+ # • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk-safe
+ #   (MMS-TTS for en/bn/ur/ne, mms-tts-zho for zh). Long texts are split
+ #   into ≤280-char chunks to stay within HF endpoint limits.
  # -----------------------------------------------------------------

  import os
@@ -12,30 +12,31 @@ import re
  import tempfile
  import textwrap
  from pathlib import Path
+ from typing import List, Dict, Optional

  import gradio as gr
+ from huggingface_hub import InferenceClient
+ from huggingface_hub.utils import HfHubHTTPError  # the HTTP error class lives in huggingface_hub.utils
+ from PyPDF2 import PdfReader        # For PDF processing
+ from smolagents import HfApiModel   # For LLM interaction
+ from pydub import AudioSegment      # Added for robust audio concatenation
+ from pydub.exceptions import CouldntDecodeError  # Specific pydub error

  # ------------------------------------------------------------------
  # LLM setup – remote Qwen model via SmolAgents
  # ------------------------------------------------------------------
  llm = HfApiModel(
      model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
+     max_tokens=2048,  # Max tokens for the generated output dialogue
      temperature=0.5,
  )

  # ------------------------------------------------------------------
+ # Hugging Face Inference API client (uses HF_TOKEN secret if provided)
  # ------------------------------------------------------------------
  client = InferenceClient(token=os.getenv("HF_TOKEN", None))

  # ------------------------------------------------------------------
  # Language metadata and corresponding open TTS model IDs
  # ------------------------------------------------------------------
  LANG_INFO: Dict[str, Dict[str, str]] = {
      "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
@@ -44,19 +45,20 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
      "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
      "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
  }
+ # For reverse lookup: language name to language code
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

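Adding a language is one more LANG_INFO entry pointing at the matching MMS-TTS checkpoint, e.g. (hypothetical entry, not in this commit; verify the repo id exists on the Hub):

    "hi": {"name": "Hindi", "tts_model": "facebook/mms-tts-hin"},
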
  # ------------------------------------------------------------------
+ # Prompt template (target ~300 words for LLM output)
  # ------------------------------------------------------------------
  PROMPT_TEMPLATE = textwrap.dedent(
      """
+     You are producing a lively two-host educational podcast in {lang_name}.
+     Summarize the following lecture content into a dialogue of **approximately 300 words**.
      Make it engaging: hosts ask questions, clarify ideas with analogies, and
      wrap up with a concise recap. Preserve technical accuracy.
+
+     ### Lecture Content
      {content}
      """
  )
@@ -64,120 +66,230 @@ PROMPT_TEMPLATE = textwrap.dedent(
  # PDF helpers -------------------------------------------------------

  def extract_pdf_text(pdf_path: str) -> str:
+     try:
+         reader = PdfReader(pdf_path)
+         return "\n".join(page.extract_text() or "" for page in reader.pages)
+     except Exception as e:
+         # Raise a Gradio error to display it in the UI
+         raise gr.Error(f"Failed to process PDF: {e}")

+ # Increased from 4000. Qwen models have large context windows; this limit is in
+ # input *words*, while the actual model limit is in tokens
+ # (Qwen2.5-Coder-32B-Instruct: 65,536 tokens). 8000 words is still conservative,
+ # and the prompt itself also consumes tokens.
+ TOKEN_LIMIT = 8000


  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
      words = text.split()
+     if len(words) > limit:
+         gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit the LLM context window.")
+         return " ".join(words[:limit])
+     return text

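TOKEN_LIMIT counts words, not tokens. A token-accurate variant is a small extension; a minimal sketch, assuming the `transformers` package and a locally downloadable Qwen tokenizer, neither of which this commit uses:

    from transformers import AutoTokenizer

    def truncate_by_tokens(text: str, max_tokens: int = 60_000) -> str:
        # Tokenize with the same vocabulary the remote model uses, then cut.
        tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
        ids = tok.encode(text)
        return tok.decode(ids[:max_tokens]) if len(ids) > max_tokens else text
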
  # ------------------------------------------------------------------
+ # TTS helper – chunk long text safely (HF endpoint limit ~30 s / 200-300 chars)
  # ------------------------------------------------------------------
+ CHUNK_CHAR_LIMIT = 280  # Safe margin for the MMS-TTS character limit per request

  def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
+     # Split on sentence boundaries (.!?) while respecting the character limit per chunk.
+     sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
+     sentences = [s.strip() for s in sentences_raw if s.strip()]  # Clean and filter empty sentences
+
+     if not sentences:
+         return []
+
+     chunks, current_chunk = [], ""
      for sent in sentences:
+         # If current_chunk is empty, the first sentence always starts a new chunk.
+         # Otherwise, check if adding the new sentence (plus a space) exceeds the limit.
+         if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
+             chunks.append(current_chunk)  # Finalize the current chunk
+             current_chunk = sent          # Start a new chunk with the current sentence
          else:
+             # Append sentence to current_chunk (with a space if current_chunk is not empty)
+             current_chunk += (" " + sent) if current_chunk else sent
+
+     if current_chunk:  # Add any remaining part as the last chunk
+         chunks.append(current_chunk)
+
+     return [chunk for chunk in chunks if chunk.strip()]  # Ensure no empty chunks are returned

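The splitter can be sanity-checked with an artificially small limit; for example, this should print something like the comment below (exact grouping depends on sentence lengths):

    demo = "First sentence. Second one is longer! Third? Fourth."
    print(_split_to_chunks(demo, limit=25))
    # ['First sentence.', 'Second one is longer!', 'Third? Fourth.']
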

+ def synthesize_speech(text: str, model_id: str, lang_tmpdir: Path) -> Path:
+     """Split text into chunks, synthesize speech for each, and concatenate them with pydub."""
      chunks = _split_to_chunks(text)
+     if not chunks:
+         raise ValueError("Text resulted in no speakable chunks after splitting.")
+
+     audio_segments: List[AudioSegment] = []
      for idx, chunk in enumerate(chunks):
+         gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)}...")
          try:
              audio_bytes = client.text_to_speech(chunk, model=model_id)
+         except HfHubHTTPError as e:
+             error_message = f"TTS request failed for chunk {idx + 1}/{len(chunks)} ('{chunk[:30]}...'): {e}"
+             if "Input validation error: `inputs` must be non-empty" in str(e) and not chunk.strip():
+                 gr.Warning(f"Skipping an apparently empty chunk that was not filtered out: chunk {idx + 1}")
+                 continue
+             raise RuntimeError(error_message) from e
+
+         part_path = lang_tmpdir / f"part_{idx}.flac"  # Assuming the TTS endpoint returns FLAC
          part_path.write_bytes(audio_bytes)
+
+         try:
+             # Load the audio part with pydub. The filename assumes FLAC; if the
+             # endpoint returns WAV instead, use format="wav" here.
+             segment = AudioSegment.from_file(part_path, format="flac")
+             audio_segments.append(segment)
+         except CouldntDecodeError as e:
+             # Happens if the audio data is corrupted, empty, or not actually FLAC.
+             raise RuntimeError(
+                 f"Failed to decode audio chunk {idx + 1} from {part_path}. "
+                 f"Audio data might be corrupted, empty, or not in FLAC format. TTS error: {e}"
+             ) from e
+
+     if not audio_segments:
+         raise RuntimeError("No audio segments were successfully synthesized or decoded.")
+
+     # Concatenate all audio segments in order
+     combined_audio = sum(audio_segments, AudioSegment.empty())
+
+     final_path = lang_tmpdir / "podcast.flac"
+     combined_audio.export(final_path, format="flac")
+
      return final_path

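If the concatenated chunks sound abrupt at the seams, pydub can pad a short silence between segments; a sketch (the 150 ms gap is an arbitrary choice, not something this commit does):

    from typing import List
    from pydub import AudioSegment

    def join_with_gaps(parts: List[AudioSegment], gap_ms: int = 150) -> AudioSegment:
        gap = AudioSegment.silent(duration=gap_ms)  # gap_ms of silence between chunks
        combined = AudioSegment.empty()
        for part in parts:
            combined += part + gap
        return combined
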
  # ------------------------------------------------------------------
+ # Main pipeline function for Gradio
  # ------------------------------------------------------------------

+ def generate_podcast(pdf_file_obj: Optional[gr.File], selected_lang_names: List[str]):
+     if not pdf_file_obj:
+         raise gr.Error("Please upload a PDF file.")
      if not selected_lang_names:
+         raise gr.Error("Please select at least one language for the podcast.")

+     # Map selected language names back to their codes
      selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
+
+     # Results map: keys are language codes, values will be audio file paths or None.
+     # This keeps results aligned with the output components while only the
+     # selected languages are actually processed.
+     results_map: Dict[str, Optional[str]] = {code: None for code in LANG_INFO.keys()}

+     try:
+         # NOTE: mkdtemp() rather than TemporaryDirectory(); the audio files must
+         # outlive this function so Gradio can read them when building the response.
+         tmpdir_base = Path(tempfile.mkdtemp())

+         gr.Info("Extracting text from PDF...")
+         lecture_raw = extract_pdf_text(pdf_file_obj.name)  # .name is the path to the uploaded temp file
+         lecture_text = truncate_text(lecture_raw)

+         if not lecture_text.strip():
+             raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")

+         for code in selected_codes:  # Iterate only over user-selected languages
+             info = LANG_INFO[code]
+             lang_name = info["name"]
+             tts_model = info["tts_model"]
+
+             gr.Info(f"Processing for {lang_name}...")
+
+             # Language-specific subdirectory inside the base temporary directory
+             lang_tmpdir = tmpdir_base / code
+             lang_tmpdir.mkdir(parents=True, exist_ok=True)
+
+             # 1️⃣ Generate dialogue using the LLM
+             gr.Info(f"Generating dialogue for {lang_name}...")
+             prompt = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
+             try:
+                 dialogue: str = llm(prompt)
+                 if not dialogue or not dialogue.strip():
+                     gr.Warning(f"LLM returned empty dialogue for {lang_name}. Skipping TTS for this language.")
+                     results_map[code] = None
+                     continue  # Move to the next selected language
+             except Exception as e:
+                 # gr.Warning (not a raised gr.Error) so the remaining languages still run
+                 gr.Warning(f"Error generating dialogue for {lang_name}: {e}")
+                 results_map[code] = None
+                 continue

+             # 2️⃣ Synthesize speech from the dialogue (chunked and concatenated)
+             gr.Info(f"Synthesizing speech for {lang_name}...")
+             try:
+                 tts_path = synthesize_speech(dialogue, tts_model, lang_tmpdir)
+                 results_map[code] = str(tts_path)  # Store the audio file path for this language
+             except ValueError as e:    # From _split_to_chunks/synthesize_speech if no chunks
+                 gr.Warning(f"Could not synthesize speech for {lang_name}: {e}")
+                 results_map[code] = None
+             except RuntimeError as e:  # From synthesize_speech (TTS/pydub errors)
+                 gr.Warning(f"Error synthesizing speech for {lang_name}: {e}")
+                 results_map[code] = None
+             except Exception as e:     # Any other unexpected error during synthesis
+                 gr.Warning(f"Unexpected error during speech synthesis for {lang_name}: {e}")
+                 results_map[code] = None
+
+         # Convert the results map to a list ordered like LANG_INFO, so it matches
+         # the order of the Gradio output components.
+         final_results = [results_map[lang_code] for lang_code in LANG_INFO.keys()]
+         gr.Info("Podcast generation complete!")
+         return final_results

+     except gr.Error:  # Re-raise Gradio-specific errors so they display in the UI
+         raise
+     except Exception as e:  # Catch other unexpected errors during the process
+         # Log the full traceback for debugging (e.g., to the server logs)
+         import traceback
+         print("An unexpected error occurred in generate_podcast:")
+         traceback.print_exc()
+         # Show a generic error message in the UI
+         raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...")

  # ------------------------------------------------------------------
+ # Gradio Interface Setup
  # ------------------------------------------------------------------
+ # Keep choices and outputs in a consistent order derived from LANG_INFO
+ language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO.keys()]

  inputs = [
      gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
      gr.CheckboxGroup(
+         choices=language_names_ordered,
+         value=["English"],  # Default language selection
          label="Select podcast language(s) to generate",
      ),
  ]

+ # One gr.Audio output component per language, in the defined order
  outputs = [
+     gr.Audio(label=f"{LANG_INFO[code]['name']} Podcast", type="filepath")
+     for code in LANG_INFO.keys()
  ]

  iface = gr.Interface(
      fn=generate_podcast,
      inputs=inputs,
      outputs=outputs,
+     title="Lecture Podcast Generator (Multi-Language)",
      description=(
+         "Upload a lecture PDF, choose language(s), and receive a two-host "
+         "audio podcast for each selected language. Dialogue is generated by Qwen-32B, "
+         "and speech is synthesized using open MMS-TTS models via the HF Inference API. "
+         "Long texts are automatically chunked, and audio parts are robustly combined."
      ),
+     allow_flagging="never",  # Set to "auto" or "manual" to enable flagging
+     # Provide examples if sample PDFs are accessible to the Gradio app:
+     # examples=[
+     #     ["path/to/sample_lecture.pdf", ["English", "Chinese"]],
+     # ],
  )

  if __name__ == "__main__":
+     # For local testing, ensure ffmpeg is installed and on PATH: pydub relies on it
+     # for FLAC decoding and encoding. The HF Inference API for MMS-TTS should return
+     # FLAC directly if the specified model (e.g., facebook/mms-tts-eng) outputs it.
+     iface.launch()
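
Since pydub shells out to ffmpeg for FLAC decoding and encoding, the Space needs it installed at the system level. A typical setup (assumed here, not part of this commit):

    packages.txt:       ffmpeg
    requirements.txt:   gradio
                        huggingface_hub
                        PyPDF2
                        smolagents
                        pydub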