HaiderAUT committed on
Commit
617d576
·
verified ·
1 Parent(s): 4c19533

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -180
app.py CHANGED
@@ -1,243 +1,172 @@
1
  # =============================================================
2
- # Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 
 
3
  # =============================================================
4
- # • **Text generation** – Google Gemini API (via user-provided genai API Key)
5
- # • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
6
- # -----------------------------------------------------------------
7
 
8
  import os
9
  import re
10
  import tempfile
11
  import textwrap
12
  from pathlib import Path
13
- from typing import List, Dict, Optional, Any
14
 
15
  import gradio as gr
16
  from PyPDF2 import PdfReader
17
  from pydub import AudioSegment
18
  from pydub.exceptions import CouldntDecodeError
19
 
20
- # For Hugging Face TTS
21
  from huggingface_hub import InferenceClient
22
 
23
- # For Google Gemini
24
  try:
25
  import google.generativeai as genai
26
  except ImportError:
27
  raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
28
 
29
  # ------------------------------------------------------------------
30
- # Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
31
- # ------------------------------------------------------------------
32
- hf_tts_client: Optional[InferenceClient] = None
33
- hf_token = os.getenv("HF_TOKEN")
34
- if hf_token:
35
- hf_tts_client = InferenceClient(token=hf_token)
36
- else:
37
- print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
38
-
39
- # ------------------------------------------------------------------
40
- # Language metadata for Hugging Face MMS-TTS models
41
- # ------------------------------------------------------------------
42
- LANG_INFO: Dict[str, Dict[str, str]] = {
43
- "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
44
- "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
45
- "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
46
- "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
47
- "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
48
- }
49
- LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
50
-
51
- # ------------------------------------------------------------------
52
- # Prompt template for Gemini
53
  # ------------------------------------------------------------------
 
54
  PROMPT_TEMPLATE = textwrap.dedent(
55
  """
56
- You are producing a lively two-host educational podcast in {lang_name}.
57
- Summarize the following lecture content into a dialogue of **approximately 300 words**.
58
- Make it engaging: hosts ask questions, clarify ideas with analogies, and
59
- wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
60
 
61
  ### Lecture Content
62
  {content}
63
  """
64
  )
65
 
66
- # PDF helpers (unchanged) -------------------------------------------
67
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path``.

    Page texts are joined with newlines. A page whose ``extract_text()``
    result is falsy contributes an empty string.

    Raises:
        gr.Error: if opening or reading the PDF fails. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        reader = PdfReader(pdf_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Chain the cause so the traceback still shows the real failure.
        raise gr.Error(f"Failed to process PDF: {e}") from e
73
 
74
# Maximum number of whitespace-separated words kept by truncate_text().
TOKEN_LIMIT = 8000


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cap ``text`` at ``limit`` whitespace-separated words.

    Text at or under the limit is returned unchanged. Longer text triggers a
    ``gr.Warning`` and is rebuilt from its first ``limit`` words joined by
    single spaces.
    """
    tokens = text.split()
    word_count = len(tokens)
    if word_count <= limit:
        return text
    gr.Warning(f"Input text was truncated from {word_count} to {limit} words to fit LLM context window.")
    return " ".join(tokens[:limit])
81
 
82
  # ------------------------------------------------------------------
83
- # TTS helper using Hugging Face Inference API
84
  # ------------------------------------------------------------------
85
- CHUNK_CHAR_LIMIT_HF = 280
86
- def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
87
- sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
88
- sentences = [s.strip() for s in sentences_raw if s.strip()]
89
- chunks, current_chunk = [], ""
 
 
 
 
 
 
 
 
 
90
  for sent in sentences:
91
- if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
92
- chunks.append(current_chunk)
93
- current_chunk = sent
94
  else:
95
- current_chunk += (" " + sent) if current_chunk else sent
96
- if current_chunk:
97
- chunks.append(current_chunk)
98
- return [chunk for chunk in chunks if chunk.strip()]
99
-
100
def synthesize_speech_hf(
    text: str,
    hf_model_id: str,
    lang_tmpdir: Path,
    tts_client: InferenceClient
) -> Path:
    """Synthesize ``text`` chunk by chunk and export one combined FLAC file.

    Each chunk from ``_split_to_chunks_hf`` is sent to
    ``tts_client.text_to_speech``. The returned bytes are written to
    ``part_<index>.flac`` inside ``lang_tmpdir`` and decoded as FLAC. The
    decoded segments are concatenated in order and exported as
    ``podcast_audio.flac`` in the same directory.

    Returns:
        Path of the combined audio file.

    Raises:
        ValueError: if the text yields no chunks.
        RuntimeError: if a TTS request fails or a returned chunk cannot be
            decoded.
    """
    pieces = _split_to_chunks_hf(text)
    if not pieces:
        raise ValueError("Text resulted in no speakable chunks after splitting.")

    total = len(pieces)
    podcast = AudioSegment.empty()
    for number, piece in enumerate(pieces, start=1):
        gr.Info(f"Synthesizing audio for chunk {number}/{total} with HF TTS ({hf_model_id})...")
        try:
            raw_audio = tts_client.text_to_speech(piece, model=hf_model_id)
        except Exception as e:
            raise RuntimeError(f"HF TTS client error for chunk {number}: {e}") from e

        # File names keep the zero-based chunk index.
        piece_file = lang_tmpdir / f"part_{number - 1}.flac"
        piece_file.write_bytes(raw_audio)
        try:
            decoded = AudioSegment.from_file(piece_file, format="flac")
        except CouldntDecodeError as e:
            raise RuntimeError(f"Failed to decode audio chunk {number}: {e}") from e
        podcast = podcast + decoded

    destination = lang_tmpdir / "podcast_audio.flac"
    podcast.export(destination, format="flac")
    return destination
130
 
131
  # ------------------------------------------------------------------
132
- # Main pipeline function for Gradio
133
  # ------------------------------------------------------------------
134
def generate_podcast(
    gemini_api_key_from_ui: Optional[str],
    pdf_file_obj: Optional[gr.File],
    selected_lang_names: List[str]
) -> List[Optional[Any]]:
    """Generate a podcast script, and audio when TTS is available, per language.

    Args:
        gemini_api_key_from_ui: API key passed to ``genai.configure``.
        pdf_file_obj: Uploaded lecture PDF; its ``.name`` is used as the path.
        selected_lang_names: Display names of the requested languages; each
            must be a key of ``LANG_CODE_BY_NAME``.

    Returns:
        A flat list with three entries per language in ``LANG_INFO`` order:
        audio file path, Markdown script, script ``.txt`` path. Entries for
        languages that were not selected, or whose step failed, are ``None``.

    Raises:
        gr.Error: on missing inputs, Gemini configuration or generation
            failure, PDF extraction failure, or an empty lecture text.
    """
    if not gemini_api_key_from_ui:
        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
    if not pdf_file_obj:
        raise gr.Error("Please upload a PDF file.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")

    try:
        genai.configure(api_key=gemini_api_key_from_ui)
    except Exception as e:
        raise gr.Error(f"Failed to configure Gemini API: {e}") from e

    if not hf_tts_client:
        gr.Warning("HF TTS unavailable; only script will be generated.")

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results_data = {
        code: {"audio": None, "script_md": None, "script_file": None}
        for code in LANG_INFO
    }

    lecture_raw = extract_pdf_text(pdf_file_obj.name)
    lecture_text = truncate_text(lecture_raw)
    if not lecture_text.strip():
        raise gr.Error("Extracted PDF text is empty.")

    # The returned paths are read by the caller after this function returns,
    # so the directory must outlive the call. tempfile.TemporaryDirectory
    # would delete the files on exit from its `with` block. The trade-off is
    # that this directory is not cleaned up here.
    tmpdir_base = Path(tempfile.mkdtemp(prefix="podcast_"))

    gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

    for code in selected_codes:
        info = LANG_INFO[code]
        lang_name = info["name"]
        hf_tts_model_id = info["tts_model"]

        lang_tmpdir = tmpdir_base / code
        lang_tmpdir.mkdir(parents=True, exist_ok=True)

        # 1) Generate the script via Gemini.
        prompt = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
        try:
            resp = gemini_model.generate_content(prompt)
            dialogue = resp.text or ""
        except Exception as e:
            raise gr.Error(f"Gemini error for {lang_name}: {e}") from e

        if not dialogue:
            continue

        # Keep the Markdown script and also write it out as a .txt download.
        results_data[code]["script_md"] = dialogue
        script_path = lang_tmpdir / f"podcast_script_{code}.txt"
        script_path.write_text(dialogue, encoding="utf-8")
        results_data[code]["script_file"] = str(script_path)

        # 2) Synthesize audio via HF TTS (best effort: the script is still
        # returned when audio fails).
        if hf_tts_client:
            try:
                audio_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
                results_data[code]["audio"] = str(audio_path)
            except Exception as e:
                # A gr.Error that is constructed but not raised has no
                # effect; surface the failure as a warning and continue.
                gr.Warning(f"TTS error for {lang_name}: {e}")

    # Assemble outputs in the order: Audio, Markdown, File for each language.
    final_outputs: List[Optional[Any]] = []
    for code in LANG_INFO:
        out = results_data[code]
        final_outputs.extend([out["audio"], out["script_md"], out["script_file"]])

    return final_outputs
209
 
210
  # ------------------------------------------------------------------
211
- # Gradio Interface Setup
212
  # ------------------------------------------------------------------
213
# Display names of all supported languages, in LANG_INFO order.
language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO]

# Inputs, in the order generate_podcast() receives them.
inputs = [
    gr.Textbox(type="password", label="Google Gemini API Key", placeholder="Paste your key here"),
    gr.File(file_types=[".pdf"], label="Upload Lecture PDF"),
    gr.CheckboxGroup(label="Select language(s)", choices=language_names_ordered, value=["English"]),
]

# Three output components per language (audio, Markdown script, script file),
# in the same order generate_podcast() lays out its return list.
outputs = []
for code in LANG_INFO:
    lang_name = LANG_INFO[code]["name"]
    outputs.extend(
        [
            gr.Audio(label=f"{lang_name} Podcast", type="filepath"),
            gr.Markdown(label=f"{lang_name} Script"),
            gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"),
        ]
    )

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture Podcast & Script",
    description=(
        "Enter your Gemini API Key, upload a lecture PDF, choose language(s), "
        "and get a two-host podcast (audio) plus the Markdown script & downloadable text."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    # Startup reminder only; the app still launches without the token.
    if not os.getenv("HF_TOKEN"):
        print("Reminder: set HF_TOKEN in Secrets for TTS to work.")
    iface.launch()
 
1
  # =============================================================
2
+ # Lecture → Podcast & Script Generator (English Only)
3
+ # • Text: Google Gemini API (via UI-provided key)
4
+ # • Audio: Hugging Face InferenceClient.text_to_speech (public MMS-TTS for English)
5
  # =============================================================
 
 
 
6
 
7
  import os
8
  import re
9
  import tempfile
10
  import textwrap
11
  from pathlib import Path
12
+ from typing import List, Optional, Any
13
 
14
  import gradio as gr
15
  from PyPDF2 import PdfReader
16
  from pydub import AudioSegment
17
  from pydub.exceptions import CouldntDecodeError
18
 
19
+ # Hugging Face TTS client (anonymous/public access)
20
  from huggingface_hub import InferenceClient
21
 
22
+ # Google Gemini SDK
23
  try:
24
  import google.generativeai as genai
25
  except ImportError:
26
  raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
27
 
28
  # ------------------------------------------------------------------
29
+ # Globals & templates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # ------------------------------------------------------------------
31
+ # Gemini prompt for ~300-word two-host dialogue in English
32
  PROMPT_TEMPLATE = textwrap.dedent(
33
  """
34
+ You are producing a lively two-host educational podcast in English.
35
+ Summarize the following lecture content into a dialogue of approximately 300 words.
36
+ Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
37
+ Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
38
 
39
  ### Lecture Content
40
  {content}
41
  """
42
  )
43
 
44
+ # TTS model ID for English MMS-TTS
45
+ HF_TTS_MODEL = "facebook/mms-tts-eng"
46
+ # Safe chunk size for HF text-to-speech
47
+ CHUNK_CHAR_LIMIT = 280
 
 
 
48
 
49
+ # Initialize HF TTS client (no token required for public models)
50
+ tts_client = InferenceClient()
 
 
 
 
 
51
 
52
  # ------------------------------------------------------------------
53
+ # Helpers
54
  # ------------------------------------------------------------------
55
def extract_pdf_text(pdf_path: str) -> str:
    """Read the PDF at ``pdf_path`` and return its page texts joined by newlines.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
59
+
60
def truncate_text(text: str, max_words: int = 8000) -> str:
    """Limit ``text`` to at most ``max_words`` whitespace-separated words.

    Text that already fits is returned unchanged, so line and paragraph
    breaks are preserved. Only when truncation is needed is the text rebuilt
    from its first ``max_words`` words joined by single spaces.

    Args:
        text: Input text.
        max_words: Maximum number of words to keep. Negative values are
            treated as zero.

    Returns:
        The original text, or its truncated form.
    """
    words = text.split()
    if len(words) <= max_words:
        return text
    # Clamp so a negative limit yields nothing instead of slicing from the end.
    return " ".join(words[:max(max_words, 0)])
64
+
65
def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    """Split ``text`` into chunks of at most ``limit`` characters.

    The text is split after sentence-ending punctuation (``.``, ``!``, ``?``)
    followed by whitespace. Consecutive sentences are packed into one chunk,
    separated by a single space, while the chunk stays within ``limit``. A
    sentence longer than ``limit`` is broken at word boundaries, and a single
    word longer than ``limit`` is sliced, so no chunk exceeds ``limit``.

    Args:
        text: Text to split. Empty or whitespace-only text yields ``[]``.
        limit: Maximum chunk length in characters; must be at least 1.

    Returns:
        The non-empty chunks, in reading order.

    Raises:
        ValueError: if ``limit`` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive number of characters")

    # Phase 1: produce pieces that individually fit within the limit.
    pieces: List[str] = []
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= limit:
            pieces.append(sentence)
            continue
        # Overlong sentence: fall back to words, slicing any word that is
        # still too long on its own.
        for word in sentence.split():
            pieces.extend(word[i:i + limit] for i in range(0, len(word), limit))

    # Phase 2: greedily pack pieces into chunks (+1 accounts for the space).
    chunks: List[str] = []
    current = ""
    for piece in pieces:
        if current and len(current) + 1 + len(piece) > limit:
            chunks.append(current)
            current = piece
        else:
            current = f"{current} {piece}" if current else piece
    if current:
        chunks.append(current)
    return chunks
78
+
79
def synthesize_speech(text: str, model_id: str, out_dir: Path) -> Path:
    """Synthesize ``text`` chunk by chunk and export one combined FLAC file.

    Each chunk from ``split_to_chunks`` is sent to the module-level
    ``tts_client``. The returned bytes are written to ``seg_<index>.flac`` in
    ``out_dir`` and decoded as FLAC. The decoded segments are concatenated in
    order and exported as ``podcast_audio.flac`` in ``out_dir``.

    Args:
        text: Script to speak.
        model_id: Model identifier passed to ``text_to_speech``.
        out_dir: Existing directory for the segment files and the result.

    Returns:
        Path of the combined audio file.

    Raises:
        ValueError: if ``text`` yields no chunks.
        RuntimeError: if a TTS request fails or a segment cannot be decoded.
            The underlying exception is attached as ``__cause__``.
    """
    chunks = split_to_chunks(text)
    if not chunks:
        raise ValueError("No text to synthesize.")

    segments = []
    for i, chunk in enumerate(chunks):
        try:
            audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise RuntimeError(f"TTS failed on chunk {i+1}: {e}") from e

        part_path = out_dir / f"seg_{i}.flac"
        part_path.write_bytes(audio_bytes)
        try:
            segments.append(AudioSegment.from_file(part_path, format="flac"))
        except CouldntDecodeError as e:
            raise RuntimeError(f"Could not decode segment {i+1}: {e}") from e

    # Concatenate the segments in chunk order.
    final = sum(segments, AudioSegment.empty())
    out_path = out_dir / "podcast_audio.flac"
    final.export(out_path, format="flac")
    return out_path
102
 
103
  # ------------------------------------------------------------------
104
+ # Main pipeline
105
  # ------------------------------------------------------------------
106
def generate_podcast(
    gemini_api_key: Optional[str],
    lecture_pdf: Optional[gr.File]
) -> List[Optional[Any]]:
    """Turn a lecture PDF into an English podcast script and audio file.

    Steps: validate inputs, configure Gemini, extract and truncate the PDF
    text, generate the script from ``PROMPT_TEMPLATE``, save the script as a
    text file, and synthesize audio with ``HF_TTS_MODEL``.

    Args:
        gemini_api_key: API key passed to ``genai.configure``.
        lecture_pdf: Uploaded PDF. Either an object whose ``.name`` is the
            file path, or the path itself.

    Returns:
        ``[audio_path, script_markdown, script_txt_path]``.

    Raises:
        gr.Error: on missing inputs, unreadable or empty PDF, Gemini failure,
            an empty generated script, or a speech-synthesis failure.
    """
    # Validate inputs.
    if not gemini_api_key:
        raise gr.Error("Enter your Google AI Studio API Key.")
    if not lecture_pdf:
        raise gr.Error("Upload a lecture PDF file.")

    # Configure Gemini.
    genai.configure(api_key=gemini_api_key)

    # Extract and truncate the lecture text. Accept both a file-like object
    # carrying a `.name` path and a plain path string.
    pdf_path = getattr(lecture_pdf, "name", lecture_pdf)
    try:
        raw = extract_pdf_text(pdf_path)
    except Exception as e:
        raise gr.Error(f"Failed to read lecture PDF: {e}") from e
    content = truncate_text(raw)
    if not content.strip():
        raise gr.Error("Lecture PDF contained no extractable text.")

    # Initialize the Gemini model.
    try:
        gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")
    except Exception as e:
        raise gr.Error(f"Gemini init failed: {e}") from e

    # Generate the script.
    prompt = PROMPT_TEMPLATE.format(content=content)
    try:
        resp = gemini_model.generate_content(prompt)
        script = resp.text or ""
    except Exception as e:
        raise gr.Error(f"Gemini generation error: {e}") from e
    if not script.strip():
        raise gr.Error("Gemini returned an empty script.")

    # The returned paths are read by the caller after this function returns,
    # so the directory must outlive the call. tempfile.TemporaryDirectory
    # would delete the files on exit from its `with` block. The trade-off is
    # that this directory is not cleaned up here.
    tmp = Path(tempfile.mkdtemp(prefix="podcast_"))

    # Save the script file.
    script_path = tmp / "podcast_script.txt"
    script_path.write_text(script, encoding="utf-8")

    # Synthesize audio.
    try:
        audio_path = synthesize_speech(script, HF_TTS_MODEL, tmp)
    except Exception as e:
        raise gr.Error(f"Speech synthesis error: {e}") from e

    # Return [audio, markdown script, txt file].
    return [str(audio_path), script, str(script_path)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  # ------------------------------------------------------------------
149
+ # Gradio Interface
150
  # ------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
# Input components, in the order generate_podcast() receives them.
_ui_inputs = [
    gr.Textbox(type="password", label="Google Gemini API Key", placeholder="Paste your key"),
    gr.File(file_types=[".pdf"], label="Upload Lecture PDF"),
]

# Output components, matching generate_podcast()'s [audio, script, file] list.
_ui_outputs = [
    gr.Audio(type="filepath", label="English Podcast"),
    gr.Markdown(label="English Script"),
    gr.File(type="filepath", label="Download English Script (.txt)"),
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=_ui_inputs,
    outputs=_ui_outputs,
    title="Lecture → English Podcast & Script",
    description=(
        "Enter your Gemini API Key and upload a lecture PDF. "
        "Generates a two-host podcast audio and a Markdown script in English "
        "using Google Gemini for text and Hugging Face MMS-TTS for audio."
    ),
    allow_flagging="never",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()