HaiderAUT committed on
Commit
4c19533
·
verified ·
1 Parent(s): 764a881

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -96
app.py CHANGED
@@ -1,7 +1,10 @@
1
  # =============================================================
2
- # Lecture → Podcast & Script Generator (Gemini + HF TTS)
3
- # Modified: Script outputs rendered as HTML
4
  # =============================================================
 
 
 
 
5
  import os
6
  import re
7
  import tempfile
@@ -14,19 +17,28 @@ from PyPDF2 import PdfReader
14
  from pydub import AudioSegment
15
  from pydub.exceptions import CouldntDecodeError
16
 
17
- # Hugging Face TTS
18
  from huggingface_hub import InferenceClient
19
 
20
- # Google Gemini
21
- import google.generativeai as genai
 
 
 
22
 
23
  # ------------------------------------------------------------------
24
- # HF TTS client
25
  # ------------------------------------------------------------------
 
26
  hf_token = os.getenv("HF_TOKEN")
27
- hf_tts_client: Optional[InferenceClient] = InferenceClient(token=hf_token) if hf_token else None
 
 
 
28
 
29
- # Language metadata
 
 
30
  LANG_INFO: Dict[str, Dict[str, str]] = {
31
  "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
32
  "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
@@ -36,122 +48,196 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
36
  }
37
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
38
 
39
- # Prompt template
 
 
40
  PROMPT_TEMPLATE = textwrap.dedent(
41
  """
42
  You are producing a lively two-host educational podcast in {lang_name}.
43
  Summarize the following lecture content into a dialogue of **approximately 300 words**.
44
  Make it engaging: hosts ask questions, clarify ideas with analogies, and
45
- wrap up with a concise recap. Preserve technical accuracy.
46
 
47
  ### Lecture Content
48
  {content}
49
  """
50
  )
51
 
52
- # PDF extraction
53
- TOKEN_LIMIT = 8000
54
-
55
- def extract_pdf_text(path: str) -> str:
56
- reader = PdfReader(path)
57
- return "\n".join(p.extract_text() or "" for p in reader.pages)
 
58
 
 
59
  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
60
  words = text.split()
61
- return " ".join(words[:limit]) if len(words) > limit else text
62
-
63
- # TTS chunking
64
- CHUNK_CHAR_LIMIT = 280
65
-
66
- def split_chunks(text: str) -> List[str]:
67
- sentences = re.split(r"(?<=[.!?])\s+", text.strip())
68
- chunks, curr = [], ""
69
- for s in sentences:
70
- if curr and len(curr) + len(s) + 1 > CHUNK_CHAR_LIMIT:
71
- chunks.append(curr)
72
- curr = s
 
 
 
 
 
73
  else:
74
- curr = f"{curr} {s}" if curr else s
75
- if curr: chunks.append(curr)
76
- return chunks
77
-
78
- # Synthesize speech
79
-
80
- def synthesize(text: str, model_id: str, outdir: Path) -> str:
81
- segments = []
82
- for i, chunk in enumerate(split_chunks(text)):
83
- audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
84
- path = outdir / f"part{i}.flac"
85
- path.write_bytes(audio_bytes)
86
- seg = AudioSegment.from_file(path, format="flac")
87
- segments.append(seg)
88
- final = sum(segments, AudioSegment.empty())
89
- out = outdir / "podcast.flac"
90
- final.export(out, format="flac")
91
- return str(out)
92
-
93
- # Main pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
 
 
 
95
  def generate_podcast(
96
- gemini_key: str,
97
- pdf_file: gr.File,
98
- langs: List[str]
99
  ) -> List[Optional[Any]]:
100
- if not gemini_key:
101
- raise gr.Error("Enter Google AI Studio API Key.")
102
- if not pdf_file:
103
- raise gr.Error("Upload a PDF file.")
104
- if not langs:
105
- raise gr.Error("Select at least one language.")
106
-
107
- genai.configure(api_key=gemini_key)
108
- raw = extract_pdf_text(pdf_file.name)
109
- content = truncate_text(raw)
110
-
111
- tmp = Path(tempfile.mkdtemp())
112
- results = []
113
- data = {}
114
-
115
- for code, info in LANG_INFO.items():
116
- if info["name"] not in langs:
117
- results.extend([None, None, None])
118
- continue
119
- # Generate script
120
- prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
121
- model = genai.GenerativeModel('gemini-1.5-flash-latest')
122
- resp = model.generate_content(prompt)
123
- script = resp.text.strip()
124
- # Save plain text
125
- script_path = tmp / f"script_{code}.txt"
126
- script_path.write_text(script, encoding="utf-8")
127
- # Render HTML version
128
- html_script = f"<pre>{script}</pre>"
129
- # Synthesize audio if available
130
- audio_path = None
131
- if hf_tts_client:
132
- audio_path = synthesize(script, info["tts_model"], tmp / code)
133
- results.extend([audio_path, html_script, str(script_path)])
134
- return results
135
-
136
- # Interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  inputs = [
138
- gr.Textbox(label="Google AI Studio API Key", type="password"),
139
- gr.File(label="Lecture PDF", file_types=[".pdf"]),
140
- gr.CheckboxGroup(choices=[info["name"] for info in LANG_INFO.values()],
141
- value=["English"], label="Languages")
142
  ]
 
143
  outputs = []
144
- for code, info in LANG_INFO.items():
145
- outputs.append(gr.Audio(label=f"{info['name']} Podcast", type="filepath"))
146
- outputs.append(gr.HTML(label=f"{info['name']} Script HTML"))
147
- outputs.append(gr.File(label=f"Download {info['name']} Script"))
 
148
 
149
  iface = gr.Interface(
150
  fn=generate_podcast,
151
  inputs=inputs,
152
  outputs=outputs,
153
  title="Lecture → Podcast & Script",
 
 
 
 
 
154
  )
155
 
156
  if __name__ == "__main__":
 
 
157
  iface.launch()
 
1
  # =============================================================
2
+ # Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 
3
  # =============================================================
4
+ # • **Text generation** – Google Gemini API (via user-provided genai API Key)
5
+ # • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
6
+ # -----------------------------------------------------------------
7
+
8
  import os
9
  import re
10
  import tempfile
 
17
  from pydub import AudioSegment
18
  from pydub.exceptions import CouldntDecodeError
19
 
20
+ # For Hugging Face TTS
21
  from huggingface_hub import InferenceClient
22
 
23
+ # For Google Gemini
24
+ try:
25
+ import google.generativeai as genai
26
+ except ImportError:
27
+ raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
28
 
29
  # ------------------------------------------------------------------
30
+ # Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
31
  # ------------------------------------------------------------------
32
# Build the TTS client only when the HF_TOKEN secret is present; otherwise
# leave it as None so the rest of the app can fall back to script-only mode.
hf_token = os.getenv("HF_TOKEN")
hf_tts_client: Optional[InferenceClient] = (
    InferenceClient(token=hf_token) if hf_token else None
)
if hf_tts_client is None:
    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
38
 
39
+ # ------------------------------------------------------------------
40
+ # Language metadata for Hugging Face MMS-TTS models
41
+ # ------------------------------------------------------------------
42
  LANG_INFO: Dict[str, Dict[str, str]] = {
43
  "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
44
  "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
 
48
  }
49
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
50
 
51
+ # ------------------------------------------------------------------
52
+ # Prompt template for Gemini
53
+ # ------------------------------------------------------------------
54
  PROMPT_TEMPLATE = textwrap.dedent(
55
  """
56
  You are producing a lively two-host educational podcast in {lang_name}.
57
  Summarize the following lecture content into a dialogue of **approximately 300 words**.
58
  Make it engaging: hosts ask questions, clarify ideas with analogies, and
59
+ wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
60
 
61
  ### Lecture Content
62
  {content}
63
  """
64
  )
65
 
66
# PDF helpers --------------------------------------------------------
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF, joined with newlines.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string, so the number of lines separators still matches the page count.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page text (possibly empty).

    Raises:
        gr.Error: If the PDF cannot be opened or parsed. The original
            exception is kept as ``__cause__`` for server-side debugging.
    """
    try:
        reader = PdfReader(pdf_path)
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Chain the cause so the real parser traceback is not lost in logs.
        raise gr.Error(f"Failed to process PDF: {e}") from e
    return "\n".join(page_texts)
73
 
74
TOKEN_LIMIT = 8000


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cap ``text`` at ``limit`` whitespace-separated words.

    Text that is already within the limit is returned untouched (original
    whitespace included). Longer text is cut to its first ``limit`` words,
    re-joined with single spaces, and a Gradio warning is shown to the user.
    """
    tokens = text.split()
    total = len(tokens)
    if total <= limit:
        return text
    gr.Warning(f"Input text was truncated from {total} to {limit} words to fit LLM context window.")
    return " ".join(tokens[:limit])
81
+
82
+ # ------------------------------------------------------------------
83
+ # TTS helper using Hugging Face Inference API
84
+ # ------------------------------------------------------------------
85
+ CHUNK_CHAR_LIMIT_HF = 280
86
+ def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
87
+ sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
88
+ sentences = [s.strip() for s in sentences_raw if s.strip()]
89
+ chunks, current_chunk = [], ""
90
+ for sent in sentences:
91
+ if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
92
+ chunks.append(current_chunk)
93
+ current_chunk = sent
94
  else:
95
+ current_chunk += (" " + sent) if current_chunk else sent
96
+ if current_chunk:
97
+ chunks.append(current_chunk)
98
+ return [chunk for chunk in chunks if chunk.strip()]
99
+
100
def synthesize_speech_hf(
    text: str,
    hf_model_id: str,
    lang_tmpdir: Path,
    tts_client: InferenceClient
) -> Path:
    """Synthesize ``text`` chunk by chunk and return the combined FLAC path.

    Each chunk from ``_split_to_chunks_hf`` is sent to
    ``tts_client.text_to_speech``, written to ``lang_tmpdir`` as
    ``part_<index>.flac``, decoded, and finally concatenated into
    ``podcast_audio.flac`` in the same directory.

    Raises:
        ValueError: If the text yields no chunks.
        RuntimeError: If a TTS request fails or a chunk cannot be decoded.
    """
    pieces = _split_to_chunks_hf(text)
    if not pieces:
        raise ValueError("Text resulted in no speakable chunks after splitting.")

    total = len(pieces)
    decoded: List[AudioSegment] = []
    for number, piece in enumerate(pieces, start=1):
        gr.Info(f"Synthesizing audio for chunk {number}/{total} with HF TTS ({hf_model_id})...")
        try:
            raw_audio = tts_client.text_to_speech(piece, model=hf_model_id)
        except Exception as e:
            raise RuntimeError(f"HF TTS client error for chunk {number}: {e}") from e

        # File names are zero-based while user-facing messages are one-based.
        piece_file = lang_tmpdir / f"part_{number - 1}.flac"
        piece_file.write_bytes(raw_audio)
        try:
            decoded.append(AudioSegment.from_file(piece_file, format="flac"))
        except CouldntDecodeError as e:
            raise RuntimeError(f"Failed to decode audio chunk {number}: {e}") from e

    output_file = lang_tmpdir / "podcast_audio.flac"
    sum(decoded, AudioSegment.empty()).export(output_file, format="flac")
    return output_file
130
 
131
# ------------------------------------------------------------------
# Main pipeline function for Gradio
# ------------------------------------------------------------------
def generate_podcast(
    gemini_api_key_from_ui: Optional[str],
    pdf_file_obj: Optional[gr.File],
    selected_lang_names: List[str]
) -> List[Optional[Any]]:
    """Turn an uploaded lecture PDF into per-language podcast scripts and audio.

    For each selected language a dialogue script is requested from Gemini,
    kept as Markdown text and written to a ``.txt`` file. When the
    module-level ``hf_tts_client`` is configured, the script is also
    synthesized to audio; a TTS failure is reported as a warning and does not
    discard the script.

    Args:
        gemini_api_key_from_ui: Gemini API key typed into the UI.
        pdf_file_obj: Uploaded PDF; its ``.name`` is used as the file path.
        selected_lang_names: Display names of the chosen languages (keys of
            ``LANG_CODE_BY_NAME``).

    Returns:
        A flat list with three entries per language, in ``LANG_INFO`` order:
        audio path, Markdown script, script file path. Entries stay ``None``
        for languages that were not selected or for steps that produced
        nothing.

    Raises:
        gr.Error: On missing inputs, Gemini configuration or generation
            failure, an unreadable PDF, or a PDF with no extractable text.
    """
    if not gemini_api_key_from_ui:
        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
    if not pdf_file_obj:
        raise gr.Error("Please upload a PDF file.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")

    try:
        genai.configure(api_key=gemini_api_key_from_ui)
    except Exception as e:
        raise gr.Error(f"Failed to configure Gemini API: {e}") from e

    if not hf_tts_client:
        gr.Warning("HF TTS unavailable; only script will be generated.")

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results_data = {
        code: {"audio": None, "script_md": None, "script_file": None}
        for code in LANG_INFO
    }

    lecture_raw = extract_pdf_text(pdf_file_obj.name)
    lecture_text = truncate_text(lecture_raw)
    if not lecture_text.strip():
        raise gr.Error("Extracted PDF text is empty.")

    # The returned paths are read by Gradio only AFTER this function returns,
    # so the directory must outlive the call. A TemporaryDirectory context
    # would delete the audio and script files before they can be served.
    tmpdir_base = Path(tempfile.mkdtemp())

    gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

    for code in selected_codes:
        info = LANG_INFO[code]
        lang_name = info["name"]
        hf_tts_model_id = info["tts_model"]

        lang_tmpdir = tmpdir_base / code
        lang_tmpdir.mkdir(parents=True, exist_ok=True)

        # Step 1: generate the script via Gemini.
        prompt = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
        try:
            resp = gemini_model.generate_content(prompt)
            dialogue = resp.text or ""
        except Exception as e:
            raise gr.Error(f"Gemini error for {lang_name}: {e}") from e

        if not dialogue:
            # Nothing to save or speak for this language.
            continue

        results_data[code]["script_md"] = dialogue
        script_path = lang_tmpdir / f"podcast_script_{code}.txt"
        script_path.write_text(dialogue, encoding="utf-8")
        results_data[code]["script_file"] = str(script_path)

        # Step 2: synthesize audio via HF TTS (best effort).
        if hf_tts_client:
            try:
                audio_path = synthesize_speech_hf(
                    dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client
                )
                results_data[code]["audio"] = str(audio_path)
            except Exception as e:
                # gr.Error only has an effect when raised; use a warning so
                # the failure is visible while the script is still returned.
                gr.Warning(f"TTS error for {lang_name}: {e}")

    # Assemble outputs in the order: Audio, Markdown, File for each language.
    final_outputs: List[Optional[Any]] = []
    for code in LANG_INFO:
        out = results_data[code]
        final_outputs.extend([out["audio"], out["script_md"], out["script_file"]])

    return final_outputs
209
+
210
# ------------------------------------------------------------------
# Gradio Interface Setup
# ------------------------------------------------------------------
# Display names in LANG_INFO order; the output components below and the
# values returned by generate_podcast follow this same order.
language_names_ordered = [LANG_INFO[code]["name"] for code in LANG_INFO]

inputs = [
    gr.Textbox(
        label="Google Gemini API Key",
        type="password",
        placeholder="Paste your key here",
    ),
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_names_ordered,
        value=["English"],
        label="Select language(s)",
    ),
]

# Three components per language: audio player, rendered script, download.
outputs = []
for lang_name in language_names_ordered:
    outputs.extend(
        [
            gr.Audio(label=f"{lang_name} Podcast", type="filepath"),
            gr.Markdown(label=f"{lang_name} Script"),
            gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"),
        ]
    )

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script",
    description=(
        "Enter your Gemini API Key, upload a lecture PDF, choose language(s), "
        "and get a two-host podcast (audio) plus the Markdown script & downloadable text."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    if not os.getenv("HF_TOKEN"):
        print("Reminder: set HF_TOKEN in Secrets for TTS to work.")
    iface.launch()