HaiderAUT committed on
Commit
fe00684
·
verified ·
1 Parent(s): c172b12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -46
app.py CHANGED
@@ -1,118 +1,156 @@
1
  # =============================================================
2
- # Hugging Face Space – Lecture  Podcast Generator (User‑selectable Languages)
3
  # =============================================================
4
- # * **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct).
5
- # * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`.
6
- # * Users pick which languages to generate (English, Bangla, Chinese,
7
- # Urdu, Nepali). Unselected languages are skipped.
8
  # -----------------------------------------------------------------
9
 
10
  import os
 
11
  import tempfile
12
  import textwrap
13
  from pathlib import Path
14
  from typing import List, Dict, Tuple, Optional
15
 
16
  import gradio as gr
17
- from huggingface_hub import InferenceClient
18
  from PyPDF2 import PdfReader
19
  from smolagents import HfApiModel
20
 
21
  # ------------------------------------------------------------------
22
- # LLM: Qwen 32‑B via SmolAgents
23
  # ------------------------------------------------------------------
24
  llm = HfApiModel(
25
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
26
- max_tokens=2096,
27
  temperature=0.5,
28
- custom_role_conversions=None,
29
  )
30
 
31
  # ------------------------------------------------------------------
32
- # HF Inference API client (reads HF_TOKEN secret if set)
33
  # ------------------------------------------------------------------
34
  client = InferenceClient(token=os.getenv("HF_TOKEN", None))
35
 
36
  # ------------------------------------------------------------------
37
- # Language metadata and matching TTS model IDs
 
38
  # ------------------------------------------------------------------
39
  LANG_INFO: Dict[str, Dict[str, str]] = {
40
  "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
41
  "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
42
- "zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
43
- "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd-script_arabic"},
44
  "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
45
  }
46
-
47
- # Helper map: name ➜ code
48
  LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
49
 
 
 
 
50
  PROMPT_TEMPLATE = textwrap.dedent(
51
  """
52
  You are producing a lively two‑host educational podcast in {lang_name}.
53
- Summarize the following lecture content into a dialogue of ≈1200 words.
54
  Make it engaging: hosts ask questions, clarify ideas with analogies, and
55
  wrap up with a concise recap. Preserve technical accuracy.
56
-
57
  ### Lecture Content
58
  {content}
59
  """
60
  )
61
 
62
- # ------------------------------------------------------------------
63
- # Helpers: extract and truncate PDF text
64
- # ------------------------------------------------------------------
65
 
66
  def extract_pdf_text(pdf_path: str) -> str:
67
  reader = PdfReader(pdf_path)
68
  return "\n".join(page.extract_text() or "" for page in reader.pages)
69
 
70
- TOKEN_LIMIT = 6000 # rough word‑level cap before hitting context limit
 
71
 
72
  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
73
  words = text.split()
74
  return " ".join(words[:limit])
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # ------------------------------------------------------------------
77
  # Main pipeline
78
  # ------------------------------------------------------------------
79
 
80
- def generate_podcast(pdf: gr.File, selected_lang_names: List[str]) -> List[Optional[Tuple[str, None]]]:
81
- """Generate podcast audio files for chosen languages. Returns a list
82
- aligned with LANG_INFO order; unselected languages yield None."""
83
- # Ensure at least one language selected
84
  if not selected_lang_names:
85
- return [None] * len(LANG_INFO)
86
 
87
  selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
 
88
 
89
- with tempfile.TemporaryDirectory() as tmpdir:
90
- raw_text = extract_pdf_text(pdf.name)
91
- lecture_text = truncate_text(raw_text)
92
- outputs: List[Optional[Tuple[str, None]]] = []
93
 
94
  for code, info in LANG_INFO.items():
95
  if code not in selected_codes:
96
- outputs.append(None)
97
  continue
98
 
99
- # 1️⃣ Draft dialogue in the target language
100
  prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
101
  dialogue: str = llm(prompt)
102
 
103
- # 2️⃣ Synthesize speech via HF Inference API
104
- audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
105
- flac_path = Path(tmpdir) / f"podcast_{code}.flac"
106
- flac_path.write_bytes(audio_bytes)
107
 
108
- outputs.append((str(flac_path), None)) # (filepath, label)
109
 
110
- return outputs
111
 
112
  # ------------------------------------------------------------------
113
- # Gradio interface
114
  # ------------------------------------------------------------------
115
-
116
  language_choices = [info["name"] for info in LANG_INFO.values()]
117
 
118
  inputs = [
@@ -124,20 +162,20 @@ inputs = [
124
  ),
125
  ]
126
 
127
- audio_components = [
128
  gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
129
  ]
130
 
131
  iface = gr.Interface(
132
  fn=generate_podcast,
133
  inputs=inputs,
134
- outputs=audio_components,
135
  title="Lecture → Podcast Generator (Choose Languages)",
136
  description=(
137
- "Upload a lecture PDF, choose your desired languages, and receive a "
138
- "two‑host audio podcast. Dialogue is crafted by Qwen‑32B; speech is "
139
- "synthesized on‑the‑fly using the Hugging Face Inference API "
140
- "no heavy downloads or GPUs required."
141
  ),
142
  )
143
 
 
1
  # =============================================================
2
+ # HuggingFaceSpace – LecturePodcastGenerator (User‑selectable Languages)
3
  # =============================================================
4
+ # **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct)
5
+ # **Speech synthesis** – `InferenceClient.text_to_speech`, chunk‑safe
6
+ # (MMS‑TTS for en/bn/ur/ne, mms‑TTS‑zho for zh). Long texts are split
7
+ # into ≤280‑char chunks to stay within HF endpoint limits.
8
  # -----------------------------------------------------------------
9
 
10
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import gradio as gr
from huggingface_hub import InferenceClient
# `HubHTTPError` is not a public huggingface_hub name; alias the real
# exception class so existing `except HubHTTPError` clauses keep working.
from huggingface_hub.utils import HfHubHTTPError as HubHTTPError
from PyPDF2 import PdfReader
from smolagents import HfApiModel
21
 
22
  # ------------------------------------------------------------------
23
+ # LLM setup – remote Qwen model via SmolAgents
24
  # ------------------------------------------------------------------
25
  llm = HfApiModel(
26
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
27
+ max_tokens=2048,
28
  temperature=0.5,
 
29
  )
30
 
31
  # ------------------------------------------------------------------
32
+ # Hugging Face Inference API client (uses HF_TOKEN secret if provided)
33
  # ------------------------------------------------------------------
34
  client = InferenceClient(token=os.getenv("HF_TOKEN", None))
35
 
36
  # ------------------------------------------------------------------
37
+ # Language metadata and corresponding open TTS model IDs
38
+ # (MMS‑TTS supports 100+ langs but per‑lang repos have shorter ids)
39
  # ------------------------------------------------------------------
40
# Per-language display names and the open TTS checkpoints used for synthesis.
# NOTE(review): verify the zh/ur repo ids exist on the Hub — the previous
# revision used "facebook/mms-tts-urd-script_arabic" for Urdu; confirm
# "facebook/mms-tts-zho" and "facebook/mms-tts-urd" resolve before shipping.
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}

# Reverse lookup: human-readable language name -> language code.
LANG_CODE_BY_NAME: Dict[str, str] = {
    meta["name"]: lang for lang, meta in LANG_INFO.items()
}
48
 
49
+ # ------------------------------------------------------------------
50
+ # Prompt template (≈300 words to keep TTS happy)
51
+ # ------------------------------------------------------------------
52
  PROMPT_TEMPLATE = textwrap.dedent(
53
  """
54
  You are producing a lively two‑host educational podcast in {lang_name}.
55
+ Summarize the following lecture content into a dialogue of **≈300 words**.
56
  Make it engaging: hosts ask questions, clarify ideas with analogies, and
57
  wrap up with a concise recap. Preserve technical accuracy.
58
+
59
  ### Lecture Content
60
  {content}
61
  """
62
  )
63
 
64
+ # PDF helpers -------------------------------------------------------
 
 
65
 
66
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF at *pdf_path*, newline-joined.

    Pages with no extractable text (scans, pure images) contribute an
    empty line so page boundaries are preserved.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
69
 
70
TOKEN_LIMIT = 4000  # approx words before hitting context limit


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most the first *limit* whitespace-separated words of *text*.

    Runs of whitespace collapse to single spaces as a side effect of the
    split/join round-trip; an empty string comes back unchanged.
    """
    return " ".join(text.split()[:limit])
76
 
77
+ # ------------------------------------------------------------------
78
+ # TTS helper – chunk long text safely (HF endpoint ~30 s / 200‑300 chars)
79
+ # ------------------------------------------------------------------
80
+ CHUNK_CHAR_LIMIT = 280 # safe margin for MMS‑TTS
81
+
82
+ def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
83
+ # split on sentence boundaries while respecting limit
84
+ sentences = re.split(r"(?<=[.!?])\s+", text.strip())
85
+ chunks, current = [], ""
86
+ for sent in sentences:
87
+ if len(current) + len(sent) + 1 > limit:
88
+ if current:
89
+ chunks.append(current.strip())
90
+ current = sent
91
+ else:
92
+ current += " " + sent if current else sent
93
+ if current:
94
+ chunks.append(current.strip())
95
+ return chunks
96
+
97
+
98
def synthesize_speech(text: str, model_id: str, tmpdir: Path) -> Path:
    """Synthesize *text* chunk-by-chunk via the HF Inference API.

    The text is split into <= CHUNK_CHAR_LIMIT character pieces, each piece
    is sent to the hosted TTS model *model_id*, and the returned FLAC byte
    streams are written back-to-back into ``tmpdir / "podcast.flac"``.

    Returns the path of the combined FLAC file.

    Raises:
        RuntimeError: if any per-chunk TTS request fails.
    """
    # Bug fix: callers pass a per-language sub-directory that may not exist
    # yet; without this mkdir the first write_bytes() raised FileNotFoundError.
    tmpdir.mkdir(parents=True, exist_ok=True)

    part_paths: List[Path] = []
    for idx, chunk in enumerate(_split_to_chunks(text)):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        except HubHTTPError as e:
            raise RuntimeError(f"TTS request failed: {e}") from e
        part = tmpdir / f"part_{idx}.flac"
        part.write_bytes(audio_bytes)
        part_paths.append(part)

    # NOTE(review): byte-concatenating FLAC files keeps each part's own
    # header; many players stop after the first part. Re-encoding the parts
    # (e.g. ffmpeg/pydub) would produce a single valid stream — confirm the
    # target player tolerates sequential FLAC before relying on this.
    final_path = tmpdir / "podcast.flac"
    with open(final_path, "wb") as fout:
        for p in part_paths:
            fout.write(p.read_bytes())
    return final_path
118
+
119
  # ------------------------------------------------------------------
120
  # Main pipeline
121
  # ------------------------------------------------------------------
122
 
123
def generate_podcast(pdf: gr.File, selected_lang_names: List[str]):
    """Generate podcast audio for the chosen languages.

    Returns a list aligned with LANG_INFO's insertion order: a filepath
    string for each selected language and None for unselected ones, so the
    entries line up with the fixed gr.Audio output components.

    Raises:
        gr.Error: if no language was selected.
    """
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results: List[Optional[str]] = []

    # Bug fix: the previous TemporaryDirectory() context deleted the audio
    # files on exit — before Gradio could stream them to the client.
    # mkdtemp() persists until the process (or the OS tmp reaper) cleans up.
    tmpdir = Path(tempfile.mkdtemp(prefix="lecture_podcast_"))

    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            results.append(None)  # keep slot alignment with the Audio outputs
            continue

        # 1) Draft the dialogue in the target language.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)

        # 2) Chunk-safe TTS; each language gets its own sub-directory,
        # created here so this function works even with the old helper.
        lang_dir = tmpdir / code
        lang_dir.mkdir(parents=True, exist_ok=True)
        tts_path = synthesize_speech(dialogue, info["tts_model"], lang_dir)

        # Bug fix: gr.Audio(type="filepath") expects a plain path string;
        # the previous (path, None) tuple is not a valid Audio value.
        results.append(str(tts_path))

    return results
150
 
151
  # ------------------------------------------------------------------
152
+ # Gradio Interface
153
  # ------------------------------------------------------------------
 
154
  language_choices = [info["name"] for info in LANG_INFO.values()]
155
 
156
  inputs = [
 
162
  ),
163
  ]
164
 
165
# One fixed Audio slot per supported language, in LANG_INFO order, so the
# pipeline's positional results map onto them directly.
outputs = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]

# UI copy, assembled once for readability.
_DESCRIPTION = (
    "Upload a lecture PDF, choose language(s), and receive a two‑host "
    "audio podcast. Dialogue comes from Qwen‑32B; speech is streamed "
    "via the HF Inference API using open MMS‑TTS models. Long texts are "
    "automatically chunked to fit API limits."
)

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=_DESCRIPTION,
)
181