HaiderAUT committed on
Commit
f1adb14
·
verified ·
1 Parent(s): 910bbfc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -195
app.py CHANGED
@@ -1,195 +1,118 @@
1
- # =============================================================
2
- # Hugging Face Space – Lecture → Multilingual Podcast Generator
3
- # =============================================================
4
- # Upload a lecture PDF generate a two‑host dialogue (podcast)
5
- # directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
6
- # using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
7
- # Face *audio‑generation* model for speech (no external TTS APIs).
8
- # -----------------------------------------------------------------
9
- # Files for your Space:
10
- # • app.py (this file)
11
- # • requirements.txt (see bottom comment block)
12
- # -----------------------------------------------------------------
13
- # Add your HF_TOKEN as a Space secret if required for gated models.
14
- # =============================================================
15
-
16
- import os
17
- import tempfile
18
- import uuid
19
- import textwrap
20
- from typing import Dict, Tuple
21
-
22
- import gradio as gr
23
- from PyPDF2 import PdfReader
24
- import nltk # sentence tokenisation
25
- from llama_index.llms.huggingface import HfApiModel
26
- from transformers import pipeline # HF TTS pipeline
27
- import soundfile as sf # save audio
28
-
29
- # ---------------------------------------------------------------
30
- # Ensure NLTK punkt is present on first launch
31
- # ---------------------------------------------------------------
32
- try:
33
- nltk.data.find("tokenizers/punkt")
34
- except LookupError:
35
- nltk.download("punkt")
36
-
37
- # --------------------------- LLM Setup ---------------------------
38
- llm = HfApiModel(
39
- max_tokens=2096,
40
- temperature=0.5,
41
- model_id="Qwen/Qwen2.5-Coder-32B-Instruct", # text generation
42
- custom_role_conversions=None,
43
- )
44
-
45
- # ------------------------ TTS Setup ------------------------------
46
- # Multilingual TTS model (≈ 500 MB). It supports >100 languages.
47
- # If you need lighter weights choose language‑specific VITS models.
48
- # ----------------------------------------------------------------
49
- TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
50
- # Load once; Space queues requests so single GPU/CPU is okay.
51
- try:
52
- tts_pipeline = pipeline(
53
- "text-to-speech",
54
- model=TTS_MODEL_ID,
55
- device_map="auto", # GPU if available, else CPU
56
- )
57
- except Exception as e:
58
- raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}")
59
-
60
- # ------------------------ Helpers --------------------------------
61
- LANG_CONFIG = {
62
- "English": {
63
- "tts_lang": "en",
64
- "prompt_tag": "English",
65
- },
66
- "Bangla": {
67
- "tts_lang": "bn",
68
- "prompt_tag": "Bangla (বাংলা)",
69
- },
70
- "Chinese": {
71
- "tts_lang": "zh",
72
- "prompt_tag": "Mandarin Chinese",
73
- },
74
- "Urdu": {
75
- "tts_lang": "ur",
76
- "prompt_tag": "Urdu (اردو)",
77
- },
78
- "Nepali": {
79
- "tts_lang": "ne",
80
- "prompt_tag": "Nepali (नेपाली)",
81
- },
82
- }
83
-
84
-
85
- def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
86
- """Extract raw text from PDF, truncate to avoid token overflow."""
87
- reader = PdfReader(pdf_path)
88
- text_parts = []
89
- for page in reader.pages:
90
- page_text = page.extract_text() or ""
91
- text_parts.append(page_text)
92
- if sum(len(t) for t in text_parts) >= max_chars:
93
- break
94
- raw_text = "\n".join(text_parts)[:max_chars]
95
- return raw_text
96
-
97
-
98
- def build_prompt(lecture_text: str, lang: str) -> str:
99
- """Craft a prompt instructing the LLM to return a dialogue in `lang`."""
100
- # Compress lecture to ~150 sentences to stay under token budget
101
- sentences = nltk.sent_tokenize(lecture_text)
102
- short_text = " ".join(sentences[: min(len(sentences), 150)])
103
-
104
- prompt = textwrap.dedent(
105
- f"""
106
- You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.
107
-
108
- Lecture content (for reference):
109
- """.strip()
110
- ) + "\n" + short_text + "\n"
111
- return prompt
112
-
113
-
114
- def generate_dialogue(lecture_text: str, lang: str) -> str:
115
- """Call the Qwen model to generate podcast script for the given language."""
116
- prompt = build_prompt(lecture_text, lang)
117
- try:
118
- response = llm.complete(prompt)
119
- dialogue = response.text.strip()
120
- except Exception as e:
121
- dialogue = f"Error generating dialogue in {lang}: {e}"
122
- return dialogue
123
-
124
-
125
- def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
126
- """Convert text to speech via HF TTS; returns (filepath, mime)."""
127
- language_code = LANG_CONFIG[lang]["tts_lang"]
128
- tmpdir = tempfile.gettempdir()
129
- filename = os.path.join(tmpdir, f"{lang}_{uuid.uuid4().hex}.wav")
130
- try:
131
- # xtts_v2 accepts a `language` forward param
132
- speech = tts_pipeline(text, forward_params={"language": language_code})
133
- sf.write(filename, speech["audio"], speech["sampling_rate"])
134
- return filename, "audio/wav"
135
- except Exception as e:
136
- # Return dummy text file explaining error
137
- errfile = os.path.join(tmpdir, f"err_{uuid.uuid4().hex}.txt")
138
- with open(errfile, "w", encoding="utf-8") as fh:
139
- fh.write(f"TTS error for {lang}: {e}\n")
140
- return errfile, "text/plain"
141
-
142
-
143
- def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
144
- """Full pipeline returning a dict: language → (file_path, mime)."""
145
- if pdf_file is None:
146
- raise gr.Error("Please upload a PDF lecture first.")
147
- lecture_text = extract_text(pdf_file)
148
-
149
- audio_outputs = {}
150
- for lang in LANG_CONFIG.keys():
151
- dialogue = generate_dialogue(lecture_text, lang)
152
- path, mime = tts_for_dialogue(lang, dialogue)
153
- audio_outputs[lang] = (path, mime)
154
- return audio_outputs
155
-
156
-
157
- # ------------------------ Gradio UI --------------------------------
158
-
159
- with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
160
- gr.Markdown(
161
- """# 📚🎙️ Lecture → Podcast
162
- Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.
163
- """
164
- )
165
- with gr.Row():
166
- inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
167
- btn = gr.Button("Generate Podcast")
168
- with gr.Group():
169
- audio_components = [
170
- gr.Audio(label=lang, interactive=False, type="filepath")
171
- for lang in LANG_CONFIG.keys()
172
- ]
173
-
174
-
175
- def gradio_wrapper(pdf_file):
176
- results = pipeline_runner(pdf_file)
177
- return [results[lang][0] for lang in LANG_CONFIG.keys()]
178
-
179
-
180
- btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)
181
-
182
- if __name__ == "__main__":
183
- demo.launch()
184
-
185
- # ---------------------------------------------------------------
186
- # requirements.txt (commit as separate file in the Space root)
187
- # ---------------------------------------------------------------
188
- # gradio>=4.28.0
189
- # PyPDF2>=3.0.1
190
- # nltk>=3.8.1
191
- # transformers>=4.39.0
192
- # torch>=2.1.2
193
- # soundfile>=0.12.1
194
- # llama-index>=0.11.47
195
- # huggingface-hub>=0.23.0
 
1
+ # =============================================================
2
+ # Hugging Face Space – Lecture → Multilingual Podcast Generator
3
+ # =============================================================
4
+ # Uses SmolAgents HfApiModel for text generation and HF audio
5
+ # pipeline for speech. Generates two‑host dialogues in five
6
+ # languages (English, Bangla, Chinese, Urdu, Nepali) directly
7
+ # from a PDF lecture upload.
8
+ # -----------------------------------------------------------------
9
+
10
import os
import tempfile
import textwrap
import uuid
from typing import Dict, List

import gradio as gr
from PyPDF2 import PdfReader
from scipy.io import wavfile  # WAV writer for the TTS pipeline's numpy output
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
from transformers import pipeline  # for audio generation (e.g., xtts)
20
+
21
# ------------------------------------------------------------------
# LLM configuration (SmolAgents wrapper for HF Inference API)
# ------------------------------------------------------------------
# Text-generation backend for the podcast dialogue. HfApiModel proxies the
# HF Inference API, so no model weights are downloaded locally.
llm = HfApiModel(
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # 32B-parameter instruct model (comment previously said 34B)
    max_tokens=2096,                 # generation budget per call
    temperature=0.5,                 # moderate creativity for dialogue
    custom_role_conversions=None,
)
30
+
31
# ------------------------------------------------------------------
# Audio model (multilingual text ➜ speech); choose an open xtts‑v2
# model that supports our languages. Switch model id if you prefer.
# ------------------------------------------------------------------
# NOTE(review): "suno/xtts_v2" does not look like a real Hub id — XTTS-v2 is
# published as "coqui/XTTS-v2", and XTTS is not natively supported by the
# transformers "text-to-audio" pipeline. Confirm the model id before deploy.
try:
    audio_pipe = pipeline(
        "text-to-audio",
        model="suno/xtts_v2",
        framework="pt",  # force PyTorch weights
    )
except Exception as e:
    # Fail fast at startup with a clear message (matches the error handling
    # style used elsewhere in this Space) instead of a bare stack trace.
    raise RuntimeError(f"Failed to load TTS model 'suno/xtts_v2': {e}") from e
40
+
41
# Supported output languages: ISO 639-1 code → display name + speaker id.
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": name, "speaker": speaker}
    for code, name, speaker in (
        ("en", "English", "hostA"),
        ("bn", "Bangla", "hostB"),
        ("zh", "Chinese", "hostC"),
        ("ur", "Urdu", "hostD"),
        ("ne", "Nepali", "hostE"),
    )
}
48
+
49
# Dialogue prompt; filled per language via .format(lang_name=..., content=...).
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of about 1200 words.
    Use an engaging style: hosts ask each other questions, clarify ideas, add
    simple analogies, and conclude with a short recap. Keep technical accuracy.

    ### Lecture Content
    {content}
    """
)
60
+
61
# ------------------------------------------------------------------
# Utility: extract & truncate PDF text to fit LLM token budget
# ------------------------------------------------------------------

def extract_pdf_text(pdf_file) -> str:
    """Return the text of every page in *pdf_file*, newline-separated.

    Pages with no extractable text contribute an empty string.
    """
    pages = PdfReader(pdf_file).pages
    chunks = [page.extract_text() or "" for page in pages]
    return "\n".join(chunks)
69
+
70
# Rough prompt budget: cap the lecture at this many whitespace-separated words.
TOKEN_LIMIT = 6000  # conservative words (≈ tokens) for prompt+response


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return *text* cut to at most *limit* words (whitespace-tokenised).

    Note: output is re-joined with single spaces, so runs of whitespace
    (including newlines) are normalised even when no truncation occurs.
    """
    tokens = text.split()
    if len(tokens) <= limit:
        return " ".join(tokens)
    return " ".join(tokens[:limit])
76
+
77
# ------------------------------------------------------------------
# Main generation function
# ------------------------------------------------------------------

def generate_podcast(pdf: gr.File) -> List[str]:
    """Generate one podcast WAV per language from an uploaded lecture PDF.

    Returns a list of WAV file paths in LANG_INFO order, matching the order
    of the Gradio output components.
    """
    # BUG FIX: the original used `with tempfile.TemporaryDirectory()`, which
    # deleted the WAV files as soon as the function returned — Gradio then
    # received dead paths. mkdtemp() persists until the OS cleans it up.
    out_dir = tempfile.mkdtemp(prefix=f"podcast_{uuid.uuid4().hex}_")
    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    wav_paths: List[str] = []
    for lang_code, info in LANG_INFO.items():
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)

        # --- Generate dialogue ---
        dialogue = llm(prompt)
        if not isinstance(dialogue, str):
            # Some model wrappers return a message object rather than a plain
            # string; unwrap defensively so file-write and TTS get text.
            dialogue = getattr(dialogue, "content", None) or str(dialogue)

        # Save the transcript alongside the audio for transparency/debugging.
        text_path = os.path.join(out_dir, f"podcast_{lang_code}.txt")
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(dialogue)

        # --- TTS ---
        result = audio_pipe(dialogue, forward_params={"language": lang_code})
        wav_path = os.path.join(out_dir, f"podcast_{lang_code}.wav")
        # BUG FIX: the pipeline returns {"audio": ndarray, "sampling_rate": int};
        # ndarrays have no .export() (that is a pydub API) — write via scipy.
        wavfile.write(wav_path, result["sampling_rate"], result["audio"])
        # BUG FIX: gr.Audio(type="filepath") expects plain paths, not
        # (path, label) tuples.
        wav_paths.append(wav_path)

    return wav_paths
102
+
103
# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------

# One audio player per language, in LANG_INFO order (the same order the
# generation function yields its outputs).
audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and get a two‑host audio podcast in "
        "English, Bangla, Chinese, Urdu, and Nepali."
    ),
)

if __name__ == "__main__":
    iface.launch()