HaiderAUT committed on
Commit
910bbfc
·
verified ·
1 Parent(s): 8f20f5f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +195 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================
2
+ # Hugging Face Space – Lecture → Multilingual Podcast Generator
3
+ # =============================================================
4
+ # Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
5
+ # directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
6
+ # using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
7
+ # Face *audio‑generation* model for speech (no external TTS APIs).
8
+ # -----------------------------------------------------------------
9
+ # Files for your Space:
10
+ # • app.py (this file)
11
+ # • requirements.txt (see bottom comment block)
12
+ # -----------------------------------------------------------------
13
+ # Add your HF_TOKEN as a Space secret if required for gated models.
14
+ # =============================================================
15
+
16
+ import os
17
+ import tempfile
18
+ import uuid
19
+ import textwrap
20
+ from typing import Dict, Tuple
21
+
22
+ import gradio as gr
23
+ from PyPDF2 import PdfReader
24
+ import nltk # sentence tokenisation
25
+ from llama_index.llms.huggingface import HfApiModel
26
+ from transformers import pipeline # HF TTS pipeline
27
+ import soundfile as sf # save audio
28
+
29
# ---------------------------------------------------------------
# Ensure the NLTK sentence-tokenizer data is present on first launch.
# NLTK >= 3.9 looks up "punkt_tab" rather than the legacy "punkt"
# resource when sent_tokenize() runs, so probe (and fetch) both.
# nltk.download() reports failure for an unknown resource id rather
# than raising, so this remains safe on older NLTK versions too.
# ---------------------------------------------------------------
for _punkt_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_punkt_resource}")
    except LookupError:
        nltk.download(_punkt_resource)
36
+
37
# --------------------------- LLM Setup ---------------------------
# Shared text-generation client used by generate_dialogue().
# NOTE(review): `HfApiModel` is imported above from
# llama_index.llms.huggingface; recent llama-index releases expose
# `HuggingFaceInferenceAPI` instead — confirm this symbol exists in
# the pinned llama-index version before deploying.
llm = HfApiModel(
    max_tokens=2096,  # NOTE(review): 2096 looks like a typo for 2048 — confirm
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # text generation
    custom_role_conversions=None,
)
44
+
45
# ------------------------ TTS Setup ------------------------------
# Multilingual TTS model (≈ 500 MB). It supports >100 languages.
# If you need lighter weights choose language‑specific VITS models.
# NOTE(review): this identifier is a Coqui-TTS model name, not a
# Hugging Face Hub repo id — transformers' "text-to-speech" pipeline
# will likely fail to resolve it. Verify, or switch to a Hub
# checkpoint (e.g. "suno/bark-small" or a "facebook/mms-tts-*" model).
# ----------------------------------------------------------------
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
# Load once at import time; the Space queues requests so a single
# GPU/CPU instance is okay.
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",  # GPU if available, else CPU
    )
except Exception as e:
    # Fail fast at startup: the app is useless without a TTS backend.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}")
59
+
60
# ------------------------ Helpers --------------------------------
# Per-language settings:
#   tts_lang   — language code forwarded to the TTS pipeline
#   prompt_tag — human-readable language label injected into the LLM prompt
# Dict insertion order drives the output ordering used by
# pipeline_runner and the Gradio audio components.
LANG_CONFIG: Dict[str, Dict[str, str]] = {
    "English": {
        "tts_lang": "en",
        "prompt_tag": "English",
    },
    "Bangla": {
        "tts_lang": "bn",
        "prompt_tag": "Bangla (বাংলা)",
    },
    "Chinese": {
        "tts_lang": "zh",
        "prompt_tag": "Mandarin Chinese",
    },
    "Urdu": {
        "tts_lang": "ur",
        "prompt_tag": "Urdu (اردو)",
    },
    "Nepali": {
        "tts_lang": "ne",
        "prompt_tag": "Nepali (नेपाली)",
    },
}
83
+
84
+
85
def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, truncated to at most ``max_chars``.

    Args:
        pdf_path: Filesystem path of the PDF to read.
        max_chars: Upper bound on the returned text length; page reading
            stops as soon as the running total reaches this limit so very
            long documents are not fully parsed.

    Returns:
        The concatenated page text (pages joined with newlines), cut to
        ``max_chars`` characters.
    """
    reader = PdfReader(pdf_path)
    text_parts = []
    total = 0
    for page in reader.pages:
        page_text = page.extract_text() or ""  # extract_text() may return None
        text_parts.append(page_text)
        # Keep a running total instead of re-summing every collected part
        # on each iteration (the original was quadratic in page count).
        total += len(page_text)
        if total >= max_chars:
            break
    return "\n".join(text_parts)[:max_chars]
96
+
97
+
98
def build_prompt(lecture_text: str, lang: str) -> str:
    """Craft a prompt instructing the LLM to return a dialogue in `lang`.

    Args:
        lecture_text: Raw lecture text (already truncated by extract_text).
        lang: A key of LANG_CONFIG naming the target output language.

    Returns:
        Prompt string: dialogue instructions followed by the lecture
        compressed to at most 150 sentences to stay under the token budget.
    """
    sentences = nltk.sent_tokenize(lecture_text)
    # Slicing already copes with fewer than 150 sentences; the original
    # min(len(sentences), 150) bound was redundant.
    short_text = " ".join(sentences[:150])

    # Bug fix: the original called textwrap.dedent(f"...".strip()).
    # Stripping first leaves the opening line with zero indentation, so
    # dedent() finds an empty common prefix and removes nothing from the
    # interior lines. Dedent first, then strip.
    instructions = textwrap.dedent(
        f"""
        You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.

        Lecture content (for reference):
        """
    ).strip()
    return instructions + "\n" + short_text + "\n"
112
+
113
+
114
def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Generate the podcast script for one language via the Qwen LLM.

    Any failure is folded into the returned string rather than raised,
    so one bad language cannot abort the remaining languages.
    """
    prompt = build_prompt(lecture_text, lang)
    try:
        script = llm.complete(prompt).text.strip()
    except Exception as e:
        script = f"Error generating dialogue in {lang}: {e}"
    return script
123
+
124
+
125
def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesize `text` with the shared HF TTS pipeline.

    Returns a (file_path, mime_type) pair: a WAV file on success,
    otherwise a small text file describing the failure so the caller
    still receives a usable path.
    """
    lang_code = LANG_CONFIG[lang]["tts_lang"]
    tmp_dir = tempfile.gettempdir()
    wav_path = os.path.join(tmp_dir, f"{lang}_{uuid.uuid4().hex}.wav")
    try:
        # xtts_v2 accepts a `language` forward param
        result = tts_pipeline(text, forward_params={"language": lang_code})
        sf.write(wav_path, result["audio"], result["sampling_rate"])
        return wav_path, "audio/wav"
    except Exception as e:
        # Return dummy text file explaining error
        err_path = os.path.join(tmp_dir, f"err_{uuid.uuid4().hex}.txt")
        with open(err_path, "w", encoding="utf-8") as fh:
            fh.write(f"TTS error for {lang}: {e}\n")
        return err_path, "text/plain"
141
+
142
+
143
def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Run extraction → dialogue → TTS for every configured language.

    Returns a mapping of language name to (file_path, mime_type).
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")
    lecture_text = extract_text(pdf_file)
    # Dict comprehension preserves LANG_CONFIG insertion order, matching
    # the ordering the Gradio layer relies on.
    return {
        lang: tts_for_dialogue(lang, generate_dialogue(lecture_text, lang))
        for lang in LANG_CONFIG
    }
155
+
156
+
157
# ------------------------ Gradio UI --------------------------------
# UI wiring: a file input, one button, and one audio player per
# configured language. All components must be created inside the
# gr.Blocks context so the click handler is registered with the app.
with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        """# 📚🎙️ Lecture → Podcast
Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.
"""
    )
    with gr.Row():
        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
        btn = gr.Button("Generate Podcast")
    with gr.Group():
        # One audio player per language, in LANG_CONFIG insertion order;
        # must match the ordering of gradio_wrapper's return list.
        audio_components = [
            gr.Audio(label=lang, interactive=False, type="filepath")
            for lang in LANG_CONFIG.keys()
        ]

    def gradio_wrapper(pdf_file):
        # Flatten the language → (path, mime) dict into a list of file
        # paths ordered like audio_components.
        # NOTE(review): on a TTS failure pipeline_runner returns a .txt
        # path, which gr.Audio cannot play — confirm the intended UX.
        results = pipeline_runner(pdf_file)
        return [results[lang][0] for lang in LANG_CONFIG.keys()]

    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)

if __name__ == "__main__":
    demo.launch()
184
+
185
+ # ---------------------------------------------------------------
186
+ # requirements.txt (commit as separate file in the Space root)
187
+ # ---------------------------------------------------------------
188
+ # gradio>=4.28.0
189
+ # PyPDF2>=3.0.1
190
+ # nltk>=3.8.1
191
+ # transformers>=4.39.0
192
+ # torch>=2.1.2
193
+ # soundfile>=0.12.1
194
+ # llama-index>=0.11.47
195
+ # huggingface-hub>=0.23.0
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.28.0
2
+ PyPDF2>=3.0.1
3
+ nltk>=3.8.1
4
+ transformers>=4.39.0
5
+ torch>=2.1.2
6
+ soundfile>=0.12.1
7
+ llama-index>=0.11.47
8
+ huggingface-hub>=0.23.0