Upload 2 files
Browse files- app.py +195 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# =============================================================
|
2 |
+
# Hugging Face Space – Lecture → Multilingual Podcast Generator
|
3 |
+
# =============================================================
|
4 |
+
# Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
|
5 |
+
# directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
|
6 |
+
# using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
|
7 |
+
# Face *audio‑generation* model for speech (no external TTS APIs).
|
8 |
+
# -----------------------------------------------------------------
|
9 |
+
# Files for your Space:
|
10 |
+
# • app.py (this file)
|
11 |
+
# • requirements.txt (see bottom comment block)
|
12 |
+
# -----------------------------------------------------------------
|
13 |
+
# Add your HF_TOKEN as a Space secret if required for gated models.
|
14 |
+
# =============================================================
|
15 |
+
|
16 |
+
import os
|
17 |
+
import tempfile
|
18 |
+
import uuid
|
19 |
+
import textwrap
|
20 |
+
from typing import Dict, Tuple
|
21 |
+
|
22 |
+
import gradio as gr
|
23 |
+
from PyPDF2 import PdfReader
|
24 |
+
import nltk # sentence tokenisation
|
25 |
+
from llama_index.llms.huggingface import HfApiModel
|
26 |
+
from transformers import pipeline # HF TTS pipeline
|
27 |
+
import soundfile as sf # save audio
|
28 |
+
|
29 |
+
# ---------------------------------------------------------------
# Ensure NLTK sentence-tokenizer data is present on first launch.
# NLTK < 3.9 resolves sent_tokenize via "punkt"; NLTK >= 3.9 switched
# to "punkt_tab". requirements.txt pins `nltk>=3.8.1` with no upper
# bound, so fetch both resources to avoid a runtime LookupError on a
# fresh install with a newer NLTK.
# ---------------------------------------------------------------
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)
36 |
+
|
37 |
+
# --------------------------- LLM Setup ---------------------------
# Shared text-generation client used by generate_dialogue().
# NOTE(review): `HfApiModel` is the smolagents inference-API class;
# `llama_index.llms.huggingface` does not export a name by that spelling
# (it provides HuggingFaceInferenceAPI / HuggingFaceLLM) — confirm the
# import at the top of this file actually resolves with the pinned
# llama-index version.
llm = HfApiModel(
    max_tokens=2096,  # NOTE(review): 2096 looks like a typo for 2048 — confirm intended budget
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # text generation
    custom_role_conversions=None,
)
|
44 |
+
|
45 |
+
# ------------------------ TTS Setup ------------------------------
# Multilingual TTS model (≈ 500 MB). It supports >100 languages.
# If you need lighter weights choose language‑specific VITS models.
# ----------------------------------------------------------------
# NOTE(review): "tts_models/multilingual/multi-dataset/xtts_v2" is a
# Coqui-TTS model *path*, not a Hugging Face Hub repo id, so the
# transformers "text-to-speech" pipeline cannot download or load it
# (XTTS-v2 lives at "coqui/XTTS-v2" and is not pipeline-compatible
# anyway). As written, the RuntimeError below will fire at startup —
# verify against a transformers-supported checkpoint (e.g. an MMS-TTS
# model) or switch to the Coqui TTS library.
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
# Load once; Space queues requests so single GPU/CPU is okay.
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",  # GPU if available, else CPU
    )
except Exception as e:
    # Fail fast with a clear message rather than at first synthesis call.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}")
|
59 |
+
|
60 |
+
# ------------------------ Helpers --------------------------------
# Per-language settings used throughout the pipeline:
#   tts_lang   — language code forwarded to the TTS pipeline
#   prompt_tag — human-readable language name injected into the LLM prompt
LANG_CONFIG: Dict[str, Dict[str, str]] = {
    "English": {
        "tts_lang": "en",
        "prompt_tag": "English",
    },
    "Bangla": {
        "tts_lang": "bn",
        "prompt_tag": "Bangla (বাংলা)",
    },
    "Chinese": {
        "tts_lang": "zh",
        "prompt_tag": "Mandarin Chinese",
    },
    "Urdu": {
        "tts_lang": "ur",
        "prompt_tag": "Urdu (اردو)",
    },
    "Nepali": {
        "tts_lang": "ne",
        "prompt_tag": "Nepali (नेपाली)",
    },
}
|
83 |
+
|
84 |
+
|
85 |
+
def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, truncated to at most ``max_chars``.

    Args:
        pdf_path: Filesystem path to the PDF to read.
        max_chars: Character budget; extraction stops once reached to
            avoid overflowing the LLM token limit downstream.

    Returns:
        The concatenated page text (pages joined with newlines), cut to
        ``max_chars`` characters.
    """
    reader = PdfReader(pdf_path)
    text_parts = []
    # Keep a running total instead of re-summing all collected parts on
    # every page (the original check was O(pages²) in total characters).
    total = 0
    for page in reader.pages:
        page_text = page.extract_text() or ""  # extract_text() may return None
        text_parts.append(page_text)
        total += len(page_text)
        if total >= max_chars:
            break
    return "\n".join(text_parts)[:max_chars]
|
96 |
+
|
97 |
+
|
98 |
+
def build_prompt(lecture_text: str, lang: str) -> str:
    """Craft a prompt instructing the LLM to return a dialogue in `lang`.

    Args:
        lecture_text: Raw lecture text (already truncated by extract_text).
        lang: A key of LANG_CONFIG selecting the output language.

    Returns:
        The full prompt string: instructions followed by the compressed
        lecture content.
    """
    # Compress lecture to ~150 sentences to stay under token budget.
    sentences = nltk.sent_tokenize(lecture_text)
    short_text = " ".join(sentences[:150])

    # Fix: dedent() must run BEFORE strip(). Stripping first removes the
    # leading newline, so the first line carries no margin, the common
    # prefix becomes empty, and dedent() is a no-op — leaving interior
    # lines of the prompt indented.
    instructions = textwrap.dedent(
        f"""
        You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.

        Lecture content (for reference):
        """
    ).strip()
    return instructions + "\n" + short_text + "\n"
|
112 |
+
|
113 |
+
|
114 |
+
def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Ask the Qwen model for a podcast script in the given language.

    On any LLM failure the error is folded into the returned string so
    the per-language pipeline keeps going instead of aborting.
    """
    # Build the prompt outside the try-block: a prompt-construction error
    # (e.g. unknown `lang`) should propagate, not be masked as an LLM error.
    prompt = build_prompt(lecture_text, lang)
    try:
        return llm.complete(prompt).text.strip()
    except Exception as exc:
        return f"Error generating dialogue in {lang}: {exc}"
|
123 |
+
|
124 |
+
|
125 |
+
def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesize `text` with the HF TTS pipeline.

    Returns a ``(file_path, mime_type)`` pair: a WAV file on success, or
    a plain-text file describing the failure on error.
    """
    # Resolve the language code up front so a bad `lang` key raises
    # instead of being swallowed by the TTS error handler below.
    language = LANG_CONFIG[lang]["tts_lang"]
    tmp_root = tempfile.gettempdir()
    wav_path = os.path.join(tmp_root, f"{lang}_{uuid.uuid4().hex}.wav")
    try:
        # xtts_v2 accepts a `language` forward param
        result = tts_pipeline(text, forward_params={"language": language})
        sf.write(wav_path, result["audio"], result["sampling_rate"])
    except Exception as exc:
        # Degrade gracefully: hand back a text file explaining the error.
        err_path = os.path.join(tmp_root, f"err_{uuid.uuid4().hex}.txt")
        with open(err_path, "w", encoding="utf-8") as handle:
            handle.write(f"TTS error for {lang}: {exc}\n")
        return err_path, "text/plain"
    return wav_path, "audio/wav"
|
141 |
+
|
142 |
+
|
143 |
+
def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Run the full pipeline: PDF → dialogue → speech for every language.

    Returns:
        Mapping of language name → (file_path, mime_type).

    Raises:
        gr.Error: if no PDF was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")
    lecture_text = extract_text(pdf_file)

    # Generate script and audio per language, preserving LANG_CONFIG order.
    return {
        lang: tts_for_dialogue(lang, generate_dialogue(lecture_text, lang))
        for lang in LANG_CONFIG
    }
|
155 |
+
|
156 |
+
|
157 |
+
# ------------------------ Gradio UI --------------------------------

# Blocks layout: a file input + button on top, one audio player per
# supported language below. The click handler fans out to the full
# pipeline and fills the players in LANG_CONFIG order.
with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        """# 📚🎙️ Lecture → Podcast
Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.
"""
    )
    with gr.Row():
        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
        btn = gr.Button("Generate Podcast")
    with gr.Group():
        # One read-only player per language; order must match the list
        # returned by gradio_wrapper below.
        audio_components = [
            gr.Audio(label=lang, interactive=False, type="filepath")
            for lang in LANG_CONFIG.keys()
        ]

    def gradio_wrapper(pdf_file):
        # Flatten the language→(path, mime) dict into the ordered list of
        # file paths Gradio expects for the audio components.
        results = pipeline_runner(pdf_file)
        return [results[lang][0] for lang in LANG_CONFIG.keys()]

    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)

if __name__ == "__main__":
    demo.launch()
|
184 |
+
|
185 |
+
# ---------------------------------------------------------------
|
186 |
+
# requirements.txt (commit as separate file in the Space root)
|
187 |
+
# ---------------------------------------------------------------
|
188 |
+
# gradio>=4.28.0
|
189 |
+
# PyPDF2>=3.0.1
|
190 |
+
# nltk>=3.8.1
|
191 |
+
# transformers>=4.39.0
|
192 |
+
# torch>=2.1.2
|
193 |
+
# soundfile>=0.12.1
|
194 |
+
# llama-index>=0.11.47
|
195 |
+
# huggingface-hub>=0.23.0
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.28.0
|
2 |
+
PyPDF2>=3.0.1
|
3 |
+
nltk>=3.8.1
|
4 |
+
transformers>=4.39.0
|
5 |
+
torch>=2.1.2
|
6 |
+
soundfile>=0.12.1
|
7 |
+
llama-index>=0.11.47
|
8 |
+
huggingface-hub>=0.23.0
|