# File size: 7,466 Bytes
# Revision: 910bbfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
# directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
# using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
# Face *audio‑generation* model for speech (no external TTS APIs).
# -----------------------------------------------------------------
# Files for your Space:
#   • app.py              (this file)
#   • requirements.txt    (see bottom comment block)
# -----------------------------------------------------------------
# Add your HF_TOKEN as a Space secret if required for gated models.
# =============================================================

import os
import tempfile
import uuid
import textwrap
from typing import Dict, Tuple

import gradio as gr
from PyPDF2 import PdfReader
import nltk  # sentence tokenisation
from llama_index.llms.huggingface import HfApiModel
from transformers import pipeline  # HF TTS pipeline
import soundfile as sf  # save audio

# ---------------------------------------------------------------
# Ensure NLTK punkt is present on first launch
# ---------------------------------------------------------------
# Newer NLTK releases load the sentence tokenizer from the "punkt_tab"
# resource, while older ones use the pickled "punkt" model. Make sure both are
# present so nltk.sent_tokenize() works whichever version gets installed.
for _tokenizer_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_tokenizer_resource}")
    except LookupError:
        # Not found in any NLTK data directory: fetch it once at start-up.
        nltk.download(_tokenizer_resource)

# --------------------------- LLM Setup ---------------------------
# Shared text-generation client; generate_dialogue() calls llm.complete(prompt)
# on it and reads `.text` from the result.
# NOTE(review): `HfApiModel` with exactly these keyword arguments looks like the
# smolagents model class — confirm that `llama_index.llms.huggingface` really
# exports it and that the object offers `.complete()` returning `.text`.
llm = HfApiModel(
    max_tokens=2096,  # presumably the generation length cap; NOTE(review): 2096 may be a typo for 2048 — confirm
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # text generation
    custom_role_conversions=None,
)

# ------------------------ TTS Setup ------------------------------
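# Multilingual TTS model. NOTE(review): the "≈ 500 MB" size and ">100
# languages" figures look unverified — the XTTS-v2 model card lists 17
# languages; confirm that Bangla, Urdu and Nepali are actually covered.
# If you need lighter weights, choose language‑specific VITS models.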
# ----------------------------------------------------------------
# NOTE(review): this identifier uses the Coqui-TTS naming scheme rather than a
# Hub "org/name" repository id — confirm that transformers.pipeline() can
# resolve it.
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
# Load once at import time; Space queues requests so single GPU/CPU is okay.
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",  # GPU if available, else CPU
    )
except Exception as e:
    # Fail fast with a message naming the model, and chain the original error
    # explicitly so the root cause stays attached to the traceback.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}") from e

# ------------------------ Helpers --------------------------------
# One row per supported language:
#   (UI label / dict key, language code passed to the TTS call,
#    language name interpolated into the LLM prompt)
_LANGUAGE_ROWS = (
    ("English", "en", "English"),
    ("Bangla", "bn", "Bangla (বাংলা)"),
    ("Chinese", "zh", "Mandarin Chinese"),
    ("Urdu", "ur", "Urdu (اردو)"),
    ("Nepali", "ne", "Nepali (नेपाली)"),
)

# Maps the UI label to its per-language settings; insertion order is the order
# in which languages are processed and displayed.
LANG_CONFIG = {
    label: {"tts_lang": tts_code, "prompt_tag": prompt_name}
    for label, tts_code, prompt_name in _LANGUAGE_ROWS
}


def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, truncated to avoid token overflow.

    Pages are read in order and joined with newlines; reading stops as soon as
    the collected page text reaches ``max_chars`` characters.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum number of characters to return. Zero or a negative
            value yields an empty string without opening the file.

    Returns:
        At most ``max_chars`` characters of extracted text; pages for which
        the extractor returns nothing contribute an empty string.
    """
    # A negative limit would otherwise turn the final slice into "drop the
    # last n characters", which is never what a character budget means.
    if max_chars <= 0:
        return ""

    reader = PdfReader(pdf_path)
    text_parts = []
    collected = 0  # running total, so the length is not re-summed per page
    for page in reader.pages:
        page_text = page.extract_text() or ""
        text_parts.append(page_text)
        collected += len(page_text)
        if collected >= max_chars:
            break
    return "\n".join(text_parts)[:max_chars]


def build_prompt(lecture_text: str, lang: str, max_sentences: int = 150) -> str:
    """Craft a prompt instructing the LLM to return a dialogue in `lang`.

    Args:
        lecture_text: Lecture text to discuss; only its leading sentences are
            included in the prompt.
        lang: Key of ``LANG_CONFIG`` selecting the output language.
        max_sentences: Maximum number of lecture sentences to embed, to stay
            under the token budget.

    Returns:
        The instruction paragraph, a "Lecture content" heading, and the
        shortened lecture text, terminated by a newline.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANG_CONFIG``.
    """
    sentences = nltk.sent_tokenize(lecture_text)
    short_text = " ".join(sentences[: max(max_sentences, 0)])

    # Dedent first, strip afterwards. Stripping the raw literal first removes
    # the first line's indentation, which leaves dedent() no common margin to
    # remove and keeps the source indentation inside the prompt.
    instructions = textwrap.dedent(
        f"""

        You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.



        Lecture content (for reference):

        """
    ).strip()
    return instructions + "\n" + short_text + "\n"


def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Ask the LLM for a podcast script about the lecture in the given language.

    The prompt is built outside the guarded section, so prompt-building errors
    propagate to the caller. Any failure of the model call itself is converted
    into an explanatory string that is returned in place of the script.
    """
    script_prompt = build_prompt(lecture_text, lang)
    try:
        completion = llm.complete(script_prompt)
        script = completion.text.strip()
    except Exception as exc:
        # Best effort: hand back the failure description instead of raising.
        return f"Error generating dialogue in {lang}: {exc}"
    return script


def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesize `text` with the shared TTS pipeline and save it to a temp file.

    Returns a ``(filepath, mime)`` pair: a WAV file with mime ``"audio/wav"``
    on success, or a small UTF-8 text file describing the failure with mime
    ``"text/plain"`` when synthesis or writing raises.
    """
    tts_language = LANG_CONFIG[lang]["tts_lang"]
    out_dir = tempfile.gettempdir()
    # Random suffix keeps concurrent or repeated runs from overwriting each other.
    wav_path = os.path.join(out_dir, f"{lang}_{uuid.uuid4().hex}.wav")
    try:
        # The language code is passed to the model as a forward parameter.
        synthesized = tts_pipeline(text, forward_params={"language": tts_language})
        sf.write(wav_path, synthesized["audio"], synthesized["sampling_rate"])
    except Exception as exc:
        # Leave a text report behind rather than propagating the failure.
        report_path = os.path.join(out_dir, f"err_{uuid.uuid4().hex}.txt")
        with open(report_path, "w", encoding="utf-8") as report:
            report.write(f"TTS error for {lang}: {exc}\n")
        return report_path, "text/plain"
    return wav_path, "audio/wav"


def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Run the full pipeline and return a dict: language → (file_path, mime).

    Args:
        pdf_file: The uploaded PDF as delivered by the Gradio file input, or
            ``None`` when nothing was uploaded. It is passed unchanged to
            ``extract_text``.

    Returns:
        One entry per key of ``LANG_CONFIG`` (in that order), each holding the
        ``(file_path, mime)`` pair produced by ``tts_for_dialogue``.

    Raises:
        gr.Error: If no file was uploaded, or the PDF yields no text.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")
    lecture_text = extract_text(pdf_file)
    # Without text (e.g. a scanned, image-only PDF) every language would
    # trigger an LLM and a TTS run on an empty lecture; stop early instead.
    if not lecture_text.strip():
        raise gr.Error("No extractable text was found in the uploaded PDF.")

    audio_outputs = {}
    for lang in LANG_CONFIG:
        dialogue = generate_dialogue(lecture_text, lang)
        audio_outputs[lang] = tts_for_dialogue(lang, dialogue)
    return audio_outputs


# ------------------------ Gradio UI --------------------------------

with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        """# 📚🎙️ Lecture → Podcast

Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.

"""
    )
    with gr.Row():
        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
        btn = gr.Button("Generate Podcast")
    with gr.Group():
        # One read-only player per language, in LANG_CONFIG order.
        audio_components = [
            gr.Audio(label=lang, interactive=False, type="filepath")
            for lang in LANG_CONFIG
        ]

    def gradio_wrapper(pdf_file):
        """Run the pipeline and return one value per audio player.

        Each value is the generated audio file path, or ``None`` when the
        pipeline produced a non-audio result for that language (the TTS step
        reports failures as a text file, which must not reach an audio player).
        """
        results = pipeline_runner(pdf_file)
        player_values = []
        for lang in LANG_CONFIG:
            path, mime = results[lang]
            if mime.startswith("audio/"):
                player_values.append(path)
            else:
                # Tell the user instead of showing a broken player.
                gr.Warning(f"Could not generate audio for {lang}.")
                player_values.append(None)
        return player_values

    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)

# Launch the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()

# ---------------------------------------------------------------
# requirements.txt  (commit as separate file in the Space root)
# ---------------------------------------------------------------
# gradio>=4.28.0
# PyPDF2>=3.0.1
# nltk>=3.8.1
# transformers>=4.39.0
# torch>=2.1.2
# soundfile>=0.12.1
# llama-index>=0.11.47
# huggingface-hub>=0.23.0