File size: 7,083 Bytes
f1adb14
fe00684
f1adb14
fe00684
 
 
 
f1adb14
 
 
fe00684
f1adb14
 
50d2a40
c172b12
f1adb14
 
fe00684
f1adb14
f0eca57
f1adb14
 
fe00684
f1adb14
 
f0eca57
fe00684
f1adb14
 
 
 
fe00684
f1adb14
50d2a40
f1adb14
50d2a40
fe00684
 
50d2a40
f1adb14
50d2a40
 
fe00684
 
50d2a40
f1adb14
c172b12
 
fe00684
 
 
f1adb14
 
 
fe00684
50d2a40
 
fe00684
50d2a40
f1adb14
 
 
 
fe00684
f1adb14
50d2a40
 
 
f1adb14
fe00684
 
f1adb14
 
 
 
 
fe00684
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1adb14
50d2a40
f1adb14
 
fe00684
c172b12
fe00684
c172b12
 
fe00684
c172b12
fe00684
 
 
 
f0eca57
50d2a40
c172b12
fe00684
c172b12
 
fe00684
f1adb14
50d2a40
f1adb14
fe00684
 
f0eca57
fe00684
f1adb14
fe00684
f1adb14
 
fe00684
f1adb14
c172b12
 
 
 
 
 
 
 
 
 
 
fe00684
c172b12
f0eca57
f1adb14
 
 
c172b12
fe00684
c172b12
f0eca57
fe00684
 
 
 
f0eca57
f1adb14
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User‑selectable Languages)
# =============================================================
# • **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct)
# • **Speech synthesis** – `InferenceClient.text_to_speech`, chunk‑safe
#   (MMS‑TTS for en/bn/ur/ne, mms‑TTS‑zho for zh). Long texts are split
#   into ≤280‑char chunks to stay within HF endpoint limits.
# -----------------------------------------------------------------

import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import gradio as gr
from huggingface_hub import InferenceClient, HubHTTPError
from PyPDF2 import PdfReader
from smolagents import HfApiModel

# ------------------------------------------------------------------
# LLM setup – remote Qwen model via SmolAgents
# ------------------------------------------------------------------
# HfApiModel proxies generation calls to the HF Inference API; the instance
# is used as a callable with a prompt string (see generate_podcast).
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2048,  # cap on generated tokens per call
    temperature=0.5,  # moderate creativity for the podcast dialogue
)

# ------------------------------------------------------------------
# Hugging Face Inference API client (uses HF_TOKEN secret if provided)
# ------------------------------------------------------------------
# With token=None the client falls back to anonymous / cached credentials;
# HF_TOKEN is the conventional Space secret name.
client = InferenceClient(token=os.getenv("HF_TOKEN", None))

# ------------------------------------------------------------------
# Supported languages: UI code -> display name + per-language MMS-TTS repo.
# (MMS-TTS covers 100+ languages; each language lives in its own repo keyed
# by an ISO-639-3 suffix.)
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": name, "tts_model": f"facebook/mms-tts-{iso3}"}
    for code, name, iso3 in [
        ("en", "English", "eng"),
        ("bn", "Bangla", "ben"),
        ("zh", "Chinese", "zho"),
        ("ur", "Urdu", "urd"),
        ("ne", "Nepali", "npi"),
    ]
}
# Reverse lookup: human-readable name -> short code (used by the UI checkboxes).
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

# ------------------------------------------------------------------
# Prompt template for the dialogue generator. The ≈300-word target keeps the
# resulting script short enough for chunked TTS synthesis. Placeholders:
# {lang_name} – display language name, {content} – truncated lecture text.
# ------------------------------------------------------------------
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **≈300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.
    
    ### Lecture Content
    {content}
    """
)

# PDF helpers -------------------------------------------------------

def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page in *pdf_path*, joined by newlines.

    Pages where extraction yields ``None`` contribute an empty string so the
    join never fails.
    """
    reader = PdfReader(pdf_path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

TOKEN_LIMIT = 4000  # approx words before hitting context limit


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most the first *limit* whitespace-separated words of *text*."""
    return " ".join(text.split()[:limit])

# ------------------------------------------------------------------
# TTS helper – chunk long text safely (HF endpoint ~30 s / 200‑300 chars)
# ------------------------------------------------------------------
CHUNK_CHAR_LIMIT = 280  # safe margin for MMS‑TTS

def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
    # split on sentence boundaries while respecting limit
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    chunks, current = [], ""
    for sent in sentences:
        if len(current) + len(sent) + 1 > limit:
            if current:
                chunks.append(current.strip())
            current = sent
        else:
            current += " " + sent if current else sent
    if current:
        chunks.append(current.strip())
    return chunks


def synthesize_speech(text: str, model_id: str, tmpdir: Path) -> Path:
    """Synthesize *text* via the HF TTS endpoint into one FLAC file.

    The text is split into ≤CHUNK_CHAR_LIMIT character chunks, each chunk is
    synthesized separately, and the raw FLAC bytes are concatenated into
    ``tmpdir/podcast.flac``.

    Parameters:
        text: dialogue script to speak.
        model_id: HF repo id of an MMS-TTS checkpoint (see LANG_INFO).
        tmpdir: output directory for part files and the final FLAC; created
            if missing.

    Raises:
        RuntimeError: if a TTS request fails.
    """
    # BUG FIX: callers pass `tmpdir / code`, a directory that does not exist
    # yet — write_bytes() below raised FileNotFoundError without this.
    tmpdir.mkdir(parents=True, exist_ok=True)

    chunks = _split_to_chunks(text)
    flac_paths: List[Path] = []
    for idx, chunk in enumerate(chunks):
        try:
            audio_bytes = client.text_to_speech(chunk, model=model_id)
        # NOTE(review): huggingface_hub exports HfHubHTTPError (in
        # huggingface_hub.utils); confirm `HubHTTPError` resolves at import.
        except HubHTTPError as e:
            raise RuntimeError(f"TTS request failed: {e}") from e
        part_path = tmpdir / f"part_{idx}.flac"
        part_path.write_bytes(audio_bytes)
        flac_paths.append(part_path)

    # NOTE(review): byte-concatenating complete FLAC streams does not form a
    # single valid FLAC; many players stop after the first stream. Consider
    # decoding + re-encoding (pydub/ffmpeg) if playback truncates.
    final_path = tmpdir / "podcast.flac"
    with open(final_path, "wb") as fout:
        for p in flac_paths:
            fout.write(p.read_bytes())
    return final_path

# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------

def generate_podcast(pdf: gr.File, selected_lang_names: List[str]):
    """Generate one podcast audio file per selected language.

    Returns a list aligned positionally with LANG_INFO's iteration order —
    one entry per gr.Audio output slot: a filepath string for selected
    languages, None for unselected ones.

    Raises:
        gr.Error: if no PDF was uploaded or no language was selected.
    """
    # BUG FIX: with no upload, `pdf` is None and `pdf.name` raised an
    # opaque AttributeError; surface a user-facing error instead.
    if pdf is None:
        raise gr.Error("Please upload a lecture PDF.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language.")

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results: List[Optional[str]] = []

    # BUG FIX: the original used a TemporaryDirectory context manager, which
    # deleted the audio files as soon as this function returned — before
    # Gradio could serve the returned filepaths. mkdtemp keeps them alive
    # (at the cost of deferring cleanup to the OS / Space restart).
    tmpdir = Path(tempfile.mkdtemp())
    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            results.append(None)  # keep alignment with the fixed output slots
            continue

        # 1️⃣ Generate dialogue
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)

        # 2️⃣ Speech synthesis (chunked)
        tts_path = synthesize_speech(dialogue, info["tts_model"], tmpdir / code)

        # NOTE(review): the original appended (str(path), None) tuples, but
        # gr.Audio(type="filepath") expects a plain path string — a 2-tuple
        # is interpreted as (sample_rate, data). Confirm against the gradio
        # version pinned by the Space.
        results.append(str(tts_path))

    return results

# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------
# Checkbox labels are the human-readable names; generate_podcast maps them
# back to short language codes via LANG_CODE_BY_NAME.
language_choices = [info["name"] for info in LANG_INFO.values()]

inputs = [
    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(
        choices=language_choices,
        value=["English"],  # English pre-selected by default
        label="Select podcast language(s) to generate",
    ),
]

# One fixed Audio slot per supported language; generate_podcast returns its
# results positionally (None for languages that were not selected).
outputs = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose language(s), and receive a two‑host "
        "audio podcast. Dialogue comes from Qwen‑32B; speech is streamed "
        "via the HF Inference API using open MMS‑TTS models. Long texts are "
        "automatically chunked to fit API limits."
    ),
)

# Local entry point; on HF Spaces the platform launches the app itself.
if __name__ == "__main__":
    iface.launch()