# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# * **Text generation** – SmolAgents `HfApiModel` running the remote
#   Qwen/Qwen2.5-Coder-32B-Instruct model.
# * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`
#   (serverless) with open models per language – no heavy local
#   downloads.
# * Outputs five FLAC files (English, Bangla, Chinese, Urdu, Nepali).
# -----------------------------------------------------------------

import os
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict

import gradio as gr
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from smolagents import HfApiModel

# ------------------------------------------------------------------
# LLM: Qwen 32B via SmolAgents
# ------------------------------------------------------------------
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,
    temperature=0.5,
    custom_role_conversions=None,
)

# ------------------------------------------------------------------
# HF Inference API client (reads HF_TOKEN secret if set)
# ------------------------------------------------------------------
client = InferenceClient(token=os.getenv("HF_TOKEN", None))

# ------------------------------------------------------------------
# Language metadata and matching TTS model IDs
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
    # MMS lacks mainstream Mandarin — fallback to an open Chinese TTS
    "zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
    "ur": {"name": "Urdu",    "tts_model": "facebook/mms-tts-urd-script_arabic"},
    "ne": {"name": "Nepali",  "tts_model": "facebook/mms-tts-npi"},
}

PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of ≈1200 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.

    ### Lecture Content
    {content}
    """
)

# ------------------------------------------------------------------
# Helpers: extract and truncate PDF text
# ------------------------------------------------------------------

def extract_pdf_text(pdf_path: str) -> str:
    """Concatenate the text of every page (empty string for image-only pages)."""
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)

TOKEN_LIMIT = 6000  # rough word‑level cap before hitting context limit

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    words = text.split()
    return " ".join(words[:limit])

# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------

def generate_podcast(pdf) -> List[str]:
    """Generate a multilingual podcast from a lecture PDF.

    Returns one FLAC file path per language, in LANG_INFO order.
    """
    # Use mkdtemp rather than a TemporaryDirectory context manager: the
    # files must still exist when Gradio reads them after this returns.
    tmpdir = tempfile.mkdtemp()
    # Depending on the Gradio version, gr.File passes a path string or a
    # file-like object with a .name attribute.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    raw_text = extract_pdf_text(pdf_path)
    lecture_text = truncate_text(raw_text)
    outputs: List[str] = []

    for code, info in LANG_INFO.items():
        # 1️⃣ Draft dialogue in the target language
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        # HfApiModel expects a chat-style message list and returns a
        # message object whose .content holds the generated text.
        dialogue: str = llm([{"role": "user", "content": prompt}]).content

        # 2️⃣ Synthesize speech via HF Inference API
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = Path(tmpdir) / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)

        # gr.Audio(type="filepath") expects a plain path string, not a tuple
        outputs.append(str(flac_path))

    return outputs
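
# ------------------------------------------------------------------
# Optional: chunked TTS (hedged sketch, not used above)
# ------------------------------------------------------------------
# Serverless TTS endpoints often cap input length, and the exact limit is
# model-dependent, so a ~1200-word dialogue may be rejected outright. The
# helper below is an assumption-labelled sketch: it splits text on sentence
# boundaries and naively concatenates the returned bytes. Raw FLAC byte
# concatenation is not valid audio in general; a real implementation would
# decode and merge the chunks with an audio library such as pydub.

def synthesize_in_chunks(text: str, model: str, max_chars: int = 1000) -> bytes:
    """Synthesize long text piecewise via client.text_to_speech."""
    import re

    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks: List[str] = []
    current = ""
    for sentence in sentences:
        if current and len(current) + len(sentence) + 1 > max_chars:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)

    return b"".join(client.text_to_speech(chunk, model=model) for chunk in chunks)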

# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------

audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and receive a two‑host audio podcast in five "
        "languages (English, Bangla, Chinese, Urdu, Nepali). Dialogue is "
        "crafted by Qwen‑32B; speech is synthesized on‑the‑fly using the "
        "Hugging Face Inference API — no heavy downloads or GPUs required."
    ),
)

if __name__ == "__main__":
    iface.launch()