File size: 5,606 Bytes
f1adb14
c172b12
f1adb14
c172b12
 
 
 
f1adb14
 
 
 
 
50d2a40
c172b12
f1adb14
 
50d2a40
f1adb14
f0eca57
f1adb14
 
50d2a40
f1adb14
 
f0eca57
f1adb14
 
 
 
 
 
50d2a40
f1adb14
50d2a40
f1adb14
50d2a40
 
 
f1adb14
50d2a40
 
 
 
 
f1adb14
 
c172b12
 
 
f1adb14
 
 
50d2a40
 
 
f1adb14
50d2a40
f1adb14
 
 
 
 
50d2a40
f1adb14
 
50d2a40
 
 
f1adb14
50d2a40
f1adb14
 
 
 
 
 
50d2a40
f1adb14
 
c172b12
 
 
 
 
 
 
 
 
f1adb14
50d2a40
 
c172b12
f0eca57
50d2a40
c172b12
 
 
 
50d2a40
f1adb14
50d2a40
f1adb14
50d2a40
 
 
 
f0eca57
50d2a40
f1adb14
50d2a40
f1adb14
 
50d2a40
f1adb14
 
c172b12
 
 
 
 
 
 
 
 
 
 
f0eca57
c172b12
f0eca57
f1adb14
 
 
c172b12
f1adb14
c172b12
f0eca57
c172b12
 
 
 
f0eca57
f1adb14
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# =============================================================
# Hugging Face Space – Lecture → Podcast Generator (User‑selectable Languages)
# =============================================================
# * **Text generation** – SmolAgents `HfApiModel` (Qwen/Qwen2.5‑Coder‑32B‑Instruct).
# * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`.
# * Users pick which languages to generate (English, Bangla, Chinese,
#   Urdu, Nepali). Unselected languages are skipped.
# -----------------------------------------------------------------

import os
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import gradio as gr
from huggingface_hub import InferenceClient
from PyPDF2 import PdfReader
from smolagents import HfApiModel

# ------------------------------------------------------------------
# LLM: Qwen 32‑B via SmolAgents
# ------------------------------------------------------------------
# Text-generation model used to draft the podcast dialogue.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,  # NOTE(review): 2096 looks like a typo for 2048 — confirm intended cap
    temperature=0.5,  # moderate randomness: varied phrasing, stable structure
    custom_role_conversions=None,
)

# ------------------------------------------------------------------
# HF Inference API client (reads HF_TOKEN secret if set)
# ------------------------------------------------------------------
# With no HF_TOKEN set, the client falls back to unauthenticated requests
# (stricter rate limits on the public Inference API).
client = InferenceClient(token=os.getenv("HF_TOKEN", None))

# ------------------------------------------------------------------
# Language metadata and matching TTS model IDs
# ------------------------------------------------------------------
# Supported output languages: ISO 639-1 code ➜ display name + TTS model ID.
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla", "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "myshell-ai/MeloTTS-Chinese"},
    "ur": {"name": "Urdu", "tts_model": "facebook/mms-tts-urd-script_arabic"},
    "ne": {"name": "Nepali", "tts_model": "facebook/mms-tts-npi"},
}

# Reverse lookup: display name ➜ ISO code (UI shows names, pipeline uses codes).
LANG_CODE_BY_NAME = {meta["name"]: lang_code for lang_code, meta in LANG_INFO.items()}

# Dialogue-drafting prompt; format() fills {lang_name} and {content} per
# selected language. dedent strips the source-level indentation.
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of ≈1200 words.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.

    ### Lecture Content
    {content}
    """
)

# ------------------------------------------------------------------
# Helpers: extract and truncate PDF text
# ------------------------------------------------------------------

def extract_pdf_text(pdf_path: str) -> str:
    """Concatenate the extractable text of every PDF page, newline-separated.

    Pages with no extractable text (``extract_text()`` returns None, e.g.
    scanned images) contribute an empty string rather than raising.
    """
    document = PdfReader(pdf_path)
    page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)

TOKEN_LIMIT = 6000  # rough word‑level cap before hitting context limit

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Keep at most *limit* whitespace-separated words of *text*.

    Note: splitting and re-joining also collapses runs of whitespace
    into single spaces.
    """
    kept_words = text.split()[:limit]
    return " ".join(kept_words)

# ------------------------------------------------------------------
# Main pipeline
# ------------------------------------------------------------------

def generate_podcast(pdf: gr.File, selected_lang_names: List[str]) -> List[Optional[str]]:
    """Generate podcast audio files for the chosen languages.

    Returns a list aligned with LANG_INFO order: a FLAC filepath for each
    selected language, None for unselected ones (keeps each gr.Audio output
    slot empty).
    """
    # Guard: nothing uploaded or no language ticked — leave every slot empty.
    if pdf is None or not selected_lang_names:
        return [None] * len(LANG_INFO)

    selected_codes = {LANG_CODE_BY_NAME[name] for name in selected_lang_names}

    # BUGFIX: use mkdtemp, not a TemporaryDirectory context manager — the
    # context manager deletes the directory as soon as this function returns,
    # before Gradio reads the returned filepaths. mkdtemp persists the files
    # (the OS temp dir is cleaned up externally).
    tmpdir = tempfile.mkdtemp()

    lecture_text = truncate_text(extract_pdf_text(pdf.name))
    outputs: List[Optional[str]] = []

    for code, info in LANG_INFO.items():
        if code not in selected_codes:
            outputs.append(None)
            continue

        # 1️⃣ Draft dialogue in the target language.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        # BUGFIX: HfApiModel is called with a chat-message list, not a bare
        # string, and replies with a message object whose .content holds the
        # generated text. Fall back to str() for model wrappers that return
        # plain strings.
        response = llm([{"role": "user", "content": prompt}])
        dialogue: str = getattr(response, "content", None) or str(response)

        # 2️⃣ Synthesize speech via the HF Inference API.
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = Path(tmpdir) / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)

        # BUGFIX: gr.Audio(type="filepath") expects a plain path string;
        # the previous (path, None) tuple is not a valid filepath value.
        outputs.append(str(flac_path))

    return outputs

# ------------------------------------------------------------------
# Gradio interface
# ------------------------------------------------------------------

# Checkbox choices shown to the user, in LANG_INFO order.
language_choices = [meta["name"] for meta in LANG_INFO.values()]

pdf_input = gr.File(label="Upload Lecture PDF", file_types=[".pdf"])
language_selector = gr.CheckboxGroup(
    choices=language_choices,
    value=["English"],
    label="Select podcast language(s) to generate",
)
inputs = [pdf_input, language_selector]

# One audio player per supported language; order matches the list returned
# by generate_podcast, so each slot lines up with its language.
audio_components = []
for meta in LANG_INFO.values():
    audio_components.append(gr.Audio(label=f"{meta['name']} Podcast", type="filepath"))

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=audio_components,
    title="Lecture → Podcast Generator (Choose Languages)",
    description=(
        "Upload a lecture PDF, choose your desired languages, and receive a "
        "two‑host audio podcast. Dialogue is crafted by Qwen‑32B; speech is "
        "synthesized on‑the‑fly using the Hugging Face Inference API — "
        "no heavy downloads or GPUs required."
    ),
)

if __name__ == "__main__":
    iface.launch()