# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
# * Speech synthesis: **Coqui XTTS‑v2** open model via the TTS lib
#   (no private / gated repo, so it runs without a HF token).
# * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
# -----------------------------------------------------------------

import os
import tempfile
import uuid
import textwrap
from typing import List, Dict

import gradio as gr
from PyPDF2 import PdfReader
from smolagents import HfApiModel
from TTS.api import TTS  # ↳ Coqui TTS

# ------------------------------------------------------------------
# LLM configuration (SmolAgents wrapper for HF Inference API)
# ------------------------------------------------------------------
# SmolAgents wrapper around the Hugging Face Inference API; called below as
# llm(prompt) to turn lecture text into a podcast dialogue.
llm = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    max_tokens=2096,  # NOTE(review): unusual limit — likely meant 2048; confirm
    temperature=0.5,  # moderate creativity for a conversational script
    custom_role_conversions=None,
)

# ------------------------------------------------------------------
# XTTS‑v2 multilingual text‑to‑speech (≈ 1.2 GB, CPU OK)
# ------------------------------------------------------------------
# Coqui XTTS-v2: open multilingual TTS model; no HF token required.
TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Instantiating TTS here downloads (~1.2 GB) and caches the model on first
# run, so the import/startup cost is paid once per container.
tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
# Automatically downloads and caches the model on first run.

# ISO-639-1 code -> display name for every language the app produces.
# Iteration order here fixes the order of the Gradio audio outputs.
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English"},
    "bn": {"name": "Bangla"},
    "zh": {"name": "Chinese"},
    "ur": {"name": "Urdu"},
    "ne": {"name": "Nepali"},
}

# Prompt sent to the LLM once per language; format() fills {lang_name} and
# {content} in generate_podcast().
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of about 1200 words.
    Use an engaging style: hosts ask each other questions, clarify ideas, add
    simple analogies, and conclude with a short recap. Keep technical accuracy.

    ### Lecture Content
    {content}
    """
)

# ------------------------------------------------------------------
# Utility: extract & truncate PDF text to fit the LLM token budget
# ------------------------------------------------------------------

def extract_pdf_text(pdf_file) -> str:
    """Return the text of every page in *pdf_file*, newline-separated.

    Pages whose extraction yields ``None`` contribute an empty string so the
    join never fails.
    """
    chunks = []
    for page in PdfReader(pdf_file).pages:
        chunks.append(page.extract_text() or "")
    return "\n".join(chunks)

# Word-count budget kept well under the model's context window
# (prompt + response headroom).
TOKEN_LIMIT = 6000

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Clip *text* to at most *limit* whitespace-separated words.

    Splitting on whitespace also normalizes runs of spaces/newlines, so the
    result is single-space separated.
    """
    return " ".join(text.split()[:limit])

# ------------------------------------------------------------------
# Main generation routine
# ------------------------------------------------------------------

def generate_podcast(pdf: gr.File) -> List[str]:
    """Generate one podcast WAV per language in ``LANG_INFO`` from a lecture PDF.

    Returns a list of WAV file paths (one per language, in ``LANG_INFO``
    order) suitable for the ``gr.Audio(type="filepath")`` outputs.
    """
    # BUG FIX: the original used tempfile.TemporaryDirectory() as a context
    # manager, which deletes the directory — and every WAV in it — as soon as
    # the function returns, leaving Gradio with dangling paths. Use a
    # persistent mkdtemp() directory instead so the files outlive this call.
    tmpdir = tempfile.mkdtemp(prefix=f"podcast_{uuid.uuid4().hex}_")
    lecture_text = truncate_text(extract_pdf_text(pdf.name))
    audio_outputs: List[str] = []

    for lang_code, info in LANG_INFO.items():
        # 1. Turn the lecture text into a two-host dialogue via the LLM.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue = llm(prompt)

        # 2. Save the raw dialogue text alongside the audio for reference.
        txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(dialogue)

        # 3. Synthesise speech with XTTS-v2 (accepts ISO-639-1 codes directly).
        wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
        tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)

        # BUG FIX: gr.Audio(type="filepath") expects a plain path string;
        # the original appended (wav_path, None) tuples, which Gradio would
        # misinterpret as (sample_rate, data).
        audio_outputs.append(wav_path)

    return audio_outputs

# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------

# One Audio output per entry in LANG_INFO, in the same order that
# generate_podcast() appends its results.
audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
]

# Single-function Gradio app: PDF in, five language-specific podcasts out.
iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and receive a two‑host audio podcast in English, "
        "Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the "
        "dialogue and Coqui XTTS‑v2 for speech synthesis — no private repos "
        "or API keys needed."
    ),
)

if __name__ == "__main__":
    # Launch the Gradio server (blocks until interrupted).
    iface.launch()