File size: 4,571 Bytes
f1adb14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# =============================================================
# Hugging Face Space – Lecture → Multilingual Podcast Generator
# =============================================================
# Uses SmolAgents HfApiModel for text generation and HF audio
# pipeline for speech. Generates two‑host dialogues in five
# languages (English, Bangla, Chinese, Urdu, Nepali) directly
# from a PDF lecture upload.
# -----------------------------------------------------------------

import os
import tempfile
import uuid
import textwrap
from typing import List, Dict

import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline  # for audio generation (e.g., xtts)
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool

# ------------------------------------------------------------------
# LLM configuration (SmolAgents wrapper for HF Inference API)
# ------------------------------------------------------------------
llm = HfApiModel(
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # 34B parameter instruct model
    max_tokens=2096,
    temperature=0.5,
    custom_role_conversions=None,
)

# ------------------------------------------------------------------
# Audio model (multilingual text ➜ speech); choose an open xtts‑v2
# model that supports our languages. Switch model id if you prefer.
# ------------------------------------------------------------------
audio_pipe = pipeline(
    "text-to-audio",
    model="suno/xtts_v2",
    framework="pt",
)

LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "speaker": "hostA"},
    "bn": {"name": "Bangla", "speaker": "hostB"},
    "zh": {"name": "Chinese", "speaker": "hostC"},
    "ur": {"name": "Urdu", "speaker": "hostD"},
    "ne": {"name": "Nepali", "speaker": "hostE"},
}

PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of about 1200 words.
    Use an engaging style: hosts ask each other questions, clarify ideas, add
    simple analogies, and conclude with a short recap. Keep technical accuracy.

    ### Lecture Content
    {content}
    """
)

# ------------------------------------------------------------------
# Utility: extract & truncate PDF text to fit LLM token budget
# ------------------------------------------------------------------

def extract_pdf_text(pdf_file) -> str:
    reader = PdfReader(pdf_file)
    raw = "\n".join(p.extract_text() or "" for p in reader.pages)
    return raw

TOKEN_LIMIT = 6000  # conservative words (≈ tokens) for prompt+response


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    words = text.split()
    return " ".join(words[:limit])

# ------------------------------------------------------------------
# Main generation function
# ------------------------------------------------------------------

def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
    with tempfile.TemporaryDirectory() as tmpdir:
        lecture_text = truncate_text(extract_pdf_text(pdf.name))
        audio_outputs = []
        for lang_code, info in LANG_INFO.items():
            prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
            # --- Generate dialogue ---
            dialogue = llm(prompt)

            # Save text for transparency/debug
            text_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
            with open(text_path, "w", encoding="utf-8") as f:
                f.write(dialogue)

            # --- TTS ---
            audio = audio_pipe(dialogue, forward_params={"language": lang_code})
            wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
            audio["audio"].export(wav_path, format="wav")
            audio_outputs.append((wav_path, None))  # Gradio Audio expects (file, label)

        return audio_outputs

# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------

audio_components = [gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description="Upload a lecture PDF and get a two‑host audio podcast in English, Bangla, Chinese, Urdu, and Nepali."
)

if __name__ == "__main__":
    iface.launch()