# =============================================================
# Lecture → Podcast & Script Generator (Gemini + HF TTS)
# Modified: Script outputs rendered as HTML
# =============================================================
import html
import os
import re
import tempfile
import textwrap
from pathlib import Path
from typing import List, Dict, Optional, Any

import gradio as gr
from PyPDF2 import PdfReader
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError

# Hugging Face TTS
from huggingface_hub import InferenceClient

# Google Gemini
import google.generativeai as genai

# ------------------------------------------------------------------
# HF TTS client
# ------------------------------------------------------------------
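# The client is created only when HF_TOKEN is set; without it the app still
# generates scripts but skips audio synthesis.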
hf_token = os.getenv("HF_TOKEN")
hf_tts_client: Optional[InferenceClient] = InferenceClient(token=hf_token) if hf_token else None

# Language metadata
LANG_INFO: Dict[str, Dict[str, str]] = {
    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
    "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
    "ur": {"name": "Urdu",    "tts_model": "facebook/mms-tts-urd"},
    "ne": {"name": "Nepali",  "tts_model": "facebook/mms-tts-npi"},
}
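# Reverse lookup: UI display name -> language code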
LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}

# Prompt template
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two-host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of **approximately 300 words**.
    Make it engaging: hosts ask questions, clarify ideas with analogies, and
    wrap up with a concise recap. Preserve technical accuracy.

    ### Lecture Content
    {content}
    """
)

# PDF extraction
TOKEN_LIMIT = 8000

def extract_pdf_text(path: str) -> str:
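    """Concatenate the extracted text of every PDF page (pages with no text contribute an empty string)."""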
    reader = PdfReader(path)
    return "\n".join(p.extract_text() or "" for p in reader.pages)

def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
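    """Cap the text at `limit` whitespace-separated words as a rough proxy for a token budget."""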
    words = text.split()
    return " ".join(words[:limit]) if len(words) > limit else text

# TTS chunking
CHUNK_CHAR_LIMIT = 280

def split_chunks(text: str) -> List[str]:
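    """Split the text at sentence boundaries into chunks of at most ~CHUNK_CHAR_LIMIT characters for TTS requests."""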
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    chunks, curr = [], ""
    for s in sentences:
        if curr and len(curr) + len(s) + 1 > CHUNK_CHAR_LIMIT:
            chunks.append(curr)
            curr = s
        else:
            curr = f"{curr} {s}" if curr else s
    if curr: chunks.append(curr)
    return chunks

# Synthesize speech
def synthesize(text: str, model_id: str, outdir: Path) -> str:
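    """Synthesize each text chunk via the HF Inference API and concatenate the FLAC parts into one file."""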
    outdir.mkdir(parents=True, exist_ok=True)  # per-language output dir may not exist yet
    segments = []
    for i, chunk in enumerate(split_chunks(text)):
        audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
        path = outdir / f"part{i}.flac"
        path.write_bytes(audio_bytes)
        try:
            seg = AudioSegment.from_file(path, format="flac")
        except CouldntDecodeError as exc:
            raise gr.Error(f"Could not decode TTS audio for chunk {i}: {exc}")
        segments.append(seg)
    final = sum(segments, AudioSegment.empty())
    out = outdir / "podcast.flac"
    final.export(out, format="flac")
    return str(out)

# Main pipeline
def generate_podcast(
    gemini_key: str,
    pdf_file: gr.File,
    langs: List[str]
) -> List[Optional[Any]]:
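    """For each language in LANG_INFO order, return an (audio path, HTML script, script file) triple;
    unselected languages contribute (None, None, None) so results stay aligned with the output components."""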
    if not gemini_key:
        raise gr.Error("Enter Google AI Studio API Key.")
    if not pdf_file:
        raise gr.Error("Upload a PDF file.")
    if not langs:
        raise gr.Error("Select at least one language.")

    genai.configure(api_key=gemini_key)
    raw = extract_pdf_text(pdf_file.name)
    content = truncate_text(raw)

    tmp = Path(tempfile.mkdtemp())
    results = []

    for code, info in LANG_INFO.items():
        if info["name"] not in langs:
            results.extend([None, None, None])
            continue
        # Generate script
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
        model = genai.GenerativeModel("gemini-1.5-flash-latest")
        resp = model.generate_content(prompt)
        script = resp.text.strip()
        # Save plain text
        script_path = tmp / f"script_{code}.txt"
        script_path.write_text(script, encoding="utf-8")
        # Render HTML version (escape the script so raw angle brackets don't break the markup)
        html_script = f"<pre>{html.escape(script)}</pre>"
        # Synthesize audio if available
        audio_path = None
        if hf_tts_client:
            audio_path = synthesize(script, info["tts_model"], tmp / code)
        results.extend([audio_path, html_script, str(script_path)])
    return results

# Interface
inputs = [
    gr.Textbox(label="Google AI Studio API Key", type="password"),
    gr.File(label="Lecture PDF", file_types=[".pdf"]),
    gr.CheckboxGroup(choices=[info["name"] for info in LANG_INFO.values()],
                     value=["English"], label="Languages")
]
outputs = []
for code, info in LANG_INFO.items():
    outputs.append(gr.Audio(label=f"{info['name']} Podcast", type="filepath"))
    outputs.append(gr.HTML(label=f"{info['name']} Script HTML"))
    outputs.append(gr.File(label=f"Download {info['name']} Script"))

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script",
)

if __name__ == "__main__":
    iface.launch()