File size: 9,376 Bytes
3ac5c08 7c4326e 3ac5c08 eab7fca 7c4326e 8266b8d dca7813 3ac5c08 7c4326e 2ad5135 dca7813 fc0739a d756e7e 2ad5135 7c4326e eab7fca 8266b8d 2ad5135 8266b8d 2ad5135 8266b8d 2ad5135 8266b8d 2ad5135 25acce6 eab7fca 7c4326e eab7fca 7c4326e eab7fca 3ac5c08 7c4326e 3ac5c08 7c4326e 3ac5c08 d756e7e 7c4326e 3ac5c08 7c4326e 3ac5c08 7c4326e 3ac5c08 7c4326e 3ac5c08 7c4326e 03eec30 7c4326e 03eec30 7c4326e 3ac5c08 7c4326e 3ac5c08 7c4326e eab7fca 7c4326e eab7fca 7c4326e 3ac5c08 7c4326e dca7813 3ac5c08 7c4326e 3ac5c08 7c4326e 03eec30 7c4326e 3ac5c08 7c4326e 3ac5c08 03eec30 7c4326e 3ac5c08 7c4326e 3ac5c08 25acce6 3ac5c08 7c4326e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
import os

# Cache/HOME environment variables must be exported BEFORE importing the
# heavyweight ML libraries below: torch / moshi / transformers resolve their
# cache directories at import time, and HF Spaces only permits writes under
# /tmp. (Previously these were set after the imports, which silently
# defeated their purpose.)
os.environ["HOME"] = "/tmp/home"  # Prevent fallback to /root
os.makedirs("/tmp/home", exist_ok=True)
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["MOSHI_CACHE_DIR"] = "/tmp/moshi"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface/transformers"

import tempfile
import shutil
import ast
import numpy as np
import soundfile as sf
import warnings
import multiprocessing
import concurrent.futures
import urllib.request
import pathlib

try:
    from moshi.models.tts import TTSModel
except ImportError:
    # Kyutai TTS is optional; the Kokoro path still works without it.
    print("Moshi TTSModel not available - install Kyutai's version via pip.")
    TTSModel = None

from notebook_lm_kokoro import (
    generate_podcast_script,
    generate_audio_from_script,
    generate_audio_kyutai,
    KPipeline,
)

import gradio as gr

warnings.filterwarnings("ignore")

# Kept for backward compatibility; segments are currently rendered sequentially.
NUM_WORKERS = multiprocessing.cpu_count()
def ensure_gradio_frpc():
    """
    Make sure the frpc tunnelling binary that Gradio's `share=True` feature
    needs is present, downloading it on first use.

    Everything lives under GRADIO_TEMP_DIR (default /tmp/gradio) because
    /.cache symlinks are not writable in HF Spaces.
    """
    base_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio")
    binary_dir = os.path.join(base_dir, "frpc")
    os.makedirs(binary_dir, exist_ok=True)
    binary_path = os.path.join(binary_dir, "frpc_linux_amd64_v0.3")

    # Fast path: nothing to do when the binary was already fetched.
    if os.path.exists(binary_path):
        print("[INFO] frpc binary already exists at expected path.")
        return

    print(f"[INFO] Downloading frpc binary to: {binary_path}")
    try:
        source_url = "https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64"
        urllib.request.urlretrieve(source_url, binary_path)
        os.chmod(binary_path, 0o755)  # Make it executable
        print("[SUCCESS] frpc binary downloaded and made executable.")
    except Exception as e:
        # Best-effort: a failed download only disables the share tunnel.
        print(f"[ERROR] Failed to download frpc binary: {e}")
def process_segment(entry_and_voice_map):
    """
    Synthesize one transcript entry with Kokoro.

    Parameters
    ----------
    entry_and_voice_map : tuple
        ((speaker, dialogue), voice_map), where voice_map maps speaker
        labels to Kokoro voice names.

    Returns
    -------
    np.ndarray | None
        Concatenated audio for the dialogue, or None when the pipeline
        produced no audio chunks.
    """
    entry, voice_map = entry_and_voice_map
    speaker, dialogue = entry
    chosen_voice = voice_map.get(speaker, "af_heart")  # fallback voice for unknown speakers
    pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
    # BUG FIX: the original guarded with `if generator else None`, but generator
    # objects are always truthy, so an empty generator crashed np.concatenate
    # with "need at least one array to concatenate". Materialize the chunks
    # first and only concatenate when there is something to join.
    chunks = [audio for _, _, audio in pipeline(dialogue, voice=chosen_voice)]
    return np.concatenate(chunks, axis=0) if chunks else None
def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
    """
    Render a two-speaker transcript to a single WAV file.

    `script` is expected to be a Python-literal list of (speaker, dialogue)
    pairs; a one-second pause is inserted between consecutive segments.

    Returns the output path on success, or None when the transcript cannot
    be parsed or yields no audio.
    """
    print("[DEBUG] Raw transcript string:")
    print(script)
    voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}
    try:
        parsed = ast.literal_eval(script)
        if not isinstance(parsed, list):
            raise ValueError("Transcript is not a list")

        segments = []
        for item in parsed:
            rendered = process_segment((item, voice_map))
            if rendered is not None:
                segments.append(rendered)
        if not segments:
            return None

        sample_rate = 24000
        gap = np.zeros(sample_rate, dtype=np.float32)  # 1 s of silence
        # Interleave silence between segments, then join in a single pass.
        pieces = [segments[0]]
        for seg in segments[1:]:
            pieces.append(gap)
            pieces.append(seg)
        combined = np.concatenate(pieces, axis=0)

        sf.write(output_file, combined, sample_rate)
        return output_file
    except Exception as e:
        print(f"Transcript parse error: {e}")
        return None
def process_pdf(pdf_file, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
                provider, openai_key=None, openrouter_key=None, openrouter_base=None, tts_engine=None):
    """
    End-to-end pipeline behind the Gradio submit button: validate keys,
    export provider environment variables, build a transcript from the PDF,
    then synthesize it with the selected TTS engine.

    Returns
    -------
    tuple[str, str | None]
        (status message, audio file path or None on failure).
    """
    try:
        # --- key validation ---------------------------------------------
        if provider == "openai" and not openai_key:
            return "OpenAI API key is required", None
        if provider == "openrouter" and not openrouter_key:
            return "OpenRouter API key is required", None

        # --- provider environment setup ---------------------------------
        # "kyutai" deliberately matches both branches; the OpenRouter block
        # runs second and therefore wins when its key is present.
        if provider in ("openai", "kyutai"):
            os.environ["OPENAI_API_KEY"] = openai_key or ""
            os.environ["OPENROUTER_API_BASE"] = "https://api.openai.com/v1"
        if provider in ("openrouter", "kyutai"):
            os.environ["OPENAI_API_KEY"] = openrouter_key or ""
            os.environ["OPENROUTER_API_BASE"] = openrouter_base or "https://openrouter.ai/api/v1"

        if pdf_file is None:
            return "No file uploaded", None

        # --- transcript generation --------------------------------------
        source_path = pdf_file.name
        if provider == "kyutai" and openrouter_key:
            script_provider = "openrouter"
        else:
            script_provider = provider
        transcript, _ = generate_podcast_script(source_path, provider=script_provider)
        if transcript is None:
            return "Transcript generation failed: got None", None
        if not transcript.strip().startswith("["):
            return f"Malformed transcript:\n{transcript}", None

        # --- audio synthesis --------------------------------------------
        wav_name = f"audio_{os.path.basename(source_path).replace('.pdf', '.wav')}"
        audio_path = os.path.join(os.path.dirname(source_path), wav_name)
        if tts_engine == "kyutai":
            result = generate_audio_kyutai(transcript, kyutai_voice1, kyutai_voice2, audio_path)
        else:
            result = generate_audio_from_script_with_voices(transcript, speaker1_voice, speaker2_voice, audio_path)

        if result:
            return "Process complete!", result
        return "Error generating audio", None
    except Exception as e:
        print(f"process_pdf error: {e}")
        return f"Error: {e}", None
def update_ui(provider, tts_engine):
    """
    Recompute visibility for the seven provider/engine-dependent widgets.

    Output order matches the Gradio outputs list:
    [speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
     openai_key, openrouter_key, openrouter_base]
    """
    kokoro_selected = tts_engine == "kokoro"
    kyutai_selected = tts_engine == "kyutai"
    visibility = [
        kokoro_selected,                       # speaker1_voice
        kokoro_selected,                       # speaker2_voice
        kyutai_selected,                       # kyutai_voice1
        kyutai_selected,                       # kyutai_voice2
        provider in ("openai", "kyutai"),      # openai_key
        provider in ("openrouter", "kyutai"),  # openrouter_key
        provider == "openrouter",              # openrouter_base
    ]
    return [gr.update(visible=flag) for flag in visibility]
def create_gradio_app():
    """Build and return the Gradio Blocks UI for the PDF-to-podcast app.

    Layout: a left column with the PDF upload, provider/engine radios,
    voice dropdowns and an API-key accordion; a right column with the
    status textbox and the rendered podcast audio. update_ui() refreshes
    widget visibility whenever the provider or TTS engine changes.

    NOTE(review): several label strings below contain mojibake (e.g.
    "π§") — they look like misencoded emoji from an earlier save, but they
    are runtime strings, so they are preserved here as-is; confirm and fix
    at the source encoding.
    """
    css = ".gradio-container {max-width: 900px !important}"
    with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
        gr.Markdown("# π§ PDF to Podcast β NotebookLM + Kokoro/Kyutai")
        with gr.Row():
            with gr.Column(scale=1.5):
                # Inputs: source PDF plus provider / engine selection.
                pdf_input = gr.File(file_types=[".pdf"], type="filepath", label="π Upload your PDF")
                provider = gr.Radio(["openai", "openrouter"], value="openrouter", label="π§ API Provider")
                tts_engine = gr.Radio(["kokoro", "kyutai"], value="kokoro", label="π€ TTS Engine")
                # Kokoro voices — shown when tts_engine == "kokoro" (see update_ui).
                speaker1_voice = gr.Dropdown(["af_heart","af_bella","hf_beta"], value="af_heart", label="Speaker 1 Voice", visible=True)
                speaker2_voice = gr.Dropdown(["af_nicole","af_heart","bf_emma"], value="bf_emma", label="Speaker 2 Voice", visible=True)
                # Kyutai reference-audio voices — shown when tts_engine == "kyutai".
                kyutai_voice1 = gr.Dropdown(
                    [
                        "expresso/ex03-ex01_happy_001_channel1_334s.wav",
                        "expresso/ex03-ex02_narration_001_channel1_674s.wav",
                        "vctk/p226_023_mic1.wav"
                    ],
                    value="expresso/ex03-ex01_happy_001_channel1_334s.wav",
                    label="Kyutai Voice 1",
                    visible=True
                )
                kyutai_voice2 = gr.Dropdown(
                    [
                        "expresso/ex03-ex01_happy_001_channel1_334s.wav",
                        "expresso/ex03-ex02_narration_001_channel1_674s.wav",
                        "vctk/p225_023_mic1.wav"
                    ],
                    value="expresso/ex03-ex02_narration_001_channel1_674s.wav",
                    label="Kyutai Voice 2",
                    visible=True
                )
                with gr.Accordion("π API Keys", open=True):
                    openai_key = gr.Textbox(type="password", label="OpenAI Key", show_label=True, visible=True)
                    openrouter_key = gr.Textbox(type="password", label="OpenRouter Key", show_label=True, visible=True)
                    openrouter_base = gr.Textbox(placeholder="https://openrouter.ai/api/v1", label="OpenRouter Base URL", visible=True)
                submit_btn = gr.Button("ποΈ Generate Podcast", variant="primary")
            with gr.Column(scale=1):
                # Outputs: status line and the finished audio file.
                status_output = gr.Textbox(label="π Status", interactive=False)
                audio_output = gr.Audio(type="filepath", label="π΅ Your Podcast")
        # Wire the button to the full pipeline; input order must match
        # process_pdf's parameter order exactly.
        submit_btn.click(
            process_pdf,
            inputs=[pdf_input, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
                    provider, openai_key, openrouter_key, openrouter_base, tts_engine],
            outputs=[status_output, audio_output]
        )
        # Both radios trigger the same visibility refresh over the same
        # seven widgets (order must match update_ui's return order).
        provider.change(update_ui, [provider, tts_engine],
                        [speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
                         openai_key, openrouter_key, openrouter_base])
        tts_engine.change(update_ui, [provider, tts_engine],
                          [speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
                           openai_key, openrouter_key, openrouter_base])
        gr.Markdown("""
        **π Tips**
        - Pick your API provider and then set appropriate keys.
        - Choose **TTS Engine** (Kokoro/Kyutai) to reveal relevant voice options.
        - Works well with clean, structured PDFs.
        """)
    return app
# Runs at import time (not under the __main__ guard) so the frpc binary is
# in place before Gradio requests the share tunnel below.
ensure_gradio_frpc()
if __name__ == "__main__":
    # share=True depends on the frpc binary prepared above; debug/pwa are
    # convenience flags for the hosted (HF Spaces-style) deployment.
    create_gradio_app().queue().launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True, pwa=True)