udayl committed
Commit 7c4326e · 1 Parent(s): eab7fca

feat: kyutai moshi

Files changed (3)
  1. gradio_app.py +136 -230
  2. notebook_lm_kokoro.py +71 -1
  3. requirements.txt +3 -1
gradio_app.py CHANGED
@@ -1,281 +1,187 @@
-# filepath: /Users/udaylunawat/Downloads/Data-Science-Projects/NotebookLM_clone/gradio_app.py
 import os
 import tempfile
 import gradio as gr
-from notebook_lm_kokoro import generate_podcast_script, KPipeline
-import soundfile as sf
-import numpy as np
-import ast
 import shutil
+import ast
+import numpy as np
+import soundfile as sf
 import warnings
-import os
-import gradio as gr
-import concurrent.futures
 import multiprocessing
-from notebook_lm_kokoro import generate_podcast_script, generate_audio_from_script
-warnings.filterwarnings("ignore")
+import concurrent.futures
 
-# Define number of workers based on CPU cores
-NUM_WORKERS = multiprocessing.cpu_count()  # Gets total CPU cores
+try:
+    from moshi.models.tts import TTSModel
+except ImportError:
+    print("Moshi TTSModel not available — install Kyutai’s version via pip.")
+    TTSModel = None
+
+from notebook_lm_kokoro import (
+    generate_podcast_script,
+    generate_audio_from_script,
+    generate_audio_kyutai,
+    KPipeline,
+)
+
+warnings.filterwarnings("ignore")
+NUM_WORKERS = multiprocessing.cpu_count()
 
 def process_segment(entry_and_voice_map):
-    entry, voice_map = entry_and_voice_map  # Unpack the tuple
+    entry, voice_map = entry_and_voice_map
     speaker, dialogue = entry
     chosen_voice = voice_map.get(speaker, "af_heart")
-    print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")
-
     pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
     generator = pipeline(dialogue, voice=chosen_voice)
-
-    segment_audio = []
-    for _, _, audio in generator:
-        segment_audio.append(audio)
-
-    if segment_audio:
-        return np.concatenate(segment_audio, axis=0)
-    return None
+    return np.concatenate([audio for _, _, audio in generator], axis=0) if generator else None
 
 def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
-    voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}
+    print("[DEBUG] Raw transcript string:")
+    print(script)
 
-    # Clean up the script string if needed
-    script = script.strip()
-    if not script.startswith("[") or not script.endswith("]"):
-        print("Invalid transcript format. Expected a list of tuples.")
-        return None
-
+    voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}
    try:
         transcript_list = ast.literal_eval(script)
         if not isinstance(transcript_list, list):
             raise ValueError("Transcript is not a list")
 
-        all_audio_segments = []
-        # Prepare input data with voice_map for each entry
-        entries_with_voice_map = [(entry, voice_map) for entry in transcript_list]
-
-        try:
-            # Process segments in parallel
-            with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
-                # Map the processing function across all dialogue entries
-                results = list(executor.map(process_segment, entries_with_voice_map))
-
-            # Filter out None results and combine audio segments
-            all_audio_segments = [r for r in results if r is not None]
-
-        except Exception as e:
-            print(f"Error during audio generation: {e}")
-            return None
-
-        if not all_audio_segments:
-            print("No audio segments were generated")
+        entries = [(entry, voice_map) for entry in transcript_list if isinstance(entry, tuple) and len(entry) == 2]
+        with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
+            results = [r for r in executor.map(process_segment, entries) if r is not None]
+        if not results:
             return None
-
-        # Add a pause between segments
         sample_rate = 24000
         pause = np.zeros(sample_rate, dtype=np.float32)
-        final_audio = all_audio_segments[0]
-        for seg in all_audio_segments[1:]:
+        final_audio = results[0]
+        for seg in results[1:]:
             final_audio = np.concatenate((final_audio, pause, seg), axis=0)
-
         sf.write(output_file, final_audio, sample_rate)
-        print(f"Saved final audio as {output_file}")
         return output_file
-
     except Exception as e:
-        print(f"Error processing transcript: {e}")
+        print(f"Transcript parse error: {e}")
         return None
 
-
-def process_pdf(pdf_file, speaker1_voice, speaker2_voice, provider, api_key, openrouter_base=None):
-    """Process the uploaded PDF file and generate audio"""
+def process_pdf(pdf_file, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                provider, openai_key=None, openrouter_key=None, openrouter_base=None, tts_engine=None):
     try:
-
-        # Set API configuration based on provider
-        if provider == "openai":
-            os.environ["OPENAI_API_KEY"] = api_key
+        if provider == "openai" and not openai_key:
+            return "OpenAI API key is required", None
+        if provider == "openrouter" and not openrouter_key:
+            return "OpenRouter API key is required", None
+
+        if provider in ["openai", "kyutai"]:
+            os.environ["OPENAI_API_KEY"] = openai_key or ""
             os.environ["OPENROUTER_API_BASE"] = "https://api.openai.com/v1"
-        else:
-            os.environ["OPENAI_API_KEY"] = api_key
+        if provider in ["openrouter", "kyutai"]:
+            os.environ["OPENAI_API_KEY"] = openrouter_key or ""
             os.environ["OPENROUTER_API_BASE"] = openrouter_base or "https://openrouter.ai/api/v1"
-        # Check if we received a valid file
+
         if pdf_file is None:
             return "No file uploaded", None
-
-        # Create a temporary file with .pdf extension
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
-            # For Gradio uploads, we need to copy the file
-            shutil.copy2(pdf_file.name, tmp.name)
-            tmp_path = tmp.name
-
-        print(f"Uploaded PDF saved at {tmp_path}")
 
-        # Generate transcript using your existing function
-        transcript, transcript_path = generate_podcast_script(tmp_path, provider=provider)
-        if transcript is None:
-            return "Error generating transcript", None
+        tmp_path = pdf_file.name
 
-        # Define an output file path for the generated audio
-        audio_output_path = os.path.join(
-            os.path.dirname(tmp_path),
-            f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}"
-        )
+        script_provider = "openrouter" if provider == "kyutai" and openrouter_key else provider
+        transcript, _ = generate_podcast_script(pdf_file.name, provider=script_provider)
 
-        # result = generate_audio_from_script_with_voices(
-        #     transcript,
-        #     speaker1_voice,
-        #     speaker2_voice,
-        #     output_file=audio_output_path
-        # )
+        if transcript is None:
+            return "Transcript generation failed: got None", None
+        if not transcript.strip().startswith("["):
+            return f"Malformed transcript:\n{transcript}", None
 
-        # Use ProcessPoolExecutor with explicit number of workers
-        with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
-            print(f"Processing with {NUM_WORKERS} CPU cores")
-            # Submit audio generation task to the executor
-            future = executor.submit(
-                generate_audio_from_script_with_voices,
-                transcript, speaker1_voice, speaker2_voice, audio_output_path
-            )
-            result = future.result()
-
-        if result is None:
-            return "Error generating audio", None
-
-        return "Process complete!", result
+        audio_path = os.path.join(os.path.dirname(tmp_path), f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}")
 
-    except Exception as e:
-        print(f"Error in process_pdf: {str(e)}")
-        return f"Error processing file: {str(e)}", None
-
-        if result is None:
-            return "Error generating audio", None
-
-        return "Process complete!", result
+        if tts_engine == "kyutai":
+            result = generate_audio_kyutai(transcript, kyutai_voice1, kyutai_voice2, audio_path)
+        else:
+            with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
+                result = executor.submit(
+                    generate_audio_from_script_with_voices,
+                    transcript, speaker1_voice, speaker2_voice, audio_path
+                ).result()
 
-    except Exception as e:
-        print(f"Error in process_pdf: {str(e)}")
-        return f"Error processing file: {str(e)}", None
+        return ("Process complete!", result) if result else ("Error generating audio", None)
+    except Exception as e:
+        print(f"process_pdf error: {e}")
+        return f"Error: {e}", None
 
+def update_ui(provider, tts_engine):
+    return [
+        gr.update(visible=tts_engine == "kokoro"),
+        gr.update(visible=tts_engine == "kokoro"),
+        gr.update(visible=tts_engine == "kyutai"),
+        gr.update(visible=tts_engine == "kyutai"),
+        gr.update(visible=provider in ["openai", "kyutai"]),
+        gr.update(visible=provider in ["openrouter", "kyutai"]),
+        gr.update(visible=provider == "openrouter"),
+    ]
 
 def create_gradio_app():
-    # Add CSS for better styling
-    css = """
-    .gradio-container {max-width: 900px !important}
-    """
-
+    css = ".gradio-container {max-width: 900px !important}"
     with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
-        gr.Markdown(
-            """
-            # 📚 NotebookLM-Kokoro TTS App
-            Upload a PDF, choose voices, and generate conversational audio using Kokoro TTS.
-            """
-        )
-
+        gr.Markdown("# 🎧 PDF to Podcast — NotebookLM + Kokoro/Kyutai")
+
         with gr.Row():
-            with gr.Column(scale=2):
-                pdf_input = gr.File(
-                    label="Upload PDF Document",
-                    file_types=[".pdf"],
-                    type="filepath"
-                )
-
-                with gr.Row():
-                    speaker1_voice = gr.Dropdown(
-                        choices=["af_heart", "af_bella", "hf_beta"],
-                        value="af_heart",
-                        label="Speaker 1 Voice"
-                    )
-                    speaker2_voice = gr.Dropdown(
-                        choices=["af_nicole", "af_heart", "bf_emma"],
-                        value="bf_emma",
-                        label="Speaker 2 Voice"
-                    )
-
-                with gr.Group():
-                    provider = gr.Radio(
-                        choices=["openai", "openrouter"],
-                        value="openrouter",
-                        label="API Provider"
-                    )
-
-                    api_key = gr.Textbox(
-                        label="API Key",
-                        placeholder="Enter your API key here...",
-                        type="password",
-                        elem_classes="api-input"
-                    )
-
-                    openrouter_base = gr.Textbox(
-                        label="OpenRouter Base URL (optional)",
-                        placeholder="https://openrouter.ai/api/v1",
-                        visible=False,
-                        elem_classes="api-input"
-                    )
-
-                    # Show/hide OpenRouter base URL based on provider selection
-                    def toggle_openrouter_base(provider_choice):
-                        return gr.update(visible=provider_choice == "openrouter")
-
-                    provider.change(
-                        fn=toggle_openrouter_base,
-                        inputs=[provider],
-                        outputs=[openrouter_base]
-                    )
-
-                submit_btn = gr.Button("🎙️ Generate Audio", variant="primary")
-
-            with gr.Column(scale=2):
-                status_output = gr.Textbox(
-                    label="Status",
-                    placeholder="Processing status will appear here..."
-                )
-                audio_output = gr.Audio(
-                    label="Generated Audio",
-                    type="filepath"
-                )
-
-        # # Examples section
-        # gr.Examples(
-        #     examples=[
-        #         ["sample.pdf", "af_heart", "af_nicole", "openrouter", "your-api-key-here", "https://openrouter.ai/api/v1"],
-        #     ],
-        #     inputs=[pdf_input, speaker1_voice, speaker2_voice, provider, api_key, openrouter_base],
-        #     outputs=[status_output, audio_output],
-        #     fn=process_pdf,
-        #     cache_examples=True,
-        # )
-
-        submit_btn.click(
-            fn=process_pdf,
-            inputs=[
-                pdf_input,
-                speaker1_voice,
-                speaker2_voice,
-                provider,
-                api_key,
-                openrouter_base
-            ],
-            outputs=[status_output, audio_output],
-            api_name="generate"
-        )
-
-        gr.Markdown(
-            """
-            ### 📝 Notes
-            - Make sure your PDF is readable and contains text (not scanned images)
-            - Processing large PDFs may take a few minutes
-            - You need a valid OpenAI/OpenRouter API key set as environment variable
-            """
-        )
-
+            with gr.Column(scale=1.5):
+                pdf_input = gr.File(file_types=[".pdf"], type="filepath", label="📄 Upload your PDF")
+                provider = gr.Radio(["openai", "openrouter"], value="openrouter", label="🧠 API Provider")
+                tts_engine = gr.Radio(["kokoro", "kyutai"], value="kokoro", label="🎤 TTS Engine")
+
+                speaker1_voice = gr.Dropdown(["af_heart","af_bella","hf_beta"], value="af_heart", label="Speaker 1 Voice", visible=True)
+                speaker2_voice = gr.Dropdown(["af_nicole","af_heart","bf_emma"], value="bf_emma", label="Speaker 2 Voice", visible=True)
+                kyutai_voice1 = gr.Dropdown(
+                    [
+                        "expresso/ex03-ex01_happy_001_channel1_334s.wav",
+                        "expresso/ex03-ex02_narration_001_channel1_674s.wav",
+                        "vctk/p226_023_mic1.wav"
+                    ],
+                    value="expresso/ex03-ex01_happy_001_channel1_334s.wav",
+                    label="Kyutai Voice 1",
+                    visible=True
+                )
+
+                kyutai_voice2 = gr.Dropdown(
+                    [
+                        "expresso/ex03-ex01_happy_001_channel1_334s.wav",
+                        "expresso/ex03-ex02_narration_001_channel1_674s.wav",
+                        "vctk/p225_023_mic1.wav"
+                    ],
+                    value="expresso/ex03-ex02_narration_001_channel1_674s.wav",
+                    label="Kyutai Voice 2",
+                    visible=True
+                )
+
+                with gr.Accordion("🔐 API Keys", open=True):
+                    openai_key = gr.Textbox(type="password", label="OpenAI Key", show_label=True, visible=True)
+                    openrouter_key = gr.Textbox(type="password", label="OpenRouter Key", show_label=True, visible=True)
+                    openrouter_base = gr.Textbox(placeholder="https://openrouter.ai/api/v1", label="OpenRouter Base URL", visible=True)
+
+                submit_btn = gr.Button("🎙️ Generate Podcast", variant="primary")
+
+            with gr.Column(scale=1):
+                status_output = gr.Textbox(label="📝 Status", interactive=False)
+                audio_output = gr.Audio(type="filepath", label="🎵 Your Podcast")
+
+        submit_btn.click(
+            process_pdf,
+            inputs=[pdf_input, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                    provider, openai_key, openrouter_key, openrouter_base, tts_engine],
+            outputs=[status_output, audio_output]
+        )
+
+        provider.change(update_ui, [provider, tts_engine],
+                        [speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                         openai_key, openrouter_key, openrouter_base])
+        tts_engine.change(update_ui, [provider, tts_engine],
+                          [speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
+                           openai_key, openrouter_key, openrouter_base])
+
+        gr.Markdown("""
+        **📌 Tips**
+        - Pick your API provider and then set appropriate keys.
+        - Choose **TTS Engine** (Kokoro/Kyutai) to reveal relevant voice options.
+        - Works well with clean, structured PDFs.
+        """)
+
     return app
 
 if __name__ == "__main__":
-    demo = create_gradio_app()
-    demo.queue().launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True,
-        debug=True,
-        pwa=True
-    )
+    create_gradio_app().queue().launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True, pwa=True)
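
One subtlety in the new wiring: Gradio applies the list of gr.update(...) values returned by update_ui positionally, so the seven updates must line up one-for-one with the seven components passed as outputs to provider.change and tts_engine.change. A minimal standalone sketch of the same show/hide pattern, using hypothetical component names that are not part of this commit:

import gradio as gr

def toggle(engine):
    # One gr.update per output component, in the same order as the outputs list below.
    return [
        gr.update(visible=engine == "kokoro"),  # kokoro_voice
        gr.update(visible=engine == "kyutai"),  # kyutai_voice
    ]

with gr.Blocks() as demo:
    engine = gr.Radio(["kokoro", "kyutai"], value="kokoro", label="Engine")
    kokoro_voice = gr.Dropdown(["af_heart"], value="af_heart", label="Kokoro Voice", visible=True)
    kyutai_voice = gr.Dropdown(["vctk/p226_023_mic1.wav"], label="Kyutai Voice", visible=False)
    engine.change(toggle, inputs=[engine], outputs=[kokoro_voice, kyutai_voice])

if __name__ == "__main__":
    demo.launch()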
 
 
 
 
 
 
 
notebook_lm_kokoro.py CHANGED
@@ -23,6 +23,14 @@ import asyncio
 import ast
 import json
 import warnings
+import torch
+import time
+try:
+    from moshi.models.loaders import CheckpointInfo
+    from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel
+except ImportError:
+    CheckpointInfo = None
+    TTSModel = None
 warnings.filterwarnings("ignore")
 
 # Set your OpenAI (or OpenRouter) API key from the environment
@@ -154,7 +162,8 @@ def generate_audio_from_script(script, output_file="podcast_audio.wav"):
         chosen_voice = voice_map.get(speaker, "af_heart")
         print(f"Generating audio for {speaker} with voice '{chosen_voice}'...")
 
-        pipeline = KPipeline(lang_code="a")
+        # Updated KPipeline initialization with explicit repo_id
+        pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
         generator = pipeline(dialogue, voice=chosen_voice)
 
         segment_audio = []
@@ -186,6 +195,67 @@ def generate_audio_from_script(script, output_file="podcast_audio.wav"):
         print(f"Error processing transcript: {e}")
         return
 
+def generate_audio_kyutai(script, speaker1_voice=None, speaker2_voice=None, output_file="kyutai_audio.wav"):
+    if TTSModel is None:
+        print("Moshi is not installed.")
+        return None
+
+    try:
+        print(f"[INFO] Requested Kyutai voices: {speaker1_voice=}, {speaker2_voice=}")
+        # Reject absolute/local paths
+        if os.path.isabs(speaker1_voice) or os.path.isfile(speaker1_voice):
+            raise ValueError(f"❌ Invalid voice path for speaker1: {speaker1_voice}")
+        if os.path.isabs(speaker2_voice) or os.path.isfile(speaker2_voice):
+            raise ValueError(f"❌ Invalid voice path for speaker2: {speaker2_voice}")
+
+        transcript_list = ast.literal_eval(script)
+
+        # Load TTS model
+        checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)
+        tts_model = TTSModel.from_checkpoint_info(
+            checkpoint_info,
+            n_q=32,
+            temp=0.6,
+            device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        )
+
+        # Use voice names directly from dropdown
+        print("[INFO] Resolving voice paths...")
+
+        start = time.time()
+        voice1_path = tts_model.get_voice_path(speaker1_voice)
+        print(f"[INFO] Got voice1_path in {time.time() - start:.2f}s")
+
+        start = time.time()
+        voice2_path = tts_model.get_voice_path(speaker2_voice)
+        print(f"[INFO] Got voice2_path in {time.time() - start:.2f}s")
+
+        texts = [dialogue for _, dialogue in transcript_list]
+        entries = tts_model.prepare_script(texts, padding_between=1)
+
+        condition_attributes = tts_model.make_condition_attributes([voice1_path, voice2_path], cfg_coef=2.0)
+
+        pcms = []
+        def _on_frame(frame):
+            if (frame != -1).all():
+                pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
+                pcms.append(np.clip(pcm[0, 0], -1, 1))
+
+        with tts_model.mimi.streaming(1):
+            tts_model.generate([entries], [condition_attributes], on_frame=_on_frame)
+
+        if pcms:
+            audio = np.concatenate(pcms, axis=-1)
+            sf.write(output_file, audio, tts_model.mimi.sample_rate)
+            print(f"[SUCCESS] Audio saved to: {output_file}")
+            return output_file
+
+        print("[WARNING] No audio segments were produced.")
+        return None
+
+    except Exception as e:
+        print(f"[ERROR] Kyutai TTS error: {e}")
+        return None
 
 def generate_tts():
     pipeline = KPipeline(lang_code="a")
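
For reference, the new generate_audio_kyutai takes the same stringified list of (speaker, dialogue) tuples that the Kokoro path parses with ast.literal_eval, plus two voice names that must resolve inside Kyutai's voice repo (local file paths are rejected). A minimal usage sketch, assuming moshi is installed and the default DSM checkpoints are reachable:

from notebook_lm_kokoro import generate_audio_kyutai

# Transcript format matches generate_podcast_script's output: a stringified list of tuples.
transcript = str([
    ("Speaker 1", "Welcome to the show."),
    ("Speaker 2", "Thanks, glad to be here."),
])

out = generate_audio_kyutai(
    transcript,
    speaker1_voice="expresso/ex03-ex01_happy_001_channel1_334s.wav",
    speaker2_voice="expresso/ex03-ex02_narration_001_channel1_674s.wav",
    output_file="kyutai_audio.wav",
)
print(out)  # path to the WAV on success, None on failure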
requirements.txt CHANGED
@@ -5,4 +5,6 @@ PyPDF2
 numpy
 openai
 ipython
-gradio>=4.0.0
+gradio>=4.0.0
+moshi>=0.2.4
+sphn>=0.2.0
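
After installing the updated requirements, a quick sanity check that the new pins resolve (moshi brings in torch, which notebook_lm_kokoro.py now imports):

# Run after `pip install -r requirements.txt`.
from importlib.metadata import version

for pkg in ("gradio", "moshi", "sphn"):
    print(pkg, version(pkg))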