Nymbo committed
Commit 675e6f3 · verified · Parent: 47e7ebf

Update app.py

Files changed (1):
  1. app.py +10 -26
app.py CHANGED
@@ -25,7 +25,6 @@ from duckduckgo_search import DDGS
 from PIL import Image
 from huggingface_hub import InferenceClient
 import time
-import wave
 
 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
@@ -498,26 +497,11 @@ def _init_kokoro() -> None:
     )
 
 
-def _save_wav(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
-    """Save float32 mono waveform [-1,1] to a 16-bit PCM WAV file and return its path."""
-    os.makedirs("outputs", exist_ok=True)
-    # Normalize/clip and convert to int16 PCM
-    wf = np.clip(waveform, -1.0, 1.0)
-    pcm16 = (wf * 32767.0).astype(np.int16)
-    fname = f"outputs/tts_{int(time.time())}_{random.randint(1000,9999)}.wav"
-    with wave.open(fname, "wb") as w:
-        w.setnchannels(1)
-        w.setsampwidth(2)  # 16-bit
-        w.setframerate(sample_rate)
-        w.writeframes(pcm16.tobytes())
-    return fname
-
-
 def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
     voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
-) -> str:
+) -> Tuple[int, np.ndarray]:
     """
     Synthesize speech from text using the Kokoro-82M model.
 
@@ -532,9 +516,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
         voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
 
     Returns:
-        str: Path to a generated WAV file (24 kHz mono, 16-bit PCM). In the
-            Gradio UI this renders an inline audio player; via MCP this is
-            converted to a public URL that most clients will open in a browser tab.
+        A tuple of (sample_rate_hz, audio_waveform) where:
+        - sample_rate_hz: int sample rate in Hz (24_000)
+        - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
 
     Notes:
     - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
@@ -560,9 +544,8 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
             audio = model(ps, ref_s, float(speed))
         except Exception as e:  # propagate as UI-friendly error
             raise gr.Error(f"Error generating audio: {str(e)}")
-        # Save as WAV and return path
-        wav_path = _save_wav(audio.detach().cpu().numpy(), sample_rate=24_000)
-        return wav_path
+        # Return 24 kHz mono waveform
+        return 24_000, audio.detach().cpu().numpy()
 
     # If pipeline produced no segments
     raise gr.Error("No audio was generated (empty synthesis result).")
@@ -688,14 +671,15 @@ kokoro_interface = gr.Interface(
         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
         gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
     ],
-    outputs=gr.Audio(label="Audio", type="filepath", autoplay=True),
+    outputs=gr.Audio(label="Audio", type="numpy"),
     title="Kokoro TTS",
     description=(
         "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
-        "Synthesize speech from text using Kokoro-82M. Returns a file path to a WAV (24 kHz mono) that is playable inline in the UI,"
-        " and exposed as a URL via MCP. Parameters: text (str), speed (float 0.5–2.0), voice (str)."
+        "Synthesize speech from text using Kokoro-82M. Returns (sample_rate, waveform) suitable for playback. "
+        "Parameters: text (str), speed (float 0.5–2.0), voice (str). "
+        "Return the generated audio to the user."
     ),
     allow_flagging="never",
)
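
For context, Gradio's Audio output component with type="numpy" expects the wrapped function to return a (sample_rate, numpy_array) tuple, which is why Generate_Speech now returns Tuple[int, np.ndarray] instead of a WAV path. A minimal sketch of that pairing, using a sine-wave stand-in for Kokoro (the fake_tts function and demo name are illustrative, not part of the app):

```python
import numpy as np
import gradio as gr

def fake_tts(text: str) -> tuple[int, np.ndarray]:
    # Produce one second of a 440 Hz tone as float32 mono in [-1, 1],
    # mimicking the shape/dtype Generate_Speech now returns.
    sr = 24_000
    t = np.linspace(0, 1.0, sr, endpoint=False)
    return sr, (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

demo = gr.Interface(
    fn=fake_tts,
    inputs=gr.Textbox(label="Text"),
    outputs=gr.Audio(label="Audio", type="numpy"),  # expects (sample_rate, np.ndarray)
)

if __name__ == "__main__":
    demo.launch()
```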
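Because the commit removes the server-side _save_wav helper, a caller that still wants a WAV file on disk now has to do the conversion itself. A hedged client-side sketch that reuses the same int16 PCM conversion the deleted helper performed (the save_wav name and tts.wav path are illustrative only):

```python
import wave
import numpy as np

def save_wav(waveform: np.ndarray, sample_rate: int, path: str = "tts.wav") -> str:
    # Clip float32 mono samples to [-1, 1] and convert to 16-bit PCM.
    pcm16 = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype(np.int16)
    with wave.open(path, "wb") as w:
        w.setnchannels(1)            # mono
        w.setsampwidth(2)            # 16-bit samples
        w.setframerate(sample_rate)  # e.g. 24_000 Hz from Generate_Speech
        w.writeframes(pcm16.tobytes())
    return path

# Usage with the tool's output, e.g.:
#   sr, audio = Generate_Speech("Hello there")
#   save_wav(audio, sr)
```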