Update app.py
Browse files
app.py
CHANGED
@@ -25,7 +25,6 @@ from duckduckgo_search import DDGS
|
|
25 |
from PIL import Image
|
26 |
from huggingface_hub import InferenceClient
|
27 |
import time
|
28 |
-
import wave
|
29 |
|
30 |
# Optional imports for Kokoro TTS (loaded lazily)
|
31 |
import numpy as np
|
@@ -498,26 +497,11 @@ def _init_kokoro() -> None:
|
|
498 |
)
|
499 |
|
500 |
|
501 |
-
def _save_wav(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
|
502 |
-
"""Save float32 mono waveform [-1,1] to a 16-bit PCM WAV file and return its path."""
|
503 |
-
os.makedirs("outputs", exist_ok=True)
|
504 |
-
# Normalize/clip and convert to int16 PCM
|
505 |
-
wf = np.clip(waveform, -1.0, 1.0)
|
506 |
-
pcm16 = (wf * 32767.0).astype(np.int16)
|
507 |
-
fname = f"outputs/tts_{int(time.time())}_{random.randint(1000,9999)}.wav"
|
508 |
-
with wave.open(fname, "wb") as w:
|
509 |
-
w.setnchannels(1)
|
510 |
-
w.setsampwidth(2) # 16-bit
|
511 |
-
w.setframerate(sample_rate)
|
512 |
-
w.writeframes(pcm16.tobytes())
|
513 |
-
return fname
|
514 |
-
|
515 |
-
|
516 |
def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
|
517 |
text: Annotated[str, "The text to synthesize (English)."],
|
518 |
speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
|
519 |
voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
|
520 |
-
) -> str:
|
521 |
"""
|
522 |
Synthesize speech from text using the Kokoro-82M model.
|
523 |
|
@@ -532,9 +516,9 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
|
|
532 |
voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
|
533 |
|
534 |
Returns:
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
|
539 |
Notes:
|
540 |
- Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
|
@@ -560,9 +544,8 @@ def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
|
|
560 |
audio = model(ps, ref_s, float(speed))
|
561 |
except Exception as e: # propagate as UI-friendly error
|
562 |
raise gr.Error(f"Error generating audio: {str(e)}")
|
563 |
-
#
|
564 |
-
|
565 |
-
return wav_path
|
566 |
|
567 |
# If pipeline produced no segments
|
568 |
raise gr.Error("No audio was generated (empty synthesis result).")
|
@@ -688,14 +671,15 @@ kokoro_interface = gr.Interface(
|
|
688 |
gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
|
689 |
gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
|
690 |
],
|
691 |
-
outputs=gr.Audio(label="Audio", type="filepath"),
|
692 |
title="Kokoro TTS",
|
693 |
description=(
|
694 |
"<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
|
695 |
),
|
696 |
api_description=(
|
697 |
-
"Synthesize speech from text using Kokoro-82M. Returns
|
698 |
-
"
|
|
|
699 |
),
|
700 |
allow_flagging="never",
|
701 |
)
|
|
|
25 |
from PIL import Image
|
26 |
from huggingface_hub import InferenceClient
|
27 |
import time
|
|
|
28 |
|
29 |
# Optional imports for Kokoro TTS (loaded lazily)
|
30 |
import numpy as np
|
|
|
497 |
)
|
498 |
|
499 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
500 |
def Generate_Speech( # <-- MCP tool #4 (Generate Speech)
|
501 |
text: Annotated[str, "The text to synthesize (English)."],
|
502 |
speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
|
503 |
voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
|
504 |
+
) -> Tuple[int, np.ndarray]:
|
505 |
"""
|
506 |
Synthesize speech from text using the Kokoro-82M model.
|
507 |
|
|
|
516 |
voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
|
517 |
|
518 |
Returns:
|
519 |
+
A tuple of (sample_rate_hz, audio_waveform) where:
|
520 |
+
- sample_rate_hz: int sample rate in Hz (24_000)
|
521 |
+
- audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
|
522 |
|
523 |
Notes:
|
524 |
- Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
|
|
|
544 |
audio = model(ps, ref_s, float(speed))
|
545 |
except Exception as e: # propagate as UI-friendly error
|
546 |
raise gr.Error(f"Error generating audio: {str(e)}")
|
547 |
+
# Return 24 kHz mono waveform
|
548 |
+
return 24_000, audio.detach().cpu().numpy()
|
|
|
549 |
|
550 |
# If pipeline produced no segments
|
551 |
raise gr.Error("No audio was generated (empty synthesis result).")
|
|
|
671 |
gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
|
672 |
gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
|
673 |
],
|
674 |
+
outputs=gr.Audio(label="Audio", type="numpy"),
|
675 |
title="Kokoro TTS",
|
676 |
description=(
|
677 |
"<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
|
678 |
),
|
679 |
api_description=(
|
680 |
+
"Synthesize speech from text using Kokoro-82M. Returns (sample_rate, waveform) suitable for playback. "
|
681 |
+
"Parameters: text (str), speed (float 0.5–2.0), voice (str). "
|
682 |
+
"Return the generated audio to the user."
|
683 |
),
|
684 |
allow_flagging="never",
|
685 |
)
|