Spaces:

Nymbo
/

Tools

Running

App Files Community

Nymbo committed on Aug 24

Commit

208563c

verified ·

1 Parent(s): dbd129c

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -174

app.py CHANGED Viewed

@@ -25,7 +25,6 @@ from duckduckgo_search import DDGS
 from PIL import Image
 from huggingface_hub import InferenceClient
 import time
-import wave
 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
@@ -502,15 +501,14 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
     voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
-) -> str:
     """
     Synthesize speech from text using the Kokoro-82M model.
-    Per current HF Gradio MCP guidance (see hf-docs-search), tools should return
-    browser/client-friendly artifacts where possible. This function returns the
-    path to a WAV file on disk so the UI renders an HTML5 audio player and MCP
-    clients receive a file URL that opens in the browser rather than forcing a
-    direct download.
     Args:
         text: The text to synthesize (English).
@@ -518,8 +516,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
         voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
     Returns:
-        str: Path to a 24 kHz mono WAV file on disk (served by Gradio; MCP converts
-        paths to file URLs).
     Notes:
         - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
@@ -545,11 +544,8 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
             audio = model(ps, ref_s, float(speed))
         except Exception as e:  # propagate as UI-friendly error
             raise gr.Error(f"Error generating audio: {str(e)}")
-    # Save 24 kHz mono waveform to WAV and return its path for in-browser playback
-    sr = 24_000
-    wav = audio.detach().cpu().numpy()
-    path = _write_audio_tmp(wav, sample_rate=sr, suffix=".wav")
-    return path
     # If pipeline produced no segments
     raise gr.Error("No audio was generated (empty synthesis result).")
@@ -641,7 +637,7 @@ CSS_STYLES = """
     /* Place bold tools list on line 2, normal auth note on line 3 (below title) */
     .gradio-container h1::before {
         grid-row: 2;
-    content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS | Image Generation | Video Generation | Generate Code";
         display: block;
         font-size: 1rem;
         font-weight: 700;
@@ -651,7 +647,7 @@ CSS_STYLES = """
     }
     .gradio-container h1::after {
         grid-row: 3;
-    content: "Authentication is optional. Image/Video (and some Code) generation may require `HF_READ_TOKEN`; Image/Video tabs hide without it.";
         display: block;
         font-size: 1rem;
         font-weight: 400;
@@ -675,14 +671,15 @@ kokoro_interface = gr.Interface(
         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
         gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
     ],
-    outputs=gr.Audio(label="Audio", type="filepath"),
     title="Kokoro TTS",
     description=(
         "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
-        "Synthesize speech from text using Kokoro-82M. Returns a file path to a 24 kHz mono WAV, which renders in-browser and is exposed as a file URL over MCP. "
-        "Parameters: text (str), speed (float 0.5–2.0), voice (str)."
     ),
     allow_flagging="never",
 )
@@ -990,158 +987,6 @@ video_generation_interface = gr.Interface(
     allow_flagging="never",
 )
-# ==========================
-# Audio helper (save WAV)
-# ==========================
-def _write_audio_tmp(audio: np.ndarray, sample_rate: int = 24_000, suffix: str = ".wav") -> str:
-    """Write mono float32 waveform [-1,1] to 16-bit PCM WAV and return path."""
-    if audio.ndim > 1:
-        audio = np.mean(audio, axis=0)
-    audio = np.clip(audio.astype(np.float32), -1.0, 1.0)
-    pcm = (audio * 32767.0).astype(np.int16)
-    os.makedirs("outputs", exist_ok=True)
-    fname = f"outputs/audio_{int(time.time())}_{random.randint(1000,9999)}{suffix}"
-    with wave.open(fname, "wb") as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(sample_rate)
-        wf.writeframes(pcm.tobytes())
-    return fname
-# ==========================
-# Code Generation (Serverless)
-# ==========================
-def Generate_Code(
-    instruction: Annotated[str, "Describe the code to generate (requirements, I/O, constraints)."],
-    language: Annotated[str, "Optional language/framework hint (e.g., 'python', 'typescript react')."] = "",
-    model_id: Annotated[str, "HF text-generation model id (e.g., 'bigcode/starcoder2-3b')."] = "bigcode/starcoder2-3b",
-    max_new_tokens: Annotated[int, "Maximum tokens to generate (64–4096, model dependent)."] = 512,
-    temperature: Annotated[float, "Sampling temperature (0–1.5). Lower = more deterministic."] = 0.2,
-    top_p: Annotated[float, "Nucleus sampling p (0–1)."] = 0.95,
-    top_k: Annotated[int, "Top-k sampling cutoff (0 disables)."] = 50,
-    repetition_penalty: Annotated[float, "Discourage repeats (>1.0)."] = 1.05,
-    seed: Annotated[int, "Random seed (-1 = random)."] = -1,
-    save_to_file: Annotated[bool, "If true, save under ./outputs and prepend 'Saved to:' path."] = False,
-    filename: Annotated[str, "Optional filename when saving (e.g., main.py)."] = "",
-) -> str:
-    """
-    Generate source code via Hugging Face Inference text-generation models and return code as plain text.
-    Per current MCP docs (via hf-docs-search), schemas are inferred from type hints and docstrings. Returning
-    text is broadly compatible; when save_to_file is enabled, the response is prefixed with the saved path so
-    MCP clients can expose a file URL.
-    """
-    if not instruction or not instruction.strip():
-        raise gr.Error("Please provide a non-empty instruction.")
-    token = os.getenv("HF_READ_TOKEN") or os.getenv("HF_TOKEN")
-    providers = ["auto", "replicate", "fal-ai"]
-    lang_hint = f" in {language.strip()}" if language and language.strip() else ""
-    system_preamble = (
-        "You are a precise coding assistant. Output only runnable code without explanations. "
-        "Prefer idiomatic patterns, minimal comments, and include necessary imports."
-    )
-    prompt = (
-        f"{system_preamble}\n\nTask{lang_hint}:\n{instruction.strip()}\n\n"
-        "Return only the code, no backticks."
-    )
-    last_error: Exception | None = None
-    for provider in providers:
-        try:
-            client = InferenceClient(api_key=token, provider=provider)
-            out = client.text_generation(
-                model=model_id,
-                prompt=prompt,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                repetition_penalty=repetition_penalty,
-                seed=(None if seed == -1 else seed),
-                stream=False,
-            )
-            code = (out or "").strip()
-            if not code:
-                raise gr.Error("Model returned empty output.")
-            prefix = ""
-            if save_to_file:
-                os.makedirs("outputs", exist_ok=True)
-                base = filename.strip() or f"code_{int(time.time())}_{random.randint(1000,9999)}"
-                if "." not in base and language:
-                    ext_map = {
-                        "python": ".py",
-                        "py": ".py",
-                        "typescript": ".ts",
-                        "tsx": ".tsx",
-                        "javascript": ".js",
-                        "jsx": ".jsx",
-                        "go": ".go",
-                        "rust": ".rs",
-                        "java": ".java",
-                        "csharp": ".cs",
-                        "c#": ".cs",
-                        "cpp": ".cpp",
-                        "c++": ".cpp",
-                        "c": ".c",
-                        "bash": ".sh",
-                        "shell": ".sh",
-                        "html": ".html",
-                        "css": ".css",
-                        "json": ".json",
-                        "yaml": ".yaml",
-                        "yml": ".yml",
-                    }
-                    key = language.lower().split()[0]
-                    base += ext_map.get(key, "")
-                path = os.path.join("outputs", base)
-                with open(path, "w", encoding="utf-8") as f:
-                    f.write(code)
-                prefix = f"Saved to: {path}\n\n"
-            return f"{prefix}{code}"
-        except Exception as e:
-            last_error = e
-            continue
-    msg = str(last_error) if last_error else "Unknown error"
-    if "401" in msg or "403" in msg:
-        raise gr.Error("Authentication failed or not permitted. Set HF_READ_TOKEN/HF_TOKEN with inference access.")
-    if "404" in msg:
-        raise gr.Error(f"Model not found or unavailable: {model_id}.")
-    if "503" in msg:
-        raise gr.Error("The model is warming up. Please try again shortly.")
-    raise gr.Error(f"Code generation failed: {msg}")
-code_generation_interface = gr.Interface(
-    fn=Generate_Code,
-    inputs=[
-        gr.Textbox(label="Instruction", placeholder="Describe what to build, inputs/outputs, edge cases…", lines=6),
-        gr.Textbox(label="Language (optional)", value="", placeholder="e.g., python, typescript react"),
-        gr.Textbox(label="Model", value="bigcode/starcoder2-3b", placeholder="creator/model-name"),
-        gr.Slider(minimum=64, maximum=4096, value=512, step=16, label="Max new tokens"),
-        gr.Slider(minimum=0.0, maximum=1.5, value=0.2, step=0.05, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.01, label="Top-p"),
-        gr.Slider(minimum=0, maximum=200, value=50, step=1, label="Top-k"),
-        gr.Slider(minimum=1.0, maximum=2.0, value=1.05, step=0.01, label="Repetition penalty"),
-        gr.Slider(minimum=-1, maximum=1_000_000_000, value=-1, step=1, label="Seed (-1 = random)"),
-        gr.Checkbox(value=False, label="Save to file (./outputs)"),
-        gr.Textbox(label="Filename (optional)", value="", placeholder="e.g., main.py"),
-    ],
-    outputs=gr.Code(label="Generated Code"),
-    title="Generate Code",
-    description=(
-        "<div style=\"text-align:center\">Generate code via Hugging Face Inference text-generation models. Provide a clear instruction and (optionally) a language hint.</div>"
-    ),
-    api_description=(
-        "Generate source code using a HF Inference text-generation model. Parameters: instruction (str), language (str), model_id (str), "
-        "max_new_tokens (int), temperature (float), top_p (float), top_k (int), repetition_penalty (float), seed (int), save_to_file (bool), filename (str). "
-        "Returns the code as text; if saved, prepends 'Saved to: <path>'."
-    ),
-    allow_flagging="never",
-)
 # Build tabbed app; disable Image/Video tools if no HF token is present
 HAS_HF_TOKEN = bool(HF_API_TOKEN or HF_VIDEO_TOKEN)
@@ -1162,9 +1007,6 @@ if HAS_HF_TOKEN:
     _interfaces.extend([image_generation_interface, video_generation_interface])
     _tab_names.extend(["Image Generation", "Video Generation"])
-# Always add Generate Code as the last tab
-_interfaces.append(code_generation_interface)
-_tab_names.append("Generate Code")
 demo = gr.TabbedInterface(
     interface_list=_interfaces,
     tab_names=_tab_names,

 from PIL import Image
 from huggingface_hub import InferenceClient
 import time
 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
     voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
+) -> Tuple[int, np.ndarray]:
     """
     Synthesize speech from text using the Kokoro-82M model.
+    This function returns raw audio suitable for a Gradio Audio component and is
+    also exposed as an MCP tool (per the latest Hugging Face/Gradio MCP docs, a
+    tool is created for each function wired into your app; docstrings and type
+    hints are used to describe the tool).
     Args:
         text: The text to synthesize (English).
         voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
     Returns:
+        A tuple of (sample_rate_hz, audio_waveform) where:
+        - sample_rate_hz: int sample rate in Hz (24_000)
+        - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
     Notes:
         - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
             audio = model(ps, ref_s, float(speed))
         except Exception as e:  # propagate as UI-friendly error
             raise gr.Error(f"Error generating audio: {str(e)}")
+        # Return 24 kHz mono waveform
+        return 24_000, audio.detach().cpu().numpy()
     # If pipeline produced no segments
     raise gr.Error("No audio was generated (empty synthesis result).")
     /* Place bold tools list on line 2, normal auth note on line 3 (below title) */
     .gradio-container h1::before {
         grid-row: 2;
+        content: "Fetch Webpage | Search DuckDuckGo | Code Interpreter | Kokoro TTS | Image Generation | Video Generation";
         display: block;
         font-size: 1rem;
         font-weight: 700;
     }
     .gradio-container h1::after {
         grid-row: 3;
+        content: "Authentication is optional but Image/Video Generation require a `HF_READ_TOKEN` in env variables. They are hidden otherwise.";
         display: block;
         font-size: 1rem;
         font-weight: 400;
         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
         gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
     ],
+    outputs=gr.Audio(label="Audio", type="numpy"),
     title="Kokoro TTS",
     description=(
         "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
+        "Synthesize speech from text using Kokoro-82M. Returns (sample_rate, waveform) suitable for playback. "
+        "Parameters: text (str), speed (float 0.5–2.0), voice (str). "
+        "Return the generated image to the user."
     ),
     allow_flagging="never",
 )
     allow_flagging="never",
 )
 # Build tabbed app; disable Image/Video tools if no HF token is present
 HAS_HF_TOKEN = bool(HF_API_TOKEN or HF_VIDEO_TOKEN)
     _interfaces.extend([image_generation_interface, video_generation_interface])
     _tab_names.extend(["Image Generation", "Video Generation"])
 demo = gr.TabbedInterface(
     interface_list=_interfaces,
     tab_names=_tab_names,