Bils committed · verified
Commit 7e3de09 · 1 Parent(s): 4413610

Update app.py

Files changed (1)
  1. app.py +175 -135
app.py CHANGED
@@ -2,15 +2,11 @@
  # Created by bilsimaging.com
 
  import os
- # ---- Prefer safetensors for all HF model loads (fixes CLAP .bin crash on ZeroGPU) ----
  os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 
  import sys
- import io
  import json
- import uuid
- import time
- import shutil
  import base64
  import random
  import tempfile
@@ -37,10 +33,10 @@ OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs")))
  OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 
  SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
- SPACE_TAGLINE = "Text/Video → Audio Foley. Created by bilsimaging.com"
  WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
 
- # Keep GPU <= 120s for ZeroGPU (default 110)
  GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
 
  # Globals
@@ -63,10 +59,7 @@ def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
  d = torch.device("cpu")
  else:
  d = torch.device(pref)
- if d.type == "cuda":
- logger.info(f"Using CUDA {d}")
- else:
- logger.info(f"Using {d}")
  return d
 
@@ -116,9 +109,9 @@ def auto_load_models() -> str:
  global _model_dict, _cfg, _device
 
  if _model_dict is not None and _cfg is not None:
- return "Model already loaded."
 
- # Ensure Transformers prefers safetensors for everything:
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
 
  sys.path.append(str(REPO_DIR))
@@ -133,9 +126,8 @@ def auto_load_models() -> str:
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
  return "✅ Model loaded."
  except OSError as e:
- # If any OSError (often from trying to read pytorch_model.bin), retry after enforcing safetensors.
  logger.error(str(e))
- logger.info("Retrying load after enforcing safetensors preference...")
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
  try:
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
@@ -149,7 +141,7 @@ def auto_load_models() -> str:
 
 
  def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
- """Use project's helper (preferred) with a fallback to ffmpeg via subprocess."""
  sys.path.append(str(REPO_DIR))
  try:
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
@@ -171,7 +163,7 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
 
  def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
  prompt: str) -> str:
- """Save WAV + MP4 in outputs/, add metadata and a small watermark note (metadata only)."""
  # torchaudio expects [C, N]
  if audio_tensor.ndim == 1:
  audio_tensor = audio_tensor.unsqueeze(0)
@@ -186,14 +178,14 @@ def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
 
  _merge_audio_video(str(wav_path), video_src, str(out_mp4))
 
- # Save JSON sidecar
  meta = {
  "id": base,
  "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
  "source_video": Path(video_src).name,
  "output_video": Path(out_mp4).name,
  "prompt": prompt or "",
- "watermark": WATERMARK_NOTE,
  "tool": "ShortiFoley (HunyuanVideo-Foley)"
  }
  (OUTPUTS_DIR / f"{base}.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2))
@@ -226,8 +218,11 @@ def infer_single_video(
  Generate Foley audio for an uploaded video (1–6 variants).
  Returns: (list of output video paths, status message)
  """
  if _model_dict is None or _cfg is None:
- return [], "❌ Load the model first (open the app once)."
 
  if not video_file:
  return [], "❌ Please provide a video."
@@ -269,84 +264,96 @@ def _about_html() -> str:
  return f"""
  <div style="line-height:1.6">
  <h2>About ShortiFoley</h2>
- <p><b>ShortiFoley</b> automatically generates realistic Foley soundtracks for short videos using
- Tencent's HunyuanVideo-Foley with CLAP & SigLIP2 encoders. It includes autosave and an MCP server so
- you can call it from agents or workflows (e.g., n8n).</p>
- <p><b>Created by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a></b></p>
 
- <h3>How to use</h3>
  <ol>
- <li>Upload a video (ideally &lt; 120 seconds).</li>
- <li>Optionally enter a text description of the sound (English).</li>
- <li>Adjust CFG scale, steps, and number of variants.</li>
- <li>Click <b>Generate</b>. Results appear on the right and are stored in the Gallery.</li>
  </ol>
 
- <h3>Tips</h3>
  <ul>
- <li>Trim clips to the key action (5–30s) for faster, crisper results.</li>
- <li>Include material cues ("wood", "metal", "concrete"), action cues ("splash", "glass shatter"), and ambience ("roomy", "echoey").</li>
- <li>Generate multiple variants and pick the most natural.</li>
  </ul>
 
- <h3>MCP / Automation</h3>
- <p>This app runs as an <b>MCP server</b>. Open the footer "View API → MCP" to copy a ready config. You can also use the REST endpoints listed there. Perfect for n8n integrations.</p>
 
  <h3>Watermark</h3>
- <p>Each output's metadata includes: <i>{WATERMARK_NOTE}</i>. If you want a <b>visible video overlay</b>, I can add an ffmpeg overlay step on request.</p>
  </div>
  """
 
 
  def create_ui() -> gr.Blocks:
- with gr.Blocks(
- title="ShortiFoley — HunyuanVideo-Foley",
- css="""
- .main-header{ text-align:center; padding:1.2rem; border-radius:16px; background:linear-gradient(135deg,#667eea,#764ba2); color:white; }
- .card{ background:white; border:1px solid #e1e5e9; border-radius:16px; padding:1rem; box-shadow:0 8px 32px rgba(0,0,0,.06); }
- .generate-btn button{ font-weight:700; }
- """
- ) as demo:
 
  gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")
 
  with gr.Tabs():
  with gr.Tab("Run"):
  with gr.Row():
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 📹 Input")
  video_input = gr.Video(label="Upload Video", height=300)
  text_input = gr.Textbox(
  label="🎯 Audio Description (optional, English)",
- placeholder="e.g., Rubber soles on wet tile, distant chatter.",
  lines=3
  )
  with gr.Row():
- guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG Scale")
  steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
  samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
- generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
 
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 🎥 Result(s)")
  v1 = gr.Video(label="Sample 1", height=260, visible=True)
- v2 = gr.Video(label="Sample 2", height=160, visible=False)
- v3 = gr.Video(label="Sample 3", height=160, visible=False)
- v4 = gr.Video(label="Sample 4", height=160, visible=False)
- v5 = gr.Video(label="Sample 5", height=160, visible=False)
  v6 = gr.Video(label="Sample 6", height=160, visible=False)
- status = gr.Textbox(label="Status", interactive=False)
 
  # Generate handler
  def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
  outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
- vis_updates = []
  for i in range(6):
  if i < len(outs):
- vis_updates.append(gr.update(visible=True, value=outs[i]))
  else:
- vis_updates.append(gr.update(visible=False, value=None))
- return (*vis_updates, msg)
 
  generate.click(
  fn=_process_and_update,
@@ -356,7 +363,15 @@ def create_ui() -> gr.Blocks:
  api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
  )
 
- # Toggle visibility when # of samples changes
  def _toggle_vis(n):
  n = int(n)
  return [
@@ -380,78 +395,110 @@ def create_ui() -> gr.Blocks:
  refresh = gr.Button("🔄 Refresh Gallery")
  refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
 
  with gr.Tab("ℹ️ About"):
  gr.HTML(_about_html())
 
- # Keep gallery in sync after generate
- generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
-
- # -----------------------
- # MCP + REST API endpoints
- # -----------------------
- def _download_to_tmp(url: str) -> str:
- try:
- import requests
- except Exception:
- raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
- r = requests.get(url, timeout=30)
- r.raise_for_status()
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
- tmp.write(r.content)
- tmp.flush()
- tmp.close()
- return tmp.name
-
- def _maybe_from_base64(data_url_or_b64: str) -> str:
- b64 = data_url_or_b64
- if data_url_or_b64.startswith("data:"):
- b64 = data_url_or_b64.split(",", 1)[-1]
- raw = base64.b64decode(b64)
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
- tmp.write(raw)
- tmp.flush()
- tmp.close()
- return tmp.name
-
- def _normalize_video_input(video_url_or_b64: str) -> str:
- v = (video_url_or_b64 or "").strip()
- if v.startswith("http://") or v.startswith("https://"):
- return _download_to_tmp(v)
- return _maybe_from_base64(v)
-
- @gr.api
- def api_generate_from_url(
- video_url_or_b64: str,
- text_prompt: str = "",
- guidance_scale: float = 4.5,
- num_inference_steps: int = 50,
- sample_nums: int = 1,
- ) -> Dict[str, List[str]]:
- if _model_dict is None or _cfg is None:
- raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
- local = _normalize_video_input(video_url_or_b64)
- outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
- return {"videos": outs, "message": msg}
-
- @gr.api
- def load_model_tool() -> str:
- """Ensure model is loaded on server (MCP convenience)."""
- return auto_load_models()
-
- @gr.mcp.resource("shortifoley://status")
- def shortifoley_status() -> str:
- """Return a simple readiness string for MCP clients."""
- ready = _model_dict is not None and _cfg is not None
- dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
- return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
-
- @gr.mcp.prompt()
- def foley_prompt(name: str = "default") -> str:
- """Reusable guidance for describing sound ambience."""
- return (
- "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
- "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
- )
 
  return demo
 
@@ -473,22 +520,15 @@ if __name__ == "__main__":
  logger.info("===== Application Startup =====\n")
  prepare_once()
 
- # Ensure import paths after repo is present
  sys.path.append(str(REPO_DIR))
  try:
- # Probe key modules early (better error surfacing)
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
  from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
  except Exception as e:
  logger.warning(f"Repo imports not ready yet: {e}")
 
- msg = auto_load_models()
- if not msg.startswith("✅"):
- logger.error(f"[BOOT][ERROR] auto_load_models() failed:\n{msg}")
- else:
- logger.info(msg)
-
  ui = create_ui()
 
  # Enable MCP server so tools/resources/prompts are discoverable
@@ -496,5 +536,5 @@ if __name__ == "__main__":
  server_name="0.0.0.0",
  share=False,
  show_error=True,
- mcp_server=True,  # MCP on
  )
 
  # Created by bilsimaging.com
 
  import os
+ # Prefer safetensors globally (fixes CLAP .bin crash on ZeroGPU)
  os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 
  import sys
  import json
  import base64
  import random
  import tempfile
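Note on the safetensors preference: `HF_PREFER_SAFETENSORS` is consulted by this app's loader. As an illustration (not part of this commit), recent `transformers` releases can also be asked for safetensors per checkpoint via the standard `use_safetensors` argument of `from_pretrained`:

```python
# Illustrative sketch, not from this commit: prefer safetensors for one load.
# `use_safetensors=True` raises if the repo ships no .safetensors weights;
# the CLAP checkpoint id below is only an example.
from transformers import ClapModel

clap = ClapModel.from_pretrained("laion/clap-htsat-unfused", use_safetensors=True)
```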
 
  OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 
  SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
+ SPACE_TAGLINE = "Text/Video → Audio Foley · Created by bilsimaging.com"
  WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
 
+ # ZeroGPU limit (<=120)
  GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
 
  # Globals
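`GPU_DURATION` is presumably consumed by a `@spaces.GPU` decorator at the inference call site, which sits outside the hunks shown here. A minimal sketch of that standard ZeroGPU pattern, with a hypothetical function name:

```python
# Sketch of the usual ZeroGPU pattern (the real call site is not shown in this diff).
import spaces  # available on Hugging Face ZeroGPU Spaces

@spaces.GPU(duration=GPU_DURATION)  # ZeroGPU caps a single call at 120 s
def _gpu_infer(video_file, text_prompt):  # hypothetical name
    ...  # heavy torch work runs on the allocated GPU here
```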
 
  d = torch.device("cpu")
  else:
  d = torch.device(pref)
+ logger.info(f"Using {d}")
  return d
 
 
 
  global _model_dict, _cfg, _device
 
  if _model_dict is not None and _cfg is not None:
+ return "✅ Model already loaded."
 
+ # Make absolutely sure safetensors is preferred
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
 
  sys.path.append(str(REPO_DIR))
 
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
  return "✅ Model loaded."
  except OSError as e:
  logger.error(str(e))
+ logger.info("Retrying after enforcing safetensors preference...")
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
  try:
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
 
 
 
  def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
+ """Preferred: project's util; fallback to ffmpeg."""
  sys.path.append(str(REPO_DIR))
  try:
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
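The docstring promises an ffmpeg fallback; its body lies outside the hunks shown. A sketch of what such a subprocess fallback typically looks like (assumes ffmpeg on PATH; flags are a common mux recipe, not necessarily the repo's exact call):

```python
# Sketch of a subprocess-based fallback mux.
import subprocess

def _ffmpeg_mux(audio_path: str, video_path: str, out_path: str) -> None:
    subprocess.run(
        ["ffmpeg", "-y",
         "-i", video_path, "-i", audio_path,
         "-map", "0:v:0", "-map", "1:a:0",  # video from the clip, audio from the WAV
         "-c:v", "copy",                    # keep the video stream as-is
         "-c:a", "aac",                     # encode the WAV to AAC for MP4
         "-shortest",                       # stop at the shorter stream
         out_path],
        check=True,
    )
```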
 
 
  def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
  prompt: str) -> str:
+ """Save WAV + MP4 in outputs/, add metadata with a soft watermark note."""
  # torchaudio expects [C, N]
  if audio_tensor.ndim == 1:
  audio_tensor = audio_tensor.unsqueeze(0)
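The `[C, N]` comment is the whole contract here; a self-contained sketch:

```python
# Standalone sketch: torchaudio.save expects [channels, frames].
import torch
import torchaudio

sr = 48000
mono = torch.randn(sr * 2)    # 2 s of synthetic mono audio, shape [N]
if mono.ndim == 1:
    mono = mono.unsqueeze(0)  # -> [1, N]
torchaudio.save("example.wav", mono, sr)
```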
 
 
  _merge_audio_video(str(wav_path), video_src, str(out_mp4))
 
+ # Sidecar JSON
  meta = {
  "id": base,
  "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
  "source_video": Path(video_src).name,
  "output_video": Path(out_mp4).name,
  "prompt": prompt or "",
+ "watermark_note": WATERMARK_NOTE,
  "tool": "ShortiFoley (HunyuanVideo-Foley)"
  }
  (OUTPUTS_DIR / f"{base}.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2))
 
  Generate Foley audio for an uploaded video (1–6 variants).
  Returns: (list of output video paths, status message)
  """
+ # Lazy-load if needed
  if _model_dict is None or _cfg is None:
+ msg = auto_load_models()
+ if not str(msg).startswith("✅"):
+ return [], f"❌ {msg}"
 
  if not video_file:
  return [], "❌ Please provide a video."
 
  return f"""
  <div style="line-height:1.6">
  <h2>About ShortiFoley</h2>
+ <p><b>ShortiFoley</b> turns short videos (and an optional text hint) into realistic Foley sound.
+ Powered by Tencent's HunyuanVideo-Foley (SigLIP2 + CLAP), with autosave and an MCP server for automation (e.g., n8n).</p>
+ <p><b>Created by <a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a></b></p>
 
+ <h3>Quick Steps</h3>
  <ol>
+ <li>Upload a clip (ideally &lt; 120s).</li>
+ <li>Optionally describe the sound (English).</li>
+ <li>Pick variants (1–6), adjust CFG and steps.</li>
+ <li>Hit <b>Generate</b>. Results show on the right and save into the Gallery.</li>
  </ol>
 
+ <h3>Tips for Best Quality</h3>
  <ul>
+ <li>Use tight clips (5–30s) around the action.</li>
+ <li>Include material & action cues: "metal clang", "glass shatter", "rubber on wet tile".</li>
+ <li>Describe ambience: "roomy", "echoey", "distant crowd".</li>
+ <li>Generate 2–4 variants and pick the most natural.</li>
  </ul>
 
+ <h3>MCP & API</h3>
+ <p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see "API & MCP" tab).
+ Perfect for pipelines and tools like <b>n8n</b>.</p>
 
  <h3>Watermark</h3>
+ <p>Each output writes a JSON sidecar including: <i>{WATERMARK_NOTE}</i>. Ask if you want a visible overlay.</p>
  </div>
  """
 
 
  def create_ui() -> gr.Blocks:
+ css = """
+ .main-header{ text-align:center; padding:1.2rem; border-radius:18px; background:linear-gradient(135deg,#6366f1,#8b5cf6); color:white; box-shadow:0 12px 40px rgba(99,102,241,.35); margin-bottom:16px;}
+ .main-header h1{ margin:0; font-size:2.0rem; font-weight:800;}
+ .main-header p{ margin:.25rem 0 0; opacity:.95; font-weight:500;}
+ .card{ background:white; border:1px solid #e7e9ef; border-radius:16px; padding:14px; box-shadow:0 10px 28px rgba(0,0,0,.06);}
+ .generate-btn button{ font-weight:800; border-radius:12px; padding:10px 18px;}
+ .minor-btn button{ border-radius:10px;}
+ .muted{ color:#64748b; }
+ """
+ with gr.Blocks(title="ShortiFoley — HunyuanVideo-Foley", css=css) as demo:
 
  gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")
 
  with gr.Tabs():
  with gr.Tab("Run"):
  with gr.Row():
+ # LEFT: input
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 📹 Input")
  video_input = gr.Video(label="Upload Video", height=300)
  text_input = gr.Textbox(
  label="🎯 Audio Description (optional, English)",
+ placeholder="e.g., Rubber soles on wet tile; distant chatter; occasional splashes.",
  lines=3
  )
  with gr.Row():
+ guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG")
  steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
  samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
 
 
+ with gr.Row():
+ load_btn = gr.Button("⚙️ Load model", variant="secondary", elem_classes=["minor-btn"])
+ generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
+
+ status = gr.Textbox(label="Status", interactive=False)
+
+ # RIGHT: results
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 🎥 Result(s)")
  v1 = gr.Video(label="Sample 1", height=260, visible=True)
+ with gr.Row():
+ v2 = gr.Video(label="Sample 2", height=160, visible=False)
+ v3 = gr.Video(label="Sample 3", height=160, visible=False)
+ with gr.Row():
+ v4 = gr.Video(label="Sample 4", height=160, visible=False)
+ v5 = gr.Video(label="Sample 5", height=160, visible=False)
  v6 = gr.Video(label="Sample 6", height=160, visible=False)
+ gr.Markdown("<span class='muted'>Autosaved to the Gallery tab.</span>")
 
  # Generate handler
  def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
  outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
+ vis = []
  for i in range(6):
  if i < len(outs):
+ vis.append(gr.update(visible=True, value=outs[i]))
  else:
+ vis.append(gr.update(visible=False, value=None))
+ return (*vis, msg)
 
  generate.click(
  fn=_process_and_update,
 
  api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
  )
 
+ load_btn.click(
+ fn=auto_load_models,
+ inputs=[],
+ outputs=[status],
+ api_name="/load_model",
+ api_description="Load/initialize the ShortiFoley model and encoders."
+ )
+
+ # Toggle visibility based on variants
  def _toggle_vis(n):
  n = int(n)
  return [
 
  refresh = gr.Button("🔄 Refresh Gallery")
  refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
 
+ with gr.Tab("API & MCP"):
+ gr.Markdown("""
+ ### REST examples
+
+ **POST** `/api_generate_from_url`
+ ```json
+ {
+ "video_url_or_b64": "https://yourhost/sample.mp4",
+ "text_prompt": "metallic clink; hollow room reverb",
+ "guidance_scale": 4.5,
+ "num_inference_steps": 50,
+ "sample_nums": 2
+ }
+ ```
+
+ **POST** `/load_model_tool`
+ Loads the model proactively (useful before batch runs).
+
+ ### MCP resources & prompt
+ - `shortifoley://status` → quick health info
+ - `foley_prompt` → reusable guidance for describing the sound
+
+ Works great with n8n: call `load_model_tool` once, then `api_generate_from_url` for each clip.
+ """)
+
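A hedged companion to the REST examples above, via `gradio_client` (the Space id is a placeholder, and the endpoint names assume `@gr.api` exposes the function names from this commit):

```python
# Sketch: driving the documented endpoints from Python.
from gradio_client import Client

client = Client("bilsimaging/ShortiFoley")          # hypothetical Space id
print(client.predict(api_name="/load_model_tool"))  # warm the model once
result = client.predict(
    "https://yourhost/sample.mp4",                  # video_url_or_b64
    "metallic clink; hollow room reverb",           # text_prompt
    4.5,                                            # guidance_scale
    50,                                             # num_inference_steps
    2,                                              # sample_nums
    api_name="/api_generate_from_url",
)
print(result)  # expected shape: {"videos": [...], "message": "..."}
```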
  with gr.Tab("ℹ️ About"):
  gr.HTML(_about_html())
 
+ # Keep gallery fresh after generation
+ generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
+
+ # ---- REST + MCP endpoints (inside Blocks) ----
+ def _download_to_tmp(url: str) -> str:
+ try:
+ import requests
+ except Exception:
+ raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
+ r = requests.get(url, timeout=30)
+ r.raise_for_status()
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+ tmp.write(r.content)
+ tmp.flush()
+ tmp.close()
+ return tmp.name
+
+ def _maybe_from_base64(data_url_or_b64: str) -> str:
+ b64 = data_url_or_b64
+ if data_url_or_b64.startswith("data:"):
+ b64 = data_url_or_b64.split(",", 1)[-1]
+ raw = base64.b64decode(b64)
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+ tmp.write(raw)
+ tmp.flush()
+ tmp.close()
+ return tmp.name
+
+ def _normalize_video_input(video_url_or_b64: str) -> str:
+ v = (video_url_or_b64 or "").strip()
+ if v.startswith("http://") or v.startswith("https://"):
+ return _download_to_tmp(v)
+ return _maybe_from_base64(v)
+
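`_maybe_from_base64` accepts raw base64 or a `data:` URL, so a client can build the `video_url_or_b64` field like this (the local filename is hypothetical):

```python
# Sketch: encoding a local clip into the accepted forms.
import base64
from pathlib import Path

raw = Path("clip.mp4").read_bytes()          # hypothetical local file
b64 = base64.b64encode(raw).decode("ascii")  # raw base64 form
data_url = f"data:video/mp4;base64,{b64}"    # data: URL form
```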
+ @gr.api
+ def api_generate_from_url(
+ video_url_or_b64: str,
+ text_prompt: str = "",
+ guidance_scale: float = 4.5,
+ num_inference_steps: int = 50,
+ sample_nums: int = 1,
+ ) -> Dict[str, List[str]]:
+ if _model_dict is None or _cfg is None:
+ msg = auto_load_models()
+ if not str(msg).startswith("✅"):
+ raise RuntimeError(msg)
+ local = _normalize_video_input(video_url_or_b64)
+ outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+ return {"videos": outs, "message": msg}
+
+ @gr.api
+ def load_model_tool() -> str:
+ """Ensure model is loaded on server (convenient for MCP/REST)."""
+ return auto_load_models()
+
+ @gr.mcp.resource("shortifoley://status")
+ def shortifoley_status() -> str:
+ """Return a simple readiness string for MCP clients."""
+ ready = _model_dict is not None and _cfg is not None
+ dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+ return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
+
+ @gr.mcp.prompt()
+ def foley_prompt(name: str = "default") -> str:
+ """Reusable guidance for describing sound ambience."""
+ return (
+ "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
+ "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
+ )
+
+ # Auto-load model when UI first renders
+ demo.load(
+ fn=auto_load_models,
+ inputs=None,
+ outputs=[status]
+ )
 
  return demo
 
  logger.info("===== Application Startup =====\n")
  prepare_once()
 
+ # Probe imports (early surfacing)
  sys.path.append(str(REPO_DIR))
  try:
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
  from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
  except Exception as e:
  logger.warning(f"Repo imports not ready yet: {e}")
 
  ui = create_ui()
 
  # Enable MCP server so tools/resources/prompts are discoverable
  server_name="0.0.0.0",
  share=False,
  show_error=True,
+ mcp_server=True,  # MCP on (great for n8n)
  )