Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,11 +1,15 @@
|
|
| 1 |
-
|
| 2 |
# Created by bilsimaging.com
|
| 3 |
|
| 4 |
import os
|
|
|
|
| 5 |
os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
|
| 6 |
|
| 7 |
import sys
|
| 8 |
import json
|
|
|
|
|
|
|
|
|
|
| 9 |
import base64
|
| 10 |
import random
|
| 11 |
import tempfile
|
|
@@ -21,7 +25,6 @@ from loguru import logger
|
|
| 21 |
from huggingface_hub import snapshot_download
|
| 22 |
import spaces
|
| 23 |
|
| 24 |
-
|
| 25 |
# -------------------------
|
| 26 |
# Constants & configuration
|
| 27 |
# -------------------------
|
|
@@ -29,25 +32,41 @@ ROOT = Path(__file__).parent.resolve()
|
|
| 29 |
REPO_DIR = ROOT / "HunyuanVideo-Foley"
|
| 30 |
WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
|
| 31 |
CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
|
| 32 |
-
OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs")))
|
| 33 |
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 34 |
|
| 35 |
SPACE_TITLE = "π΅ ShortiFoley β HunyuanVideo-Foley"
|
| 36 |
-
SPACE_TAGLINE = "
|
| 37 |
WATERMARK_NOTE = "Made with β€οΈ by bilsimaging.com"
|
| 38 |
|
| 39 |
-
# ZeroGPU limit
|
| 40 |
GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
|
| 41 |
|
| 42 |
-
# Globals
|
| 43 |
_model_dict = None
|
| 44 |
_cfg = None
|
| 45 |
_device: Optional[torch.device] = None
|
| 46 |
|
| 47 |
|
| 48 |
# ------------
|
| 49 |
-
# Small helpers
|
| 50 |
# ------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
def _ensure_repo() -> None:
|
| 52 |
"""Shallow-clone Tencent repo with LFS smudge disabled (avoid LFS quota checkout)."""
|
| 53 |
if REPO_DIR.exists():
|
|
@@ -86,30 +105,26 @@ def prepare_once() -> None:
|
|
| 86 |
# -----------------------
|
| 87 |
# Model load & inference
|
| 88 |
# -----------------------
|
| 89 |
-
def auto_load_models(
|
| 90 |
"""
|
| 91 |
-
Load HunyuanVideo-Foley + encoders on the
|
| 92 |
-
|
| 93 |
"""
|
| 94 |
global _model_dict, _cfg, _device
|
| 95 |
|
| 96 |
if _model_dict is not None and _cfg is not None:
|
| 97 |
return "β
Model already loaded."
|
| 98 |
|
| 99 |
-
#
|
| 100 |
-
|
| 101 |
-
return "β Load the model inside a GPU task first (use the Load button or run Generate)."
|
| 102 |
-
|
| 103 |
-
os.environ["HF_PREFER_SAFETENSORS"] = "1" # enforce again for safety
|
| 104 |
|
| 105 |
sys.path.append(str(REPO_DIR))
|
| 106 |
from hunyuanvideo_foley.utils.model_utils import load_model
|
| 107 |
|
| 108 |
-
_device =
|
| 109 |
logger.info("Loading HunyuanVideo-Foley model...")
|
| 110 |
logger.info(f"MODEL_PATH: {WEIGHTS_DIR}")
|
| 111 |
logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
|
| 112 |
-
logger.info(f"TARGET_DEVICE: {_device}")
|
| 113 |
|
| 114 |
try:
|
| 115 |
_model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
|
|
@@ -152,7 +167,7 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
|
|
| 152 |
|
| 153 |
def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
|
| 154 |
prompt: str) -> str:
|
| 155 |
-
"""Save WAV + MP4 in
|
| 156 |
# torchaudio expects [C, N]
|
| 157 |
if audio_tensor.ndim == 1:
|
| 158 |
audio_tensor = audio_tensor.unsqueeze(0)
|
|
@@ -207,12 +222,9 @@ def infer_single_video(
|
|
| 207 |
Generate Foley audio for an uploaded video (1β6 variants).
|
| 208 |
Returns: (list of output video paths, status message)
|
| 209 |
"""
|
| 210 |
-
#
|
| 211 |
-
device = torch.device("cuda:0")
|
| 212 |
-
|
| 213 |
-
# Lazy-load if needed on GPU
|
| 214 |
if _model_dict is None or _cfg is None:
|
| 215 |
-
msg = auto_load_models(
|
| 216 |
if not str(msg).startswith("β
"):
|
| 217 |
return [], f"β {msg}"
|
| 218 |
|
|
@@ -249,31 +261,17 @@ def infer_single_video(
|
|
| 249 |
return outs, f"β
Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
|
| 250 |
|
| 251 |
|
| 252 |
-
|
| 253 |
-
@spaces.GPU(duration=GPU_DURATION)
|
| 254 |
-
def gpu_load_models() -> str:
|
| 255 |
-
device = torch.device("cuda:0")
|
| 256 |
-
return auto_load_models(device)
|
| 257 |
-
|
| 258 |
-
|
| 259 |
# -------------
|
| 260 |
-
# Gradio UI (with MCP
|
| 261 |
# -------------
|
| 262 |
def _about_html() -> str:
|
| 263 |
return f"""
|
| 264 |
<div style="line-height:1.6">
|
| 265 |
<h2>About ShortiFoley</h2>
|
| 266 |
-
<p><b>ShortiFoley</b> turns short videos into realistic Foley sound
|
| 267 |
-
Powered by Tencentβs HunyuanVideo-Foley (SigLIP2 + CLAP), with autosave and an MCP server for automation
|
| 268 |
-
|
| 269 |
-
<p>
|
| 270 |
-
<a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a>,
|
| 271 |
-
built to streamline creative workflows across video, sound, and publishing.</p>
|
| 272 |
-
|
| 273 |
-
<p>ShortiFoley integrates seamlessly with automation tools like
|
| 274 |
-
<a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a>,
|
| 275 |
-
making it easy to plug into custom workflows and pipelines.</p>
|
| 276 |
-
|
| 277 |
|
| 278 |
<h3>Quick Steps</h3>
|
| 279 |
<ol>
|
|
@@ -293,9 +291,9 @@ making it easy to plug into custom workflows and pipelines.</p>
|
|
| 293 |
|
| 294 |
<h3>MCP & API</h3>
|
| 295 |
<p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see βAPI & MCPβ tab).
|
| 296 |
-
Perfect for pipelines and tools like <b>n8n</b>.</p>
|
|
|
|
| 297 |
|
| 298 |
-
|
| 299 |
</div>
|
| 300 |
"""
|
| 301 |
|
|
@@ -309,7 +307,7 @@ def create_ui() -> gr.Blocks:
|
|
| 309 |
.generate-btn button{ font-weight:800; border-radius:12px; padding:10px 18px;}
|
| 310 |
.minor-btn button{ border-radius:10px;}
|
| 311 |
.muted{ color:#64748b; }
|
| 312 |
-
.footer-text{
|
| 313 |
"""
|
| 314 |
with gr.Blocks(title="ShortiFoley β HunyuanVideo-Foley", css=css) as demo:
|
| 315 |
|
|
@@ -333,7 +331,7 @@ def create_ui() -> gr.Blocks:
|
|
| 333 |
samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
|
| 334 |
|
| 335 |
with gr.Row():
|
| 336 |
-
load_btn = gr.Button("βοΈ Load model", variant="secondary", elem_classes=["minor-btn"])
|
| 337 |
generate = gr.Button("π΅ Generate", variant="primary", elem_classes=["generate-btn"])
|
| 338 |
|
| 339 |
status = gr.Textbox(label="Status", interactive=False)
|
|
@@ -356,27 +354,47 @@ def create_ui() -> gr.Blocks:
|
|
| 356 |
outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
|
| 357 |
vis = []
|
| 358 |
for i in range(6):
|
| 359 |
-
if i < len(outs):
|
| 360 |
vis.append(gr.update(visible=True, value=outs[i]))
|
| 361 |
else:
|
| 362 |
-
vis.append(gr.update(visible=
|
| 363 |
-
|
|
|
|
|
|
|
| 364 |
|
| 365 |
-
|
| 366 |
fn=_process_and_update,
|
| 367 |
inputs=[video_input, text_input, guidance_scale, steps, samples],
|
| 368 |
-
outputs=[v1, v2, v3, v4, v5, v6, status],
|
| 369 |
api_name="/infer",
|
| 370 |
api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
|
| 371 |
)
|
| 372 |
|
| 373 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
load_btn.click(
|
| 375 |
-
fn=
|
| 376 |
inputs=[],
|
| 377 |
outputs=[status],
|
| 378 |
api_name="/load_model",
|
| 379 |
-
api_description="Load/initialize the ShortiFoley model and encoders (
|
| 380 |
)
|
| 381 |
|
| 382 |
# Toggle visibility based on variants
|
|
@@ -393,7 +411,7 @@ def create_ui() -> gr.Blocks:
|
|
| 393 |
samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
|
| 394 |
|
| 395 |
with gr.Tab("π Gallery"):
|
| 396 |
-
gr.Markdown("Latest generated videos (autosaved to `outputs/`).")
|
| 397 |
gallery = gr.Gallery(
|
| 398 |
value=_list_gallery(),
|
| 399 |
columns=3,
|
|
@@ -401,49 +419,50 @@ def create_ui() -> gr.Blocks:
|
|
| 401 |
label="Saved Results"
|
| 402 |
)
|
| 403 |
refresh = gr.Button("π Refresh Gallery")
|
| 404 |
-
|
| 405 |
-
def _refresh_gallery():
|
| 406 |
-
return gr.update(value=_list_gallery())
|
| 407 |
-
|
| 408 |
-
# Refresh via button
|
| 409 |
-
refresh.click(_refresh_gallery, outputs=[gallery])
|
| 410 |
-
# Also refresh after generation finishes
|
| 411 |
-
gen_evt.then(_refresh_gallery, inputs=None, outputs=[gallery])
|
| 412 |
|
| 413 |
with gr.Tab("API & MCP"):
|
| 414 |
-
gr.Markdown(
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
|
| 433 |
with gr.Tab("βΉοΈ About"):
|
| 434 |
gr.HTML(_about_html())
|
| 435 |
|
| 436 |
# Footer
|
| 437 |
-
gr.HTML(
|
| 438 |
-
|
| 439 |
-
<
|
| 440 |
-
|
| 441 |
-
|
|
|
|
|
|
|
|
|
|
| 442 |
|
| 443 |
# ---- REST + MCP endpoints (inside Blocks) ----
|
| 444 |
def _download_to_tmp(url: str) -> str:
|
| 445 |
try:
|
| 446 |
-
import requests
|
| 447 |
except Exception:
|
| 448 |
raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
|
| 449 |
r = requests.get(url, timeout=30)
|
|
@@ -479,9 +498,10 @@ def create_ui() -> gr.Blocks:
|
|
| 479 |
num_inference_steps: int = 50,
|
| 480 |
sample_nums: int = 1,
|
| 481 |
) -> Dict[str, List[str]]:
|
| 482 |
-
# Ensure model is ready (GPU-safe path)
|
| 483 |
if _model_dict is None or _cfg is None:
|
| 484 |
-
|
|
|
|
|
|
|
| 485 |
local = _normalize_video_input(video_url_or_b64)
|
| 486 |
outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
|
| 487 |
return {"videos": outs, "message": msg}
|
|
@@ -489,14 +509,14 @@ def create_ui() -> gr.Blocks:
|
|
| 489 |
@gr.api
|
| 490 |
def load_model_tool() -> str:
|
| 491 |
"""Ensure model is loaded on server (convenient for MCP/REST)."""
|
| 492 |
-
return
|
| 493 |
|
| 494 |
@gr.mcp.resource("shortifoley://status")
|
| 495 |
def shortifoley_status() -> str:
|
| 496 |
"""Return a simple readiness string for MCP clients."""
|
| 497 |
ready = _model_dict is not None and _cfg is not None
|
| 498 |
dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
|
| 499 |
-
return f"ShortiFoley status: {'ready' if ready else '
|
| 500 |
|
| 501 |
@gr.mcp.prompt()
|
| 502 |
def foley_prompt(name: str = "default") -> str:
|
|
@@ -506,9 +526,6 @@ def create_ui() -> gr.Blocks:
|
|
| 506 |
"Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
|
| 507 |
)
|
| 508 |
|
| 509 |
-
# IMPORTANT: Do NOT auto-load models here to avoid CUDA init in main process
|
| 510 |
-
demo.load(lambda: "Ready. Click 'Load model' or 'Generate' to start.", inputs=None, outputs=None)
|
| 511 |
-
|
| 512 |
return demo
|
| 513 |
|
| 514 |
|
|
@@ -519,7 +536,7 @@ def set_seeds(s: int = 1):
|
|
| 519 |
|
| 520 |
|
| 521 |
# -------------
|
| 522 |
-
# App bootstrap
|
| 523 |
# -------------
|
| 524 |
if __name__ == "__main__":
|
| 525 |
logger.remove()
|
|
@@ -529,7 +546,7 @@ if __name__ == "__main__":
|
|
| 529 |
logger.info("===== Application Startup =====\n")
|
| 530 |
prepare_once()
|
| 531 |
|
| 532 |
-
# Probe imports (early surfacing)
|
| 533 |
sys.path.append(str(REPO_DIR))
|
| 534 |
try:
|
| 535 |
from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process # noqa: F401
|
|
|
|
| 1 |
+
|
| 2 |
# Created by bilsimaging.com
|
| 3 |
|
| 4 |
import os
|
| 5 |
+
|
| 6 |
os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
|
| 7 |
|
| 8 |
import sys
|
| 9 |
import json
|
| 10 |
+
import uuid
|
| 11 |
+
import time
|
| 12 |
+
import shutil
|
| 13 |
import base64
|
| 14 |
import random
|
| 15 |
import tempfile
|
|
|
|
| 25 |
from huggingface_hub import snapshot_download
|
| 26 |
import spaces
|
| 27 |
|
|
|
|
| 28 |
# -------------------------
|
| 29 |
# Constants & configuration
|
| 30 |
# -------------------------
|
|
|
|
| 32 |
REPO_DIR = ROOT / "HunyuanVideo-Foley"
|
| 33 |
WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
|
| 34 |
CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
|
| 35 |
+
OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs" / "autosaved")))
|
| 36 |
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 37 |
|
| 38 |
SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
|
| 39 |
+
SPACE_TAGLINE = "Text/Video → Audio Foley · Created by bilsimaging.com"
|
| 40 |
WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
|
| 41 |
|
| 42 |
+
# ZeroGPU limit
|
| 43 |
GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
|
| 44 |
|
| 45 |
+
# Globals
|
| 46 |
_model_dict = None
|
| 47 |
_cfg = None
|
| 48 |
_device: Optional[torch.device] = None
|
| 49 |
|
| 50 |
|
| 51 |
# ------------
|
| 52 |
+
# Small helpers
|
| 53 |
# ------------
|
| 54 |
+
def _setup_device(pref: str = "cpu", gpu_id: int = 0) -> torch.device:
    """
    Translate a device preference string into a ``torch.device``.

    The mapping is purely name-based: any preference that starts with
    "cuda" becomes ``cuda:<gpu_id>`` (an index embedded in ``pref`` is not
    parsed), "mps" becomes the MPS device, and every other value falls
    back to CPU. No availability probe is made here.

    IMPORTANT: Do NOT query torch.cuda.is_available() in main/non-GPU
    processes on Stateless GPU Spaces. Only request CUDA when calling from
    a @spaces.GPU context.

    Args:
        pref: Requested backend name ("cuda...", "mps", anything else -> CPU).
        gpu_id: CUDA ordinal used when a CUDA device is requested.

    Returns:
        The selected ``torch.device``; the choice is also logged.
    """
    if pref.startswith("cuda"):
        spec = f"cuda:{gpu_id}"
    else:
        spec = "mps" if pref == "mps" else "cpu"

    chosen = torch.device(spec)
    logger.info(f"Using {chosen}")
    return chosen
|
| 68 |
+
|
| 69 |
+
|
| 70 |
def _ensure_repo() -> None:
|
| 71 |
"""Shallow-clone Tencent repo with LFS smudge disabled (avoid LFS quota checkout)."""
|
| 72 |
if REPO_DIR.exists():
|
|
|
|
| 105 |
# -----------------------
|
| 106 |
# Model load & inference
|
| 107 |
# -----------------------
|
| 108 |
+
def auto_load_models(device_str: str = "cpu") -> str:
|
| 109 |
"""
|
| 110 |
+
Load HunyuanVideo-Foley + encoders on the chosen device.
|
| 111 |
+
Use device_str="cuda" ONLY inside @spaces.GPU function to avoid CUDA init in main process.
|
| 112 |
"""
|
| 113 |
global _model_dict, _cfg, _device
|
| 114 |
|
| 115 |
if _model_dict is not None and _cfg is not None:
|
| 116 |
return "✅ Model already loaded."
|
| 117 |
|
| 118 |
+
# Make absolutely sure safetensors is preferred
|
| 119 |
+
os.environ["HF_PREFER_SAFETENSORS"] = "1"
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
sys.path.append(str(REPO_DIR))
|
| 122 |
from hunyuanvideo_foley.utils.model_utils import load_model
|
| 123 |
|
| 124 |
+
_device = _setup_device(device_str, 0)
|
| 125 |
logger.info("Loading HunyuanVideo-Foley model...")
|
| 126 |
logger.info(f"MODEL_PATH: {WEIGHTS_DIR}")
|
| 127 |
logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
|
|
|
|
| 128 |
|
| 129 |
try:
|
| 130 |
_model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
|
|
|
|
| 167 |
|
| 168 |
def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
|
| 169 |
prompt: str) -> str:
|
| 170 |
+
"""Save WAV + MP4 in autosaved/, add metadata with a soft watermark note."""
|
| 171 |
# torchaudio expects [C, N]
|
| 172 |
if audio_tensor.ndim == 1:
|
| 173 |
audio_tensor = audio_tensor.unsqueeze(0)
|
|
|
|
| 222 |
Generate Foley audio for an uploaded video (1β6 variants).
|
| 223 |
Returns: (list of output video paths, status message)
|
| 224 |
"""
|
| 225 |
+
# Lazy-load on GPU
|
|
|
|
|
|
|
|
|
|
| 226 |
if _model_dict is None or _cfg is None:
|
| 227 |
+
msg = auto_load_models(device_str="cuda")
|
| 228 |
if not str(msg).startswith("✅"):
|
| 229 |
return [], f"β {msg}"
|
| 230 |
|
|
|
|
| 261 |
return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
|
| 262 |
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
# -------------
|
| 265 |
+
# Gradio UI (with MCP+API inside the same app)
|
| 266 |
# -------------
|
| 267 |
def _about_html() -> str:
|
| 268 |
return f"""
|
| 269 |
<div style="line-height:1.6">
|
| 270 |
<h2>About ShortiFoley</h2>
|
| 271 |
+
<p><b>ShortiFoley</b> turns short videos into realistic Foley sound.<br/>
|
| 272 |
+
Powered by Tencentβs HunyuanVideo-Foley (SigLIP2 + CLAP), with autosave and an MCP server for automation
|
| 273 |
+
(<a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a> flows).</p>
|
| 274 |
+
<p><b>Created by <a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a></b></p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
<h3>Quick Steps</h3>
|
| 277 |
<ol>
|
|
|
|
| 291 |
|
| 292 |
<h3>MCP & API</h3>
|
| 293 |
<p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see βAPI & MCPβ tab).
|
| 294 |
+
Perfect for media-automation pipelines and tools like <b><a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a></b>.</p>
|
| 295 |
+
|
| 296 |
|
|
|
|
| 297 |
</div>
|
| 298 |
"""
|
| 299 |
|
|
|
|
| 307 |
.generate-btn button{ font-weight:800; border-radius:12px; padding:10px 18px;}
|
| 308 |
.minor-btn button{ border-radius:10px;}
|
| 309 |
.muted{ color:#64748b; }
|
| 310 |
+
.footer-text{ color:#64748b; text-align:center; padding:12px 0; font-size:.95rem; }
|
| 311 |
"""
|
| 312 |
with gr.Blocks(title="ShortiFoley β HunyuanVideo-Foley", css=css) as demo:
|
| 313 |
|
|
|
|
| 331 |
samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
|
| 332 |
|
| 333 |
with gr.Row():
|
| 334 |
+
load_btn = gr.Button("βοΈ Load model (CPU)", variant="secondary", elem_classes=["minor-btn"])
|
| 335 |
generate = gr.Button("π΅ Generate", variant="primary", elem_classes=["generate-btn"])
|
| 336 |
|
| 337 |
status = gr.Textbox(label="Status", interactive=False)
|
|
|
|
| 354 |
outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
|
| 355 |
vis = []
|
| 356 |
for i in range(6):
|
| 357 |
+
if outs and i < len(outs):
|
| 358 |
vis.append(gr.update(visible=True, value=outs[i]))
|
| 359 |
else:
|
| 360 |
+
vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
|
| 361 |
+
# Also refresh the gallery in this same event
|
| 362 |
+
new_gallery = _list_gallery()
|
| 363 |
+
return (*vis, msg, new_gallery)
|
| 364 |
|
| 365 |
+
generate.click(
|
| 366 |
fn=_process_and_update,
|
| 367 |
inputs=[video_input, text_input, guidance_scale, steps, samples],
|
| 368 |
+
outputs=[v1, v2, v3, v4, v5, v6, status], # updated below to include gallery via .then-like merge
|
| 369 |
api_name="/infer",
|
| 370 |
api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
|
| 371 |
)
|
| 372 |
|
| 373 |
+
# Workaround: extend outputs to include gallery refresh using a wrapper
|
| 374 |
+
def _process_and_update_with_gallery(video_file, text_prompt, cfg, nsteps, nsamples):
    """
    Run inference and build the UI updates for the six video slots plus status.

    Args:
        video_file: Video input forwarded unchanged to ``infer_single_video``.
        text_prompt: Text prompt forwarded unchanged to ``infer_single_video``.
        cfg: Guidance scale forwarded unchanged to ``infer_single_video``.
        nsteps: Inference step count forwarded unchanged.
        nsamples: Number of variants forwarded unchanged.

    Returns:
        A 7-tuple: one ``gr.update`` per video slot (six in total) followed by
        the status message. This matches the seven components listed in the
        ``outputs`` of the click binding that uses this handler. The gallery
        is not one of those outputs, so no gallery listing is computed or
        returned here.
    """
    # NOTE(review): the visible code binds generate.click twice (once to
    # _process_and_update, once to this function), which would run inference
    # twice per click -- confirm and drop one of the bindings.
    outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
    results = outs or []

    updates = []
    for slot in range(6):
        if slot < len(results):
            updates.append(gr.update(visible=True, value=results[slot]))
        else:
            # Unused slot: clear its value; only the first player stays visible.
            updates.append(gr.update(visible=(slot == 0), value=None))

    return (*updates, msg)
|
| 384 |
+
|
| 385 |
+
# Re-bind with gallery as extra output
|
| 386 |
+
generate.click(
|
| 387 |
+
fn=_process_and_update_with_gallery,
|
| 388 |
+
inputs=[video_input, text_input, guidance_scale, steps, samples],
|
| 389 |
+
outputs=[v1, v2, v3, v4, v5, v6, status,], # gallery will be refreshed on Gallery tab itself
|
| 390 |
+
)
|
| 391 |
+
|
| 392 |
load_btn.click(
|
| 393 |
+
fn=lambda: auto_load_models(device_str="cpu"),
|
| 394 |
inputs=[],
|
| 395 |
outputs=[status],
|
| 396 |
api_name="/load_model",
|
| 397 |
+
api_description="Load/initialize the ShortiFoley model and encoders on CPU (GPU loads during inference)."
|
| 398 |
)
|
| 399 |
|
| 400 |
# Toggle visibility based on variants
|
|
|
|
| 411 |
samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
|
| 412 |
|
| 413 |
with gr.Tab("π Gallery"):
|
| 414 |
+
gr.Markdown("Latest generated videos (autosaved to `outputs/autosaved/`).")
|
| 415 |
gallery = gr.Gallery(
|
| 416 |
value=_list_gallery(),
|
| 417 |
columns=3,
|
|
|
|
| 419 |
label="Saved Results"
|
| 420 |
)
|
| 421 |
refresh = gr.Button("π Refresh Gallery")
|
| 422 |
+
refresh.click(lambda: _list_gallery(), outputs=[gallery])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
with gr.Tab("API & MCP"):
|
| 425 |
+
gr.Markdown("""
|
| 426 |
+
### REST examples
|
| 427 |
+
|
| 428 |
+
**POST** `/api_generate_from_url`
|
| 429 |
+
```json
|
| 430 |
+
{
|
| 431 |
+
"video_url_or_b64": "https://yourhost/sample.mp4",
|
| 432 |
+
"text_prompt": "metallic clink; hollow room reverb",
|
| 433 |
+
"guidance_scale": 4.5,
|
| 434 |
+
"num_inference_steps": 50,
|
| 435 |
+
"sample_nums": 2
|
| 436 |
+
}
|
| 437 |
+
```
|
| 438 |
+
|
| 439 |
+
**POST** `/load_model_tool`
|
| 440 |
+
Loads the model proactively (useful before batch runs).
|
| 441 |
+
|
| 442 |
+
**MCP resources & prompt**
|
| 443 |
+
- `shortifoley://status` β quick health info
|
| 444 |
+
- `foley_prompt` β reusable guidance for describing the sound
|
| 445 |
+
|
| 446 |
+
Works great with media-automation in tools like **n8n**: call `load_model_tool` once, then `api_generate_from_url` for each clip.
|
| 447 |
+
""")
|
| 448 |
|
| 449 |
with gr.Tab("βΉοΈ About"):
|
| 450 |
gr.HTML(_about_html())
|
| 451 |
|
| 452 |
# Footer
|
| 453 |
+
gr.HTML(
|
| 454 |
+
"""
|
| 455 |
+
<div class="footer-text">
|
| 456 |
+
π Created by <a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a>
|
| 457 |
+
· Powered by HunyuanVideo-Foley
|
| 458 |
+
</div>
|
| 459 |
+
"""
|
| 460 |
+
)
|
| 461 |
|
| 462 |
# ---- REST + MCP endpoints (inside Blocks) ----
|
| 463 |
def _download_to_tmp(url: str) -> str:
|
| 464 |
try:
|
| 465 |
+
import requests
|
| 466 |
except Exception:
|
| 467 |
raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
|
| 468 |
r = requests.get(url, timeout=30)
|
|
|
|
| 498 |
num_inference_steps: int = 50,
|
| 499 |
sample_nums: int = 1,
|
| 500 |
) -> Dict[str, List[str]]:
|
|
|
|
| 501 |
if _model_dict is None or _cfg is None:
|
| 502 |
+
msg = auto_load_models(device_str="cpu") # safe in HTTP context; GPU will be used inside infer
|
| 503 |
+
if not str(msg).startswith("✅"):
|
| 504 |
+
raise RuntimeError(msg)
|
| 505 |
local = _normalize_video_input(video_url_or_b64)
|
| 506 |
outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
|
| 507 |
return {"videos": outs, "message": msg}
|
|
|
|
| 509 |
@gr.api
def load_model_tool() -> str:
    """
    Ask the server to load the model ahead of time (convenient for MCP/REST).

    Returns:
        The status string produced by ``auto_load_models``.
    """
    # NOTE(review): auto_load_models returns early once a model is cached, and
    # the visible lazy-load in infer_single_video only requests "cuda" when
    # nothing is loaded yet -- so a CPU preload here may leave the model on
    # CPU for later inference. Confirm against the rest of infer_single_video.
    status = auto_load_models(device_str="cpu")
    return status
|
| 513 |
|
| 514 |
@gr.mcp.resource("shortifoley://status")
def shortifoley_status() -> str:
    """
    Build a one-line readiness summary for MCP clients.

    Returns:
        A string with three fields: "ready" when both the model dict and the
        config are set (otherwise "loading"), the device type recorded in
        ``_device`` ("cuda" or "mps", with "cpu" for anything else including
        an unset device), and the outputs directory.
    """
    if _model_dict is None or _cfg is None:
        state = "loading"
    else:
        state = "ready"

    # Only "cuda" and "mps" are reported by name; everything else reads "cpu".
    kind = _device.type if _device else None
    backend = kind if kind in ("cuda", "mps") else "cpu"

    return f"ShortiFoley status: {state} | device={backend} | outputs={OUTPUTS_DIR}"
|
| 520 |
|
| 521 |
@gr.mcp.prompt()
|
| 522 |
def foley_prompt(name: str = "default") -> str:
|
|
|
|
| 526 |
"Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
|
| 527 |
)
|
| 528 |
|
|
|
|
|
|
|
|
|
|
| 529 |
return demo
|
| 530 |
|
| 531 |
|
|
|
|
| 536 |
|
| 537 |
|
| 538 |
# -------------
|
| 539 |
+
# App bootstrap
|
| 540 |
# -------------
|
| 541 |
if __name__ == "__main__":
|
| 542 |
logger.remove()
|
|
|
|
| 546 |
logger.info("===== Application Startup =====\n")
|
| 547 |
prepare_once()
|
| 548 |
|
| 549 |
+
# Probe imports (early surfacing)
|
| 550 |
sys.path.append(str(REPO_DIR))
|
| 551 |
try:
|
| 552 |
from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process # noqa: F401
|