Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -2,6 +2,9 @@
 # Created by bilsimaging.com

 import os
+# ---- Prefer safetensors for all HF model loads (fixes CLAP .bin crash on ZeroGPU) ----
+os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
+
 import sys
 import io
 import json
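Note on ordering: the preference only helps if it is in the environment before any library that consults it loads weights, which is why the commit sets it immediately after `import os`. A minimal sketch of the pattern; the model id and the `use_safetensors` equivalence are illustrative assumptions, not part of this commit:

import os

# Must run before transformers/diffusers are imported, so any code that reads
# the variable at import or load time sees it. setdefault() preserves a value
# the Space operator may already have exported.
os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")

from transformers import AutoModel  # imported only after the env var is set

# Explicit per-call equivalent in transformers; the repo id is a placeholder.
model = AutoModel.from_pretrained("laion/clap-htsat-unfused", use_safetensors=True)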
@@ -60,7 +63,10 @@ def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
         d = torch.device("cpu")
     else:
         d = torch.device(pref)
-
+    if d.type == "cuda":
+        logger.info(f"Using CUDA {d}")
+    else:
+        logger.info(f"Using {d}")
     return d

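For context, the selector presumably resolves "auto" to the best available backend before the new logging runs. A hedged reconstruction of the whole function — only the tail above appears in the diff; the "auto" branch and the loguru-style `logger` are assumptions:

import torch
from loguru import logger  # assumption: the app's `logger` is loguru

def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
    # Resolve "auto" to CUDA, then MPS, then CPU; otherwise honor `pref`.
    if pref == "auto":
        if torch.cuda.is_available():
            d = torch.device(f"cuda:{gpu_id}")
        elif torch.backends.mps.is_available():
            d = torch.device("mps")
        else:
            d = torch.device("cpu")
    else:
        d = torch.device(pref)
    if d.type == "cuda":
        logger.info(f"Using CUDA {d}")
    else:
        logger.info(f"Using {d}")
    return d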
@@ -89,7 +95,7 @@ def _download_weights_if_needed() -> None:
             "synchformer_state_dict.pth",
             "vae_128d_48k.pth",
             "assets/*",
-            "config.yaml",
+            "config.yaml",
         ],
     )

@@ -105,12 +111,16 @@ def prepare_once() -> None:
 def auto_load_models() -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
+    Ensures safetensors is preferred to avoid ZeroGPU issues with .bin checkpoints.
     """
     global _model_dict, _cfg, _device

     if _model_dict is not None and _cfg is not None:
         return "Model already loaded."

+    # Ensure Transformers prefers safetensors for everything:
+    os.environ["HF_PREFER_SAFETENSORS"] = "1"
+
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model

@@ -122,8 +132,19 @@ def auto_load_models() -> str:
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
         return "✅ Model loaded."
+    except OSError as e:
+        # If any OSError (often from trying to read pytorch_model.bin), retry after enforcing safetensors.
+        logger.error(str(e))
+        logger.info("Retrying load after enforcing safetensors preference...")
+        os.environ["HF_PREFER_SAFETENSORS"] = "1"
+        try:
+            _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
+            return "✅ Model loaded (after safetensors retry)."
+        except Exception as e2:
+            logger.error(str(e2))
+            return f"❌ Failed to load model: {e2}"
     except Exception as e:
-        logger.error(e)
+        logger.error(str(e))
         return f"❌ Failed to load model: {e}"

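Since the loader reports failures as status strings instead of raising, callers can treat the return value as a readiness signal. A hypothetical smoke test of that contract:

# Hypothetical check: auto_load_models() never raises here; it encodes
# success or failure in the returned message.
status = auto_load_models()
print(status)  # e.g. "✅ Model loaded." / "❌ Failed to load model: ..."
ok = status.startswith("✅") or status == "Model already loaded."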
@@ -134,7 +155,6 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
         from hunyuanvideo_foley.utils.media_utils import merge_audio_video
         merge_audio_video(audio_path, video_path, out_path)
     except Exception as e:
-        # Fallback: plain ffmpeg merge (assumes same duration or lets ffmpeg handle)
         logger.warning(f"merge_audio_video failed, falling back to ffmpeg: {e}")
         import subprocess
         cmd = [
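The `cmd = [` list is cut off by the hunk boundary. For readers, a hypothetical ffmpeg mux matching the removed comment's intent — stream-copy the video, encode the generated audio, and let `-shortest` reconcile durations; none of these exact flags are confirmed by the diff:

import subprocess

def _ffmpeg_merge(audio_path: str, video_path: str, out_path: str) -> None:
    # Hypothetical reconstruction of a plain ffmpeg merge fallback.
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy",               # keep the video stream as-is
        "-c:a", "aac",                # encode the generated Foley track
        "-map", "0:v:0", "-map", "1:a:0",
        "-shortest",                  # trim to the shorter stream
        out_path,
    ]
    subprocess.run(cmd, check=True)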
@@ -242,89 +262,8 @@ def infer_single_video(
     return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"


-# ---------------
-# MCP-only APIs
-# ---------------
-def _download_to_tmp(url: str) -> str:
-    """Download a remote file to temp."""
-    try:
-        import requests
-    except Exception:
-        raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
-
-    r = requests.get(url, timeout=30)
-    r.raise_for_status()
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    tmp.write(r.content)
-    tmp.flush()
-    tmp.close()
-    return tmp.name
-
-
-def _maybe_from_base64(data_url_or_b64: str) -> str:
-    """Accept data: URLs or raw base64; returns temp file path."""
-    b64 = data_url_or_b64
-    if data_url_or_b64.startswith("data:"):
-        b64 = data_url_or_b64.split(",", 1)[-1]
-    raw = base64.b64decode(b64)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    tmp.write(raw)
-    tmp.flush()
-    tmp.close()
-    return tmp.name
-
-
-def _normalize_video_input(video_url_or_b64: str) -> str:
-    v = (video_url_or_b64 or "").strip()
-    if v.startswith("http://") or v.startswith("https://"):
-        return _download_to_tmp(v)
-    return _maybe_from_base64(v)
-
-
-with gr.Blocks() as mcp_only_endpoints:
-    gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)
-
-    @gr.api
-    def api_generate_from_url(
-        video_url_or_b64: str,
-        text_prompt: str = "",
-        guidance_scale: float = 4.5,
-        num_inference_steps: int = 50,
-        sample_nums: int = 1,
-    ) -> Dict[str, List[str]]:
-        """
-        Generate Foley from a remote video URL or base64-encoded video.
-        Returns: {"videos": [paths], "message": str}
-        """
-        if _model_dict is None or _cfg is None:
-            raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
-        local = _normalize_video_input(video_url_or_b64)
-        outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
-        return {"videos": outs, "message": msg}
-
-    @gr.api
-    def load_model_tool() -> str:
-        """Ensure model is loaded on server (MCP convenience)."""
-        return auto_load_models()
-
-    @gr.mcp.resource("shortifoley://status")
-    def shortifoley_status() -> str:
-        """Return a simple readiness string for MCP clients."""
-        ready = _model_dict is not None and _cfg is not None
-        dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
-        return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
-
-    @gr.mcp.prompt()
-    def foley_prompt(name: str = "default") -> str:
-        """Reusable guidance for describing sound ambience."""
-        return (
-            "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
-            "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
-        )
-
-
 # -------------
-# Gradio UI
+# Gradio UI (with MCP+API inside the same app)
 # -------------
 def _about_html() -> str:
     return f"""
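The helpers removed here reappear inside `create_ui()` further down; their input contract is unchanged: a URL, a raw base64 string, or a `data:` URL. An illustrative client-side encoding for the base64 path (`clip.mp4` is a stand-in path):

# Build a payload that _maybe_from_base64 accepts; both raw base64 and
# data: URLs are handled on the server side.
import base64

with open("clip.mp4", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("ascii")

payload = f"data:video/mp4;base64,{b64}"  # or send `b64` directly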
@@ -407,13 +346,12 @@ def create_ui() -> gr.Blocks:
                     vis_updates.append(gr.update(visible=True, value=outs[i]))
                 else:
                     vis_updates.append(gr.update(visible=False, value=None))
-
-            return (*vis_updates, msg, gr.update(value=gal_items))
+            return (*vis_updates, msg)

         generate.click(
             fn=_process_and_update,
             inputs=[video_input, text_input, guidance_scale, steps, samples],
-            outputs=[v1, v2, v3, v4, v5, v6, status
+            outputs=[v1, v2, v3, v4, v5, v6, status],
             api_name="/infer",
             api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
         )
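With the missing `],` restored, the named `/infer` endpoint becomes callable again. A sketch of a client call, assuming a published Space and the argument order of the `inputs` list above; the Space id and file are placeholders:

from gradio_client import Client, handle_file

client = Client("user/space-name")  # placeholder Space id
result = client.predict(
    handle_file("clip.mp4"),   # video_input
    "rain on a tin roof",      # text_input
    4.5,                       # guidance_scale
    50,                        # steps
    1,                         # samples
    api_name="/infer",
)
print(result)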
@@ -445,9 +383,76 @@ def create_ui() -> gr.Blocks:
         with gr.Tab("ℹ️ About"):
             gr.HTML(_about_html())

-        #
+        # Keep gallery in sync after generate
         generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])

+        # -----------------------
+        # MCP + REST API endpoints
+        # -----------------------
+        def _download_to_tmp(url: str) -> str:
+            try:
+                import requests
+            except Exception:
+                raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
+            r = requests.get(url, timeout=30)
+            r.raise_for_status()
+            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp.write(r.content)
+            tmp.flush()
+            tmp.close()
+            return tmp.name
+
+        def _maybe_from_base64(data_url_or_b64: str) -> str:
+            b64 = data_url_or_b64
+            if data_url_or_b64.startswith("data:"):
+                b64 = data_url_or_b64.split(",", 1)[-1]
+            raw = base64.b64decode(b64)
+            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp.write(raw)
+            tmp.flush()
+            tmp.close()
+            return tmp.name
+
+        def _normalize_video_input(video_url_or_b64: str) -> str:
+            v = (video_url_or_b64 or "").strip()
+            if v.startswith("http://") or v.startswith("https://"):
+                return _download_to_tmp(v)
+            return _maybe_from_base64(v)
+
+        @gr.api
+        def api_generate_from_url(
+            video_url_or_b64: str,
+            text_prompt: str = "",
+            guidance_scale: float = 4.5,
+            num_inference_steps: int = 50,
+            sample_nums: int = 1,
+        ) -> Dict[str, List[str]]:
+            if _model_dict is None or _cfg is None:
+                raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
+            local = _normalize_video_input(video_url_or_b64)
+            outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+            return {"videos": outs, "message": msg}
+
+        @gr.api
+        def load_model_tool() -> str:
+            """Ensure model is loaded on server (MCP convenience)."""
+            return auto_load_models()
+
+        @gr.mcp.resource("shortifoley://status")
+        def shortifoley_status() -> str:
+            """Return a simple readiness string for MCP clients."""
+            ready = _model_dict is not None and _cfg is not None
+            dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+            return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
+
+        @gr.mcp.prompt()
+        def foley_prompt(name: str = "default") -> str:
+            """Reusable guidance for describing sound ambience."""
+            return (
+                "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
+                "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
+            )
+
     return demo

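Endpoints declared with `@gr.api` inside `create_ui()` should be reachable by name once the app launches. A hedged client sketch; it assumes endpoint names default to the function names and the Space id is a placeholder:

from gradio_client import Client

client = Client("user/space-name")                  # placeholder Space id
print(client.predict(api_name="/load_model_tool"))  # warm the model first
out = client.predict(
    "https://example.com/clip.mp4",                 # URL or base64 video
    "footsteps on gravel",
    4.5, 50, 1,
    api_name="/api_generate_from_url",
)
print(out)  # expected shape per the function: {"videos": [...], "message": "..."}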
@@ -485,8 +490,6 @@ if __name__ == "__main__":
     logger.info(msg)

     ui = create_ui()
-    # Mount MCP-only endpoints alongside the UI
-    ui.blocks.append(mcp_only_endpoints)

     # Enable MCP server so tools/resources/prompts are discoverable
     ui.launch(
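The `ui.launch(` call is truncated by the hunk boundary; the comment above it implies Gradio's MCP mode, along the lines of this sketch. The exact kwargs used by the Space are not shown, so everything beyond `mcp_server` is a guess:

# Sketch of the implied launch configuration.
ui.launch(
    mcp_server=True,  # publish @gr.api functions and gr.mcp.* registrations over MCP
    show_api=True,    # keep the REST/API view of the same endpoints (assumption)
)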
|