Bils committed on
Commit 0ff82ef · verified · 1 parent: e7621f8

Update app.py

Files changed (1):
  1. app.py +95 -92

app.py CHANGED
@@ -2,6 +2,9 @@
 # Created by bilsimaging.com
 
 import os
+# ---- Prefer safetensors for all HF model loads (fixes CLAP .bin crash on ZeroGPU) ----
+os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
+
 import sys
 import io
 import json
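A note on ordering here: `os.environ.setdefault` only writes the variable when it is not already set, and libraries typically read flags like this once at import time, so the call has to sit above any `transformers` import. A minimal sketch of the semantics (standard library only; the variable name is the one this commit introduces):

```python
import os

os.environ.pop("HF_PREFER_SAFETENSORS", None)        # start clean for the demo
os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")  # unset -> written
assert os.environ["HF_PREFER_SAFETENSORS"] == "1"

# A pre-existing value wins: setdefault does not overwrite it.
os.environ["HF_PREFER_SAFETENSORS"] = "0"
os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
assert os.environ["HF_PREFER_SAFETENSORS"] == "0"
```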
@@ -60,7 +63,10 @@ def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
         d = torch.device("cpu")
     else:
         d = torch.device(pref)
-    logger.info(f"Using CUDA {d}" if d.type == "cuda" else f"Using {d}")
+    if d.type == "cuda":
+        logger.info(f"Using CUDA {d}")
+    else:
+        logger.info(f"Using {d}")
     return d
 
 
@@ -89,7 +95,7 @@ def _download_weights_if_needed() -> None:
             "synchformer_state_dict.pth",
             "vae_128d_48k.pth",
             "assets/*",
-            "config.yaml",  # harmless
+            "config.yaml",
         ],
     )
 
@@ -105,12 +111,16 @@ def prepare_once() -> None:
 def auto_load_models() -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
+    Ensures safetensors is preferred to avoid ZeroGPU issues with .bin checkpoints.
     """
     global _model_dict, _cfg, _device
 
     if _model_dict is not None and _cfg is not None:
         return "Model already loaded."
 
+    # Ensure Transformers prefers safetensors for everything:
+    os.environ["HF_PREFER_SAFETENSORS"] = "1"
+
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
 
@@ -122,8 +132,19 @@ def auto_load_models() -> str:
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
         return "✅ Model loaded."
+    except OSError as e:
+        # An OSError here often comes from reading pytorch_model.bin; retry after enforcing safetensors.
+        logger.error(str(e))
+        logger.info("Retrying load after enforcing safetensors preference...")
+        os.environ["HF_PREFER_SAFETENSORS"] = "1"
+        try:
+            _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
+            return "✅ Model loaded (after safetensors retry)."
+        except Exception as e2:
+            logger.error(str(e2))
+            return f"❌ Failed to load model: {e2}"
     except Exception as e:
-        logger.error(e)
+        logger.error(str(e))
        return f"❌ Failed to load model: {e}"
 
 
@@ -134,7 +155,6 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
         from hunyuanvideo_foley.utils.media_utils import merge_audio_video
         merge_audio_video(audio_path, video_path, out_path)
     except Exception as e:
-        # Fallback: plain ffmpeg merge (assumes same duration or lets ffmpeg handle)
         logger.warning(f"merge_audio_video failed, falling back to ffmpeg: {e}")
         import subprocess
         cmd = [
@@ -242,89 +262,8 @@ def infer_single_video(
     return outs, f"✅ Generated {len(outs)} result(s). Saved to {OUTPUTS_DIR}/"
 
 
-# ---------------
-# MCP-only APIs
-# ---------------
-def _download_to_tmp(url: str) -> str:
-    """Download a remote file to temp."""
-    try:
-        import requests
-    except Exception:
-        raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
-
-    r = requests.get(url, timeout=30)
-    r.raise_for_status()
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    tmp.write(r.content)
-    tmp.flush()
-    tmp.close()
-    return tmp.name
-
-
-def _maybe_from_base64(data_url_or_b64: str) -> str:
-    """Accept data: URLs or raw base64; returns temp file path."""
-    b64 = data_url_or_b64
-    if data_url_or_b64.startswith("data:"):
-        b64 = data_url_or_b64.split(",", 1)[-1]
-    raw = base64.b64decode(b64)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-    tmp.write(raw)
-    tmp.flush()
-    tmp.close()
-    return tmp.name
-
-
-def _normalize_video_input(video_url_or_b64: str) -> str:
-    v = (video_url_or_b64 or "").strip()
-    if v.startswith("http://") or v.startswith("https://"):
-        return _download_to_tmp(v)
-    return _maybe_from_base64(v)
-
-
-with gr.Blocks() as mcp_only_endpoints:
-    gr.Markdown("These endpoints are MCP/API only and have no visible UI.", show_label=False)
-
-    @gr.api
-    def api_generate_from_url(
-        video_url_or_b64: str,
-        text_prompt: str = "",
-        guidance_scale: float = 4.5,
-        num_inference_steps: int = 50,
-        sample_nums: int = 1,
-    ) -> Dict[str, List[str]]:
-        """
-        Generate Foley from a remote video URL or base64-encoded video.
-        Returns: {"videos": [paths], "message": str}
-        """
-        if _model_dict is None or _cfg is None:
-            raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
-        local = _normalize_video_input(video_url_or_b64)
-        outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
-        return {"videos": outs, "message": msg}
-
-    @gr.api
-    def load_model_tool() -> str:
-        """Ensure model is loaded on server (MCP convenience)."""
-        return auto_load_models()
-
-    @gr.mcp.resource("shortifoley://status")
-    def shortifoley_status() -> str:
-        """Return a simple readiness string for MCP clients."""
-        ready = _model_dict is not None and _cfg is not None
-        dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
-        return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
-
-    @gr.mcp.prompt()
-    def foley_prompt(name: str = "default") -> str:
-        """Reusable guidance for describing sound ambience."""
-        return (
-            "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
-            "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
-        )
-
-
 # -------------
-# Gradio UI
+# Gradio UI (with MCP+API inside the same app)
 # -------------
 def _about_html() -> str:
     return f"""
@@ -407,13 +346,12 @@ def create_ui() -> gr.Blocks:
                 vis_updates.append(gr.update(visible=True, value=outs[i]))
             else:
                 vis_updates.append(gr.update(visible=False, value=None))
-        gal_items = _list_gallery()
-        return (*vis_updates, msg, gr.update(value=gal_items))
+        return (*vis_updates, msg)
 
     generate.click(
         fn=_process_and_update,
         inputs=[video_input, text_input, guidance_scale, steps, samples],
-        outputs=[v1, v2, v3, v4, v5, v6, status, ],
+        outputs=[v1, v2, v3, v4, v5, v6, status],
         api_name="/infer",
         api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
     )
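For reference, the named `/infer` endpoint can be exercised with `gradio_client`. A minimal sketch, assuming the Space id `Bils/ShortiFoley` and the positional order of `inputs=[...]` above; both are assumptions, not quoted from this commit:

```python
from gradio_client import Client, handle_file

# Assumptions: the Space id, and that the five positional arguments mirror
# inputs=[video_input, text_input, guidance_scale, steps, samples].
client = Client("Bils/ShortiFoley")
result = client.predict(
    handle_file("clip.mp4"),       # video to add Foley to
    "rain tapping on a tin roof",  # text prompt
    4.5,                           # guidance scale
    50,                            # inference steps
    1,                             # number of samples
    api_name="/infer",
)
print(result)  # up to 6 video paths plus the status message
```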
@@ -445,9 +383,76 @@ def create_ui() -> gr.Blocks:
     with gr.Tab("ℹ️ About"):
         gr.HTML(_about_html())
 
-    # Also expose gallery update after generate
+    # Keep gallery in sync after generate
     generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
 
+    # -----------------------
+    # MCP + REST API endpoints
+    # -----------------------
+    def _download_to_tmp(url: str) -> str:
+        try:
+            import requests
+        except Exception:
+            raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
+        r = requests.get(url, timeout=30)
+        r.raise_for_status()
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+        tmp.write(r.content)
+        tmp.flush()
+        tmp.close()
+        return tmp.name
+
+    def _maybe_from_base64(data_url_or_b64: str) -> str:
+        b64 = data_url_or_b64
+        if data_url_or_b64.startswith("data:"):
+            b64 = data_url_or_b64.split(",", 1)[-1]
+        raw = base64.b64decode(b64)
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+        tmp.write(raw)
+        tmp.flush()
+        tmp.close()
+        return tmp.name
+
+    def _normalize_video_input(video_url_or_b64: str) -> str:
+        v = (video_url_or_b64 or "").strip()
+        if v.startswith("http://") or v.startswith("https://"):
+            return _download_to_tmp(v)
+        return _maybe_from_base64(v)
+
+    @gr.api
+    def api_generate_from_url(
+        video_url_or_b64: str,
+        text_prompt: str = "",
+        guidance_scale: float = 4.5,
+        num_inference_steps: int = 50,
+        sample_nums: int = 1,
+    ) -> Dict[str, List[str]]:
+        if _model_dict is None or _cfg is None:
+            raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
+        local = _normalize_video_input(video_url_or_b64)
+        outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+        return {"videos": outs, "message": msg}
+
+    @gr.api
+    def load_model_tool() -> str:
+        """Ensure model is loaded on server (MCP convenience)."""
+        return auto_load_models()
+
+    @gr.mcp.resource("shortifoley://status")
+    def shortifoley_status() -> str:
+        """Return a simple readiness string for MCP clients."""
+        ready = _model_dict is not None and _cfg is not None
+        dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+        return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
+
+    @gr.mcp.prompt()
+    def foley_prompt(name: str = "default") -> str:
+        """Reusable guidance for describing sound ambience."""
+        return (
+            "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
+            "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
+        )
+
     return demo
 
 
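`api_generate_from_url` accepts an `http(s)` URL, raw base64, or a `data:` URL, per `_normalize_video_input`. A small sketch of building the `data:` form from a local file (the filename is illustrative):

```python
import base64

# Encode a local clip into the data: URL shape _maybe_from_base64 understands.
with open("clip.mp4", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("ascii")
payload = f"data:video/mp4;base64,{b64}"
# payload can now be passed as video_url_or_b64 to api_generate_from_url.
```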
@@ -485,8 +490,6 @@ if __name__ == "__main__":
     logger.info(msg)
 
     ui = create_ui()
-    # Mount MCP-only endpoints alongside the UI
-    ui.blocks.append(mcp_only_endpoints)
 
     # Enable MCP server so tools/resources/prompts are discoverable
     ui.launch(
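The `ui.launch(` call is truncated in this view. For context, recent Gradio releases enable the built-in MCP server through a launch flag; a hedged sketch of the pattern (the flag usage below is an assumption about this call, not quoted from the diff):

```python
# Hedged sketch: enabling Gradio's built-in MCP server at launch.
# mcp_server=True is the documented flag in recent Gradio 5.x; the other
# arguments of this app's actual launch() call are not shown in the diff.
ui.launch(mcp_server=True)
```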