Bils committed · Commit aa644be (verified) · Parent: 6e08050

Update app.py

Files changed (1): app.py (+66 −66)
app.py CHANGED
@@ -1,15 +1,10 @@
-
 # Created by bilsimaging.com
 
 import os
-
 os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 
 import sys
 import json
-import uuid
-import time
-import shutil
 import base64
 import random
 import tempfile
@@ -32,15 +27,17 @@ ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = Path(os.environ.get("HIFI_FOLEY_MODEL_PATH", str(ROOT / "weights")))
 CONFIG_PATH = Path(os.environ.get("HIFI_FOLEY_CONFIG", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")))
+
+# Always save into outputs/autosaved/
 OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs" / "autosaved")))
 OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 
-SPACE_TITLE = "🎵 Shorti Foley Sound— HunyuanVideo-Foley"
-SPACE_TAGLINE = "Bring your videos to life with AI-powered Foley Sound"
+SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
+SPACE_TAGLINE = "Text/Video Audio Foley · Created by bilsimaging.com"
 WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
 
-# ZeroGPU limit
-GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "120"))
+# ZeroGPU limit (<=120s recommended)
+GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
 
 # Globals
 _model_dict = None
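For context on `GPU_DURATION`: on ZeroGPU Spaces it is typically passed to the `spaces.GPU` decorator. A minimal sketch, assuming the decorator is applied to the inference entry point elsewhere in app.py (not visible in these hunks):

```python
import spaces  # Hugging Face ZeroGPU helper

@spaces.GPU(duration=GPU_DURATION)  # reserve a GPU slot for at most GPU_DURATION seconds
def infer_single_video_gpu(*args, **kwargs):
    # On Stateless GPU Spaces, CUDA may only be initialized inside a decorated call
    return infer_single_video(*args, **kwargs)
```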
@@ -53,9 +50,9 @@ _device: Optional[torch.device] = None
 # ------------
 def _setup_device(pref: str = "cpu", gpu_id: int = 0) -> torch.device:
     """
-    Pick device safely.
-    IMPORTANT: Do NOT query torch.cuda.is_available() in main/non-GPU processes
-    on Stateless GPU Spaces. Only set CUDA when called from a @spaces.GPU context.
+    Safe device picker.
+    IMPORTANT: Do NOT probe torch.cuda.is_available() here on Stateless GPU Spaces.
+    Only request CUDA inside a @spaces.GPU function.
     """
     if pref.startswith("cuda"):
         d = torch.device(f"cuda:{gpu_id}")
@@ -105,10 +102,30 @@ def prepare_once() -> None:
 # -----------------------
 # Model load & inference
 # -----------------------
+def _force_fp32_on_modules(obj):
+    """Ensure every torch.nn.Module inside obj is float32 to avoid half/float mismatches."""
+    try:
+        import torch.nn as nn
+        for name in dir(obj):
+            try:
+                m = getattr(obj, name)
+            except Exception:
+                continue
+            if isinstance(m, nn.Module):
+                m.float()
+        if hasattr(obj, "foley_model"): obj.foley_model.float()
+        if hasattr(obj, "dac_model"): obj.dac_model.float()
+        if hasattr(obj, "siglip2_model"): obj.siglip2_model.float()
+        if hasattr(obj, "clap_model"): obj.clap_model.float()
+        if hasattr(obj, "syncformer_model"): obj.syncformer_model.float()
+    except Exception as e:
+        logger.warning(f"FP32 cast warning: {e}")
+
+
 def auto_load_models(device_str: str = "cpu") -> str:
     """
     Load HunyuanVideo-Foley + encoders on the chosen device.
-    Use device_str="cuda" ONLY inside @spaces.GPU function to avoid CUDA init in main process.
+    Use device_str='cuda' ONLY inside @spaces.GPU to avoid CUDA init in main process.
     """
     global _model_dict, _cfg, _device
 
@@ -117,6 +134,7 @@ def auto_load_models(device_str: str = "cpu") -> str:
 
     # Make absolutely sure safetensors is preferred
     os.environ["HF_PREFER_SAFETENSORS"] = "1"
+    torch.set_float32_matmul_precision("high")  # allow TF32 where possible
 
     sys.path.append(str(REPO_DIR))
     from hunyuanvideo_foley.utils.model_utils import load_model
@@ -128,6 +146,8 @@ def auto_load_models(device_str: str = "cpu") -> str:
 
     try:
         _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
+        # Force fp32 to fix: RuntimeError: Input type (Half) and bias (float) must match
+        _force_fp32_on_modules(_model_dict)
         return "✅ Model loaded."
     except OSError as e:
         logger.error(str(e))
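The comment above cites the exact runtime error. A minimal, self-contained reproduction (assumed, not code from this repo) of that Half/Float mismatch and the cast that fixes it:

```python
import torch
import torch.nn as nn

conv = nn.Conv3d(3, 8, kernel_size=3).half()   # weights/bias now float16
x = torch.randn(1, 3, 8, 32, 32)               # float32 input, as produced upstream
try:
    conv(x)                                    # dtype mismatch -> RuntimeError
except RuntimeError as e:
    print(e)
conv.float()                                   # the same cast _force_fp32_on_modules applies
print(conv(x).dtype)                           # torch.float32
```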
@@ -135,6 +155,7 @@ def auto_load_models(device_str: str = "cpu") -> str:
         os.environ["HF_PREFER_SAFETENSORS"] = "1"
         try:
             _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
+            _force_fp32_on_modules(_model_dict)
             return "✅ Model loaded (after safetensors retry)."
         except Exception as e2:
             logger.error(str(e2))
@@ -145,7 +166,7 @@ def auto_load_models(device_str: str = "cpu") -> str:
 
 
 def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
-    """Preferred: projects util; fallback to ffmpeg."""
+    """Preferred: project's util; fallback to ffmpeg."""
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.media_utils import merge_audio_video
@@ -167,7 +188,7 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
 
 def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
                   prompt: str) -> str:
-    """Save WAV + MP4 in autosaved/, add metadata with a soft watermark note."""
+    """Save WAV + MP4 in outputs/autosaved/, add metadata with a soft watermark note."""
     # torchaudio expects [C, N]
     if audio_tensor.ndim == 1:
         audio_tensor = audio_tensor.unsqueeze(0)
@@ -222,7 +243,7 @@ def infer_single_video(
     Generate Foley audio for an uploaded video (1–6 variants).
     Returns: (list of output video paths, status message)
     """
-    # Lazy-load on GPU
+    # Lazy-load on GPU ONLY here (prevents CUDA init in main process)
     if _model_dict is None or _cfg is None:
         msg = auto_load_models(device_str="cuda")
         if not str(msg).startswith("✅"):
@@ -235,23 +256,25 @@ def infer_single_video(
     from hunyuanvideo_foley.utils.feature_utils import feature_process
     from hunyuanvideo_foley.utils.model_utils import denoise_process
 
-    # preprocess
-    visual_feats, text_feats, audio_len_s = feature_process(
-        video_file, (text_prompt or "").strip(), _model_dict, _cfg
-    )
+    # Avoid autocast to float16 to fix Half/Float mismatch inside Synchformer conv3d
+    with torch.autocast(device_type="cuda", enabled=False):
+        # preprocess
+        visual_feats, text_feats, audio_len_s = feature_process(
+            video_file, (text_prompt or "").strip(), _model_dict, _cfg
+        )
 
-    # generate batch
-    n = int(max(1, min(6, sample_nums)))
-    audio, sr = denoise_process(
-        visual_feats,
-        text_feats,
-        audio_len_s,
-        _model_dict,
-        _cfg,
-        guidance_scale=float(guidance_scale),
-        num_inference_steps=int(num_inference_steps),
-        batch_size=n,
-    )
+        # generate batch
+        n = int(max(1, min(6, sample_nums)))
+        audio, sr = denoise_process(
+            visual_feats,
+            text_feats,
+            audio_len_s,
+            _model_dict,
+            _cfg,
+            guidance_scale=float(guidance_scale),
+            num_inference_steps=int(num_inference_steps),
+            batch_size=n,
+        )
 
     # save results
     outs = []
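The `enabled=False` block added above carves out an fp32 island inside any ambient autocast. The same pattern in isolation (a sketch, not code from this commit):

```python
import torch

def run_in_fp32(fn, *args, **kwargs):
    # Even if a caller wrapped us in torch.autocast("cuda", dtype=torch.float16),
    # this inner context disables autocast so conv/linear inputs stay float32.
    with torch.autocast(device_type="cuda", enabled=False):
        return fn(*args, **kwargs)
```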
@@ -262,7 +285,7 @@ def infer_single_video(
 
 
 # -------------
-# Gradio UI
+# Gradio UI (with MCP+API inside the same app)
 # -------------
 def _about_html() -> str:
     return f"""
@@ -292,8 +315,6 @@ def _about_html() -> str:
       <h3>MCP & API</h3>
       <p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see “API & MCP” tab).
       Perfect for media-automation pipelines and tools like <b><a href="https://n8n.partnerlinks.io/bilsimaging" target="_blank" rel="noopener">n8n</a></b>.</p>
-
-
     </div>
     """
 
@@ -349,7 +370,7 @@ def create_ui() -> gr.Blocks:
                     v6 = gr.Video(label="Sample 6", height=160, visible=False)
                 gr.Markdown("<span class='muted'>Autosaved to the Gallery tab.</span>")
 
-        # Generate handler
+        # Generate handler (single binding, exact outputs)
         def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
             outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
             vis = []
@@ -358,37 +379,16 @@ def create_ui() -> gr.Blocks:
                 vis.append(gr.update(visible=True, value=outs[i]))
             else:
                 vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
-            # Also refresh the gallery in this same event
-            new_gallery = _list_gallery()
-            return (*vis, msg, new_gallery)
+            return (*vis, msg)
 
         generate.click(
             fn=_process_and_update,
             inputs=[video_input, text_input, guidance_scale, steps, samples],
-            outputs=[v1, v2, v3, v4, v5, v6, status],  # updated below to include gallery via .then-like merge
+            outputs=[v1, v2, v3, v4, v5, v6, status],
             api_name="/infer",
             api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
         )
 
-        # Workaround: extend outputs to include gallery refresh using a wrapper
-        def _process_and_update_with_gallery(video_file, text_prompt, cfg, nsteps, nsamples):
-            outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
-            vis = []
-            for i in range(6):
-                if outs and i < len(outs):
-                    vis.append(gr.update(visible=True, value=outs[i]))
-                else:
-                    vis.append(gr.update(visible=(i == 0), value=None if i > 0 else None))
-            new_gallery = _list_gallery()
-            return (*vis, msg, new_gallery)
-
-        # Re-bind with gallery as extra output
-        generate.click(
-            fn=_process_and_update_with_gallery,
-            inputs=[video_input, text_input, guidance_scale, steps, samples],
-            outputs=[v1, v2, v3, v4, v5, v6, status,],  # gallery will be refreshed on Gallery tab itself
-        )
-
         load_btn.click(
             fn=lambda: auto_load_models(device_str="cpu"),
             inputs=[],
@@ -411,7 +411,7 @@ def create_ui() -> gr.Blocks:
         samples.change(_toggle_vis, inputs=[samples], outputs=[v1, v2, v3, v4, v5, v6])
 
     with gr.Tab("📁 Gallery"):
-        gr.Markdown("Latest generated videos (autosaved to `outputs/autosaved/`).")
+        gr.Markdown("Latest generated videos (autosaved to <code>outputs/autosaved/</code>).")
         gallery = gr.Gallery(
             value=_list_gallery(),
             columns=3,
@@ -443,7 +443,7 @@ Loads the model proactively (useful before batch runs).
 - `shortifoley://status` → quick health info
 - `foley_prompt` → reusable guidance for describing the sound
 
-Works great with media-automation in tools like **n8n**: call `load_model_tool` once, then `api_generate_from_url` for each clip.
+Works great for media-automation in tools like **n8n**: call `load_model_tool` once, then `api_generate_from_url` for each clip.
 """)
 
     with gr.Tab("ℹ️ About"):
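A hedged sketch of that per-clip loop using `gradio_client` (the `/infer` endpoint and its five inputs appear in this diff; the Space URL, prompt, and parameter values below are placeholders):

```python
from gradio_client import Client, handle_file

client = Client("https://user-shortifoley.hf.space")  # placeholder Space URL
result = client.predict(
    handle_file("https://example.com/clip.mp4"),  # video_input
    "soft footsteps on gravel",                   # text_input
    4.5,                                          # guidance_scale
    50,                                           # num_inference_steps
    1,                                            # sample_nums
    api_name="/infer",
)
print(result)  # up to 6 generated video paths plus a status message
```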
@@ -459,7 +459,7 @@ def create_ui() -> gr.Blocks:
         """
         )
 
-    # ---- REST + MCP endpoints ----
+    # ---- REST + MCP endpoints (inside Blocks) ----
     def _download_to_tmp(url: str) -> str:
         try:
             import requests
@@ -499,7 +499,7 @@ def create_ui() -> gr.Blocks:
         sample_nums: int = 1,
     ) -> Dict[str, List[str]]:
         if _model_dict is None or _cfg is None:
-            msg = auto_load_models(device_str="cpu")
+            msg = auto_load_models(device_str="cpu")  # safe in HTTP context
             if not str(msg).startswith("✅"):
                 raise RuntimeError(msg)
         local = _normalize_video_input(video_url_or_b64)
@@ -546,7 +546,7 @@ if __name__ == "__main__":
     logger.info("===== Application Startup =====\n")
     prepare_once()
 
-    # Probe imports
+    # Probe imports (early surfacing)
     sys.path.append(str(REPO_DIR))
     try:
         from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
@@ -557,10 +557,10 @@ if __name__ == "__main__":
 
     ui = create_ui()
 
-    # Enable MCP server
+    # Enable MCP server so tools/resources/prompts are discoverable
     ui.launch(
         server_name="0.0.0.0",
         share=False,
         show_error=True,
-        mcp_server=True,  # MCP
+        mcp_server=True,  # Enable MCP server
     )
 
 
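With `mcp_server=True`, Gradio mounts its MCP endpoints under `/gradio_api/mcp/`. A quick post-launch smoke test, assuming Gradio's documented mount path and the default local port:

```python
import requests

base = "http://localhost:7860"  # placeholder; use the Space URL in production
resp = requests.get(f"{base}/gradio_api/mcp/schema", timeout=10)
resp.raise_for_status()
print(resp.json())  # lists the MCP tools this app exposes (e.g., the /infer endpoint)
```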