Bils committed · verified
Commit 7e3de09 · 1 Parent(s): 4413610

Update app.py

Files changed (1)
  1. app.py +175 -135
app.py CHANGED
@@ -2,15 +2,11 @@
  # Created by bilsimaging.com
 
  import os
- # ---- Prefer safetensors for all HF model loads (fixes CLAP .bin crash on ZeroGPU) ----
  os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 
  import sys
- import io
  import json
- import uuid
- import time
- import shutil
  import base64
  import random
  import tempfile
@@ -37,10 +33,10 @@ OUTPUTS_DIR = Path(os.environ.get("OUTPUTS_DIR", str(ROOT / "outputs")))
  OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 
  SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
- SPACE_TAGLINE = "Text/Video → Audio Foley. Created by bilsimaging.com"
  WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
 
- # Keep GPU <= 120s for ZeroGPU (default 110)
  GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
 
  # Globals
@@ -63,10 +59,7 @@ def _setup_device(pref: str = "auto", gpu_id: int = 0) -> torch.device:
  d = torch.device("cpu")
  else:
  d = torch.device(pref)
- if d.type == "cuda":
- logger.info(f"Using CUDA {d}")
- else:
- logger.info(f"Using {d}")
  return d
 
@@ -116,9 +109,9 @@ def auto_load_models() -> str:
  global _model_dict, _cfg, _device
 
  if _model_dict is not None and _cfg is not None:
- return "Model already loaded."
 
- # Ensure Transformers prefers safetensors for everything:
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
 
  sys.path.append(str(REPO_DIR))
@@ -133,9 +126,8 @@ def auto_load_models() -> str:
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
  return "✅ Model loaded."
  except OSError as e:
- # If any OSError (often from trying to read pytorch_model.bin), retry after enforcing safetensors.
  logger.error(str(e))
- logger.info("Retrying load after enforcing safetensors preference...")
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
  try:
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
@@ -149,7 +141,7 @@ def auto_load_models() -> str:
 
 
  def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
- """Use project's helper (preferred) with a fallback to ffmpeg via subprocess."""
  sys.path.append(str(REPO_DIR))
  try:
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
@@ -171,7 +163,7 @@ def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
 
  def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
  prompt: str) -> str:
- """Save WAV + MP4 in outputs/, add metadata and a small watermark note (metadata only)."""
  # torchaudio expects [C, N]
  if audio_tensor.ndim == 1:
  audio_tensor = audio_tensor.unsqueeze(0)
@@ -186,14 +178,14 @@ def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
 
  _merge_audio_video(str(wav_path), video_src, str(out_mp4))
 
- # Save JSON sidecar
  meta = {
  "id": base,
  "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
  "source_video": Path(video_src).name,
  "output_video": Path(out_mp4).name,
  "prompt": prompt or "",
- "watermark": WATERMARK_NOTE,
  "tool": "ShortiFoley (HunyuanVideo-Foley)"
  }
  (OUTPUTS_DIR / f"{base}.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2))
@@ -226,8 +218,11 @@ def infer_single_video(
  Generate Foley audio for an uploaded video (1–6 variants).
  Returns: (list of output video paths, status message)
  """
  if _model_dict is None or _cfg is None:
- return [], "❌ Load the model first (open the app once)."
 
  if not video_file:
  return [], "❌ Please provide a video."
@@ -269,84 +264,96 @@ def _about_html() -> str:
  return f"""
  <div style="line-height:1.6">
  <h2>About ShortiFoley</h2>
- <p><b>ShortiFoley</b> automatically generates realistic Foley soundtracks for short videos using
- Tencent's HunyuanVideo-Foley with CLAP & SigLIP2 encoders. It includes autosave and an MCP server so
- you can call it from agents or workflows (e.g., n8n).</p>
- <p><b>Created by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a></b></p>
 
- <h3>How to use</h3>
  <ol>
- <li>Upload a video (ideally &lt; 120 seconds).</li>
- <li>Optionally enter a text description of the sound (English).</li>
- <li>Adjust CFG scale, steps, and number of variants.</li>
- <li>Click <b>Generate</b>. Results appear on the right and are stored in the Gallery.</li>
  </ol>
 
- <h3>Tips</h3>
  <ul>
- <li>Trim clips to the key action (5–30s) for faster, crisper results.</li>
- <li>Include material cues ("wood", "metal", "concrete"), action cues ("splash", "glass shatter"), and ambience ("roomy", "echoey").</li>
- <li>Generate multiple variants and pick the most natural.</li>
  </ul>
 
- <h3>MCP / Automation</h3>
- <p>This app runs as an <b>MCP server</b>. Open the footer "View API → MCP" to copy a ready config. You can also use the REST endpoints listed there. Perfect for n8n integrations.</p>
 
  <h3>Watermark</h3>
- <p>Each output's metadata includes: <i>{WATERMARK_NOTE}</i>. If you want a <b>visible video overlay</b>, I can add an ffmpeg overlay step on request.</p>
  </div>
  """
 
 
  def create_ui() -> gr.Blocks:
- with gr.Blocks(
- title="ShortiFoley — HunyuanVideo-Foley",
- css="""
- .main-header{ text-align:center; padding:1.2rem; border-radius:16px; background:linear-gradient(135deg,#667eea,#764ba2); color:white; }
- .card{ background:white; border:1px solid #e1e5e9; border-radius:16px; padding:1rem; box-shadow:0 8px 32px rgba(0,0,0,.06); }
- .generate-btn button{ font-weight:700; }
- """
- ) as demo:
 
  gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")
 
  with gr.Tabs():
  with gr.Tab("Run"):
  with gr.Row():
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 📹 Input")
  video_input = gr.Video(label="Upload Video", height=300)
  text_input = gr.Textbox(
  label="🎯 Audio Description (optional, English)",
- placeholder="e.g., Rubber soles on wet tile, distant chatter.",
  lines=3
  )
  with gr.Row():
- guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG Scale")
  steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
  samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
- generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
 
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 🎥 Result(s)")
  v1 = gr.Video(label="Sample 1", height=260, visible=True)
- v2 = gr.Video(label="Sample 2", height=160, visible=False)
- v3 = gr.Video(label="Sample 3", height=160, visible=False)
- v4 = gr.Video(label="Sample 4", height=160, visible=False)
- v5 = gr.Video(label="Sample 5", height=160, visible=False)
  v6 = gr.Video(label="Sample 6", height=160, visible=False)
- status = gr.Textbox(label="Status", interactive=False)
 
  # Generate handler
  def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
  outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
- vis_updates = []
  for i in range(6):
  if i < len(outs):
- vis_updates.append(gr.update(visible=True, value=outs[i]))
  else:
- vis_updates.append(gr.update(visible=False, value=None))
- return (*vis_updates, msg)
 
  generate.click(
  fn=_process_and_update,
@@ -356,7 +363,15 @@ def create_ui() -> gr.Blocks:
  api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
  )
 
- # Toggle visibility when # of samples changes
  def _toggle_vis(n):
  n = int(n)
  return [
@@ -380,78 +395,110 @@ def create_ui() -> gr.Blocks:
  refresh = gr.Button("🔄 Refresh Gallery")
  refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
 
  with gr.Tab("ℹ️ About"):
  gr.HTML(_about_html())
 
- # Keep gallery in sync after generate
- generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
-
- # -----------------------
- # MCP + REST API endpoints
- # -----------------------
- def _download_to_tmp(url: str) -> str:
- try:
- import requests
- except Exception:
- raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
- r = requests.get(url, timeout=30)
- r.raise_for_status()
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
- tmp.write(r.content)
- tmp.flush()
- tmp.close()
- return tmp.name
-
- def _maybe_from_base64(data_url_or_b64: str) -> str:
- b64 = data_url_or_b64
- if data_url_or_b64.startswith("data:"):
- b64 = data_url_or_b64.split(",", 1)[-1]
- raw = base64.b64decode(b64)
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
- tmp.write(raw)
- tmp.flush()
- tmp.close()
- return tmp.name
-
- def _normalize_video_input(video_url_or_b64: str) -> str:
- v = (video_url_or_b64 or "").strip()
- if v.startswith("http://") or v.startswith("https://"):
- return _download_to_tmp(v)
- return _maybe_from_base64(v)
-
- @gr.api
- def api_generate_from_url(
- video_url_or_b64: str,
- text_prompt: str = "",
- guidance_scale: float = 4.5,
- num_inference_steps: int = 50,
- sample_nums: int = 1,
- ) -> Dict[str, List[str]]:
- if _model_dict is None or _cfg is None:
- raise RuntimeError("Model not loaded. Open the UI once or call /load_model tool.")
- local = _normalize_video_input(video_url_or_b64)
- outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
- return {"videos": outs, "message": msg}
-
- @gr.api
- def load_model_tool() -> str:
- """Ensure model is loaded on server (MCP convenience)."""
- return auto_load_models()
-
- @gr.mcp.resource("shortifoley://status")
- def shortifoley_status() -> str:
- """Return a simple readiness string for MCP clients."""
- ready = _model_dict is not None and _cfg is not None
- dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
- return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
-
- @gr.mcp.prompt()
- def foley_prompt(name: str = "default") -> str:
- """Reusable guidance for describing sound ambience."""
- return (
- "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
- "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
- )
 
  return demo
 
@@ -473,22 +520,15 @@ if __name__ == "__main__":
  logger.info("===== Application Startup =====\n")
  prepare_once()
 
- # Ensure import paths after repo is present
  sys.path.append(str(REPO_DIR))
  try:
- # Probe key modules early (better error surfacing)
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
  from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
  except Exception as e:
  logger.warning(f"Repo imports not ready yet: {e}")
 
- msg = auto_load_models()
- if not msg.startswith("✅"):
- logger.error(f"[BOOT][ERROR] auto_load_models() failed:\n{msg}")
- else:
- logger.info(msg)
-
  ui = create_ui()
 
  # Enable MCP server so tools/resources/prompts are discoverable
@@ -496,5 +536,5 @@ if __name__ == "__main__":
  server_name="0.0.0.0",
  share=False,
  show_error=True,
- mcp_server=True,  # MCP on
  )
 
  # Created by bilsimaging.com
 
  import os
+ # Prefer safetensors globally (fixes CLAP .bin crash on ZeroGPU)
  os.environ.setdefault("HF_PREFER_SAFETENSORS", "1")
 
  import sys
  import json
  import base64
  import random
  import tempfile
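Note on the safetensors preference: `HF_PREFER_SAFETENSORS` is consulted by this app's loader. As an illustration (not part of this commit), recent `transformers` releases can also be asked for safetensors per checkpoint via the standard `use_safetensors` argument of `from_pretrained`:

```python
# Illustrative sketch, not from this commit: prefer safetensors for one load.
# `use_safetensors=True` raises if the repo ships no .safetensors weights;
# the CLAP checkpoint id below is only an example.
from transformers import ClapModel

clap = ClapModel.from_pretrained("laion/clap-htsat-unfused", use_safetensors=True)
```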
 
  OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
 
  SPACE_TITLE = "🎵 ShortiFoley — HunyuanVideo-Foley"
+ SPACE_TAGLINE = "Text/Video → Audio Foley · Created by bilsimaging.com"
  WATERMARK_NOTE = "Made with ❤️ by bilsimaging.com"
 
+ # ZeroGPU limit (<=120)
  GPU_DURATION = int(os.environ.get("GPU_DURATION_SECS", "110"))
 
  # Globals
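`GPU_DURATION` is presumably consumed by a `@spaces.GPU` decorator at the inference call site, which sits outside the hunks shown here. A minimal sketch of that standard ZeroGPU pattern, with a hypothetical function name:

```python
# Sketch of the usual ZeroGPU pattern (the real call site is not shown in this diff).
import spaces  # available on Hugging Face ZeroGPU Spaces

@spaces.GPU(duration=GPU_DURATION)  # ZeroGPU caps a single call at 120 s
def _gpu_infer(video_file, text_prompt):  # hypothetical name
    ...  # heavy torch work runs on the allocated GPU here
```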
 
  d = torch.device("cpu")
  else:
  d = torch.device(pref)
+ logger.info(f"Using {d}")
  return d
 
 
 
  global _model_dict, _cfg, _device
 
  if _model_dict is not None and _cfg is not None:
+ return "✅ Model already loaded."
 
+ # Make absolutely sure safetensors is preferred
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
 
  sys.path.append(str(REPO_DIR))
 
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
  return "✅ Model loaded."
  except OSError as e:
  logger.error(str(e))
+ logger.info("Retrying after enforcing safetensors preference...")
  os.environ["HF_PREFER_SAFETENSORS"] = "1"
  try:
  _model_dict, _cfg = load_model(str(WEIGHTS_DIR), str(CONFIG_PATH), _device)
 
 
 
  def _merge_audio_video(audio_path: str, video_path: str, out_path: str) -> None:
+ """Preferred: project's util; fallback to ffmpeg."""
  sys.path.append(str(REPO_DIR))
  try:
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video
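The docstring promises an ffmpeg fallback; its body lies outside the hunks shown. A sketch of what such a subprocess fallback typically looks like (assumes ffmpeg on PATH; flags are a common mux recipe, not necessarily the repo's exact call):

```python
# Sketch of a subprocess-based fallback mux.
import subprocess

def _ffmpeg_mux(audio_path: str, video_path: str, out_path: str) -> None:
    subprocess.run(
        ["ffmpeg", "-y",
         "-i", video_path, "-i", audio_path,
         "-map", "0:v:0", "-map", "1:a:0",  # video from the clip, audio from the WAV
         "-c:v", "copy",                    # keep the video stream as-is
         "-c:a", "aac",                     # encode the WAV to AAC for MP4
         "-shortest",                       # stop at the shorter stream
         out_path],
        check=True,
    )
```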
 
 
  def _save_outputs(video_src: str, audio_tensor: torch.Tensor, sr: int, idx: int,
  prompt: str) -> str:
+ """Save WAV + MP4 in outputs/, add metadata with a soft watermark note."""
  # torchaudio expects [C, N]
  if audio_tensor.ndim == 1:
  audio_tensor = audio_tensor.unsqueeze(0)
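The `[C, N]` comment is the whole contract here; a self-contained sketch:

```python
# Standalone sketch: torchaudio.save expects [channels, frames].
import torch
import torchaudio

sr = 48000
mono = torch.randn(sr * 2)    # 2 s of synthetic mono audio, shape [N]
if mono.ndim == 1:
    mono = mono.unsqueeze(0)  # -> [1, N]
torchaudio.save("example.wav", mono, sr)
```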
 
 
  _merge_audio_video(str(wav_path), video_src, str(out_mp4))
 
+ # Sidecar JSON
  meta = {
  "id": base,
  "created_utc": datetime.datetime.utcnow().isoformat() + "Z",
  "source_video": Path(video_src).name,
  "output_video": Path(out_mp4).name,
  "prompt": prompt or "",
+ "watermark_note": WATERMARK_NOTE,
  "tool": "ShortiFoley (HunyuanVideo-Foley)"
  }
  (OUTPUTS_DIR / f"{base}.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2))
 
  Generate Foley audio for an uploaded video (1–6 variants).
  Returns: (list of output video paths, status message)
  """
+ # Lazy-load if needed
  if _model_dict is None or _cfg is None:
+ msg = auto_load_models()
+ if not str(msg).startswith("✅"):
+ return [], f"❌ {msg}"
 
  if not video_file:
  return [], "❌ Please provide a video."
 
  return f"""
  <div style="line-height:1.6">
  <h2>About ShortiFoley</h2>
+ <p><b>ShortiFoley</b> turns short videos (and an optional text hint) into realistic Foley sound.
+ Powered by Tencent's HunyuanVideo-Foley (SigLIP2 + CLAP), with autosave and an MCP server for automation (e.g., n8n).</p>
+ <p><b>Created by <a href="https://bilsimaging.com" target="_blank" rel="noopener">bilsimaging.com</a></b></p>
 
+ <h3>Quick Steps</h3>
  <ol>
+ <li>Upload a clip (ideally &lt; 120s).</li>
+ <li>Optionally describe the sound (English).</li>
+ <li>Pick variants (1–6), adjust CFG and steps.</li>
+ <li>Hit <b>Generate</b>. Results show on the right and save into the Gallery.</li>
  </ol>
 
+ <h3>Tips for Best Quality</h3>
  <ul>
+ <li>Use tight clips (5–30s) around the action.</li>
+ <li>Include material & action cues: "metal clang", "glass shatter", "rubber on wet tile".</li>
+ <li>Describe ambience: "roomy", "echoey", "distant crowd".</li>
+ <li>Generate 2–4 variants and pick the most natural.</li>
  </ul>
 
+ <h3>MCP & API</h3>
+ <p>This Space exposes an <b>MCP server</b> and simple REST endpoints (see "API & MCP" tab).
+ Perfect for pipelines and tools like <b>n8n</b>.</p>
 
  <h3>Watermark</h3>
+ <p>Each output writes a JSON sidecar including: <i>{WATERMARK_NOTE}</i>. Ask if you want a visible overlay.</p>
  </div>
  """
 
 
  def create_ui() -> gr.Blocks:
+ css = """
+ .main-header{ text-align:center; padding:1.2rem; border-radius:18px; background:linear-gradient(135deg,#6366f1,#8b5cf6); color:white; box-shadow:0 12px 40px rgba(99,102,241,.35); margin-bottom:16px;}
+ .main-header h1{ margin:0; font-size:2.0rem; font-weight:800;}
+ .main-header p{ margin:.25rem 0 0; opacity:.95; font-weight:500;}
+ .card{ background:white; border:1px solid #e7e9ef; border-radius:16px; padding:14px; box-shadow:0 10px 28px rgba(0,0,0,.06);}
+ .generate-btn button{ font-weight:800; border-radius:12px; padding:10px 18px;}
+ .minor-btn button{ border-radius:10px;}
+ .muted{ color:#64748b; }
+ """
+ with gr.Blocks(title="ShortiFoley — HunyuanVideo-Foley", css=css) as demo:
 
  gr.HTML(f"<div class='main-header'><h1>{SPACE_TITLE}</h1><p>{SPACE_TAGLINE}</p></div>")
 
  with gr.Tabs():
  with gr.Tab("Run"):
  with gr.Row():
+ # LEFT: input
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 📹 Input")
  video_input = gr.Video(label="Upload Video", height=300)
  text_input = gr.Textbox(
  label="🎯 Audio Description (optional, English)",
+ placeholder="e.g., Rubber soles on wet tile; distant chatter; occasional splashes.",
  lines=3
  )
  with gr.Row():
+ guidance_scale = gr.Slider(1.0, 10.0, value=4.5, step=0.1, label="CFG")
  steps = gr.Slider(10, 100, value=50, step=5, label="Steps")
  samples = gr.Slider(1, 6, value=1, step=1, label="Variants")
 
 
+ with gr.Row():
+ load_btn = gr.Button("⚙️ Load model", variant="secondary", elem_classes=["minor-btn"])
+ generate = gr.Button("🎵 Generate", variant="primary", elem_classes=["generate-btn"])
+
+ status = gr.Textbox(label="Status", interactive=False)
+
+ # RIGHT: results
  with gr.Column(scale=1, elem_classes=["card"]):
  gr.Markdown("### 🎥 Result(s)")
  v1 = gr.Video(label="Sample 1", height=260, visible=True)
+ with gr.Row():
+ v2 = gr.Video(label="Sample 2", height=160, visible=False)
+ v3 = gr.Video(label="Sample 3", height=160, visible=False)
+ with gr.Row():
+ v4 = gr.Video(label="Sample 4", height=160, visible=False)
+ v5 = gr.Video(label="Sample 5", height=160, visible=False)
  v6 = gr.Video(label="Sample 6", height=160, visible=False)
+ gr.Markdown("<span class='muted'>Autosaved to the Gallery tab.</span>")
 
  # Generate handler
  def _process_and_update(video_file, text_prompt, cfg, nsteps, nsamples):
  outs, msg = infer_single_video(video_file, text_prompt, cfg, nsteps, nsamples)
+ vis = []
  for i in range(6):
  if i < len(outs):
+ vis.append(gr.update(visible=True, value=outs[i]))
  else:
+ vis.append(gr.update(visible=False, value=None))
+ return (*vis, msg)
 
  generate.click(
  fn=_process_and_update,
 
  api_description="Generate Foley audio for an uploaded video. Returns up to 6 video+audio files."
  )
 
+ load_btn.click(
+ fn=auto_load_models,
+ inputs=[],
+ outputs=[status],
+ api_name="/load_model",
+ api_description="Load/initialize the ShortiFoley model and encoders."
+ )
+
+ # Toggle visibility based on variants
  def _toggle_vis(n):
  n = int(n)
  return [
 
  refresh = gr.Button("🔄 Refresh Gallery")
  refresh.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
 
+ with gr.Tab("API & MCP"):
+ gr.Markdown("""
+ ### REST examples
+
+ **POST** `/api_generate_from_url`
+ ```json
+ {
+ "video_url_or_b64": "https://yourhost/sample.mp4",
+ "text_prompt": "metallic clink; hollow room reverb",
+ "guidance_scale": 4.5,
+ "num_inference_steps": 50,
+ "sample_nums": 2
+ }
+ ```
+
+ **POST** `/load_model_tool`
+ Loads the model proactively (useful before batch runs).
+
+ ### MCP resources & prompt
+ - `shortifoley://status` → quick health info
+ - `foley_prompt` → reusable guidance for describing the sound
+
+ Works great with n8n: call `load_model_tool` once, then `api_generate_from_url` for each clip.
+ """)
+
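A hedged companion to the REST examples above, via `gradio_client` (the Space id is a placeholder, and the endpoint names assume `@gr.api` exposes the function names from this commit):

```python
# Sketch: driving the documented endpoints from Python.
from gradio_client import Client

client = Client("bilsimaging/ShortiFoley")          # hypothetical Space id
print(client.predict(api_name="/load_model_tool"))  # warm the model once
result = client.predict(
    "https://yourhost/sample.mp4",                  # video_url_or_b64
    "metallic clink; hollow room reverb",           # text_prompt
    4.5,                                            # guidance_scale
    50,                                             # num_inference_steps
    2,                                              # sample_nums
    api_name="/api_generate_from_url",
)
print(result)  # expected shape: {"videos": [...], "message": "..."}
```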
  with gr.Tab("ℹ️ About"):
  gr.HTML(_about_html())
 
+ # Keep gallery fresh after generation
+ generate.click(lambda: gr.update(value=_list_gallery()), outputs=[gallery])
+
+ # ---- REST + MCP endpoints (inside Blocks) ----
+ def _download_to_tmp(url: str) -> str:
+ try:
+ import requests
+ except Exception:
+ raise RuntimeError("Missing dependency 'requests'. Add it to requirements.txt to use URL inputs.")
+ r = requests.get(url, timeout=30)
+ r.raise_for_status()
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+ tmp.write(r.content)
+ tmp.flush()
+ tmp.close()
+ return tmp.name
+
+ def _maybe_from_base64(data_url_or_b64: str) -> str:
+ b64 = data_url_or_b64
+ if data_url_or_b64.startswith("data:"):
+ b64 = data_url_or_b64.split(",", 1)[-1]
+ raw = base64.b64decode(b64)
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+ tmp.write(raw)
+ tmp.flush()
+ tmp.close()
+ return tmp.name
+
+ def _normalize_video_input(video_url_or_b64: str) -> str:
+ v = (video_url_or_b64 or "").strip()
+ if v.startswith("http://") or v.startswith("https://"):
+ return _download_to_tmp(v)
+ return _maybe_from_base64(v)
+
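`_maybe_from_base64` accepts raw base64 or a `data:` URL, so a client can build the `video_url_or_b64` field like this (the local filename is hypothetical):

```python
# Sketch: encoding a local clip into the accepted forms.
import base64
from pathlib import Path

raw = Path("clip.mp4").read_bytes()          # hypothetical local file
b64 = base64.b64encode(raw).decode("ascii")  # raw base64 form
data_url = f"data:video/mp4;base64,{b64}"    # data: URL form
```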
+ @gr.api
+ def api_generate_from_url(
+ video_url_or_b64: str,
+ text_prompt: str = "",
+ guidance_scale: float = 4.5,
+ num_inference_steps: int = 50,
+ sample_nums: int = 1,
+ ) -> Dict[str, List[str]]:
+ if _model_dict is None or _cfg is None:
+ msg = auto_load_models()
+ if not str(msg).startswith("✅"):
+ raise RuntimeError(msg)
+ local = _normalize_video_input(video_url_or_b64)
+ outs, msg = infer_single_video(local, text_prompt, guidance_scale, num_inference_steps, sample_nums)
+ return {"videos": outs, "message": msg}
+
+ @gr.api
+ def load_model_tool() -> str:
+ """Ensure model is loaded on server (convenient for MCP/REST)."""
+ return auto_load_models()
+
+ @gr.mcp.resource("shortifoley://status")
+ def shortifoley_status() -> str:
+ """Return a simple readiness string for MCP clients."""
+ ready = _model_dict is not None and _cfg is not None
+ dev = "cuda" if (_device and _device.type == "cuda") else ("mps" if (_device and _device.type == "mps") else "cpu")
+ return f"ShortiFoley status: {'ready' if ready else 'loading'} | device={dev} | outputs={OUTPUTS_DIR}"
+
+ @gr.mcp.prompt()
+ def foley_prompt(name: str = "default") -> str:
+ """Reusable guidance for describing sound ambience."""
+ return (
+ "Describe the expected environmental sound precisely. Mention material, rhythm, intensity, and ambience.\n"
+ "Example: 'Soft leather footfalls on wet pavement with distant traffic hiss; occasional splashes.'"
+ )
+
+ # Auto-load model when UI first renders
+ demo.load(
+ fn=auto_load_models,
+ inputs=None,
+ outputs=[status]
+ )
 
  return demo
 
  logger.info("===== Application Startup =====\n")
  prepare_once()
 
+ # Probe imports (early surfacing)
  sys.path.append(str(REPO_DIR))
  try:
  from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa: F401
  from hunyuanvideo_foley.utils.feature_utils import feature_process  # noqa: F401
  from hunyuanvideo_foley.utils.media_utils import merge_audio_video  # noqa: F401
  except Exception as e:
  logger.warning(f"Repo imports not ready yet: {e}")
 
  ui = create_ui()
 
  # Enable MCP server so tools/resources/prompts are discoverable
  server_name="0.0.0.0",
  share=False,
  show_error=True,
+ mcp_server=True,  # MCP on (great for n8n)
  )