Spaces:

Bils
/

ShortiFoley

Running on Zero

App Files Community

Bils committed 6 days ago

Commit

4588e7b

verified ·

1 Parent(s): b29db40

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -199

app.py CHANGED Viewed

@@ -2,10 +2,9 @@ import os, sys, json, tempfile, subprocess, shutil, uuid, glob, traceback, datet
 from pathlib import Path
 from typing import Tuple, List
-# ================= Crash trap & verbose logs =================
 import faulthandler
 faulthandler.enable()
 os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")
 os.environ.setdefault("GRADIO_NUM_PORTS", "1")
 os.environ.setdefault("HF_HUB_VERBOSE", "1")
@@ -17,35 +16,40 @@ def _crash_trap(exctype, value, tb):
     print(f"\n===== FATAL ({ts}Z) =====================================")
     traceback.print_exception(exctype, value, tb)
     print("=========================================================\n", flush=True)
 sys.excepthook = _crash_trap
-# ============================================================
 import gradio as gr
-from spaces import GPU  # <-- explicit import so startup checker can see it
-from huggingface_hub import snapshot_download
 from loguru import logger
-import torch, torchaudio
-# ========= Paths & Config =========
 ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = ROOT / "weights"
 CACHE_DIR = ROOT / "cache"
 OUT_DIR = ROOT / "outputs"
 ASSETS = ROOT / "assets"
-ASSETS.mkdir(exist_ok=True)
 APP_TITLE   = os.environ.get("APP_TITLE", "Foley Studio · ZeroGPU")
 APP_TAGLINE = os.environ.get("APP_TAGLINE", "Generate scene-true foley for short clips (ZeroGPU-ready).")
 PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF")
-# ZeroGPU-safe defaults (tweak in Space Secrets if needed)
 MAX_SECS = int(os.environ.get("MAX_SECS", "15"))
 TARGET_H = int(os.environ.get("TARGET_H", "480"))
 SR       = int(os.environ.get("TARGET_SR", "48000"))
 ZEROGPU_DURATION = int(os.environ.get("ZEROGPU_DURATION", "110"))
 def sh(cmd: str):
     print(">>", cmd)
     subprocess.run(cmd, shell=True, check=True)
@@ -61,10 +65,6 @@ def ffprobe_duration(path: str) -> float:
         return 0.0
 def _clone_without_lfs():
-    """
-    Clone repo while skipping LFS smudge to avoid huge demo assets.
-    Falls back to sparse checkout with only essential paths.
-    """
     if REPO_DIR.exists():
         return
     try:
@@ -104,14 +104,11 @@ def _clone_without_lfs():
         sh(f"git -C {REPO_DIR} fetch --depth 1 origin master")
         sh(f"git -C {REPO_DIR} checkout master")
-def prepare_once():
-    """Clone code (skip LFS), download weights, set env, prepare dirs."""
     _clone_without_lfs()
     if str(REPO_DIR) not in sys.path:
         sys.path.insert(0, str(REPO_DIR))
-    WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
         local_dir=str(WEIGHTS_DIR),
@@ -121,20 +118,45 @@ def prepare_once():
     )
     os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
-    CACHE_DIR.mkdir(exist_ok=True)
-    OUT_DIR.mkdir(exist_ok=True)
-prepare_once()
-# Prefer safetensors & fast transfer
 os.environ["TRANSFORMERS_PREFER_SAFETENSORS"] = "1"
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-def ensure_clap_safetensors():
-    """
-    Pre-cache ONLY safetensors for laion/larger_clap_general so
-    Transformers never selects a stale/corrupt *.bin.
-    """
     snapshot_download(
         repo_id="laion/larger_clap_general",
         allow_patterns=[
@@ -146,13 +168,9 @@ def ensure_clap_safetensors():
         local_dir=None,
         local_dir_use_symlinks=False,
     )
-def _purge_clap_pt_bins():
-    """Remove any cached .bin for laion/larger_clap_general."""
     cache_root = Path.home() / ".cache" / "huggingface" / "hub"
-    for pat in [
-        cache_root / "models--laion--larger_clap_general" / "snapshots" / "*" / "*.bin",
-    ]:
         for f in glob.glob(str(pat)):
             try:
                 Path(f).unlink()
@@ -160,39 +178,8 @@ def _purge_clap_pt_bins():
             except Exception:
                 pass
-# ---- Dependency guards (early / clear errors) -------------------------------
-try:
-    import audiotools  # provided by PyPI package 'descript-audiotools'
-except Exception as e:
-    raise RuntimeError(
-        "Missing module 'audiotools'. Install via PyPI package "
-        "'descript-audiotools' (add 'descript-audiotools>=0.7.2' to requirements.txt)."
-    ) from e
-try:
-    import omegaconf  # noqa: F401
-    import yaml       # from pyyaml
-    import easydict   # noqa: F401
-except Exception as e:
-    raise RuntimeError(
-        "Missing config deps. Add to requirements.txt: "
-        "'omegaconf>=2.3.0', 'pyyaml', 'easydict'."
-    ) from e
-# Import Tencent internals after guards
-from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
-from hunyuanvideo_foley.utils.feature_utils import feature_process
-from hunyuanvideo_foley.utils.media_utils import merge_audio_video
-# ========= Native Model Setup =========
-MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", str(WEIGHTS_DIR))
-CONFIG_PATH = str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")
-_model_dict = None
-_cfg = None
-_device = None
-def _setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
     if device_str == "auto":
         if torch.cuda.is_available():
             d = torch.device(f"cuda:{gpu_id}")
@@ -209,11 +196,16 @@ def _setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
     return d
 def auto_load_models() -> str:
-    """Load model natively (weights already downloaded to MODEL_PATH)."""
     global _model_dict, _cfg, _device
-    if not os.path.exists(MODEL_PATH):
-        os.makedirs(MODEL_PATH, exist_ok=True)
     if not os.path.exists(CONFIG_PATH):
         return f"❌ Config file not found: {CONFIG_PATH}"
@@ -222,40 +214,18 @@ def auto_load_models() -> str:
     logger.info(f"MODEL_PATH:  {MODEL_PATH}")
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
-    # Ensure CLAP uses safetensors; nuke any .bin first
-    ensure_clap_safetensors()
-    _purge_clap_pt_bins()
-    # Lock HF Hub to offline so Transformers can't fetch a fresh .bin again
     os.environ["HF_HUB_OFFLINE"] = "1"
     os.environ["TRANSFORMERS_OFFLINE"] = "1"
     _model_dict, _cfg = load_model(MODEL_PATH, CONFIG_PATH, _device)
     logger.info("✅ Model loaded")
     return "✅ Model loaded"
-# Init logger and load model once (with explicit crash surface)
-logger.remove()
-logger.add(lambda msg: print(msg, end=''), level="INFO")
-try:
-    msg = auto_load_models()
-    logger.info(msg)
-except Exception as e:
-    print("\n[BOOT][ERROR] auto_load_models() failed:")
-    traceback.print_exc()
-    with gr.Blocks(title="Foley Studio · Boot Error") as demo:
-        gr.Markdown("### ❌ Boot failure\n```\n" + "".join(traceback.format_exc()) + "\n```")
-    demo.launch(server_name="0.0.0.0")
-    raise
-# ========= Preprocessing =========
 def preprocess_video(in_path: str) -> Tuple[str, float]:
-    """
-    - Trim to <= MAX_SECS
-    - Downscale to TARGET_H (keep AR), strip audio
-    - Return processed mp4 path and final duration
-    """
     dur = ffprobe_duration(in_path)
     if dur == 0:
         raise RuntimeError("Unable to read the video duration.")
@@ -265,68 +235,61 @@ def preprocess_video(in_path: str) -> Tuple[str, float]:
     processed = temp_dir / "proc.mp4"
     trim_args = ["-t", str(MAX_SECS)] if dur > MAX_SECS else []
-    # Normalize & remove audio
     sh(" ".join([
-        "ffmpeg", "-y", "-i", f"\"{in_path}\"",
-        *trim_args,
-        "-an",
-        "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
-        "-movflags", "+faststart",
-        f"\"{trimmed}\""
     ]))
-    # Downscale to TARGET_H; ensure mod2 width
     vf = f"scale=-2:{TARGET_H}:flags=bicubic"
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{trimmed}\"",
-        "-vf", f"\"{vf}\"",
-        "-an",
         "-vcodec", "libx264", "-profile:v", "baseline", "-level", "3.1",
-        "-pix_fmt", "yuv420p",
-        "-preset", "veryfast", "-crf", "24",
-        "-movflags", "+faststart",
-        f"\"{processed}\""
     ]))
-    final_dur = min(dur, float(MAX_SECS))
-    return str(processed), final_dur
-# ========= ZeroGPU marker (so startup checker is happy) =========
-@GPU(duration=5)
-def _zgpu_marker(_: int = 0) -> int:
-    """No-op; only to advertise that this Space has GPU-decorated functions."""
-    return _
-# ========= Inference (ZeroGPU) =========
 @GPU(duration=ZEROGPU_DURATION)
-@torch.inference_mode()
 def run_model(video_path: str, prompt_text: str,
               guidance_scale: float = 4.5,
               num_inference_steps: int = 50,
               sample_nums: int = 1):
     """
-    Native inference (no shell). Returns ([wav_paths], sample_rate).
     """
-    if _model_dict is None or _cfg is None:
-        raise RuntimeError("Model not loaded yet.")
     text_prompt = (prompt_text or "").strip()
-    # Extract features
     visual_feats, text_feats, audio_len_s = feature_process(
         video_path, text_prompt, _model_dict, _cfg
     )
-    # Generate audio (B x C x T)
     logger.info(f"Generating {sample_nums} sample(s)...")
     audio_batch, sr = denoise_process(
         visual_feats, text_feats, audio_len_s, _model_dict, _cfg,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
         batch_size=sample_nums
     )
-    # Save each sample as WAV
     out_dir = OUT_DIR / f"job_{uuid.uuid4().hex[:8]}"
     out_dir.mkdir(parents=True, exist_ok=True)
     wav_paths = []
@@ -334,23 +297,8 @@ def run_model(video_path: str, prompt_text: str,
         wav_p = out_dir / f"generated_audio_{i+1}.wav"
         torchaudio.save(str(wav_p), audio_batch[i], sr)
         wav_paths.append(str(wav_p))
     return wav_paths, sr
-# ========= Optional: Mux Foley back to video =========
-def mux_audio_with_video(video_path: str, audio_path: str) -> str:
-    out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
-    sh(" ".join([
-        "ffmpeg", "-y",
-        "-i", f"\"{video_path}\"",
-        "-i", f"\"{audio_path}\"",
-        "-map", "0:v:0", "-map", "1:a:0",
-        "-c:v", "copy", "-c:a", "aac", "-b:a", "192k",
-        "-shortest",
-        f"\"{out_path}\""
-    ]))
-    return str(out_path)
 # ========= UI Handlers =========
 def single_generate(video: str, prompt: str, want_mux: bool, project_name: str):
     history = []
@@ -361,18 +309,12 @@ def single_generate(video: str, prompt: str, want_mux: bool, project_name: str):
         pre_path, final_dur = preprocess_video(video)
         history.append(["Inference", "ZeroGPU native pipeline"])
-        wav_list, sr = run_model(
-            pre_path, prompt or "", guidance_scale=4.5, num_inference_steps=50, sample_nums=1
-        )
         if not wav_list:
             raise RuntimeError("No audio produced.")
         wav = wav_list[0]
-        muxed = None
-        if want_mux:
-            history.append(["Mux", "Merging foley with video"])
-            muxed = mux_audio_with_video(pre_path, wav)
         history.append(["Done", f"OK · ~{final_dur:.1f}s"])
         return wav, muxed, f"✅ Completed (~{final_dur:.1f}s)", history
     except Exception as e:
@@ -431,35 +373,19 @@ THEME_CSS = f"""
   color: white;
   box-shadow: 0 10px 30px rgba(0,0,0,.35);
 }}
-#hero h1 {{
-  margin: 0 0 6px 0;
-  font-size: 20px;
-  font-weight: 700;
-  letter-spacing: .2px;
-}}
-#hero p {{
-  margin: 0;
-  opacity: .95;
-}}
 .gr-tabitem, .gr-block.gr-group, .gr-panel {{
   background: var(--panel);
   border-radius: 16px !important;
   box-shadow: 0 6px 18px rgba(0,0,0,.28);
   border: 1px solid rgba(255,255,255,.04);
 }}
-.gr-button {{
-  border-radius: 12px !important;
-  border: 1px solid rgba(255,255,255,.08) !important;
-}}
 .gradio-container .tabs .tab-nav button.selected {{
-  background: rgba(255,255,255,.06);
-  border-radius: 12px;
-  border: 1px solid rgba(255,255,255,.08);
-}}
-.badge {{
-  display:inline-block; padding:2px 8px; border-radius:999px;
-  background: rgba(255,255,255,.12); color:#fff; font-size:12px
 }}
 """
 with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
@@ -475,16 +401,10 @@ with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
     with gr.Tabs():
         with gr.Tab("🎬 Single Clip"):
             with gr.Group():
-                project_name = gr.Textbox(
-                    label="Project name (optional)",
-                    placeholder="Enter a short label for this clip"
-                )
                 with gr.Row():
                     v_single = gr.Video(label=f"Video (≤ ~{MAX_SECS}s recommended)")
-                    p_single = gr.Textbox(
-                        label="Sound prompt (optional)",
-                        placeholder="e.g., soft footsteps on wood, light rain, indoor reverb"
-                    )
                 with gr.Row():
                     want_mux_single = gr.Checkbox(value=True, label="Mux foley into MP4 output")
                 run_btn = gr.Button("Generate", variant="primary")
@@ -492,10 +412,7 @@ with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
                     out_audio = gr.Audio(label=f"Generated Foley ({SR//1000} kHz WAV)", type="filepath")
                     out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
                 status_md = gr.Markdown()
-                history_table = gr.Dataframe(
-                    headers=["Step", "Note"], datatype=["str","str"],
-                    interactive=False, wrap=True, label="Activity"
-                )
             run_btn.click(
                 single_generate,
@@ -509,16 +426,9 @@ with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
             want_mux_b = gr.Checkbox(value=True, label="Mux each output")
             go_b = gr.Button("Run batch-lite")
             batch_status = gr.Markdown()
-            batch_log = gr.Dataframe(
-                headers=["Step","Note"], datatype=["str","str"],
-                interactive=False, wrap=True, label="Batch Log"
-            )
-            go_b.click(
-                batch_lite_generate,
-                inputs=[files, prompt_b, want_mux_b],
-                outputs=[batch_status, batch_log]
-            )
         with gr.Tab("ℹ️ Tips"):
             gr.Markdown(f"""
@@ -532,16 +442,19 @@ with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
 - Enable **Mux** to get a ready MP4 with the generated foley track.
 """)
-# ---- Health endpoint & guarded launch ---------------------------------------
 try:
     from fastapi import FastAPI
-    fastapi_app = demo.app  # Gradio's FastAPI app
     @fastapi_app.get("/health")
     def _health():
-        return {"ok": True, "model_loaded": _model_dict is not None, "device": str(_device)}
 except Exception:
     pass
 try:
     demo.queue(max_size=24).launch(server_name="0.0.0.0")
 except Exception:

 from pathlib import Path
 from typing import Tuple, List
+# ========= Crash trap & env =========
 import faulthandler
 faulthandler.enable()
 os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "false")
 os.environ.setdefault("GRADIO_NUM_PORTS", "1")
 os.environ.setdefault("HF_HUB_VERBOSE", "1")
     print(f"\n===== FATAL ({ts}Z) =====================================")
     traceback.print_exception(exctype, value, tb)
     print("=========================================================\n", flush=True)
 sys.excepthook = _crash_trap
+# ========= Minimal imports for startup =========
 import gradio as gr
+from spaces import GPU  # ensure checker can see decorator
 from loguru import logger
+# ---- ZeroGPU marker FIRST (so startup detector finds it) ----
+@GPU(duration=5)
+def _zgpu_marker(_: int = 0) -> int:
+    """No-op; only to advertise a GPU-decorated function at import-time."""
+    return _
+# ========= Paths & Configs =========
 ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = ROOT / "weights"
 CACHE_DIR = ROOT / "cache"
 OUT_DIR = ROOT / "outputs"
 ASSETS = ROOT / "assets"
+for p in (ASSETS, WEIGHTS_DIR, CACHE_DIR, OUT_DIR):
+    p.mkdir(parents=True, exist_ok=True)
 APP_TITLE   = os.environ.get("APP_TITLE", "Foley Studio · ZeroGPU")
 APP_TAGLINE = os.environ.get("APP_TAGLINE", "Generate scene-true foley for short clips (ZeroGPU-ready).")
 PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF")
+# ZeroGPU-friendly defaults
 MAX_SECS = int(os.environ.get("MAX_SECS", "15"))
 TARGET_H = int(os.environ.get("TARGET_H", "480"))
 SR       = int(os.environ.get("TARGET_SR", "48000"))
 ZEROGPU_DURATION = int(os.environ.get("ZEROGPU_DURATION", "110"))
+# ========= Light utils (safe at import) =========
 def sh(cmd: str):
     print(">>", cmd)
     subprocess.run(cmd, shell=True, check=True)
         return 0.0
 def _clone_without_lfs():
     if REPO_DIR.exists():
         return
     try:
         sh(f"git -C {REPO_DIR} fetch --depth 1 origin master")
         sh(f"git -C {REPO_DIR} checkout master")
+def prepare_code_and_weights():
+    from huggingface_hub import snapshot_download
     _clone_without_lfs()
     if str(REPO_DIR) not in sys.path:
         sys.path.insert(0, str(REPO_DIR))
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
         local_dir=str(WEIGHTS_DIR),
     )
     os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
+# Do lightweight prep (no model init) at import-time
+prepare_code_and_weights()
+# Prefer safetensors & fast transfer for later downloads
 os.environ["TRANSFORMERS_PREFER_SAFETENSORS"] = "1"
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+# ========= Heavy deps & model utilities (deferred import) =========
+_model_dict = None
+_cfg = None
+_device = None
+def _lazy_heavy_imports():
+    global torch, torchaudio
+    import torch, torchaudio  # noqa
+    try:
+        import audiotools  # provided by 'descript-audiotools'
+    except Exception as e:
+        raise RuntimeError(
+            "Missing 'audiotools'. Add 'descript-audiotools>=0.7.2' to requirements.txt."
+        ) from e
+    try:
+        import omegaconf  # noqa
+        import yaml       # noqa
+        import easydict   # noqa
+    except Exception as e:
+        raise RuntimeError(
+            "Missing config deps. Add: omegaconf>=2.3.0, pyyaml, easydict."
+        ) from e
+    # Tencent internals
+    from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process  # noqa
+    from hunyuanvideo_foley.utils.feature_utils import feature_process           # noqa
+    from hunyuanvideo_foley.utils.media_utils import merge_audio_video           # noqa
+    return torch, torchaudio
+def _ensure_clap_safetensors_only():
+    from huggingface_hub import snapshot_download
+    # Pre-cache only safetensors; block .bin selection
     snapshot_download(
         repo_id="laion/larger_clap_general",
         allow_patterns=[
         local_dir=None,
         local_dir_use_symlinks=False,
     )
+    # Purge any cached .bin for the model
     cache_root = Path.home() / ".cache" / "huggingface" / "hub"
+    for pat in [cache_root / "models--laion--larger_clap_general" / "snapshots" / "*" / "*.bin"]:
         for f in glob.glob(str(pat)):
             try:
                 Path(f).unlink()
             except Exception:
                 pass
+def _setup_device(device_str: str = "auto", gpu_id: int = 0):
+    import torch
     if device_str == "auto":
         if torch.cuda.is_available():
             d = torch.device(f"cuda:{gpu_id}")
     return d
 def auto_load_models() -> str:
+    """Load the full Tencent pipeline (lazy; call when needed)."""
     global _model_dict, _cfg, _device
+    if _model_dict is not None:
+        return "✅ Model already loaded"
+    # Imports & guards
+    torch, _ = _lazy_heavy_imports()
+    MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", str(WEIGHTS_DIR))
+    CONFIG_PATH = str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")
     if not os.path.exists(CONFIG_PATH):
         return f"❌ Config file not found: {CONFIG_PATH}"
     logger.info(f"MODEL_PATH:  {MODEL_PATH}")
     logger.info(f"CONFIG_PATH: {CONFIG_PATH}")
+    # Force CLAP to safetensors path
+    _ensure_clap_safetensors_only()
     os.environ["HF_HUB_OFFLINE"] = "1"
     os.environ["TRANSFORMERS_OFFLINE"] = "1"
+    from hunyuanvideo_foley.utils.model_utils import load_model
     _model_dict, _cfg = load_model(MODEL_PATH, CONFIG_PATH, _device)
     logger.info("✅ Model loaded")
     return "✅ Model loaded"
+# ========= Pre/Post-processing =========
 def preprocess_video(in_path: str) -> Tuple[str, float]:
     dur = ffprobe_duration(in_path)
     if dur == 0:
         raise RuntimeError("Unable to read the video duration.")
     processed = temp_dir / "proc.mp4"
     trim_args = ["-t", str(MAX_SECS)] if dur > MAX_SECS else []
     sh(" ".join([
+        "ffmpeg", "-y", "-i", f"\"{in_path}\"", *trim_args,
+        "-an", "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
+        "-movflags", "+faststart", f"\"{trimmed}\""
     ]))
     vf = f"scale=-2:{TARGET_H}:flags=bicubic"
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{trimmed}\"",
+        "-vf", f"\"{vf}\"", "-an",
         "-vcodec", "libx264", "-profile:v", "baseline", "-level", "3.1",
+        "-pix_fmt", "yuv420p", "-preset", "veryfast", "-crf", "24",
+        "-movflags", "+faststart", f"\"{processed}\""
     ]))
+    return str(processed), min(dur, float(MAX_SECS))
+def mux_audio_with_video(video_path: str, audio_path: str) -> str:
+    out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
+    sh(" ".join([
+        "ffmpeg", "-y", "-i", f"\"{video_path}\"", "-i", f"\"{audio_path}\"",
+        "-map", "0:v:0", "-map", "1:a:0", "-c:v", "copy", "-c:a", "aac", "-b:a", "192k",
+        "-shortest", f"\"{out_path}\""
+    ]))
+    return str(out_path)
+# ========= Inference (GPU-decorated) =========
 @GPU(duration=ZEROGPU_DURATION)
 def run_model(video_path: str, prompt_text: str,
               guidance_scale: float = 4.5,
               num_inference_steps: int = 50,
               sample_nums: int = 1):
     """
+    ZeroGPU-safe native pipeline. Returns ([wav_paths], sample_rate).
     """
+    # Lazy load model the first time this runs
+    if _model_dict is None:
+        msg = auto_load_models()
+        logger.info(msg)
+    # heavy imports (after model load prepared)
+    import torchaudio
+    from hunyuanvideo_foley.utils.feature_utils import feature_process
+    from hunyuanvideo_foley.utils.model_utils import denoise_process
     text_prompt = (prompt_text or "").strip()
     visual_feats, text_feats, audio_len_s = feature_process(
         video_path, text_prompt, _model_dict, _cfg
     )
     logger.info(f"Generating {sample_nums} sample(s)...")
     audio_batch, sr = denoise_process(
         visual_feats, text_feats, audio_len_s, _model_dict, _cfg,
+        guidance_scale=guidance_scale, num_inference_steps=num_inference_steps,
         batch_size=sample_nums
     )
     out_dir = OUT_DIR / f"job_{uuid.uuid4().hex[:8]}"
     out_dir.mkdir(parents=True, exist_ok=True)
     wav_paths = []
         wav_p = out_dir / f"generated_audio_{i+1}.wav"
         torchaudio.save(str(wav_p), audio_batch[i], sr)
         wav_paths.append(str(wav_p))
     return wav_paths, sr
 # ========= UI Handlers =========
 def single_generate(video: str, prompt: str, want_mux: bool, project_name: str):
     history = []
         pre_path, final_dur = preprocess_video(video)
         history.append(["Inference", "ZeroGPU native pipeline"])
+        wav_list, sr = run_model(pre_path, prompt or "", guidance_scale=4.5, num_inference_steps=50, sample_nums=1)
         if not wav_list:
             raise RuntimeError("No audio produced.")
         wav = wav_list[0]
+        muxed = mux_audio_with_video(pre_path, wav) if want_mux else None
         history.append(["Done", f"OK · ~{final_dur:.1f}s"])
         return wav, muxed, f"✅ Completed (~{final_dur:.1f}s)", history
     except Exception as e:
   color: white;
   box-shadow: 0 10px 30px rgba(0,0,0,.35);
 }}
+#hero h1 {{ margin: 0 0 6px 0; font-size: 20px; font-weight: 700; letter-spacing: .2px; }}
+#hero p  {{ margin: 0; opacity: .95; }}
 .gr-tabitem, .gr-block.gr-group, .gr-panel {{
   background: var(--panel);
   border-radius: 16px !important;
   box-shadow: 0 6px 18px rgba(0,0,0,.28);
   border: 1px solid rgba(255,255,255,.04);
 }}
+.gr-button {{ border-radius: 12px !important; border: 1px solid rgba(255,255,255,.08) !important; }}
 .gradio-container .tabs .tab-nav button.selected {{
+  background: rgba(255,255,255,.06); border-radius: 12px; border: 1px solid rgba(255,255,255,.08);
 }}
+.badge {{ display:inline-block; padding:2px 8px; border-radius:999px; background: rgba(255,255,255,.12); color:#fff; font-size:12px }}
 """
 with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
     with gr.Tabs():
         with gr.Tab("🎬 Single Clip"):
             with gr.Group():
+                project_name = gr.Textbox(label="Project name (optional)", placeholder="Enter a short label for this clip")
                 with gr.Row():
                     v_single = gr.Video(label=f"Video (≤ ~{MAX_SECS}s recommended)")
+                    p_single = gr.Textbox(label="Sound prompt (optional)", placeholder="e.g., soft footsteps on wood, light rain, indoor reverb")
                 with gr.Row():
                     want_mux_single = gr.Checkbox(value=True, label="Mux foley into MP4 output")
                 run_btn = gr.Button("Generate", variant="primary")
                     out_audio = gr.Audio(label=f"Generated Foley ({SR//1000} kHz WAV)", type="filepath")
                     out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
                 status_md = gr.Markdown()
+                history_table = gr.Dataframe(headers=["Step", "Note"], datatype=["str","str"], interactive=False, wrap=True, label="Activity")
             run_btn.click(
                 single_generate,
             want_mux_b = gr.Checkbox(value=True, label="Mux each output")
             go_b = gr.Button("Run batch-lite")
             batch_status = gr.Markdown()
+            batch_log = gr.Dataframe(headers=["Step","Note"], datatype=["str","str"], interactive=False, wrap=True, label="Batch Log")
+            go_b.click(batch_lite_generate, inputs=[files, prompt_b, want_mux_b], outputs=[batch_status, batch_log])
         with gr.Tab("ℹ️ Tips"):
             gr.Markdown(f"""
 - Enable **Mux** to get a ready MP4 with the generated foley track.
 """)
+# Health endpoint
 try:
     from fastapi import FastAPI
+    fastapi_app = demo.app
     @fastapi_app.get("/health")
     def _health():
+        return {"ok": True, "model_loaded": _model_dict is not None, "device": str(_device) if _device else None}
 except Exception:
     pass
+# Launch
+logger.remove()
+logger.add(lambda msg: print(msg, end=''), level="INFO")
 try:
     demo.queue(max_size=24).launch(server_name="0.0.0.0")
 except Exception: