# Spaces: Running on Zero
import json
import os
import shlex
import shutil
import subprocess
import tempfile
import time
import uuid
from pathlib import Path
from typing import List, Optional, Tuple

import gradio as gr
import spaces
from huggingface_hub import snapshot_download
# ========= Paths & Repo =========
# All working directories live next to this file.
ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT / "HunyuanVideo-Foley"  # checkout holding infer.py and configs
WEIGHTS_DIR = ROOT / "weights"  # model snapshot download target
CACHE_DIR = ROOT / "cache"
OUT_DIR = ROOT / "outputs"  # per-job folders and batch manifests
ASSETS = ROOT / "assets"
ASSETS.mkdir(exist_ok=True)

# Runtime knobs, overridable through environment variables.
BILS_BRAND = os.environ.get("BILS_BRAND", "Bilsimaging · Foley Studio")
PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF")  # purple-ish
MAX_SECS = int(os.environ.get("MAX_SECS", "22"))  # ZeroGPU-friendly
TARGET_H = int(os.environ.get("TARGET_H", "480"))  # downscale target height
SR = int(os.environ.get("TARGET_SR", "48000"))  # target audio sample rate
def sh(cmd: "str | List[str]") -> None:
    """Echo a command, run it, and raise if it exits non-zero.

    Args:
        cmd: Either a shell command string (run through the shell, exactly
            as before) or a sequence of arguments. A sequence is executed
            directly with no shell parsing, so arguments containing spaces,
            quotes, ``$`` or backticks reach the program verbatim.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    if isinstance(cmd, str):
        print(">>", cmd)
        subprocess.run(cmd, shell=True, check=True)
        return
    argv = [str(part) for part in cmd]
    # shlex.join is only used to render a copy-pasteable log line.
    print(">>", shlex.join(argv))
    subprocess.run(argv, check=True)
def ffprobe_duration(path: str) -> float:
    """Return the container duration of ``path`` in seconds via ffprobe.

    This is a best-effort probe: any failure (ffprobe missing, unreadable
    file, non-numeric output, bad ``path`` type) yields ``0.0`` instead of
    raising, and so does a non-positive or NaN duration. Callers treat
    ``0.0`` as "duration unknown".
    """
    probe_cmd = [
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", path,
    ]
    try:
        raw = subprocess.check_output(probe_cmd).decode().strip()
        seconds = float(raw)
    except (OSError, subprocess.SubprocessError, ValueError, TypeError):
        # OSError: ffprobe not installed; SubprocessError: ffprobe failed;
        # ValueError: empty / non-numeric output; TypeError: path is not a
        # str/PathLike.
        return 0.0
    # NaN compares False here, so it also collapses to the 0.0 sentinel.
    return seconds if seconds > 0 else 0.0
def prepare_once():
    """Clone repo + download weights on cold start.

    Also exports ``HIFI_FOLEY_MODEL_PATH`` (pointing at the weights folder)
    and creates the cache and output directories.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` fails.
    """
    if not REPO_DIR.exists():
        # Pass the destination explicitly: a bare `git clone <url>` creates
        # the checkout under the current working directory, which is not
        # necessarily ROOT, so REPO_DIR could stay missing.
        sh(shlex.join([
            "git", "clone",
            "https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git",
            str(REPO_DIR),
        ]))
    WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
    snapshot_download(
        repo_id="tencent/HunyuanVideo-Foley",
        local_dir=str(WEIGHTS_DIR),
        local_dir_use_symlinks=False,
        repo_type="model",
    )
    os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
    CACHE_DIR.mkdir(exist_ok=True)
    OUT_DIR.mkdir(exist_ok=True)


# Runs at import time so the repo and weights exist before the UI is built.
prepare_once()
# ========= Preprocessing =========
def preprocess_video(in_path: str) -> Tuple[str, float]:
    """Normalise an uploaded clip before inference.

    - Reads the duration with ffprobe; clips longer than MAX_SECS are
      trimmed to their first MAX_SECS seconds.
    - Re-encodes to an H.264 MP4, then downscales to TARGET_H height
      (keeping aspect ratio) as H.264 baseline / yuv420p.
    - Drops the original audio track in both passes (new foley replaces it).

    Returns:
        ``(path to the processed mp4, duration in seconds)`` where the
        duration is capped at MAX_SECS.

    Raises:
        RuntimeError: if the duration cannot be read.
        subprocess.CalledProcessError: if an ffmpeg pass fails.
    """
    dur = ffprobe_duration(in_path)
    # Validate before creating the temp dir so a bad upload leaves nothing
    # behind on disk.
    if dur <= 0:
        raise RuntimeError("Unable to read the video duration.")

    temp_dir = Path(tempfile.mkdtemp(prefix="pre_"))
    trimmed = temp_dir / "trim.mp4"
    processed = temp_dir / "proc.mp4"

    # If longer than budget, trim to MAX_SECS (from start).
    trim_args = ["-t", str(MAX_SECS)] if dur > MAX_SECS else []

    # Pass 1: ensure we have a small, uniform container (mp4).
    # shlex.join quotes every argument, so paths containing spaces, quotes
    # or `$` survive the shell unchanged.
    sh(shlex.join([
        "ffmpeg", "-y", "-i", str(in_path),
        *trim_args,
        "-an",  # remove original audio (we're generating new foley)
        "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
        "-movflags", "+faststart",
        str(trimmed),
    ]))

    # Pass 2: downscale to TARGET_H keeping AR; width -2 keeps it even
    # (mod2) for encoder compatibility.
    vf = f"scale=-2:{TARGET_H}:flags=bicubic"
    sh(shlex.join([
        "ffmpeg", "-y", "-i", str(trimmed),
        "-vf", vf,
        "-an",
        "-vcodec", "libx264", "-profile:v", "baseline", "-level", "3.1",
        "-pix_fmt", "yuv420p",
        "-preset", "veryfast", "-crf", "24",
        "-movflags", "+faststart",
        str(processed),
    ]))

    final_dur = min(dur, float(MAX_SECS))
    return str(processed), final_dur
# ========= Inference (ZeroGPU) =========
# ~4 minutes per call window
def run_model(video_path: str, prompt_text: str) -> str:
    """Run Tencent's infer.py on one clip and return the path to a WAV.

    A fresh ``job_<id>`` folder is created under OUT_DIR, infer.py is run
    with that folder as its output dir, and the first ``*.wav`` found there
    is re-encoded to SR Hz stereo as ``foley_48k.wav``.

    Args:
        video_path: Path to the (preprocessed) video.
        prompt_text: Optional text prompt; ``None``/empty becomes ``""``.

    Returns:
        Path to the resampled WAV.

    Raises:
        RuntimeError: if the model run leaves no ``.wav`` behind.
        subprocess.CalledProcessError: if infer.py or ffmpeg fails.
    """
    # NOTE(review): `spaces` is imported at file level, but no `@spaces.GPU`
    # decorator is visible on this function -- confirm whether one is
    # expected here.
    job_id = uuid.uuid4().hex[:8]
    work_out = OUT_DIR / f"job_{job_id}"
    work_out.mkdir(parents=True, exist_ok=True)

    infer_cmd = [
        "python", str(REPO_DIR / "infer.py"),
        "--model_path", str(WEIGHTS_DIR),
        "--config_path", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml"),
        "--single_video", str(video_path),
        # The prompt is user input: pass the raw text and let shlex quote it.
        # json.dumps is not shell quoting ($(...) and backticks still expand
        # inside double quotes) and it rewrites non-ASCII text as \uXXXX.
        "--single_prompt", prompt_text or "",
        "--output_dir", str(work_out),
        "--device", "cuda",
    ]
    sh(shlex.join(infer_cmd))

    # Find produced wav (first match anywhere under the job folder).
    wav = next(work_out.rglob("*.wav"), None)
    if wav is None:
        raise RuntimeError("No audio produced by the model.")

    # Normalize / resample to SR (safeguard)
    fixed = work_out / "foley_48k.wav"
    sh(shlex.join([
        "ffmpeg", "-y", "-i", str(wav),
        "-ar", str(SR), "-ac", "2",
        str(fixed),
    ]))
    return str(fixed)
# ========= Post: optional mux back to the video =========
def mux_audio_with_video(video_path: str, audio_path: str) -> str:
    """Combine a video stream with a foley audio track into a new MP4.

    The first video stream of ``video_path`` is copied without re-encoding
    and the first audio stream of ``audio_path`` is encoded as 192k AAC.
    ``-shortest`` ends the output with the shorter of the two streams.

    Returns:
        Path to ``with_foley.mp4`` inside a fresh temp directory.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails.
    """
    out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
    # shlex.join quotes each argument so paths with spaces, quotes or `$`
    # reach ffmpeg unchanged.
    sh(shlex.join([
        "ffmpeg", "-y",
        "-i", str(video_path),
        "-i", str(audio_path),
        "-map", "0:v:0", "-map", "1:a:0",
        "-c:v", "copy", "-c:a", "aac", "-b:a", "192k",
        "-shortest",
        str(out_path),
    ]))
    return str(out_path)
# ========= Gradio UI Logic =========
def single_generate(video: str, prompt: str, want_mux: bool, project_name: str) -> Tuple[Optional[str], Optional[str], str, list]:
    """Handle the "Generate" button for a single clip.

    Args:
        video: Path of the uploaded video (empty/None when nothing uploaded).
        prompt: Optional sound prompt.
        want_mux: Whether to also produce an MP4 with the foley track.
        project_name: Accepted from the UI but currently unused.

    Returns:
        ``(wav_path, muxed_video_path_or_None, status_markdown, history)``
        where ``history`` is a list of ``[step, note]`` rows. Any exception
        is reported through the status and history instead of being raised,
        and both paths are then ``None``.
    """
    history = []
    try:
        if not video:
            return None, None, "⚠️ Please upload a video.", history

        # Preprocess
        history.append(["Preprocess", "Downscaling / trimming…"])
        pre_path, final_dur = preprocess_video(video)

        # Run model (ZeroGPU)
        history.append(["Inference", "Generating foley on GPU…"])
        wav = run_model(pre_path, prompt or "")

        # Optional Mux
        muxed = None
        if want_mux:
            history.append(["Mux", "Combining foley with video…"])
            muxed = mux_audio_with_video(pre_path, wav)

        history.append(["Done", f"OK · Duration ~{final_dur:.1f}s"])
        return wav, muxed, f"✅ Finished (≈ {final_dur:.1f}s)", history
    except Exception as e:
        # UI boundary: surface the failure in the activity table rather
        # than letting it propagate out of the event handler.
        history.append(["Error", str(e)])
        return None, None, f"❌ {type(e).__name__}: {e}", history
def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[str, list]:
    """Process a tiny queue of clips sequentially.

    At most 3 items are processed to stay quota-friendly; extra uploads are
    dropped with an "Info" log row. A failure on one clip is logged and the
    loop moves on to the next. A JSON manifest of the successful
    ``{"wav", "video"}`` pairs is written into OUT_DIR.

    Returns:
        ``(status_markdown, log)`` where ``log`` is a list of
        ``[step, note]`` rows.
    """
    log = []
    if not files:
        return "⚠️ Please upload 1–3 videos.", log
    if len(files) > 3:
        files = files[:3]
        log.append(["Info", "Limiting to first 3 videos."])

    outputs = []
    for i, f in enumerate(files, 1):
        try:
            log.append([f"Preprocess {i}", Path(f).name])
            pre, final_dur = preprocess_video(f)
            log.append([f"Run {i}", f"GPU infer ~{final_dur:.1f}s"])
            wav = run_model(pre, prompt or "")
            muxed = mux_audio_with_video(pre, wav) if want_mux else None
            outputs.append((wav, muxed))
            log.append([f"Done {i}", "OK"])
        except Exception as e:
            # Per-item boundary: record the failure, keep going.
            log.append([f"Error {i}", str(e)])

    # Write a small manifest to outputs
    manifest = OUT_DIR / f"batchlite_{uuid.uuid4().hex[:6]}.json"
    manifest.write_text(
        json.dumps(
            [{"wav": w, "video": v} for (w, v) in outputs],
            ensure_ascii=False,
            indent=2,
        ),
        # ensure_ascii=False can emit non-ASCII characters, so pin the
        # encoding instead of relying on the platform default.
        encoding="utf-8",
    )
    return f"✅ Batch-lite finished · items: {len(outputs)}", log
# ========= UI =========
# Custom CSS handed to gr.Blocks; braces are doubled because this is an
# f-string (only PRIMARY_COLOR is interpolated).
THEME_CSS = f"""
:root {{
  --brand: {PRIMARY_COLOR};
}}
.gradio-container {{
  font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Cairo, Noto Sans, Arial, "Apple Color Emoji", "Segoe UI Emoji";
}}
#brandbar {{
  background: linear-gradient(90deg, var(--brand), #222);
  color: white; padding: 12px 16px; border-radius: 12px;
}}
#brandbar strong {{ letter-spacing: .3px; }}
footer, #footer {{}}
"""
# Build the three-tab UI and wire the two generate handlers.
# NOTE(review): the nesting of the layout containers (Group / Row) below was
# reconstructed from a copy of this file that had lost its indentation --
# confirm it matches the intended layout. Event wiring does not depend on it.
with gr.Blocks(
    css=THEME_CSS,
    title="Foley Studio · ZeroGPU"
) as demo:
    with gr.Row():
        gr.HTML(f'<div id="brandbar"><strong>{BILS_BRAND}</strong> — HunyuanVideo-Foley on ZeroGPU</div>')

    with gr.Tabs():
        with gr.Tab("🎬 Single Clip"):
            with gr.Group():
                project_name = gr.Textbox(label="Project name (optional)", placeholder="e.g., JawharaFM Teaser 09-2025")
                with gr.Row():
                    v_single = gr.Video(label="Video (≤ ~20s recommended)")
                    p_single = gr.Textbox(label="Sound prompt (optional)", placeholder="e.g., soft footsteps, indoor reverb, light rain outside")
                with gr.Row():
                    want_mux_single = gr.Checkbox(value=True, label="Mux foley back into video (MP4)")
                    run_btn = gr.Button("Generate", variant="primary")
            with gr.Row():
                out_audio = gr.Audio(label="Generated Foley (48 kHz WAV)", type="filepath")
                out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
            status_md = gr.Markdown()
            history_table = gr.Dataframe(headers=["Step", "Note"], datatype=["str", "str"], interactive=False, wrap=True, label="Activity")

            run_btn.click(
                single_generate,
                inputs=[v_single, p_single, want_mux_single, project_name],
                outputs=[out_audio, out_mux, status_md, history_table]
            )

        with gr.Tab("📦 Batch-Lite (1–3 clips)"):
            files = gr.Files(label="Upload 1–3 short videos", file_types=[".mp4", ".mov"], file_count="multiple")
            prompt_b = gr.Textbox(label="Global prompt (optional)")
            want_mux_b = gr.Checkbox(value=True, label="Mux each output")
            go_b = gr.Button("Run batch-lite")
            batch_status = gr.Markdown()
            batch_log = gr.Dataframe(headers=["Step", "Note"], datatype=["str", "str"], interactive=False, wrap=True, label="Batch Log")

            go_b.click(
                batch_lite_generate,
                inputs=[files, prompt_b, want_mux_b],
                outputs=[batch_status, batch_log]
            )

        with gr.Tab("⚙️ Settings / Tips"):
            # Text is kept at column 0 so its Markdown structure does not
            # depend on how the component treats leading indentation.
            gr.Markdown(f"""
**ZeroGPU Budget Tips**

- Keep clips **≤ {MAX_SECS}s** (tool trims automatically if longer).
- Video is downscaled to **{TARGET_H}p** to speed up inference.
- If you hit a quota message, try again later; ZeroGPU limits GPU minutes per visitor.

**Branding**

- Change brand name / color via environment variables:
  - `BILS_BRAND` → header text
  - `PRIMARY_COLOR` → UI accent hex

**Outputs**

- WAV is 48 kHz stereo. Toggle **Mux** to get a ready MP4 with the foley track.
""")

# Queue holds at most 24 pending requests; launch starts the server.
demo.queue(max_size=24).launch()