Bils committed on
Commit
cc49c73
·
verified ·
1 Parent(s): 50d7cf9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -47
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os, json, tempfile, subprocess, shutil, uuid
2
  from pathlib import Path
3
  from typing import Optional, Tuple, List
4
 
@@ -6,6 +6,9 @@ import gradio as gr
6
  import spaces
7
  from huggingface_hub import snapshot_download
8
 
 
 
 
9
  # ========= Paths & Config =========
10
  ROOT = Path(__file__).parent.resolve()
11
  REPO_DIR = ROOT / "HunyuanVideo-Foley"
@@ -15,14 +18,15 @@ OUT_DIR = ROOT / "outputs"
15
  ASSETS = ROOT / "assets"
16
  ASSETS.mkdir(exist_ok=True)
17
 
18
- # You can keep these env vars silently; we just won't mention them in the UI
19
  APP_TITLE = os.environ.get("APP_TITLE", "Foley Studio · ZeroGPU")
20
  APP_TAGLINE = os.environ.get("APP_TAGLINE", "Generate scene-true foley for short clips (ZeroGPU-ready).")
21
- PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF")
22
 
23
- MAX_SECS = int(os.environ.get("MAX_SECS", "22")) # ZeroGPU-friendly clip length
24
- TARGET_H = int(os.environ.get("TARGET_H", "480")) # downscale target height
25
- SR = int(os.environ.get("TARGET_SR", "48000")) # WAV sample rate
 
 
26
 
27
  def sh(cmd: str):
28
  print(">>", cmd)
@@ -71,7 +75,7 @@ def _clone_without_lfs():
71
  sparse_file = REPO_DIR / ".git" / "info" / "sparse-checkout"
72
  sparse_file.parent.mkdir(parents=True, exist_ok=True)
73
  sparse_file.write_text("\n".join([
74
- "infer.py",
75
  "configs/",
76
  "gradio_app.py",
77
  "requirements.txt",
@@ -89,19 +93,78 @@ def _clone_without_lfs():
89
  def prepare_once():
90
  """Clone code (skip LFS), download weights, set env, prepare dirs."""
91
  _clone_without_lfs()
 
 
 
 
 
92
  WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
93
  snapshot_download(
94
  repo_id="tencent/HunyuanVideo-Foley",
95
  local_dir=str(WEIGHTS_DIR),
96
  local_dir_use_symlinks=False,
97
  repo_type="model",
 
98
  )
99
  os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
 
100
  CACHE_DIR.mkdir(exist_ok=True)
101
  OUT_DIR.mkdir(exist_ok=True)
102
 
103
  prepare_once()
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  # ========= Preprocessing =========
106
  def preprocess_video(in_path: str) -> Tuple[str, float]:
107
  """
@@ -145,42 +208,44 @@ def preprocess_video(in_path: str) -> Tuple[str, float]:
145
  return str(processed), final_dur
146
 
147
  # ========= Inference (ZeroGPU) =========
148
- @spaces.GPU(duration=240) # ~4 minutes per call (ZeroGPU window)
149
- def run_model(video_path: str, prompt_text: str) -> str:
 
 
 
 
150
  """
151
- Call Tencent's infer.py on GPU and return a 48 kHz WAV path.
152
  """
153
- job_id = uuid.uuid4().hex[:8]
154
- work_out = OUT_DIR / f"job_{job_id}"
155
- work_out.mkdir(parents=True, exist_ok=True)
156
-
157
- cmd = [
158
- "python", str(REPO_DIR / "infer.py"),
159
- "--model_path", str(WEIGHTS_DIR),
160
- "--config_path", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml"),
161
- "--single_video", video_path,
162
- "--single_prompt", json.dumps(prompt_text or ""),
163
- "--output_dir", str(work_out),
164
- "--device", "cuda",
165
- ]
166
- sh(" ".join(cmd))
167
-
168
- # Find produced wav
169
- wav = None
170
- for p in work_out.rglob("*.wav"):
171
- wav = p
172
- break
173
- if not wav:
174
- raise RuntimeError("No audio produced by the model.")
175
-
176
- # Normalize / resample to SR stereo
177
- fixed = work_out / "foley_48k.wav"
178
- sh(" ".join([
179
- "ffmpeg", "-y", "-i", f"\"{str(wav)}\"",
180
- "-ar", str(SR), "-ac", "2",
181
- f"\"{str(fixed)}\""
182
- ]))
183
- return str(fixed)
184
 
185
  # ========= Optional: Mux Foley back to video =========
186
  def mux_audio_with_video(video_path: str, audio_path: str) -> str:
@@ -197,26 +262,34 @@ def mux_audio_with_video(video_path: str, audio_path: str) -> str:
197
  return str(out_path)
198
 
199
  # ========= UI Handlers =========
200
- def single_generate(video: str, prompt: str, want_mux: bool, project_name: str) -> Tuple[Optional[str], Optional[str], str, list]:
201
  history = []
202
  try:
203
  if not video:
204
  return None, None, "⚠️ Please upload a video.", history
205
  history.append(["Preprocess", "Downscaling & trimming"])
206
  pre_path, final_dur = preprocess_video(video)
207
- history.append(["Inference", "Running on ZeroGPU"])
208
- wav = run_model(pre_path, prompt or "")
 
 
 
 
 
 
 
209
  muxed = None
210
  if want_mux:
211
  history.append(["Mux", "Merging foley with video"])
212
  muxed = mux_audio_with_video(pre_path, wav)
 
213
  history.append(["Done", f"OK · ~{final_dur:.1f}s"])
214
  return wav, muxed, f"✅ Completed (~{final_dur:.1f}s)", history
215
  except Exception as e:
216
  history.append(["Error", str(e)])
217
  return None, None, f"❌ {type(e).__name__}: {e}", history
218
 
219
- def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[str, list]:
220
  log = []
221
  if not files:
222
  return "⚠️ Please upload 1–3 videos.", log
@@ -230,7 +303,10 @@ def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[
230
  log.append([f"Preprocess {i}", Path(f).name])
231
  pre, final_dur = preprocess_video(f)
232
  log.append([f"Run {i}", f"ZeroGPU ~{final_dur:.1f}s"])
233
- wav = run_model(pre, prompt or "")
 
 
 
234
  muxed = mux_audio_with_video(pre, wav) if want_mux else None
235
  outputs.append((wav, muxed))
236
  log.append([f"Done {i}", "OK"])
@@ -359,7 +435,7 @@ with gr.Blocks(css=THEME_CSS, title=APP_TITLE, analytics_enabled=False) as demo:
359
  **Usage guidelines**
360
  - Keep clips short (the tool trims to **≤ {MAX_SECS}s** automatically).
361
  - The video is downscaled to **{TARGET_H}p** to fit the ZeroGPU time window.
362
- - If you see a quota message, just try again later (ZeroGPU limits GPU minutes per visitor).
363
 
364
  **Outputs**
365
  - WAV is **{SR//1000} kHz** stereo.
 
1
+ import os, sys, json, tempfile, subprocess, shutil, uuid
2
  from pathlib import Path
3
  from typing import Optional, Tuple, List
4
 
 
6
  import spaces
7
  from huggingface_hub import snapshot_download
8
 
9
+ from loguru import logger
10
+ import torch, torchaudio
11
+
12
  # ========= Paths & Config =========
13
  ROOT = Path(__file__).parent.resolve()
14
  REPO_DIR = ROOT / "HunyuanVideo-Foley"
 
18
  ASSETS = ROOT / "assets"
19
  ASSETS.mkdir(exist_ok=True)
20
 
 
21
  APP_TITLE = os.environ.get("APP_TITLE", "Foley Studio · ZeroGPU")
22
  APP_TAGLINE = os.environ.get("APP_TAGLINE", "Generate scene-true foley for short clips (ZeroGPU-ready).")
23
+ PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF") # UI accent only
24
 
25
+ # ZeroGPU-safe defaults (tweak in Space Secrets if needed)
26
+ MAX_SECS = int(os.environ.get("MAX_SECS", "15")) # keep clips short for ZeroGPU window
27
+ TARGET_H = int(os.environ.get("TARGET_H", "480")) # downscale target height
28
+ SR = int(os.environ.get("TARGET_SR", "48000")) # WAV sample rate
29
+ ZEROGPU_DURATION = int(os.environ.get("ZEROGPU_DURATION", "110")) # <= platform maximum
30
 
31
  def sh(cmd: str):
32
  print(">>", cmd)
 
75
  sparse_file = REPO_DIR / ".git" / "info" / "sparse-checkout"
76
  sparse_file.parent.mkdir(parents=True, exist_ok=True)
77
  sparse_file.write_text("\n".join([
78
+ "hunyuanvideo_foley/",
79
  "configs/",
80
  "gradio_app.py",
81
  "requirements.txt",
 
93
  def prepare_once():
94
  """Clone code (skip LFS), download weights, set env, prepare dirs."""
95
  _clone_without_lfs()
96
+
97
+ # Ensure we can import their package
98
+ if str(REPO_DIR) not in sys.path:
99
+ sys.path.insert(0, str(REPO_DIR))
100
+
101
  WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
102
  snapshot_download(
103
  repo_id="tencent/HunyuanVideo-Foley",
104
  local_dir=str(WEIGHTS_DIR),
105
  local_dir_use_symlinks=False,
106
  repo_type="model",
107
+ resume_download=True,
108
  )
109
  os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
110
+
111
  CACHE_DIR.mkdir(exist_ok=True)
112
  OUT_DIR.mkdir(exist_ok=True)
113
 
114
  prepare_once()
115
 
116
+ # Now safe to import their internals
117
+ from hunyuanvideo_foley.utils.model_utils import load_model, denoise_process
118
+ from hunyuanvideo_foley.utils.feature_utils import feature_process
119
+ from hunyuanvideo_foley.utils.media_utils import merge_audio_video
120
+
121
+ # ========= Native Model Setup =========
122
+ MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", str(WEIGHTS_DIR))
123
+ CONFIG_PATH = str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml")
124
+
125
+ _model_dict = None
126
+ _cfg = None
127
+ _device = None
128
+
129
def _setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
    """Resolve the torch device to run on and log the choice.

    Args:
        device_str: ``"auto"`` to probe CUDA, then MPS, then fall back to CPU;
            any other value is passed to ``torch.device``. The bare string
            ``"cuda"`` is expanded to ``cuda:<gpu_id>``.
        gpu_id: CUDA device index used for the ``"auto"`` and ``"cuda"`` cases.

    Returns:
        The selected ``torch.device``.
    """
    # Explicit request: honour it, only expanding a bare "cuda".
    if device_str != "auto":
        requested = f"cuda:{gpu_id}" if device_str == "cuda" else device_str
        chosen = torch.device(requested)
        logger.info(f"Using specified device: {chosen}")
        return chosen

    # Auto-detection, in order of preference.
    if torch.cuda.is_available():
        chosen = torch.device(f"cuda:{gpu_id}")
        logger.info(f"Using CUDA {chosen}")
        return chosen

    if torch.backends.mps.is_available():
        chosen = torch.device("mps")
        logger.info("Using MPS")
        return chosen

    chosen = torch.device("cpu")
    logger.info("Using CPU")
    return chosen
144
+
145
def auto_load_models() -> str:
    """Load the model into the module-level ``_model_dict`` / ``_cfg`` / ``_device``.

    Creates ``MODEL_PATH`` when nothing exists at that location, verifies that
    ``CONFIG_PATH`` exists, selects a device with ``_setup_device("auto", 0)``
    and calls ``load_model``. This function does not download weights itself.

    Returns:
        A human-readable status string. A missing config file is reported
        through the returned message rather than by raising.
    """
    global _model_dict, _cfg, _device

    # Only create the directory when the path is entirely absent.
    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH, exist_ok=True)

    # Without the config there is nothing to load; report instead of raising.
    if not os.path.exists(CONFIG_PATH):
        return f"❌ Config file not found: {CONFIG_PATH}"

    _device = _setup_device("auto", 0)

    startup_lines = (
        "Loading HunyuanVideo-Foley model...",
        f"MODEL_PATH: {MODEL_PATH}",
        f"CONFIG_PATH: {CONFIG_PATH}",
    )
    for line in startup_lines:
        logger.info(line)

    _model_dict, _cfg = load_model(MODEL_PATH, CONFIG_PATH, _device)

    status = "✅ Model loaded"
    logger.info(status)
    return status
162
+
163
+ # Init logger and load model once
164
+ logger.remove()
165
+ logger.add(lambda msg: print(msg, end=''), level="INFO")
166
+ logger.info(auto_load_models())
167
+
168
  # ========= Preprocessing =========
169
  def preprocess_video(in_path: str) -> Tuple[str, float]:
170
  """
 
208
  return str(processed), final_dur
209
 
210
  # ========= Inference (ZeroGPU) =========
211
@spaces.GPU(duration=ZEROGPU_DURATION)  # tune via env if needed
@torch.inference_mode()
def run_model(video_path: str, prompt_text: str,
              guidance_scale: float = 4.5,
              num_inference_steps: int = 50,
              sample_nums: int = 1) -> Tuple[List[str], int]:
    """Run native (in-process) inference and write the results as WAV files.

    Args:
        video_path: Path of the video handed to ``feature_process``.
        prompt_text: Optional text prompt; ``None`` or blank becomes ``""``.
        guidance_scale: Forwarded to ``denoise_process``.
        num_inference_steps: Forwarded to ``denoise_process``.
        sample_nums: Number of samples requested (used as ``batch_size``)
            and the number of WAV files written.

    Returns:
        ``(wav_paths, sample_rate)`` where ``wav_paths`` are the files written
        under a fresh ``OUT_DIR / "job_<id>"`` directory.

    Raises:
        RuntimeError: If the module-level model or config is not loaded.
    """
    model_ready = _model_dict is not None and _cfg is not None
    if not model_ready:
        raise RuntimeError("Model not loaded yet.")

    cleaned_prompt = (prompt_text or "").strip()

    # Feature extraction for the video/prompt pair.
    visual_feats, text_feats, audio_len_s = feature_process(
        video_path, cleaned_prompt, _model_dict, _cfg
    )

    # Audio generation; original author's note: batch is laid out B x C x T.
    logger.info(f"Generating {sample_nums} sample(s)...")
    audio_batch, sr = denoise_process(
        visual_feats, text_feats, audio_len_s, _model_dict, _cfg,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        batch_size=sample_nums,
    )

    # One uniquely named job directory per call keeps outputs from colliding.
    job_dir = OUT_DIR / f"job_{uuid.uuid4().hex[:8]}"
    job_dir.mkdir(parents=True, exist_ok=True)

    wav_paths = []
    for sample_no in range(1, sample_nums + 1):
        target = str(job_dir / f"generated_audio_{sample_no}.wav")
        torchaudio.save(target, audio_batch[sample_no - 1], sr)
        wav_paths.append(target)

    return wav_paths, sr
 
 
249
 
250
  # ========= Optional: Mux Foley back to video =========
251
  def mux_audio_with_video(video_path: str, audio_path: str) -> str:
 
262
  return str(out_path)
263
 
264
  # ========= UI Handlers =========
265
def single_generate(video: str, prompt: str, want_mux: bool, project_name: str):
    """UI handler: generate foley for one uploaded video.

    Args:
        video: Path of the uploaded video; falsy means nothing was uploaded.
        prompt: Optional text prompt (``None``/empty is passed on as ``""``).
        want_mux: When true, also merge the generated audio into the video.
        project_name: Accepted for the UI wiring; not used by this function.

    Returns:
        ``(wav_path, muxed_path, status_message, history)``. ``history`` is a
        list of ``[stage, detail]`` pairs. On a missing upload or on any
        exception both paths are ``None`` and the message describes the issue.
    """
    history = []
    try:
        if not video:
            return None, None, "⚠️ Please upload a video.", history

        history.append(["Preprocess", "Downscaling & trimming"])
        clip_path, clip_secs = preprocess_video(video)

        history.append(["Inference", "ZeroGPU native pipeline"])
        generated, _sample_rate = run_model(
            clip_path,
            prompt or "",
            guidance_scale=4.5,
            num_inference_steps=50,
            sample_nums=1,
        )
        if not generated:
            raise RuntimeError("No audio produced.")
        wav = generated[0]

        if want_mux:
            history.append(["Mux", "Merging foley with video"])
            muxed = mux_audio_with_video(clip_path, wav)
        else:
            muxed = None

        history.append(["Done", f"OK · ~{clip_secs:.1f}s"])
        return wav, muxed, f"✅ Completed (~{clip_secs:.1f}s)", history
    except Exception as err:
        # UI boundary: surface the failure in the status/history, don't crash.
        history.append(["Error", str(err)])
        return None, None, f"❌ {type(err).__name__}: {err}", history
291
 
292
+ def batch_lite_generate(files: List[str], prompt: str, want_mux: bool):
293
  log = []
294
  if not files:
295
  return "⚠️ Please upload 1–3 videos.", log
 
303
  log.append([f"Preprocess {i}", Path(f).name])
304
  pre, final_dur = preprocess_video(f)
305
  log.append([f"Run {i}", f"ZeroGPU ~{final_dur:.1f}s"])
306
+ wav_list, sr = run_model(pre, prompt or "", sample_nums=1)
307
+ if not wav_list:
308
+ raise RuntimeError("No audio produced.")
309
+ wav = wav_list[0]
310
  muxed = mux_audio_with_video(pre, wav) if want_mux else None
311
  outputs.append((wav, muxed))
312
  log.append([f"Done {i}", "OK"])
 
435
  **Usage guidelines**
436
  - Keep clips short (the tool trims to **≤ {MAX_SECS}s** automatically).
437
  - The video is downscaled to **{TARGET_H}p** to fit the ZeroGPU time window.
438
+ - If you see a quota message, try again later (ZeroGPU limits GPU minutes per visitor).
439
 
440
  **Outputs**
441
  - WAV is **{SR//1000} kHz** stereo.