UncleWang233 committed on
Commit
37b79a6
·
1 Parent(s): d62c880
Files changed (3) hide show
  1. .gradio/certificate.pem +31 -0
  2. app.py +24 -6
  3. requirements.txt +1 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -32,6 +32,7 @@ import gradio as gr
32
  import tempfile
33
  import subprocess
34
  from huggingface_hub import hf_hub_download
 
35
 
36
  _CLIP_SIZE = 224
37
  _CLIP_FPS = 8.0
@@ -62,7 +63,7 @@ class VGGSound(Dataset):
62
  self,
63
  sample_rate: int = 44_100,
64
  duration_sec: float = 9.0,
65
- audio_samples: Optional[int] = 397312,
66
  normalize_audio: bool = False,
67
  ):
68
  if audio_samples is None:
@@ -182,8 +183,8 @@ else:
182
  device = 'cpu'
183
  extra_device = 'cpu'
184
 
185
- vae_ckpt = hf_hub_download(repo_id="UncleWang233/occdata", filename="epoch=3-step=100000.ckpt",repo_type="dataset")
186
- synchformer_ckpt = hf_hub_download(repo_id="UncleWang233/occdata", filename="synchformer_state_dict.pth",repo_type="dataset")
187
  feature_extractor = FeaturesUtils(
188
  vae_ckpt=vae_ckpt,
189
  vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
@@ -191,7 +192,7 @@ feature_extractor = FeaturesUtils(
191
  synchformer_ckpt=synchformer_ckpt
192
  ).eval().to(extra_device)
193
 
194
- preprocesser = VGGSound()
195
 
196
  args = get_all_args()
197
 
@@ -224,7 +225,7 @@ model.pretransform.load_state_dict(load_vae_state)
224
  # Remove weight_norm from the pretransform if specified
225
  if args.remove_pretransform_weight_norm == "post_load":
226
  remove_weight_norm_from_model(model.pretransform)
227
- ckpt_path = hf_hub_download(repo_id="UncleWang233/occdata", filename="epoch=10-step=68000.ckpt",repo_type="dataset")
228
  training_wrapper = create_training_wrapper_from_config(model_config, model)
229
  # 加载模型权重时根据设备选择map_location
230
  if device == 'cuda':
@@ -232,13 +233,23 @@ if device == 'cuda':
232
  else:
233
  training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])
234
 
 
 
 
 
235
  def get_audio(video_path, caption):
236
  # 允许caption为空
237
  if caption is None:
238
  caption = ''
239
  timer = Timer(duration="00:15:00:00")
 
 
 
 
240
  data = preprocesser.sample(video_path, caption)
241
 
 
 
242
  preprocessed_data = {}
243
  metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
244
  preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
@@ -253,11 +264,17 @@ def get_audio(video_path, caption):
253
  sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
254
  preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
255
  preprocessed_data['video_exist'] = torch.tensor(True)
 
 
 
 
 
 
256
 
257
  metadata = [preprocessed_data]
258
 
259
  batch_size = 1
260
- length = 194
261
  with torch.amp.autocast(device):
262
  conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
263
 
@@ -288,6 +305,7 @@ def get_audio(video_path, caption):
288
  audio_path = tmp_audio.name
289
  return audio_path
290
 
 
291
  # 合成新视频:用ffmpeg将音频与原视频合成
292
 
293
  def synthesize_video_with_audio(video_file, caption):
 
32
  import tempfile
33
  import subprocess
34
  from huggingface_hub import hf_hub_download
35
+ from moviepy.editor import VideoFileClip
36
 
37
  _CLIP_SIZE = 224
38
  _CLIP_FPS = 8.0
 
63
  self,
64
  sample_rate: int = 44_100,
65
  duration_sec: float = 9.0,
66
+ audio_samples: int = None,
67
  normalize_audio: bool = False,
68
  ):
69
  if audio_samples is None:
 
183
  device = 'cpu'
184
  extra_device = 'cpu'
185
 
186
+ vae_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=3-step=100000.ckpt",repo_type="model")
187
+ synchformer_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")
188
  feature_extractor = FeaturesUtils(
189
  vae_ckpt=vae_ckpt,
190
  vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
 
192
  synchformer_ckpt=synchformer_ckpt
193
  ).eval().to(extra_device)
194
 
195
+
196
 
197
  args = get_all_args()
198
 
 
225
  # Remove weight_norm from the pretransform if specified
226
  if args.remove_pretransform_weight_norm == "post_load":
227
  remove_weight_norm_from_model(model.pretransform)
228
+ ckpt_path = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=10-step=68000.ckpt",repo_type="model")
229
  training_wrapper = create_training_wrapper_from_config(model_config, model)
230
  # 加载模型权重时根据设备选择map_location
231
  if device == 'cuda':
 
233
  else:
234
  training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])
235
 
236
def get_video_duration(video_path):
    """Return the duration of the video file at ``video_path``.

    Args:
        video_path: Path to a video file that moviepy can open.

    Returns:
        The ``duration`` attribute of the opened clip (moviepy reports
        this in seconds).
    """
    # Open the clip as a context manager so that moviepy closes its
    # underlying reader as soon as the duration has been read; the
    # previous version never closed the clip, leaking resources on
    # every call.
    with VideoFileClip(video_path) as video:
        return video.duration
239
+
240
  def get_audio(video_path, caption):
241
  # 允许caption为空
242
  if caption is None:
243
  caption = ''
244
  timer = Timer(duration="00:15:00:00")
245
+ #get video duration
246
+ duration_sec = get_video_duration(video_path)
247
+ print(duration_sec)
248
+ preprocesser = VGGSound(duration_sec=duration_sec)
249
  data = preprocesser.sample(video_path, caption)
250
 
251
+
252
+
253
  preprocessed_data = {}
254
  metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
255
  preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
 
264
  sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
265
  preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
266
  preprocessed_data['video_exist'] = torch.tensor(True)
267
+ print("clip_shape", preprocessed_data['metaclip_features'].shape)
268
+ print("sync_shape", preprocessed_data['sync_features'].shape)
269
+ sync_seq_len = preprocessed_data['sync_features'].shape[0]
270
+ clip_seq_len = preprocessed_data['metaclip_features'].shape[0]
271
+ latent_seq_len = (int)(194/9*duration_sec)
272
+ training_wrapper.diffusion.model.model.update_seq_lengths(latent_seq_len, clip_seq_len, sync_seq_len)
273
 
274
  metadata = [preprocessed_data]
275
 
276
  batch_size = 1
277
+ length = latent_seq_len
278
  with torch.amp.autocast(device):
279
  conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
280
 
 
305
  audio_path = tmp_audio.name
306
  return audio_path
307
 
308
+ get_audio("./examples/3_mute.mp4", "Axe striking")
309
  # 合成新视频:用ffmpeg将音频与原视频合成
310
 
311
  def synthesize_video_with_audio(video_file, caption):
requirements.txt CHANGED
@@ -230,3 +230,4 @@ xyzservices==2025.4.0
230
  yarl==1.20.0
231
  zipp==3.21.0
232
  git+https://github.com/patrick-kidger/torchcubicspline.git
 
 
230
  yarl==1.20.0
231
  zipp==3.21.0
232
  git+https://github.com/patrick-kidger/torchcubicspline.git
233
+ moviepy==1.0.3