Spaces:

FunAudioLLM
/

ThinkSound

Running on Zero

App Files Files Community

UncleWang233 committed on Jun 30

Commit

37b79a6

1 Parent(s): d62c880

0630

Browse files

Files changed (3) hide show

.gradio/certificate.pem +31 -0
app.py +24 -6
requirements.txt +1 -0

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

app.py CHANGED Viewed

@@ -32,6 +32,7 @@ import gradio as gr
 import tempfile
 import subprocess
 from huggingface_hub import hf_hub_download
 _CLIP_SIZE = 224
 _CLIP_FPS = 8.0
@@ -62,7 +63,7 @@ class VGGSound(Dataset):
         self,
         sample_rate: int = 44_100,
         duration_sec: float = 9.0,
-        audio_samples: Optional[int] = 397312,
         normalize_audio: bool = False,
     ):
         if audio_samples is None:
@@ -182,8 +183,8 @@ else:
     device = 'cpu'
     extra_device = 'cpu'
-vae_ckpt = hf_hub_download(repo_id="UncleWang233/occdata", filename="epoch=3-step=100000.ckpt",repo_type="dataset")
-synchformer_ckpt = hf_hub_download(repo_id="UncleWang233/occdata", filename="synchformer_state_dict.pth",repo_type="dataset")
 feature_extractor = FeaturesUtils(
     vae_ckpt=vae_ckpt,
     vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
@@ -191,7 +192,7 @@ feature_extractor = FeaturesUtils(
     synchformer_ckpt=synchformer_ckpt
 ).eval().to(extra_device)
-preprocesser = VGGSound()
 args = get_all_args()
@@ -224,7 +225,7 @@ model.pretransform.load_state_dict(load_vae_state)
 # Remove weight_norm from the pretransform if specified
 if args.remove_pretransform_weight_norm == "post_load":
     remove_weight_norm_from_model(model.pretransform)
-ckpt_path = hf_hub_download(repo_id="UncleWang233/occdata", filename="epoch=10-step=68000.ckpt",repo_type="dataset")
 training_wrapper = create_training_wrapper_from_config(model_config, model)
 # 加载模型权重时根据设备选择map_location
 if device == 'cuda':
@@ -232,13 +233,23 @@ if device == 'cuda':
 else:
     training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])
 def get_audio(video_path, caption):
     # 允许caption为空
     if caption is None:
         caption = ''
     timer = Timer(duration="00:15:00:00")
     data = preprocesser.sample(video_path, caption)
     preprocessed_data = {}
     metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
     preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
@@ -253,11 +264,17 @@ def get_audio(video_path, caption):
     sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
     preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
     preprocessed_data['video_exist'] = torch.tensor(True)
     metadata = [preprocessed_data]
     batch_size = 1
-    length = 194
     with torch.amp.autocast(device):
         conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
@@ -288,6 +305,7 @@ def get_audio(video_path, caption):
         audio_path = tmp_audio.name
     return audio_path
 # 合成新视频：用ffmpeg将音频与原视频合成
 def synthesize_video_with_audio(video_file, caption):

 import tempfile
 import subprocess
 from huggingface_hub import hf_hub_download
+from moviepy.editor import VideoFileClip
 _CLIP_SIZE = 224
 _CLIP_FPS = 8.0
         self,
         sample_rate: int = 44_100,
         duration_sec: float = 9.0,
+        audio_samples: int = None,
         normalize_audio: bool = False,
     ):
         if audio_samples is None:
     device = 'cpu'
     extra_device = 'cpu'
+vae_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=3-step=100000.ckpt",repo_type="model")
+synchformer_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")
 feature_extractor = FeaturesUtils(
     vae_ckpt=vae_ckpt,
     vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
     synchformer_ckpt=synchformer_ckpt
 ).eval().to(extra_device)
 args = get_all_args()
 # Remove weight_norm from the pretransform if specified
 if args.remove_pretransform_weight_norm == "post_load":
     remove_weight_norm_from_model(model.pretransform)
+ckpt_path = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=10-step=68000.ckpt",repo_type="model")
 training_wrapper = create_training_wrapper_from_config(model_config, model)
 # 加载模型权重时根据设备选择map_location
 if device == 'cuda':
 else:
     training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])
+def get_video_duration(video_path):
+    """Return the ``duration`` attribute of the video file at ``video_path``.
+
+    The clip is opened only to read its duration and is closed again before
+    returning, so repeated calls do not accumulate open clip resources.
+    The caller stores the result as ``duration_sec``.
+    """
+    # VideoFileClip supports the context-manager protocol; leaving the
+    # ``with`` block calls ``close()`` even if reading ``duration`` raises.
+    with VideoFileClip(video_path) as video:
+        return video.duration
 def get_audio(video_path, caption):
     # 允许caption为空
     if caption is None:
         caption = ''
     timer = Timer(duration="00:15:00:00")
+    #get video duration
+    duration_sec = get_video_duration(video_path)
+    print(duration_sec)
+    preprocesser = VGGSound(duration_sec=duration_sec)
     data = preprocesser.sample(video_path, caption)
     preprocessed_data = {}
     metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
     preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
     sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
     preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
     preprocessed_data['video_exist'] = torch.tensor(True)
+    print("clip_shape", preprocessed_data['metaclip_features'].shape)
+    print("sync_shape", preprocessed_data['sync_features'].shape)
+    sync_seq_len = preprocessed_data['sync_features'].shape[0]
+    clip_seq_len = preprocessed_data['metaclip_features'].shape[0]
+    latent_seq_len = (int)(194/9*duration_sec)
+    training_wrapper.diffusion.model.model.update_seq_lengths(latent_seq_len, clip_seq_len, sync_seq_len)
     metadata = [preprocessed_data]
     batch_size = 1
+    length = latent_seq_len
     with torch.amp.autocast(device):
         conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)
         audio_path = tmp_audio.name
     return audio_path
+get_audio("./examples/3_mute.mp4", "Axe striking")
 # 合成新视频：用ffmpeg将音频与原视频合成
 def synthesize_video_with_audio(video_file, caption):

requirements.txt CHANGED Viewed

@@ -230,3 +230,4 @@ xyzservices==2025.4.0
 yarl==1.20.0
 zipp==3.21.0
 git+https://github.com/patrick-kidger/torchcubicspline.git

 yarl==1.20.0
 zipp==3.21.0
 git+https://github.com/patrick-kidger/torchcubicspline.git
+moviepy==1.0.3