Spaces: Running on Zero
Commit · 37b79a6
1 Parent(s): d62c880
0630

Files changed:
- .gradio/certificate.pem +31 -0
- app.py +24 -6
- requirements.txt +1 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
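For reference: the subject and validity dates encoded in the Base64 above match ISRG Root X1, the Let's Encrypt root CA that Gradio writes to .gradio/certificate.pem for outbound HTTPS. A minimal sketch to confirm what was committed, assuming the third-party `cryptography` package is available:

from cryptography import x509

# Load the committed PEM and print its identity fields.
with open(".gradio/certificate.pem", "rb") as f:
    cert = x509.load_pem_x509_certificate(f.read())

print(cert.subject.rfc4514_string())  # expected: CN=ISRG Root X1,O=Internet Security Research Group,C=US
print(cert.not_valid_after)           # expected: 2035-06-04, per the encoded validity window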
app.py CHANGED
@@ -32,6 +32,7 @@ import gradio as gr
 import tempfile
 import subprocess
 from huggingface_hub import hf_hub_download
+from moviepy.editor import VideoFileClip

 _CLIP_SIZE = 224
 _CLIP_FPS = 8.0
@@ -62,7 +63,7 @@ class VGGSound(Dataset):
         self,
         sample_rate: int = 44_100,
         duration_sec: float = 9.0,
-        audio_samples:
+        audio_samples: int = None,
         normalize_audio: bool = False,
     ):
         if audio_samples is None:
@@ -182,8 +183,8 @@ else:
     device = 'cpu'
     extra_device = 'cpu'

-vae_ckpt = hf_hub_download(repo_id="
-synchformer_ckpt = hf_hub_download(repo_id="
+vae_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=3-step=100000.ckpt",repo_type="model")
+synchformer_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")
 feature_extractor = FeaturesUtils(
     vae_ckpt=vae_ckpt,
     vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
@@ -191,7 +192,7 @@ feature_extractor = FeaturesUtils(
     synchformer_ckpt=synchformer_ckpt
 ).eval().to(extra_device)

-
+

 args = get_all_args()

@@ -224,7 +225,7 @@ model.pretransform.load_state_dict(load_vae_state)
 # Remove weight_norm from the pretransform if specified
 if args.remove_pretransform_weight_norm == "post_load":
     remove_weight_norm_from_model(model.pretransform)
-ckpt_path = hf_hub_download(repo_id="
+ckpt_path = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="epoch=10-step=68000.ckpt",repo_type="model")
 training_wrapper = create_training_wrapper_from_config(model_config, model)
 # Choose map_location based on the device when loading the model weights
 if device == 'cuda':
@@ -232,13 +233,23 @@ if device == 'cuda':
 else:
     training_wrapper.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu'))['state_dict'])

+def get_video_duration(video_path):
+    video = VideoFileClip(video_path)
+    return video.duration
+
 def get_audio(video_path, caption):
     # Allow the caption to be empty
     if caption is None:
         caption = ''
     timer = Timer(duration="00:15:00:00")
+    # get video duration
+    duration_sec = get_video_duration(video_path)
+    print(duration_sec)
+    preprocesser = VGGSound(duration_sec=duration_sec)
     data = preprocesser.sample(video_path, caption)

+
+
     preprocessed_data = {}
     metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
     preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
@@ -253,11 +264,17 @@ def get_audio(video_path, caption):
     sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
     preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
     preprocessed_data['video_exist'] = torch.tensor(True)
+    print("clip_shape", preprocessed_data['metaclip_features'].shape)
+    print("sync_shape", preprocessed_data['sync_features'].shape)
+    sync_seq_len = preprocessed_data['sync_features'].shape[0]
+    clip_seq_len = preprocessed_data['metaclip_features'].shape[0]
+    latent_seq_len = (int)(194/9*duration_sec)
+    training_wrapper.diffusion.model.model.update_seq_lengths(latent_seq_len, clip_seq_len, sync_seq_len)

     metadata = [preprocessed_data]

     batch_size = 1
-    length =
+    length = latent_seq_len
     with torch.amp.autocast(device):
         conditioning = training_wrapper.diffusion.conditioner(metadata, training_wrapper.device)

@@ -288,6 +305,7 @@ def get_audio(video_path, caption):
     audio_path = tmp_audio.name
     return audio_path

+get_audio("./examples/3_mute.mp4", "Axe striking")
 # Synthesize the new video: mux the audio with the original video using ffmpeg

 def synthesize_video_with_audio(video_file, caption):
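The substance of this change is making the pipeline length-aware: the Space previously assumed a fixed 9-second clip, and the new latent_seq_len = (int)(194/9*duration_sec) scales the diffusion latent length from the 194 latent frames that correspond to that 9-second training window (roughly 21.6 latent frames per second). The same value feeds both update_seq_lengths and the sampler's length, keeping conditioning and generation in agreement. Below is a minimal sketch of that arithmetic in isolation; the helper name latent_seq_len_for and the constant name are illustrative, not from the repo:

from moviepy.editor import VideoFileClip  # moviepy 1.x, as pinned in requirements.txt

LATENT_FRAMES_PER_9_SEC = 194  # latent length the model was trained with for 9 s clips

def latent_seq_len_for(video_path: str) -> int:
    """Scale the latent sequence length linearly with the video duration."""
    duration_sec = VideoFileClip(video_path).duration  # seconds, float
    return int(LATENT_FRAMES_PER_9_SEC / 9 * duration_sec)

# Example: a 9.0 s clip keeps the original 194 latent frames; a 4.5 s clip maps to 97.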
requirements.txt CHANGED
@@ -230,3 +230,4 @@ xyzservices==2025.4.0
 yarl==1.20.0
 zipp==3.21.0
 git+https://github.com/patrick-kidger/torchcubicspline.git
+moviepy==1.0.3
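The exact pin likely matters here: the moviepy.editor import path used in app.py exists in the 1.x series but, to my understanding, was removed in moviepy 2.0, so an unpinned install could break the new duration lookup. For contrast (the 2.x form is noted only as a caveat, not used by this Space):

# moviepy 1.x, matching app.py and this pin:
from moviepy.editor import VideoFileClip

# moviepy >= 2.0 removed the editor module; there the import is
# `from moviepy import VideoFileClip`, which would not match app.py as written.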