Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -80,7 +80,7 @@ from accelerate.utils import set_seed
|
|
| 80 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 81 |
|
| 82 |
|
| 83 |
-
@spaces.GPU(duration=
|
| 84 |
def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
| 85 |
"""
|
| 86 |
Perform lip-sync video generation using an input video and a separate audio track.
|
|
@@ -106,20 +106,22 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
|
| 106 |
print(f"Input audio path: {audio_path}")
|
| 107 |
print(f"Loaded checkpoint path: {inference_ckpt_path}")
|
| 108 |
|
| 109 |
-
is_shared_ui = True
|
| 110 |
temp_dir = None
|
| 111 |
if is_shared_ui:
|
| 112 |
temp_dir = tempfile.mkdtemp()
|
|
|
|
| 113 |
cropped_video_path = process_video(video_path)
|
| 114 |
print(f"Cropped video saved to: {cropped_video_path}")
|
| 115 |
video_path=cropped_video_path
|
| 116 |
-
|
| 117 |
trimmed_audio_path = process_audio(audio_path, temp_dir)
|
| 118 |
print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
|
| 119 |
audio_path=trimmed_audio_path
|
| 120 |
|
|
|
|
| 121 |
scheduler = DDIMScheduler.from_pretrained("configs")
|
| 122 |
-
|
| 123 |
if config.model.cross_attention_dim == 768:
|
| 124 |
whisper_model_path = "checkpoints/whisper/small.pt"
|
| 125 |
elif config.model.cross_attention_dim == 384:
|
|
@@ -128,8 +130,9 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
|
| 128 |
raise NotImplementedError("cross_attention_dim must be 768 or 384")
|
| 129 |
|
| 130 |
audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
|
| 131 |
-
|
| 132 |
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
|
|
|
|
| 133 |
vae.config.scaling_factor = 0.18215
|
| 134 |
vae.config.shift_factor = 0
|
| 135 |
|
|
@@ -138,7 +141,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
|
| 138 |
inference_ckpt_path, # load checkpoint
|
| 139 |
device="cpu",
|
| 140 |
)
|
| 141 |
-
|
| 142 |
unet = unet.to(dtype=torch.float16)
|
| 143 |
|
| 144 |
"""
|
|
@@ -154,7 +157,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
|
| 154 |
unet=unet,
|
| 155 |
scheduler=scheduler,
|
| 156 |
).to("cuda")
|
| 157 |
-
|
| 158 |
seed = -1
|
| 159 |
if seed != -1:
|
| 160 |
set_seed(seed)
|
|
@@ -165,7 +168,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
|
| 165 |
|
| 166 |
unique_id = str(uuid.uuid4())
|
| 167 |
video_out_path = f"video_out{unique_id}.mp4"
|
| 168 |
-
|
| 169 |
pipeline(
|
| 170 |
video_path=video_path,
|
| 171 |
audio_path=audio_path,
|
|
@@ -178,7 +181,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
|
| 178 |
width=config.data.resolution,
|
| 179 |
height=config.data.resolution,
|
| 180 |
)
|
| 181 |
-
|
| 182 |
if is_shared_ui:
|
| 183 |
# Clean up the temporary directory
|
| 184 |
if os.path.exists(temp_dir):
|
|
|
|
| 80 |
from latentsync.whisper.audio2feature import Audio2Feature
|
| 81 |
|
| 82 |
|
| 83 |
+
@spaces.GPU(duration=40)
|
| 84 |
def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
|
| 85 |
"""
|
| 86 |
Perform lip-sync video generation using an input video and a separate audio track.
|
|
|
|
| 106 |
print(f"Input audio path: {audio_path}")
|
| 107 |
print(f"Loaded checkpoint path: {inference_ckpt_path}")
|
| 108 |
|
| 109 |
+
is_shared_ui = True
|
| 110 |
temp_dir = None
|
| 111 |
if is_shared_ui:
|
| 112 |
temp_dir = tempfile.mkdtemp()
|
| 113 |
+
print(1)
|
| 114 |
cropped_video_path = process_video(video_path)
|
| 115 |
print(f"Cropped video saved to: {cropped_video_path}")
|
| 116 |
video_path=cropped_video_path
|
| 117 |
+
|
| 118 |
trimmed_audio_path = process_audio(audio_path, temp_dir)
|
| 119 |
print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
|
| 120 |
audio_path=trimmed_audio_path
|
| 121 |
|
| 122 |
+
print(2)
|
| 123 |
scheduler = DDIMScheduler.from_pretrained("configs")
|
| 124 |
+
print(3)
|
| 125 |
if config.model.cross_attention_dim == 768:
|
| 126 |
whisper_model_path = "checkpoints/whisper/small.pt"
|
| 127 |
elif config.model.cross_attention_dim == 384:
|
|
|
|
| 130 |
raise NotImplementedError("cross_attention_dim must be 768 or 384")
|
| 131 |
|
| 132 |
audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
|
| 133 |
+
print(4)
|
| 134 |
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
|
| 135 |
+
print(5)
|
| 136 |
vae.config.scaling_factor = 0.18215
|
| 137 |
vae.config.shift_factor = 0
|
| 138 |
|
|
|
|
| 141 |
inference_ckpt_path, # load checkpoint
|
| 142 |
device="cpu",
|
| 143 |
)
|
| 144 |
+
print(6)
|
| 145 |
unet = unet.to(dtype=torch.float16)
|
| 146 |
|
| 147 |
"""
|
|
|
|
| 157 |
unet=unet,
|
| 158 |
scheduler=scheduler,
|
| 159 |
).to("cuda")
|
| 160 |
+
print(7)
|
| 161 |
seed = -1
|
| 162 |
if seed != -1:
|
| 163 |
set_seed(seed)
|
|
|
|
| 168 |
|
| 169 |
unique_id = str(uuid.uuid4())
|
| 170 |
video_out_path = f"video_out{unique_id}.mp4"
|
| 171 |
+
|
| 172 |
pipeline(
|
| 173 |
video_path=video_path,
|
| 174 |
audio_path=audio_path,
|
|
|
|
| 181 |
width=config.data.resolution,
|
| 182 |
height=config.data.resolution,
|
| 183 |
)
|
| 184 |
+
print(8)
|
| 185 |
if is_shared_ui:
|
| 186 |
# Clean up the temporary directory
|
| 187 |
if os.path.exists(temp_dir):
|