Runtime error
Update app.py
app.py CHANGED
@@ -80,7 +80,7 @@ from accelerate.utils import set_seed
 from latentsync.whisper.audio2feature import Audio2Feature
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=40)
 def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     """
     Perform lip-sync video generation using an input video and a separate audio track.
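Here `spaces.GPU` is the Hugging Face ZeroGPU decorator: `duration` is the number of seconds of GPU time reserved for each call, and the commit sets it to 40. Together with the numbered `print()` breadcrumbs added below, this reads as an attempt to narrow down the Space's runtime error. A minimal sketch of the pattern, where the 120-second value is purely illustrative and not from the commit:

    import spaces  # Hugging Face ZeroGPU helper; only importable inside a Space

    @spaces.GPU(duration=120)  # seconds of GPU time reserved per call (40 in this commit)
    def infer(video_path: str, audio_path: str) -> str:
        # all CUDA work must happen inside the decorated call;
        # outside it, no GPU is attached to the process
        ...

If 40 seconds is not enough for a full diffusion run, the call is cut off mid-inference, which is one plausible source of a runtime error.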
@@ -106,20 +106,22 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     print(f"Input audio path: {audio_path}")
     print(f"Loaded checkpoint path: {inference_ckpt_path}")
 
-    is_shared_ui = True
+    is_shared_ui = True
     temp_dir = None
     if is_shared_ui:
         temp_dir = tempfile.mkdtemp()
+        print(1)
         cropped_video_path = process_video(video_path)
         print(f"Cropped video saved to: {cropped_video_path}")
         video_path=cropped_video_path
-
+
         trimmed_audio_path = process_audio(audio_path, temp_dir)
         print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
         audio_path=trimmed_audio_path
 
+    print(2)
     scheduler = DDIMScheduler.from_pretrained("configs")
-
+    print(3)
     if config.model.cross_attention_dim == 768:
         whisper_model_path = "checkpoints/whisper/small.pt"
     elif config.model.cross_attention_dim == 384:
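The numbered `print(1)`…`print(3)` calls, and `print(4)` through `print(8)` further down, are breadcrumbs: the last number that appears in the Space logs shows which stage the crash follows. A sketch of an alternative, not part of the commit, using the standard `logging` module so each breadcrumb also carries a timestamp:

    import logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
    log = logging.getLogger(__name__)

    log.info("1: temp dir created, video cropped")  # replaces print(1)
    log.info("2: audio trimmed")                    # replaces print(2)
    log.info("3: scheduler loaded")                 # replaces print(3)

Timestamps show not only where the run dies but also which stage is eating the GPU-time budget.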
@@ -128,8 +130,9 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         raise NotImplementedError("cross_attention_dim must be 768 or 384")
 
     audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
-
+    print(4)
     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+    print(5)
     vae.config.scaling_factor = 0.18215
     vae.config.shift_factor = 0
 
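Two details in this hunk: the Whisper checkpoint is selected by the UNet's `cross_attention_dim` (768 maps to `small.pt`; the body of the 384 branch is not shown in this diff), and `0.18215` is the standard Stable Diffusion VAE latent scaling factor. The same selection can be written as a lookup table; in this sketch the 384 entry is an assumption, since that branch is elided here:

    # hypothetical table form of the branch above; tiny.pt for 384 is assumed
    WHISPER_BY_DIM = {
        768: "checkpoints/whisper/small.pt",
        384: "checkpoints/whisper/tiny.pt",  # assumption, branch body not shown
    }

    dim = 768  # stands in for config.model.cross_attention_dim
    try:
        whisper_model_path = WHISPER_BY_DIM[dim]
    except KeyError:
        raise NotImplementedError("cross_attention_dim must be 768 or 384") from None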
@@ -138,7 +141,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         inference_ckpt_path,  # load checkpoint
         device="cpu",
     )
-
+    print(6)
     unet = unet.to(dtype=torch.float16)
 
     """
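The UNet is loaded on the CPU (`device="cpu"`) and only then cast to fp16, so the full fp32 checkpoint never needs to fit in GPU memory; the pipeline later moves the already-halved weights with `.to("cuda")`. A runnable sketch of just the memory effect, with a plain tensor standing in for the UNet weights:

    import torch

    weights = torch.randn(1024, 1024)               # stand-in for fp32 UNet weights on CPU
    weights_fp16 = weights.to(dtype=torch.float16)  # same shape, half the bytes

    assert weights.element_size() == 4       # fp32: 4 bytes per element
    assert weights_fp16.element_size() == 2  # fp16: 2 bytes per element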
@@ -154,7 +157,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         unet=unet,
         scheduler=scheduler,
     ).to("cuda")
-
+    print(7)
     seed = -1
     if seed != -1:
         set_seed(seed)
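With `seed = -1` hard-coded, the `if seed != -1` branch can never run, so `set_seed` is dead code and every run samples a fresh trajectory. A sketch of a reproducible-but-random alternative, not in the commit, reusing the `set_seed` already imported from `accelerate.utils`:

    import random

    from accelerate.utils import set_seed

    seed = random.randint(0, 2**31 - 1)  # draw a real seed instead of the sentinel -1
    set_seed(seed)                       # seeds Python, NumPy and PyTorch together
    print(f"seed: {seed}")               # record it so the exact run can be replayed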
@@ -165,7 +168,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
 
     unique_id = str(uuid.uuid4())
     video_out_path = f"video_out{unique_id}.mp4"
-
+
     pipeline(
         video_path=video_path,
         audio_path=audio_path,
@@ -178,7 +181,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         width=config.data.resolution,
         height=config.data.resolution,
     )
-
+    print(8)
     if is_shared_ui:
         # Clean up the temporary directory
         if os.path.exists(temp_dir):
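`print(8)` marks the last breadcrumb before cleanup. One caveat in the surrounding code: `tempfile.mkdtemp()` plus a manual `os.path.exists` check leaks the directory whenever `pipeline(...)` raises first. A sketch of the context-manager form, which removes the directory even on error (a stand-in file replaces the Space's `process_audio` output):

    import os
    import tempfile

    with tempfile.TemporaryDirectory() as temp_dir:
        scratch = os.path.join(temp_dir, "trimmed.wav")  # stand-in for process_audio output
        open(scratch, "wb").close()
        # ... run the pipeline on files under temp_dir ...
    # temp_dir and everything in it are removed here, exception or not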