Runtime error
Update app.py
app.py CHANGED
@@ -80,7 +80,7 @@ from accelerate.utils import set_seed
 from latentsync.whisper.audio2feature import Audio2Feature
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=40)
 def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     """
     Perform lip-sync video generation using an input video and a separate audio track.
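Here `spaces.GPU` is the Hugging Face ZeroGPU decorator: `duration` is the number of seconds of GPU time reserved for each call, and the commit sets it to 40. Together with the numbered `print()` breadcrumbs added below, this reads as an attempt to narrow down the Space's runtime error. A minimal sketch of the pattern, where the 120-second value is purely illustrative and not from the commit:

    import spaces  # Hugging Face ZeroGPU helper; only importable inside a Space

    @spaces.GPU(duration=120)  # seconds of GPU time reserved per call (40 in this commit)
    def infer(video_path: str, audio_path: str) -> str:
        # all CUDA work must happen inside the decorated call;
        # outside it, no GPU is attached to the process
        ...

If 40 seconds is not enough for a full diffusion run, the call is cut off mid-inference, which is one plausible source of a runtime error.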
@@ -106,20 +106,22 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     print(f"Input audio path: {audio_path}")
     print(f"Loaded checkpoint path: {inference_ckpt_path}")
 
-    is_shared_ui = True
+    is_shared_ui = True
     temp_dir = None
     if is_shared_ui:
         temp_dir = tempfile.mkdtemp()
+        print(1)
         cropped_video_path = process_video(video_path)
         print(f"Cropped video saved to: {cropped_video_path}")
         video_path=cropped_video_path
-
+
         trimmed_audio_path = process_audio(audio_path, temp_dir)
         print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
         audio_path=trimmed_audio_path
 
+    print(2)
     scheduler = DDIMScheduler.from_pretrained("configs")
-
+    print(3)
     if config.model.cross_attention_dim == 768:
         whisper_model_path = "checkpoints/whisper/small.pt"
     elif config.model.cross_attention_dim == 384:
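The numbered `print(1)`…`print(3)` calls, and `print(4)` through `print(8)` further down, are breadcrumbs: the last number that appears in the Space logs shows which stage the crash follows. A sketch of an alternative, not part of the commit, using the standard `logging` module so each breadcrumb also carries a timestamp:

    import logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
    log = logging.getLogger(__name__)

    log.info("1: temp dir created, video cropped")  # replaces print(1)
    log.info("2: audio trimmed")                    # replaces print(2)
    log.info("3: scheduler loaded")                 # replaces print(3)

Timestamps show not only where the run dies but also which stage is eating the GPU-time budget.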
@@ -128,8 +130,9 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         raise NotImplementedError("cross_attention_dim must be 768 or 384")
 
     audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
-
+    print(4)
     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+    print(5)
     vae.config.scaling_factor = 0.18215
     vae.config.shift_factor = 0
 
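Two details in this hunk: the Whisper checkpoint is selected by the UNet's `cross_attention_dim` (768 maps to `small.pt`; the body of the 384 branch is not shown in this diff), and `0.18215` is the standard Stable Diffusion VAE latent scaling factor. The same selection can be written as a lookup table; in this sketch the 384 entry is an assumption, since that branch is elided here:

    # hypothetical table form of the branch above; tiny.pt for 384 is assumed
    WHISPER_BY_DIM = {
        768: "checkpoints/whisper/small.pt",
        384: "checkpoints/whisper/tiny.pt",  # assumption, branch body not shown
    }

    dim = 768  # stands in for config.model.cross_attention_dim
    try:
        whisper_model_path = WHISPER_BY_DIM[dim]
    except KeyError:
        raise NotImplementedError("cross_attention_dim must be 768 or 384") from None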
@@ -138,7 +141,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         inference_ckpt_path,  # load checkpoint
         device="cpu",
     )
-
+    print(6)
     unet = unet.to(dtype=torch.float16)
 
     """
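The UNet is loaded on the CPU (`device="cpu"`) and only then cast to fp16, so the full fp32 checkpoint never needs to fit in GPU memory; the pipeline later moves the already-halved weights with `.to("cuda")`. A runnable sketch of just the memory effect, with a plain tensor standing in for the UNet weights:

    import torch

    weights = torch.randn(1024, 1024)               # stand-in for fp32 UNet weights on CPU
    weights_fp16 = weights.to(dtype=torch.float16)  # same shape, half the bytes

    assert weights.element_size() == 4       # fp32: 4 bytes per element
    assert weights_fp16.element_size() == 2  # fp16: 2 bytes per element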
@@ -154,7 +157,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         unet=unet,
         scheduler=scheduler,
     ).to("cuda")
-
+    print(7)
     seed = -1
     if seed != -1:
         set_seed(seed)
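With `seed = -1` hard-coded, the `if seed != -1` branch can never run, so `set_seed` is dead code and every run samples a fresh trajectory. A sketch of a reproducible-but-random alternative, not in the commit, reusing the `set_seed` already imported from `accelerate.utils`:

    import random

    from accelerate.utils import set_seed

    seed = random.randint(0, 2**31 - 1)  # draw a real seed instead of the sentinel -1
    set_seed(seed)                       # seeds Python, NumPy and PyTorch together
    print(f"seed: {seed}")               # record it so the exact run can be replayed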
@@ -165,7 +168,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
 
     unique_id = str(uuid.uuid4())
     video_out_path = f"video_out{unique_id}.mp4"
-
+
     pipeline(
         video_path=video_path,
         audio_path=audio_path,
@@ -178,7 +181,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         width=config.data.resolution,
         height=config.data.resolution,
     )
-
+    print(8)
     if is_shared_ui:
         # Clean up the temporary directory
         if os.path.exists(temp_dir):
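`print(8)` marks the last breadcrumb before cleanup. One caveat in the surrounding code: `tempfile.mkdtemp()` plus a manual `os.path.exists` check leaks the directory whenever `pipeline(...)` raises first. A sketch of the context-manager form, which removes the directory even on error (a stand-in file replaces the Space's `process_audio` output):

    import os
    import tempfile

    with tempfile.TemporaryDirectory() as temp_dir:
        scratch = os.path.join(temp_dir, "trimmed.wav")  # stand-in for process_audio output
        open(scratch, "wb").close()
        # ... run the pipeline on files under temp_dir ...
    # temp_dir and everything in it are removed here, exception or not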