Antoni Bigata committed · Commit 2dff4e4 · Parent(s): cf0da47 · "requirements"
app.py CHANGED
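This commit replaces the eager, import-time construction of the six models (vae_model, hubert_model, wavlm_model, landmarks_extractor, keyframe_model, interpolation_model) with module-level None placeholders that are lazily initialized on the first call to process_video; the embedding and landmark helpers now receive their model as an explicit argument.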
@@ -186,54 +186,17 @@ DEFAULT_AUDIO_PATH = os.path.join(
 # landmarks_extractor,
 # ) = load_all_models()
 
-
-
-vae_model = VaeWrapper("video")
-vae_model = vae_model.half()  # Convert to half precision
-try:
-    vae_model = torch.compile(vae_model)
-    print("Successfully compiled vae_model in FP16")
-except Exception as e:
-    print(f"Warning: Failed to compile vae_model: {e}")
-
-hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").cuda()
-hubert_model = hubert_model.half()  # Convert to half precision
-try:
-    hubert_model = torch.compile(hubert_model)
-    print("Successfully compiled hubert_model in FP16")
-except Exception as e:
-    print(f"Warning: Failed to compile hubert_model: {e}")
-
-wavlm_model = WavLM_wrapper(
-    model_size="Base+",
-    feed_as_frames=False,
-    merge_type="None",
-    model_path=os.path.join(repo_path, "checkpoints/WavLM-Base+.pt"),
-).cuda()
-
-wavlm_model = wavlm_model.half()  # Convert to half precision
-try:
-    wavlm_model = torch.compile(wavlm_model)
-    print("Successfully compiled wavlm_model in FP16")
-except Exception as e:
-    print(f"Warning: Failed to compile wavlm_model: {e}")
-
-landmarks_extractor = LandmarksExtractor()
-keyframe_model = load_model(
-    config="keyframe.yaml",
-    ckpt=os.path.join(repo_path, "checkpoints/keyframe_dub.pt"),
-)
-interpolation_model = load_model(
-    config="interpolation.yaml",
-    ckpt=os.path.join(repo_path, "checkpoints/interpolation_dub.pt"),
-)
-keyframe_model.en_and_decode_n_samples_a_time = 2
-interpolation_model.en_and_decode_n_samples_a_time = 2
+keyframe_model = None
+interpolation_model = None
+vae_model = None
+hubert_model = None
+wavlm_model = None
+landmarks_extractor = None
 
 
 @spaces.GPU(duration=60)
 @torch.no_grad()
-def compute_video_embedding(video_reader, min_len):
+def compute_video_embedding(video_reader, min_len, vae_model):
     """Compute embeddings from video"""
 
     total_frames = min_len

@@ -283,7 +246,7 @@ def compute_video_embedding(video_reader, min_len):
 
 @spaces.GPU(duration=120)
 @torch.no_grad()
-def compute_hubert_embedding(raw_audio):
+def compute_hubert_embedding(raw_audio, hubert_model):
     """Compute embeddings from audio"""
     print(f"Computing audio embedding from {raw_audio.shape}")
 

@@ -330,7 +293,7 @@ def compute_hubert_embedding(raw_audio):
 
 @spaces.GPU(duration=120)
 @torch.no_grad()
-def compute_wavlm_embedding(raw_audio):
+def compute_wavlm_embedding(raw_audio, wavlm_model):
     """Compute embeddings from audio"""
     audio = rearrange(raw_audio, "(f s) -> f s", s=640)
 

@@ -369,7 +332,7 @@ def compute_wavlm_embedding(raw_audio):
 
 
 @torch.no_grad()
-def extract_video_landmarks(video_frames):
+def extract_video_landmarks(video_frames, landmarks_extractor):
     """Extract landmarks from video frames"""
 
     # Create a progress bar for Gradio

@@ -666,6 +629,57 @@ def process_video(video_input, audio_input, max_num_seconds):
         duration=10,
     )
 
+    if vae_model is None:
+        vae_model = VaeWrapper("video")
+        vae_model = vae_model.half()  # Convert to half precision
+        try:
+            vae_model = torch.compile(vae_model)
+            print("Successfully compiled vae_model in FP16")
+        except Exception as e:
+            print(f"Warning: Failed to compile vae_model: {e}")
+
+    if hubert_model is None:
+        hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").cuda()
+        hubert_model = hubert_model.half()  # Convert to half precision
+        try:
+            hubert_model = torch.compile(hubert_model)
+            print("Successfully compiled hubert_model in FP16")
+        except Exception as e:
+            print(f"Warning: Failed to compile hubert_model: {e}")
+
+    if wavlm_model is None:
+        wavlm_model = WavLM_wrapper(
+            model_size="Base+",
+            feed_as_frames=False,
+            merge_type="None",
+            model_path=os.path.join(repo_path, "checkpoints/WavLM-Base+.pt"),
+        ).cuda()
+
+        wavlm_model = wavlm_model.half()  # Convert to half precision
+        try:
+            wavlm_model = torch.compile(wavlm_model)
+            print("Successfully compiled wavlm_model in FP16")
+        except Exception as e:
+            print(f"Warning: Failed to compile wavlm_model: {e}")
+
+    if landmarks_extractor is None:
+        landmarks_extractor = LandmarksExtractor()
+
+    if keyframe_model is None:
+        keyframe_model = load_model(
+            config="keyframe.yaml",
+            ckpt=os.path.join(repo_path, "checkpoints/keyframe_dub.pt"),
+        )
+
+    if interpolation_model is None:
+        interpolation_model = load_model(
+            config="interpolation.yaml",
+            ckpt=os.path.join(repo_path, "checkpoints/interpolation_dub.pt"),
+        )
+
+    keyframe_model.en_and_decode_n_samples_a_time = 2
+    interpolation_model.en_and_decode_n_samples_a_time = 2
+
     # Use default media if none provided
     if video_input is None:
         video_input = DEFAULT_VIDEO_PATH

@@ -749,9 +763,9 @@ def process_video(video_input, audio_input, max_num_seconds):
 
     # Compute video embeddings and landmarks - store full version in cache
     video_embedding, video_frames = compute_video_embedding(
-        video_reader, len(video_reader)
+        video_reader, len(video_reader), vae_model
     )
-    video_landmarks = extract_video_landmarks(video_frames)
+    video_landmarks = extract_video_landmarks(video_frames, landmarks_extractor)
 
     # Update video cache with full versions
     cache["video"]["path"] = video_path_hash

@@ -807,8 +821,8 @@ def process_video(video_input, audio_input, max_num_seconds):
     print("Computing audio embeddings")
 
     # Compute audio embeddings with the truncated audio
-    hubert_embedding = compute_hubert_embedding(raw_audio_reshape)
-    wavlm_embedding = compute_wavlm_embedding(raw_audio_reshape)
+    hubert_embedding = compute_hubert_embedding(raw_audio_reshape, hubert_model)
+    wavlm_embedding = compute_wavlm_embedding(raw_audio_reshape, wavlm_model)
 
     # Update audio cache with full embeddings
     # Note: raw_audio was already cached above
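The net effect of the diff is a lazy "load on first request" pattern: every model handle starts as None at module scope and is built inside process_video the first time it is needed, keeping CUDA work out of import time (on ZeroGPU Spaces, the GPU is only attached inside @spaces.GPU-decorated calls, which is likely the motivation here). Below is a minimal, self-contained sketch of that pattern, with stand-in nn.Linear modules in place of the real VaeWrapper/HuBERT/WavLM loaders; note that a function assigning to module-level names needs a global declaration for `if model is None:` to see the module variable, something the hunk context above does not show.

import torch
import torch.nn as nn

# Module-level handles start empty, as in the commit.
vae_model = None
hubert_model = None


def _to_fp16_compiled(model: nn.Module, name: str) -> nn.Module:
    """Convert to half precision and attempt torch.compile, keeping the
    uncompiled model on failure (same try/except pattern as app.py)."""
    model = model.half()
    try:
        model = torch.compile(model)
        print(f"Successfully compiled {name} in FP16")
    except Exception as e:
        print(f"Warning: Failed to compile {name}: {e}")
    return model


def process_request():
    """First call pays the loading cost; later calls reuse the models."""
    global vae_model, hubert_model  # assigning to module globals requires this
    if vae_model is None:
        # Stand-in module; the real code builds VaeWrapper("video") here.
        vae_model = _to_fp16_compiled(nn.Linear(8, 8), "vae_model")
    if hubert_model is None:
        # Stand-in module; the real code loads facebook/hubert-base-ls960.
        hubert_model = _to_fp16_compiled(nn.Linear(8, 8), "hubert_model")
    # ...embeddings would be computed with the lazily built models here...


process_request()  # builds and (if possible) compiles both models
process_request()  # reuses the cached models

The explicit-argument change complements this: each @spaces.GPU helper (compute_video_embedding, compute_hubert_embedding, compute_wavlm_embedding, extract_video_landmarks) now receives the model it needs as a parameter rather than reading module state.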