Update app.py
app.py
CHANGED
@@ -259,6 +259,12 @@ def initialize_models():
         clip_image_encoder=clip_image_encoder,
     )
     pipeline.to(device=device)
+
+    if torch.__version__ >= "2.0":
+        print("Compiling the pipeline with torch.compile()...")
+        pipeline.transformer = torch.compile(pipeline.transformer, mode="reduce-overhead", fullgraph=True)
+        print("Pipeline transformer compiled!")
+
     print("Pipeline created and moved to device")
 
     print("Loading Wav2Vec models...")
@@ -343,6 +349,15 @@ def generate_video(
     audio_features = extract_audio_features(audio_path, wav2vec_processor, wav2vec_model)
     audio_embeds = audio_features.unsqueeze(0).to(device=device, dtype=config.weight_dtype)
 
+    progress(0.25, desc="Encoding prompts...")
+    prompt_embeds, negative_prompt_embeds = pipeline.encode_prompt(
+        prompt,
+        device=device,
+        num_images_per_prompt=1,
+        do_classifier_free_guidance=(guidance_scale > 1.0),
+        negative_prompt=negative_prompt
+    )
+
     video_length = int(audio_clip.duration * fps)
     video_length = (
         int((video_length - 1) // pipeline.vae.config.temporal_compression_ratio * pipeline.vae.config.temporal_compression_ratio) + 1
@@ -405,7 +420,6 @@ def generate_video(
         audio_start_frame = init_frames * 2
         audio_end_frame = (init_frames + current_partial_length) * 2
 
-        # Ensure audio embeds are long enough
         if audio_embeds.shape[1] < audio_end_frame:
             repeat_times = (audio_end_frame // audio_embeds.shape[1]) + 1
             audio_embeds = audio_embeds.repeat(1, repeat_times, 1)
@@ -414,9 +428,9 @@ def generate_video(
 
         with torch.no_grad():
             sample = pipeline(
-
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
                 num_frames=current_partial_length,
-                negative_prompt=negative_prompt,
                 audio_embeds=partial_audio_embeds,
                 audio_scale=audio_scale,
                 ip_mask=ip_mask,
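
A side note on the new compilation gate in initialize_models(): `torch.__version__ >= "2.0"` compares version strings lexicographically. That happens to work for current 1.x/2.x releases (and for build suffixes like "2.1.0+cu118"), but it would misorder a hypothetical "10.0" release. A minimal sketch of a more defensive gate, assuming the packaging package is available and using a hypothetical maybe_compile helper, could look like this:

```python
# Hedged sketch only: maybe_compile is a hypothetical helper, not part of app.py.
import torch
from packaging import version  # assumed to be available in the environment

def maybe_compile(pipeline):
    # Parse the version instead of comparing raw strings, so "10.0" or
    # suffixed builds such as "2.1.0+cu118" are ordered correctly.
    if version.parse(torch.__version__).release >= (2, 0):
        pipeline.transformer = torch.compile(
            pipeline.transformer, mode="reduce-overhead", fullgraph=True
        )
    return pipeline
```

Also worth noting: with fullgraph=True, torch.compile raises on any graph break, and because compilation is lazy the error would surface at the first generation call rather than at startup; the default fullgraph=False is the softer choice if a silent fallback is preferred.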
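The generate_video() changes implement an encode-once pattern: the prompt and negative prompt are embedded a single time before the per-segment loop, and the cached prompt_embeds / negative_prompt_embeds are passed to each pipeline call instead of the raw strings. The sketch below isolates that pattern; run_segments, the segments container, and the reduced keyword set are illustrative stand-ins, and only the encode_prompt call and the two embedding kwargs are taken from the diff.

```python
# Illustrative sketch of the encode-once pattern; run_segments and `segments`
# are hypothetical, not code from app.py.
import torch

def run_segments(pipeline, prompt, negative_prompt, segments, device, guidance_scale):
    # Encode the text prompt a single time, outside the per-segment loop.
    prompt_embeds, negative_prompt_embeds = pipeline.encode_prompt(
        prompt,
        device=device,
        num_images_per_prompt=1,
        do_classifier_free_guidance=(guidance_scale > 1.0),
        negative_prompt=negative_prompt,
    )
    outputs = []
    for seg in segments:
        with torch.no_grad():
            # Reuse the cached embeddings so the text encoder is not re-run
            # for every segment of the video.
            outputs.append(
                pipeline(
                    prompt_embeds=prompt_embeds,
                    negative_prompt_embeds=negative_prompt_embeds,
                    num_frames=seg["num_frames"],
                    audio_embeds=seg["audio_embeds"],
                )
            )
    return outputs
```

Besides avoiding a repeated text-encoder forward pass per segment, this also moves the prompt/negative_prompt string handling out of the inner loop entirely.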
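For context on the unchanged video_length lines in the second hunk: the raw frame count is rounded down to the nearest value of the form k * temporal_compression_ratio + 1, presumably because the temporally compressed latent decodes back to exactly that many frames. A small worked example, with an assumed ratio of 4 and an assumed 3.2-second clip at 25 fps:

```python
# Worked example of the frame-count rounding in generate_video(); the ratio of 4
# and the clip length are assumed values, not read from the actual VAE config.
temporal_compression_ratio = 4
fps = 25
audio_duration = 3.2  # seconds (hypothetical)

video_length = int(audio_duration * fps)  # 80 raw frames
video_length = (video_length - 1) // temporal_compression_ratio * temporal_compression_ratio + 1
# (80 - 1) // 4 * 4 + 1 == 77, the largest count of the form 4*k + 1 not exceeding 80
```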