Running on Zero (ZeroGPU) #5
by hysts (HF Staff) - opened

hf_gradio_app.py  +36 -31

hf_gradio_app.py  CHANGED
@@ -1,5 +1,5 @@
 import os, random, time
-
+import spaces
 import uuid
 import tempfile, shutil
 from pydub import AudioSegment
@@ -22,22 +22,22 @@ for subfolder in subfolders:
 
 snapshot_download(
     repo_id = "memoavatar/memo",
-    local_dir = "./checkpoints"
+    local_dir = "./checkpoints"
 )
 
 snapshot_download(
     repo_id = "stabilityai/sd-vae-ft-mse",
-    local_dir = "./checkpoints/vae"
+    local_dir = "./checkpoints/vae"
 )
 
 snapshot_download(
     repo_id = "facebook/wav2vec2-base-960h",
-    local_dir = "./checkpoints/wav2vec2"
+    local_dir = "./checkpoints/wav2vec2"
 )
 
 snapshot_download(
     repo_id = "emotion2vec/emotion2vec_plus_large",
-    local_dir = "./checkpoints/emotion2vec_plus_large"
+    local_dir = "./checkpoints/emotion2vec_plus_large"
 )
 
 import torch
@@ -65,51 +65,53 @@ from memo.utils.vision_utils import preprocess_image, tensor_to_video
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 weight_dtype = torch.bfloat16
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-pipeline.to(device=device, dtype=weight_dtype)
+
+vae = AutoencoderKL.from_pretrained("./checkpoints/vae").to(device=device, dtype=weight_dtype)
+reference_net = UNet2DConditionModel.from_pretrained("./checkpoints", subfolder="reference_net", use_safetensors=True)
+diffusion_net = UNet3DConditionModel.from_pretrained("./checkpoints", subfolder="diffusion_net", use_safetensors=True)
+image_proj = ImageProjModel.from_pretrained("./checkpoints", subfolder="image_proj", use_safetensors=True)
+audio_proj = AudioProjModel.from_pretrained("./checkpoints", subfolder="audio_proj", use_safetensors=True)
+vae.requires_grad_(False).eval()
+reference_net.requires_grad_(False).eval()
+diffusion_net.requires_grad_(False).eval()
+image_proj.requires_grad_(False).eval()
+audio_proj.requires_grad_(False).eval()
+noise_scheduler = FlowMatchEulerDiscreteScheduler()
+pipeline = VideoPipeline(vae=vae, reference_net=reference_net, diffusion_net=diffusion_net, scheduler=noise_scheduler, image_proj=image_proj)
+pipeline.to(device=device, dtype=weight_dtype)
+
 
 def process_audio(file_path, temp_dir):
     # Load the audio file
     audio = AudioSegment.from_file(file_path)
-
+
     # Check and cut the audio if longer than 4 seconds
     max_duration = 4 * 1000  # 4 seconds in milliseconds
     if len(audio) > max_duration:
         audio = audio[:max_duration]
-
+
     # Save the processed audio in the temporary directory
     output_path = os.path.join(temp_dir, "trimmed_audio.wav")
     audio.export(output_path, format="wav")
-
+
     # Return the path to the trimmed file
     print(f"Processed audio saved at: {output_path}")
     return output_path
 
-
+
+@spaces.GPU(duration=240)
 @torch.inference_mode()
 def generate(input_video, input_audio, seed, progress=gr.Progress(track_tqdm=True)):
-
+    pipeline.reference_net.enable_xformers_memory_efficient_attention()
+    pipeline.diffusion_net.enable_xformers_memory_efficient_attention()
+
     is_shared_ui = True if "fffiloni/MEMO" in os.environ['SPACE_ID'] else False
     temp_dir = None
     if is_shared_ui:
         temp_dir = tempfile.mkdtemp()
         input_audio = process_audio(input_audio, temp_dir)
         print(f"Processed file was stored temporarily at: {input_audio}")
-
+
     resolution = 512
     num_generated_frames_per_clip = 16
     fps = 30
@@ -125,7 +127,7 @@ def generate(input_video, input_audio, seed, progress=gr.Progress(track_tqdm=True)):
     generator = torch.manual_seed(seed)
     img_size = (resolution, resolution)
     pixel_values, face_emb = preprocess_image(face_analysis_model="./checkpoints/misc/face_analysis", image_path=input_video, image_size=resolution)
-
+
     output_dir = "./outputs"
     os.makedirs(output_dir, exist_ok=True)
     cache_dir = os.path.join(output_dir, "audio_preprocess")
@@ -190,6 +192,9 @@ def generate(input_video, input_audio, seed, progress=gr.Progress(track_tqdm=True)):
         )
         video_frames.append(pipeline_output.videos)
 
+    pipeline.reference_net.disable_xformers_memory_efficient_attention()
+    pipeline.diffusion_net.disable_xformers_memory_efficient_attention()
+
     video_frames = torch.cat(video_frames, dim=2)
     video_frames = video_frames.squeeze(0)
     video_frames = video_frames[:, :audio_length]
@@ -210,7 +215,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
     <div style="display:flex;column-gap:4px;">
         <a href="https://github.com/memoavatar/memo">
             <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-        </a>
+        </a>
         <a href="https://memoavatar.github.io/">
             <img src='https://img.shields.io/badge/Project-Page-green'>
         </a>
@@ -225,7 +230,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         </a>
     </div>
     """)
-
+
     with gr.Row():
         with gr.Column():
            input_video = gr.Image(label="Upload Input Image", type="filepath")
@@ -241,4 +246,4 @@ with gr.Blocks(analytics_enabled=False) as demo:
        outputs=[video_output],
     )
 
-demo.queue().launch(share=False, show_api=False, show_error=True)
+demo.queue().launch(share=False, show_api=False, show_error=True)
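
For reference, the ZeroGPU pattern this diff applies is: keep checkpoint downloads and model construction at module level so they run once when the Space boots, and decorate only the GPU-bound function with `spaces.GPU`, which attaches a GPU just for the duration of each call (the diff requests up to 240 seconds per call). That is presumably also why the xformers attention toggles moved inside `generate` instead of running at import time. Below is a minimal sketch of the same pattern on a ZeroGPU Space; the tiny `Linear` model and the `run` function are placeholders, not part of the MEMO app.

import gradio as gr
import spaces
import torch

# Module-level setup runs on CPU when the Space starts; heavy models
# would be constructed here, as the MEMO pipeline is in the diff above.
model = torch.nn.Linear(8, 8).eval()

@spaces.GPU(duration=120)  # GPU is held only while this call runs; duration is the per-call time limit in seconds
@torch.inference_mode()
def run(prompt):
    # Inside the decorated call a CUDA device is attached, so move the model and run inference here.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    x = torch.randn(1, 8, device=device)
    return str(model(x).cpu().tolist())

demo = gr.Interface(fn=run, inputs="text", outputs="text")
demo.queue().launch(show_api=False, show_error=True)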