svfr (#1)
- (a4ce973fcbf35fcb2439184e4113aa6f190aa210)
- update (081a49f6f59d76700471269a023d40dd3658aa7b)
- update (f4e85a8429c3aed1881b7fd7cb6dff187c470d29)
Co-authored-by: minixiami <[email protected]>
- ORIGINAL_README.md +6 -0
- README.md +3 -1
- app.py +385 -40
- infer.py +29 -5
- src/dataset/dataset.py +68 -0
ORIGINAL_README.md
CHANGED
@@ -108,6 +108,8 @@ https://github.com/user-attachments/assets/efdac23c-0ba5-4dad-ab8c-48904af5dd89
 
 ## 🚀 Getting Started
 
+> Note: It is recommended to use a GPU with 16GB or more VRAM.
+
 ## Setup
 
 Use the following command to install a conda environment for SVFR from scratch:
@@ -200,6 +202,10 @@ The code of SVFR is released under the MIT License. There is no limitation for b
 
 **The pretrained models we provided with this library are available for non-commercial research purposes only, including both auto-downloading models and manual-downloading models.**
 
+## Acknowledgments
+
+This work is built on the architecture of [Sonic](https://github.com/jixiaozhong/Sonic).
+
 
 ## BibTex
 ```
README.md
CHANGED
@@ -10,4 +10,6 @@ pinned: false
 short_description: Unified Framework for Generalized Video Face Restoration
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
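For orientation: `pinned` and `short_description` are fields of the Space's YAML front matter, documented at the configuration reference linked above. A representative block looks like the following; all values other than `pinned` and `short_description` are illustrative, not taken from this Space:

```yaml
---
title: SVFR
emoji: 🎥
colorFrom: blue
colorTo: purple
sdk: gradio
app_file: app.py
pinned: false
short_description: Unified Framework for Generalized Video Face Restoration
---
```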
app.py
CHANGED
@@ -1,3 +1,15 @@
+"""
+This script is based on the original project by https://huggingface.co/fffiloni.
+URL: https://huggingface.co/spaces/fffiloni/SVFR-demo/blob/main/app.py
+
+Modifications made:
+- Synced the infer code updates from GitHub repo.
+- Added an inpainting option to enhance functionality.
+
+Author of modifications: https://github.com/wangzhiyaoo
+Date: 2025/01/15
+"""
+
 import torch
 import sys
 import os
@@ -8,6 +20,45 @@ import uuid
 import gradio as gr
 from glob import glob
 from huggingface_hub import snapshot_download
+import random
+
+import argparse
+import warnings
+import os
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from PIL import Image
+import random
+
+from omegaconf import OmegaConf
+from diffusers import AutoencoderKLTemporalDecoder
+from diffusers.schedulers import EulerDiscreteScheduler
+from transformers import CLIPVisionModelWithProjection
+import torchvision.transforms as transforms
+import torch.nn.functional as F
+from src.models.svfr_adapter.unet_3d_svd_condition_ip import UNet3DConditionSVDModel
+
+# pipeline
+from src.pipelines.pipeline import LQ2VideoLongSVDPipeline
+
+from src.utils.util import (
+    save_videos_grid,
+    seed_everything,
+)
+from torchvision.utils import save_image
+
+from src.models.id_proj import IDProjConvModel
+from src.models import model_insightface_360k
+
+from src.dataset.face_align.align import AlignImage
+
+warnings.filterwarnings("ignore")
+
+import decord
+import cv2
+from src.dataset.dataset import get_affine_transform, mean_face_lm5p_256, get_union_bbox, process_bbox, crop_resize_img
+
 
 # Download models
 os.makedirs("models", exist_ok=True)
@@ -30,44 +81,326 @@ snapshot_download(
     local_dir = "./models/stable-video-diffusion-img2vid-xt"
 )
 
-
+BASE_DIR = '.'
+
+config = OmegaConf.load("./config/infer.yaml")
+vae = AutoencoderKLTemporalDecoder.from_pretrained(
+    f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
+    subfolder="vae",
+    variant="fp16")
+
+val_noise_scheduler = EulerDiscreteScheduler.from_pretrained(
+    f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
+    subfolder="scheduler")
+
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+    f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
+    subfolder="image_encoder",
+    variant="fp16")
+unet = UNet3DConditionSVDModel.from_pretrained(
+    f"{BASE_DIR}/{config.pretrained_model_name_or_path}",
+    subfolder="unet",
+    variant="fp16")
+
+weight_dir = 'models/face_align'
+det_path = os.path.join(BASE_DIR, weight_dir, 'yoloface_v5m.pt')
+align_instance = AlignImage("cuda", det_path=det_path)
+
+to_tensor = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+])
+
+import torch.nn as nn
+class InflatedConv3d(nn.Conv2d):
+    def forward(self, x):
+        x = super().forward(x)
+        return x
+# Add ref channel
+old_weights = unet.conv_in.weight
+old_bias = unet.conv_in.bias
+new_conv1 = InflatedConv3d(
+    12,
+    old_weights.shape[0],
+    kernel_size=unet.conv_in.kernel_size,
+    stride=unet.conv_in.stride,
+    padding=unet.conv_in.padding,
+    bias=True if old_bias is not None else False,
+)
+param = torch.zeros((320, 4, 3, 3), requires_grad=True)
+new_conv1.weight = torch.nn.Parameter(torch.cat((old_weights, param), dim=1))
+if old_bias is not None:
+    new_conv1.bias = old_bias
+unet.conv_in = new_conv1
+unet.config["in_channels"] = 12
+unet.config.in_channels = 12
+
+
+id_linear = IDProjConvModel(in_channels=512, out_channels=1024).to(device='cuda')
+
+# load pretrained weights
+unet_checkpoint_path = os.path.join(BASE_DIR, config.unet_checkpoint_path)
+unet.load_state_dict(
+    torch.load(unet_checkpoint_path, map_location="cpu"),
+    strict=True,
+)
+
+id_linear_checkpoint_path = os.path.join(BASE_DIR, config.id_linear_checkpoint_path)
+id_linear.load_state_dict(
+    torch.load(id_linear_checkpoint_path, map_location="cpu"),
+    strict=True,
+)
+
+net_arcface = model_insightface_360k.getarcface(f'{BASE_DIR}/{config.net_arcface_checkpoint_path}').eval().to(device="cuda")
+
+if config.weight_dtype == "fp16":
+    weight_dtype = torch.float16
+elif config.weight_dtype == "fp32":
+    weight_dtype = torch.float32
+elif config.weight_dtype == "bf16":
+    weight_dtype = torch.bfloat16
+else:
+    raise ValueError(
+        f"Do not support weight dtype: {config.weight_dtype} during training"
+    )
+
+image_encoder.to(weight_dtype)
+vae.to(weight_dtype)
+unet.to(weight_dtype)
+id_linear.to(weight_dtype)
+net_arcface.requires_grad_(False).to(weight_dtype)
+
+pipe = LQ2VideoLongSVDPipeline(
+    unet=unet,
+    image_encoder=image_encoder,
+    vae=vae,
+    scheduler=val_noise_scheduler,
+    feature_extractor=None
+
+)
+pipe = pipe.to("cuda", dtype=unet.dtype)
+
+def gen(args,pipe):
+    save_dir = f"{BASE_DIR}/{args.output_dir}"
+    os.makedirs(save_dir,exist_ok=True)
+
+    seed_input = args.seed
+    seed_everything(seed_input)
+
+    video_path = args.input_path
+    task_ids = args.task_ids
+
+    if 2 in task_ids and args.mask_path is not None:
+        mask_path = args.mask_path
+        mask = Image.open(mask_path).convert("L")
+        mask_array = np.array(mask)
+
+        white_positions = mask_array == 255
+
+    print('task_ids:',task_ids)
+    task_prompt = [0,0,0]
+    for i in range(3):
+        if i in task_ids:
+            task_prompt[i] = 1
+    print("task_prompt:",task_prompt)
+
+    video_name = video_path.split('/')[-1]
+    # print(video_name)
+
+    if os.path.exists(os.path.join(save_dir, "result_frames", video_name[:-4])):
+        print(os.path.join(save_dir, "result_frames", video_name[:-4]))
+        # continue
+
+    cap = decord.VideoReader(video_path, fault_tol=1)
+    total_frames = len(cap)
+    T = total_frames #
+    print("total_frames:",total_frames)
+    step=1
+    drive_idx_start = 0
+    drive_idx_list = list(range(drive_idx_start, drive_idx_start + T * step, step))
+    assert len(drive_idx_list) == T
+
+    # Crop faces from the video for further processing
+    bbox_list = []
+    frame_interval = 5
+    for frame_count, drive_idx in enumerate(drive_idx_list):
+        if frame_count % frame_interval != 0:
+            continue
+        frame = cap[drive_idx].asnumpy()
+        _, _, bboxes_list = align_instance(frame[:,:,[2,1,0]], maxface=True)
+        if bboxes_list==[]:
+            continue
+        x1, y1, ww, hh = bboxes_list[0]
+        x2, y2 = x1 + ww, y1 + hh
+        bbox = [x1, y1, x2, y2]
+        bbox_list.append(bbox)
+    bbox = get_union_bbox(bbox_list)
+    bbox_s = process_bbox(bbox, expand_radio=0.4, height=frame.shape[0], width=frame.shape[1])
+
+    imSameIDs = []
+    vid_gt = []
+    for i, drive_idx in enumerate(drive_idx_list):
+        frame = cap[drive_idx].asnumpy()
+        imSameID = Image.fromarray(frame)
+        imSameID = crop_resize_img(imSameID, bbox_s)
+        imSameID = imSameID.resize((512,512))
+        if 1 in task_ids:
+            imSameID = imSameID.convert("L") # Convert to grayscale
+            imSameID = imSameID.convert("RGB")
+        image_array = np.array(imSameID)
+        if 2 in task_ids and args.mask_path is not None:
+            image_array[white_positions] = [255, 255, 255] # mask for inpainting task
+        vid_gt.append(np.float32(image_array/255.))
+        imSameIDs.append(imSameID)
+
+    vid_lq = [(torch.from_numpy(frame).permute(2,0,1) - 0.5) / 0.5 for frame in vid_gt]
+
+    val_data = dict(
+        pixel_values_vid_lq = torch.stack(vid_lq,dim=0),
+        # pixel_values_ref_img=self.to_tensor(target_image),
+        # pixel_values_ref_concat_img=self.to_tensor(imSrc2),
+        task_ids=task_ids,
+        task_id_input=torch.tensor(task_prompt),
+        total_frames=total_frames,
+    )
+
+    window_overlap=0
+    inter_frame_list = get_overlap_slide_window_indices(val_data["total_frames"],config.data.n_sample_frames,window_overlap)
+
+    lq_frames = val_data["pixel_values_vid_lq"]
+    task_ids = val_data["task_ids"]
+    task_id_input = val_data["task_id_input"]
+    height, width = val_data["pixel_values_vid_lq"].shape[-2:]
+
+    print("Generating the first clip...")
+    output = pipe(
+        lq_frames[inter_frame_list[0]].to("cuda").to(weight_dtype), # lq
+        None, # ref concat
+        torch.zeros((1, len(inter_frame_list[0]), 49, 1024)).to("cuda").to(weight_dtype),# encoder_hidden_states
+        task_id_input.to("cuda").to(weight_dtype),
+        height=height,
+        width=width,
+        num_frames=len(inter_frame_list[0]),
+        decode_chunk_size=config.decode_chunk_size,
+        noise_aug_strength=config.noise_aug_strength,
+        min_guidance_scale=config.min_appearance_guidance_scale,
+        max_guidance_scale=config.max_appearance_guidance_scale,
+        overlap=config.overlap,
+        frames_per_batch=len(inter_frame_list[0]),
+        num_inference_steps=50,
+        i2i_noise_strength=config.i2i_noise_strength,
+    )
+    video = output.frames
+
+    ref_img_tensor = video[0][:,-1]
+    ref_img = (video[0][:,-1] *0.5+0.5).clamp(0,1) * 255.
+    ref_img = ref_img.permute(1,2,0).cpu().numpy().astype(np.uint8)
+
+    pts5 = align_instance(ref_img[:,:,[2,1,0]], maxface=True)[0][0]
+
+    warp_mat = get_affine_transform(pts5, mean_face_lm5p_256 * height/256)
+    ref_img = cv2.warpAffine(np.array(Image.fromarray(ref_img)), warp_mat, (height, width), flags=cv2.INTER_CUBIC)
+    ref_img = to_tensor(ref_img).to("cuda").to(weight_dtype)
+
+    save_image(ref_img*0.5 + 0.5,f"{save_dir}/ref_img_align.png")
+
+    ref_img = F.interpolate(ref_img.unsqueeze(0)[:, :, 0:224, 16:240], size=[112, 112], mode='bilinear')
+    _, id_feature_conv = net_arcface(ref_img)
+    id_embedding = id_linear(id_feature_conv)
+
+    print('Generating all video clips...')
+    video = pipe(
+        lq_frames.to("cuda").to(weight_dtype), # lq
+        ref_img_tensor.to("cuda").to(weight_dtype),
+        id_embedding.unsqueeze(1).repeat(1, len(lq_frames), 1, 1).to("cuda").to(weight_dtype), # encoder_hidden_states
+        task_id_input.to("cuda").to(weight_dtype),
+        height=height,
+        width=width,
+        num_frames=val_data["total_frames"],#frame_num,
+        decode_chunk_size=config.decode_chunk_size,
+        noise_aug_strength=config.noise_aug_strength,
+        min_guidance_scale=config.min_appearance_guidance_scale,
+        max_guidance_scale=config.max_appearance_guidance_scale,
+        overlap=config.overlap,
+        frames_per_batch=config.data.n_sample_frames,
+        num_inference_steps=config.num_inference_steps,
+        i2i_noise_strength=config.i2i_noise_strength,
+    ).frames
+
+
+    video = (video*0.5 + 0.5).clamp(0, 1)
+    video = torch.cat([video.to(device="cuda")], dim=0).cpu()
+    save_videos_grid(video, f"{save_dir}/{video_name[:-4]}_{seed_input}_gen.mp4", n_rows=1, fps=25)
+
+    lq_frames = lq_frames.permute(1,0,2,3).unsqueeze(0)
+    lq_frames = (lq_frames * 0.5 + 0.5).clamp(0, 1).to(device="cuda").cpu()
+    save_videos_grid(lq_frames, f"{save_dir}/{video_name[:-4]}_{seed_input}_ori.mp4", n_rows=1, fps=25)
+
+    if args.restore_frames:
+        video = video.squeeze(0)
+        os.makedirs(os.path.join(save_dir, "result_frames", f"{video_name[:-4]}_{seed_input}"),exist_ok=True)
+        print(os.path.join(save_dir, "result_frames", video_name[:-4]))
+        for i in range(video.shape[1]):
+            save_frames_path = os.path.join(f"{save_dir}/result_frames", f"{video_name[:-4]}_{seed_input}", f'{i:08d}.png')
+            save_image(video[:,i], save_frames_path)
+
+
+def get_overlap_slide_window_indices(video_length, window_size, window_overlap):
+    inter_frame_list = []
+    for j in range(0, video_length, window_size-window_overlap):
+        inter_frame_list.append( [e % video_length for e in range(j, min(j + window_size, video_length))] )
+
+    return inter_frame_list
+
+
+
+def random_seed():
+    return random.randint(0, 10000)
+
+def infer(lq_sequence, task_name, mask, seed):
 
     unique_id = str(uuid.uuid4())
     output_dir = f"results_{unique_id}"
 
--
--
--
--
--
--
+    task_mapping = {
+        "BFR": 0,
+        "Colorization": 1,
+        "Inpainting": 2
+    }
+
+    task_ids = [task_mapping[task] for task in task_name if task in task_mapping]
+    # task_id = ",".join(task_ids)
 
     try:
--
--
--
--
--
--
--
--
--
--
-        )
+        parser = argparse.ArgumentParser()
+        args = parser.parse_args()
+        args.task_ids = task_ids
+        args.input_path = f"{lq_sequence}"
+        args.output_dir = f"{output_dir}"
+        args.mask_path = f"{mask}"
+        args.seed = int(seed)
+        args.restore_frames = False
+
+        gen(args,pipe)
 
         # Search for the mp4 file in a subfolder of output_dir
-        output_video = glob(os.path.join(output_dir,"
--
+        output_video = glob(os.path.join(output_dir,"*gen.mp4"))
+        face_region_video = glob(os.path.join(output_dir,"*ori.mp4"))
+        # print(face_region_video,output_video)
 
         if output_video:
             output_video_path = output_video[0] # Get the first match
+            face_region_video_path = face_region_video[0] # Get the first match
         else:
             output_video_path = None
+            face_region_video = None
 
-        print(output_video_path)
--
+        print(output_video_path,face_region_video_path)
+        torch.cuda.empty_cache()
+        return face_region_video_path,output_video_path
 
     except subprocess.CalledProcessError as e:
+        torch.cuda.empty_cache()
         raise gr.Error(f"Error during inference: {str(e)}")
 
 css="""
@@ -91,38 +424,50 @@ with gr.Blocks(css=css) as demo:
             <a href="https://arxiv.org/pdf/2501.01235">
                 <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
             </a>
-            <a href="https://huggingface.co/spaces/fffiloni/SVFR-demo?duplicate=true">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
-            </a>
-            <a href="https://huggingface.co/fffiloni">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
-            </a>
         </div>
     """)
    with gr.Row():
         with gr.Column():
             input_seq = gr.Video(label="Video LQ")
-            task_name = gr.
+            task_name = gr.CheckboxGroup(
                 label="Task",
-                choices=["BFR", "
-                value="BFR"
+                choices=["BFR", "Colorization", "Inpainting"],
+                value=["BFR"] # default
             )
-
+            mask_input = gr.Image(type="filepath",label="Inpainting Mask")
+            with gr.Row():
+                seed_input = gr.Number(label="Seed", value=77, precision=0)
+                random_seed_btn = gr.Button("🎲",scale=1,elem_id="dice-btn")
+            submit_btn = gr.Button("Submit", variant="primary")
+            clear_btn = gr.Button("Clear")
         with gr.Column():
+            output_face = gr.Video(label="Face Region Input")
             output_res = gr.Video(label="Restored")
     gr.Examples(
         examples = [
-            ["./assert/lq/lq1.mp4", "BFR"],
-            ["./assert/lq/lq2.mp4", "BFR
-            ["./assert/lq/lq3.mp4", "
+            ["./assert/lq/lq1.mp4", ["BFR"],None],
+            ["./assert/lq/lq2.mp4", ["BFR", "Colorization"],None],
+            ["./assert/lq/lq3.mp4", ["BFR", "Colorization", "Inpainting"],"./assert/mask/lq3.png"]
        ],
-        inputs = [input_seq, task_name]
+        inputs = [input_seq, task_name, mask_input]
     )
-
+
+    random_seed_btn.click(
+        fn=random_seed,
+        inputs=[],
+        outputs=seed_input
+    )
+
+
    submit_btn.click(
         fn = infer,
-        inputs = [input_seq, task_name],
-        outputs = [output_res]
+        inputs = [input_seq, task_name, mask_input,seed_input],
+        outputs = [output_face,output_res]
+    )
+    clear_btn.click(
+        fn=lambda: [None,["BFR"],None,77,None,None],
+        inputs=None,
+        outputs=[input_seq, task_name, mask_input, seed_input, output_face, output_res]
     )
 
-demo.queue().launch(show_api=False, show_error=True)
+demo.queue().launch(show_api=False, show_error=True, server_port=1203)
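Two of the additions above are worth unpacking. First, the `conv_in` surgery: the SVD UNet's input convolution is widened from 8 to 12 channels by concatenating a zero tensor to the pretrained weights, so the extra conditioning channels contribute nothing at initialization and the pretrained mapping is preserved (`InflatedConv3d` here forwards straight to `nn.Conv2d`). A minimal sketch of the same trick, with a plain `nn.Conv2d` standing in for `unet.conv_in` (channel counts mirror the diff; this is not the project's code verbatim):

```python
import torch
import torch.nn as nn

# Widen a pretrained 8-channel input conv to 12 channels by appending
# zero-initialized weights, as the diff does for unet.conv_in.
old_conv = nn.Conv2d(8, 320, kernel_size=3, padding=1)
extra = torch.zeros((320, 4, 3, 3))  # zeros: the new channels are a no-op at init
new_conv = nn.Conv2d(12, 320, kernel_size=3, padding=1)
new_conv.weight = nn.Parameter(torch.cat((old_conv.weight, extra), dim=1))
new_conv.bias = old_conv.bias

x = torch.randn(1, 12, 64, 64)
# Zero weights on channels 8..11 mean the widened conv matches the original.
assert torch.allclose(new_conv(x), old_conv(x[:, :8]), atol=1e-5)
```

Second, the sliding-window helper: windows advance by `window_size - window_overlap` frames, and because the upper bound is already clamped by `min(j + window_size, video_length)`, the `% video_length` wrap never fires; the final window simply comes out shorter. A standalone copy with illustrative values:

```python
def get_overlap_slide_window_indices(video_length, window_size, window_overlap):
    # Same logic as the helper added above.
    inter_frame_list = []
    for j in range(0, video_length, window_size - window_overlap):
        inter_frame_list.append(
            [e % video_length for e in range(j, min(j + window_size, video_length))]
        )
    return inter_frame_list

print(get_overlap_slide_window_indices(10, 4, 0))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
print(get_overlap_slide_window_indices(10, 4, 1))
# [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9], [9]]
```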
infer.py
CHANGED
@@ -33,10 +33,11 @@ warnings.filterwarnings("ignore")
 
 import decord
 import cv2
-from src.dataset.dataset import get_affine_transform, mean_face_lm5p_256
+from src.dataset.dataset import get_affine_transform, mean_face_lm5p_256, get_union_bbox, process_bbox, crop_resize_img
 
 BASE_DIR = '.'
 
+
 def main(config,args):
     if 'CUDA_VISIBLE_DEVICES' in os.environ:
         cuda_visible_devices = os.environ['CUDA_VISIBLE_DEVICES']
@@ -179,13 +180,33 @@ def main(config,args):
     drive_idx_list = list(range(drive_idx_start, drive_idx_start + T * step, step))
     assert len(drive_idx_list) == T
 
+    # Crop faces from the video for further processing
+    bbox_list = []
+    frame_interval = 5
+    for frame_count, drive_idx in enumerate(drive_idx_list):
+        if frame_count % frame_interval != 0:
+            continue
+        frame = cap[drive_idx].asnumpy()
+        _, _, bboxes_list = align_instance(frame[:,:,[2,1,0]], maxface=True)
+        if bboxes_list==[]:
+            continue
+        x1, y1, ww, hh = bboxes_list[0]
+        x2, y2 = x1 + ww, y1 + hh
+        bbox = [x1, y1, x2, y2]
+        bbox_list.append(bbox)
+    bbox = get_union_bbox(bbox_list)
+    bbox_s = process_bbox(bbox, expand_radio=0.4, height=frame.shape[0], width=frame.shape[1])
+
     imSameIDs = []
     vid_gt = []
     for i, drive_idx in enumerate(drive_idx_list):
         frame = cap[drive_idx].asnumpy()
         imSameID = Image.fromarray(frame)
-
+        imSameID = crop_resize_img(imSameID, bbox_s)
         imSameID = imSameID.resize((512,512))
+        if 1 in task_ids:
+            imSameID = imSameID.convert("L") # Convert to grayscale
+            imSameID = imSameID.convert("RGB")
         image_array = np.array(imSameID)
         if 2 in task_ids and args.mask_path is not None:
             image_array[white_positions] = [255, 255, 255] # mask for inpainting task
@@ -241,7 +262,7 @@ def main(config,args):
     ref_img = cv2.warpAffine(np.array(Image.fromarray(ref_img)), warp_mat, (height, width), flags=cv2.INTER_CUBIC)
     ref_img = to_tensor(ref_img).to("cuda").to(weight_dtype)
 
-    save_image(ref_img*0.5 + 0.5,f"{save_dir}/ref_img_align.png")
+    # save_image(ref_img*0.5 + 0.5,f"{save_dir}/ref_img_align.png")
 
     ref_img = F.interpolate(ref_img.unsqueeze(0)[:, :, 0:224, 16:240], size=[112, 112], mode='bilinear')
     _, id_feature_conv = net_arcface(ref_img)
@@ -269,8 +290,11 @@ def main(config,args):
 
     video = (video*0.5 + 0.5).clamp(0, 1)
     video = torch.cat([video.to(device="cuda")], dim=0).cpu()
--
--
+    save_videos_grid(video, f"{save_dir}/{video_name[:-4]}_{seed_input}_gen.mp4", n_rows=1, fps=25)
+
+    lq_frames = lq_frames.permute(1,0,2,3).unsqueeze(0)
+    lq_frames = (lq_frames * 0.5 + 0.5).clamp(0, 1).to(device="cuda").cpu()
+    save_videos_grid(lq_frames, f"{save_dir}/{video_name[:-4]}_{seed_input}_ori.mp4", n_rows=1, fps=25)
 
     if args.restore_frames:
         video = video.squeeze(0)
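One detail of the colorization path added above: `convert("L")` followed by `convert("RGB")` degrades the input to grayscale while keeping the three-channel shape the rest of the pipeline expects. A quick self-contained check of that PIL behavior (the dummy image is illustrative):

```python
import numpy as np
from PIL import Image

img = Image.new("RGB", (4, 4), (255, 0, 0))   # dummy pure-red frame
gray3 = img.convert("L").convert("RGB")       # luminance replicated across R, G, B
arr = np.array(gray3)

# All three channels now hold the same value (ITU-R 601 luma of pure red: ~76).
assert (arr[..., 0] == arr[..., 1]).all() and (arr[..., 1] == arr[..., 2]).all()
print(arr[0, 0])  # [76 76 76]
```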
src/dataset/dataset.py
CHANGED
@@ -48,3 +48,71 @@ def get_affine_transform(target_face_lm5p, mean_lm5p):
     mat_warp[1][2] = mat23[3]
 
     return mat_warp
+
+def get_union_bbox(bboxes):
+    bboxes = np.array(bboxes)
+    min_x = np.min(bboxes[:, 0])
+    min_y = np.min(bboxes[:, 1])
+    max_x = np.max(bboxes[:, 2])
+    max_y = np.max(bboxes[:, 3])
+    return np.array([min_x, min_y, max_x, max_y])
+
+
+def process_bbox(bbox, expand_radio, height, width):
+
+    def expand(bbox, ratio, height, width):
+
+        bbox_h = bbox[3] - bbox[1]
+        bbox_w = bbox[2] - bbox[0]
+
+        expand_x1 = max(bbox[0] - ratio * bbox_w, 0)
+        expand_y1 = max(bbox[1] - ratio * bbox_h, 0)
+        expand_x2 = min(bbox[2] + ratio * bbox_w, width)
+        expand_y2 = min(bbox[3] + ratio * bbox_h, height)
+
+        return [expand_x1,expand_y1,expand_x2,expand_y2]
+
+    def to_square(bbox_src, bbox_expend, height, width):
+
+        h = bbox_expend[3] - bbox_expend[1]
+        w = bbox_expend[2] - bbox_expend[0]
+        c_h = (bbox_expend[1] + bbox_expend[3]) / 2
+        c_w = (bbox_expend[0] + bbox_expend[2]) / 2
+
+        c = min(h, w) / 2
+
+        c_src_h = (bbox_src[1] + bbox_src[3]) / 2
+        c_src_w = (bbox_src[0] + bbox_src[2]) / 2
+
+        s_h, s_w = 0, 0
+        if w < h:
+            d = abs((h - w) / 2)
+            s_h = min(d, abs(c_src_h-c_h))
+            s_h = s_h if c_src_h > c_h else s_h * (-1)
+        else:
+            d = abs((h - w) / 2)
+            s_w = min(d, abs(c_src_w-c_w))
+            s_w = s_w if c_src_w > c_w else s_w * (-1)
+
+
+        c_h = (bbox_expend[1] + bbox_expend[3]) / 2 + s_h
+        c_w = (bbox_expend[0] + bbox_expend[2]) / 2 + s_w
+
+        square_x1 = c_w - c
+        square_y1 = c_h - c
+        square_x2 = c_w + c
+        square_y2 = c_h + c
+
+        return [round(square_x1), round(square_y1), round(square_x2), round(square_y2)]
+
+
+    bbox_expend = expand(bbox, expand_radio, height=height, width=width)
+    processed_bbox = to_square(bbox, bbox_expend, height=height, width=width)
+
+    return processed_bbox
+
+
+def crop_resize_img(img, bbox):
+    x1, y1, x2, y2 = bbox
+    img = img.crop((x1, y1, x2, y2))
+    return img
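A small worked example of the new helpers, with made-up coordinates: `get_union_bbox` returns the tightest rectangle covering all per-frame detections, and `process_bbox` pads it by `expand_radio` on each side (clamped to the frame bounds), then shifts and squares it.

```python
import numpy as np
# Import path assumes the file above; all numbers below are illustrative only.
from src.dataset.dataset import get_union_bbox, process_bbox

boxes = [[210, 105, 300, 200], [200, 100, 290, 195]]   # (x1, y1, x2, y2) per frame
bbox = get_union_bbox(boxes)
print(bbox)      # [200 100 300 200]

# 40% padding around the 100x100 union box in a 640x480 frame, then squared:
bbox_s = process_bbox(bbox, expand_radio=0.4, height=480, width=640)
print(bbox_s)    # [160, 60, 340, 240]
```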