# memo/utils/vision_utils.py
import logging
import cv2
import numpy as np
import torch
from insightface.app import FaceAnalysis
from moviepy.editor import AudioFileClip, VideoClip
from PIL import Image
from torchvision import transforms

logger = logging.getLogger(__name__)


def tensor_to_video(tensor, output_video_path, input_audio_path, fps=30):
"""
Converts a Tensor with shape [c, f, h, w] into a video and adds an audio track from the specified audio file.
Args:
tensor (Tensor): The Tensor to be converted, shaped [c, f, h, w].
output_video_path (str): The file path where the output video will be saved.
input_audio_path (str): The path to the audio file (WAV file) that contains the audio track to be added.
fps (int): The frame rate of the output video. Default is 30 fps.
"""
tensor = tensor.permute(1, 2, 3, 0).cpu().numpy() # convert to [f, h, w, c]
tensor = np.clip(tensor * 255, 0, 255).astype(np.uint8) # to [0, 255]
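    # moviepy requests frames by timestamp, so map t (seconds) to a frame index
    # and clamp it to the last frame so rounding never reads past the end.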
def make_frame(t):
frame_index = min(int(t * fps), tensor.shape[0] - 1)
return tensor[frame_index]
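    # Cap the clip at the shorter of the video and audio durations so both tracks end together.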
video_duration = tensor.shape[0] / fps
audio_clip = AudioFileClip(input_audio_path)
audio_duration = audio_clip.duration
final_duration = min(video_duration, audio_duration)
audio_clip = audio_clip.subclip(0, final_duration)
new_video_clip = VideoClip(make_frame, duration=final_duration)
new_video_clip = new_video_clip.set_audio(audio_clip)
new_video_clip.write_videofile(output_video_path, fps=fps, codec="libx264", audio_codec="aac")
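# Usage sketch for tensor_to_video (illustrative only; the tensor and both file
# paths below are placeholders, not part of the MEMO pipeline):
#
#     frames = torch.rand(3, 150, 512, 512)  # [c, f, h, w], values in [0, 1]
#     tensor_to_video(frames, "result.mp4", "speech.wav", fps=30)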


@torch.no_grad()
def preprocess_image(face_analysis_model: str, image_path: str, image_size: int = 512):
"""
    Preprocess the image and extract the face embedding.
Args:
face_analysis_model (str): Path to the FaceAnalysis model directory.
image_path (str): Path to the image file.
image_size (int, optional): Target size for resizing the image. Default is 512.
Returns:
tuple: A tuple containing:
- pixel_values (torch.Tensor): Tensor of the preprocessed image.
- face_emb (torch.Tensor): Tensor of the face embedding.
"""
# Define the image transformation
transform = transforms.Compose(
[
transforms.Resize((image_size, image_size)),
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5]),
]
)
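    # Note: Normalize([0.5], [0.5]) rescales pixel values from [0, 1] to [-1, 1].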
# Initialize the FaceAnalysis model
face_analysis = FaceAnalysis(
# name='',
root=face_analysis_model,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
    face_analysis.prepare(ctx_id=0, det_size=(640, 640))  # ctx_id=0 selects the first device; the detector runs at 640x640
# Load and preprocess the image
image = Image.open(image_path).convert("RGB")
pixel_values = transform(image)
    pixel_values = pixel_values.unsqueeze(0)  # add a batch dimension -> [1, 3, image_size, image_size]
# Detect faces and extract the face embedding
    image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # insightface expects BGR (OpenCV-style) input
faces = face_analysis.get(image_bgr)
if not faces:
logger.warning("No faces detected in the image. Using a zero vector as the face embedding.")
face_emb = np.zeros(512)
else:
# Sort faces by size and select the largest one
faces_sorted = sorted(
faces,
key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
reverse=True,
)
face_emb = faces_sorted[0]["embedding"]
# Convert face embedding to a PyTorch tensor
face_emb = face_emb.reshape(1, -1)
face_emb = torch.tensor(face_emb)
    del face_analysis  # drop the reference so the ONNX sessions can be freed
return pixel_values, face_emb
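
# Usage sketch for preprocess_image (illustrative only; both paths are placeholders
# for wherever the insightface weights and the reference image actually live):
#
#     pixel_values, face_emb = preprocess_image(
#         face_analysis_model="checkpoints/face_analysis",
#         image_path="reference.jpg",
#         image_size=512,
#     )
#     # pixel_values: [1, 3, 512, 512] in [-1, 1]; face_emb: [1, 512] (512-D insightface embedding)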