# memo/utils/vision_utils.py
import logging
import cv2
import numpy as np
import torch
from insightface.app import FaceAnalysis
from moviepy.editor import AudioFileClip, VideoClip
from PIL import Image
from torchvision import transforms

logger = logging.getLogger(__name__)


def tensor_to_video(tensor, output_video_path, input_audio_path, fps=30):
"""
Converts a Tensor with shape [c, f, h, w] into a video and adds an audio track from the specified audio file.
Args:
tensor (Tensor): The Tensor to be converted, shaped [c, f, h, w].
output_video_path (str): The file path where the output video will be saved.
input_audio_path (str): The path to the audio file (WAV file) that contains the audio track to be added.
fps (int): The frame rate of the output video. Default is 30 fps.
"""
tensor = tensor.permute(1, 2, 3, 0).cpu().numpy() # convert to [f, h, w, c]
tensor = np.clip(tensor * 255, 0, 255).astype(np.uint8) # to [0, 255]
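    # moviepy requests frames by timestamp, so map t (seconds) to a frame index
    # and clamp it to the last frame so rounding never reads past the end.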
def make_frame(t):
frame_index = min(int(t * fps), tensor.shape[0] - 1)
return tensor[frame_index]
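    # Cap the clip at the shorter of the video and audio durations so both tracks end together.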
video_duration = tensor.shape[0] / fps
audio_clip = AudioFileClip(input_audio_path)
audio_duration = audio_clip.duration
final_duration = min(video_duration, audio_duration)
audio_clip = audio_clip.subclip(0, final_duration)
new_video_clip = VideoClip(make_frame, duration=final_duration)
new_video_clip = new_video_clip.set_audio(audio_clip)
new_video_clip.write_videofile(output_video_path, fps=fps, codec="libx264", audio_codec="aac")
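# Usage sketch for tensor_to_video (illustrative only; the tensor and both file
# paths below are placeholders, not part of the MEMO pipeline):
#
#     frames = torch.rand(3, 150, 512, 512)  # [c, f, h, w], values in [0, 1]
#     tensor_to_video(frames, "result.mp4", "speech.wav", fps=30)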


@torch.no_grad()
def preprocess_image(face_analysis_model: str, image_path: str, image_size: int = 512):
"""
    Preprocess the image and extract the face embedding.
Args:
face_analysis_model (str): Path to the FaceAnalysis model directory.
image_path (str): Path to the image file.
image_size (int, optional): Target size for resizing the image. Default is 512.
Returns:
tuple: A tuple containing:
- pixel_values (torch.Tensor): Tensor of the preprocessed image.
- face_emb (torch.Tensor): Tensor of the face embedding.
"""
# Define the image transformation
transform = transforms.Compose(
[
transforms.Resize((image_size, image_size)),
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5]),
]
)
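    # Note: Normalize([0.5], [0.5]) rescales pixel values from [0, 1] to [-1, 1].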
# Initialize the FaceAnalysis model
face_analysis = FaceAnalysis(
# name='',
root=face_analysis_model,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
    face_analysis.prepare(ctx_id=0, det_size=(640, 640))  # ctx_id=0 selects the first device; the detector runs at 640x640
# Load and preprocess the image
image = Image.open(image_path).convert("RGB")
pixel_values = transform(image)
    pixel_values = pixel_values.unsqueeze(0)  # add a batch dimension -> [1, 3, image_size, image_size]
# Detect faces and extract the face embedding
    image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # insightface expects BGR (OpenCV-style) input
faces = face_analysis.get(image_bgr)
if not faces:
logger.warning("No faces detected in the image. Using a zero vector as the face embedding.")
face_emb = np.zeros(512)
else:
# Sort faces by size and select the largest one
faces_sorted = sorted(
faces,
key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
reverse=True,
)
face_emb = faces_sorted[0]["embedding"]
# Convert face embedding to a PyTorch tensor
face_emb = face_emb.reshape(1, -1)
face_emb = torch.tensor(face_emb)
    del face_analysis  # drop the reference so the ONNX sessions can be freed
return pixel_values, face_emb
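
# Usage sketch for preprocess_image (illustrative only; both paths are placeholders
# for wherever the insightface weights and the reference image actually live):
#
#     pixel_values, face_emb = preprocess_image(
#         face_analysis_model="checkpoints/face_analysis",
#         image_path="reference.jpg",
#         image_size=512,
#     )
#     # pixel_values: [1, 3, 512, 512] in [-1, 1]; face_emb: [1, 512] (512-D insightface embedding)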