import os

import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn.functional as F
from mediapipe.tasks import python
from mediapipe.tasks.python import vision


# borrowed from https://github.com/daniilidis-group/neural_renderer/blob/master/neural_renderer/vertices_to_faces.py
def face_vertices(vertices, faces):
    """
    :param vertices: [batch size, number of vertices, 3]
    :param faces: [batch size, number of faces, 3]
    :return: [batch size, number of faces, 3, 3]
    """
    assert (vertices.ndimension() == 3)
    assert (faces.ndimension() == 3)
    assert (vertices.shape[0] == faces.shape[0])
    assert (vertices.shape[2] == 3)
    assert (faces.shape[2] == 3)

    bs, nv = vertices.shape[:2]
    bs, nf = faces.shape[:2]
    device = vertices.device
    # offset each batch's face indices so they index into the flattened vertex array
    faces = faces + (torch.arange(bs, dtype=torch.int32).to(device) * nv)[:, None, None]
    vertices = vertices.reshape((bs * nv, 3))
    # pytorch only supports long and byte tensors for indexing
    return vertices[faces.long()]


def vertex_normals(vertices, faces):
    """
    :param vertices: [batch size, number of vertices, 3]
    :param faces: [batch size, number of faces, 3]
    :return: [batch size, number of vertices, 3]
    """
    assert (vertices.ndimension() == 3)
    assert (faces.ndimension() == 3)
    assert (vertices.shape[0] == faces.shape[0])
    assert (vertices.shape[2] == 3)
    assert (faces.shape[2] == 3)

    bs, nv = vertices.shape[:2]
    bs, nf = faces.shape[:2]
    device = vertices.device
    normals = torch.zeros(bs * nv, 3).to(device)

    faces = faces + (torch.arange(bs, dtype=torch.int32).to(device) * nv)[:, None, None]  # expanded faces
    # pytorch only supports long and byte tensors for indexing
    vertices_faces = vertices.reshape((bs * nv, 3))[faces.long()]

    faces = faces.reshape(-1, 3)
    vertices_faces = vertices_faces.reshape(-1, 3, 3)

    # accumulate the (unnormalized) face normal at each of the face's three vertices
    normals.index_add_(0, faces[:, 1].long(),
                       torch.cross(vertices_faces[:, 2] - vertices_faces[:, 1],
                                   vertices_faces[:, 0] - vertices_faces[:, 1], dim=1))
    normals.index_add_(0, faces[:, 2].long(),
                       torch.cross(vertices_faces[:, 0] - vertices_faces[:, 2],
                                   vertices_faces[:, 1] - vertices_faces[:, 2], dim=1))
    normals.index_add_(0, faces[:, 0].long(),
                       torch.cross(vertices_faces[:, 1] - vertices_faces[:, 0],
                                   vertices_faces[:, 2] - vertices_faces[:, 0], dim=1))

    normals = F.normalize(normals, eps=1e-6, dim=1)
    normals = normals.reshape((bs, nv, 3))
    return normals


def batch_orth_proj(X, camera):
    '''
    orthographic projection
    X: 3d vertices, [bz, n_point, 3]
    camera: scale and translation, [bz, 3], [scale, tx, ty]
    '''
    camera = camera.clone().view(-1, 1, 3)
    # translate in the image plane, keep z, then apply the scale
    X_trans = X[:, :, :2] + camera[:, :, 1:]
    X_trans = torch.cat([X_trans, X[:, :, 2:]], 2)
    Xn = camera[:, :, 0:1] * X_trans
    return Xn


class MP_2_FLAME:
    """
    Convert MediaPipe's 52 blendshape scores to FLAME coefficients
    via precomputed linear mappings.
    """
    def __init__(self, mappings_path):
        self.bs2exp = np.load(os.path.join(mappings_path, 'bs2exp.npy'))
        self.bs2pose = np.load(os.path.join(mappings_path, 'bs2pose.npy'))
        self.bs2eye = np.load(os.path.join(mappings_path, 'bs2eye.npy'))

    def convert(self, blendshape_scores: np.ndarray):
        # blendshape_scores: [N, 52]
        # Calculate expression, pose, and eye pose using the linear mappings
        exp = blendshape_scores @ self.bs2exp
        pose = blendshape_scores @ self.bs2pose
        pose[:, :3] = 0  # we do not support head rotation yet
        eye_pose = blendshape_scores @ self.bs2eye
        return exp, pose, eye_pose


class MediaPipeUtils:
    def __init__(self, model_asset_path='pretrained_models/mediapipe/face_landmarker.task',
                 mappings_path='pretrained_models/mediapipe/'):
        base_options = python.BaseOptions(model_asset_path=model_asset_path)
        options = vision.FaceLandmarkerOptions(base_options=base_options,
                                               output_face_blendshapes=True,
                                               output_facial_transformation_matrixes=True,
                                               num_faces=1,
                                               min_face_detection_confidence=0.1,
                                               min_face_presence_confidence=0.1)
        self.detector = vision.FaceLandmarker.create_from_options(options)
        self.mp2flame = MP_2_FLAME(mappings_path=mappings_path)

    def run_mediapipe(self, image):
        # MediaPipe expects RGB input; OpenCV images are BGR
        image_numpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image_numpy)
        detection_result = self.detector.detect(image)
        if len(detection_result.face_landmarks) == 0:
            print('No face detected')
            return None

        # map the 52 blendshape scores to FLAME expression, pose, and eye pose
        blend_scores = detection_result.face_blendshapes[0]
        blend_scores = np.array(list(map(lambda l: l.score, blend_scores)), dtype=np.float32).reshape(1, 52)
        exp, pose, eye_pose = self.mp2flame.convert(blendshape_scores=blend_scores)

        # denormalize the 478 landmarks to pixel coordinates (z stays in MediaPipe's normalized units)
        face_landmarks = detection_result.face_landmarks[0]
        face_landmarks_numpy = np.zeros((478, 3))
        for i, landmark in enumerate(face_landmarks):
            face_landmarks_numpy[i] = [landmark.x * image.width, landmark.y * image.height, landmark.z]

        return face_landmarks_numpy, exp, pose, eye_pose
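

# A minimal sanity check for the geometry helpers above, using a single toy
# triangle in the z = 0 plane; the tensor values and expected outputs here are
# illustrative assumptions, not fixtures from this module.
def _demo_geometry():
    verts = torch.tensor([[[0.0, 0.0, 0.0],
                           [1.0, 0.0, 0.0],
                           [0.0, 1.0, 0.0]]])               # [1, 3, 3]
    faces = torch.tensor([[[0, 1, 2]]], dtype=torch.int32)  # [1, 1, 3]

    print(face_vertices(verts, faces).shape)  # torch.Size([1, 1, 3, 3])
    print(vertex_normals(verts, faces))       # every vertex normal ~ (0, 0, 1)

    camera = torch.tensor([[2.0, 0.1, -0.1]])    # [scale, tx, ty]
    print(batch_orth_proj(verts, camera).shape)  # torch.Size([1, 3, 3])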
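

# An end-to-end usage sketch, assuming the pretrained MediaPipe task file and
# blendshape-to-FLAME mapping matrices exist at the default constructor paths;
# 'example.jpg' is a hypothetical image path used only for illustration.
if __name__ == '__main__':
    _demo_geometry()

    utils = MediaPipeUtils()
    frame = cv2.imread('example.jpg')  # hypothetical input; must be a BGR image
    if frame is not None:
        result = utils.run_mediapipe(frame)
        if result is not None:  # None means no face was detected
            landmarks, exp, pose, eye_pose = result
            print(landmarks.shape)  # (478, 3), x/y in pixels
            print(exp.shape, pose.shape, eye_pose.shape)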