Spaces:

JasonSmithSO
/

FooocusEnhanced

Configuration error

File size: 12,493 Bytes
# Openpose
# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
# 3rd Edited by ControlNet
# 4th Edited by ControlNet (added face and correct hands)
# 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixs)
# This preprocessor is licensed by CMU for non-commercial use only.

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import json
import torch
import numpy as np
from . import util
from .body import Body, BodyResult, Keypoint
from .hand import Hand
from .face import Face
from .types import PoseResult, HandResult, FaceResult, AnimalPoseResult
from huggingface_hub import hf_hub_download
from .wholebody import Wholebody
import warnings
from custom_controlnet_aux.util import HWC3, resize_image_with_pad, common_input_validate, custom_hf_download
import cv2
from PIL import Image
from .animalpose import AnimalPoseImage

from typing import Tuple, List, Callable, Union, Optional


def draw_animalposes(animals: list[list[Keypoint]], H: int, W: int) -> np.ndarray:
    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
    for animal_pose in animals:
        canvas = draw_animalpose(canvas, animal_pose)
    return canvas


def draw_animalpose(canvas: np.ndarray, keypoints: list[Keypoint]) -> np.ndarray:
    # order of the keypoints for AP10k and a standardized list of colors for limbs
    keypointPairsList = [
        (1, 2),
        (2, 3),
        (1, 3),
        (3, 4),
        (4, 9),
        (9, 10),
        (10, 11),
        (4, 6),
        (6, 7),
        (7, 8),
        (4, 5),
        (5, 15),
        (15, 16),
        (16, 17),
        (5, 12),
        (12, 13),
        (13, 14),
    ]
    colorsList = [
        (255, 255, 255),
        (100, 255, 100),
        (150, 255, 255),
        (100, 50, 255),
        (50, 150, 200),
        (0, 255, 255),
        (0, 150, 0),
        (0, 0, 255),
        (0, 0, 150),
        (255, 50, 255),
        (255, 0, 255),
        (255, 0, 0),
        (150, 0, 0),
        (255, 255, 100),
        (0, 150, 0),
        (255, 255, 0),
        (150, 150, 150),
    ]  # 16 colors needed

    for ind, (i, j) in enumerate(keypointPairsList):
        p1 = keypoints[i - 1]
        p2 = keypoints[j - 1]

        if p1 is not None and p2 is not None:
            cv2.line(
                canvas,
                (int(p1.x), int(p1.y)),
                (int(p2.x), int(p2.y)),
                colorsList[ind],
                5,
            )
    return canvas


def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True):
    """
    Draw the detected poses on an empty canvas.

    Args:
        poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
        H (int): The height of the canvas.
        W (int): The width of the canvas.
        draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
        draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
        draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.

    Returns:
        numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
    """
    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)

    for pose in poses:
        if draw_body:
            canvas = util.draw_bodypose(canvas, pose.body.keypoints)

        if draw_hand:
            canvas = util.draw_handpose(canvas, pose.left_hand)
            canvas = util.draw_handpose(canvas, pose.right_hand)

        if draw_face:
            canvas = util.draw_facepose(canvas, pose.face)

    return canvas


def decode_json_as_poses(
    pose_json: dict,
) -> Tuple[List[PoseResult], List[AnimalPoseResult], int, int]:
    """Decode the json_string complying with the openpose JSON output format
    to poses that controlnet recognizes.
    https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md

    Args:
        json_string: The json string to decode.

    Returns:
        human_poses
        animal_poses
        canvas_height
        canvas_width
    """
    height = pose_json["canvas_height"]
    width = pose_json["canvas_width"]

    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i : i + n]

    def decompress_keypoints(
        numbers: Optional[List[float]],
    ) -> Optional[List[Optional[Keypoint]]]:
        if not numbers:
            return None

        assert len(numbers) % 3 == 0

        def create_keypoint(x, y, c):
            if c < 1.0:
                return None
            keypoint = Keypoint(x, y)
            return keypoint

        return [create_keypoint(x, y, c) for x, y, c in chunks(numbers, n=3)]

    return (
        [
            PoseResult(
                body=BodyResult(
                    keypoints=decompress_keypoints(pose.get("pose_keypoints_2d"))
                ),
                left_hand=decompress_keypoints(pose.get("hand_left_keypoints_2d")),
                right_hand=decompress_keypoints(pose.get("hand_right_keypoints_2d")),
                face=decompress_keypoints(pose.get("face_keypoints_2d")),
            )
            for pose in pose_json.get("people", [])
        ],
        [decompress_keypoints(pose) for pose in pose_json.get("animals", [])],
        height,
        width,
    )


def encode_poses_as_dict(poses: List[PoseResult], canvas_height: int, canvas_width: int) -> str:
    """ Encode the pose as a dict following openpose JSON output format:
    https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md
    """
    def compress_keypoints(keypoints: Union[List[Keypoint], None]) -> Union[List[float], None]:
        if not keypoints:
            return None
        
        return [
            value
            for keypoint in keypoints
            for value in (
                [float(keypoint.x), float(keypoint.y), 1.0]
                if keypoint is not None
                else [0.0, 0.0, 0.0]
            )
        ]

    return {
        'people': [
            {
                'pose_keypoints_2d': compress_keypoints(pose.body.keypoints),
                "face_keypoints_2d": compress_keypoints(pose.face),
                "hand_left_keypoints_2d": compress_keypoints(pose.left_hand),
                "hand_right_keypoints_2d":compress_keypoints(pose.right_hand),
            }
            for pose in poses
        ],
        'canvas_height': canvas_height,
        'canvas_width': canvas_width,
    }

global_cached_dwpose = Wholebody()

class DwposeDetector:
    """
    A class for detecting human poses in images using the Dwpose model.

    Attributes:
        model_dir (str): Path to the directory where the pose models are stored.
    """
    def __init__(self, dw_pose_estimation):
        self.dw_pose_estimation = dw_pose_estimation
    
    @classmethod
    def from_pretrained(cls, pretrained_model_or_path, pretrained_det_model_or_path=None, det_filename=None, pose_filename=None, torchscript_device="cuda"):
        global global_cached_dwpose
        pretrained_det_model_or_path = pretrained_det_model_or_path or pretrained_model_or_path
        det_filename = det_filename or "yolox_l.onnx"
        pose_filename = pose_filename or "dw-ll_ucoco_384.onnx"
        det_model_path = custom_hf_download(pretrained_det_model_or_path, det_filename)
        pose_model_path = custom_hf_download(pretrained_model_or_path, pose_filename)
        
        print(f"\nDWPose: Using {det_filename} for bbox detection and {pose_filename} for pose estimation")
        if global_cached_dwpose.det is None or global_cached_dwpose.det_filename != det_filename:
            t = Wholebody(det_model_path, None, torchscript_device=torchscript_device)
            t.pose = global_cached_dwpose.pose
            t.pose_filename = global_cached_dwpose.pose
            global_cached_dwpose = t
        
        if global_cached_dwpose.pose is None or global_cached_dwpose.pose_filename != pose_filename:
            t = Wholebody(None, pose_model_path, torchscript_device=torchscript_device)
            t.det = global_cached_dwpose.det
            t.det_filename = global_cached_dwpose.det_filename
            global_cached_dwpose = t
        return cls(global_cached_dwpose)

    def detect_poses(self, oriImg) -> List[PoseResult]:
        with torch.no_grad():
            keypoints_info = self.dw_pose_estimation(oriImg.copy())
            return Wholebody.format_result(keypoints_info)
    
    def __call__(self, input_image, detect_resolution=512, include_body=True, include_hand=False, include_face=False, hand_and_face=None, output_type="pil", image_and_json=False, upscale_method="INTER_CUBIC", **kwargs):
        if hand_and_face is not None:
            warnings.warn("hand_and_face is deprecated. Use include_hand and include_face instead.", DeprecationWarning)
            include_hand = hand_and_face
            include_face = hand_and_face

        input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
        poses = self.detect_poses(input_image)
        
        canvas = draw_poses(poses, input_image.shape[0], input_image.shape[1], draw_body=include_body, draw_hand=include_hand, draw_face=include_face)
        canvas, remove_pad = resize_image_with_pad(canvas, detect_resolution, upscale_method)
        detected_map = HWC3(remove_pad(canvas))

        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)
        
        if image_and_json:
            return (detected_map, encode_poses_as_dict(poses, input_image.shape[0], input_image.shape[1]))
        
        return detected_map

global_cached_animalpose = AnimalPoseImage()
class AnimalposeDetector:
    """
    A class for detecting animal poses in images using the RTMPose AP10k model.

    Attributes:
        model_dir (str): Path to the directory where the pose models are stored.
    """
    def __init__(self, animal_pose_estimation):
        self.animal_pose_estimation = animal_pose_estimation
    
    @classmethod
    def from_pretrained(cls, pretrained_model_or_path, pretrained_det_model_or_path=None, det_filename="yolox_l.onnx", pose_filename="dw-ll_ucoco_384.onnx", torchscript_device="cuda"):
        global global_cached_animalpose
        det_model_path = custom_hf_download(pretrained_det_model_or_path, det_filename)
        pose_model_path = custom_hf_download(pretrained_model_or_path, pose_filename)
        
        print(f"\nAnimalPose: Using {det_filename} for bbox detection and {pose_filename} for pose estimation")
        if global_cached_animalpose.det is None or global_cached_animalpose.det_filename != det_filename:
            t = AnimalPoseImage(det_model_path, None, torchscript_device=torchscript_device)
            t.pose = global_cached_animalpose.pose
            t.pose_filename = global_cached_animalpose.pose
            global_cached_animalpose = t
        
        if global_cached_animalpose.pose is None or global_cached_animalpose.pose_filename != pose_filename:
            t = AnimalPoseImage(None, pose_model_path, torchscript_device=torchscript_device)
            t.det = global_cached_animalpose.det
            t.det_filename = global_cached_animalpose.det_filename
            global_cached_animalpose = t
        return cls(global_cached_animalpose)
    
    def __call__(self, input_image, detect_resolution=512, output_type="pil", image_and_json=False, upscale_method="INTER_CUBIC", **kwargs):
        input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
        input_image, remove_pad = resize_image_with_pad(input_image, detect_resolution, upscale_method)
        result = self.animal_pose_estimation(input_image)
        if result is None:
            detected_map = np.zeros_like(input_image)
            openpose_dict = {
                'version': 'ap10k',
                'animals': [],
                'canvas_height': input_image.shape[0],
                'canvas_width': input_image.shape[1]
            }
        else:
            detected_map, openpose_dict = result
        detected_map = remove_pad(detected_map)
        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)
        
        if image_and_json:
            return (detected_map, openpose_dict)

        return detected_map