import gradio as gr
import cv2 as cv
import tempfile
from mp_handpose import MPHandPose
from mp_palmdet import MPPalmDet
import numpy as np
from huggingface_hub import hf_hub_download

handpose_detector_path = hf_hub_download(repo_id="opencv/handpose_estimation_mediapipe", filename="handpose_estimation_mediapipe_2023feb.onnx")
palm_detector_path = hf_hub_download(repo_id="opencv/palm_detection_mediapipe", filename="palm_detection_mediapipe_2023feb.onnx")

# Run both models on the CPU through OpenCV's built-in DNN backend
backend_id = cv.dnn.DNN_BACKEND_OPENCV
target_id = cv.dnn.DNN_TARGET_CPU

palm_detector = MPPalmDet(modelPath=palm_detector_path, nmsThreshold=0.3, scoreThreshold=0.6, backendId=backend_id, targetId=target_id)
handpose_detector = MPHandPose(modelPath=handpose_detector_path, confThreshold=0.9, backendId=backend_id, targetId=target_id)

def visualize(image, hands):
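    """Draw detections on the frame and render four 3D views of the landmarks.

    Each row of `hands` is a flat 132-value vector, sliced below as:
      [0:4]     palm bounding box (x1, y1, x2, y2)
      [4:67]    21 screen-space landmarks, (x, y, z) each
      [67:130]  21 world-space landmarks (roughly metre scale, hence the
                *1000 projection onto the 3D panels)
      [-2]      handedness score (<= 0.5 -> left hand, otherwise right)
      [-1]      detection confidence
    """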
    display_screen = image.copy()
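    # 400x400 canvas split into four 200x200 quadrants: main, top, left and right views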
    display_3d = np.zeros((400, 400, 3), np.uint8)
    cv.line(display_3d, (200, 0), (200, 400), (255, 255, 255), 2)
    cv.line(display_3d, (0, 200), (400, 200), (255, 255, 255), 2)
    cv.putText(display_3d, 'Main View', (0, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Top View', (200, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Left View', (0, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Right View', (200, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

    def draw_lines(image, landmarks, is_draw_point=True, thickness=2):
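        # Edges of the 21-keypoint MediaPipe hand skeleton: five chains
        # (thumb, index, middle, ring, little), each rooted at the wrist (0)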
        connections = [
            (0, 1), (1, 2), (2, 3), (3, 4),
            (0, 5), (5, 6), (6, 7), (7, 8),
            (0, 9), (9, 10), (10, 11), (11, 12),
            (0, 13), (13, 14), (14, 15), (15, 16),
            (0, 17), (17, 18), (18, 19), (19, 20)
        ]
        for (i, j) in connections:
            cv.line(image, landmarks[i], landmarks[j], (255, 255, 255), thickness)
        if is_draw_point:
            for p in landmarks:
                cv.circle(image, p, thickness, (0, 0, 255), -1)

    gc = GestureClassification()

    for handpose in hands:
        bbox = handpose[0:4].astype(np.int32)
        handedness = handpose[-2]
        handedness_text = 'Left' if handedness <= 0.5 else 'Right'
        landmarks_screen = handpose[4:67].reshape(21, 3).astype(np.int32)
        landmarks_world = handpose[67:130].reshape(21, 3)
        gesture = gc.classify(landmarks_screen)

        cv.rectangle(display_screen, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        cv.putText(display_screen, handedness_text, (bbox[0], bbox[1] + 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
        cv.putText(display_screen, gesture, (bbox[0], bbox[1] + 30), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

        landmarks_xy = landmarks_screen[:, 0:2]
        draw_lines(display_screen, landmarks_xy, is_draw_point=False)

        for p in landmarks_screen:
            r = max(5 - p[2] // 5, 0)
            r = min(r, 14)
            cv.circle(display_screen, (p[0], p[1]), r, (0, 0, 255), -1)

        # Project the world landmarks onto the four 200x200 panels, scaling the
        # metre-scale coordinates by 1000 and offsetting into each quadrant
        # Main view
        landmarks_xy = (landmarks_world[:, [0, 1]] * 1000 + 100).astype(np.int32)
        draw_lines(display_3d, landmarks_xy, thickness=5)
        # Top view
        landmarks_xz = landmarks_world[:, [0, 2]]
        landmarks_xz[:, 1] *= -1
        landmarks_xz = (landmarks_xz * 1000 + [300, 100]).astype(np.int32)
        draw_lines(display_3d, landmarks_xz, thickness=5)
        # Left view
        landmarks_yz = landmarks_world[:, [2, 1]]
        landmarks_yz[:, 0] *= -1
        landmarks_yz = (landmarks_yz * 1000 + [100, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_yz, thickness=5)
        # Right view
        landmarks_zy = (landmarks_world[:, [2, 1]] * 1000 + [300, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_zy, thickness=5)

    return display_screen, display_3d

class GestureClassification:
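    """Rule-based digit classifier returning "Zero" through "Nine" (or "Undefined").

    _hand_angle measures, per finger, the angle between the wrist-to-joint
    vector and the outer finger segment (a large angle means the finger is
    curled towards the palm); _finger_status cross-checks by testing whether
    each fingertip lies farther from the wrist than a reference joint.
    """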
    def _vector_2_angle(self, v1, v2):
        uv1 = v1 / np.linalg.norm(v1)
        uv2 = v2 / np.linalg.norm(v2)
        # Clip so floating-point error cannot push the dot product outside
        # arccos's [-1, 1] domain and produce NaN
        return np.degrees(np.arccos(np.clip(np.dot(uv1, uv2), -1.0, 1.0)))

    def _hand_angle(self, hand):
        return [
            self._vector_2_angle(np.array([hand[0][0] - hand[i][0], hand[0][1] - hand[i][1]]),
                                 np.array([hand[i + 1][0] - hand[i + 2][0], hand[i + 1][1] - hand[i + 2][1]]))
            for i in [2, 6, 10, 14, 18]
        ]

    def _finger_status(self, lmList):
        originx, originy = lmList[0]
        # (reference joint, fingertip) index pairs per finger; a finger counts as
        # open when its tip lies farther from the wrist than the reference joint
        keypoint_list = [[5, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
        return [np.hypot(x2 - originx, y2 - originy) > np.hypot(x1 - originx, y1 - originy)
                for (x1, y1), (x2, y2) in [(lmList[i], lmList[j]) for i, j in keypoint_list]]

    def _classify(self, hand):
        angle_list = self._hand_angle(hand)
        thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen = self._finger_status(hand)
        thr_angle = 65.        # above this angle a finger counts as curled
        thr_angle_thumb = 30.  # thumb-specific curl threshold
        thr_angle_s = 49.      # below this angle a finger counts as straight
        g = "Undefined"
        if angle_list[0] > thr_angle_thumb and all(a > thr_angle for a in angle_list[1:]) and not any([firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Zero"
        elif angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s and all(a > thr_angle for a in angle_list[2:]) and firstOpen and not any([secondOpen, thirdOpen, fourthOpen]):
            g = "One"
        elif angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s and angle_list[2] < thr_angle_s and all(a > thr_angle for a in angle_list[3:]) and not thumbOpen and firstOpen and secondOpen and not any([thirdOpen, fourthOpen]):
            g = "Two"
        elif angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:4]) and angle_list[4] > thr_angle and not thumbOpen and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen:
            g = "Three"
        elif angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:]) and all([firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Four"
        elif all(a < thr_angle_s for a in angle_list) and all([thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Five"
        elif angle_list[0] < thr_angle_s and all(a > thr_angle for a in angle_list[1:4]) and angle_list[4] < thr_angle_s and thumbOpen and not any([firstOpen, secondOpen, thirdOpen]) and fourthOpen:
            g = "Six"
        elif angle_list[0] < thr_angle_s and angle_list[1] < thr_angle and all(a > thr_angle for a in angle_list[2:4]) and angle_list[4] > thr_angle_s and thumbOpen and firstOpen and not any([secondOpen, thirdOpen, fourthOpen]):
            g = "Seven"
        elif angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:3]) and angle_list[3] > thr_angle and angle_list[4] > thr_angle_s and thumbOpen and all([firstOpen, secondOpen]) and not any([thirdOpen, fourthOpen]):
            g = "Eight"
        elif angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:4]) and angle_list[4] > thr_angle_s and thumbOpen and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen:
            g = "Nine"
        return g

    def classify(self, landmarks):
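        """Classify a gesture from the 21 screen-space landmarks (x, y only)."""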
        return self._classify(landmarks[:21, :2])

def process_video(video_path):
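    """Run palm detection and hand pose estimation on every frame of the input
    video and re-encode the annotated result to a temporary mp4."""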
    cap = cv.VideoCapture(video_path)
    fps = cap.get(cv.CAP_PROP_FPS) or 30  # fall back to 30 FPS if metadata is missing
    width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))

    # tempfile.mktemp is deprecated; NamedTemporaryFile yields a safe unique path
    out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    out_writer = cv.VideoWriter(out_path, cv.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        palms = palm_detector.infer(frame)
        hands = np.empty((0, 132))
        for palm in palms:
            handpose = handpose_detector.infer(frame, palm)
            if handpose is not None:
                hands = np.vstack((hands, handpose))
        frame, _ = visualize(frame, hands)
        out_writer.write(frame)

    cap.release()
    out_writer.release()
    return out_path

# Gradio Interface
with gr.Blocks(css='''.example * {
    font-style: italic;
    font-size: 18px !important;
    color: #0ea5e9 !important;
    }''') as demo:

    gr.Markdown("### Handpose Estimation with Mediapipe using OpenCV")
    gr.Markdown("Upload a video to palm detection followed by hand pose estimation using MediaPipe and OpenCV")

    with gr.Row():
        inputs = gr.Video(label="Upload Video")
        outputs = gr.Video(label="Handpose Estimation Output")

    # Clear output when new video is uploaded
    inputs.change(fn=lambda: None, outputs=outputs)

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")

    submit_btn.click(fn=process_video, inputs=inputs, outputs=outputs)
    clear_btn.click(fn=lambda: (None, None), outputs=[inputs, outputs])

    gr.Markdown("Click on any example to try it.", elem_classes=["example"])

    gr.Examples(
        examples=[
            ["examples/handwave.mp4"],
            ["examples/handwave2.mp4"]
        ],
        inputs=inputs
    )

    gr.Markdown("Example videos credit: https://pixabay.com/")

if __name__ == "__main__":
    demo.launch()
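
# To run this demo locally (assumes mp_handpose.py and mp_palmdet.py from the
# OpenCV model zoo sit next to this script, and an examples/ folder holding the
# two sample videos referenced above):
#   pip install gradio opencv-python numpy huggingface_hub
#   python app.py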