import gradio as gr
import cv2 as cv
import tempfile
from mp_handpose import MPHandPose
from mp_palmdet import MPPalmDet
import numpy as np
from huggingface_hub import hf_hub_download

handpose_detector_path = hf_hub_download(repo_id="opencv/handpose_estimation_mediapipe", filename="handpose_estimation_mediapipe_2023feb.onnx")
palm_detector_path = hf_hub_download(repo_id="opencv/palm_detection_mediapipe", filename="palm_detection_mediapipe_2023feb.onnx")

# Run both models on the CPU through OpenCV's built-in DNN backend
backend_id = cv.dnn.DNN_BACKEND_OPENCV
target_id = cv.dnn.DNN_TARGET_CPU

palm_detector = MPPalmDet(modelPath=palm_detector_path, nmsThreshold=0.3, scoreThreshold=0.6, backendId=backend_id, targetId=target_id)
handpose_detector = MPHandPose(modelPath=handpose_detector_path, confThreshold=0.9, backendId=backend_id, targetId=target_id)

def visualize(image, hands):
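    """Draw detections on the frame and render four 3D views of the landmarks.

    Each row of `hands` is a flat 132-value vector, sliced below as:
      [0:4]     palm bounding box (x1, y1, x2, y2)
      [4:67]    21 screen-space landmarks, (x, y, z) each
      [67:130]  21 world-space landmarks (roughly metre scale, hence the
                *1000 projection onto the 3D panels)
      [-2]      handedness score (<= 0.5 -> left hand, otherwise right)
      [-1]      detection confidence
    """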
    display_screen = image.copy()
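    # 400x400 canvas split into four 200x200 quadrants: main, top, left and right views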
    display_3d = np.zeros((400, 400, 3), np.uint8)
    cv.line(display_3d, (200, 0), (200, 400), (255, 255, 255), 2)
    cv.line(display_3d, (0, 200), (400, 200), (255, 255, 255), 2)
    cv.putText(display_3d, 'Main View', (0, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Top View', (200, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Left View', (0, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Right View', (200, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

    def draw_lines(image, landmarks, is_draw_point=True, thickness=2):
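        # Edges of the 21-keypoint MediaPipe hand skeleton: five chains
        # (thumb, index, middle, ring, little), each rooted at the wrist (0)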
        connections = [
            (0, 1), (1, 2), (2, 3), (3, 4),
            (0, 5), (5, 6), (6, 7), (7, 8),
            (0, 9), (9, 10), (10, 11), (11, 12),
            (0, 13), (13, 14), (14, 15), (15, 16),
            (0, 17), (17, 18), (18, 19), (19, 20)
        ]
        for (i, j) in connections:
            cv.line(image, landmarks[i], landmarks[j], (255, 255, 255), thickness)
        if is_draw_point:
            for p in landmarks:
                cv.circle(image, p, thickness, (0, 0, 255), -1)

    gc = GestureClassification()

    for handpose in hands:
        bbox = handpose[0:4].astype(np.int32)
        handedness = handpose[-2]
        handedness_text = 'Left' if handedness <= 0.5 else 'Right'
        landmarks_screen = handpose[4:67].reshape(21, 3).astype(np.int32)
        landmarks_world = handpose[67:130].reshape(21, 3)
        gesture = gc.classify(landmarks_screen)

        cv.rectangle(display_screen, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        cv.putText(display_screen, handedness_text, (bbox[0], bbox[1] + 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
        cv.putText(display_screen, gesture, (bbox[0], bbox[1] + 30), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

        landmarks_xy = landmarks_screen[:, 0:2]
        draw_lines(display_screen, landmarks_xy, is_draw_point=False)

        for p in landmarks_screen:
            r = max(5 - p[2] // 5, 0)
            r = min(r, 14)
            cv.circle(display_screen, (p[0], p[1]), r, (0, 0, 255), -1)

        # Project the world landmarks onto the four 200x200 panels, scaling the
        # metre-scale coordinates by 1000 and offsetting into each quadrant
        # Main view
        landmarks_xy = (landmarks_world[:, [0, 1]] * 1000 + 100).astype(np.int32)
        draw_lines(display_3d, landmarks_xy, thickness=5)
        # Top view
        landmarks_xz = landmarks_world[:, [0, 2]]
        landmarks_xz[:, 1] *= -1
        landmarks_xz = (landmarks_xz * 1000 + [300, 100]).astype(np.int32)
        draw_lines(display_3d, landmarks_xz, thickness=5)
        # Left view
        landmarks_yz = landmarks_world[:, [2, 1]]
        landmarks_yz[:, 0] *= -1
        landmarks_yz = (landmarks_yz * 1000 + [100, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_yz, thickness=5)
        # Right view
        landmarks_zy = (landmarks_world[:, [2, 1]] * 1000 + [300, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_zy, thickness=5)

    return display_screen, display_3d

class GestureClassification:
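    """Rule-based digit classifier returning "Zero" through "Nine" (or "Undefined").

    _hand_angle measures, per finger, the angle between the wrist-to-joint
    vector and the outer finger segment (a large angle means the finger is
    curled towards the palm); _finger_status cross-checks by testing whether
    each fingertip lies farther from the wrist than a reference joint.
    """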
    def _vector_2_angle(self, v1, v2):
        uv1 = v1 / np.linalg.norm(v1)
        uv2 = v2 / np.linalg.norm(v2)
        # Clip so floating-point error cannot push the dot product outside
        # arccos's [-1, 1] domain and produce NaN
        return np.degrees(np.arccos(np.clip(np.dot(uv1, uv2), -1.0, 1.0)))

    def _hand_angle(self, hand):
        return [
            self._vector_2_angle(np.array([hand[0][0] - hand[i][0], hand[0][1] - hand[i][1]]),
                                 np.array([hand[i + 1][0] - hand[i + 2][0], hand[i + 1][1] - hand[i + 2][1]]))
            for i in [2, 6, 10, 14, 18]
        ]

    def _finger_status(self, lmList):
        originx, originy = lmList[0]
        # (reference joint, fingertip) index pairs per finger; a finger counts as
        # open when its tip lies farther from the wrist than the reference joint
        keypoint_list = [[5, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
        return [np.hypot(x2 - originx, y2 - originy) > np.hypot(x1 - originx, y1 - originy)
                for (x1, y1), (x2, y2) in [(lmList[i], lmList[j]) for i, j in keypoint_list]]

    def _classify(self, hand):
        angle_list = self._hand_angle(hand)
        thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen = self._finger_status(hand)
        thr_angle = 65.        # above this angle a finger counts as curled
        thr_angle_thumb = 30.  # thumb-specific curl threshold
        thr_angle_s = 49.      # below this angle a finger counts as straight
        g = "Undefined"
        if angle_list[0] > thr_angle_thumb and all(a > thr_angle for a in angle_list[1:]) and not any([firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Zero"
        elif angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s and all(a > thr_angle for a in angle_list[2:]) and firstOpen and not any([secondOpen, thirdOpen, fourthOpen]):
            g = "One"
        elif angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s and angle_list[2] < thr_angle_s and all(a > thr_angle for a in angle_list[3:]) and not thumbOpen and firstOpen and secondOpen and not any([thirdOpen, fourthOpen]):
            g = "Two"
        elif angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:4]) and angle_list[4] > thr_angle and not thumbOpen and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen:
            g = "Three"
        elif angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:]) and all([firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Four"
        elif all(a < thr_angle_s for a in angle_list) and all([thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Five"
        elif angle_list[0] < thr_angle_s and all(a > thr_angle for a in angle_list[1:4]) and angle_list[4] < thr_angle_s and thumbOpen and not any([firstOpen, secondOpen, thirdOpen]) and fourthOpen:
            g = "Six"
        elif angle_list[0] < thr_angle_s and angle_list[1] < thr_angle and all(a > thr_angle for a in angle_list[2:4]) and angle_list[4] > thr_angle_s and thumbOpen and firstOpen and not any([secondOpen, thirdOpen, fourthOpen]):
            g = "Seven"
        elif angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:3]) and angle_list[3] > thr_angle and angle_list[4] > thr_angle_s and thumbOpen and all([firstOpen, secondOpen]) and not any([thirdOpen, fourthOpen]):
            g = "Eight"
        elif angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:4]) and angle_list[4] > thr_angle_s and thumbOpen and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen:
            g = "Nine"
        return g

    def classify(self, landmarks):
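        """Classify a gesture from the 21 screen-space landmarks (x, y only)."""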
        return self._classify(landmarks[:21, :2])

def process_video(video_path):
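    """Run palm detection and hand pose estimation on every frame of the input
    video and re-encode the annotated result to a temporary mp4."""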
    cap = cv.VideoCapture(video_path)
    fps = cap.get(cv.CAP_PROP_FPS) or 30  # fall back to 30 FPS if metadata is missing
    width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))

    # tempfile.mktemp is deprecated; NamedTemporaryFile yields a safe unique path
    out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    out_writer = cv.VideoWriter(out_path, cv.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        palms = palm_detector.infer(frame)
        hands = np.empty((0, 132))
        for palm in palms:
            handpose = handpose_detector.infer(frame, palm)
            if handpose is not None:
                hands = np.vstack((hands, handpose))
        frame, _ = visualize(frame, hands)
        out_writer.write(frame)

    cap.release()
    out_writer.release()
    return out_path

# Gradio Interface
with gr.Blocks(css='''.example * {
    font-style: italic;
    font-size: 18px !important;
    color: #0ea5e9 !important;
    }''') as demo:

    gr.Markdown("### Handpose Estimation with Mediapipe using OpenCV")
    gr.Markdown("Upload a video to palm detection followed by hand pose estimation using MediaPipe and OpenCV")

    with gr.Row():
        inputs = gr.Video(label="Upload Video")
        outputs = gr.Video(label="Handpose Estimation Output")

    # Clear output when new video is uploaded
    inputs.change(fn=lambda: None, outputs=outputs)

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")

    submit_btn.click(fn=process_video, inputs=inputs, outputs=outputs)
    clear_btn.click(fn=lambda: (None, None), outputs=[inputs, outputs])

    gr.Markdown("Click on any example to try it.", elem_classes=["example"])

    gr.Examples(
        examples=[
            ["examples/handwave.mp4"],
            ["examples/handwave2.mp4"]
        ],
        inputs=inputs
    )

    gr.Markdown("Example videos credit: https://pixabay.com/")

if __name__ == "__main__":
    demo.launch()
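
# To run this demo locally (assumes mp_handpose.py and mp_palmdet.py from the
# OpenCV model zoo sit next to this script, and an examples/ folder holding the
# two sample videos referenced above):
#   pip install gradio opencv-python numpy huggingface_hub
#   python app.py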