import gradio as gr
import cv2 as cv
import tempfile
from mp_handpose import MPHandPose
from mp_palmdet import MPPalmDet
import numpy as np
from huggingface_hub import hf_hub_download

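# Pretrained ONNX models fetched from the OpenCV model repos on the Hugging Face Hub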
handpose_detector_path = hf_hub_download(repo_id="opencv/handpose_estimation_mediapipe", filename="handpose_estimation_mediapipe_2023feb.onnx")
palm_detector_path = hf_hub_download(repo_id="opencv/palm_detection_mediapipe", filename="palm_detection_mediapipe_2023feb.onnx")

backend_id = cv.dnn.DNN_BACKEND_OPENCV
target_id = cv.dnn.DNN_TARGET_CPU

palm_detector = MPPalmDet(modelPath=palm_detector_path, nmsThreshold=0.3, scoreThreshold=0.6, backendId=backend_id, targetId=target_id)
handpose_detector = MPHandPose(modelPath=handpose_detector_path, confThreshold=0.9, backendId=backend_id, targetId=target_id)

def visualize(image, hands):
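    """Draw hand boxes, handedness, gesture labels and the 2D skeleton on the
    frame, and render the world landmarks in four orthographic sub-views
    (main/top/left/right) on a separate 400x400 canvas."""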
    display_screen = image.copy()
    display_3d = np.zeros((400, 400, 3), np.uint8)
    cv.line(display_3d, (200, 0), (200, 400), (255, 255, 255), 2)
    cv.line(display_3d, (0, 200), (400, 200), (255, 255, 255), 2)
    cv.putText(display_3d, 'Main View', (0, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Top View', (200, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Left View', (0, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Right View', (200, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

    def draw_lines(image, landmarks, is_draw_point=True, thickness=2):
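        # MediaPipe 21-landmark hand skeleton: five chains running from the
        # wrist (landmark 0) out to each fingertip (4, 8, 12, 16, 20).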
        connections = [
            (0, 1), (1, 2), (2, 3), (3, 4),
            (0, 5), (5, 6), (6, 7), (7, 8),
            (0, 9), (9, 10), (10, 11), (11, 12),
            (0, 13), (13, 14), (14, 15), (15, 16),
            (0, 17), (17, 18), (18, 19), (19, 20)
        ]
        for (i, j) in connections:
            cv.line(image, landmarks[i], landmarks[j], (255, 255, 255), thickness)
        if is_draw_point:
            for p in landmarks:
                cv.circle(image, p, thickness, (0, 0, 255), -1)

    gc = GestureClassification()

    for handpose in hands:
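        # Row layout (132 values): bbox x1,y1,x2,y2 | 21x3 screen landmarks |
        # 21x3 world landmarks | handedness | confidence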
        bbox = handpose[0:4].astype(np.int32)
        handedness = handpose[-2]
        handedness_text = 'Left' if handedness <= 0.5 else 'Right'
        landmarks_screen = handpose[4:67].reshape(21, 3).astype(np.int32)
        landmarks_word = handpose[67:130].reshape(21, 3)
        gesture = gc.classify(landmarks_screen)

        cv.rectangle(display_screen, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        cv.putText(display_screen, handedness_text, (bbox[0], bbox[1] + 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
        cv.putText(display_screen, gesture, (bbox[0], bbox[1] + 30), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

        landmarks_xy = landmarks_screen[:, 0:2]
        draw_lines(display_screen, landmarks_xy, is_draw_point=False)

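        # Radius encodes relative depth: landmarks nearer the camera (smaller z)
        # are drawn larger, clamped to the range [0, 14] px.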
        for p in landmarks_screen:
            r = max(5 - p[2] // 5, 0)
            r = min(r, 14)
            cv.circle(display_screen, (p[0], p[1]), r, (0, 0, 255), -1)

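        # World landmarks are hand-centered metric coordinates (approximately
        # meters), so scale by 1000 and offset into a 200x200 quadrant of display_3d.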
        # Main view
        landmarks_xy = (landmarks_word[:, [0, 1]] * 1000 + 100).astype(np.int32)
        draw_lines(display_3d, landmarks_xy, thickness=5)
        # Top view
        landmarks_xz = landmarks_word[:, [0, 2]]
        landmarks_xz[:, 1] *= -1
        landmarks_xz = (landmarks_xz * 1000 + [300, 100]).astype(np.int32)
        draw_lines(display_3d, landmarks_xz, thickness=5)
        # Left view
        landmarks_yz = landmarks_word[:, [2, 1]]
        landmarks_yz[:, 0] *= -1
        landmarks_yz = (landmarks_yz * 1000 + [100, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_yz, thickness=5)
        # Right view
        landmarks_zy = (landmarks_word[:, [2, 1]] * 1000 + [300, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_zy, thickness=5)

    return display_screen, display_3d

class GestureClassification:
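    """Rule-based classifier mapping 21 hand landmarks to the counting
    gestures "Zero" through "Nine" from joint angles and finger-open states."""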
    def _vector_2_angle(self, v1, v2):
        # Angle in degrees between two 2D vectors; clipping keeps the dot
        # product inside arccos's valid [-1, 1] domain despite float error.
        uv1 = v1 / np.linalg.norm(v1)
        uv2 = v2 / np.linalg.norm(v2)
        return np.degrees(np.arccos(np.clip(np.dot(uv1, uv2), -1.0, 1.0)))

    def _hand_angle(self, hand):
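        # One bend angle per finger (thumb, index, middle, ring, pinky): the
        # angle between the joint-to-wrist direction and the fingertip segment.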
        return [
            self._vector_2_angle(np.array([hand[0][0] - hand[i][0], hand[0][1] - hand[i][1]]),
                                 np.array([hand[i + 1][0] - hand[i + 2][0], hand[i + 1][1] - hand[i + 2][1]]))
            for i in [2, 6, 10, 14, 18]
        ]

    def _finger_status(self, lmList):
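        # A finger counts as open when its tip lies farther from the wrist
        # (landmark 0) than a nearby reference joint does.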
        originx, originy = lmList[0]
        keypoint_list = [[5, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
        return [np.hypot(x2 - originx, y2 - originy) > np.hypot(x1 - originx, y1 - originy)
                for (x1, y1), (x2, y2) in [(lmList[i], lmList[j]) for i, j in keypoint_list]]

    def _classify(self, hand):
        angle_list = self._hand_angle(hand)
        thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen = self._finger_status(hand)
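        # Empirical thresholds: bend angles above thr_angle read as curled,
        # below thr_angle_s as extended; the thumb uses its own, looser threshold.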
        thr_angle = 65.
        thr_angle_thumb = 30.
        thr_angle_s = 49.
        g = "Undefined"
        if angle_list[0] > thr_angle_thumb and all(a > thr_angle for a in angle_list[1:]) and not any([firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Zero"
        elif angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s and all(a > thr_angle for a in angle_list[2:]) and firstOpen and not any([secondOpen, thirdOpen, fourthOpen]):
            g = "One"
        elif angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s and angle_list[2] < thr_angle_s and all(a > thr_angle for a in angle_list[3:]) and not thumbOpen and firstOpen and secondOpen and not any([thirdOpen, fourthOpen]):
            g = "Two"
        elif angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:4]) and angle_list[4] > thr_angle and not thumbOpen and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen:
            g = "Three"
        elif angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:]) and all([firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Four"
        elif all(a < thr_angle_s for a in angle_list) and all([thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen]):
            g = "Five"
        elif angle_list[0] < thr_angle_s and all(a > thr_angle for a in angle_list[1:4]) and angle_list[4] < thr_angle_s and thumbOpen and not any([firstOpen, secondOpen, thirdOpen]) and fourthOpen:
            g = "Six"
        elif angle_list[0] < thr_angle_s and angle_list[1] < thr_angle and all(a > thr_angle for a in angle_list[2:4]) and angle_list[4] > thr_angle_s and thumbOpen and firstOpen and not any([secondOpen, thirdOpen, fourthOpen]):
            g = "Seven"
        elif angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:3]) and angle_list[3] > thr_angle and angle_list[4] > thr_angle_s and thumbOpen and all([firstOpen, secondOpen]) and not any([thirdOpen, fourthOpen]):
            g = "Eight"
        elif angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:4]) and angle_list[4] > thr_angle_s and thumbOpen and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen:
            g = "Nine"
        return g

    def classify(self, landmarks):
        return self._classify(landmarks[:21, :2])

def process_video(video_path):
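    """Run the palm detector and handpose estimator on every frame of the
    input video and write the annotated frames to a temporary .mp4 file."""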
    cap = cv.VideoCapture(video_path)
    fps = cap.get(cv.CAP_PROP_FPS)
    width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))

    # NamedTemporaryFile avoids the race condition of the deprecated tempfile.mktemp
    out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    out_path = out_file.name
    out_file.close()
    out_writer = cv.VideoWriter(out_path, cv.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

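        # Two-stage inference: detect palms first, then estimate 21 3D
        # landmarks (screen and world) for each detected palm.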
        palms = palm_detector.infer(frame)
        hands = np.empty((0, 132))
        for palm in palms:
            handpose = handpose_detector.infer(frame, palm)
            if handpose is not None:
                hands = np.vstack((hands, handpose))
        frame, _ = visualize(frame, hands)
        out_writer.write(frame)

    cap.release()
    out_writer.release()
    return out_path

gr.Interface(
    fn=process_video,
    inputs=gr.File(label="Upload Video", file_types=[".mp4", ".avi"]),
    outputs=gr.Video(label="Processed Video"),
    title="Hand Pose Estimation and Gesture Classification",
    allow_flagging="never"
).launch()