import tempfile

import cv2 as cv
import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download

from mp_handpose import MPHandPose
from mp_palmdet import MPPalmDet

# Download the ONNX models from the OpenCV Zoo repositories on the Hugging Face Hub.
handpose_detector_path = hf_hub_download(
    repo_id="opencv/handpose_estimation_mediapipe",
    filename="handpose_estimation_mediapipe_2023feb.onnx",
)
palm_detector_path = hf_hub_download(
    repo_id="opencv/palm_detection_mediapipe",
    filename="palm_detection_mediapipe_2023feb.onnx",
)

# Run both models on the CPU through OpenCV's DNN module.
backend_id = cv.dnn.DNN_BACKEND_OPENCV
target_id = cv.dnn.DNN_TARGET_CPU

palm_detector = MPPalmDet(
    modelPath=palm_detector_path,
    nmsThreshold=0.3,
    scoreThreshold=0.6,
    backendId=backend_id,
    targetId=target_id,
)
handpose_detector = MPHandPose(
    modelPath=handpose_detector_path,
    confThreshold=0.9,
    backendId=backend_id,
    targetId=target_id,
)


def visualize(image, hands):
    display_screen = image.copy()
    # 400x400 canvas split into four quadrants for projections of the 3D world landmarks.
    display_3d = np.zeros((400, 400, 3), np.uint8)
    cv.line(display_3d, (200, 0), (200, 400), (255, 255, 255), 2)
    cv.line(display_3d, (0, 200), (400, 200), (255, 255, 255), 2)
    cv.putText(display_3d, 'Main View', (0, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Top View', (200, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Left View', (0, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
    cv.putText(display_3d, 'Right View', (200, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

    def draw_lines(image, landmarks, is_draw_point=True, thickness=2):
        # Skeleton edges connecting the 21 MediaPipe hand keypoints.
        connections = [
            (0, 1), (1, 2), (2, 3), (3, 4),
            (0, 5), (5, 6), (6, 7), (7, 8),
            (0, 9), (9, 10), (10, 11), (11, 12),
            (0, 13), (13, 14), (14, 15), (15, 16),
            (0, 17), (17, 18), (18, 19), (19, 20),
        ]
        for (i, j) in connections:
            cv.line(image, landmarks[i], landmarks[j], (255, 255, 255), thickness)
        if is_draw_point:
            for p in landmarks:
                cv.circle(image, p, thickness, (0, 0, 255), -1)

    gc = GestureClassification()

    for handpose in hands:
        bbox = handpose[0:4].astype(np.int32)
        handedness = handpose[-2]
        handedness_text = 'Left' if handedness <= 0.5 else 'Right'
        landmarks_screen = handpose[4:67].reshape(21, 3).astype(np.int32)
        landmarks_world = handpose[67:130].reshape(21, 3)
        gesture = gc.classify(landmarks_screen)

        # Bounding box, handedness and recognized gesture.
        cv.rectangle(display_screen, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
        cv.putText(display_screen, handedness_text, (bbox[0], bbox[1] + 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
        cv.putText(display_screen, gesture, (bbox[0], bbox[1] + 30), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

        # 2D skeleton on the frame; point radius shrinks with depth (z).
        landmarks_xy = landmarks_screen[:, 0:2]
        draw_lines(display_screen, landmarks_xy, is_draw_point=False)
        for p in landmarks_screen:
            r = max(5 - p[2] // 5, 0)
            r = min(r, 14)
            cv.circle(display_screen, (p[0], p[1]), r, (0, 0, 255), -1)

        # Project the world landmarks onto the four quadrants of display_3d.
        # Main view (x-y plane)
        landmarks_xy = (landmarks_world[:, [0, 1]] * 1000 + 100).astype(np.int32)
        draw_lines(display_3d, landmarks_xy, thickness=5)
        # Top view (x-z plane)
        landmarks_xz = landmarks_world[:, [0, 2]]
        landmarks_xz[:, 1] *= -1
        landmarks_xz = (landmarks_xz * 1000 + [300, 100]).astype(np.int32)
        draw_lines(display_3d, landmarks_xz, thickness=5)
        # Left view (z-y plane, z mirrored)
        landmarks_yz = landmarks_world[:, [2, 1]]
        landmarks_yz[:, 0] *= -1
        landmarks_yz = (landmarks_yz * 1000 + [100, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_yz, thickness=5)
        # Right view (z-y plane)
        landmarks_zy = (landmarks_world[:, [2, 1]] * 1000 + [300, 300]).astype(np.int32)
        draw_lines(display_3d, landmarks_zy, thickness=5)

    return display_screen, display_3d
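
# The classifier below is rule-based rather than learned: for each finger it
# measures the bend angle between the wrist->knuckle vector and the distal
# fingertip segment, and separately checks whether the fingertip lies farther
# from the wrist than a reference joint on the same finger ("open"). The digit
# gestures "Zero" through "Nine" are defined as combinations of those two
# signals; the angle thresholds (30/49/65 degrees) are empirical.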
class GestureClassification:
    def _vector_2_angle(self, v1, v2):
        # Angle in degrees between two 2D vectors.
        uv1 = v1 / np.linalg.norm(v1)
        uv2 = v2 / np.linalg.norm(v2)
        return np.degrees(np.arccos(np.dot(uv1, uv2)))

    def _hand_angle(self, hand):
        # Bend angle of each finger: wrist->knuckle vector vs. fingertip segment.
        return [
            self._vector_2_angle(
                np.array([hand[0][0] - hand[i][0], hand[0][1] - hand[i][1]]),
                np.array([hand[i + 1][0] - hand[i + 2][0], hand[i + 1][1] - hand[i + 2][1]]),
            )
            for i in [2, 6, 10, 14, 18]
        ]

    def _finger_status(self, lmList):
        # A finger counts as open when its tip is farther from the wrist
        # than the paired reference joint.
        originx, originy = lmList[0]
        keypoint_list = [[5, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
        return [
            np.hypot(x2 - originx, y2 - originy) > np.hypot(x1 - originx, y1 - originy)
            for (x1, y1), (x2, y2) in [(lmList[i], lmList[j]) for i, j in keypoint_list]
        ]

    def _classify(self, hand):
        angle_list = self._hand_angle(hand)
        thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen = self._finger_status(hand)
        thr_angle = 65.
        thr_angle_thumb = 30.
        thr_angle_s = 49.
        g = "Undefined"
        if (angle_list[0] > thr_angle_thumb and all(a > thr_angle for a in angle_list[1:])
                and not any([firstOpen, secondOpen, thirdOpen, fourthOpen])):
            g = "Zero"
        elif (angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s
                and all(a > thr_angle for a in angle_list[2:])
                and firstOpen and not any([secondOpen, thirdOpen, fourthOpen])):
            g = "One"
        elif (angle_list[0] > thr_angle_thumb and angle_list[1] < thr_angle_s
                and angle_list[2] < thr_angle_s and all(a > thr_angle for a in angle_list[3:])
                and not thumbOpen and firstOpen and secondOpen
                and not any([thirdOpen, fourthOpen])):
            g = "Two"
        elif (angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:4])
                and angle_list[4] > thr_angle and not thumbOpen
                and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen):
            g = "Three"
        elif (angle_list[0] > thr_angle_thumb and all(a < thr_angle_s for a in angle_list[1:])
                and all([firstOpen, secondOpen, thirdOpen, fourthOpen])):
            g = "Four"
        elif (all(a < thr_angle_s for a in angle_list)
                and all([thumbOpen, firstOpen, secondOpen, thirdOpen, fourthOpen])):
            g = "Five"
        elif (angle_list[0] < thr_angle_s and all(a > thr_angle for a in angle_list[1:4])
                and angle_list[4] < thr_angle_s and thumbOpen
                and not any([firstOpen, secondOpen, thirdOpen]) and fourthOpen):
            g = "Six"
        elif (angle_list[0] < thr_angle_s and angle_list[1] < thr_angle
                and all(a > thr_angle for a in angle_list[2:4]) and angle_list[4] > thr_angle_s
                and thumbOpen and firstOpen
                and not any([secondOpen, thirdOpen, fourthOpen])):
            g = "Seven"
        elif (angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:3])
                and angle_list[3] > thr_angle and angle_list[4] > thr_angle_s
                and thumbOpen and all([firstOpen, secondOpen])
                and not any([thirdOpen, fourthOpen])):
            g = "Eight"
        elif (angle_list[0] < thr_angle_s and all(a < thr_angle for a in angle_list[1:4])
                and angle_list[4] > thr_angle_s and thumbOpen
                and all([firstOpen, secondOpen, thirdOpen]) and not fourthOpen):
            g = "Nine"
        return g

    def classify(self, landmarks):
        # Only the x-y screen coordinates of the 21 landmarks are used.
        return self._classify(landmarks[:21, :2])


def process_video(video_path):
    cap = cv.VideoCapture(video_path)
    fps = cap.get(cv.CAP_PROP_FPS)
    width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
    # NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp.
    out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    out_writer = cv.VideoWriter(out_path, cv.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        palms = palm_detector.infer(frame)
        # Each estimated hand pose is a 132-element row: bbox (4) +
        # screen landmarks (63) + world landmarks (63) + handedness + confidence.
        hands = np.empty((0, 132))
        for palm in palms:
            handpose = handpose_detector.infer(frame, palm)
            if handpose is not None:
                hands = np.vstack((hands, handpose))
        frame, _ = visualize(frame, hands)
        out_writer.write(frame)

    cap.release()
    out_writer.release()
    return out_path
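
# A minimal single-image sketch of the same detect-then-estimate pipeline,
# assuming a local image path (hypothetical helper; it is illustrative and
# not wired into the Gradio app below):
def process_image(image_path):
    image = cv.imread(image_path)
    palms = palm_detector.infer(image)
    hands = np.empty((0, 132))
    for palm in palms:
        handpose = handpose_detector.infer(image, palm)
        if handpose is not None:
            hands = np.vstack((hands, handpose))
    # Returns the annotated image and the four-quadrant 3D landmark view.
    return visualize(image, hands)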
# Gradio Interface
with gr.Blocks(css='''.example * { font-style: italic; font-size: 18px !important; color: #0ea5e9 !important; }''') as demo:
    gr.Markdown("### Handpose Estimation with MediaPipe using OpenCV")
    gr.Markdown("Upload a video to run palm detection followed by hand pose estimation using MediaPipe and OpenCV.")

    with gr.Row():
        inputs = gr.Video(label="Upload Video")
        outputs = gr.Video(label="Handpose Estimation Output")

    # Clear the output when a new video is uploaded.
    inputs.change(fn=lambda: None, outputs=outputs)

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear")

    submit_btn.click(fn=process_video, inputs=inputs, outputs=outputs)
    clear_btn.click(fn=lambda: (None, None), outputs=[inputs, outputs])

    gr.Markdown("Click on any example to try it.", elem_classes=["example"])
    gr.Examples(
        examples=[
            ["examples/handwave.mp4"],
            ["examples/handwave2.mp4"],
        ],
        inputs=inputs,
    )
    gr.Markdown("Example videos credit: https://pixabay.com/")

if __name__ == "__main__":
    demo.launch()
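
# Note: processing a full video can take a while; on hosted deployments it may
# help to serialize requests with Gradio's request queue, e.g.
# demo.queue().launch() (assumes a Gradio version where Blocks.queue exists).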