Wanli committed
Commit 549df9a · 1 Parent(s): 9464ab9

Update handpose estimation model from MediaPipe (2023feb) (#133)


* update handpose model

* update quantize model

* fix quantize path

* update readme of quantization and benchmark result

* fix document

Files changed (3)
  1. README.md +9 -4
  2. demo.py +101 -39
  3. mp_handpose.py +17 -7
README.md CHANGED
@@ -4,11 +4,14 @@ This model estimates 21 hand keypoints per detected hand from [palm detector](..
 
 ![MediaPipe Hands Keypoints](./examples/hand_keypoints.png)
 
-This model is converted from Tensorflow-JS to ONNX using following tools:
-- tfjs to tf_saved_model: https://github.com/patlevin/tfjs-to-tf/
-- tf_saved_model to ONNX: https://github.com/onnx/tensorflow-onnx
+This model is converted from TFLite to ONNX using following tools:
+- TFLite model to ONNX: https://github.com/onnx/tensorflow-onnx
 - simplified by [onnx-simplifier](https://github.com/daquexian/onnx-simplifier)
 
+**Note**:
+- The int8-quantized model may produce invalid results due to a significant drop of accuracy.
+- Visit https://google.github.io/mediapipe/solutions/models.html#hands for models of larger scale.
+
 ## Demo
 
 Run the following commands to try the demo:
@@ -21,7 +24,7 @@ python demo.py -i /path/to/image
 
 ### Example outputs
 
-![webcam demo](./examples/mphandpose_demo.gif)
+![webcam demo](./examples/mphandpose_demo.webp)
 
 ## License
 
@@ -30,3 +33,5 @@ All files in this directory are licensed under [Apache 2.0 License](./LICENSE).
 ## Reference
 
 - MediaPipe Handpose: https://github.com/tensorflow/tfjs-models/tree/master/handpose
+- MediaPipe hands model and model card: https://google.github.io/mediapipe/solutions/models.html#hands
+- Int8 model quantized with rgb evaluation set of FreiHAND: https://lmb.informatik.uni-freiburg.de/resources/datasets/FreihandDataset.en.html
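For reference, the TFLite-to-ONNX conversion and simplification described in the updated README can be reproduced roughly as follows. This is a minimal sketch under assumed file names and opset version, not the exact export recipe used to produce handpose_estimation_mediapipe_2023feb.onnx.

# Rough sketch of the conversion pipeline named in README.md.
# Input/output file names and the opset version are assumptions, not taken from this commit.
import subprocess

# TFLite model to ONNX with tf2onnx (https://github.com/onnx/tensorflow-onnx)
subprocess.run([
    "python", "-m", "tf2onnx.convert",
    "--tflite", "hand_landmark_full.tflite",           # hypothetical input file
    "--output", "handpose_estimation_mediapipe.onnx",
    "--opset", "13",                                   # assumed opset
], check=True)

# Simplify the exported graph with onnx-simplifier (https://github.com/daquexian/onnx-simplifier)
subprocess.run([
    "python", "-m", "onnxsim",
    "handpose_estimation_mediapipe.onnx",
    "handpose_estimation_mediapipe_2023feb.onnx",
], check=True)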
demo.py CHANGED
@@ -31,69 +31,126 @@ except:
 
 parser = argparse.ArgumentParser(description='Hand Pose Estimation from MediaPipe')
 parser.add_argument('--input', '-i', type=str, help='Path to the input image. Omit for using default camera.')
-parser.add_argument('--model', '-m', type=str, default='./handpose_estimation_mediapipe_2022may.onnx', help='Path to the model.')
+parser.add_argument('--model', '-m', type=str, default='./handpose_estimation_mediapipe_2023feb.onnx', help='Path to the model.')
 parser.add_argument('--backend', '-b', type=int, default=backends[0], help=help_msg_backends.format(*backends))
 parser.add_argument('--target', '-t', type=int, default=targets[0], help=help_msg_targets.format(*targets))
-parser.add_argument('--conf_threshold', type=float, default=0.8, help='Filter out hands of confidence < conf_threshold.')
+parser.add_argument('--conf_threshold', type=float, default=0.9, help='Filter out hands of confidence < conf_threshold.')
 parser.add_argument('--save', '-s', type=str, default=False, help='Set true to save results. This flag is invalid when using camera.')
 parser.add_argument('--vis', '-v', type=str2bool, default=True, help='Set true to open a window for result visualization. This flag is invalid when using camera.')
 args = parser.parse_args()
 
 
 def visualize(image, hands, print_result=False):
-    output = image.copy()
+    display_screen = image.copy()
+    display_3d = np.zeros((400, 400, 3), np.uint8)
+    cv.line(display_3d, (200, 0), (200, 400), (255, 255, 255), 2)
+    cv.line(display_3d, (0, 200), (400, 200), (255, 255, 255), 2)
+    cv.putText(display_3d, 'Main View', (0, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    cv.putText(display_3d, 'Top View', (200, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    cv.putText(display_3d, 'Left View', (0, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    cv.putText(display_3d, 'Right View', (200, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    is_draw = False  # ensure only one hand is drawn
+
+    def draw_lines(image, landmarks, is_draw_point=True, thickness=2):
+        cv.line(image, landmarks[0], landmarks[1], (255, 255, 255), thickness)
+        cv.line(image, landmarks[1], landmarks[2], (255, 255, 255), thickness)
+        cv.line(image, landmarks[2], landmarks[3], (255, 255, 255), thickness)
+        cv.line(image, landmarks[3], landmarks[4], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[5], (255, 255, 255), thickness)
+        cv.line(image, landmarks[5], landmarks[6], (255, 255, 255), thickness)
+        cv.line(image, landmarks[6], landmarks[7], (255, 255, 255), thickness)
+        cv.line(image, landmarks[7], landmarks[8], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[9], (255, 255, 255), thickness)
+        cv.line(image, landmarks[9], landmarks[10], (255, 255, 255), thickness)
+        cv.line(image, landmarks[10], landmarks[11], (255, 255, 255), thickness)
+        cv.line(image, landmarks[11], landmarks[12], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[13], (255, 255, 255), thickness)
+        cv.line(image, landmarks[13], landmarks[14], (255, 255, 255), thickness)
+        cv.line(image, landmarks[14], landmarks[15], (255, 255, 255), thickness)
+        cv.line(image, landmarks[15], landmarks[16], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[17], (255, 255, 255), thickness)
+        cv.line(image, landmarks[17], landmarks[18], (255, 255, 255), thickness)
+        cv.line(image, landmarks[18], landmarks[19], (255, 255, 255), thickness)
+        cv.line(image, landmarks[19], landmarks[20], (255, 255, 255), thickness)
+
+        if is_draw_point:
+            for p in landmarks:
+                cv.circle(image, p, thickness, (0, 0, 255), -1)
 
     for idx, handpose in enumerate(hands):
         conf = handpose[-1]
         bbox = handpose[0:4].astype(np.int32)
-        landmarks = handpose[4:-1].reshape(21, 2).astype(np.int32)
+        handedness = handpose[-2]
+        if handedness <= 0.5:
+            handedness_text = 'Left'
+        else:
+            handedness_text = 'Right'
+        landmarks_screen = handpose[4:67].reshape(21, 3).astype(np.int32)
+        landmarks_word = handpose[67:130].reshape(21, 3)
 
         # Print results
         if print_result:
             print('-----------hand {}-----------'.format(idx + 1))
             print('conf: {:.2f}'.format(conf))
+            print('handedness: {}'.format(handedness_text))
             print('hand box: {}'.format(bbox))
             print('hand landmarks: ')
-            for l in landmarks:
+            for l in landmarks_screen:
+                print('\t{}'.format(l))
+            print('hand world landmarks: ')
+            for l in landmarks_word:
                 print('\t{}'.format(l))
 
+        # draw box
+        cv.rectangle(display_screen, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
+        # draw handedness
+        cv.putText(display_screen, '{}'.format(handedness_text), (bbox[0], bbox[1] + 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
         # Draw line between each key points
-        cv.line(output, landmarks[0], landmarks[1], (255, 255, 255), 2)
-        cv.line(output, landmarks[1], landmarks[2], (255, 255, 255), 2)
-        cv.line(output, landmarks[2], landmarks[3], (255, 255, 255), 2)
-        cv.line(output, landmarks[3], landmarks[4], (255, 255, 255), 2)
-
-        cv.line(output, landmarks[0], landmarks[5], (255, 255, 255), 2)
-        cv.line(output, landmarks[5], landmarks[6], (255, 255, 255), 2)
-        cv.line(output, landmarks[6], landmarks[7], (255, 255, 255), 2)
-        cv.line(output, landmarks[7], landmarks[8], (255, 255, 255), 2)
-
-        cv.line(output, landmarks[0], landmarks[9], (255, 255, 255), 2)
-        cv.line(output, landmarks[9], landmarks[10], (255, 255, 255), 2)
-        cv.line(output, landmarks[10], landmarks[11], (255, 255, 255), 2)
-        cv.line(output, landmarks[11], landmarks[12], (255, 255, 255), 2)
-
-        cv.line(output, landmarks[0], landmarks[13], (255, 255, 255), 2)
-        cv.line(output, landmarks[13], landmarks[14], (255, 255, 255), 2)
-        cv.line(output, landmarks[14], landmarks[15], (255, 255, 255), 2)
-        cv.line(output, landmarks[15], landmarks[16], (255, 255, 255), 2)
-
-        cv.line(output, landmarks[0], landmarks[17], (255, 255, 255), 2)
-        cv.line(output, landmarks[17], landmarks[18], (255, 255, 255), 2)
-        cv.line(output, landmarks[18], landmarks[19], (255, 255, 255), 2)
-        cv.line(output, landmarks[19], landmarks[20], (255, 255, 255), 2)
-
-        for p in landmarks:
-            cv.circle(output, p, 2, (0, 0, 255), 2)
-
-    return output
+        landmarks_xy = landmarks_screen[:, 0:2]
+        draw_lines(display_screen, landmarks_xy, is_draw_point=False)
+
+        # z value is relative to WRIST
+        for p in landmarks_screen:
+            r = max(5 - p[2] // 5, 0)
+            r = min(r, 14)
+            cv.circle(display_screen, np.array([p[0], p[1]]), r, (0, 0, 255), -1)
+
+        if is_draw is False:
+            is_draw = True
+            # Main view
+            landmarks_xy = landmarks_word[:, [0, 1]]
+            landmarks_xy = (landmarks_xy * 1000 + 100).astype(np.int32)
+            draw_lines(display_3d, landmarks_xy, thickness=5)
+
+            # Top view
+            landmarks_xz = landmarks_word[:, [0, 2]]
+            landmarks_xz[:, 1] = -landmarks_xz[:, 1]
+            landmarks_xz = (landmarks_xz * 1000 + np.array([300, 100])).astype(np.int32)
+            draw_lines(display_3d, landmarks_xz, thickness=5)
+
+            # Left view
+            landmarks_yz = landmarks_word[:, [2, 1]]
+            landmarks_yz[:, 0] = -landmarks_yz[:, 0]
+            landmarks_yz = (landmarks_yz * 1000 + np.array([100, 300])).astype(np.int32)
+            draw_lines(display_3d, landmarks_yz, thickness=5)
+
+            # Right view
+            landmarks_zy = landmarks_word[:, [2, 1]]
+            landmarks_zy = (landmarks_zy * 1000 + np.array([300, 300])).astype(np.int32)
+            draw_lines(display_3d, landmarks_zy, thickness=5)
+
+    return display_screen, display_3d
 
 
 if __name__ == '__main__':
     # palm detector
     palm_detector = MPPalmDet(modelPath='../palm_detection_mediapipe/palm_detection_mediapipe_2023feb.onnx',
                               nmsThreshold=0.3,
-                              scoreThreshold=0.8,
+                              scoreThreshold=0.6,
                               backendId=args.backend,
                               targetId=args.target)
     # handpose detector
@@ -108,7 +165,7 @@ if __name__ == '__main__':
 
         # Palm detector inference
         palms = palm_detector.infer(image)
-        hands = np.empty(shape=(0, 47))
+        hands = np.empty(shape=(0, 132))
 
         # Estimate the pose of each hand
         for palm in palms:
@@ -117,10 +174,12 @@ if __name__ == '__main__':
             if handpose is not None:
                 hands = np.vstack((hands, handpose))
         # Draw results on the input image
-        image = visualize(image, hands, True)
+        image, view_3d = visualize(image, hands, True)
 
         if len(palms) == 0:
             print('No palm detected!')
+        else:
+            print('Palm detected!')
 
         # Save results
         if args.save:
@@ -131,6 +190,7 @@ if __name__ == '__main__':
         if args.vis:
            cv.namedWindow(args.input, cv.WINDOW_AUTOSIZE)
            cv.imshow(args.input, image)
+           cv.imshow('3D HandPose Demo', view_3d)
            cv.waitKey(0)
    else: # Omit input to call default camera
        deviceId = 0
@@ -145,7 +205,7 @@ if __name__ == '__main__':
 
             # Palm detector inference
             palms = palm_detector.infer(frame)
-            hands = np.empty(shape=(0, 47))
+            hands = np.empty(shape=(0, 132))
 
             tm.start()
             # Estimate the pose of each hand
@@ -156,12 +216,14 @@ if __name__ == '__main__':
                     hands = np.vstack((hands, handpose))
             tm.stop()
             # Draw results on the input image
-            frame = visualize(frame, hands)
+            frame, view_3d = visualize(frame, hands)
 
             if len(palms) == 0:
                 print('No palm detected!')
             else:
+                print('Palm detected!')
                 cv.putText(frame, 'FPS: {:.2f}'.format(tm.getFPS()), (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
 
             cv.imshow('MediaPipe Handpose Detection Demo', frame)
+            cv.imshow('3D HandPose Demo', view_3d)
             tm.reset()
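The updated visualize() in demo.py draws the metric world landmarks on a 400x400 canvas split into Main/Top/Left/Right quadrants. The sketch below isolates that mapping; the quadrant offsets mirror the diff above, while the landmark values are fabricated for illustration.

# Standalone sketch of the world-landmark projection used by the new visualize().
# landmarks_word is metric (roughly meters, centered near the hand), so *1000 converts
# to millimeter-scale pixel offsets around each 200x200 quadrant center.
import numpy as np

landmarks_word = np.random.uniform(-0.05, 0.05, size=(21, 3))  # fake 21 x (x, y, z) landmarks

# Main view (x-y plane), top-left quadrant centered at (100, 100)
main_xy = (landmarks_word[:, [0, 1]] * 1000 + 100).astype(np.int32)

# Top view (x-z plane, z flipped), top-right quadrant centered at (300, 100)
top_xz = landmarks_word[:, [0, 2]].copy()
top_xz[:, 1] = -top_xz[:, 1]
top_xz = (top_xz * 1000 + np.array([300, 100])).astype(np.int32)

# Left view (z-y plane, z flipped), bottom-left quadrant centered at (100, 300)
left_zy = landmarks_word[:, [2, 1]].copy()
left_zy[:, 0] = -left_zy[:, 0]
left_zy = (left_zy * 1000 + np.array([100, 300])).astype(np.int32)

# Right view (z-y plane), bottom-right quadrant centered at (300, 300)
right_zy = (landmarks_word[:, [2, 1]] * 1000 + np.array([300, 300])).astype(np.int32)

print(main_xy.shape, top_xz.shape, left_zy.shape, right_zy.shape)  # all (21, 2) pixel coordinates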
mp_handpose.py CHANGED
@@ -9,7 +9,7 @@ class MPHandPose:
         self.backend_id = backendId
         self.target_id = targetId
 
-        self.input_size = np.array([256, 256]) # wh
+        self.input_size = np.array([224, 224]) # wh
         self.PALM_LANDMARK_IDS = [0, 5, 9, 13, 17, 1, 2]
         self.PALM_LANDMARKS_INDEX_OF_PALM_BASE = 0
         self.PALM_LANDMARKS_INDEX_OF_MIDDLE_FINGER_BASE = 2
@@ -115,20 +115,25 @@
         return results # [bbox_coords, landmarks_coords, conf]
 
     def _postprocess(self, blob, rotated_palm_bbox, angle, rotation_matrix):
-        landmarks, conf = blob
+        landmarks, conf, handedness, landmarks_word = blob
 
+        conf = conf[0][0]
         if conf < self.conf_threshold:
             return None
 
-        landmarks = landmarks.reshape(-1, 3) # shape: (1, 63) -> (21, 3)
+        landmarks = landmarks[0].reshape(-1, 3) # shape: (1, 63) -> (21, 3)
+        landmarks_word = landmarks_word[0].reshape(-1, 3) # shape: (1, 63) -> (21, 3)
 
         # transform coords back to the input coords
         wh_rotated_palm_bbox = rotated_palm_bbox[1] - rotated_palm_bbox[0]
         scale_factor = wh_rotated_palm_bbox / self.input_size
         landmarks[:, :2] = (landmarks[:, :2] - self.input_size / 2) * scale_factor
+        landmarks[:, 2] = landmarks[:, 2] * max(scale_factor) # depth scaling
         coords_rotation_matrix = cv.getRotationMatrix2D((0, 0), angle, 1.0)
         rotated_landmarks = np.dot(landmarks[:, :2], coords_rotation_matrix[:, :2])
         rotated_landmarks = np.c_[rotated_landmarks, landmarks[:, 2]]
+        rotated_landmarks_world = np.dot(landmarks_word[:, :2], coords_rotation_matrix[:, :2])
+        rotated_landmarks_world = np.c_[rotated_landmarks_world, landmarks_word[:, 2]]
         # invert rotation
         rotation_component = np.array([
             [rotation_matrix[0][0], rotation_matrix[1][0]],
@@ -144,12 +149,12 @@
         original_center = np.array([
             np.dot(center, inverse_rotation_matrix[0]),
             np.dot(center, inverse_rotation_matrix[1])])
-        landmarks = rotated_landmarks[:, :2] + original_center
+        landmarks[:, :2] = rotated_landmarks[:, :2] + original_center
 
         # get bounding box from rotated_landmarks
         bbox = np.array([
-            np.amin(landmarks, axis=0),
-            np.amax(landmarks, axis=0)]) # [top-left, bottom-right]
+            np.amin(landmarks[:, :2], axis=0),
+            np.amax(landmarks[:, :2], axis=0)]) # [top-left, bottom-right]
         # shift bounding box
         wh_bbox = bbox[1] - bbox[0]
         shift_vector = self.HAND_BOX_SHIFT_VECTOR * wh_bbox
@@ -162,4 +167,9 @@
             center_bbox - new_half_size,
             center_bbox + new_half_size])
 
-        return np.r_[bbox.reshape(-1), landmarks.reshape(-1), conf[0]]
+        # [0: 4]: hand bounding box found in image of format [x1, y1, x2, y2] (top-left and bottom-right points)
+        # [4: 67]: screen landmarks with format [x1, y1, z1, x2, y2 ... x21, y21, z21], z value is relative to WRIST
+        # [67: 130]: world landmarks with format [x1, y1, z1, x2, y2 ... x21, y21, z21], 3D metric x, y, z coordinate
+        # [130]: handedness, (left)[0, 1](right) hand
+        # [131]: confidence
+        return np.r_[bbox.reshape(-1), landmarks.reshape(-1), rotated_landmarks_world.reshape(-1), handedness[0][0], conf]
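Per the return-format comment added above, every row produced by MPHandPose is a 132-element vector. A minimal sketch of how a caller can unpack it, using a zero-filled placeholder row rather than real model output:

# Unpacking one 132-element result row returned by MPHandPose, following the layout
# documented in the comment above. The sample row is a placeholder, not real output.
import numpy as np

handpose = np.zeros(132, dtype=np.float32)  # placeholder result row

bbox = handpose[0:4].astype(np.int32)              # [x1, y1, x2, y2], top-left and bottom-right
landmarks_screen = handpose[4:67].reshape(21, 3)   # pixel x, y; z relative to WRIST
landmarks_world = handpose[67:130].reshape(21, 3)  # 3D metric coordinates
handedness = handpose[130]                         # <= 0.5 left hand, otherwise right (as in demo.py)
conf = handpose[131]                               # landmark confidence

print(bbox, landmarks_screen.shape, landmarks_world.shape, handedness, conf)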