Commit 934fdee
Parent(s): (none)
first commit

Files changed:
- .gitattributes +42 -0
- .gitignore +6 -0
- Checkpoint_License.txt +40 -0
- Example_Data_License.txt +29 -0
- Multi-HMR_License.txt +98 -0
- README.md +117 -0
- app.py +269 -0
- assets/visu1.gif +3 -0
- assets/visu2.gif +3 -0
- blocks/__init__.py +8 -0
- blocks/camera_embed.py +58 -0
- blocks/cross_attn_transformer.py +359 -0
- blocks/dinov2.py +27 -0
- blocks/smpl_layer.py +153 -0
- demo.py +262 -0
- example_data/170149601_13aa4e4483_c.jpg +3 -0
- example_data/3692623581_aca6eb02d4_e.jpg +3 -0
- example_data/3969570423_58eb848b75_c.jpg +3 -0
- example_data/39742984604_46934fbd50_c.jpg +3 -0
- example_data/4446582661_b188f82f3c_c.jpg +3 -0
- example_data/51960182045_d5d6407a3c_c.jpg +3 -0
- example_data/5850091922_73ba296093_c.jpg +3 -0
- model.py +485 -0
- models/multiHMR/multiHMR.pt +3 -0
- packages.txt +3 -0
- requirements.txt +15 -0
- utils/__init__.py +15 -0
- utils/camera.py +75 -0
- utils/color.py +22 -0
- utils/constants.py +9 -0
- utils/download.py +103 -0
- utils/humans.py +24 -0
- utils/image.py +40 -0
- utils/render.py +448 -0
- utils/tensor_manip.py +45 -0
.gitattributes
ADDED
@@ -0,0 +1,42 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png* filter=lfs diff=lfs merge=lfs -text
+*.pt* filter=lfs diff=lfs merge=lfs -text
+*.jpg* filter=lfs diff=lfs merge=lfs -text
+*.jpeg* filter=lfs diff=lfs merge=lfs -text
+assets/visu1.gif filter=lfs diff=lfs merge=lfs -text
+assets/visu2.gif filter=lfs diff=lfs merge=lfs -text
+models/multiHMR/multiHMR.pt filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,6 @@
+__pycache__
+*.glb
+*.npz
+tmp_data
+._.DS_Store
+.DS_Store
Checkpoint_License.txt
ADDED
@@ -0,0 +1,40 @@
+Multi-HMR Checkpoints, Copyright (c) 2024 Naver Corporation, are licensed under the Checkpoint License below.
+
+The following datasets, which are not being distributed with the Multi-HMR Checkpoints (hereinafter referred to as "Checkpoints"), were used to train one or more of the Checkpoints:
+
+(A) BEDLAM Dataset: see https://bedlam.is.tue.mpg.de
+made available under the following license: https://bedlam.is.tue.mpg.de/license.html
+Also see: Michael Black et al., "A Synthetic Dataset of Bodies Exhibiting Detailed Lifelike Animated Motion" in Proceedings IEEE/CVF Conf. on Computer Vision and Pattern Recognition (CVPR), pp. 8726-8737, June 2023.
+
+(B) AGORA Dataset: see https://agora.is.tue.mpg.de/index.html
+made available under the following license: https://agora.is.tue.mpg.de/license.html
+Also see: Priyanka Patel et al., "AGORA: Avatars in Geography Optimized for Regression Analysis" in Proceedings IEEE/CVF Conf. on Computer Vision and Pattern Recognition (CVPR), June 2021.
+
+(C) 3DPW Dataset: see https://virtualhumans.mpi-inf.mpg.de/3DPW/evaluation.html
+made available under the following license: https://virtualhumans.mpi-inf.mpg.de/3DPW/license.html
+Also see: von Marcard et al., "Recovering Accurate 3D Human Pose in The Wild Using IMUs and a Moving Camera" in European Conference on Computer Vision (ECCV), Sept. 2018.
+
+(D) UBody Dataset: see https://osx-ubody.github.io/
+made available under the following license: https://docs.google.com/document/d/1R-nn6qguO0YDkPKBleZ8NyrqGrjLXfJ7AQTTsATMYZc/edit
+Also see: Jing Lin et al., "One-Stage 3D Whole-Body Mesh Recovery with Component Aware Transformer" in CVPR 2023.
+
+
+----------------------------------------------------------------
+CHECKPOINT LICENSE WHICH ACCOUNTS FOR DATASET LICENSES ABOVE:
+----------------------------------------------------------------
+
+LICENSE GRANT
+
+BY EXERCISING ANY RIGHTS TO THE CHECKPOINTS, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, NAVER GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
+
+Subject to the terms and conditions of this License, Naver hereby grants you a personal, revocable, royalty-free, non-exclusive, non-sublicensable, non-transferable license to use the Checkpoints subject to the following conditions:
+
+(1) PERMITTED USES: You may use the Checkpoints (1) solely for the sole purpose of performing non-commercial scientific research, non-commercial education, or non-commercial artistic projects and (2) for no purpose that is excluded by any Dataset License under (A)-(D) above ("Purpose").
+
+(2) COPYRIGHT: You will retain the above copyright notice and license along with the disclaimer below in all copies or substantial portions of the Checkpoints.
+
+(3) TERM: You agree the License automatically terminates without notice if you fail to comply with its terms, or you may terminate this License by ceasing to use the Checkpoints for the Purpose. Upon termination you agree to delete any and all copies of the Checkpoints.
+
+ALL RIGHTS NOT EXPRESSLY GRANTED IN THIS LICENSE ARE RESERVED BY NAVER.
+
+THE CHECKPOINTS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL NAVER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE CHECKPOINTS OR THE USE OR OTHER DEALINGS IN THE CHECKPOINTS.
Example_Data_License.txt
ADDED
@@ -0,0 +1,29 @@
+THE FOLLOWING FILES AVAILABLE IN THIS DIRECTORY ARE BEING REDISTRIBUTED WITHOUT MODIFICATION FROM THEIR ORIGINAL SOURCE INDICATED BELOW:
+
+[A] FILE: 39742984604_46934fbd50_c.jpg CC0 [LICENSE: https://creativecommons.org/publicdomain/zero/1.0/]
+LINK TO ORIGINAL: https://www.flickr.com/photos/138224835@N02/39742984604/in/photolist-23xXbGu-2kXXquq-2kXWBq4-kMD93w-kMB2PK-2kXWAvD-kMB4i6-kMDaeu-kMB4Np-kMBmEi-2kRpySa-2gLwm9L-2jjfFMN-2kRmJmq-2bBtz9S-2jjfFKt-2kSKY3X-2j8jGWF-2ipiiZ8-2ipbovG-2kNanr8-kMD93S-kMB3st-2kYGdpc-2dmWhrJ-kMB2AZ-kMCMRj-UPgVWc-2kYAXNe-2d5bp4B-2kYGAep-2kN98Ni-214amvd-kMBJnr-2j8k8cj-2kN8kmS-2kYBJzk-2kQG8N2-kMBJAn-kMALNH-kMBnhR-kMD9uU-2kN8jWZ-kMBKiV-2dmWhih-2kXNmL8-kMAH5a-kMCEA7-kMACwx-2gLwm4F
+ATTRIBUTION: Yun_Q
+
+[B] FILE: 4446582661_b188f82f3c_c.jpg [LICENSE: CC BY-NC-ND 2.0 AVAILABLE HERE: https://creativecommons.org/licenses/by-nc-nd/2.0/]
+LINK TO ORIGINAL: https://www.flickr.com/photos/mirsasha/4446582661/in/photolist-7LVU7i-e3mDLj-e3mDxL-amhvdR-bw3UQR-amhBAH-amkkDA-e3fYZP-6sRpSL-SqQnjB-THieCq-9uZrTE-e3fYtH-e3fYYF-fQpAoR-2nNEszq-y4Faqu-4Vt164-mjzSco-amhB4P-amhBSP-YW9exq-bUPLmh-yiX3A9-5uEfDG-5PJRav-7wgFZn-XQKh1N-fBSsEX-5uzT4n-5qFqyV-mmpMR-5wju3g-YUFLBo-e5F5wq-e5yNza-4U1qLk-e5FkTG-4Vrbfc-4CNy3W-4X2Zyv-5t969C-amkjah-5qKKSG-5rzJuu-5vcjew-5uEgdC-e5EMef-4Lm1s9-ymYibg
+ATTRIBUTION: mirsasha
+
+[C] FILE: 3692623581_aca6eb02d4_e.jpg [LICENSE: CC BY-NC-ND 2.0 AVAILABLE HERE: https://creativecommons.org/licenses/by-nc-nd/2.0/]
+LINK TO ORIGINAL: https://www.flickr.com/photos/chasemcalpine/3692623581/in/photolist-6CiEkP-6CiEpR-6CiEx2-6CiEZT-6CnMiC-6CnNoN-6CiErz-7kZN9Y-6CnN1h-6CiETx-6CnN4W-6CiF6P-6NRfC8-5fw88t-tMsMc-M9JFr6-audTNc-7p39Vu-7oYi88-8fbPSG-6CiFmM-6CiDMv-6CnLCW-6CnMnu-6CnMsu-6CiECr-6CiDYz-6CnMRJ-6CnN8N-5655q7-7p3a59-y9dZj-9yXwAb-7snC7p-4FRj19-8jaDN6-qtjT1-7pn36Y-dP4XkX-dP4Xh4-6fba2g-9EtDM3-9EqJEn-9EqJtB-2i2h8EN-6P9Mee-f11f3F-f11dBx-eZZTPF-551BoQ
+ATTRIBUTION: Chase McAlpine
+
+[D] FILE: 5850091922_73ba296093_c.jpg [LICENSE: CC BY-NC-ND 2.0 AVAILABLE HERE: https://creativecommons.org/licenses/by-nc/2.0/]
+LINK TO ORIGINAL: https://www.flickr.com/photos/rwoan/5850091922/in/photolist-9UXfvQ-9UXbHo-9UXumJ-9UXrqf-9UXzou-9UUGiV-9UXn7s-9UX1Hf-9UXpk3-9UXnM1-9UXaTd-9UX6Am-9UUFne-9UXsVj-9UUfJT-9UXkkj-9UXogs-9UUvV8-9UUdBX-9UU9wB-9UUD1p-9UUkA2-9UXjQy-9UX8ym-9UX19A-9UWZqh-9UUtJP-9UXxxh-9UXavG-9UX6V3-9UUEb8-9UXf3m-9UUDQe-9UXy9C-9UXtMh-8VRM4n-8VUQPQ-9UUBsc-9UXoMj-9UXhtU-9UUf2Z-9UUy7T-9UXe31-9UXhcy-9UXuR5-9UXs8G-9UX2Xh-9UXgWj-9UUzxn-9UUdSM
+ATTRIBUTION: Ronald Woan
+
+[E] FILE: 51960182045_d5d6407a3c_c.jpg [LICENSE: CC BY 2.0 AVAILABLE HERE: https://creativecommons.org/licenses/by/2.0/]
+LINK TO ORIGINAL: https://www.flickr.com/photos/edrost88/51960182045/in/photolist-2dhKmwF-v4wCzs-vHVFcm-vHVFrj-2naxvRF-s5y56-2ozwe9b-8r14iU-8r14Bj-8qWWxB-s5y1M-2jA1RTm-NxtH1E-9QNiVy-oDvaVX-oDLQgD-oDMaZg-oDLUqv-onherq-NxtwnL-onhp6z-NPLSwA-PSsqmC-oDLVXD-onhPX7-oni2jp-oDM9Nt-onhw5q-NxtA2j-oDyjkQ-oDyj6b-oDyDSq-7RcmkH-oDvzHr-onho2F-onhqoz-onha73-onhmaz-oDvaPp-oBK9Us-oDLShc-oDM4nt-273QQFs-onhivh-onhJp4-oBKrWy-oBKxdf-w1WKvv-vHVE89-v4F4NX
+ATTRIBUTION: Erik Drost
+
+[F] FILE: 3969570423_58eb848b75_c.jpg [LICENSE: CC BY 2.0 AVAILABLE HERE: https://creativecommons.org/licenses/by-sa/2.0/]
+LINK TO ORIGINAL: https://www.flickr.com/photos/jansolo09/3969570423/in/photolist-73M5Zn-8dQJqW-8aVGvd-7ZK2Ch-8vmBrz-N3U17e-9zkuGB-yLQNS-9zoufL-6TTR1w-8q6oSW-5MzhfE-9zotMY-dJvQKH-9gaGZg-9wfaFK-7p8Eqh-62BtZL-7UzsUq-rovX6Z-7UwaMB-7UzsCE-7Uw9Fr-7Uzp1J-62xdfP-2m14TNX-4o1kr6-4oXNj2-nsi9QD-6TWGFX-7R6qCC-VbRRUk-2ooTq9C-8bCzjp-9zotxb-7p4NDM-q3gbSg-71fw1w-2oLpmTq-dhK9bi-7iwmvQ-6TPV9x-6TTSNC-7UzoTQ-bwnd4G-8jdTfA-PnFy7Y-nSL5Fv-dCHAKX-6TPPBx
+ATTRIBUTION: Jan S0L0
+
+[G] FILE: 170149601_13aa4e4483_c.jpg [LICENSE: CC BY-NC-ND 2.0 AVAILABLE HERE: https://creativecommons.org/licenses/by-nc-nd/2.0/]
+LINK TO ORIGINAL: https://www.flickr.com/photos/wallyg/170149601/in/photolist-g34xP-9yRoKk-haeGC-i4u8dc-dbzfGd-6XJgsx-37hzc-2m792Tp-2n6BnqE-5zH4Yv-8KGj4D-8XMdHG-2n6D2Yc-yyCTU-8qevrJ-9CaiFs-4Be6kH-57oaAr-2m794vN-uindz-bTvAZD-47p3Fj-bEATx9-2m3xhFm-8KGiRx-6VQxan-6bUYCv-bvdoT6-9y8bbY-6jovR1-2n6BnUq-zrvUX-2ncrjMP-2ncxVyn-2ncrjkb-aVsBcB-ShZV6w-2m79N6e-2m3tqy5-2m3usgd-9CahZ9-57oaVH-8AoThL-57ob86-57oaKi-2m4hjp2-kZQsEY-57snZL-9xVH4B-7GQdij
+ATTRIBUTION: Wally Gobetz
Multi-HMR_License.txt
ADDED
@@ -0,0 +1,98 @@
+Multi-HMR, Copyright (c) 2024 Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.
+
+A summary of the CC BY-NC-SA 4.0 license is located here:
+https://creativecommons.org/licenses/by-nc-sa/4.0/
+
+The CC BY-NC-SA 4.0 license is located here:
+https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
+
+
+**************************************************************************
+SEE NOTICES BELOW CONCERNING SOFTWARE AND DATA:
+**************************************************************************
+
+----------------------------------------------------------------
+PART 1: NOTICES CONCERNING SOFTWARE FILES:
+----------------------------------------------------------------
+
+(A) NOTICE WITH RESPECT TO THE SOFTWARE: blocks/cross_attn_transformer.py
+
+This software is being redistributed in a modified form. The original form is available here:
+
+https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/t_cond_mlp.py
+https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW in Part 3 at [A]:
+
+https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/LICENSE.md
+
+
+(B) NOTICE WITH RESPECT TO THE SOFTWARE: model.py
+
+This software is being redistributed in a modified form. The original form is available here:
+
+https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py
+https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/heads/smpl_head.py
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW in Part 3 at [B]:
+
+https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/LICENSE.md
+
+
+(C) NOTICE WITH RESPECT TO THE SOFTWARE: blocks/cross_attn_transformer.py
+
+This software is being redistributed in a modified form. The original form is available here:
+
+https://github.com/lucidrains/vit-pytorch
+
+ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW in Part 3 at [C]:
+
+https://github.com/lucidrains/vit-pytorch/blob/main/LICENSE
+
+
+----------------------------------------------------------------
+PART 2: ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE:
+----------------------------------------------------------------
+
+NOTICE WITH RESPECT TO DATA IN THIS DIRECTORY: example_data
+jpg files available in the directory are made available subject to the license set forth therein.
+
+----------------------------------------------------------------
+PART 3: ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE:
+----------------------------------------------------------------
+
+[A] / [B] https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/LICENSE.md
+
+MIT License
+
+Copyright (c) 2023 UC Regents, Shubham Goel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+[C] https://github.com/lucidrains/vit-pytorch
+
+MIT License
+
+Copyright (c) 2020 Phil Wang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,117 @@
+---
+title: Multi HMR
+emoji: 👬
+colorFrom: pink
+colorTo: purple
+sdk: gradio
+sdk_version: 4.13.0
+app_file: app.py
+pinned: false
+---
+
+<p align="center">
+<h1 align="center">Multi-HMR: Regressing Whole-Body Human Meshes <br> for Multiple Persons in a Single Shot</h1>
+
+<p align="center">
+Fabien Baradel*,
+Matthieu Armando,
+Salma Galaaoui,
+Romain Brégier, <br>
+Philippe Weinzaepfel,
+Grégory Rogez,
+Thomas Lucas*
+</p>
+
+<p align="center">
+<sup>*</sup> equal contribution
+</p>
+
+<p align="center">
+<a href="./"><img alt="arXiv" src="https://img.shields.io/badge/arXiv-xxxx.xxxxx-00ff00.svg"></a>
+<a href="./"><img alt="Blogpost" src="https://img.shields.io/badge/Blogpost-up-yellow"></a>
+<a href="./"><img alt="Demo" src="https://img.shields.io/badge/Demo-up-blue"></a>
+<a href="./"><img alt="Hugging Face Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue"></a>
+</p>
+
+<div align="center">
+<img width="49%" alt="Multi-HMR illustration 1" src="assets/visu1.gif">
+<img width="49%" alt="Multi-HMR illustration 2" src="assets/visu2.gif">
+
+<br>
+Multi-HMR is a simple yet effective single-shot model for multi-person and expressive human mesh recovery.
+It takes as input a single RGB image and efficiently performs 3D reconstruction of multiple humans in camera space.
+<br>
+</div>
+</p>
+
+## Installation
+First, you need to clone the repo.
+
+We recommend using a virtual environment for running Multi-HMR.
+Run the following commands to create the environment with ```venv```:
+```bash
+python3.9 -m venv .multihmr
+source .multihmr/bin/activate
+pip install -r requirements.txt
+```
+
+Alternatively, you can create a conda environment.
+```bash
+conda env create -f conda.yaml
+conda activate multihmr
+```
+
+The installation has been tested with CUDA 11.7.
+
+Checkpoints will automatically be downloaded to `$HOME/models/multiHMR` the first time you run the demo code.
+
+Besides these files, you also need to download the *SMPLX* model.
+You will need the [neutral model](http://smplify.is.tue.mpg.de) for running the demo code.
+Please go to the corresponding website and register to get access to the downloads section.
+Download the model and place `SMPLX_NEUTRAL.npz` in `./models/smplx/`.
+
+## Run Multi-HMR on images
+The following command will run Multi-HMR on all images in the specified `--img_folder`, and save renderings of the reconstructions in `--out_folder`.
+The `--model_name` flag specifies the model to use.
+The `--extra_views` flag additionally renders side and bird's-eye (bev) views of the reconstructed scene, and `--save_mesh` saves the meshes to a '.npy' file.
+```bash
+python3.9 demo.py \
+    --img_folder example_data \
+    --out_folder demo_out \
+    --extra_views 1 \
+    --model_name multiHMR_896_L_synth
+```
+
+## Pre-trained models
+We provide multiple pre-trained checkpoints.
+Here is a list of their main characteristics.
+Once downloaded, place them in `$HOME/models/multiHMR`.
+
+| modelname | training data | backbone | resolution | runtime (ms) |
+|-------------------------------|-----------------------------------|----------|------------|--------------|
+| [multiHMR_896_L_synth](./) | BEDLAM+AGORA | ViT-L | 896x896 | 126 |
+
+Runtimes are measured on a V100-32GB GPU.
+
+## License
+The code is distributed under the CC BY-NC-SA 4.0 License.\
+See [Multi-HMR LICENSE](Multi-HMR_License.txt), [Checkpoint LICENSE](Checkpoint_License.txt) and [Example Data LICENSE](Example_Data_License.txt) for more information.
+
+## Citing
+If you find this code useful for your research, please consider citing the following paper:
+```bibtex
+@inproceedings{multi-hmr2024,
+    title={Multi-HMR: Single-Shot Multi-Person Expressive Human Mesh Recovery},
+    author={Baradel*, Fabien and
+            Armando, Matthieu and
+            Galaaoui, Salma and
+            Br{\'e}gier, Romain and
+            Weinzaepfel, Philippe and
+            Rogez, Gr{\'e}gory and
+            Lucas*, Thomas
+            },
+    booktitle={arXiv},
+    year={2024}
+}
+```
+
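Before running the demo described in this README, a minimal pre-flight check like the following can confirm the expected file layout. This is an illustrative sketch, not a file in this commit; it relies only on the paths stated above (`./models/smplx/SMPLX_NEUTRAL.npz` and `$HOME/models/multiHMR`).

```python
import os

# Hypothetical pre-flight check, not part of the repository.
smplx_path = os.path.join("models", "smplx", "SMPLX_NEUTRAL.npz")
ckpt_dir = os.path.join(os.path.expanduser("~"), "models", "multiHMR")

# The SMPL-X neutral model must be downloaded manually (registration required).
assert os.path.isfile(smplx_path), "Place SMPLX_NEUTRAL.npz in ./models/smplx/ (see Installation)"

# Checkpoints are fetched automatically on first run, so a missing directory is fine.
print("SMPL-X model found:", smplx_path)
print("Checkpoint directory:", ckpt_dir, "(exists)" if os.path.isdir(ckpt_dir) else "(will be created)")
```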
app.py
ADDED
@@ -0,0 +1,269 @@
+"""
+On chaos-01:
+CUDA_VISIBLE_DEVICES="" XFORMERS_DISABLED=1 python app.py
+CUDA_VISIBLE_DEVICES="0" XFORMERS_DISABLED=0 python app.py
+
+On laptop:
+ssh -N -L 8000:127.0.0.1:7860 chaos-01
+
+"""
+import spaces
+import os
+os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
+
+from utils.constants import SMPLX_DIR, MEAN_PARAMS
+from argparse import ArgumentParser
+import torch
+import gradio as gr
+from PIL import Image, ImageOps
+import numpy as np
+from pathlib import Path
+
+if torch.cuda.is_available() and torch.cuda.device_count()>0:
+    device = torch.device('cuda:0')
+    os.environ["PYOPENGL_PLATFORM"] = "egl"
+    device_name = torch.cuda.get_device_name(0)
+    print(f"Device - GPU: {device_name}")
+else:
+    device = torch.device('cpu')
+    os.environ["PYOPENGL_PLATFORM"] = "osmesa"
+    device_name = 'CPU'
+    print("Device - CPU")
+
+from demo import forward_model, get_camera_parameters, overlay_human_meshes, load_model as _load_model
+from utils import normalize_rgb, demo_color as color, create_scene
+import time
+import shutil
+
+model = None
+example_data_dir = 'example_data'
+list_examples = os.listdir(example_data_dir)
+list_examples_basename = [x for x in list_examples if x.endswith(('.jpg', 'jpeg', 'png')) and not x.startswith('._')]
+list_examples = [[os.path.join(example_data_dir, x)] for x in list_examples_basename]
+_list_examples_basename = [Path(x).stem for x in list_examples_basename]
+tmp_data_dir = 'tmp_data'
+
+def download_smplx():
+    os.makedirs(os.path.join(SMPLX_DIR, 'smplx'), exist_ok=True)
+    smplx_fname = os.path.join(SMPLX_DIR, 'smplx', 'SMPLX_NEUTRAL.npz')
+
+    if not os.path.isfile(smplx_fname):
+        print('Start to download the SMPL-X model')
+        if not ('SMPLX_LOGIN' in os.environ and 'SMPLX_PWD' in os.environ):
+            raise ValueError('You need to set a secret for SMPLX_LOGIN and for SMPLX_PWD to run this space')
+        fname = "models_smplx_v1_1.zip"
+        username = os.environ['SMPLX_LOGIN'].replace('@','%40')
+        password = os.environ['SMPLX_PWD']
+        cmd = f"wget -O {fname} --save-cookies cookies.txt --keep-session-cookies --post-data 'username={username}&password={password}' \"https://download.is.tue.mpg.de/download.php?domain=smplx&sfile={fname}\""
+        os.system(cmd)
+        assert os.path.isfile(fname), "failed to download"
+        os.system(f'unzip {fname}')
+        os.system(f"cp models/smplx/SMPLX_NEUTRAL.npz {smplx_fname}")
+        assert os.path.isfile(smplx_fname), "failed to find smplx file"
+        print('SMPL-X has been successfully downloaded')
+    else:
+        print('SMPL-X is already here')
+
+    if not os.path.isfile(MEAN_PARAMS):
+        print('Start to download the SMPL mean params')
+        os.system(f"wget -O {MEAN_PARAMS} https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/smpl_mean_params.npz?versionId=CAEQHhiBgICN6M3V6xciIDU1MzUzNjZjZGNiOTQ3OWJiZTJmNThiZmY4NmMxMTM4")
+        print('SMPL mean params have been successfully downloaded')
+    else:
+        print('SMPL mean params are already here')
+
+@spaces.GPU
+def infer(fn, det_thresh, nms_kernel_size):
+    global device
+    global model
+
+    # Is it an image from example_data_dir ?
+    basename = Path(os.path.basename(fn)).stem
+    _basename = f"{basename}_thresh{int(det_thresh*100)}_nms{int(nms_kernel_size)}"
+    is_known_image = (basename in _list_examples_basename) # only images from example_data
+
+    # Filenames
+    if not is_known_image:
+        _basename = 'output' # such that we do not save all the uploaded results - not sure ?
+    _glb_fn = f"{_basename}.glb"
+    _rend_fn = f"{_basename}.png"
+    glb_fn = os.path.join(tmp_data_dir, _glb_fn)
+    rend_fn = os.path.join(tmp_data_dir, _rend_fn)
+    os.makedirs(tmp_data_dir, exist_ok=True)
+
+    # Already processed
+    is_preprocessed = False
+    if is_known_image:
+        _tmp_data_dir_files = os.listdir(tmp_data_dir)
+        is_preprocessed = (_glb_fn in _tmp_data_dir_files) and (_rend_fn in _tmp_data_dir_files) # already preprocessed
+
+    is_known = is_known_image and is_preprocessed
+    if not is_known:
+        im = Image.open(fn)
+        fov, p_x, p_y = 60, None, None # FOV=60 always here!
+        img_size = model.img_size
+
+        # Get camera information
+        p_x, p_y = None, None
+        K = get_camera_parameters(img_size, fov=fov, p_x=p_x, p_y=p_y, device=device)
+
+        # Resize but keep aspect ratio
+        img_pil = ImageOps.contain(im, (img_size,img_size)) # keep the same aspect ratio
+
+        # Which side is too small/big
+        width, height = img_pil.size
+        pad = abs(width - height) // 2
+
+        # Pad
+        img_pil_bis = ImageOps.pad(img_pil.copy(), size=(img_size, img_size), color=(255, 255, 255))
+        img_pil = ImageOps.pad(img_pil, size=(img_size, img_size)) # pad with zero on the smallest side
+
+        # Numpy - normalize - torch.
+        resize_img = normalize_rgb(np.asarray(img_pil))
+        x = torch.from_numpy(resize_img).unsqueeze(0).to(device)
+
+        img_array = np.asarray(img_pil_bis)
+        img_pil_visu = Image.fromarray(img_array)
+
+        start = time.time()
+        humans = forward_model(model, x, K, det_thresh=det_thresh, nms_kernel_size=nms_kernel_size)
+        print(f"Forward: {time.time() - start:.2f}sec")
+
+        # Overlay
+        start = time.time()
+        pred_rend_array, _ = overlay_human_meshes(humans, K, model, img_pil_visu)
+        rend_pil = Image.fromarray(pred_rend_array.astype(np.uint8))
+        rend_pil.crop()
+        if width > height:
+            rend_pil = rend_pil.crop((0,pad,width,pad+height))
+        else:
+            rend_pil = rend_pil.crop((pad,0,pad+width,height))
+        rend_pil.save(rend_fn)
+        print(f"Rendering with pyrender: {time.time() - start:.2f}sec")
+
+        # Save into glb
+        start = time.time()
+        l_mesh = [humans[j]['verts_smplx'].detach().cpu().numpy() for j in range(len(humans))]
+        l_face = [model.smpl_layer['neutral'].bm_x.faces for j in range(len(humans))]
+        scene = create_scene(img_pil_visu, l_mesh, l_face, color=color, metallicFactor=0., roughnessFactor=0.5)
+        scene.export(glb_fn)
+        print(f"Exporting scene in glb: {time.time() - start:.2f}sec")
+    else:
+        print("We already have the predictions-visus stored somewhere...")
+
+    out = [rend_fn, glb_fn]
+    print(out)
+    return out
+    # return [rend_fn, hidden_glb_fn]
+    # return [rend_fn, my_glb_fn]
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--model_name", type=str, default='multiHMR')
+    parser.add_argument("--logs_path", type=str, default='./data')
+
+    args = parser.parse_args()
+
+    # Info
+    ### Description and style
+    logo = r"""
+    <center>
+    <img src='https://europe.naverlabs.com/wp-content/uploads/2020/10/NLE_1_WHITE_264x60_opti.png' alt='Multi-HMR logo' style="width:250px; margin-bottom:10px">
+    </center>
+    """
+    title = r"""
+    <center>
+    <h1 align="center">Multi-HMR: Regressing Whole-Body Human Meshes for Multiple Persons in a Single Shot</h1>
+    </center>
+    """
+
+    description = f"""
+    The demo is running on a {device_name}.
+    <br>
+    [<b>Demo code</b>] If you want to run Multi-HMR on several images please consider using the demo code available on [our Github repo](https://github.com/naver/multiHMR)
+    """
+
+    article = r"""
+    ---
+    📝 **Citation**
+    <br>
+    If our work is useful for your research, please consider citing:
+    ```bibtex
+    @inproceedings{multihmr2024,
+        title={Multi-HMR: Regressing Whole-Body Human Meshes for Multiple Persons in a Single Shot},
+        author={Baradel*, Fabien and
+                Armando, Matthieu and
+                Galaaoui, Salma and
+                Br{\'e}gier, Romain and
+                Weinzaepfel, Philippe and
+                Rogez, Gr{\'e}gory and
+                Lucas*, Thomas},
+        booktitle={arXiv},
+        year={2024}
+    }
+    ```
+    📋 **License**
+    <br>
+    CC BY-NC-SA 4.0 License. Please refer to the [LICENSE file](./Multi-HMR_License.txt) for details.
+    <br>
+    📧 **Contact**
+    <br>
+    If you have any questions, please feel free to send a message to <b>[email protected]</b> or open an issue on the [Github repo](https://github.com/naver/multi-hmr).
+    """
+
+    # Download SMPLX model and mean params
+    download_smplx()
+
+    # Loading the model
+    model = _load_model(args.model_name, device=device)
+
+    # Gradio demo
+    with gr.Blocks(title="Multi-HMR", css=".gradio-container") as demo:
+        # gr.HTML("""
+        # <div style="font-weight:bold; text-align:center; color:royalblue;">Multi-HMR: <br> Multi-Person Whole-Body Human Mesh Recovery in a Single Shot </div>
+        # """)
+        gr.Markdown(logo)
+        gr.Markdown(title)
+        gr.Markdown(description)
+
+        with gr.Row():
+            with gr.Column():
+                input_image = gr.Image(label="Input image",
+                                       # type="pil",
+                                       type="filepath",
+                                       sources=['upload', 'clipboard'])
+            with gr.Column():
+                output_image = gr.Image(label="Reconstructions - Overlay",
+                                        # type="pil",
+                                        type="filepath",
+                                        )
+
+        gr.HTML("""<br/>""")
+
+        with gr.Row():
+            with gr.Column():
+                alpha = -70 # longitudinal rotation in degree
+                beta = 70 # latitudinal rotation in degree
+                radius = 3. # distance to the 3D model
+                radius = None # distance to the 3D model
+                output_model3d = gr.Model3D(label="Reconstructions - 3D scene",
+                                            camera_position=(alpha, beta, radius),
+                                            clear_color=[1.0, 1.0, 1.0, 0.0])
+
+        gr.HTML("""<br/>""")
+
+        with gr.Row():
+            threshold = gr.Slider(0.1, 0.7, step=0.1, value=0.3, label='Detection Threshold')
+            nms = gr.Radio(label="NMS kernel size", choices=[1, 3, 5], value=3)
+            send_btn = gr.Button("Infer")
+            send_btn.click(fn=infer, inputs=[input_image, threshold, nms], outputs=[output_image, output_model3d])
+
+        gr.Examples(list_examples,
+                    inputs=[input_image, 0.3, 3])
+
+        gr.Markdown(article)
+
+    demo.queue() # <-- Sets up a queue with default parameters
+    demo.launch(debug=True, share=False)
+
+
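For reference, the `infer` handler above returns a pair of file paths (`[rend_fn, glb_fn]`) and can also be exercised without the UI. The following is an illustrative sketch, not part of this commit; it assumes the same environment as `app.py` (SMPL-X already in place, `demo.load_model` importable, a working EGL or OSMesa renderer, and that the `spaces.GPU` decorator tolerates a plain function call outside a Hugging Face Space).

```python
# Illustrative only: drive the Gradio handler defined in app.py directly.
import app  # running the import executes the module-level setup (device selection, example listing)

app.model = app._load_model("multiHMR", device=app.device)  # same call as in app.py's __main__
rend_png, scene_glb = app.infer("example_data/5850091922_73ba296093_c.jpg",
                                det_thresh=0.3, nms_kernel_size=3)
print(rend_png)   # overlay rendering written under tmp_data/
print(scene_glb)  # 3D scene exported as a .glb under tmp_data/
```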
assets/visu1.gif
ADDED
Binary file added (tracked with Git LFS).
assets/visu2.gif
ADDED
Binary file added (tracked with Git LFS).
blocks/__init__.py
ADDED
@@ -0,0 +1,8 @@
+
+from .camera_embed import FourierPositionEncoding
+
+from .dinov2 import Dinov2Backbone
+
+from .cross_attn_transformer import TransformerDecoder
+
+from .smpl_layer import SMPL_Layer
blocks/camera_embed.py
ADDED
@@ -0,0 +1,58 @@
+# Multi-HMR
+# Copyright (c) 2024-present NAVER Corp.
+# CC BY-NC-SA 4.0 license
+
+import torch
+from torch import nn
+import numpy as np
+
+class FourierPositionEncoding(nn.Module):
+    def __init__(self, n, num_bands, max_resolution):
+        """
+        Module that generates Fourier encodings - no learning involved
+        """
+        super().__init__()
+
+        self.num_bands = num_bands
+        self.max_resolution = [max_resolution] * n
+
+    @property
+    def channels(self):
+        """
+        Return the output dimension
+        """
+        num_dims = len(self.max_resolution)
+        encoding_size = self.num_bands * num_dims
+        encoding_size *= 2 # sin-cos
+        encoding_size += num_dims # concat
+
+        return encoding_size
+
+    def forward(self, pos):
+        """
+        Forward pass that takes rays as input and generates Fourier positional encodings
+        """
+        fourier_pos_enc = _generate_fourier_features(pos, num_bands=self.num_bands, max_resolution=self.max_resolution)
+        return fourier_pos_enc
+
+
+def _generate_fourier_features(pos, num_bands, max_resolution):
+    """Generate Fourier features from a given set of positions and frequencies"""
+    b, n = pos.shape[:2]
+    device = pos.device
+
+    # Linear frequency sampling
+    min_freq = 1.0
+    freq_bands = torch.stack([torch.linspace(start=min_freq, end=res / 2, steps=num_bands, device=device) for res in max_resolution], dim=0)
+
+    # Stacking
+    per_pos_features = torch.stack([pos[i, :, :][:, :, None] * freq_bands[None, :, :] for i in range(b)], 0)
+    per_pos_features = per_pos_features.reshape(b, n, -1)
+
+    # Sin-Cos
+    per_pos_features = torch.cat([torch.sin(np.pi * per_pos_features), torch.cos(np.pi * per_pos_features)], dim=-1)
+
+    # Concat with initial pos
+    per_pos_features = torch.cat([pos, per_pos_features], dim=-1)
+
+    return per_pos_features
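As a usage illustration (not code from the repository; the values are made up), the encoder above maps a batch of 2-D ray coordinates to `channels` features per point, where the output width follows directly from the code:

```python
import torch
from blocks.camera_embed import FourierPositionEncoding

# n=2 spatial dims, 16 frequency bands, illustrative max resolution of 448.
pe = FourierPositionEncoding(n=2, num_bands=16, max_resolution=448)

rays = torch.rand(1, 100, 2)               # [batch, num_points, n]
enc = pe(rays)                             # sin/cos features concatenated with the raw positions
assert enc.shape == (1, 100, pe.channels)  # channels = 2*16*2 + 2 = 66
```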
blocks/cross_attn_transformer.py
ADDED
@@ -0,0 +1,359 @@
+# Multi-HMR
+# Copyright (c) 2024-present NAVER Corp.
+# CC BY-NC-SA 4.0 license
+
+from typing import Callable, Optional
+import torch
+from torch import nn
+from inspect import isfunction
+from einops import rearrange
+
+class AdaptiveLayerNorm1D(torch.nn.Module):
+    """
+    Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/t_cond_mlp.py#L7
+    """
+    def __init__(self, data_dim: int, norm_cond_dim: int):
+        super().__init__()
+        if data_dim <= 0:
+            raise ValueError(f"data_dim must be positive, but got {data_dim}")
+        if norm_cond_dim <= 0:
+            raise ValueError(f"norm_cond_dim must be positive, but got {norm_cond_dim}")
+        self.norm = torch.nn.LayerNorm(
+            data_dim
+        ) # TODO: Check if elementwise_affine=True is correct
+        self.linear = torch.nn.Linear(norm_cond_dim, 2 * data_dim)
+        torch.nn.init.zeros_(self.linear.weight)
+        torch.nn.init.zeros_(self.linear.bias)
+
+    def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+        # x: (batch, ..., data_dim)
+        # t: (batch, norm_cond_dim)
+        # return: (batch, data_dim)
+        x = self.norm(x)
+        alpha, beta = self.linear(t).chunk(2, dim=-1)
+
+        # Add singleton dimensions to alpha and beta
+        if x.dim() > 2:
+            alpha = alpha.view(alpha.shape[0], *([1] * (x.dim() - 2)), alpha.shape[1])
+            beta = beta.view(beta.shape[0], *([1] * (x.dim() - 2)), beta.shape[1])
+
+        return x * (1 + alpha) + beta
+
+
+def normalization_layer(norm: Optional[str], dim: int, norm_cond_dim: int = -1):
+    """
+    Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/t_cond_mlp.py#L48
+    """
+    if norm == "batch":
+        return torch.nn.BatchNorm1d(dim)
+    elif norm == "layer":
+        return torch.nn.LayerNorm(dim)
+    elif norm == "ada":
+        assert norm_cond_dim > 0, f"norm_cond_dim must be positive, got {norm_cond_dim}"
+        return AdaptiveLayerNorm1D(dim, norm_cond_dim)
+    elif norm is None:
+        return torch.nn.Identity()
+    else:
+        raise ValueError(f"Unknown norm: {norm}")
+
+
+def exists(val):
+    "Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L17"
+    return val is not None
+
+
+def default(val, d):
+    "Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L21"
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+class PreNorm(nn.Module):
+    """
+    Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L27
+    """
+    def __init__(self, dim: int, fn: Callable, norm: str = "layer", norm_cond_dim: int = -1):
+        super().__init__()
+        self.norm = normalization_layer(norm, dim, norm_cond_dim)
+        self.fn = fn
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        if isinstance(self.norm, AdaptiveLayerNorm1D):
+            return self.fn(self.norm(x, *args), **kwargs)
+        else:
+            return self.fn(self.norm(x), **kwargs)
+
+
+class FeedForward(nn.Module):
+    """
+    Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L40
+    """
+    def __init__(self, dim, hidden_dim, dropout=0.0):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, dim),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class Attention(nn.Module):
+    """
+    Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L55
+    """
+    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        project_out = not (heads == 1 and dim_head == dim)
+
+        self.heads = heads
+        self.scale = dim_head**-0.5
+
+        self.attend = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
+
+        self.to_out = (
+            nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+            if project_out
+            else nn.Identity()
+        )
+
+    def forward(self, x, mask=None):
+
+        qkv = self.to_qkv(x).chunk(3, dim=-1)
+        # n --> the num query dimension
+
+        # TODO reshape b into b2 n and mask.
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), qkv)
+
+        if mask is not None:
+            q, k, v = [x * mask[:, None, :, None] for x in [q, k, v]]
+
+        # q, k, v: [13:51:03.400365] torch.Size([22, 1, 256])
+        # q, k, v after reshape: torch.Size([16, 8, 1, 32])
+        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
+
+        if mask is not None:
+            dots = dots - (1 - mask)[:, None, None, :] * 10e10
+
+        attn = self.attend(dots)
+
+        if mask is not None: # Just for good measure; this is probably overkill
+            attn = attn * mask[:, None, None, :]
+
+        attn = self.dropout(attn)
+
+        out = torch.matmul(attn, v)
+
+        # out shape: torch.Size([16, 8, 1, 32])
+
+        out = rearrange(out, "b h n d -> b n (h d)")
+        return self.to_out(out)
+
+
+class CrossAttention(nn.Module):
+    "Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L89"
+    def __init__(self, dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        project_out = not (heads == 1 and dim_head == dim)
+
+        self.heads = heads
+        self.scale = dim_head**-0.5
+
+        self.attend = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+
+        context_dim = default(context_dim, dim)
+        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=False)
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+
+        self.to_out = (
+            nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+            if project_out
+            else nn.Identity()
+        )
+
+    def forward(self, x, context=None, mask=None):
+
+        context = default(context, x)
+        k, v = self.to_kv(context).chunk(2, dim=-1)
+        q = self.to_q(x)
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), [q, k, v])
+
+        if mask is not None:
+            q = q * mask[:, None, :, None]
+        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
+        if mask is not None:
+            dots = dots - (1 - mask).float()[:, None, :, None] * 1e6
+        attn = self.attend(dots)
+        attn = self.dropout(attn)
+
+        out = torch.matmul(attn, v)
+
+        if mask is not None: # Just for good measure; this is probably overkill
+            out = out * mask[:, None, :, None]
+        out = rearrange(out, "b h n d -> b n (h d)")
+        return self.to_out(out)
+
+class TransformerCrossAttn(nn.Module):
+    "Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L160"
+    def __init__(
+        self,
+        dim: int,
+        depth: int,
+        heads: int,
+        dim_head: int,
+        mlp_dim: int,
+        dropout: float = 0.0,
+        norm: str = "layer",
+        norm_cond_dim: int = -1,
+        context_dim: Optional[int] = None,
+    ):
+        super().__init__()
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            sa = Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)
+            ca = CrossAttention(
+                dim, context_dim=context_dim, heads=heads, dim_head=dim_head, dropout=dropout
+            )
+            ff = FeedForward(dim, mlp_dim, dropout=dropout)
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PreNorm(dim, sa, norm=norm, norm_cond_dim=norm_cond_dim),
+                        PreNorm(dim, ca, norm=norm, norm_cond_dim=norm_cond_dim),
+                        PreNorm(dim, ff, norm=norm, norm_cond_dim=norm_cond_dim),
+                    ]
+                )
+            )
+
+    def forward(self, x: torch.Tensor, *args, context=None, context_list=None, mask=None):
+
+        if context_list is None:
+            context_list = [context] * len(self.layers)
+
+        if len(context_list) != len(self.layers):
+            raise ValueError(f"len(context_list) != len(self.layers) ({len(context_list)} != {len(self.layers)})")
+
+        for i, (self_attn, cross_attn, ff) in enumerate(self.layers):
+            if mask is not None:
+                try:
+                    x = x * mask[:, :, None]
+                except:
+                    print("see ")
+                    import pdb; pdb.set_trace()
+            x = self_attn(x, mask=mask, *args) + x
+            x = cross_attn(x, mask=mask, *args, context=context_list[i]) + x
+            x = ff(x, *args) + x
+
+        if mask is not None:
+            x = x * mask[:, :, None]
+
+        return x
+
+class DropTokenDropout(nn.Module):
+    "Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L204"
+    def __init__(self, p: float = 0.1):
+        super().__init__()
+        if p < 0 or p > 1:
+            raise ValueError(
+                "dropout probability has to be between 0 and 1, " "but got {}".format(p)
+            )
+        self.p = p
+
+    def forward(self, x: torch.Tensor):
+        # x: (batch_size, seq_len, dim)
+        if self.training and self.p > 0:
+            zero_mask = torch.full_like(x[0, :, 0], self.p).bernoulli().bool()
+            # TODO: permutation idx for each batch using torch.argsort
+            if zero_mask.any():
+                x = x[:, ~zero_mask, :]
+        return x
+
+
+class ZeroTokenDropout(nn.Module):
+    "Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L223"
+    def __init__(self, p: float = 0.1):
+        super().__init__()
+        if p < 0 or p > 1:
+            raise ValueError(
+                "dropout probability has to be between 0 and 1, " "but got {}".format(p)
+            )
+        self.p = p
+
+    def forward(self, x: torch.Tensor):
+        # x: (batch_size, seq_len, dim)
+        if self.training and self.p > 0:
+            zero_mask = torch.full_like(x[:, :, 0], self.p).bernoulli().bool()
+            # Zero-out the masked tokens
+            x[zero_mask, :] = 0
+        return x
+
+
+class TransformerDecoder(nn.Module):
+    "Code modified from https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L301"
+    def __init__(
+        self,
+        num_tokens: int,
+        token_dim: int,
+        dim: int,
+        depth: int,
+        heads: int,
+        mlp_dim: int,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        emb_dropout: float = 0.0,
+        emb_dropout_type: str = 'drop',
+        norm: str = "layer",
+        norm_cond_dim: int = -1,
+        context_dim: Optional[int] = None,
+        skip_token_embedding: bool = False,
+    ):
+        super().__init__()
+        if not skip_token_embedding:
+            self.to_token_embedding = nn.Linear(token_dim, dim)
+        else:
+            self.to_token_embedding = nn.Identity()
+            if token_dim != dim:
+                raise ValueError(
+                    f"token_dim ({token_dim}) != dim ({dim}) when skip_token_embedding is True"
+                )
+
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_tokens, dim))
+        if emb_dropout_type == "drop":
+            self.dropout = DropTokenDropout(emb_dropout)
+        elif emb_dropout_type == "zero":
+            self.dropout = ZeroTokenDropout(emb_dropout)
+        elif emb_dropout_type == "normal":
+            self.dropout = nn.Dropout(emb_dropout)
+
+        self.transformer = TransformerCrossAttn(
+            dim,
+            depth,
+            heads,
+            dim_head,
+            mlp_dim,
+            dropout,
+            norm=norm,
+            norm_cond_dim=norm_cond_dim,
+            context_dim=context_dim,
+        )
+
+    def forward(self, inp: torch.Tensor, *args, context=None, context_list=None, mask=None):
+        x = self.to_token_embedding(inp)
+        b, n, _ = x.shape
+
+        x = self.dropout(x)
+        #x += self.pos_embedding[:, :n]
+        x += self.pos_embedding[:, 0][:, None, :] # For now, we don't wish to embed a position. We might in future versions though.
+        x = self.transformer(x, *args, context=context, context_list=context_list, mask=mask)
+        return x
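For orientation, the decoder defined above cross-attends a small set of query tokens to a sequence of image-patch features. The following is an illustrative sketch, not code from the repository; all dimensions are made up, chosen only to show the shape contract:

```python
import torch
from blocks.cross_attn_transformer import TransformerDecoder

decoder = TransformerDecoder(num_tokens=1, token_dim=256, dim=256,
                             depth=2, heads=8, mlp_dim=512,
                             context_dim=1024)   # context_dim matches the patch-embedding width

queries = torch.rand(4, 10, 256)      # [batch, num_queries, token_dim], e.g. one token per detected person
patches = torch.rand(4, 4096, 1024)   # [batch, num_patches, context_dim], e.g. ViT features

out = decoder(queries, context=patches)
assert out.shape == (4, 10, 256)      # one refined embedding per query token
```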
blocks/dinov2.py
ADDED
@@ -0,0 +1,27 @@
+# Multi-HMR
+# Copyright (c) 2024-present NAVER Corp.
+# CC BY-NC-SA 4.0 license
+
+import torch
+from torch import nn
+
+class Dinov2Backbone(nn.Module):
+    def __init__(self, name='dinov2_vitb14', *args, **kwargs):
+        super().__init__()
+        self.name = name
+        self.encoder = torch.hub.load('facebookresearch/dinov2', self.name, pretrained=False)
+        self.patch_size = self.encoder.patch_size
+        self.embed_dim = self.encoder.embed_dim
+
+    def forward(self, x):
+        """
+        Encode an RGB image using a ViT backbone
+        Args:
+            - x: torch.Tensor of shape [bs,3,w,h]
+        Return:
+            - y: torch.Tensor of shape [bs,k,d] - image in patchified mode
+        """
+        assert len(x.shape) == 4
+        y = self.encoder.get_intermediate_layers(x)[0] # ViT-L+896x896: [bs,4096,1024] - [bs,nb_patches,emb]
+        return y
+
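As an illustrative shape check (not repository code; it needs network access for `torch.hub` and uses random weights since `pretrained=False`), the backbone above patchifies the image with a stride of `patch_size`:

```python
import torch
from blocks.dinov2 import Dinov2Backbone

backbone = Dinov2Backbone(name='dinov2_vitl14')   # fetches the architecture definition via torch.hub
backbone.eval()

x = torch.rand(1, 3, 896, 896)                    # H and W must be multiples of patch_size (14)
with torch.no_grad():
    y = backbone(x)

# 896/14 = 64 patches per side -> 64*64 = 4096 tokens of width embed_dim (1024 for ViT-L),
# matching the shape noted in the forward() comment above.
assert y.shape == (1, 4096, backbone.embed_dim)
```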
blocks/smpl_layer.py
ADDED
@@ -0,0 +1,153 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import torch
from torch import nn
import smplx
import numpy as np
import utils
from utils import inverse_perspective_projection, perspective_projection
import roma
import pickle
import os
from utils.constants import SMPLX_DIR

class SMPL_Layer(nn.Module):
    """
    Extension of the SMPL layer with information about the camera for (inverse) projection to the camera plane.
    """
    def __init__(self,
                 type='smplx',
                 gender='neutral',
                 num_betas=10,
                 kid=False,
                 person_center=None,
                 *args,
                 **kwargs,
                 ):
        super().__init__()

        # Args
        assert type == 'smplx'
        self.type = type
        self.kid = kid
        self.num_betas = num_betas
        self.bm_x = smplx.create(SMPLX_DIR, 'smplx', gender=gender, use_pca=False, flat_hand_mean=True, num_betas=num_betas)

        # Primary keypoint - root
        self.joint_names = eval(f"utils.get_{self.type}_joint_names")()
        self.person_center = person_center
        self.person_center_idx = None
        if self.person_center is not None:
            self.person_center_idx = self.joint_names.index(self.person_center)

    def forward(self,
                pose, shape,
                loc, dist, transl,
                K,
                expression=None, # facial expression
                ):
        """
        Args:
            - pose: pose of the person in axis-angle - torch.Tensor [bs,24,3]
            - shape: torch.Tensor [bs,10]
            - loc: 2D location of the pelvis in pixel space - torch.Tensor [bs,2]
            - dist: distance of the pelvis from the camera in m - torch.Tensor [bs,1]
        Return:
            - dict containing a bunch of useful information about each person
        """

        if loc is not None and dist is not None:
            assert pose.shape[0] == shape.shape[0] == loc.shape[0] == dist.shape[0]
        if self.type == 'smpl':
            assert len(pose.shape) == 3 and list(pose.shape[1:]) == [24,3]
        elif self.type == 'smplx':
            assert len(pose.shape) == 3 and list(pose.shape[1:]) == [53,3] # taking root_orient, body_pose, lhand, rhand and jaw for the moment
        else:
            raise NameError
        assert len(shape.shape) == 2 and (list(shape.shape[1:]) == [self.num_betas] or list(shape.shape[1:]) == [self.num_betas+1])
        if loc is not None and dist is not None:
            assert len(loc.shape) == 2 and list(loc.shape[1:]) == [2]
            assert len(dist.shape) == 2 and list(dist.shape[1:]) == [1]

        bs = pose.shape[0]

        out = {}

        # No humans
        if bs == 0:
            return {}

        # Low dimensional parameters
        kwargs_pose = {
            'betas': shape,
        }
        kwargs_pose['global_orient'] = self.bm_x.global_orient.repeat(bs,1)
        kwargs_pose['body_pose'] = pose[:,1:22].flatten(1)
        kwargs_pose['left_hand_pose'] = pose[:,22:37].flatten(1)
        kwargs_pose['right_hand_pose'] = pose[:,37:52].flatten(1)
        kwargs_pose['jaw_pose'] = pose[:,52:53].flatten(1)

        if expression is not None:
            kwargs_pose['expression'] = expression.flatten(1) # [bs,10]
        else:
            kwargs_pose['expression'] = self.bm_x.expression.repeat(bs,1)

        # default - to be generalized
        kwargs_pose['leye_pose'] = self.bm_x.leye_pose.repeat(bs,1)
        kwargs_pose['reye_pose'] = self.bm_x.reye_pose.repeat(bs,1)

        # Forward using the parametric 3d model SMPL-X layer
        output = self.bm_x(**kwargs_pose)
        verts = output.vertices
        j3d = output.joints # 45 joints
        R = roma.rotvec_to_rotmat(pose[:,0])

        # Apply global orientation on 3D points
        pelvis = j3d[:,[0]]
        j3d = (R.unsqueeze(1) @ (j3d - pelvis).unsqueeze(-1)).squeeze(-1)

        # Apply global orientation on 3D points - bis
        verts = (R.unsqueeze(1) @ (verts - pelvis).unsqueeze(-1)).squeeze(-1)

        # Location of the person in 3D
        if transl is None:
            if K.dtype == torch.float16:
                # because of torch.inverse - not working with float16 at the moment
                transl = inverse_perspective_projection(loc.unsqueeze(1).float(), K.float(), dist.unsqueeze(1).float())[:,0]
                transl = transl.half()
            else:
                transl = inverse_perspective_projection(loc.unsqueeze(1), K, dist.unsqueeze(1))[:,0]

        # Updating transl if we choose a certain person center
        transl_up = transl.clone()

        # Definition of the translation depends on the args: 1) vanilla SMPL - 2) computed from a given joint
        if self.person_center_idx is None:
            # Add pelvis to transl - standard way for SMPLX layer
            transl_up = transl_up + pelvis[:,0]
        else:
            # Center around the joint because the translation is computed from this joint
            person_center = j3d[:, [self.person_center_idx]]
            verts = verts - person_center
            j3d = j3d - person_center

        # Moving into the camera coordinate system
        j3d_cam = j3d + transl_up.unsqueeze(1)
        verts_cam = verts + transl_up.unsqueeze(1)

        # Projection in camera plane
        j2d = perspective_projection(j3d_cam, K)

        out.update({
            'verts_smplx_cam': verts_cam,
            'j3d': j3d_cam,
            'j2d': j2d,
            'transl': transl, # translation of the primary keypoint
            'transl_pelvis': j3d_cam[:,[0]], # root=pelvis
        })

        return out
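For reference, a small sketch of how the [bs,53,3] axis-angle tensor expected by forward() above is partitioned: 1 global orientation + 21 body joints + 15 left-hand + 15 right-hand + 1 jaw rotation = 53 rotation vectors:

# Sketch of the pose slicing used by SMPL_Layer.forward (dummy rest-pose tensor).
import torch

pose = torch.zeros(2, 53, 3)               # two people, all joints at the rest pose
global_orient = pose[:, 0]                 # used via roma.rotvec_to_rotmat(pose[:, 0])
body_pose     = pose[:, 1:22].flatten(1)   # -> 'body_pose'       (bs, 21*3)
left_hand     = pose[:, 22:37].flatten(1)  # -> 'left_hand_pose'  (bs, 15*3)
right_hand    = pose[:, 37:52].flatten(1)  # -> 'right_hand_pose' (bs, 15*3)
jaw           = pose[:, 52:53].flatten(1)  # -> 'jaw_pose'        (bs, 1*3)
print(body_pose.shape, left_hand.shape, right_hand.shape, jaw.shape)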
demo.py
ADDED
@@ -0,0 +1,262 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import os
os.environ["PYOPENGL_PLATFORM"] = "egl"
os.environ['EGL_DEVICE_ID'] = '0'

import sys
from argparse import ArgumentParser
import random
import pickle as pkl
import numpy as np
from PIL import Image, ImageOps
import torch
from tqdm import tqdm
import time

from utils import normalize_rgb, render_meshes, get_focalLength_from_fieldOfView, demo_color as color, print_distance_on_image, render_side_views, create_scene, MEAN_PARAMS, CACHE_DIR_MULTIHMR, SMPLX_DIR
from model import Model
from pathlib import Path
import warnings

torch.cuda.empty_cache()

np.random.seed(seed=0)
random.seed(0)

def open_image(img_path, img_size, device=torch.device('cuda')):
    """ Open image at path, resize and pad """

    # Open and reshape
    img_pil = Image.open(img_path).convert('RGB')
    img_pil = ImageOps.contain(img_pil, (img_size,img_size)) # keep the same aspect ratio

    # Keep a copy for visualisations.
    img_pil_bis = ImageOps.pad(img_pil.copy(), size=(img_size,img_size), color=(255, 255, 255))
    img_pil = ImageOps.pad(img_pil, size=(img_size,img_size)) # pad with zero on the smallest side

    # Go to numpy
    resize_img = np.asarray(img_pil)

    # Normalize and go to torch.
    resize_img = normalize_rgb(resize_img)
    x = torch.from_numpy(resize_img).unsqueeze(0).to(device)
    return x, img_pil_bis

def get_camera_parameters(img_size, fov=60, p_x=None, p_y=None, device=torch.device('cuda')):
    """ Given image size, fov and principal point coordinates, return K the camera parameter matrix"""
    K = torch.eye(3)
    # Get focal length.
    focal = get_focalLength_from_fieldOfView(fov=fov, img_size=img_size)
    K[0,0], K[1,1] = focal, focal

    # Set principal point
    if p_x is not None and p_y is not None:
        K[0,-1], K[1,-1] = p_x * img_size, p_y * img_size
    else:
        K[0,-1], K[1,-1] = img_size//2, img_size//2

    # Add batch dimension
    K = K.unsqueeze(0).to(device)
    return K

def load_model(model_name, device=torch.device('cuda')):
    """ Open a checkpoint, build Multi-HMR using saved arguments, load the model weights. """

    # Model
    ckpt_path = os.path.join(CACHE_DIR_MULTIHMR, model_name + '.pt')
    if not os.path.isfile(ckpt_path):
        os.makedirs(CACHE_DIR_MULTIHMR, exist_ok=True)
        print(f"{ckpt_path} not found...")
        print("It should be the first time you run the demo code")
        print("Downloading checkpoint from NAVER LABS Europe website...")

        try:
            os.system(f"wget -O {ckpt_path} http://download.europe.naverlabs.com/multihmr/{model_name}.pt")
            print(f"Ckpt downloaded to {ckpt_path}")
        except:
            print("Please contact [email protected] or open an issue on the github repo")

    # Load weights
    print("Loading model")
    ckpt = torch.load(ckpt_path, map_location=device)

    # Get arguments saved in the checkpoint to rebuild the model
    kwargs = {}
    for k, v in vars(ckpt['args']).items():
        kwargs[k] = v

    # Build the model.
    kwargs['type'] = ckpt['args'].train_return_type
    kwargs['img_size'] = ckpt['args'].img_size[0]
    model = Model(**kwargs).to(device)

    # Load weights into model.
    model.load_state_dict(ckpt['model_state_dict'], strict=False)
    print("Weights have been loaded")

    return model

def forward_model(model, input_image, camera_parameters,
                  det_thresh=0.3,
                  nms_kernel_size=1,
                  ):

    """ Make a forward pass on an input image and camera parameters. """

    # Forward the model.
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=True):
            humans = model(input_image,
                           is_training=False,
                           nms_kernel_size=int(nms_kernel_size),
                           det_thresh=det_thresh,
                           K=camera_parameters)

    return humans

def overlay_human_meshes(humans, K, model, img_pil, unique_color=False):

    # Color of humans seen in the image.
    _color = [color[0] for _ in range(len(humans))] if unique_color else color

    # Get focal and princpt for rendering.
    focal = np.asarray([K[0,0,0].cpu().numpy(), K[0,1,1].cpu().numpy()])
    princpt = np.asarray([K[0,0,-1].cpu().numpy(), K[0,1,-1].cpu().numpy()])

    # Get the vertices produced by the model.
    verts_list = [humans[j]['verts_smplx'].cpu().numpy() for j in range(len(humans))]
    faces_list = [model.smpl_layer['neutral'].bm_x.faces for j in range(len(humans))]

    # Render the meshes onto the image.
    pred_rend_array = render_meshes(np.asarray(img_pil),
                                    verts_list,
                                    faces_list,
                                    {'focal': focal, 'princpt': princpt},
                                    alpha=1.0,
                                    color=_color)

    return pred_rend_array, _color

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--model_name", type=str, default='multiHMR_896_L_synth')
    parser.add_argument("--img_folder", type=str, default='example_data')
    parser.add_argument("--out_folder", type=str, default='demo_out')
    parser.add_argument("--save_mesh", type=int, default=0, choices=[0,1])
    parser.add_argument("--extra_views", type=int, default=0, choices=[0,1])
    parser.add_argument("--det_thresh", type=float, default=0.3)
    parser.add_argument("--nms_kernel_size", type=float, default=3)
    parser.add_argument("--fov", type=float, default=60)
    parser.add_argument("--distance", type=int, default=0, choices=[0,1], help='add distance on the reprojected mesh')
    parser.add_argument("--unique_color", type=int, default=0, choices=[0,1], help='only one color for all humans')

    args = parser.parse_args()

    dict_args = vars(args)

    assert torch.cuda.is_available()

    # SMPL-X models
    smplx_fn = os.path.join(SMPLX_DIR, 'smplx', 'SMPLX_NEUTRAL.npz')
    if not os.path.isfile(smplx_fn):
        print(f"{smplx_fn} not found, please download the SMPLX_NEUTRAL.npz file")
        print("To do so you need to create an account at https://smpl-x.is.tue.mpg.de")
        print("Then download 'SMPL-X-v1.1 (NPZ+PKL, 830MB) - Use this for SMPL-X Python codebase'")
        print(f"Extract the zip file and move SMPLX_NEUTRAL.npz to {smplx_fn}")
        print("Sorry for this inconvenience but we do not have a license for redistributing the SMPLX model")
        raise NotImplementedError
    else:
        print('SMPLX found')

    # SMPL mean params download
    if not os.path.isfile(MEAN_PARAMS):
        print('Start to download the SMPL mean params')
        os.system(f"wget -O {MEAN_PARAMS} https://openmmlab-share.oss-cn-hangzhou.aliyuncs.com/mmhuman3d/models/smpl_mean_params.npz?versionId=CAEQHhiBgICN6M3V6xciIDU1MzUzNjZjZGNiOTQ3OWJiZTJmNThiZmY4NmMxMTM4")
        print('SMPL mean params have been successfully downloaded')
    else:
        print('SMPL mean params are already here')

    # Input images
    suffixes = ('.jpg', '.jpeg', '.png', '.webp')
    l_img_path = [file for file in os.listdir(args.img_folder) if file.endswith(suffixes) and file[0] != '.']

    # Loading
    model = load_model(args.model_name)

    # Model name for saving results.
    model_name = os.path.basename(args.model_name)

    # All images
    os.makedirs(args.out_folder, exist_ok=True)
    l_duration = []
    for i, img_path in enumerate(tqdm(l_img_path)):

        # Path where the image + overlays of human meshes + optional views will be saved.
        save_fn = os.path.join(args.out_folder, f"{Path(img_path).stem}_{model_name}.png")

        # Get input in the right format for the model
        img_size = model.img_size
        x, img_pil_nopad = open_image(os.path.join(args.img_folder, img_path), img_size)

        # Get camera parameters
        p_x, p_y = None, None
        K = get_camera_parameters(model.img_size, fov=args.fov, p_x=p_x, p_y=p_y)

        # Make model predictions
        start = time.time()
        humans = forward_model(model, x, K,
                               det_thresh=args.det_thresh,
                               nms_kernel_size=args.nms_kernel_size)
        duration = time.time() - start
        l_duration.append(duration)

        # Superimpose predicted human meshes on the input image.
        img_array = np.asarray(img_pil_nopad)
        img_pil_visu = Image.fromarray(img_array)
        pred_rend_array, _color = overlay_human_meshes(humans, K, model, img_pil_visu, unique_color=args.unique_color)

        # Optionally add distance as an annotation to each mesh
        if args.distance:
            pred_rend_array = print_distance_on_image(pred_rend_array, humans, _color)

        # List of images to view side by side.
        l_img = [img_array, pred_rend_array]

        # More views
        if args.extra_views:
            # Render more side views of the meshes.
            pred_rend_array_bis, pred_rend_array_sideview, pred_rend_array_bev = render_side_views(img_array, _color, humans, model, K)

            # Concat
            _img1 = np.concatenate([img_array, pred_rend_array],1).astype(np.uint8)
            _img2 = np.concatenate([pred_rend_array_bis, pred_rend_array_sideview, pred_rend_array_bev],1).astype(np.uint8)
            _h = int(_img2.shape[0] * (_img1.shape[1]/_img2.shape[1]))
            _img2 = np.asarray(Image.fromarray(_img2).resize((_img1.shape[1], _h)))
            _img = np.concatenate([_img1, _img2],0).astype(np.uint8)
        else:
            # Concatenate side by side
            _img = np.concatenate([img_array, pred_rend_array],1).astype(np.uint8)

        # Save to path.
        Image.fromarray(_img).save(save_fn)
        print(f"Avg Multi-HMR inference time={int(1000*np.median(np.asarray(l_duration[-1:])))}ms on a {torch.cuda.get_device_name()}")

        # Saving mesh
        if args.save_mesh:
            # npy file
            l_mesh = [hum['verts_smplx'].cpu().numpy() for hum in humans]
            mesh_fn = save_fn+'.npy'
            np.save(mesh_fn, np.asarray(l_mesh), allow_pickle=True)
            x = np.load(mesh_fn, allow_pickle=True)

            # glb file
            l_mesh = [humans[j]['verts_smplx'].detach().cpu().numpy() for j in range(len(humans))]
            l_face = [model.smpl_layer['neutral'].bm_x.faces for j in range(len(humans))]
            scene = create_scene(img_pil_visu, l_mesh, l_face, color=None, metallicFactor=0., roughnessFactor=0.5)
            scene_fn = save_fn+'.glb'
            scene.export(scene_fn)

    print('end')
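A hedged end-to-end sketch mirroring the __main__ block above, using the helper functions from demo.py on a single example image (it assumes a CUDA device, the SMPL-X files and the downloaded checkpoint at their default locations; the output file name is arbitrary):

# Minimal single-image sketch built from the demo.py helpers.
import numpy as np
from PIL import Image
from demo import load_model, open_image, get_camera_parameters, forward_model, overlay_human_meshes

model = load_model('multiHMR_896_L_synth')                  # default --model_name
x, img_pil = open_image('example_data/170149601_13aa4e4483_c.jpg', model.img_size)
K = get_camera_parameters(model.img_size, fov=60)
humans = forward_model(model, x, K, det_thresh=0.3, nms_kernel_size=3)
rend, _ = overlay_human_meshes(humans, K, model, img_pil)
Image.fromarray(rend.astype(np.uint8)).save('demo_out_single.png')   # arbitrary output path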
example_data/170149601_13aa4e4483_c.jpg
ADDED (Git LFS)
example_data/3692623581_aca6eb02d4_e.jpg
ADDED (Git LFS)
example_data/3969570423_58eb848b75_c.jpg
ADDED (Git LFS)
example_data/39742984604_46934fbd50_c.jpg
ADDED (Git LFS)
example_data/4446582661_b188f82f3c_c.jpg
ADDED (Git LFS)
example_data/51960182045_d5d6407a3c_c.jpg
ADDED (Git LFS)
example_data/5850091922_73ba296093_c.jpg
ADDED (Git LFS)
model.py
ADDED
@@ -0,0 +1,485 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

from torch import nn
import torch
import numpy as np
import roma
import copy

from utils import unpatch, inverse_perspective_projection, undo_focal_length_normalization, undo_log_depth
from blocks import Dinov2Backbone, FourierPositionEncoding, TransformerDecoder, SMPL_Layer
from utils import rot6d_to_rotmat, rebatch, pad_to_max
import einops
from utils.constants import MEAN_PARAMS

class Model(nn.Module):
    """ A ViT backbone followed by a "HPH" head (stack of cross-attention layers with queries corresponding to detected humans). """

    def __init__(self,
                 backbone='dinov2_vitb14',
                 img_size=896,
                 camera_embedding='geometric', # geometric encodes viewing directions with Fourier encoding
                 camera_embedding_num_bands=16, # increases the size of the camera embedding
                 camera_embedding_max_resolution=64, # does not increase the size of the camera embedding
                 nearness=True, # regress log(1/z)
                 xat_depth=2, # number of cross-attention blocks (SA, CA, MLP) in the HPH head.
                 xat_num_heads=8, # number of attention heads
                 dict_smpl_layer=None,
                 person_center='head',
                 clip_dist=True,
                 *args, **kwargs):
        super().__init__()

        # Save options
        self.img_size = img_size
        self.nearness = nearness
        self.clip_dist = clip_dist
        self.xat_depth = xat_depth
        self.xat_num_heads = xat_num_heads

        # Setup backbone
        self.backbone = Dinov2Backbone(backbone)
        self.embed_dim = self.backbone.embed_dim
        self.patch_size = self.backbone.patch_size
        assert self.img_size % self.patch_size == 0, "Invalid img size"

        # Camera intrinsics
        self.fovn = 60
        self.camera_embedding = camera_embedding
        self.camera_embed_dim = 0
        if self.camera_embedding is not None:
            if not self.camera_embedding == 'geometric':
                raise NotImplementedError("Only geometric camera embedding is implemented")
            self.camera = FourierPositionEncoding(n=3, num_bands=camera_embedding_num_bands, max_resolution=camera_embedding_max_resolution)
            self.camera_embed_dim = self.camera.channels

        # Heads - Detection
        self.mlp_classif = regression_mlp([self.embed_dim, self.embed_dim, 1]) # bg or human

        # Heads - Human properties
        self.mlp_offset = regression_mlp([self.embed_dim, self.embed_dim, 2]) # offset

        # Dense vector idx
        self.nrot = 53
        self.idx_score, self.idx_offset, self.idx_dist = [0], [1,2], [3]
        self.idx_pose = list(range(4,4+self.nrot*9))
        self.idx_shape = list(range(4+self.nrot*9,4+self.nrot*9+11))
        self.idx_expr = list(range(4+self.nrot*9+11,4+self.nrot*9+11+10))

        # SMPL Layers
        dict_smpl_layer = {'neutral': {10: SMPL_Layer(type='smplx', gender='neutral', num_betas=10, kid=False, person_center=person_center)}}
        _moduleDict = []
        for k, _smpl_layer in dict_smpl_layer.items():
            _moduleDict.append([k, copy.deepcopy(_smpl_layer[10])])
        self.smpl_layer = nn.ModuleDict(_moduleDict)

        self.x_attention_head = HPH(
            num_body_joints=self.nrot-1, #23,
            context_dim=self.embed_dim + self.camera_embed_dim,
            dim=1024,
            depth=self.xat_depth,
            heads=self.xat_num_heads,
            mlp_dim=1024,
            dim_head=32,
            dropout=0.0,
            emb_dropout=0.0,
            at_token_res=self.img_size // self.patch_size)

    def detection(self, z, nms_kernel_size, det_thresh, N):
        """ Detection score on the entire low-res image """
        scores = _sigmoid(self.mlp_classif(z)) # per-token detection score.
        # Restore Height and Width dimensions.
        scores = unpatch(scores, patch_size=1, c=scores.shape[2], img_size=int(np.sqrt(N)))

        if nms_kernel_size > 1: # Easy nms: suppress adjacent high scores with max pooling.
            scores = _nms(scores, kernel=nms_kernel_size)
        _scores = torch.permute(scores, (0, 2, 3, 1))

        # Binary decision (keep confident detections)
        idx = apply_threshold(det_thresh, _scores)

        # Scores
        scores_detected = scores[idx[0], idx[3], idx[1], idx[2]] # scores of the detected humans only
        scores = torch.permute(scores, (0, 2, 3, 1))
        return scores, scores_detected, idx

    def embedd_camera(self, K, z):
        """ Embed viewing directions using Fourier encoding."""
        bs = z.shape[0]
        _h, _w = list(z.shape[-2:])
        points = torch.stack([torch.arange(0,_h,1).reshape(-1,1).repeat(1,_w), torch.arange(0,_w,1).reshape(1,-1).repeat(_h,1)],-1).to(z.device).float() # [h,w,2]
        points = points * self.patch_size + self.patch_size // 2 # move to pixel space - we give the pixel center of each token
        points = points.reshape(1,-1,2).repeat(bs,1,1) # (bs, N, 2): 2D points
        distance = torch.ones(bs,points.shape[1],1).to(K.device) # (bs, N, 1): distance in the 3D world
        rays = inverse_perspective_projection(points, K, distance) # (bs, N, 3)
        rays_embeddings = self.camera(pos=rays)

        # Repeat for each element of the batch
        z_K = rays_embeddings.reshape(bs,_h,_w,self.camera_embed_dim) # [bs,h,w,D]
        return z_K

    def to_euclidean_dist(self, x, dist, _K):
        # Focal length normalization
        focal = _K[:,[0],[0]]
        dist = undo_focal_length_normalization(dist, focal, fovn=self.fovn, img_size=x.shape[-1])
        # log space
        if self.nearness:
            dist = undo_log_depth(dist)

        # Clamping
        if self.clip_dist:
            dist = torch.clamp(dist, 0, 50)

        return dist


    def forward(self,
                x,
                idx=None,
                det_thresh=0.5,
                nms_kernel_size=3,
                K=None,
                *args,
                **kwargs):
        """
        Forward pass of the model and compute the loss according to the groundtruth
        Args:
            - x: RGB image - [bs,3,224,224]
            - idx: GT location of persons - tuple of 3 tensors of shape [p]
            - idx_j2d: GT location of 2d-kpts for each detected human - tensor of shape [bs',14,2] - location in pixel space
        Return:
            - y: [bs,D,16,16]
        """
        persons = []
        out = {}

        # Feature extraction
        z = self.backbone(x)
        B,N,C = z.size() # [bs,256,768]

        # Detection
        scores, scores_det, idx = self.detection(z, nms_kernel_size=nms_kernel_size, det_thresh=det_thresh, N=N)
        if len(idx[0]) == 0:
            # no humans detected in the frame
            return persons

        # Map of Dense Features
        z = unpatch(z, patch_size=1, c=z.shape[2], img_size=int(np.sqrt(N))) # [bs,D,16,16]
        z_all = z

        # Extract the 'central' features
        z = torch.reshape(z, (z.shape[0], 1, z.shape[1]//1, z.shape[2], z.shape[3])) # [bs,stack_K,D,16,16]
        z_central = z[idx[0],idx[3],:,idx[1],idx[2]] # dense vectors

        # 2D offset regression
        offset = self.mlp_offset(z_central)

        # Camera intrinsics
        K_det = K[idx[0]] # cameras for detected persons
        z_K = self.embedd_camera(K, z) # Embed viewing directions.
        z_central = torch.cat([z_central, z_K[idx[0],idx[1], idx[2]]], 1) # Add to query tokens.
        z_all = torch.cat([z_all, z_K.permute(0,3,1,2)], 1) # for the cross-attention only
        z = torch.cat([z, z_K.permute(0,3,1,2).unsqueeze(1)],2)

        # Distance for estimating the 3D location in 3D space
        loc = torch.stack([idx[2],idx[1]]).permute(1,0) # Moving from higher resolution the location of the pelvis
        loc = (loc + 0.5 + offset) * self.patch_size

        # SMPL parameter regression
        kv = z_all[idx[0]] # retrieving dense features associated with each central vector
        pred_smpl_params, pred_cam = self.x_attention_head(z_central, kv, idx_0=idx[0], idx_det=idx)

        # Get outputs from the SMPL layer.
        shape = pred_smpl_params['betas']
        rotmat = torch.cat([pred_smpl_params['global_orient'],pred_smpl_params['body_pose']], 1)
        expression = pred_smpl_params['expression']
        rotvec = roma.rotmat_to_rotvec(rotmat)

        # Distance
        dist = pred_cam[:, 0][:, None]
        out['dist_postprocessed'] = dist # before applying any post-processing such as focal length normalization, inverse or log
        dist = self.to_euclidean_dist(x, dist, K_det)

        # Populate output dictionary
        out.update({'scores': scores, 'offset': offset, 'dist': dist, 'expression': expression,
                    'rotmat': rotmat, 'shape': shape, 'rotvec': rotvec, 'loc': loc})

        assert rotvec.shape[0] == shape.shape[0] == loc.shape[0] == dist.shape[0], "Incoherent shapes"

        # Neutral
        smpl_out = self.smpl_layer['neutral'](rotvec, shape, loc, dist, None, K=K_det, expression=expression)
        out.update(smpl_out)

        # Populate a dictionary for each person
        for i in range(idx[0].shape[0]):
            person = {
                # Detection
                'scores': scores_det[i], # detection scores
                'loc': out['loc'][i], # 2d pixel location of the primary keypoints
                # SMPL-X params
                'transl': out['transl'][i], # from the primary keypoint i.e. the head
                'transl_pelvis': out['transl_pelvis'][i], # of the pelvis joint
                'rotvec': out['rotvec'][i],
                'expression': out['expression'][i],
                'shape': out['shape'][i],
                # SMPL-X meshes
                'verts_smplx': out['verts_smplx_cam'][i],
                'j3d_smplx': out['j3d'][i],
                'j2d_smplx': out['j2d'][i],
            }
            persons.append(person)

        return persons

class HPH(nn.Module):
    """ Cross-attention based SMPL Transformer decoder

    Code modified from:
    https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/heads/smpl_head.py#L17
    https://github.com/shubham-goel/4D-Humans/blob/a0def798c7eac811a63c8220fcc22d983b39785e/hmr2/models/components/pose_transformer.py#L301
    """

    def __init__(self,
                 num_body_joints=52,
                 context_dim=1280,
                 dim=1024,
                 depth=2,
                 heads=8,
                 mlp_dim=1024,
                 dim_head=64,
                 dropout=0.0,
                 emb_dropout=0.0,
                 at_token_res=32,
                 ):
        super().__init__()

        self.joint_rep_type, self.joint_rep_dim = '6d', 6
        self.num_body_joints = num_body_joints
        self.nrot = self.num_body_joints + 1

        npose = self.joint_rep_dim * (self.num_body_joints + 1)
        self.npose = npose

        self.depth = depth
        self.heads = heads
        self.res = at_token_res
        self.input_is_mean_shape = True
        _context_dim = context_dim # for the central features

        # Transformer Decoder setup.
        # Based on https://github.com/shubham-goel/4D-Humans/blob/8830bb330558eea2395b7f57088ef0aae7f8fa22/hmr2/configs_hydra/experiment/hmr_vit_transformer.yaml#L35
        transformer_args = dict(
            num_tokens=1,
            token_dim=(npose + 10 + 3 + _context_dim) if self.input_is_mean_shape else 1,
            dim=dim,
            depth=depth,
            heads=heads,
            mlp_dim=mlp_dim,
            dim_head=dim_head,
            dropout=dropout,
            emb_dropout=emb_dropout,
            context_dim=context_dim,
        )
        self.transformer = TransformerDecoder(**transformer_args)

        dim = transformer_args['dim']

        # Final decoders to regress targets
        self.decpose, self.decshape, self.deccam, self.decexpression = [nn.Linear(dim, od) for od in [npose, 10, 3, 10]]

        # Register buffers for the smpl layer.
        self.set_smpl_init()

        # Init learned embeddings for the cross-attention queries
        self.init_learned_queries(context_dim)


    def init_learned_queries(self, context_dim, std=0.2):
        """ Init learned embeddings for queries"""
        self.cross_queries_x = nn.Parameter(torch.zeros(self.res, context_dim))
        torch.nn.init.normal_(self.cross_queries_x, std=std)

        self.cross_queries_y = nn.Parameter(torch.zeros(self.res, context_dim))
        torch.nn.init.normal_(self.cross_queries_y, std=std)

        self.cross_values_x = nn.Parameter(torch.zeros(self.res, context_dim))
        torch.nn.init.normal_(self.cross_values_x, std=std)

        self.cross_values_y = nn.Parameter(torch.zeros(self.res, context_dim))
        torch.nn.init.normal_(self.cross_values_y, std=std)

    def set_smpl_init(self):
        """ Fetch saved SMPL parameters and register buffers."""
        mean_params = np.load(MEAN_PARAMS)
        if self.nrot == 53:
            init_body_pose = torch.eye(3).reshape(1,3,3).repeat(self.nrot,1,1)[:,:,:2].flatten(1).reshape(1, -1)
            init_body_pose[:,:24*6] = torch.from_numpy(mean_params['pose'][:]).float() # global_orient+body_pose from SMPL
        else:
            init_body_pose = torch.from_numpy(mean_params['pose'].astype(np.float32)).unsqueeze(0)

        init_betas = torch.from_numpy(mean_params['shape'].astype('float32')).unsqueeze(0)
        init_cam = torch.from_numpy(mean_params['cam'].astype(np.float32)).unsqueeze(0)
        init_betas_kid = torch.cat([init_betas, torch.zeros_like(init_betas[:,[0]])],1)
        init_expression = 0. * torch.from_numpy(mean_params['shape'].astype('float32')).unsqueeze(0)

        self.register_buffer('init_body_pose', init_body_pose)
        self.register_buffer('init_betas', init_betas)
        self.register_buffer('init_betas_kid', init_betas_kid)
        self.register_buffer('init_cam', init_cam)
        self.register_buffer('init_expression', init_expression)


    def cross_attn_inputs(self, x, x_central, idx_0, idx_det):
        """ Reshape and pad x_central to have the right shape for cross-attention processing.
        Inject learned embeddings into query and key inputs at the location of detected people. """

        h, w = x.shape[2], x.shape[3]
        x = einops.rearrange(x, 'b c h w -> b (h w) c')

        assert idx_0 is not None, "Learned cross queries only work with multicross"

        if idx_0.shape[0] > 0:
            # reconstruct the batch/nb_people dimensions: pad for images with fewer people than max.
            counts, idx_det_0 = rebatch(idx_0, idx_det)
            old_shape = x_central.shape

            # Legacy check for old versions
            assert idx_det is not None, 'idx_det needed for learned_attention'

            # xx is the tensor with all features
            xx = einops.rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
            # Get learned embeddings for queries, at positions with detected people.
            queries_xy = self.cross_queries_x[idx_det[1]] + self.cross_queries_y[idx_det[2]]
            # Add the embedding to the central features.
            x_central = x_central + queries_xy
            assert x_central.shape == old_shape, "Problem with shape"

            # Make it a tensor of dim. [batch, max_ppl_along_batch, ...]
            x_central, mask = pad_to_max(x_central, counts)

            #xx = einops.rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
            xx = xx[torch.cumsum(counts, dim=0)-1]

            # Inject learned embeddings for key/values at detected locations.
            values_xy = self.cross_values_x[idx_det[1]] + self.cross_values_y[idx_det[2]]
            xx[idx_det_0, :, idx_det[1], idx_det[2]] += values_xy

            x = einops.rearrange(xx, 'b c h w -> b (h w) c')
            num_ppl = x_central.shape[1]
        else:
            mask = None
            num_ppl = 1
            counts = None
        return x, x_central, mask, num_ppl, counts


    def forward(self,
                x_central,
                x,
                idx_0=None,
                idx_det=None,
                **kwargs):
        """
        Forward the HPH module.
        """
        batch_size = x.shape[0]

        # Reshape inputs for cross attention and inject learned embeddings for queries and values.
        x, x_central, mask, num_ppl, counts = self.cross_attn_inputs(x, x_central, idx_0, idx_det)

        # Add init (mean smpl params) to the query for each quantity being regressed.
        bs = x_central.shape[0] if idx_0.shape[0] else batch_size
        expand = lambda x: x.expand(bs, num_ppl, -1)
        pred_body_pose, pred_betas, pred_cam, pred_expression = [expand(x) for x in
                [self.init_body_pose, self.init_betas, self.init_cam, self.init_expression]]
        token = torch.cat([x_central, pred_body_pose, pred_betas, pred_cam], dim=-1)
        if len(token.shape) == 2:
            token = token[:,None,:]

        # Process query and inputs with the cross-attention module.
        token_out = self.transformer(token, context=x, mask=mask)

        # Reshape outputs from [batch_size, nmax_ppl, ...] to [total_ppl, ...]
        if mask is not None:
            # Stack along batch axis.
            token_out_list = [token_out[i, :c, ...] for i, c in enumerate(counts)]
            token_out = torch.concat(token_out_list, dim=0)
        else:
            token_out = token_out.squeeze(1) # (B, C)

        # Decode the output token and add to init for each quantity to regress.
        reshape = (lambda x: x) if idx_0.shape[0] == 0 else (lambda x: x[0, 0, ...][None, ...])
        decoders = [self.decpose, self.decshape, self.deccam, self.decexpression]
        inits = [pred_body_pose, pred_betas, pred_cam, pred_expression]
        pred_body_pose, pred_betas, pred_cam, pred_expression = [d(token_out) + reshape(i) for d, i in zip(decoders, inits)]

        # Convert self.joint_rep_type -> rotmat
        joint_conversion_fn = rot6d_to_rotmat

        # conversion
        pred_body_pose = joint_conversion_fn(pred_body_pose).view(batch_size, self.num_body_joints+1, 3, 3)

        # Build the output dict
        pred_smpl_params = {'global_orient': pred_body_pose[:, [0]],
                            'body_pose': pred_body_pose[:, 1:],
                            'betas': pred_betas,
                            #'betas_kid': pred_betas_kid,
                            'expression': pred_expression}
        return pred_smpl_params, pred_cam #, pred_smpl_params_list

def regression_mlp(layers_sizes):
    """
    Return a fully connected network.
    """
    assert len(layers_sizes) >= 2
    in_features = layers_sizes[0]
    layers = []
    for i in range(1, len(layers_sizes)-1):
        out_features = layers_sizes[i]
        layers.append(torch.nn.Linear(in_features, out_features))
        layers.append(torch.nn.ReLU())
        in_features = out_features
    layers.append(torch.nn.Linear(in_features, layers_sizes[-1]))
    return torch.nn.Sequential(*layers)

def apply_threshold(det_thresh, _scores):
    """ Apply thresholding to detection scores; if stack_K is used and det_thresh is a list, apply to each channel separately """
    if isinstance(det_thresh, list):
        det_thresh = det_thresh[0]
    idx = torch.where(_scores >= det_thresh)
    return idx

def _nms(heat, kernel=3):
    """ easy non-maximal suppression (as in CenterNet) """

    if kernel not in [2, 4]:
        pad = (kernel - 1) // 2
    else:
        if kernel == 2:
            pad = 1
        else:
            pad = 2

    hmax = nn.functional.max_pool2d(heat, (kernel, kernel), stride=1, padding=pad)

    if hmax.shape[2] > heat.shape[2]:
        hmax = hmax[:, :, :heat.shape[2], :heat.shape[3]]

    keep = (hmax == heat).float()

    return heat * keep

def _sigmoid(x):
    y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4)
    return y


if __name__ == "__main__":
    Model()
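An illustrative toy sketch of the detection post-processing used above (assuming model.py and the repo's dependencies are importable): _nms keeps only local maxima of the score map, as in CenterNet, and apply_threshold returns the indices of the confident cells.

# Toy score map: one strong peak plus an adjacent weaker response that NMS should suppress.
import torch
from model import _nms, apply_threshold

heat = torch.zeros(1, 1, 8, 8)
heat[0, 0, 2, 3] = 0.9
heat[0, 0, 2, 4] = 0.6

heat = _nms(heat, kernel=3)                  # max-pooling NMS: only the 0.9 peak survives
scores = torch.permute(heat, (0, 2, 3, 1))   # [bs, h, w, 1], as in Model.detection
idx = apply_threshold(0.3, scores)           # tuple of index tensors (batch, y, x, channel)
print(idx[1], idx[2])                        # tensor([2]) tensor([3])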
models/multiHMR/multiHMR.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:54bda6698dd3a11639d54c5ae71190817549232fa57e48072e0fa533ea52639c
size 1286462544
packages.txt
ADDED
@@ -0,0 +1,3 @@
libglfw3-dev
libgles2-mesa-dev
freeglut3-dev
requirements.txt
ADDED
@@ -0,0 +1,15 @@
torch==2.0.1
trimesh==3.22.3
pyrender==0.1.45
einops==0.6.1
roma
pillow==10.0.1
smplx
pyvista==0.42.3
numpy==1.22.4
pyglet==1.5.24
tqdm==4.65.0
#xformers==0.0.20 # does not work for CPU demo on HF
# for huggingface
gradio==4.18.0
spaces==0.19.4
utils/__init__.py
ADDED
@@ -0,0 +1,15 @@
from .humans import (get_smplx_joint_names, rot6d_to_rotmat)

from .camera import (perspective_projection, get_focalLength_from_fieldOfView, inverse_perspective_projection,
                     undo_focal_length_normalization, undo_log_depth)

from .image import normalize_rgb, unpatch

from .render import render_meshes, print_distance_on_image, render_side_views, create_scene

from .tensor_manip import rebatch, pad, pad_to_max

from .color import demo_color

from .constants import SMPLX_DIR, MEAN_PARAMS, CACHE_DIR_MULTIHMR
utils/camera.py
ADDED
@@ -0,0 +1,75 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import numpy as np
import math
import torch

OPENCV_TO_OPENGL_CAMERA_CONVENTION = np.array([[1, 0, 0, 0],
                                               [0, -1, 0, 0],
                                               [0, 0, -1, 0],
                                               [0, 0, 0, 1]])

def perspective_projection(x, K):
    """
    This function computes the perspective projection of a set of points assuming the extrinsic params have already been applied
    Args:
        - x [bs,N,3]: 3D points
        - K [bs,3,3]: Camera intrinsics params
    """
    # Apply perspective distortion
    y = x / x[:, :, -1].unsqueeze(-1) # (bs, N, 3)

    # Apply camera intrinsics
    y = torch.einsum('bij,bkj->bki', K, y) # (bs, N, 3)

    return y[:, :, :2]


def inverse_perspective_projection(points, K, distance):
    """
    This function computes the inverse perspective projection of a set of points given an estimated distance.
    Input:
        points (bs, N, 2): 2D points
        K (bs,3,3): camera intrinsics params
        distance (bs, N, 1): distance in the 3D world
    Similar to:
        - pts_l_norm = cv2.undistortPoints(np.expand_dims(pts_l, axis=1), cameraMatrix=K_l, distCoeffs=None)
    """
    # Apply camera intrinsics
    points = torch.cat([points, torch.ones_like(points[..., :1])], -1)
    points = torch.einsum('bij,bkj->bki', torch.inverse(K), points)

    # Apply perspective distortion
    if distance is None:
        return points
    points = points * distance
    return points

def get_focalLength_from_fieldOfView(fov=60, img_size=512):
    """
    Compute the focal length of the camera lens by assuming a certain FOV for the entire image
    Args:
        - fov: float, expressed in degrees
        - img_size: int
    Return:
        focal: float
    """
    focal = img_size / (2 * np.tan(np.radians(fov) / 2))
    return focal

def undo_focal_length_normalization(y, f, fovn=60, img_size=448):
    """
    Undo focal_length_normalization()
    """
    fn = get_focalLength_from_fieldOfView(fov=fovn, img_size=img_size)
    x = y * (f/fn)
    return x

def undo_log_depth(y):
    """
    Undo log_depth()
    """
    return torch.exp(y)
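A small sketch of the camera helpers above: the focal length follows focal = img_size / (2 * tan(fov/2)), and inverse_perspective_projection followed by perspective_projection round-trips back to the input pixels (assuming utils.camera is importable):

# Check the focal-length formula and the projection round-trip.
import torch
from utils.camera import (perspective_projection, inverse_perspective_projection,
                          get_focalLength_from_fieldOfView)

img_size = 896
focal = get_focalLength_from_fieldOfView(fov=60, img_size=img_size)  # 896 / (2*tan(30 deg)) ~= 776

K = torch.eye(3).unsqueeze(0)
K[0, 0, 0] = K[0, 1, 1] = focal
K[0, 0, 2] = K[0, 1, 2] = img_size / 2

pts2d = torch.tensor([[[300., 500.]]])   # (bs=1, N=1, 2) pixel coordinates
dist = torch.tensor([[[2.5]]])           # (bs=1, N=1, 1) distance along the ray
pts3d = inverse_perspective_projection(pts2d, K, dist)   # (1, 1, 3) point in camera space
back = perspective_projection(pts3d, K)                  # (1, 1, 2) should equal pts2d
print(torch.allclose(back, pts2d, atol=1e-4))            # True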
utils/color.py
ADDED
@@ -0,0 +1,22 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import numpy as np

def hex_to_rgb(hex):
    y = tuple(int(hex[i:i+2], 16) for i in (0, 2, 4))
    return (y[0]/255, y[1]/255, y[2]/255)

# Define colors for the demo
color = ['0047AB', # cobalt blue
         '6495ED', # cornflower blue
         'FF9999', 'FF9933', '00CC66', '66B2FF', 'FF6666', 'FF3333', 'C0C0C0', '9933FF'] # rosé - orange - green - blue - red - grey - violet
color = [hex_to_rgb(x) for x in color]

for i in range(200):
    color_i = list(np.random.choice(range(256), size=3))
    color.append((color_i[0]/255, color_i[1]/255, color_i[2]/255))

demo_color = color
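For reference, a quick check of hex_to_rgb above (assuming utils.color is importable): each pair of hex digits becomes one channel scaled to [0, 1].

# hex_to_rgb splits the hex string into three bytes and divides by 255.
from utils.color import hex_to_rgb

r, g, b = hex_to_rgb('0047AB')   # cobalt blue
print(round(r, 3), round(g, 3), round(b, 3))  # 0.0 0.278 0.671  (0/255, 71/255, 171/255)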
utils/constants.py
ADDED
@@ -0,0 +1,9 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import os

SMPLX_DIR = 'models'
MEAN_PARAMS = 'models/smpl_mean_params.npz'
CACHE_DIR_MULTIHMR = 'models/multiHMR'
utils/download.py
ADDED
@@ -0,0 +1,103 @@
import os
import re
import sys
from urllib import request as urlrequest

from constants import CACHE_DIR_MULTIHMR, SMPLX_DIR, MEAN_PARAMS

def _progress_bar(count, total):
    """Report download progress. Credit:
    https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113
    """
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))
    percents = round(100.0 * count / float(total), 1)
    bar = "=" * filled_len + "-" * (bar_len - filled_len)
    sys.stdout.write(
        " [{}] {}% of {:.1f}MB file \r".format(bar, percents, total / 1024 / 1024)
    )
    sys.stdout.flush()
    if count >= total:
        sys.stdout.write("\n")


def download_url(url, dst_file_path, chunk_size=8192, progress_hook=_progress_bar):
    """Download url and write it to dst_file_path. Credit:
    https://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
    """
    # url = url + "?dl=1" if "dropbox" in url else url
    req = urlrequest.Request(url)
    response = urlrequest.urlopen(req)
    total_size = response.info().get("Content-Length")
    if total_size is None:
        raise ValueError("Cannot determine size of download from {}".format(url))
    total_size = int(total_size.strip())
    bytes_so_far = 0

    with open(dst_file_path, "wb") as f:
        while 1:
            chunk = response.read(chunk_size)
            bytes_so_far += len(chunk)
            if not chunk:
                break

            if progress_hook:
                progress_hook(bytes_so_far, total_size)

            f.write(chunk)
    return bytes_so_far


def cache_url(url_or_file, cache_file_path, download=True):
    """Download the file specified by the URL to the cache_dir and return the path to
    the cached file. If the argument is not a URL, simply return it as is.
    """
    is_url = re.match(r"^(?:http)s?://", url_or_file, re.IGNORECASE) is not None
    if not is_url:
        return url_or_file
    url = url_or_file
    if os.path.exists(cache_file_path):
        return cache_file_path
    cache_file_dir = os.path.dirname(cache_file_path)
    if not os.path.exists(cache_file_dir):
        os.makedirs(cache_file_dir)
    if download:
        print("Downloading remote file {} to {}".format(url, cache_file_path))
        download_url(url, cache_file_path)
    return cache_file_path


def download_models(folder=CACHE_DIR_MULTIHMR):
    """Download checkpoints and files for running inference.
    """
    import os
    os.makedirs(folder, exist_ok=True)
    download_files = {
        "hmr2_data.tar.gz" : ["https://people.eecs.berkeley.edu/~jathushan/projects/4dhumans/hmr2_data.tar.gz", folder],
    }

    for file_name, url in download_files.items():
        output_path = os.path.join(url[1], file_name)
        if not os.path.exists(output_path):
            print("Downloading file: " + file_name)
            # output = gdown.cached_download(url[0], output_path, fuzzy=True)
            output = cache_url(url[0], output_path)
            assert os.path.exists(output_path), f"{output} does not exist"

            # if ends with tar.gz, tar -xzf
            if file_name.endswith(".tar.gz"):
                print("Extracting file: " + file_name)
                os.system("tar -xvf " + output_path + " -C " + url[1])

def check_smplx_exists():
    import os
    candidates = [
        f'{SMPLX_DIR}/data/smplx/SMPLX_NEUTRAL.npz',
        f'{MEAN_PARAMS}'
    ]
    candidates_exist = [os.path.exists(c) for c in candidates]
    if not any(candidates_exist):
        raise FileNotFoundError(f"SMPLX model not found. Please download it from https://smplify.is.tue.mpg.de/ and place it at {candidates[1]}")

    return True
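A hedged usage sketch for cache_url above. Note that download.py imports constants directly, so the sketch assumes it is run from inside utils/; the URL and cache paths below are placeholders, not project endpoints.

# Minimal sketch: local paths pass through cache_url unchanged, URLs are downloaded once and cached.
from download import cache_url  # run from within utils/ because of the `from constants import ...` line

# Not a URL: returned as-is, nothing is downloaded.
assert cache_url('models/multiHMR/multiHMR.pt', 'ignored') == 'models/multiHMR/multiHMR.pt'

# A URL (placeholder) would be fetched with download_url() and written to the cache path,
# then returned straight from the cache on subsequent calls:
# ckpt = cache_url('https://example.com/checkpoint.pt', 'models/multiHMR/checkpoint.pt')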
utils/humans.py
ADDED
@@ -0,0 +1,24 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import numpy as np
import torch.nn.functional as F
import torch
import roma
from smplx.joint_names import JOINT_NAMES

def rot6d_to_rotmat(x):
    """
    6D rotation representation to 3x3 rotation matrix.
    Args:
        x: (B,6) Batch of 6-D rotation representations.
    Returns:
        torch.Tensor: Batch of corresponding rotation matrices with shape (B,3,3).
    """
    x = x.reshape(-1,2,3).permute(0, 2, 1).contiguous()
    y = roma.special_gramschmidt(x)
    return y

def get_smplx_joint_names(*args, **kwargs):
    return JOINT_NAMES[:127]
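A small sketch of rot6d_to_rotmat above: the 6D representation packs two 3D vectors which, after the reshape, become the first two columns that roma.special_gramschmidt orthonormalizes into a rotation matrix. The identity 6D vector therefore maps to the identity matrix.

# rot6d -> rotation matrix round-trip check on the identity.
import torch
from utils.humans import rot6d_to_rotmat

six_d = torch.tensor([[1., 0., 0.,    # first vector
                       0., 1., 0.]])  # second vector
R = rot6d_to_rotmat(six_d)            # shape (1, 3, 3)
print(torch.allclose(R[0], torch.eye(3)))  # True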
utils/image.py
ADDED
@@ -0,0 +1,40 @@
# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import torch
import numpy as np

IMG_NORM_MEAN = [0.485, 0.456, 0.406]
IMG_NORM_STD = [0.229, 0.224, 0.225]


def normalize_rgb(img, imagenet_normalization=True):
    """
    Args:
        - img: np.array - (W,H,3) - np.uint8 - 0/255
    Return:
        - img: np.array - (3,W,H) - np.float - -3/3
    """
    img = img.astype(np.float32) / 255.
    img = np.transpose(img, (2,0,1))
    if imagenet_normalization:
        img = (img - np.asarray(IMG_NORM_MEAN).reshape(3,1,1)) / np.asarray(IMG_NORM_STD).reshape(3,1,1)
    img = img.astype(np.float32)
    return img

def unpatch(data, patch_size=14, c=3, img_size=224):
    # c = 3
    if len(data.shape) == 2:
        c = 1
        data = data[:,:,None].repeat([1,1,patch_size**2])

    B,N,HWC = data.shape
    HW = patch_size**2
    c = int(HWC / HW)
    h = w = int(N**.5)
    p = q = int(HW**.5)
    data = data.reshape([B,h,w,p,q,c])
    data = torch.einsum('nhwpqc->nchpwq', data)
    return data.reshape([B,c,img_size,img_size])
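A shape-oriented sketch of the two helpers above: normalize_rgb turns an HxWx3 uint8 image into a 3xHxW float array with ImageNet statistics, and unpatch with patch_size=1 reshapes a token sequence [bs, N, D] back into a feature map [bs, D, sqrt(N), sqrt(N)], which is how model.py uses it.

import numpy as np
import torch
from utils.image import normalize_rgb, unpatch

img = (np.random.rand(896, 896, 3) * 255).astype(np.uint8)
x = normalize_rgb(img)
print(x.shape, x.dtype)          # (3, 896, 896) float32

tokens = torch.randn(1, 64 * 64, 768)                      # e.g. DINOv2 ViT-B tokens for a 896x896 image
fmap = unpatch(tokens, patch_size=1, c=768, img_size=64)   # token grid back to [1, 768, 64, 64]
print(fmap.shape)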
utils/render.py
ADDED
@@ -0,0 +1,448 @@
1 |
+
# Multi-HMR
|
2 |
+
# Copyright (c) 2024-present NAVER Corp.
|
3 |
+
# CC BY-NC-SA 4.0 license
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import numpy as np
|
7 |
+
import pyrender
|
8 |
+
import trimesh
|
9 |
+
import math
|
10 |
+
from scipy.spatial.transform import Rotation
|
11 |
+
from PIL import ImageFont, ImageDraw, Image
|
12 |
+
|
13 |
+
OPENCV_TO_OPENGL_CAMERA_CONVENTION = np.array([[1, 0, 0, 0],
|
14 |
+
[0, -1, 0, 0],
|
15 |
+
[0, 0, -1, 0],
|
16 |
+
[0, 0, 0, 1]])
|
17 |
+
|
18 |
+
def geotrf(Trf, pts, ncol=None, norm=False):
|
19 |
+
""" Apply a geometric transformation to a list of 3-D points.
|
20 |
+
Trf: 3x3 or 4x4 projection matrix (typically a homography)
|
21 |
+
pts: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3)
|
22 |
+
|
23 |
+
ncol: int. number of columns of the result (2 or 3)
|
24 |
+
norm: float. If != 0, the result is projected onto the z=norm plane.
|
25 |
+
|
26 |
+
Returns an array of transformed points with ncol coordinates.
|
27 |
+
"""
|
28 |
+
assert Trf.ndim in (2,3)
|
29 |
+
if isinstance(Trf, np.ndarray):
|
30 |
+
pts = np.asarray(pts)
|
31 |
+
elif isinstance(Trf, torch.Tensor):
|
32 |
+
pts = torch.as_tensor(pts, dtype=Trf.dtype)
|
33 |
+
|
34 |
+
ncol = ncol or pts.shape[-1]
|
35 |
+
|
36 |
+
# adapt shape if necessary
|
37 |
+
output_reshape = pts.shape[:-1]
|
38 |
+
if Trf.ndim == 3:
|
39 |
+
assert len(Trf) == len(pts), 'batch size does not match'
|
40 |
+
if Trf.ndim == 3 and pts.ndim > 3:
|
41 |
+
# Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
|
42 |
+
pts = pts.reshape(pts.shape[0], -1, pts.shape[-1])
|
43 |
+
elif Trf.ndim == 3 and pts.ndim == 2:
|
44 |
+
# Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
|
45 |
+
pts = pts[:, None, :]
|
46 |
+
|
47 |
+
if pts.shape[-1]+1 == Trf.shape[-1]:
|
48 |
+
Trf = Trf.swapaxes(-1,-2) # transpose Trf
|
49 |
+
pts = pts @ Trf[...,:-1,:] + Trf[...,-1:,:]
|
50 |
+
elif pts.shape[-1] == Trf.shape[-1]:
|
51 |
+
Trf = Trf.swapaxes(-1,-2) # transpose Trf
|
52 |
+
pts = pts @ Trf
|
53 |
+
else:
|
54 |
+
pts = Trf @ pts.T
|
55 |
+
if pts.ndim >= 2: pts = pts.swapaxes(-1,-2)
|
56 |
+
if norm:
|
57 |
+
pts = pts / pts[...,-1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
|
58 |
+
if norm != 1: pts *= norm
|
59 |
+
|
60 |
+
return pts[...,:ncol].reshape(*output_reshape, ncol)
|
61 |
+
|
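A hedged usage sketch of geotrf (not part of the commit); the rigid transform and the pinhole intrinsics below are made-up illustrative values.

import numpy as np
from utils.render import geotrf

# rigid transform: translate points by +2 along z
Trf = np.eye(4)
Trf[:3, 3] = [0., 0., 2.]
pts = np.random.rand(10, 3)
out = geotrf(Trf, pts)                 # (10, 3): same points, shifted by 2 along z

# perspective projection with an illustrative pinhole intrinsics matrix
K = np.array([[600., 0., 320.],
              [0., 600., 240.],
              [0., 0., 1.]])
uv = geotrf(K, out, ncol=2, norm=1)    # (10, 2): pixel coordinates after dividing by depth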
62 |
+
def create_scene(img_pil, l_mesh, l_face, color=None, metallicFactor=0., roughnessFactor=0.5, focal=600):
|
63 |
+
|
64 |
+
scene = trimesh.Scene(
|
65 |
+
lights=trimesh.scene.lighting.Light(intensity=3.0)
|
66 |
+
)
|
67 |
+
|
68 |
+
# Human meshes
|
69 |
+
for i, mesh in enumerate(l_mesh):
|
70 |
+
if color is None:
|
71 |
+
_color = (np.random.choice(range(1,225))/255, np.random.choice(range(1,225))/255, np.random.choice(range(1,225))/255)
|
72 |
+
else:
|
73 |
+
if isinstance(color,list):
|
74 |
+
_color = color[i]
|
75 |
+
elif isinstance(color,tuple):
|
76 |
+
_color = color
|
77 |
+
else:
|
78 |
+
raise NotImplementedError
|
79 |
+
mesh = trimesh.Trimesh(mesh, l_face[i])
|
80 |
+
mesh.visual = trimesh.visual.TextureVisuals(
|
81 |
+
uv=None,
|
82 |
+
material=trimesh.visual.material.PBRMaterial(
|
83 |
+
metallicFactor=metallicFactor,
|
84 |
+
roughnessFactor=roughnessFactor,
|
85 |
+
alphaMode='OPAQUE',
|
86 |
+
baseColorFactor=(_color[0], _color[1], _color[2], 1.0)
|
87 |
+
),
|
88 |
+
image=None,
|
89 |
+
face_materials=None
|
90 |
+
)
|
91 |
+
scene.add_geometry(mesh)
|
92 |
+
|
93 |
+
# Image
|
94 |
+
H, W = img_pil.size[0], img_pil.size[1]
|
95 |
+
screen_width = 0.3
|
96 |
+
height = focal * screen_width / H
|
97 |
+
width = screen_width * 0.5**0.5
|
98 |
+
rot45 = np.eye(4)
|
99 |
+
rot45[:3,:3] = Rotation.from_euler('z',np.deg2rad(45)).as_matrix()
|
100 |
+
rot45[2,3] = -height # set the tip of the cone = optical center
|
101 |
+
aspect_ratio = np.eye(4)
|
102 |
+
aspect_ratio[0,0] = W/H
|
103 |
+
transform = OPENCV_TO_OPENGL_CAMERA_CONVENTION @ aspect_ratio @ rot45
|
104 |
+
cam = trimesh.creation.cone(width, height, sections=4, transform=transform)
|
105 |
+
# cam.apply_transform(transform)
|
106 |
+
# import ipdb
|
107 |
+
# ipdb.set_trace()
|
108 |
+
|
109 |
+
# vertices = geotrf(transform, cam.vertices[[4,5,1,3]])
|
110 |
+
vertices = cam.vertices[[4,5,1,3]]
|
111 |
+
faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
|
112 |
+
img = trimesh.Trimesh(vertices=vertices, faces=faces)
|
113 |
+
uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
|
114 |
+
# img_pil = Image.fromarray((255. * np.ones((20,20,3))).astype(np.uint8)) # white only!
|
115 |
+
material = trimesh.visual.texture.SimpleMaterial(image=img_pil,
|
116 |
+
diffuse=[255,255,255,0],
|
117 |
+
ambient=[255,255,255,0],
|
118 |
+
specular=[255,255,255,0],
|
119 |
+
glossiness=1.0)
|
120 |
+
img.visual = trimesh.visual.TextureVisuals(uv=uv_coords, image=img_pil) #, material=material)
|
121 |
+
# _main_color = [255,255,255,0]
|
122 |
+
# print(img.visual.material.ambient)
|
123 |
+
# print(img.visual.material.diffuse)
|
124 |
+
# print(img.visual.material.specular)
|
125 |
+
# print(img.visual.material.main_color)
|
126 |
+
|
127 |
+
# img.visual.material.ambient = _main_color
|
128 |
+
# img.visual.material.diffuse = _main_color
|
129 |
+
# img.visual.material.specular = _main_color
|
130 |
+
|
131 |
+
# img.visual.material.main_color = _main_color
|
132 |
+
# img.visual.material.glossiness = _main_color
|
133 |
+
scene.add_geometry(img)
|
134 |
+
|
135 |
+
# this is the camera mesh
|
136 |
+
rot2 = np.eye(4)
|
137 |
+
rot2[:3,:3] = Rotation.from_euler('z',np.deg2rad(2)).as_matrix()
|
138 |
+
# import ipdb
|
139 |
+
# ipdb.set_trace()
|
140 |
+
# vertices = cam.vertices
|
141 |
+
# print(rot2)
|
142 |
+
vertices = np.r_[cam.vertices, 0.95*cam.vertices, geotrf(rot2, cam.vertices)]
|
143 |
+
# vertices = np.r_[cam.vertices, 0.95*cam.vertices, 1.05*cam.vertices]
|
144 |
+
faces = []
|
145 |
+
for face in cam.faces:
|
146 |
+
if 0 in face: continue
|
147 |
+
a,b,c = face
|
148 |
+
a2,b2,c2 = face + len(cam.vertices)
|
149 |
+
a3,b3,c3 = face + 2*len(cam.vertices)
|
150 |
+
|
151 |
+
# add 3 pseudo-edges
|
152 |
+
faces.append((a,b,b2))
|
153 |
+
faces.append((a,a2,c))
|
154 |
+
faces.append((c2,b,c))
|
155 |
+
|
156 |
+
faces.append((a,b,b3))
|
157 |
+
faces.append((a,a3,c))
|
158 |
+
faces.append((c3,b,c))
|
159 |
+
|
160 |
+
# no culling
|
161 |
+
faces += [(c,b,a) for a,b,c in faces]
|
162 |
+
|
163 |
+
cam = trimesh.Trimesh(vertices=vertices, faces=faces)
|
164 |
+
cam.visual.face_colors[:,:3] = (255, 0, 0)
|
165 |
+
scene.add_geometry(cam)
|
166 |
+
|
167 |
+
# OpenCV to OpenGL
|
168 |
+
rot = np.eye(4)
|
169 |
+
cams2world = np.eye(4)
|
170 |
+
rot[:3,:3] = Rotation.from_euler('y',np.deg2rad(180)).as_matrix()
|
171 |
+
scene.apply_transform(np.linalg.inv(cams2world @ OPENCV_TO_OPENGL_CAMERA_CONVENTION @ rot))
|
172 |
+
|
173 |
+
return scene
|
174 |
+
|
175 |
+
def render_meshes(img, l_mesh, l_face, cam_param, color=None, alpha=1.0,
|
176 |
+
show_camera=False,
|
177 |
+
intensity=3.0,
|
178 |
+
metallicFactor=0., roughnessFactor=0.5, smooth=True,
|
179 |
+
):
|
180 |
+
"""
|
181 |
+
Render multiple meshes and project them onto the initial image.
|
182 |
+
Args:
|
183 |
+
- img: np.array [w,h,3]
|
184 |
+
- l_mesh: np.array list of [v,3]
|
185 |
+
- l_face: np.array list of [f,3]
|
186 |
+
- cam_param: dict with camera intrinsics (focal, princpt) and, optionally, extrinsics (R, t)
|
187 |
+
Return:
|
188 |
+
- img: np.array [w,h,3]
|
189 |
+
"""
|
190 |
+
# scene
|
191 |
+
scene = pyrender.Scene(ambient_light=(0.3, 0.3, 0.3))
|
192 |
+
|
193 |
+
# mesh
|
194 |
+
for i, mesh in enumerate(l_mesh):
|
195 |
+
if color is None:
|
196 |
+
_color = (np.random.choice(range(1,225))/255, np.random.choice(range(1,225))/255, np.random.choice(range(1,225))/255)
|
197 |
+
else:
|
198 |
+
if isinstance(color,list):
|
199 |
+
_color = color[i]
|
200 |
+
elif isinstance(color,tuple):
|
201 |
+
_color = color
|
202 |
+
else:
|
203 |
+
raise NotImplementedError
|
204 |
+
mesh = trimesh.Trimesh(mesh, l_face[i])
|
205 |
+
|
206 |
+
# import ipdb
|
207 |
+
# ipdb.set_trace()
|
208 |
+
|
209 |
+
# mesh.visual = trimesh.visual.TextureVisuals(
|
210 |
+
# uv=None,
|
211 |
+
# material=trimesh.visual.material.PBRMaterial(
|
212 |
+
# metallicFactor=metallicFactor,
|
213 |
+
# roughnessFactor=roughnessFactor,
|
214 |
+
# alphaMode='OPAQUE',
|
215 |
+
# baseColorFactor=(_color[0], _color[1], _color[2], 1.0)
|
216 |
+
# ),
|
217 |
+
# image=None,
|
218 |
+
# face_materials=None
|
219 |
+
# )
|
220 |
+
# print('saving')
|
221 |
+
# mesh.export('human.obj')
|
222 |
+
# mesh = trimesh.load('human.obj')
|
223 |
+
# print('loading')
|
224 |
+
# mesh = pyrender.Mesh.from_trimesh(mesh, smooth=smooth)
|
225 |
+
|
226 |
+
material = pyrender.MetallicRoughnessMaterial(
|
227 |
+
metallicFactor=metallicFactor,
|
228 |
+
roughnessFactor=roughnessFactor,
|
229 |
+
alphaMode='OPAQUE',
|
230 |
+
baseColorFactor=(_color[0], _color[1], _color[2], 1.0))
|
231 |
+
mesh = pyrender.Mesh.from_trimesh(mesh, material=material, smooth=smooth)
|
232 |
+
scene.add(mesh, f"mesh_{i}")
|
233 |
+
|
234 |
+
# Adding coordinate system at (0,0,2) for the moment
|
235 |
+
# Using lines defined in pyramid https://docs.pyvista.org/version/stable/api/utilities/_autosummary/pyvista.Pyramid.html
|
236 |
+
if show_camera:
|
237 |
+
import pyvista
|
238 |
+
|
239 |
+
def get_faces(x):
|
240 |
+
return x.faces.astype(np.uint32).reshape((x.n_faces, 4))[:, 1:]
|
241 |
+
|
242 |
+
# Camera = Box + Cone (or Cylinder?)
|
243 |
+
material_cam = pyrender.MetallicRoughnessMaterial(metallicFactor=metallicFactor, roughnessFactor=roughnessFactor, alphaMode='OPAQUE', baseColorFactor=(0.5,0.5,0.5))
|
244 |
+
height = 0.2
|
245 |
+
radius = 0.1
|
246 |
+
cone = pyvista.Cone(center=(0.0, 0.0, -height/2), direction=(0.0, 0.0, -1.0), height=height, radius=radius).extract_surface().triangulate()
|
247 |
+
verts = cone.points
|
248 |
+
mesh = pyrender.Mesh.from_trimesh(trimesh.Trimesh(verts, get_faces(cone)), material=material_cam, smooth=smooth)
|
249 |
+
scene.add(mesh, f"cone")
|
250 |
+
|
251 |
+
size = 0.1
|
252 |
+
box = pyvista.Box(bounds=(-size, size,
|
253 |
+
-size, size,
|
254 |
+
verts[:,-1].min() - 3*size, verts[:,-1].min())).extract_surface().triangulate()
|
255 |
+
verts = box.points
|
256 |
+
mesh = pyrender.Mesh.from_trimesh(trimesh.Trimesh(verts, get_faces(box)), material=material_cam, smooth=smooth)
|
257 |
+
scene.add(mesh, f"box")
|
258 |
+
|
259 |
+
|
260 |
+
# Coordinate system
|
261 |
+
# https://docs.pyvista.org/version/stable/api/utilities/_autosummary/pyvista.Arrow.html
|
262 |
+
l_color = [(1,0,0,1.0), (0,1,0,1.0), (0,0,1,1.0)]
|
263 |
+
l_direction = [(1,0,0), (0,1,0), (0,0,1)]
|
264 |
+
scale = 0.2
|
265 |
+
pose3d = [2*scale, 0.0, -scale]
|
266 |
+
for i in range(len(l_color)):
|
267 |
+
arrow = pyvista.Arrow(direction=l_direction[i], scale=scale)
|
268 |
+
arrow = arrow.extract_surface().triangulate()
|
269 |
+
verts = arrow.points + np.asarray([pose3d])
|
270 |
+
faces = arrow.faces.astype(np.uint32).reshape((arrow.n_faces, 4))[:, 1:]
|
271 |
+
mesh = trimesh.Trimesh(verts, faces)
|
272 |
+
material = pyrender.MetallicRoughnessMaterial(metallicFactor=metallicFactor, roughnessFactor=roughnessFactor, alphaMode='OPAQUE', baseColorFactor=l_color[i])
|
273 |
+
mesh = pyrender.Mesh.from_trimesh(mesh, material=material, smooth=smooth)
|
274 |
+
scene.add(mesh, f"arrow_{i}")
|
275 |
+
|
276 |
+
focal, princpt = cam_param['focal'], cam_param['princpt']
|
277 |
+
camera_pose = np.eye(4)
|
278 |
+
if 'R' in cam_param.keys():
|
279 |
+
camera_pose[:3, :3] = cam_param['R']
|
280 |
+
if 't' in cam_param.keys():
|
281 |
+
camera_pose[:3, 3] = cam_param['t']
|
282 |
+
camera = pyrender.IntrinsicsCamera(fx=focal[0], fy=focal[1], cx=princpt[0], cy=princpt[1])
|
283 |
+
|
284 |
+
# camera
|
285 |
+
camera_pose = OPENCV_TO_OPENGL_CAMERA_CONVENTION @ camera_pose
|
286 |
+
camera_pose = np.linalg.inv(camera_pose)
|
287 |
+
scene.add(camera, pose=camera_pose)
|
288 |
+
|
289 |
+
# renderer
|
290 |
+
renderer = pyrender.OffscreenRenderer(viewport_width=img.shape[1], viewport_height=img.shape[0], point_size=1.0)
|
291 |
+
|
292 |
+
# light
|
293 |
+
light = pyrender.DirectionalLight(intensity=intensity)
|
294 |
+
scene.add(light, pose=camera_pose)
|
295 |
+
|
296 |
+
# render
|
297 |
+
rgb, depth = renderer.render(scene, flags=pyrender.RenderFlags.RGBA)
|
298 |
+
rgb = rgb[:,:,:3].astype(np.float32)
|
299 |
+
fg = (depth > 0)[:,:,None].astype(np.float32)
|
300 |
+
|
301 |
+
# Simple smoothing of the mask
|
302 |
+
bg_blending_radius = 1
|
303 |
+
bg_blending_kernel = 2.0 * torch.ones((1, 1, 2 * bg_blending_radius + 1, 2 * bg_blending_radius + 1)) / (2 * bg_blending_radius + 1) ** 2
|
304 |
+
bg_blending_bias = -torch.ones(1)
|
305 |
+
fg = fg.reshape((fg.shape[0],fg.shape[1]))
|
306 |
+
fg = torch.from_numpy(fg).unsqueeze(0)
|
307 |
+
fg = torch.clamp_min(torch.nn.functional.conv2d(fg, weight=bg_blending_kernel, bias=bg_blending_bias, padding=bg_blending_radius) * fg, 0.0)
|
308 |
+
fg = fg.permute(1,2,0).numpy()
|
309 |
+
|
310 |
+
# Alpha-blending
|
311 |
+
img = (fg * (alpha * rgb + (1.0-alpha) * img) + (1-fg) * img).astype(np.uint8)
|
312 |
+
|
313 |
+
renderer.delete()
|
314 |
+
|
315 |
+
return img.astype(np.uint8)
|
316 |
+
|
317 |
+
def length(v):
|
318 |
+
return math.sqrt(v[0]*v[0]+v[1]*v[1]+v[2]*v[2])
|
319 |
+
|
320 |
+
def cross(v0, v1):
|
321 |
+
return [
|
322 |
+
v0[1]*v1[2]-v1[1]*v0[2],
|
323 |
+
v0[2]*v1[0]-v1[2]*v0[0],
|
324 |
+
v0[0]*v1[1]-v1[0]*v0[1]]
|
325 |
+
|
326 |
+
def dot(v0, v1):
|
327 |
+
return v0[0]*v1[0]+v0[1]*v1[1]+v0[2]*v1[2]
|
328 |
+
|
329 |
+
def normalize(v, eps=1e-13):
|
330 |
+
l = length(v)
|
331 |
+
return [v[0]/(l+eps), v[1]/(l+eps), v[2]/(l+eps)]
|
332 |
+
|
333 |
+
def lookAt(eye, target, *args, **kwargs):
|
334 |
+
"""
|
335 |
+
eye is the viewpoint, target is the point being looked at, and up is the upwards direction (fixed to [0,-1,0] below).
|
336 |
+
|
337 |
+
Input should be in OpenCV format - we transform arguments to OpenGL
|
338 |
+
The computation is done in the OpenGL convention and the result is transformed back to OpenCV.
|
339 |
+
|
340 |
+
"""
|
341 |
+
# Transform from OpenCV to OpenGL format
|
342 |
+
# eye = [eye[0], -eye[1], -eye[2]]
|
343 |
+
# target = [target[0], -target[1], -target[2]]
|
344 |
+
up = [0,-1,0]
|
345 |
+
|
346 |
+
eye, at, up = eye, target, up
|
347 |
+
zaxis = normalize((at[0]-eye[0], at[1]-eye[1], at[2]-eye[2]))
|
348 |
+
xaxis = normalize(cross(zaxis, up))
|
349 |
+
yaxis = cross(xaxis, zaxis)
|
350 |
+
|
351 |
+
zaxis = [-zaxis[0],-zaxis[1],-zaxis[2]]
|
352 |
+
|
353 |
+
viewMatrix = np.asarray([
|
354 |
+
[xaxis[0], xaxis[1], xaxis[2], -dot(xaxis, eye)],
|
355 |
+
[yaxis[0], yaxis[1], yaxis[2], -dot(yaxis, eye)],
|
356 |
+
[zaxis[0], zaxis[1], zaxis[2], -dot(zaxis, eye)],
|
357 |
+
[0, 0, 0, 1]]
|
358 |
+
).reshape(4,4)
|
359 |
+
|
360 |
+
# OpenGL to OpenCV
|
361 |
+
viewMatrix = OPENCV_TO_OPENGL_CAMERA_CONVENTION @ viewMatrix
|
362 |
+
|
363 |
+
return viewMatrix
|
364 |
+
|
365 |
+
def print_distance_on_image(pred_rend_array, humans, _color):
|
366 |
+
# Add distance to the image.
|
367 |
+
font = ImageFont.load_default()
|
368 |
+
rend_pil = Image.fromarray(pred_rend_array)
|
369 |
+
draw = ImageDraw.Draw(rend_pil)
|
370 |
+
for i_hum, hum in enumerate(humans):
|
371 |
+
# distance
|
372 |
+
transl = hum['transl_pelvis'].cpu().numpy().reshape(3)
|
373 |
+
dist_cam = np.sqrt(((transl[[0,2]])**2).sum()) # discarding Y axis
|
374 |
+
# 2d - bbox
|
375 |
+
bbox = get_bbox(hum['j2d_smplx'].cpu().numpy(), factor=1.35, output_format='x1y1x2y2')
|
376 |
+
loc = [(bbox[0] + bbox[2]) / 2., bbox[1]]
|
377 |
+
txt = f"{dist_cam:.2f}m"
|
378 |
+
length = font.getlength(txt)
|
379 |
+
loc[0] = loc[0] - length // 2
|
380 |
+
fill = tuple((np.asarray(_color[i_hum]) * 255).astype(np.int32).tolist())
|
381 |
+
draw.text((loc[0], loc[1]), txt, fill=fill, font=font)
|
382 |
+
return np.asarray(rend_pil)
|
383 |
+
|
384 |
+
def get_bbox(points, factor=1., output_format='xywh'):
|
385 |
+
"""
|
386 |
+
Args:
|
387 |
+
- points: [k,2] array of 2-D keypoints
|
388 |
+
Return:
|
389 |
+
- bbox: [4] in a specific format
|
390 |
+
"""
|
391 |
+
assert len(points.shape) == 2, f"Wrong shape, expected two-dimensional array. Got shape {points.shape}"
|
392 |
+
assert points.shape[1] == 2
|
393 |
+
x1, x2 = points[:,0].min(), points[:,0].max()
|
394 |
+
y1, y2 = points[:,1].min(), points[:,1].max()
|
395 |
+
cx, cy = (x2 + x1) / 2., (y2 + y1) / 2.
|
396 |
+
sx, sy = np.abs(x2 - x1), np.abs(y2 - y1)
|
397 |
+
sx, sy = int(factor * sx), int(factor * sy)
|
398 |
+
x1, y1 = int(cx - sx / 2.), int(cy - sy / 2.)
|
399 |
+
x2, y2 = int(cx + sx / 2.), int(cy + sy / 2.)
|
400 |
+
if output_format == 'xywh':
|
401 |
+
return [x1,y1,sx,sy]
|
402 |
+
elif output_format == 'x1y1x2y2':
|
403 |
+
return [x1,y1,x2,y2]
|
404 |
+
else:
|
405 |
+
raise NotImplementedError
|
406 |
+
|
407 |
+
def render_side_views(img_array, _color, humans, model, K):
|
408 |
+
_bg = 255. # white
|
409 |
+
|
410 |
+
# camera
|
411 |
+
focal = np.asarray([K[0,0,0].cpu().numpy(),K[0,1,1].cpu().numpy()])
|
412 |
+
princpt = np.asarray([K[0,0,-1].cpu().numpy(),K[0,1,-1].cpu().numpy()])
|
413 |
+
|
414 |
+
# Get the vertices produced by the model.
|
415 |
+
l_verts = [humans[j]['verts_smplx'].cpu().numpy() for j in range(len(humans))]
|
416 |
+
l_faces = [model.smpl_layer['neutral'].bm_x.faces for j in range(len(humans))]
|
417 |
+
|
418 |
+
bg_array = 1 + 0.*img_array.copy()
|
419 |
+
if len(humans) == 0:
|
420 |
+
pred_rend_array_bis = _bg * bg_array.copy()
|
421 |
+
pred_rend_array_sideview = _bg * bg_array.copy()
|
422 |
+
pred_rend_array_bev = _bg * bg_array.copy()
|
423 |
+
else:
|
424 |
+
# Small displacement
|
425 |
+
H_bis = lookAt(eye=[2.,-1,-2], target=[0,0,3])
|
426 |
+
pred_rend_array_bis = render_meshes(_bg* bg_array.copy(), l_verts, l_faces,
|
427 |
+
{'focal': focal, 'princpt': princpt, 'R': H_bis[:3,:3], 't': H_bis[:3,3]},
|
428 |
+
alpha=1.0, color=_color, show_camera=True)
|
429 |
+
|
430 |
+
# Where to look at
|
431 |
+
l_z = []
|
432 |
+
for hum in humans:
|
433 |
+
l_z.append(hum['transl_pelvis'].cpu().numpy().reshape(-1)[-1])
|
434 |
+
target_z = np.median(np.asarray(l_z))
|
435 |
+
|
436 |
+
# Sideview
|
437 |
+
H_sideview = lookAt(eye=[2.2*target_z,0,target_z], target=[0,0,target_z])
|
438 |
+
pred_rend_array_sideview = render_meshes(_bg*bg_array.copy(), l_verts, l_faces,
|
439 |
+
{'focal': focal, 'princpt': princpt, 'R': H_sideview[:3,:3], 't': H_sideview[:3,3]},
|
440 |
+
alpha=1.0, color=_color, show_camera=True)
|
441 |
+
|
442 |
+
# Bird-Eye-View
|
443 |
+
H_bev = lookAt(eye=[0.,-2*target_z,target_z-0.001], target=[0,0,target_z])
|
444 |
+
pred_rend_array_bev = render_meshes(_bg* bg_array.copy(), l_verts, l_faces,
|
445 |
+
{'focal': focal, 'princpt': princpt, 'R': H_bev[:3,:3], 't': H_bev[:3,3]},
|
446 |
+
alpha=1.0, color=_color, show_camera=True)
|
447 |
+
|
448 |
+
return pred_rend_array_bis, pred_rend_array_sideview, pred_rend_array_bev
|
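To show how these helpers fit together, here is a short sketch; the keypoints, the 3 m target depth, and the intrinsics are assumptions, and the resulting (R, t) pair is the kind of extrinsics render_meshes expects in cam_param.

import numpy as np
from utils.render import lookAt, get_bbox

# get_bbox: box around 2-D keypoints, enlarged by a scale factor
pts2d = np.array([[100., 80.], [180., 240.], [140., 160.]])   # dummy keypoints
x1, y1, x2, y2 = get_bbox(pts2d, factor=1.35, output_format='x1y1x2y2')

# lookAt: 4x4 view matrix (OpenCV convention) for a side view of a scene about 3 m away
H = lookAt(eye=[2.2 * 3., 0., 3.], target=[0., 0., 3.])
cam_param = {'focal': np.array([600., 600.]),
             'princpt': np.array([320., 240.]),
             'R': H[:3, :3], 't': H[:3, 3]}                   # can be passed to render_meshes(...)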
utils/tensor_manip.py
ADDED
@@ -0,0 +1,45 @@
1 |
+
# Multi-HMR
|
2 |
+
# Copyright (c) 2024-present NAVER Corp.
|
3 |
+
# CC BY-NC-SA 4.0 license
|
4 |
+
|
5 |
+
import torch
|
6 |
+
|
7 |
+
def rebatch(idx_0, idx_det):
|
8 |
+
# Rebuild the batch dimension: (N, ...) is turned into (batch_dim, nb_max, ...)
|
9 |
+
# with zero padding for batch elements with fewer people.
|
10 |
+
values, counts = torch.unique(idx_0, sorted=True, return_counts=True)
|
11 |
+
#print(idx_0)
|
12 |
+
if not len(values) == values.max() + 1:
|
13 |
+
# Abnormal jumps in the idx_0: some images in the batch did not produce any inputs.
|
14 |
+
jumps = (values - torch.concat([torch.Tensor([-1]).to(values.device), values])[:-1]) - 1
|
15 |
+
offsets = torch.cumsum(jumps.int(), dim=0)
|
16 |
+
|
17 |
+
# Correcting idx_0 to account for missing batch elements
|
18 |
+
# Caveat: this correction fails if two consecutive images contain no people.
|
19 |
+
# However, two consecutive jumps are so unlikely that we treat this case as negligible.
|
20 |
+
offsets = [c * [o] for o, c in [(offsets[i], counts[i]) for i in range(offsets.shape[0])]]
|
21 |
+
offsets = torch.Tensor([e for o in offsets for e in o]).to(jumps.device).int()
|
22 |
+
idx_0 = idx_0 - offsets
|
23 |
+
idx_det_0 = idx_det[0] - offsets
|
24 |
+
else:
|
25 |
+
idx_det_0 = idx_det[0]
|
26 |
+
return counts, idx_det_0
|
27 |
+
|
28 |
+
def pad(x, padlen, dim):
|
29 |
+
assert x.shape[dim] <= padlen, "Incoherent dimensions"
|
30 |
+
if not dim == 1:
|
31 |
+
raise NotImplementedError("Not implemented for this dim.")
|
32 |
+
padded = torch.concat([x, x.new_zeros((x.shape[0], padlen - x.shape[dim],) + x.shape[2:])], dim=dim)
|
33 |
+
mask = torch.concat([x.new_ones((x.shape[0], x.shape[dim])), x.new_zeros((x.shape[0], padlen - x.shape[dim]))], dim=dim)
|
34 |
+
return padded, mask
|
35 |
+
|
36 |
+
def pad_to_max(x_central, counts):
|
37 |
+
"""Pad so that each batch images has the same number of x_central queries.
|
38 |
+
The mask is used in attention to ignore the padded (fake) queries. """
|
39 |
+
max_count = counts.max()
|
40 |
+
xlist = torch.split(x_central, tuple(counts), dim=0)
|
41 |
+
xlist2 = [x.unsqueeze(0) for x in xlist]
|
42 |
+
xlist3 = [pad(x, max_count, dim=1) for x in xlist2]
|
43 |
+
xlist4, mask = [x[0] for x in xlist3], [x[1] for x in xlist3]
|
44 |
+
x_central, mask = torch.concat(xlist4, dim=0), torch.concat(mask, dim=0)
|
45 |
+
return x_central, mask
|
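Finally, a minimal sketch of the padding helpers above; the feature dimension of 256 and the per-image counts are assumptions, and counts is passed as a tensor since the functions index it with .max() and tuple().

import torch
from utils.tensor_manip import pad, pad_to_max

# pad: right-pad along dim=1 and return the validity mask
x = torch.randn(1, 3, 256)
padded, mask = pad(x, padlen=5, dim=1)          # padded: (1, 5, 256), mask: (1, 5), zeros on the padding

# pad_to_max: 5 detected people spread over a batch of 3 images (2 + 1 + 2)
feats = torch.randn(5, 256)                     # per-person features, flattened over the batch
counts = torch.tensor([2, 1, 2])
feats_b, attn_mask = pad_to_max(feats, counts)  # feats_b: (3, 2, 256), attn_mask: (3, 2)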