whyun13 committed
Commit 882f6e2 · verified · 1 Parent(s): 828a2a7

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.

Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +4 -0
  3. CODE_OF_CONDUCT.md +80 -0
  4. CONTRIBUTING.md +31 -0
  5. LICENSE +400 -0
  6. README.md +373 -7
  7. assets/demo1.gif +0 -0
  8. assets/demo2.gif +3 -0
  9. assets/render_defaults_GQS883.pth +3 -0
  10. assets/render_defaults_PXB184.pth +3 -0
  11. assets/render_defaults_RLW104.pth +3 -0
  12. assets/render_defaults_TXB805.pth +3 -0
  13. checkpoints/ca_body/data/PXB184/body_dec.ckpt +3 -0
  14. checkpoints/ca_body/data/PXB184/config.yml +56 -0
  15. checkpoints/diffusion/c1_face/args.json +34 -0
  16. checkpoints/diffusion/c1_pose/args.json +66 -0
  17. checkpoints/guide/c1_pose/args.json +41 -0
  18. checkpoints/vq/c1_pose/args.json +43 -0
  19. checkpoints/vq/c1_pose/net_iter300000.pth +3 -0
  20. data_loaders/data.py +253 -0
  21. data_loaders/get_data.py +129 -0
  22. data_loaders/tensors.py +86 -0
  23. demo/.ipynb_checkpoints/demo-checkpoint.py +276 -0
  24. demo/demo.py +276 -0
  25. demo/install.sh +20 -0
  26. demo/requirements.txt +17 -0
  27. diffusion/fp16_util.py +250 -0
  28. diffusion/gaussian_diffusion.py +1273 -0
  29. diffusion/losses.py +83 -0
  30. diffusion/nn.py +213 -0
  31. diffusion/resample.py +168 -0
  32. diffusion/respace.py +145 -0
  33. flagged/audio/b90d90dbca93f47e8d01/audio.wav +0 -0
  34. flagged/audio/d8e03e2e6deae2f981b1/audio.wav +0 -0
  35. flagged/log.csv +4 -0
  36. model/cfg_sampler.py +33 -0
  37. model/diffusion.py +403 -0
  38. model/guide.py +222 -0
  39. model/modules/audio_encoder.py +194 -0
  40. model/modules/rotary_embedding_torch.py +139 -0
  41. model/modules/transformer_modules.py +702 -0
  42. model/utils.py +130 -0
  43. model/vqvae.py +550 -0
  44. sample/generate.py +316 -0
  45. scripts/download_alldatasets.sh +6 -0
  46. scripts/download_allmodels.sh +13 -0
  47. scripts/download_prereq.sh +9 -0
  48. scripts/installation.sh +4 -0
  49. scripts/requirements.txt +17 -0
  50. train/train_diffusion.py +83 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/demo2.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.pyc
2
+ *.pt
3
+ !dataset/*/data_stats.pth
4
+ dataset
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <[email protected]>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
1
+ # Contributing to audio2photoreal
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Pull Requests
6
+ We actively welcome your pull requests.
7
+
8
+ 1. Fork the repo and create your branch from `main`.
9
+ 2. If you've added code that should be tested, add tests.
10
+ 3. If you've changed APIs, update the documentation.
11
+ 4. Ensure the test suite passes.
12
+ 5. Make sure your code lints.
13
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14
+
15
+ ## Contributor License Agreement ("CLA")
16
+ In order to accept your pull request, we need you to submit a CLA. You only need
17
+ to do this once to work on any of Meta's open source projects.
18
+
19
+ Complete your CLA here: <https://code.facebook.com/cla>
20
+
21
+ ## Issues
22
+ We use GitHub issues to track public bugs. Please ensure your description is
23
+ clear and has sufficient instructions to be able to reproduce the issue.
24
+
25
+ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26
+ disclosure of security bugs. In those cases, please go through the process
27
+ outlined on that page and do not file a public issue.
28
+
29
+ ## License
30
+ By contributing to audio2photoreal, you agree that your contributions will be licensed
31
+ under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1,400 @@
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+ Section 1 -- Definitions.
71
+
72
+ a. Adapted Material means material subject to Copyright and Similar
73
+ Rights that is derived from or based upon the Licensed Material
74
+ and in which the Licensed Material is translated, altered,
75
+ arranged, transformed, or otherwise modified in a manner requiring
76
+ permission under the Copyright and Similar Rights held by the
77
+ Licensor. For purposes of this Public License, where the Licensed
78
+ Material is a musical work, performance, or sound recording,
79
+ Adapted Material is always produced where the Licensed Material is
80
+ synched in timed relation with a moving image.
81
+
82
+ b. Adapter's License means the license You apply to Your Copyright
83
+ and Similar Rights in Your contributions to Adapted Material in
84
+ accordance with the terms and conditions of this Public License.
85
+
86
+ c. Copyright and Similar Rights means copyright and/or similar rights
87
+ closely related to copyright including, without limitation,
88
+ performance, broadcast, sound recording, and Sui Generis Database
89
+ Rights, without regard to how the rights are labeled or
90
+ categorized. For purposes of this Public License, the rights
91
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
+ Rights.
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. NonCommercial means not primarily intended for or directed towards
116
+ commercial advantage or monetary compensation. For purposes of
117
+ this Public License, the exchange of the Licensed Material for
118
+ other material subject to Copyright and Similar Rights by digital
119
+ file-sharing or similar means is NonCommercial provided there is
120
+ no payment of monetary compensation in connection with the
121
+ exchange.
122
+
123
+ j. Share means to provide material to the public by any means or
124
+ process that requires permission under the Licensed Rights, such
125
+ as reproduction, public display, public performance, distribution,
126
+ dissemination, communication, or importation, and to make material
127
+ available to the public including in ways that members of the
128
+ public may access the material from a place and at a time
129
+ individually chosen by them.
130
+
131
+ k. Sui Generis Database Rights means rights other than copyright
132
+ resulting from Directive 96/9/EC of the European Parliament and of
133
+ the Council of 11 March 1996 on the legal protection of databases,
134
+ as amended and/or succeeded, as well as other essentially
135
+ equivalent rights anywhere in the world.
136
+
137
+ l. You means the individual or entity exercising the Licensed Rights
138
+ under this Public License. Your has a corresponding meaning.
139
+
140
+ Section 2 -- Scope.
141
+
142
+ a. License grant.
143
+
144
+ 1. Subject to the terms and conditions of this Public License,
145
+ the Licensor hereby grants You a worldwide, royalty-free,
146
+ non-sublicensable, non-exclusive, irrevocable license to
147
+ exercise the Licensed Rights in the Licensed Material to:
148
+
149
+ a. reproduce and Share the Licensed Material, in whole or
150
+ in part, for NonCommercial purposes only; and
151
+
152
+ b. produce, reproduce, and Share Adapted Material for
153
+ NonCommercial purposes only.
154
+
155
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
156
+ Exceptions and Limitations apply to Your use, this Public
157
+ License does not apply, and You do not need to comply with
158
+ its terms and conditions.
159
+
160
+ 3. Term. The term of this Public License is specified in Section
161
+ 6(a).
162
+
163
+ 4. Media and formats; technical modifications allowed. The
164
+ Licensor authorizes You to exercise the Licensed Rights in
165
+ all media and formats whether now known or hereafter created,
166
+ and to make technical modifications necessary to do so. The
167
+ Licensor waives and/or agrees not to assert any right or
168
+ authority to forbid You from making technical modifications
169
+ necessary to exercise the Licensed Rights, including
170
+ technical modifications necessary to circumvent Effective
171
+ Technological Measures. For purposes of this Public License,
172
+ simply making modifications authorized by this Section 2(a)
173
+ (4) never produces Adapted Material.
174
+
175
+ 5. Downstream recipients.
176
+
177
+ a. Offer from the Licensor -- Licensed Material. Every
178
+ recipient of the Licensed Material automatically
179
+ receives an offer from the Licensor to exercise the
180
+ Licensed Rights under the terms and conditions of this
181
+ Public License.
182
+
183
+ b. No downstream restrictions. You may not offer or impose
184
+ any additional or different terms or conditions on, or
185
+ apply any Effective Technological Measures to, the
186
+ Licensed Material if doing so restricts exercise of the
187
+ Licensed Rights by any recipient of the Licensed
188
+ Material.
189
+
190
+ 6. No endorsement. Nothing in this Public License constitutes or
191
+ may be construed as permission to assert or imply that You
192
+ are, or that Your use of the Licensed Material is, connected
193
+ with, or sponsored, endorsed, or granted official status by,
194
+ the Licensor or others designated to receive attribution as
195
+ provided in Section 3(a)(1)(A)(i).
196
+
197
+ b. Other rights.
198
+
199
+ 1. Moral rights, such as the right of integrity, are not
200
+ licensed under this Public License, nor are publicity,
201
+ privacy, and/or other similar personality rights; however, to
202
+ the extent possible, the Licensor waives and/or agrees not to
203
+ assert any such rights held by the Licensor to the limited
204
+ extent necessary to allow You to exercise the Licensed
205
+ Rights, but not otherwise.
206
+
207
+ 2. Patent and trademark rights are not licensed under this
208
+ Public License.
209
+
210
+ 3. To the extent possible, the Licensor waives any right to
211
+ collect royalties from You for the exercise of the Licensed
212
+ Rights, whether directly or through a collecting society
213
+ under any voluntary or waivable statutory or compulsory
214
+ licensing scheme. In all other cases the Licensor expressly
215
+ reserves any right to collect such royalties, including when
216
+ the Licensed Material is used other than for NonCommercial
217
+ purposes.
218
+
219
+ Section 3 -- License Conditions.
220
+
221
+ Your exercise of the Licensed Rights is expressly made subject to the
222
+ following conditions.
223
+
224
+ a. Attribution.
225
+
226
+ 1. If You Share the Licensed Material (including in modified
227
+ form), You must:
228
+
229
+ a. retain the following if it is supplied by the Licensor
230
+ with the Licensed Material:
231
+
232
+ i. identification of the creator(s) of the Licensed
233
+ Material and any others designated to receive
234
+ attribution, in any reasonable manner requested by
235
+ the Licensor (including by pseudonym if
236
+ designated);
237
+
238
+ ii. a copyright notice;
239
+
240
+ iii. a notice that refers to this Public License;
241
+
242
+ iv. a notice that refers to the disclaimer of
243
+ warranties;
244
+
245
+ v. a URI or hyperlink to the Licensed Material to the
246
+ extent reasonably practicable;
247
+
248
+ b. indicate if You modified the Licensed Material and
249
+ retain an indication of any previous modifications; and
250
+
251
+ c. indicate the Licensed Material is licensed under this
252
+ Public License, and include the text of, or the URI or
253
+ hyperlink to, this Public License.
254
+
255
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
256
+ reasonable manner based on the medium, means, and context in
257
+ which You Share the Licensed Material. For example, it may be
258
+ reasonable to satisfy the conditions by providing a URI or
259
+ hyperlink to a resource that includes the required
260
+ information.
261
+
262
+ 3. If requested by the Licensor, You must remove any of the
263
+ information required by Section 3(a)(1)(A) to the extent
264
+ reasonably practicable.
265
+
266
+ 4. If You Share Adapted Material You produce, the Adapter's
267
+ License You apply must not prevent recipients of the Adapted
268
+ Material from complying with this Public License.
269
+
270
+ Section 4 -- Sui Generis Database Rights.
271
+
272
+ Where the Licensed Rights include Sui Generis Database Rights that
273
+ apply to Your use of the Licensed Material:
274
+
275
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
+ to extract, reuse, reproduce, and Share all or a substantial
277
+ portion of the contents of the database for NonCommercial purposes
278
+ only;
279
+
280
+ b. if You include all or a substantial portion of the database
281
+ contents in a database in which You have Sui Generis Database
282
+ Rights, then the database in which You have Sui Generis Database
283
+ Rights (but not its individual contents) is Adapted Material; and
284
+
285
+ c. You must comply with the conditions in Section 3(a) if You Share
286
+ all or a substantial portion of the contents of the database.
287
+
288
+ For the avoidance of doubt, this Section 4 supplements and does not
289
+ replace Your obligations under this Public License where the Licensed
290
+ Rights include other Copyright and Similar Rights.
291
+
292
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
+
294
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
+
305
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
+
315
+ c. The disclaimer of warranties and limitation of liability provided
316
+ above shall be interpreted in a manner that, to the extent
317
+ possible, most closely approximates an absolute disclaimer and
318
+ waiver of all liability.
319
+
320
+ Section 6 -- Term and Termination.
321
+
322
+ a. This Public License applies for the term of the Copyright and
323
+ Similar Rights licensed here. However, if You fail to comply with
324
+ this Public License, then Your rights under this Public License
325
+ terminate automatically.
326
+
327
+ b. Where Your right to use the Licensed Material has terminated under
328
+ Section 6(a), it reinstates:
329
+
330
+ 1. automatically as of the date the violation is cured, provided
331
+ it is cured within 30 days of Your discovery of the
332
+ violation; or
333
+
334
+ 2. upon express reinstatement by the Licensor.
335
+
336
+ For the avoidance of doubt, this Section 6(b) does not affect any
337
+ right the Licensor may have to seek remedies for Your violations
338
+ of this Public License.
339
+
340
+ c. For the avoidance of doubt, the Licensor may also offer the
341
+ Licensed Material under separate terms or conditions or stop
342
+ distributing the Licensed Material at any time; however, doing so
343
+ will not terminate this Public License.
344
+
345
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
+ License.
347
+
348
+ Section 7 -- Other Terms and Conditions.
349
+
350
+ a. The Licensor shall not be bound by any additional or different
351
+ terms or conditions communicated by You unless expressly agreed.
352
+
353
+ b. Any arrangements, understandings, or agreements regarding the
354
+ Licensed Material not stated herein are separate from and
355
+ independent of the terms and conditions of this Public License.
356
+
357
+ Section 8 -- Interpretation.
358
+
359
+ a. For the avoidance of doubt, this Public License does not, and
360
+ shall not be interpreted to, reduce, limit, restrict, or impose
361
+ conditions on any use of the Licensed Material that could lawfully
362
+ be made without permission under this Public License.
363
+
364
+ b. To the extent possible, if any provision of this Public License is
365
+ deemed unenforceable, it shall be automatically reformed to the
366
+ minimum extent necessary to make it enforceable. If the provision
367
+ cannot be reformed, it shall be severed from this Public License
368
+ without affecting the enforceability of the remaining terms and
369
+ conditions.
370
+
371
+ c. No term or condition of this Public License will be waived and no
372
+ failure to comply consented to unless expressly agreed to by the
373
+ Licensor.
374
+
375
+ d. Nothing in this Public License constitutes or may be interpreted
376
+ as a limitation upon, or waiver of, any privileges and immunities
377
+ that apply to the Licensor or You, including from the legal
378
+ processes of any jurisdiction or authority.
379
+
380
+ =======================================================================
381
+
382
+ Creative Commons is not a party to its public
383
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
384
+ its public licenses to material it publishes and in those instances
385
+ will be considered the “Licensor.” The text of the Creative Commons
386
+ public licenses is dedicated to the public domain under the CC0 Public
387
+ Domain Dedication. Except for the limited purpose of indicating that
388
+ material is shared under a Creative Commons public license or as
389
+ otherwise permitted by the Creative Commons policies published at
390
+ creativecommons.org/policies, Creative Commons does not authorize the
391
+ use of the trademark "Creative Commons" or any other trademark or logo
392
+ of Creative Commons without its prior written consent including,
393
+ without limitation, in connection with any unauthorized modifications
394
+ to any of its public licenses or any other arrangements,
395
+ understandings, or agreements concerning use of licensed material. For
396
+ the avoidance of doubt, this paragraph does not form part of the
397
+ public licenses.
398
+
399
+ Creative Commons may be contacted at creativecommons.org.
400
+
README.md CHANGED
@@ -1,12 +1,378 @@
1
  ---
2
- title: Test Virtual
3
- emoji: 📈
4
- colorFrom: gray
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 4.38.1
8
- app_file: app.py
9
- pinned: false
10
  ---
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: test_virtual
3
+ app_file: ./demo/demo.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.38.1
 
 
6
  ---
7
+ # From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations
8
+ This repository contains a pytorch implementation of ["From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations"](https://people.eecs.berkeley.edu/~evonne_ng/projects/audio2photoreal/)
9
 
10
+ :hatching_chick: **Try out our demo [here](https://colab.research.google.com/drive/1lnX3d-3T3LaO3nlN6R8s6pPvVNAk5mdK?usp=sharing)** or continue following the steps below to run code locally!
11
+ And thanks to everyone for the support via contributions, comments, and issues!
12
+
13
+ https://github.com/facebookresearch/audio2photoreal/assets/17986358/5cba4079-275e-48b6-aecc-f84f3108c810
14
+
15
+ This codebase provides:
16
+ - train code
17
+ - test code
18
+ - pretrained motion models
19
+ - access to dataset
20
+
21
+ If you use the dataset or code, please cite our [Paper](https://arxiv.org/abs/2401.01885)
22
+
23
+ ```
24
+ @inproceedings{ng2024audio2photoreal,
25
+ title={From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations},
26
+ author={Ng, Evonne and Romero, Javier and Bagautdinov, Timur and Bai, Shaojie and Darrell, Trevor and Kanazawa, Angjoo and Richard, Alexander},
27
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
28
+ year={2024}
29
+ }
30
+ ```
31
+
32
+ ### Repository Contents
33
+
34
+ - [**Quickstart:**](#quickstart) easy gradio demo that lets you record audio and render a video
35
+ - [**Installation:**](#installation) environment setup and installation (for more details on the rendering pipeline, please refer to [Codec Avatar Body](https://github.com/facebookresearch/ca_body))
36
+ - [**Download data and models:**](#download-data-and-models) download annotations and pre-trained models
37
+ - [Dataset desc.](#dataset): description of dataset annotations
38
+ - [Visualize Dataset](#visualize-ground-truth): script for visualizing ground truth annotations
39
+ - [Model desc.](#pretrained-models): description of pretrained models
40
+ - [**Running the pretrained models:**](#running-the-pretrained-models) how to generate results files and visualize the results using the rendering pipeline.
41
+ - [Face generation](#face-generation): commands to generate the results file for the faces
42
+ - [Body generation](#body-generation): commands to generate the results file for the bodies
43
+ - [Visualization](#visualization): how to call into the rendering api. For full details, please refer to [this repo](https://github.com/facebookresearch/ca_body).
44
+ - [**Training from scratch (3 models):**](#training-from-scratch) scripts to get the training pipeline running from scratch for face, guide poses, and body models.
45
+ - [Face diffusion model](#1-face-diffusion-model)
46
+ - [Body diffusion](#2-body-diffusion-model)
47
+ - [Body vq vae](#3-body-vq-vae)
48
+ - [Body guide transformer](#4-body-guide-transformer)
49
+
50
+ We annotate code that you can directly copy and paste into your terminal using the :point_down: icon.
51
+
52
+ # Quickstart
53
+ With this demo, you can record an audio clip and select the number of samples you want to generate.
54
+
55
+ Make sure you have CUDA 11.7 and gcc/g++ 9.0 for pytorch3d compatibility.
56
+
57
+ :point_down: Install necessary components. This will do the environment configuration and install the corresponding rendering assets, prerequisite models, and pretrained models:
58
+ ```
59
+ conda create --name a2p_env python=3.9
60
+ conda activate a2p_env
61
+ sh demo/install.sh
62
+ ```
63
+ :point_down: Run the demo. You can record your audio and then render corresponding results!
64
+ ```
65
+ python -m demo.demo
66
+ ```
67
+
68
+ :microphone: First, record your audio
69
+
70
+ ![](assets/demo1.gif)
71
+
72
+ :hourglass: Hold tight because the rendering can take a while!
73
+
74
+ You can change the number of samples (1-10) you want to generate, and download your favorite video by clicking on the download button on the top right of each video.
75
+
76
+ ![](assets/demo2.gif)
77
+
78
+ # Installation
79
+ The code has been tested with CUDA 11.7, Python 3.9, and gcc/g++ 9.0.
80
+
81
+ :point_down: If you haven't done so already via the demo setup, configure the environments and download prerequisite models:
82
+ ```
83
+ conda create --name a2p_env python=3.9
84
+ conda activate a2p_env
85
+ pip install -r scripts/requirements.txt
86
+ sh scripts/download_prereq.sh
87
+ ```
88
+ :point_down: To get the rendering working, please also make sure you install [pytorch3d](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md).
89
+ ```
90
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git"
91
+ ```
92
+ Please see [CA Bodies repo](https://github.com/facebookresearch/ca_body) for more details on the renderer.
93
+
94
+ # Download data and models
95
+ To download any of the datasets, you can find them at `https://github.com/facebookresearch/audio2photoreal/releases/download/v1.0/<person_id>.zip`, where you can replace `<person_id>` with any of `PXB184`, `RLW104`, `TXB805`, or `GQS883`.
96
+ Downloading over the command line can be done with these commands:
97
+ ```
98
+ curl -L https://github.com/facebookresearch/audio2photoreal/releases/download/v1.0/<person_id>.zip -o <person_id>.zip
99
+ unzip <person_id>.zip -d dataset/
100
+ rm <person_id>.zip
101
+ ```
102
+ :point_down: To download *all* of the datasets, you can simply run the following, which will download and unpack all of them.
103
+ ```
104
+ sh scripts/download_alldatasets.sh
105
+ ```
106
+
107
+ Similarly, to download any of the models, you can find them at `http://audio2photoreal_models.berkeleyvision.org/<person_id>_models.tar`.
108
+ ```
109
+ # download the motion generation
110
+ wget http://audio2photoreal_models.berkeleyvision.org/<person_id>_models.tar
111
+ tar xvf <person_id>_models.tar
112
+ rm <person_id>_models.tar
113
+
114
+ # download the body decoder/rendering assets and place them in the right place
115
+ mkdir -p checkpoints/ca_body/data/
116
+ wget https://github.com/facebookresearch/ca_body/releases/download/v0.0.1-alpha/<person_id>.tar.gz
117
+ tar xvf <person_id>.tar.gz --directory checkpoints/ca_body/data/
118
+ rm <person_id>.tar.gz
119
+ ```
120
+ :point_down: You can also download all of the models with this script:
121
+ ```
122
+ sh scripts/download_allmodels.sh
123
+ ```
124
+ The above model script will download *both* the models for motion generation and the body decoder/rendering models. Please view the script for more details.
125
+
126
+ ### Dataset
127
+ Once the dataset is downloaded and unzipped (via `scripts/download_alldatasets.sh`), it should unpack into the following directory structure:
128
+ ```
129
+ |-- dataset/
130
+ |-- PXB184/
131
+ |-- data_stats.pth
132
+ |-- scene01_audio.wav
133
+ |-- scene01_body_pose.npy
134
+ |-- scene01_face_expression.npy
135
+ |-- scene01_missing_face_frames.npy
136
+ |-- ...
137
+ |-- scene30_audio.wav
138
+ |-- scene30_body_pose.npy
139
+ |-- scene30_face_expression.npy
140
+ |-- scene30_missing_face_frames.npy
141
+ |-- RLW104/
142
+ |-- TXB805/
143
+ |-- GQS883/
144
+ ```
145
+ Each of the four participants (`PXB184`, `RLW104`, `TXB805`, `GQS883`) should have independent "scenes" (1 to 26 or so).
146
+ For each scene, there are 3 types of data annotations that we save.
147
+ ```
148
+ *audio.wav: wave file containing the raw two-channel audio (1600*T samples) at 48kHz, i.e. 1600 audio samples per 30 fps motion frame; channel 0 is the audio associated with the current person, channel 1 is the audio associated with their conversational partner.
149
+
150
+ *body_pose.npy: (T x 104) array of joint angles in a kinematic skeleton. Not all of the joints are represented with 3DoF. Each 104-d vector can be used to reconstruct a full-body skeleton.
151
+
152
+ *face_expression.npy: (T x 256) array of facial codes, where each 256-d vector reconstructs a face mesh.
153
+
154
+ *missing_face_frames.npy: List of indices (t) where the facial code is missing or corrupted.
155
+
156
+ data_stats.pth: carries the mean and std for each modality of each person.
157
+ ```
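As a quick sanity check on these formats, below is a minimal loading sketch. The file names and array shapes come from the description above; the exact contents of `data_stats.pth` and the audio-to-frame alignment (48kHz / 30 fps = 1600 audio samples per motion frame, matching `audio_per_frame` in `data_loaders/data.py`) are assumptions you should verify against the dataloader code.

```python
import numpy as np
import torch
from scipy.io import wavfile  # assumes scipy is available in your environment

root = "dataset/PXB184"   # any downloaded participant
scene = "scene01"         # hypothetical scene id

# (T x 104) joint angles, (T x 256) facial codes, and missing-frame indices
pose = np.load(f"{root}/{scene}_body_pose.npy")
face = np.load(f"{root}/{scene}_face_expression.npy")
missing = np.load(f"{root}/{scene}_missing_face_frames.npy")

# two-channel 48kHz audio; at 30 fps this is roughly 1600 audio samples per motion frame
sr, audio = wavfile.read(f"{root}/{scene}_audio.wav")
print("pose", pose.shape, "face", face.shape, "missing", missing.shape)
print("audio", audio.shape, "at", sr, "Hz ->", audio.shape[0] // 1600, "frames vs", pose.shape[0])

# per-person, per-modality normalization statistics (keys are not documented here)
stats = torch.load(f"{root}/data_stats.pth")
print(type(stats), list(stats.keys()) if isinstance(stats, dict) else stats)
```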
158
+
159
+ For the train/val/test split the indices are defined in `data_loaders/data.py` as:
160
+ ```
161
+ train_idx = list(range(0, len(data_dict["data"]) - 6))
162
+ val_idx = list(range(len(data_dict["data"]) - 6, len(data_dict["data"]) - 4))
163
+ test_idx = list(range(len(data_dict["data"]) - 4, len(data_dict["data"])))
164
+ ```
165
+ for any of the four dataset participants we train on (i.e., the last 4 sequences are held out for test and the 2 before them for validation).
166
+
167
+ ### Visualize ground truth
168
+ If you've properly installed the rendering requirements, you can then visualize the full dataset with the following command:
169
+ ```
170
+ python -m visualize.render_anno
171
+ --save_dir <path/to/save/dir>
172
+ --data_root <path/to/data/root>
173
+ --max_seq_length <num>
174
+ ```
175
+
176
+ The videos will be chunked into lengths according to the `--max_seq_length` arg, which you can specify (the default is 600).
177
+
178
+ :point_down: For example, to visualize ground truth annotations for `PXB184`, you can run the following.
179
+ ```
180
+ python -m visualize.render_anno --save_dir vis_anno_test --data_root dataset/PXB184 --max_seq_length 600
181
+ ```
182
+
183
+ ### Pretrained models
184
+ We train person-specific models, so each person should have an associated directory. For instance, for `PXB184`, their complete models should unzip into the following structure.
185
+ ```
186
+ |-- checkpoints/
187
+ |-- diffusion/
188
+ |-- c1_face/
189
+ |-- args.json
190
+ |-- model:09d.pt
191
+ |-- c1_pose/
192
+ |-- args.json
193
+ |-- model:09d.pt
194
+ |-- guide/
195
+ |-- c1_pose/
196
+ |-- args.json
197
+ |-- checkpoints/
198
+ |-- iter-:07d.pt
199
+ |-- vq/
200
+ |-- c1_pose/
201
+ |-- args.json
202
+ |-- net_iter:06d.pth
203
+ ```
204
+ There are 4 models for each person and each model has an associated `args.json`.
205
+ 1. a face diffusion model that outputs 256 facial codes conditioned on audio
206
+ 2. a pose diffusion model that outputs 104 joint rotations conditioned on audio and guide poses
207
+ 3. a guide vq pose model that outputs vq tokens conditioned on audio at 1 fps
208
+ 4. a vq encoder-decoder model that vector quantizes the continuous 104-d pose space.
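To keep these four pieces straight, here is a conceptual sketch of how they fit together at inference time. The functions below are stand-ins that only mimic the input/output shapes described above; the real entry point is `sample.generate`, so treat this purely as a mental model rather than the repo's API.

```python
import torch

T = 600  # motion frames at 30 fps

# Illustrative stand-ins for the four pretrained networks (not the repo's API).
def face_diffusion(audio):         return torch.randn(T, 256)                 # 1) audio -> facial codes
def pose_diffusion(audio, guide):  return torch.randn(T, 104)                 # 2) audio + guide -> joint rotations
def guide_vq_transformer(audio):   return torch.randint(0, 1024, (T // 30,))  # 3) audio -> vq tokens at 1 fps
def vq_decoder(tokens):            return torch.randn(tokens.shape[0], 104)   # 4) vq tokens -> coarse guide poses

audio = torch.randn(T * 1600)  # 48kHz audio for the current person (channel 0)

face_codes = face_diffusion(audio)
guide_poses = vq_decoder(guide_vq_transformer(audio))
body_pose = pose_diffusion(audio, guide_poses)

# face_codes and body_pose are what the photoreal renderer (ca_body) consumes.
print(face_codes.shape, body_pose.shape)
```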
209
+
210
+ # Running the pretrained models
211
+ To run the pretrained models, you will need to generate the associated results files before visualizing them.
212
+
213
+ ### Face generation
214
+ To generate the results file for the face,
215
+ ```
216
+ python -m sample.generate
217
+ --model_path <path/to/model>
218
+ --num_samples <xsamples>
219
+ --num_repetitions <xreps>
220
+ --timestep_respacing ddim500
221
+ --guidance_param 10.0
222
+ ```
223
+
224
+ The `<path/to/model>` should be the path to the diffusion model that is associated with generating the face.
225
+ E.g., for participant `PXB184`, the path might be `./checkpoints/diffusion/c1_face/model000155000.pt`.
226
+ The other parameters are:
227
+ ```
228
+ --num_samples: number of samples to generate. To sample the full dataset, use 56 (except for TXB805, which is 58).
229
+ --num_repetitions: number of times to repeat the sampling, such that total number of sequences generated is (num_samples * num_repetitions).
230
+ --timestep_respacing: how many diffusion steps to take. Format will always be ddim<number>.
231
+ --guidance_param: how influential the conditioning is on the results. I usually use a range of 2.0-10.0, tending towards higher values for the face.
232
+ ```
233
+
234
+ :point_down: A full example of running the face model for `PXB184` with the provided pretrained models would then be:
235
+ ```
236
+ python -m sample.generate --model_path checkpoints/diffusion/c1_face/model000155000.pt --num_samples 10 --num_repetitions 5 --timestep_respacing ddim500 --guidance_param 10.0
237
+ ```
238
+ This generates 10 samples from the dataset with 5 repetitions each (50 sequences total). The output results file will be saved to:
239
+ `./checkpoints/diffusion/c1_face/samples_c1_face_000155000_seed10_/results.npy`
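If you want to inspect the generated file before rendering, a minimal sketch is below. It only assumes that `results.npy` is a pickled NumPy object (the usual convention for this kind of results dump); the authoritative format is whatever `sample.generate` writes, so check that script for the exact keys.

```python
import numpy as np

path = "checkpoints/diffusion/c1_face/samples_c1_face_000155000_seed10_/results.npy"
results = np.load(path, allow_pickle=True)

# np.load wraps a pickled dict in a 0-d object array; unwrap it if needed.
if results.dtype == object and results.shape == ():
    results = results.item()

if isinstance(results, dict):
    for key, value in results.items():
        print(key, type(value).__name__, getattr(value, "shape", None))
else:
    print(type(results).__name__, getattr(results, "shape", None))
```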
240
+
241
+ ### Body generation
242
+ Generating the corresponding body is very similar to generating the face, except that we now also have to feed in the model that generates the guide poses.
243
+ ```
244
+ python -m sample.generate
245
+ --model_path <path/to/model>
246
+ --resume_trans <path/to/guide/model>
247
+ --num_samples <xsamples>
248
+ --num_repetitions <xreps>
249
+ --timestep_respacing ddim500
250
+ --guidance_param 2.0
251
+ ```
252
+
253
+ :point_down: Here, `<path/to/guide/model>` should point to the guide transformer. The full command would be:
254
+ ```
255
+ python -m sample.generate --model_path checkpoints/diffusion/c1_pose/model000340000.pt --resume_trans checkpoints/guide/c1_pose/checkpoints/iter-0100000.pt --num_samples 10 --num_repetitions 5 --timestep_respacing ddim500 --guidance_param 2.0
256
+ ```
257
+ Similarly, the output will be saved to:
258
+ `./checkpoints/diffusion/c1_pose/samples_c1_pose_000340000_seed10_guide_iter-0100000.pt/results.npy`
259
+
260
+ ### Visualization
261
+ On the body generation side of things, you can also optionally pass in the `--plot` flag in order to render out the photorealistic avatar. You will also need to pass in the corresponding generated face codes with the `--face_codes` flag.
262
+ Optionally, if you already have the poses precomputed, you can also pass in the generated body with the `--pose_codes` flag.
263
+ This will save videos in the same directory as where the body's `results.npy` is stored.
264
+
265
+ :point_down: An example of the full command with *the three new flags added* is:
266
+ ```
267
+ python -m sample.generate --model_path checkpoints/diffusion/c1_pose/model000340000.pt --resume_trans checkpoints/guide/c1_pose/checkpoints/iter-0100000.pt --num_samples 10 --num_repetitions 5 --timestep_respacing ddim500 --guidance_param 2.0 --face_codes ./checkpoints/diffusion/c1_face/samples_c1_face_000155000_seed10_/results.npy --pose_codes ./checkpoints/diffusion/c1_pose/samples_c1_pose_000340000_seed10_guide_iter-0100000.pt/results.npy --plot
268
+ ```
269
+ The remaining flags can be the same as before. For the actual rendering api, please see [Codec Avatar Body](https://github.com/facebookresearch/ca_body) for installation etc.
270
+ *Important: in order to visualize the full photorealistic avatar, you will need to run the face codes first, then pass them into the body generation code.* It will not work if you try to call generate with `--plot` for the face codes.
271
+
272
+ # Training from scratch
273
+ There are four models you will need to train: 1) the face diffusion model, 2) the body diffusion model, 3) the body VQ VAE, and 4) the body guide transformer.
274
+ The only dependency is that 3) is needed for 4). All other models can be trained in parallel.
275
+
276
+ ### 1) Face diffusion model
277
+ To train the face model, you will need to run the following script:
278
+ ```
279
+ python -m train.train_diffusion
280
+ --save_dir <path/to/save/dir>
281
+ --data_root <path/to/data/root>
282
+ --batch_size <bs>
283
+ --dataset social
284
+ --data_format face
285
+ --layers 8
286
+ --heads 8
287
+ --timestep_respacing ''
288
+ --max_seq_length 600
289
+ ```
290
+ Importantly, a few of the flags are as follows:
291
+ ```
292
+ --save_dir: path to directory where all outputs are stored
293
+ --data_root: path to the directory of where to load the data from
294
+ --dataset: name of dataset to load; right now we only support the 'social' dataset
295
+ --data_format: set to 'face' for the face, as opposed to pose
296
+ --timestep_respacing: set to '' which does the default spacing of 1k diffusion steps
297
+ --max_seq_length: the maximum number of frames for a given sequence to train on
298
+ ```
299
+ :point_down: A full example for training on person `PXB184` is:
300
+ ```
301
+ python -m train.train_diffusion --save_dir checkpoints/diffusion/c1_face_test --data_root ./dataset/PXB184/ --batch_size 4 --dataset social --data_format face --layers 8 --heads 8 --timestep_respacing '' --max_seq_length 600
302
+ ```
303
+
304
+ ### 2) Body diffusion model
305
+ Training the body model is similar to training the face model, but with the following additional parameters:
306
+ ```
307
+ python -m train.train_diffusion
308
+ --save_dir <path/to/save/dir>
309
+ --data_root <path/to/data/root>
310
+ --lambda_vel <num>
311
+ --batch_size <bs>
312
+ --dataset social
313
+ --add_frame_cond 1
314
+ --data_format pose
315
+ --layers 6
316
+ --heads 8
317
+ --timestep_respacing ''
318
+ --max_seq_length 600
319
+ ```
320
+ The flags that differ from the face training are as follows:
321
+ ```
322
+ --lambda_vel: additional auxiliary loss for training with velocity
323
+ --add_frame_cond: set to '1' for 1 fps. If not specified, it will default to 30 fps.
324
+ --data_format: set to 'pose' for the body, as opposed to face
325
+ ```
326
+ :point_down: A full example for training on person `PXB184` is:
327
+ ```
328
+ python -m train.train_diffusion --save_dir checkpoints/diffusion/c1_pose_test --data_root ./dataset/PXB184/ --lambda_vel 2.0 --batch_size 4 --dataset social --add_frame_cond 1 --data_format pose --layers 6 --heads 8 --timestep_respacing '' --max_seq_length 600
329
+ ```
330
+
331
+ ### 3) Body VQ VAE
332
+ To train a vq encoder-decoder, you will need to run the following script:
333
+ ```
334
+ python -m train.train_vq
335
+ --out_dir <path/to/out/dir>
336
+ --data_root <path/to/data/root>
337
+ --batch_size <bs>
338
+ --lr 1e-3
339
+ --code_dim 1024
340
+ --output_emb_width 64
341
+ --depth 4
342
+ --dataname social
343
+ --loss_vel 0.0
344
+ --add_frame_cond 1
345
+ --data_format pose
346
+ --max_seq_length 600
347
+ ```
348
+ :point_down: For person `PXB184`, it would be:
349
+ ```
350
+ python -m train.train_vq --out_dir checkpoints/vq/c1_vq_test --data_root ./dataset/PXB184/ --lr 1e-3 --code_dim 1024 --output_emb_width 64 --depth 4 --dataname social --loss_vel 0.0 --data_format pose --batch_size 4 --add_frame_cond 1 --max_seq_length 600
351
+ ```
352
+
353
+ ### 4) Body guide transformer
354
+ Once you have the vq trained from 3) you can then pass it in to train the body guide pose transformer:
355
+ ```
356
+ python -m train.train_guide
357
+ --out_dir <path/to/out/dir>
358
+ --data_root <path/to/data/root>
359
+ --batch_size <bs>
360
+ --resume_pth <path/to/vq/model>
361
+ --add_frame_cond 1
362
+ --layers 6
363
+ --lr 2e-4
364
+ --gn
365
+ --dim 64
366
+ ```
367
+ :point_down: For person `PXB184`, it would be:
368
+ ```
369
+ python -m train.train_guide --out_dir checkpoints/guide/c1_trans_test --data_root ./dataset/PXB184/ --batch_size 4 --resume_pth checkpoints/vq/c1_vq_test/net_iter300000.pth --add_frame_cond 1 --layers 6 --lr 2e-4 --gn --dim 64
370
+ ```
371
+
372
+ After training these 4 models, you can now follow the ["Running the pretrained models"](#running-the-pretrained-models) section to generate samples and visualize results.
373
+
374
+ You can also visualize the corresponding ground truth sequences by passing in the `--render_gt` flag.
375
+
376
+
377
+ # License
378
+ The code and dataset are released under the [CC BY-NC 4.0 International license](https://github.com/facebookresearch/audio2photoreal/blob/main/LICENSE).
assets/demo1.gif ADDED
assets/demo2.gif ADDED

Git LFS Details

  • SHA256: 4d07d3817b4a23bdb0a36869a469d051b9b10fe68d9e6f02f6cc8765cd6f5bc3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.31 MB
assets/render_defaults_GQS883.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ae7ee73849e258bbb8d8a04aa674960896fc1dff8757fefbd2df1685225dd7d
3
+ size 71354547
assets/render_defaults_PXB184.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c86ba14a58d4829c8d05428f5e601072dc4bab1bdc60bc53ce6c73990e9b97d7
3
+ size 71354547
assets/render_defaults_RLW104.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:808a3fbf33115d3cc132bad48c2e95bfca29bb1847d912b1f72e5e5b4a081db5
3
+ size 71354547
assets/render_defaults_TXB805.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7985c79edfba70f83f560859f2ce214d9779a46031aa8ca6a917d8fd4417e24
3
+ size 71354547
checkpoints/ca_body/data/PXB184/body_dec.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26394ae03c1726b7c90b5633696d0eea733a3c5e423893c4e79b490c80c35ddf
3
+ size 893279810
checkpoints/ca_body/data/PXB184/config.yml ADDED
@@ -0,0 +1,56 @@
1
+
2
+ model:
3
+ class_name: ca_body.models.mesh_vae_drivable.AutoEncoder
4
+
5
+ encoder:
6
+ n_embs: 1024
7
+ noise_std: 1.0
8
+
9
+ encoder_face:
10
+ n_embs: 256
11
+ noise_std: 1.0
12
+
13
+ decoder_face:
14
+ n_latent: 256
15
+ n_vert_out: 21918
16
+
17
+ decoder:
18
+ init_uv_size: 64
19
+ n_init_channels: 64
20
+ n_min_channels: 4
21
+ n_pose_dims: 98
22
+ n_pose_enc_channels: 16
23
+ n_embs: 1024
24
+ n_embs_enc_channels: 32
25
+ n_face_embs: 256
26
+ uv_size: 1024
27
+
28
+ decoder_view:
29
+ net_uv_size: 1024
30
+
31
+ upscale_net:
32
+ n_ftrs: 4
33
+
34
+ shadow_net:
35
+ uv_size: 2048
36
+ shadow_size: 256
37
+ n_dims: 4
38
+
39
+ pose_to_shadow:
40
+ n_pose_dims: 104
41
+ uv_size: 2048
42
+
43
+ renderer:
44
+ image_height: 2048
45
+ image_width: 1334
46
+ depth_disc_ksize: 3
47
+
48
+ cal:
49
+ identity_camera: '400143'
50
+
51
+ pixel_cal:
52
+ image_height: 2048
53
+ image_width: 1334
54
+ ds_rate: 8
55
+
56
+ learn_blur: true
checkpoints/diffusion/c1_face/args.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "add_frame_cond": null,
3
+ "batch_size": 4,
4
+ "cond_mask_prob": 0.2,
5
+ "cuda": true,
6
+ "data_format": "face",
7
+ "data_root": "./dataset/PXB184/",
8
+ "dataset": "social",
9
+ "device": 0,
10
+ "diffusion_steps": 10,
11
+ "heads": 8,
12
+ "lambda_vel": 0.0,
13
+ "latent_dim": 512,
14
+ "layers": 8,
15
+ "log_interval": 1000,
16
+ "lr": 0.0001,
17
+ "lr_anneal_steps": 0,
18
+ "max_seq_length": 600,
19
+ "noise_schedule": "cosine",
20
+ "not_rotary": false,
21
+ "num_audio_layers": 3,
22
+ "num_steps": 800000,
23
+ "overwrite": false,
24
+ "resume_checkpoint": "",
25
+ "save_dir": "checkpoints/diffusion/c1_face/",
26
+ "save_interval": 5000,
27
+ "seed": 10,
28
+ "sigma_small": true,
29
+ "simplify_audio": false,
30
+ "timestep_respacing": "",
31
+ "train_platform_type": "NoPlatform",
32
+ "unconstrained": false,
33
+ "weight_decay": 0.0
34
+ }
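Each `args.json` simply records the flags the corresponding checkpoint was trained with. If you want to inspect one programmatically, a minimal sketch (not the repo's own loading code) is:

```python
import json
from argparse import Namespace

# load the recorded training configuration into an argparse-style namespace
with open("checkpoints/diffusion/c1_face/args.json") as f:
    args = Namespace(**json.load(f))

# a few fields referenced throughout the README
print(args.data_format, args.max_seq_length, args.timestep_respacing, args.cond_mask_prob)
```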
checkpoints/diffusion/c1_pose/args.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "add_frame_cond": 1.0,
3
+ "arch": "trans_enc",
4
+ "batch_size": 32,
5
+ "clip_body": false,
6
+ "clip_use_delta": false,
7
+ "clip_use_vae": false,
8
+ "cond_mask_prob": 0.1,
9
+ "cuda": true,
10
+ "data_format": "pose",
11
+ "data_root": "./dataset/PXB184/",
12
+ "dataset": "social",
13
+ "device": 0,
14
+ "diffusion_steps": 10,
15
+ "emb_trans_dec": false,
16
+ "eval_batch_size": 32,
17
+ "eval_during_training": false,
18
+ "eval_num_samples": 1000,
19
+ "eval_rep_times": 3,
20
+ "eval_split": "val",
21
+ "filter": false,
22
+ "heads": 8,
23
+ "lambda_fc": 0.0,
24
+ "lambda_hands": 0.0,
25
+ "lambda_lips": 0.0,
26
+ "lambda_rcxyz": 0.0,
27
+ "lambda_vel": 2.0,
28
+ "lambda_xyz": 0.0,
29
+ "lambda_xyz_vel": 0.0,
30
+ "latent_dim": 512,
31
+ "layers": 6,
32
+ "log_interval": 1000,
33
+ "lr": 0.0001,
34
+ "lr_anneal_steps": 0,
35
+ "max_seq_length": 600,
36
+ "no_split": false,
37
+ "noise_schedule": "cosine",
38
+ "not_rotary": false,
39
+ "num_frames": 60,
40
+ "num_steps": 800000,
41
+ "overwrite": false,
42
+ "partial": false,
43
+ "resume_checkpoint": "",
44
+ "save_dir": "checkpoints/diffusion/c1_pose/",
45
+ "save_interval": 5000,
46
+ "seed": 10,
47
+ "sigma_small": true,
48
+ "simplify_audio": false,
49
+ "split_net": false,
50
+ "timestep_respacing": "",
51
+ "train_platform_type": "NoPlatform",
52
+ "unconstrained": false,
53
+ "use_clip": false,
54
+ "use_cm": true,
55
+ "use_full_dataset": false,
56
+ "use_kp": false,
57
+ "use_mask": true,
58
+ "use_mdm": false,
59
+ "use_nort": false,
60
+ "use_nort_mdm": false,
61
+ "use_pose_pos": false,
62
+ "use_resnet": true,
63
+ "use_vae": null,
64
+ "weight_decay": 0.0,
65
+ "z_norm": true
66
+ }
checkpoints/guide/c1_pose/args.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "add_audio_pe": true,
3
+ "add_conv": true,
4
+ "add_frame_cond": 1,
5
+ "batch_size": 16,
6
+ "data_format": "pose",
7
+ "data_root": "./dataset/PXB184/",
8
+ "dataset": "social",
9
+ "dec_layers": null,
10
+ "dim": 64,
11
+ "enc_layers": null,
12
+ "eval_interval": 1000,
13
+ "filter": false,
14
+ "gamma": 0.1,
15
+ "gn": true,
16
+ "layers": 6,
17
+ "log_interval": 1000,
18
+ "lr": 0.0001,
19
+ "lr_scheduler": [
20
+ 50000,
21
+ 400000
22
+ ],
23
+ "no_split": false,
24
+ "num_audio_layers": 2,
25
+ "out_dir": "checkpoints/guide/c1_pose",
26
+ "partial": false,
27
+ "resume_pth": "checkpoints/vq/c1_pose/net_iter300000.pth",
28
+ "resume_trans": null,
29
+ "save_interval": 5000,
30
+ "seed": 10,
31
+ "simplify_audio": false,
32
+ "total_iter": 1000000,
33
+ "use_full_dataset": false,
34
+ "use_kp": false,
35
+ "use_lstm": false,
36
+ "use_nort": false,
37
+ "use_nort_mdm": false,
38
+ "use_torch": false,
39
+ "warm_up_iter": 5000,
40
+ "weight_decay": 0.1
41
+ }
checkpoints/vq/c1_pose/args.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "add_frame_cond": 1.0,
3
+ "batch_size": 16,
4
+ "code_dim": 1024,
5
+ "commit": 0.02,
6
+ "data_format": "pose",
7
+ "data_root": "./dataset/PXB184/",
8
+ "dataname": "social",
9
+ "dataset": "social",
10
+ "depth": 4,
11
+ "eval_iter": 1000,
12
+ "exp_name": "c1_pose",
13
+ "filter": false,
14
+ "gamma": 0.05,
15
+ "loss_vel": 0.0,
16
+ "lr": 0.001,
17
+ "lr_scheduler": [
18
+ 300000
19
+ ],
20
+ "max_seq_length": 600,
21
+ "nb_joints": 104,
22
+ "no_split": true,
23
+ "out_dir": "checkpoints/vq/c1_pose",
24
+ "output_emb_width": 64,
25
+ "partial": false,
26
+ "print_iter": 200,
27
+ "results_dir": "visual_results/",
28
+ "resume_pth": null,
29
+ "seed": 123,
30
+ "simplify_audio": false,
31
+ "total_iter": 10000000,
32
+ "use_full_dataset": false,
33
+ "use_kp": false,
34
+ "use_linear": false,
35
+ "use_nort": false,
36
+ "use_nort_mdm": false,
37
+ "use_quant": true,
38
+ "use_vae": false,
39
+ "visual_name": "baseline",
40
+ "warm_up_iter": 1000,
41
+ "weight_decay": 0.0,
42
+ "z_norm": true
43
+ }
checkpoints/vq/c1_pose/net_iter300000.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5649ad5e49e0e1afcd9a7390f0ee79ee66de275a67ecb1cfe7fc691cb4ceb332
3
+ size 3129275
data_loaders/data.py ADDED
@@ -0,0 +1,253 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import os
9
+ from typing import Dict, Iterable, List, Union
10
+
11
+ import numpy as np
12
+ import torch
13
+ from torch.utils import data
14
+
15
+ from utils.misc import prGreen
16
+
17
+
18
+ class Social(data.Dataset):
19
+ def __init__(
20
+ self,
21
+ args,
22
+ data_dict: Dict[str, Iterable],
23
+ split: str = "train",
24
+ chunk: bool = False,
25
+ add_padding: bool = True,
26
+ ) -> None:
27
+ if args.data_format == "face":
28
+ prGreen("[dataset.py] training face only model")
29
+ data_dict["data"] = data_dict["face"]
30
+ elif args.data_format == "pose":
31
+ prGreen("[dataset.py] training pose only model")
32
+ missing = []
33
+ for d in data_dict["data"]:
34
+ missing.append(np.ones_like(d))
35
+ data_dict["missing"] = missing
36
+
37
+ # set up variables for dataloader
38
+ self.data_format = args.data_format
39
+ self.add_frame_cond = args.add_frame_cond
40
+ self._register_keyframe_step()
41
+ self.data_root = args.data_root
42
+ self.max_seq_length = args.max_seq_length
43
+ if hasattr(args, "curr_seq_length") and args.curr_seq_length is not None:
44
+ self.max_seq_length = args.curr_seq_length
45
+ prGreen([f"[dataset.py] sequences of {self.max_seq_length}"])
46
+ self.add_padding = add_padding
47
+ self.audio_per_frame = 1600
48
+ self.max_audio_length = self.max_seq_length * self.audio_per_frame
49
+ self.min_seq_length = 400
50
+
51
+ # set up training/validation splits
52
+ train_idx = list(range(0, len(data_dict["data"]) - 6))
53
+ val_idx = list(range(len(data_dict["data"]) - 6, len(data_dict["data"]) - 4))
54
+ test_idx = list(range(len(data_dict["data"]) - 4, len(data_dict["data"])))
55
+ self.split = split
56
+ if split == "train":
57
+ self._pick_sequences(data_dict, train_idx)
58
+ elif split == "val":
59
+ self._pick_sequences(data_dict, val_idx)
60
+ else:
61
+ self._pick_sequences(data_dict, test_idx)
62
+ self.chunk = chunk
63
+ if split == "test":
64
+ print("[dataset.py] chunking data...")
65
+ self._chunk_data()
66
+ self._load_std()
67
+ prGreen(
68
+ f"[dataset.py] {split} | {len(self.data)} sequences ({self.data[0].shape}) | total len {self.total_len}"
69
+ )
70
+
71
+ def inv_transform(
72
+ self, data: Union[np.ndarray, torch.Tensor], data_type: str
73
+ ) -> Union[np.ndarray, torch.Tensor]:
74
+ if data_type == "pose":
75
+ std = self.std
76
+ mean = self.mean
77
+ elif data_type == "face":
78
+ std = self.face_std
79
+ mean = self.face_mean
80
+ elif data_type == "audio":
81
+ std = self.audio_std
82
+ mean = self.audio_mean
83
+ else:
84
+ assert False, f"datatype not defined: {data_type}"
85
+
86
+ if torch.is_tensor(data):
87
+ return data * torch.tensor(
88
+ std, device=data.device, requires_grad=False
89
+ ) + torch.tensor(mean, device=data.device, requires_grad=False)
90
+ else:
91
+ return data * std + mean
92
+
93
+ def _pick_sequences(self, data_dict: Dict[str, Iterable], idx: List[int]) -> None:
94
+ self.data = np.take(data_dict["data"], idx, axis=0)
95
+ self.missing = np.take(data_dict["missing"], idx, axis=0)
96
+ self.audio = np.take(data_dict["audio"], idx, axis=0)
97
+ self.lengths = np.take(data_dict["lengths"], idx, axis=0)
98
+ self.total_len = sum([len(d) for d in self.data])
99
+
100
+ def _load_std(self) -> None:
101
+ stats = torch.load(os.path.join(self.data_root, "data_stats.pth"))
102
+ print(
103
+ f'[dataset.py] loading from... {os.path.join(self.data_root, "data_stats.pth")}'
104
+ )
105
+ self.mean = stats["pose_mean"].reshape(-1)
106
+ self.std = stats["pose_std"].reshape(-1)
107
+ self.face_mean = stats["code_mean"]
108
+ self.face_std = stats["code_std"]
109
+ self.audio_mean = stats["audio_mean"]
110
+ self.audio_std = stats["audio_std_flat"]
111
+
112
+ def _chunk_data(self) -> None:
113
+ chunk_data = []
114
+ chunk_missing = []
115
+ chunk_lengths = []
116
+ chunk_audio = []
117
+ # create sequences of set lengths
118
+ for d_idx in range(len(self.data)):
119
+ curr_data = self.data[d_idx]
120
+ curr_missing = self.missing[d_idx]
121
+ curr_audio = self.audio[d_idx]
122
+ end_range = len(self.data[d_idx]) - self.max_seq_length
123
+ for chunk_idx in range(0, end_range, self.max_seq_length):
124
+ chunk_end = chunk_idx + self.max_seq_length
125
+ curr_data_chunk = curr_data[chunk_idx:chunk_end, :]
126
+ curr_missing_chunk = curr_missing[chunk_idx:chunk_end, :]
127
+ curr_audio_chunk = curr_audio[
128
+ chunk_idx * self.audio_per_frame : chunk_end * self.audio_per_frame,
129
+ :,
130
+ ]
131
+ if curr_data_chunk.shape[0] < self.max_seq_length:
132
+ # do not add a short chunk to the list
133
+ continue
134
+ chunk_lengths.append(curr_data_chunk.shape[0])
135
+ chunk_data.append(curr_data_chunk)
136
+ chunk_missing.append(curr_missing_chunk)
137
+ chunk_audio.append(curr_audio_chunk)
138
+ idx = np.random.permutation(len(chunk_data))
139
+ print("==> shuffle", idx)
140
+ self.data = np.take(chunk_data, idx, axis=0)
141
+ self.missing = np.take(chunk_missing, idx, axis=0)
142
+ self.lengths = np.take(chunk_lengths, idx, axis=0)
143
+ self.audio = np.take(chunk_audio, idx, axis=0)
144
+ self.total_len = len(self.data)
145
+
146
+ def _register_keyframe_step(self) -> None:
147
+ if self.add_frame_cond == 1:
148
+ self.step = 30
149
+ if self.add_frame_cond is None:
150
+ self.step = 1
151
+
152
+ def _pad_sequence(
153
+ self, sequence: np.ndarray, actual_length: int, max_length: int
154
+ ) -> np.ndarray:
155
+ sequence = np.concatenate(
156
+ (
157
+ sequence,
158
+ np.zeros((max_length - actual_length, sequence.shape[-1])),
159
+ ),
160
+ axis=0,
161
+ )
162
+ return sequence
163
+
164
+ def _get_idx(self, item: int) -> int:
165
+ cumulative_len = 0
166
+ seq_idx = 0
167
+ while item > cumulative_len:
168
+ cumulative_len += len(self.data[seq_idx])
169
+ seq_idx += 1
170
+ item = seq_idx - 1
171
+ return item
172
+
173
+ def _get_random_subsection(
174
+ self, data_dict: Dict[str, Iterable]
175
+ ) -> Dict[str, np.ndarray]:
176
+ isnonzero = False
177
+ while not isnonzero:
178
+ start = np.random.randint(0, data_dict["m_length"] - self.max_seq_length)
179
+ if self.add_padding:
180
+ length = (
181
+ np.random.randint(self.min_seq_length, self.max_seq_length)
182
+ if not self.split == "test"
183
+ else self.max_seq_length
184
+ )
185
+ else:
186
+ length = self.max_seq_length
187
+ curr_missing = data_dict["missing"][start : start + length]
188
+ isnonzero = np.any(curr_missing)
189
+ missing = curr_missing
190
+ motion = data_dict["motion"][start : start + length, :]
191
+ keyframes = motion[:: self.step]
192
+ audio = data_dict["audio"][
193
+ start * self.audio_per_frame : (start + length) * self.audio_per_frame,
194
+ :,
195
+ ]
196
+ data_dict["m_length"] = len(motion)
197
+ data_dict["k_length"] = len(keyframes)
198
+ data_dict["a_length"] = len(audio)
199
+
200
+ if data_dict["m_length"] < self.max_seq_length:
201
+ motion = self._pad_sequence(
202
+ motion, data_dict["m_length"], self.max_seq_length
203
+ )
204
+ missing = self._pad_sequence(
205
+ missing, data_dict["m_length"], self.max_seq_length
206
+ )
207
+ audio = self._pad_sequence(
208
+ audio, data_dict["a_length"], self.max_audio_length
209
+ )
210
+ max_step_length = len(np.zeros(self.max_seq_length)[:: self.step])
211
+ keyframes = self._pad_sequence(
212
+ keyframes, data_dict["k_length"], max_step_length
213
+ )
214
+ data_dict["motion"] = motion
215
+ data_dict["keyframes"] = keyframes
216
+ data_dict["audio"] = audio
217
+ data_dict["missing"] = missing
218
+ return data_dict
219
+
220
+ def __len__(self) -> int:
221
+ return self.total_len
222
+
223
+ def __getitem__(self, item: int) -> Dict[str, np.ndarray]:
224
+ # figure out which sequence to randomly sample from
225
+ if not self.split == "test":
226
+ item = self._get_idx(item)
227
+ motion = self.data[item]
228
+ audio = self.audio[item]
229
+ m_length = self.lengths[item]
230
+ missing = self.missing[item]
231
+ a_length = len(audio)
232
+ # Z Normalization
233
+ if self.data_format == "pose":
234
+ motion = (motion - self.mean) / self.std
235
+ elif self.data_format == "face":
236
+ motion = (motion - self.face_mean) / self.face_std
237
+ audio = (audio - self.audio_mean) / self.audio_std
238
+ keyframes = motion[:: self.step]
239
+ k_length = len(keyframes)
240
+ data_dict = {
241
+ "motion": motion,
242
+ "m_length": m_length,
243
+ "audio": audio,
244
+ "a_length": a_length,
245
+ "keyframes": keyframes,
246
+ "k_length": k_length,
247
+ "missing": missing,
248
+ }
249
+ if not self.split == "test" and not self.chunk:
250
+ data_dict = self._get_random_subsection(data_dict)
251
+ if self.data_format == "face":
252
+ data_dict["motion"] *= data_dict["missing"]
253
+ return data_dict
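Social z-normalizes each modality with the statistics loaded from data_stats.pth, and inv_transform undoes that mapping. A minimal round-trip sketch, assuming a dataset instance ds has already been built (see get_data.py below) so that the statistics are available:

    import numpy as np

    sample = ds[0]                                      # "motion" is normalized pose, T x 104
    raw = ds.inv_transform(sample["motion"], "pose")    # back to original units
    assert np.allclose((raw - ds.mean) / ds.std, sample["motion"])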
data_loaders/get_data.py ADDED
@@ -0,0 +1,129 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import os
9
+
10
+ from typing import Dict, List
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torchaudio
15
+ from data_loaders.data import Social
16
+ from data_loaders.tensors import social_collate
17
+ from torch.utils.data import DataLoader
18
+ from utils.misc import prGreen
19
+
20
+
21
+ def get_dataset_loader(
22
+ args,
23
+ data_dict: Dict[str, np.ndarray],
24
+ split: str = "train",
25
+ chunk: bool = False,
26
+ add_padding: bool = True,
27
+ ) -> DataLoader:
28
+ dataset = Social(
29
+ args=args,
30
+ data_dict=data_dict,
31
+ split=split,
32
+ chunk=chunk,
33
+ add_padding=add_padding,
34
+ )
35
+ loader = DataLoader(
36
+ dataset,
37
+ batch_size=args.batch_size,
38
+ shuffle=not split == "test",
39
+ num_workers=8,
40
+ drop_last=True,
41
+ collate_fn=social_collate,
42
+ pin_memory=True,
43
+ )
44
+ return loader
45
+
46
+
47
+ def _load_pose_data(
48
+ all_paths: List[str], audio_per_frame: int, flip_person: bool = False
49
+ ) -> Dict[str, List]:
50
+ data = []
51
+ face = []
52
+ audio = []
53
+ lengths = []
54
+ missing = []
55
+ for _, curr_path_name in enumerate(all_paths):
56
+ if not curr_path_name.endswith("_body_pose.npy"):
57
+ continue
58
+ # load face information and deal with missing codes
59
+ curr_code = np.load(
60
+ curr_path_name.replace("_body_pose.npy", "_face_expression.npy")
61
+ ).astype(float)
62
+ # curr_code = np.array(curr_face["codes"], dtype=float)
63
+ missing_list = np.load(
64
+ curr_path_name.replace("_body_pose.npy", "_missing_face_frames.npy")
65
+ )
66
+ if len(missing_list) == len(curr_code):
67
+ print("skipping", curr_path_name, curr_code.shape)
68
+ continue
69
+ curr_missing = np.ones_like(curr_code)
70
+ curr_missing[missing_list] = 0.0
71
+
72
+ # load pose information and deal with discontinuities
73
+ curr_pose = np.load(curr_path_name)
74
+ if "PXB184" in curr_path_name or "RLW104" in curr_path_name: # Capture 1 or 2
75
+ curr_pose[:, 3] = (curr_pose[:, 3] + np.pi) % (2 * np.pi)
76
+ curr_pose[:, 3] = (curr_pose[:, 3] + np.pi) % (2 * np.pi)
77
+
78
+ # load audio information
79
+ curr_audio, _ = torchaudio.load(
80
+ curr_path_name.replace("_body_pose.npy", "_audio.wav")
81
+ )
82
+ curr_audio = curr_audio.T
83
+ if flip_person:
84
+ prGreen("[get_data.py] flipping the dataset of left right person")
85
+ tmp = torch.zeros_like(curr_audio)
86
+ tmp[:, 1] = curr_audio[:, 0]
87
+ tmp[:, 0] = curr_audio[:, 1]
88
+ curr_audio = tmp
89
+
90
+ assert len(curr_pose) * audio_per_frame == len(
91
+ curr_audio
92
+ ), f"motion {curr_pose.shape} vs audio {curr_audio.shape}"
93
+
94
+ data.append(curr_pose)
95
+ face.append(curr_code)
96
+ missing.append(curr_missing)
97
+ audio.append(curr_audio)
98
+ lengths.append(len(curr_pose))
99
+
100
+ data_dict = {
101
+ "data": data,
102
+ "face": face,
103
+ "audio": audio,
104
+ "lengths": lengths,
105
+ "missing": missing,
106
+ }
107
+ return data_dict
108
+
109
+
110
+ def load_local_data(
111
+ data_root: str, audio_per_frame: int, flip_person: bool = False
112
+ ) -> Dict[str, List]:
113
+ if flip_person:
114
+ if "PXB184" in data_root:
115
+ data_root = data_root.replace("PXB184", "RLW104")
116
+ elif "RLW104" in data_root:
117
+ data_root = data_root.replace("RLW104", "PXB184")
118
+ elif "TXB805" in data_root:
119
+ data_root = data_root.replace("TXB805", "GQS883")
120
+ elif "GQS883" in data_root:
121
+ data_root = data_root.replace("GQS883", "TXB805")
122
+
123
+ all_paths = [os.path.join(data_root, x) for x in os.listdir(data_root)]
124
+ all_paths.sort()
125
+ return _load_pose_data(
126
+ all_paths,
127
+ audio_per_frame,
128
+ flip_person=flip_person,
129
+ )
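load_local_data builds the raw per-sequence dictionary and get_dataset_loader wraps it in a batched, collated DataLoader. A minimal usage sketch, assuming the PXB184 dataset has been downloaded and with argument values mirroring checkpoints/*/c1_pose/args.json:

    from argparse import Namespace

    from data_loaders.get_data import get_dataset_loader, load_local_data

    args = Namespace(data_format="pose", add_frame_cond=1, data_root="./dataset/PXB184/",
                     max_seq_length=600, batch_size=16)
    data_dict = load_local_data(args.data_root, audio_per_frame=1600)
    loader = get_dataset_loader(args, data_dict, split="train")
    motion, cond = next(iter(loader))
    print(motion.shape, cond["y"]["audio"].shape)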
data_loaders/tensors.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import torch
9
+ from torch.utils.data._utils.collate import default_collate
10
+
11
+
12
+ def lengths_to_mask(lengths, max_len):
13
+ mask = torch.arange(max_len, device=lengths.device).expand(
14
+ len(lengths), max_len
15
+ ) < lengths.unsqueeze(1)
16
+ return mask
17
+
18
+
19
+ def collate_tensors(batch):
20
+ dims = batch[0].dim()
21
+ max_size = [max([b.size(i) for b in batch]) for i in range(dims)]
22
+ size = (len(batch),) + tuple(max_size)
23
+ canvas = batch[0].new_zeros(size=size)
24
+ for i, b in enumerate(batch):
25
+ sub_tensor = canvas[i]
26
+ for d in range(dims):
27
+ sub_tensor = sub_tensor.narrow(d, 0, b.size(d))
28
+ sub_tensor.add_(b)
29
+ return canvas
30
+
31
+
32
+ ## social collate
33
+ def collate_v2(batch):
34
+ notnone_batches = [b for b in batch if b is not None]
35
+ databatch = [b["inp"] for b in notnone_batches]
36
+ missingbatch = [b["missing"] for b in notnone_batches]
37
+ audiobatch = [b["audio"] for b in notnone_batches]
38
+ lenbatch = [b["lengths"] for b in notnone_batches]
39
+ alenbatch = [b["audio_lengths"] for b in notnone_batches]
40
+ keyframebatch = [b["keyframes"] for b in notnone_batches]
41
+ klenbatch = [b["key_lengths"] for b in notnone_batches]
42
+
43
+ databatchTensor = collate_tensors(databatch)
44
+ missingbatchTensor = collate_tensors(missingbatch)
45
+ audiobatchTensor = collate_tensors(audiobatch)
46
+ lenbatchTensor = torch.as_tensor(lenbatch)
47
+ alenbatchTensor = torch.as_tensor(alenbatch)
48
+ keyframeTensor = collate_tensors(keyframebatch)
49
+ klenbatchTensor = torch.as_tensor(klenbatch)
50
+
51
+ maskbatchTensor = (
52
+ lengths_to_mask(lenbatchTensor, databatchTensor.shape[-1])
53
+ .unsqueeze(1)
54
+ .unsqueeze(1)
55
+ ) # unsqueeze for broadcasting
56
+ motion = databatchTensor
57
+ cond = {
58
+ "y": {
59
+ "missing": missingbatchTensor,
60
+ "mask": maskbatchTensor,
61
+ "lengths": lenbatchTensor,
62
+ "audio": audiobatchTensor,
63
+ "alengths": alenbatchTensor,
64
+ "keyframes": keyframeTensor,
65
+ "klengths": klenbatchTensor,
66
+ }
67
+ }
68
+ return motion, cond
69
+
70
+
71
+ def social_collate(batch):
72
+ adapted_batch = [
73
+ {
74
+ "inp": torch.tensor(b["motion"].T).to(torch.float32).unsqueeze(1),
75
+ "lengths": b["m_length"],
76
+ "audio": b["audio"]
77
+ if torch.is_tensor(b["audio"])
78
+ else torch.tensor(b["audio"]).to(torch.float32),
79
+ "keyframes": torch.tensor(b["keyframes"]).to(torch.float32),
80
+ "key_lengths": b["k_length"],
81
+ "audio_lengths": b["a_length"],
82
+ "missing": torch.tensor(b["missing"]).to(torch.float32),
83
+ }
84
+ for b in batch
85
+ ]
86
+ return collate_v2(adapted_batch)
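lengths_to_mask is what produces the boolean padding mask that collate_v2 stores under cond["y"]["mask"]. A small worked example of its behavior:

    import torch

    from data_loaders.tensors import lengths_to_mask

    mask = lengths_to_mask(torch.tensor([3, 5]), max_len=5)
    # tensor([[ True,  True,  True, False, False],
    #         [ True,  True,  True,  True,  True]])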
demo/.ipynb_checkpoints/demo-checkpoint.py ADDED
@@ -0,0 +1,276 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import copy
9
+ import json
10
+ from typing import Dict, Union
11
+
12
+ import gradio as gr
13
+ import numpy as np
14
+ import torch
15
+ import torchaudio
16
+ from attrdict import AttrDict
17
+ from diffusion.respace import SpacedDiffusion
18
+ from model.cfg_sampler import ClassifierFreeSampleModel
19
+ from model.diffusion import FiLMTransformer
20
+ from utils.misc import fixseed
21
+ from utils.model_util import create_model_and_diffusion, load_model
22
+ from visualize.render_codes import BodyRenderer
23
+
24
+
25
+ class GradioModel:
26
+ def __init__(self, face_args, pose_args) -> None:
27
+ self.face_model, self.face_diffusion, self.device = self._setup_model(
28
+ face_args, "checkpoints/diffusion/c1_face/model000155000.pt"
29
+ )
30
+ self.pose_model, self.pose_diffusion, _ = self._setup_model(
31
+ pose_args, "checkpoints/diffusion/c1_pose/model000340000.pt"
32
+ )
33
+ # load standardization stuff
34
+ stats = torch.load("dataset/PXB184/data_stats.pth")
35
+ stats["pose_mean"] = stats["pose_mean"].reshape(-1)
36
+ stats["pose_std"] = stats["pose_std"].reshape(-1)
37
+ self.stats = stats
38
+ # set up renderer
39
+ config_base = f"./checkpoints/ca_body/data/PXB184"
40
+ self.body_renderer = BodyRenderer(
41
+ config_base=config_base,
42
+ render_rgb=True,
43
+ )
44
+
45
+ def _setup_model(
46
+ self,
47
+ args_path: str,
48
+ model_path: str,
49
+ ) -> (Union[FiLMTransformer, ClassifierFreeSampleModel], SpacedDiffusion):
50
+ with open(args_path) as f:
51
+ args = json.load(f)
52
+ args = AttrDict(args)
53
+ args.device = "cuda:0" if torch.cuda.is_available() else "cpu"
54
+ print("running on...", args.device)
55
+ args.model_path = model_path
56
+ args.output_dir = "/tmp/gradio/"
57
+ args.timestep_respacing = "ddim100"
58
+ if args.data_format == "pose":
59
+ args.resume_trans = "checkpoints/guide/c1_pose/checkpoints/iter-0100000.pt"
60
+
61
+ ## create model
62
+ model, diffusion = create_model_and_diffusion(args, split_type="test")
63
+ print(f"Loading checkpoints from [{args.model_path}]...")
64
+ state_dict = torch.load(args.model_path, map_location=args.device)
65
+ load_model(model, state_dict)
66
+ model = ClassifierFreeSampleModel(model)
67
+ model.eval()
68
+ model.to(args.device)
69
+ return model, diffusion, args.device
70
+
71
+ def _replace_keyframes(
72
+ self,
73
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
74
+ B: int,
75
+ T: int,
76
+ top_p: float = 0.97,
77
+ ) -> torch.Tensor:
78
+ with torch.no_grad():
79
+ tokens = self.pose_model.transformer.generate(
80
+ model_kwargs["y"]["audio"],
81
+ T,
82
+ layers=self.pose_model.tokenizer.residual_depth,
83
+ n_sequences=B,
84
+ top_p=top_p,
85
+ )
86
+ tokens = tokens.reshape((B, -1, self.pose_model.tokenizer.residual_depth))
87
+ pred = self.pose_model.tokenizer.decode(tokens).detach()
88
+ return pred
89
+
90
+ def _run_single_diffusion(
91
+ self,
92
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
93
+ diffusion: SpacedDiffusion,
94
+ model: Union[FiLMTransformer, ClassifierFreeSampleModel],
95
+ curr_seq_length: int,
96
+ num_repetitions: int = 1,
97
+ ) -> (torch.Tensor,):
98
+ sample_fn = diffusion.ddim_sample_loop
99
+ with torch.no_grad():
100
+ sample = sample_fn(
101
+ model,
102
+ (num_repetitions, model.nfeats, 1, curr_seq_length),
103
+ clip_denoised=False,
104
+ model_kwargs=model_kwargs,
105
+ init_image=None,
106
+ progress=True,
107
+ dump_steps=None,
108
+ noise=None,
109
+ const_noise=False,
110
+ )
111
+ return sample
112
+
113
+ def generate_sequences(
114
+ self,
115
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
116
+ data_format: str,
117
+ curr_seq_length: int,
118
+ num_repetitions: int = 5,
119
+ guidance_param: float = 10.0,
120
+ top_p: float = 0.97,
121
+ # batch_size: int = 1,
122
+ ) -> Dict[str, np.ndarray]:
123
+ if data_format == "pose":
124
+ model = self.pose_model
125
+ diffusion = self.pose_diffusion
126
+ else:
127
+ model = self.face_model
128
+ diffusion = self.face_diffusion
129
+
130
+ all_motions = []
131
+ model_kwargs["y"]["scale"] = torch.ones(num_repetitions) * guidance_param
132
+ model_kwargs["y"] = {
133
+ key: val.to(self.device) if torch.is_tensor(val) else val
134
+ for key, val in model_kwargs["y"].items()
135
+ }
136
+ if data_format == "pose":
137
+ model_kwargs["y"]["mask"] = (
138
+ torch.ones((num_repetitions, 1, 1, curr_seq_length))
139
+ .to(self.device)
140
+ .bool()
141
+ )
142
+ model_kwargs["y"]["keyframes"] = self._replace_keyframes(
143
+ model_kwargs,
144
+ num_repetitions,
145
+ int(curr_seq_length / 30),
146
+ top_p=top_p,
147
+ )
148
+ sample = self._run_single_diffusion(
149
+ model_kwargs, diffusion, model, curr_seq_length, num_repetitions
150
+ )
151
+ all_motions.append(sample.cpu().numpy())
152
+ print(f"created {len(all_motions) * num_repetitions} samples")
153
+ return np.concatenate(all_motions, axis=0)
154
+
155
+
156
+ def generate_results(audio: np.ndarray, num_repetitions: int, top_p: float):
157
+ if audio is None:
158
+ raise gr.Error("Please record audio to start")
159
+ sr, y = audio
160
+ # set to mono and perform resampling
161
+ y = torch.Tensor(y)
162
+ if y.dim() == 2:
163
+ dim = 0 if y.shape[0] == 2 else 1
164
+ y = torch.mean(y, dim=dim)
165
+ y = torchaudio.functional.resample(torch.Tensor(y), orig_freq=sr, new_freq=48_000)
166
+ sr = 48_000
167
+ # make it so that it is 4 seconds long
168
+ if len(y) < (sr * 4):
169
+ raise gr.Error("Please record at least 4 second of audio")
170
+ if num_repetitions is None or num_repetitions <= 0 or num_repetitions > 10:
171
+ raise gr.Error(
172
+ f"Invalid number of samples: {num_repetitions}. Please specify a number between 1-10"
173
+ )
174
+ cutoff = int(len(y) / (sr * 4))
175
+ y = y[: cutoff * sr * 4]
176
+ curr_seq_length = int(len(y) / sr) * 30
177
+ # create model_kwargs
178
+ model_kwargs = {"y": {}}
179
+ dual_audio = np.random.normal(0, 0.001, (1, len(y), 2))
180
+ dual_audio[:, :, 0] = y / max(y)
181
+ dual_audio = (dual_audio - gradio_model.stats["audio_mean"]) / gradio_model.stats[
182
+ "audio_std_flat"
183
+ ]
184
+ model_kwargs["y"]["audio"] = (
185
+ torch.Tensor(dual_audio).float().tile(num_repetitions, 1, 1)
186
+ )
187
+ face_results = (
188
+ gradio_model.generate_sequences(
189
+ model_kwargs, "face", curr_seq_length, num_repetitions=int(num_repetitions)
190
+ )
191
+ .squeeze(2)
192
+ .transpose(0, 2, 1)
193
+ )
194
+ face_results = (
195
+ face_results * gradio_model.stats["code_std"] + gradio_model.stats["code_mean"]
196
+ )
197
+ pose_results = (
198
+ gradio_model.generate_sequences(
199
+ model_kwargs,
200
+ "pose",
201
+ curr_seq_length,
202
+ num_repetitions=int(num_repetitions),
203
+ guidance_param=2.0,
204
+ top_p=top_p,
205
+ )
206
+ .squeeze(2)
207
+ .transpose(0, 2, 1)
208
+ )
209
+ pose_results = (
210
+ pose_results * gradio_model.stats["pose_std"] + gradio_model.stats["pose_mean"]
211
+ )
212
+ dual_audio = (
213
+ dual_audio * gradio_model.stats["audio_std_flat"]
214
+ + gradio_model.stats["audio_mean"]
215
+ )
216
+ return face_results, pose_results, dual_audio[0].transpose(1, 0).astype(np.float32)
217
+
218
+
219
+ def audio_to_avatar(audio: np.ndarray, num_repetitions: int, top_p: float):
220
+ face_results, pose_results, audio = generate_results(audio, num_repetitions, top_p)
221
+ # returns: num_rep x T x 104
222
+ B = len(face_results)
223
+ results = []
224
+ for i in range(B):
225
+ render_data_block = {
226
+ "audio": audio, # 2 x T
227
+ "body_motion": pose_results[i, ...], # T x 104
228
+ "face_motion": face_results[i, ...], # T x 256
229
+ }
230
+ gradio_model.body_renderer.render_full_video(
231
+ render_data_block, f"/tmp/sample{i}", audio_sr=48_000
232
+ )
233
+ results += [gr.Video(value=f"/tmp/sample{i}_pred.mp4", visible=True)]
234
+ results += [gr.Video(visible=False) for _ in range(B, 10)]
235
+ return results
236
+
237
+
238
+ gradio_model = GradioModel(
239
+ face_args="./checkpoints/diffusion/c1_face/args.json",
240
+ pose_args="./checkpoints/diffusion/c1_pose/args.json",
241
+ )
242
+ demo = gr.Interface(
243
+ audio_to_avatar, # function
244
+ [
245
+ gr.Audio(sources=["microphone"]),
246
+ gr.Number(
247
+ value=3,
248
+ label="Number of Samples (default = 3)",
249
+ precision=0,
250
+ minimum=1,
251
+ maximum=10,
252
+ ),
253
+ gr.Number(
254
+ value=0.97,
255
+ label="Sample Diversity (default = 0.97)",
256
+ precision=None,
257
+ minimum=0.01,
258
+ step=0.01,
259
+ maximum=1.00,
260
+ ),
261
+ ], # input type
262
+ [gr.Video(format="mp4", visible=True)]
263
+ + [gr.Video(format="mp4", visible=False) for _ in range(9)], # output type
264
+ title='"From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations" Demo',
265
+ description="You can generate a photorealistic avatar from your voice! <br/>\
266
+ 1) Start by recording your audio. <br/>\
267
+ 2) Specify the number of samples to generate. <br/>\
268
+ 3) Specify how diverse you want the samples to be. This tunes the cumulative probability in nucleus sampling: 0.01 = low diversity, 1.0 = high diversity. <br/>\
269
+ 4) Then, sit back and wait for the rendering to happen! This may take a while (e.g. 30 minutes) <br/>\
270
+ 5) After, you can view the videos and download the ones you like. <br/>",
271
+ article="Relevant links: [Project Page](https://people.eecs.berkeley.edu/~evonne_ng/projects/audio2photoreal)", # TODO: code and arxiv
272
+ )
273
+
274
+ if __name__ == "__main__":
275
+ fixseed(10)
276
+ demo.launch(share=True)
demo/demo.py ADDED
@@ -0,0 +1,276 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import copy
9
+ import json
10
+ from typing import Dict, Union
11
+
12
+ import gradio as gr
13
+ import numpy as np
14
+ import torch
15
+ import torchaudio
16
+ from attrdict import AttrDict
17
+ from diffusion.respace import SpacedDiffusion
18
+ from model.cfg_sampler import ClassifierFreeSampleModel
19
+ from model.diffusion import FiLMTransformer
20
+ from utils.misc import fixseed
21
+ from utils.model_util import create_model_and_diffusion, load_model
22
+ from visualize.render_codes import BodyRenderer
23
+
24
+
25
+ class GradioModel:
26
+ def __init__(self, face_args, pose_args) -> None:
27
+ self.face_model, self.face_diffusion, self.device = self._setup_model(
28
+ face_args, "checkpoints/diffusion/c1_face/model000155000.pt"
29
+ )
30
+ self.pose_model, self.pose_diffusion, _ = self._setup_model(
31
+ pose_args, "checkpoints/diffusion/c1_pose/model000340000.pt"
32
+ )
33
+ # load standardization stuff
34
+ stats = torch.load("dataset/PXB184/data_stats.pth")
35
+ stats["pose_mean"] = stats["pose_mean"].reshape(-1)
36
+ stats["pose_std"] = stats["pose_std"].reshape(-1)
37
+ self.stats = stats
38
+ # set up renderer
39
+ config_base = f"./checkpoints/ca_body/data/PXB184"
40
+ self.body_renderer = BodyRenderer(
41
+ config_base=config_base,
42
+ render_rgb=True,
43
+ )
44
+
45
+ def _setup_model(
46
+ self,
47
+ args_path: str,
48
+ model_path: str,
49
+ ) -> (Union[FiLMTransformer, ClassifierFreeSampleModel], SpacedDiffusion):
50
+ with open(args_path) as f:
51
+ args = json.load(f)
52
+ args = AttrDict(args)
53
+ args.device = "cuda:0" if torch.cuda.is_available() else "cpu"
54
+ print("running on...", args.device)
55
+ args.model_path = model_path
56
+ args.output_dir = "/tmp/gradio/"
57
+ args.timestep_respacing = "ddim100"
58
+ if args.data_format == "pose":
59
+ args.resume_trans = "checkpoints/guide/c1_pose/checkpoints/iter-0100000.pt"
60
+
61
+ ## create model
62
+ model, diffusion = create_model_and_diffusion(args, split_type="test")
63
+ print(f"Loading checkpoints from [{args.model_path}]...")
64
+ state_dict = torch.load(args.model_path, map_location=args.device)
65
+ load_model(model, state_dict)
66
+ model = ClassifierFreeSampleModel(model)
67
+ model.eval()
68
+ model.to(args.device)
69
+ return model, diffusion, args.device
70
+
71
+ def _replace_keyframes(
72
+ self,
73
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
74
+ B: int,
75
+ T: int,
76
+ top_p: float = 0.97,
77
+ ) -> torch.Tensor:
78
+ with torch.no_grad():
79
+ tokens = self.pose_model.transformer.generate(
80
+ model_kwargs["y"]["audio"],
81
+ T,
82
+ layers=self.pose_model.tokenizer.residual_depth,
83
+ n_sequences=B,
84
+ top_p=top_p,
85
+ )
86
+ tokens = tokens.reshape((B, -1, self.pose_model.tokenizer.residual_depth))
87
+ pred = self.pose_model.tokenizer.decode(tokens).detach()
88
+ return pred
89
+
90
+ def _run_single_diffusion(
91
+ self,
92
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
93
+ diffusion: SpacedDiffusion,
94
+ model: Union[FiLMTransformer, ClassifierFreeSampleModel],
95
+ curr_seq_length: int,
96
+ num_repetitions: int = 1,
97
+ ) -> (torch.Tensor,):
98
+ sample_fn = diffusion.ddim_sample_loop
99
+ with torch.no_grad():
100
+ sample = sample_fn(
101
+ model,
102
+ (num_repetitions, model.nfeats, 1, curr_seq_length),
103
+ clip_denoised=False,
104
+ model_kwargs=model_kwargs,
105
+ init_image=None,
106
+ progress=True,
107
+ dump_steps=None,
108
+ noise=None,
109
+ const_noise=False,
110
+ )
111
+ return sample
112
+
113
+ def generate_sequences(
114
+ self,
115
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
116
+ data_format: str,
117
+ curr_seq_length: int,
118
+ num_repetitions: int = 5,
119
+ guidance_param: float = 10.0,
120
+ top_p: float = 0.97,
121
+ # batch_size: int = 1,
122
+ ) -> Dict[str, np.ndarray]:
123
+ if data_format == "pose":
124
+ model = self.pose_model
125
+ diffusion = self.pose_diffusion
126
+ else:
127
+ model = self.face_model
128
+ diffusion = self.face_diffusion
129
+
130
+ all_motions = []
131
+ model_kwargs["y"]["scale"] = torch.ones(num_repetitions) * guidance_param
132
+ model_kwargs["y"] = {
133
+ key: val.to(self.device) if torch.is_tensor(val) else val
134
+ for key, val in model_kwargs["y"].items()
135
+ }
136
+ if data_format == "pose":
137
+ model_kwargs["y"]["mask"] = (
138
+ torch.ones((num_repetitions, 1, 1, curr_seq_length))
139
+ .to(self.device)
140
+ .bool()
141
+ )
142
+ model_kwargs["y"]["keyframes"] = self._replace_keyframes(
143
+ model_kwargs,
144
+ num_repetitions,
145
+ int(curr_seq_length / 30),
146
+ top_p=top_p,
147
+ )
148
+ sample = self._run_single_diffusion(
149
+ model_kwargs, diffusion, model, curr_seq_length, num_repetitions
150
+ )
151
+ all_motions.append(sample.cpu().numpy())
152
+ print(f"created {len(all_motions) * num_repetitions} samples")
153
+ return np.concatenate(all_motions, axis=0)
154
+
155
+
156
+ def generate_results(audio: np.ndarray, num_repetitions: int, top_p: float):
157
+ if audio is None:
158
+ raise gr.Error("Please record audio to start")
159
+ sr, y = audio
160
+ # set to mono and perform resampling
161
+ y = torch.Tensor(y)
162
+ if y.dim() == 2:
163
+ dim = 0 if y.shape[0] == 2 else 1
164
+ y = torch.mean(y, dim=dim)
165
+ y = torchaudio.functional.resample(torch.Tensor(y), orig_freq=sr, new_freq=48_000)
166
+ sr = 48_000
167
+ # make it so that it is 4 seconds long
168
+ if len(y) < (sr * 4):
169
+ raise gr.Error("Please record at least 4 second of audio")
170
+ if num_repetitions is None or num_repetitions <= 0 or num_repetitions > 10:
171
+ raise gr.Error(
172
+ f"Invalid number of samples: {num_repetitions}. Please specify a number between 1-10"
173
+ )
174
+ cutoff = int(len(y) / (sr * 4))
175
+ y = y[: cutoff * sr * 4]
176
+ curr_seq_length = int(len(y) / sr) * 30
177
+ # create model_kwargs
178
+ model_kwargs = {"y": {}}
179
+ dual_audio = np.random.normal(0, 0.001, (1, len(y), 2))
180
+ dual_audio[:, :, 0] = y / max(y)
181
+ dual_audio = (dual_audio - gradio_model.stats["audio_mean"]) / gradio_model.stats[
182
+ "audio_std_flat"
183
+ ]
184
+ model_kwargs["y"]["audio"] = (
185
+ torch.Tensor(dual_audio).float().tile(num_repetitions, 1, 1)
186
+ )
187
+ face_results = (
188
+ gradio_model.generate_sequences(
189
+ model_kwargs, "face", curr_seq_length, num_repetitions=int(num_repetitions)
190
+ )
191
+ .squeeze(2)
192
+ .transpose(0, 2, 1)
193
+ )
194
+ face_results = (
195
+ face_results * gradio_model.stats["code_std"] + gradio_model.stats["code_mean"]
196
+ )
197
+ pose_results = (
198
+ gradio_model.generate_sequences(
199
+ model_kwargs,
200
+ "pose",
201
+ curr_seq_length,
202
+ num_repetitions=int(num_repetitions),
203
+ guidance_param=2.0,
204
+ top_p=top_p,
205
+ )
206
+ .squeeze(2)
207
+ .transpose(0, 2, 1)
208
+ )
209
+ pose_results = (
210
+ pose_results * gradio_model.stats["pose_std"] + gradio_model.stats["pose_mean"]
211
+ )
212
+ dual_audio = (
213
+ dual_audio * gradio_model.stats["audio_std_flat"]
214
+ + gradio_model.stats["audio_mean"]
215
+ )
216
+ return face_results, pose_results, dual_audio[0].transpose(1, 0).astype(np.float32)
217
+
218
+
219
+ def audio_to_avatar(audio: np.ndarray, num_repetitions: int, top_p: float):
220
+ face_results, pose_results, audio = generate_results(audio, num_repetitions, top_p)
221
+ # returns: num_rep x T x 104
222
+ B = len(face_results)
223
+ results = []
224
+ for i in range(B):
225
+ render_data_block = {
226
+ "audio": audio, # 2 x T
227
+ "body_motion": pose_results[i, ...], # T x 104
228
+ "face_motion": face_results[i, ...], # T x 256
229
+ }
230
+ gradio_model.body_renderer.render_full_video(
231
+ render_data_block, f"/tmp/sample{i}", audio_sr=48_000
232
+ )
233
+ results += [gr.Video(value=f"/tmp/sample{i}_pred.mp4", visible=True)]
234
+ results += [gr.Video(visible=False) for _ in range(B, 10)]
235
+ return results
236
+
237
+
238
+ gradio_model = GradioModel(
239
+ face_args="./checkpoints/diffusion/c1_face/args.json",
240
+ pose_args="./checkpoints/diffusion/c1_pose/args.json",
241
+ )
242
+ demo = gr.Interface(
243
+ audio_to_avatar, # function
244
+ [
245
+ gr.Audio(sources=["microphone"]),
246
+ gr.Number(
247
+ value=3,
248
+ label="Number of Samples (default = 3)",
249
+ precision=0,
250
+ minimum=1,
251
+ maximum=10,
252
+ ),
253
+ gr.Number(
254
+ value=0.97,
255
+ label="Sample Diversity (default = 0.97)",
256
+ precision=None,
257
+ minimum=0.01,
258
+ step=0.01,
259
+ maximum=1.00,
260
+ ),
261
+ ], # input type
262
+ [gr.Video(format="mp4", visible=True)]
263
+ + [gr.Video(format="mp4", visible=False) for _ in range(9)], # output type
264
+ title='"From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations" Demo',
265
+ description="You can generate a photorealistic avatar from your voice! <br/>\
266
+ 1) Start by recording your audio. <br/>\
267
+ 2) Specify the number of samples to generate. <br/>\
268
+ 3) Specify how diverse you want the samples to be. This tunes the cumulative probability in nucleus sampling: 0.01 = low diversity, 1.0 = high diversity. <br/>\
269
+ 4) Then, sit back and wait for the rendering to happen! This may take a while (e.g. 30 minutes) <br/>\
270
+ 5) After, you can view the videos and download the ones you like. <br/>",
271
+ article="Relevant links: [Project Page](https://people.eecs.berkeley.edu/~evonne_ng/projects/audio2photoreal)", # TODO: code and arxiv
272
+ )
273
+
274
+ if __name__ == "__main__":
275
+ fixseed(10)
276
+ demo.launch(share=True)
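Before sampling, generate_results trims the recording to a whole number of 4-second chunks at 48 kHz and converts the kept duration into 30 fps motion frames. A worked example of that arithmetic (the 10-second recording length is hypothetical):

    sr = 48_000
    n_samples = 10 * sr                  # e.g. a 10 s recording
    cutoff = n_samples // (sr * 4)       # 2 complete 4 s chunks
    kept = cutoff * sr * 4               # 384_000 samples, i.e. 8 s of audio
    curr_seq_length = (kept // sr) * 30  # 240 motion frames at 30 fps
    print(cutoff, kept, curr_seq_length)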
demo/install.sh ADDED
@@ -0,0 +1,20 @@
1
+ #!/bin/bash
2
+
3
+ # make sure to have cuda 11.7 and gcc 9.0 installed
4
+ # install environment
5
+ pip install -r scripts/requirements.txt
6
+ sh scripts/download_prereq.sh
7
+
8
+ # download pytorch3d
9
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git"
10
+
11
+ # download model stuff
12
+ wget http://audio2photoreal_models.berkeleyvision.org/PXB184_models.tar || { echo 'downloading model failed' ; exit 1; }
13
+ tar xvf PXB184_models.tar
14
+ rm PXB184_models.tar
15
+
16
+ # install rendering stuff
17
+ mkdir -p checkpoints/ca_body/data/
18
+ wget https://github.com/facebookresearch/ca_body/releases/download/v0.0.1-alpha/PXB184.tar.gz || { echo 'downloading ca body model failed' ; exit 1; }
19
+ tar xvf PXB184.tar.gz --directory checkpoints/ca_body/data/
20
+ rm PXB184.tar.gz
demo/requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ attrdict
2
+ einops==0.7.0
3
+ fairseq==0.12.2
4
+ gradio==4.31.3
5
+ gradio_client==0.7.3
6
+ huggingface-hub==0.19.4
7
+ hydra-core==1.0.7
8
+ mediapy==1.2.0
9
+ numpy==1.26.2
10
+ omegaconf==2.0.6
11
+ opencv-python==4.8.1.78
12
+ protobuf==4.25.1
13
+ tensorboardX==2.6.2.2
14
+ torch==2.0.1
15
+ torchaudio==2.0.2
16
+ torchvision==0.15.2
17
+ tqdm==4.66.3
diffusion/fp16_util.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ """
9
+ original code from
10
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/diffusion/gaussian_diffusion.py
11
+ under an MIT license
12
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE
13
+ """
14
+
15
+ """
16
+ Helpers to train with 16-bit precision.
17
+ """
18
+
19
+ import numpy as np
20
+ import torch as th
21
+ import torch.nn as nn
22
+ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
23
+
24
+ from utils import logger
25
+
26
+ INITIAL_LOG_LOSS_SCALE = 20.0
27
+
28
+
29
+ def convert_module_to_f16(l):
30
+ """
31
+ Convert primitive modules to float16.
32
+ """
33
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
34
+ l.weight.data = l.weight.data.half()
35
+ if l.bias is not None:
36
+ l.bias.data = l.bias.data.half()
37
+
38
+
39
+ def convert_module_to_f32(l):
40
+ """
41
+ Convert primitive modules to float32, undoing convert_module_to_f16().
42
+ """
43
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
44
+ l.weight.data = l.weight.data.float()
45
+ if l.bias is not None:
46
+ l.bias.data = l.bias.data.float()
47
+
48
+
49
+ def make_master_params(param_groups_and_shapes):
50
+ """
51
+ Copy model parameters into a (differently-shaped) list of full-precision
52
+ parameters.
53
+ """
54
+ master_params = []
55
+ for param_group, shape in param_groups_and_shapes:
56
+ master_param = nn.Parameter(
57
+ _flatten_dense_tensors(
58
+ [param.detach().float() for (_, param) in param_group]
59
+ ).view(shape)
60
+ )
61
+ master_param.requires_grad = True
62
+ master_params.append(master_param)
63
+ return master_params
64
+
65
+
66
+ def model_grads_to_master_grads(param_groups_and_shapes, master_params):
67
+ """
68
+ Copy the gradients from the model parameters into the master parameters
69
+ from make_master_params().
70
+ """
71
+ for master_param, (param_group, shape) in zip(
72
+ master_params, param_groups_and_shapes
73
+ ):
74
+ master_param.grad = _flatten_dense_tensors(
75
+ [param_grad_or_zeros(param) for (_, param) in param_group]
76
+ ).view(shape)
77
+
78
+
79
+ def master_params_to_model_params(param_groups_and_shapes, master_params):
80
+ """
81
+ Copy the master parameter data back into the model parameters.
82
+ """
83
+ # Without copying to a list, if a generator is passed, this will
84
+ # silently not copy any parameters.
85
+ for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes):
86
+ for (_, param), unflat_master_param in zip(
87
+ param_group, unflatten_master_params(param_group, master_param.view(-1))
88
+ ):
89
+ param.detach().copy_(unflat_master_param)
90
+
91
+
92
+ def unflatten_master_params(param_group, master_param):
93
+ return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group])
94
+
95
+
96
+ def get_param_groups_and_shapes(named_model_params):
97
+ named_model_params = list(named_model_params)
98
+ scalar_vector_named_params = (
99
+ [(n, p) for (n, p) in named_model_params if p.ndim <= 1],
100
+ (-1),
101
+ )
102
+ matrix_named_params = (
103
+ [(n, p) for (n, p) in named_model_params if p.ndim > 1],
104
+ (1, -1),
105
+ )
106
+ return [scalar_vector_named_params, matrix_named_params]
107
+
108
+
109
+ def master_params_to_state_dict(
110
+ model, param_groups_and_shapes, master_params, use_fp16
111
+ ):
112
+ if use_fp16:
113
+ state_dict = model.state_dict()
114
+ for master_param, (param_group, _) in zip(
115
+ master_params, param_groups_and_shapes
116
+ ):
117
+ for (name, _), unflat_master_param in zip(
118
+ param_group, unflatten_master_params(param_group, master_param.view(-1))
119
+ ):
120
+ assert name in state_dict
121
+ state_dict[name] = unflat_master_param
122
+ else:
123
+ state_dict = model.state_dict()
124
+ for i, (name, _value) in enumerate(model.named_parameters()):
125
+ assert name in state_dict
126
+ state_dict[name] = master_params[i]
127
+ return state_dict
128
+
129
+
130
+ def state_dict_to_master_params(model, state_dict, use_fp16):
131
+ if use_fp16:
132
+ named_model_params = [
133
+ (name, state_dict[name]) for name, _ in model.named_parameters()
134
+ ]
135
+ param_groups_and_shapes = get_param_groups_and_shapes(named_model_params)
136
+ master_params = make_master_params(param_groups_and_shapes)
137
+ else:
138
+ master_params = [state_dict[name] for name, _ in model.named_parameters()]
139
+ return master_params
140
+
141
+
142
+ def zero_master_grads(master_params):
143
+ for param in master_params:
144
+ param.grad = None
145
+
146
+
147
+ def zero_grad(model_params):
148
+ for param in model_params:
149
+ # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
150
+ if param.grad is not None:
151
+ param.grad.detach_()
152
+ param.grad.zero_()
153
+
154
+
155
+ def param_grad_or_zeros(param):
156
+ if param.grad is not None:
157
+ return param.grad.data.detach()
158
+ else:
159
+ return th.zeros_like(param)
160
+
161
+
162
+ class MixedPrecisionTrainer:
163
+ def __init__(
164
+ self,
165
+ *,
166
+ model,
167
+ use_fp16=False,
168
+ fp16_scale_growth=1e-3,
169
+ initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE,
170
+ ):
171
+ self.model = model
172
+ self.use_fp16 = use_fp16
173
+ self.fp16_scale_growth = fp16_scale_growth
174
+
175
+ self.model_params = list(self.model.parameters())
176
+ self.master_params = self.model_params
177
+ self.param_groups_and_shapes = None
178
+ self.lg_loss_scale = initial_lg_loss_scale
179
+
180
+ if self.use_fp16:
181
+ self.param_groups_and_shapes = get_param_groups_and_shapes(
182
+ self.model.named_parameters()
183
+ )
184
+ self.master_params = make_master_params(self.param_groups_and_shapes)
185
+ self.model.convert_to_fp16()
186
+
187
+ def zero_grad(self):
188
+ zero_grad(self.model_params)
189
+
190
+ def backward(self, loss: th.Tensor):
191
+ if self.use_fp16:
192
+ loss_scale = 2**self.lg_loss_scale
193
+ (loss * loss_scale).backward()
194
+ else:
195
+ loss.backward()
196
+
197
+ def optimize(self, opt: th.optim.Optimizer):
198
+ if self.use_fp16:
199
+ return self._optimize_fp16(opt)
200
+ else:
201
+ return self._optimize_normal(opt)
202
+
203
+ def _optimize_fp16(self, opt: th.optim.Optimizer):
204
+ logger.logkv_mean("lg_loss_scale", self.lg_loss_scale)
205
+ model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params)
206
+ grad_norm, param_norm = self._compute_norms(grad_scale=2**self.lg_loss_scale)
207
+ if check_overflow(grad_norm):
208
+ self.lg_loss_scale -= 1
209
+ logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}")
210
+ zero_master_grads(self.master_params)
211
+ return False
212
+
213
+ logger.logkv_mean("grad_norm", grad_norm)
214
+ logger.logkv_mean("param_norm", param_norm)
215
+
216
+ self.master_params[0].grad.mul_(1.0 / (2**self.lg_loss_scale))
217
+ opt.step()
218
+ zero_master_grads(self.master_params)
219
+ master_params_to_model_params(self.param_groups_and_shapes, self.master_params)
220
+ self.lg_loss_scale += self.fp16_scale_growth
221
+ return True
222
+
223
+ def _optimize_normal(self, opt: th.optim.Optimizer):
224
+ grad_norm, param_norm = self._compute_norms()
225
+ logger.logkv_mean("grad_norm", grad_norm)
226
+ logger.logkv_mean("param_norm", param_norm)
227
+ opt.step()
228
+ return True
229
+
230
+ def _compute_norms(self, grad_scale=1.0):
231
+ grad_norm = 0.0
232
+ param_norm = 0.0
233
+ for p in self.master_params:
234
+ with th.no_grad():
235
+ param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2
236
+ if p.grad is not None:
237
+ grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2
238
+ return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm)
239
+
240
+ def master_params_to_state_dict(self, master_params):
241
+ return master_params_to_state_dict(
242
+ self.model, self.param_groups_and_shapes, master_params, self.use_fp16
243
+ )
244
+
245
+ def state_dict_to_master_params(self, state_dict):
246
+ return state_dict_to_master_params(self.model, state_dict, self.use_fp16)
247
+
248
+
249
+ def check_overflow(value):
250
+ return (value == float("inf")) or (value == -float("inf")) or (value != value)
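MixedPrecisionTrainer implements dynamic loss scaling: in fp16 mode the loss is multiplied by 2**lg_loss_scale before backward, the scale is lowered and the update skipped whenever the gradient norm overflows, and it grows by fp16_scale_growth after each successful step. A toy training-step sketch; the linear model and random data are placeholders, and use_fp16 is left False here so no convert_to_fp16() method is required:

    import torch as th
    import torch.nn as nn

    from diffusion.fp16_util import MixedPrecisionTrainer

    model = nn.Linear(8, 1)                       # toy stand-in for the diffusion model
    trainer = MixedPrecisionTrainer(model=model)  # use_fp16=False by default
    opt = th.optim.AdamW(trainer.master_params, lr=1e-4)
    for _ in range(3):
        trainer.zero_grad()
        loss = model(th.randn(4, 8)).pow(2).mean()
        trainer.backward(loss)   # with use_fp16=True this would scale the loss first
        trainer.optimize(opt)    # with use_fp16=True this skips the step on overflow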
diffusion/gaussian_diffusion.py ADDED
@@ -0,0 +1,1273 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ """
9
+ original code from
10
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/diffusion/gaussian_diffusion.py
11
+ under an MIT license
12
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE
13
+ """
14
+
15
+ import enum
16
+ import math
17
+ from copy import deepcopy
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch as th
22
+ from diffusion.losses import discretized_gaussian_log_likelihood, normal_kl
23
+ from diffusion.nn import mean_flat, sum_flat
24
+
25
+
26
+ def get_named_beta_schedule(schedule_name, num_diffusion_timesteps, scale_betas=1.0):
27
+ """
28
+ Get a pre-defined beta schedule for the given name.
29
+
30
+ The beta schedule library consists of beta schedules which remain similar
31
+ in the limit of num_diffusion_timesteps.
32
+ Beta schedules may be added, but should not be removed or changed once
33
+ they are committed to maintain backwards compatibility.
34
+ """
35
+ if schedule_name == "linear":
36
+ # Linear schedule from Ho et al, extended to work for any number of
37
+ # diffusion steps.
38
+ scale = scale_betas * 1000 / num_diffusion_timesteps
39
+ beta_start = scale * 0.0001
40
+ beta_end = scale * 0.02
41
+ return np.linspace(
42
+ beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
43
+ )
44
+ elif schedule_name == "cosine":
45
+ return betas_for_alpha_bar(
46
+ num_diffusion_timesteps,
47
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
48
+ )
49
+ else:
50
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
51
+
52
+
53
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
54
+ """
55
+ Create a beta schedule that discretizes the given alpha_t_bar function,
56
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
57
+
58
+ :param num_diffusion_timesteps: the number of betas to produce.
59
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
60
+ produces the cumulative product of (1-beta) up to that
61
+ part of the diffusion process.
62
+ :param max_beta: the maximum beta to use; use values lower than 1 to
63
+ prevent singularities.
64
+ """
65
+ betas = []
66
+ for i in range(num_diffusion_timesteps):
67
+ t1 = i / num_diffusion_timesteps
68
+ t2 = (i + 1) / num_diffusion_timesteps
69
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
70
+ return np.array(betas)
71
+
72
+
73
+ class ModelMeanType(enum.Enum):
74
+ """
75
+ Which type of output the model predicts.
76
+ """
77
+
78
+ PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
79
+ START_X = enum.auto() # the model predicts x_0
80
+ EPSILON = enum.auto() # the model predicts epsilon
81
+
82
+
83
+ class ModelVarType(enum.Enum):
84
+ """
85
+ What is used as the model's output variance.
86
+
87
+ The LEARNED_RANGE option has been added to allow the model to predict
88
+ values between FIXED_SMALL and FIXED_LARGE, making its job easier.
89
+ """
90
+
91
+ LEARNED = enum.auto()
92
+ FIXED_SMALL = enum.auto()
93
+ FIXED_LARGE = enum.auto()
94
+ LEARNED_RANGE = enum.auto()
95
+
96
+
97
+ class LossType(enum.Enum):
98
+ MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
99
+ RESCALED_MSE = (
100
+ enum.auto()
101
+ ) # use raw MSE loss (with RESCALED_KL when learning variances)
102
+ KL = enum.auto() # use the variational lower-bound
103
+ RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
104
+
105
+ def is_vb(self):
106
+ return self == LossType.KL or self == LossType.RESCALED_KL
107
+
108
+
109
+ class GaussianDiffusion:
110
+ """
111
+ Utilities for training and sampling diffusion models.
112
+
113
+ Ported directly from here, and then adapted over time to further experimentation.
114
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
115
+
116
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
117
+ starting at T and going to 1.
118
+ :param model_mean_type: a ModelMeanType determining what the model outputs.
119
+ :param model_var_type: a ModelVarType determining how variance is output.
120
+ :param loss_type: a LossType determining the loss function to use.
121
+ :param rescale_timesteps: if True, pass floating point timesteps into the
122
+ model so that they are always scaled like in the
123
+ original paper (0 to 1000).
124
+ """
125
+
126
+ def __init__(
127
+ self,
128
+ *,
129
+ betas,
130
+ model_mean_type,
131
+ model_var_type,
132
+ loss_type,
133
+ rescale_timesteps=False,
134
+ lambda_vel=0.0,
135
+ data_format="pose",
136
+ model_path=None,
137
+ ):
138
+ self.model_mean_type = model_mean_type
139
+ self.model_var_type = model_var_type
140
+ self.loss_type = loss_type
141
+ self.rescale_timesteps = rescale_timesteps
142
+ self.data_format = data_format
143
+ self.lambda_vel = lambda_vel
144
+ if self.lambda_vel > 0.0:
145
+ assert (
146
+ self.loss_type == LossType.MSE
147
+ ), "Geometric losses are supported by MSE loss type only!"
148
+
149
+ # Use float64 for accuracy.
150
+ betas = np.array(betas, dtype=np.float64)
151
+ self.betas = betas
152
+ assert len(betas.shape) == 1, "betas must be 1-D"
153
+ assert (betas > 0).all() and (betas <= 1).all()
154
+
155
+ self.num_timesteps = int(betas.shape[0])
156
+
157
+ alphas = 1.0 - betas
158
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
159
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
160
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
161
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
162
+
163
+ # calculations for diffusion q(x_t | x_{t-1}) and others
164
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
165
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
166
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
167
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
168
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
169
+
170
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
171
+ self.posterior_variance = (
172
+ betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
173
+ )
174
+ # log calculation clipped because the posterior variance is 0 at the
175
+ # beginning of the diffusion chain.
176
+ self.posterior_log_variance_clipped = np.log(
177
+ np.append(self.posterior_variance[1], self.posterior_variance[1:])
178
+ )
179
+ self.posterior_mean_coef1 = (
180
+ betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
181
+ )
182
+ self.posterior_mean_coef2 = (
183
+ (1.0 - self.alphas_cumprod_prev)
184
+ * np.sqrt(alphas)
185
+ / (1.0 - self.alphas_cumprod)
186
+ )
187
+
188
+ self.l2_loss = lambda a, b: (a - b) ** 2
189
+
190
+ def masked_l2(self, a, b, mask):
191
+ loss = self.l2_loss(a, b)
192
+ loss = sum_flat(loss * mask.float())
193
+ n_entries = a.shape[1] * a.shape[2]
194
+ non_zero_elements = sum_flat(mask) * n_entries
195
+ mse_loss_val = loss / non_zero_elements
196
+ return mse_loss_val
197
+
198
+ def q_mean_variance(self, x_start, t):
199
+ """
200
+ Get the distribution q(x_t | x_0).
201
+
202
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
203
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
204
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
205
+ """
206
+ mean = (
207
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
208
+ )
209
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
210
+ log_variance = _extract_into_tensor(
211
+ self.log_one_minus_alphas_cumprod, t, x_start.shape
212
+ )
213
+ return mean, variance, log_variance
214
+
215
+ def q_sample(self, x_start, t, noise=None):
216
+ """
217
+ Diffuse the dataset for a given number of diffusion steps.
218
+
219
+ In other words, sample from q(x_t | x_0).
220
+
221
+ :param x_start: the initial dataset batch.
222
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
223
+ :param noise: if specified, the split-out normal noise.
224
+ :return: A noisy version of x_start.
225
+ """
226
+ if noise is None:
227
+ noise = th.randn_like(x_start)
228
+ assert noise.shape == x_start.shape
229
+ return (
230
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
231
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
232
+ * noise
233
+ )
234
+
235
+ def q_posterior_mean_variance(self, x_start, x_t, t):
236
+ """
237
+ Compute the mean and variance of the diffusion posterior:
238
+
239
+ q(x_{t-1} | x_t, x_0)
240
+
241
+ """
242
+ assert x_start.shape == x_t.shape, f"x_start: {x_start.shape}, x_t: {x_t.shape}"
243
+ posterior_mean = (
244
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
245
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
246
+ )
247
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
248
+ posterior_log_variance_clipped = _extract_into_tensor(
249
+ self.posterior_log_variance_clipped, t, x_t.shape
250
+ )
251
+ assert (
252
+ posterior_mean.shape[0]
253
+ == posterior_variance.shape[0]
254
+ == posterior_log_variance_clipped.shape[0]
255
+ == x_start.shape[0]
256
+ )
257
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
258
+
259
+ def p_mean_variance(
260
+ self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
261
+ ):
262
+ """
263
+ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
264
+ the initial x, x_0.
265
+
266
+ :param model: the model, which takes a signal and a batch of timesteps
267
+ as input.
268
+ :param x: the [N x C x ...] tensor at time t.
269
+ :param t: a 1-D Tensor of timesteps.
270
+ :param clip_denoised: if True, clip the denoised signal into [-1, 1].
271
+ :param denoised_fn: if not None, a function which applies to the
272
+ x_start prediction before it is used to sample. Applies before
273
+ clip_denoised.
274
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
275
+ pass to the model. This can be used for conditioning.
276
+ :return: a dict with the following keys:
277
+ - 'mean': the model mean output.
278
+ - 'variance': the model variance output.
279
+ - 'log_variance': the log of 'variance'.
280
+ - 'pred_xstart': the prediction for x_0.
281
+ """
282
+ if model_kwargs is None:
283
+ model_kwargs = {}
284
+
285
+ B, C = x.shape[:2]
286
+ assert t.shape == (B,)
287
+ model_output = model(x, self._scale_timesteps(t), **model_kwargs)
288
+
289
+ model_variance, model_log_variance = {
290
+ # for fixedlarge, we set the initial (log-)variance like so
291
+ # to get a better decoder log likelihood.
292
+ ModelVarType.FIXED_LARGE: (
293
+ np.append(self.posterior_variance[1], self.betas[1:]),
294
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
295
+ ),
296
+ ModelVarType.FIXED_SMALL: (
297
+ self.posterior_variance,
298
+ self.posterior_log_variance_clipped,
299
+ ),
300
+ }[self.model_var_type]
301
+
302
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
303
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
304
+
305
+ def process_xstart(x):
306
+ if denoised_fn is not None:
307
+ x = denoised_fn(x)
308
+ if clip_denoised:
309
+ return x.clamp(-1, 1)
310
+ return x
311
+
312
+ pred_xstart = process_xstart(model_output)
313
+ pred_xstart = pred_xstart.permute(0, 2, 1).unsqueeze(2)
314
+ model_mean, _, _ = self.q_posterior_mean_variance(
315
+ x_start=pred_xstart, x_t=x, t=t
316
+ )
317
+
318
+ assert (
319
+ model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
320
+ ), (
321
+ f"{model_mean.shape} == {model_log_variance.shape} == {pred_xstart.shape} == {x.shape}"
322
+ )
323
+ return {
324
+ "mean": model_mean,
325
+ "variance": model_variance,
326
+ "log_variance": model_log_variance,
327
+ "pred_xstart": pred_xstart,
328
+ }
329
+
330
+ def _predict_xstart_from_eps(self, x_t, t, eps):
331
+ assert x_t.shape == eps.shape
332
+ return (
333
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
334
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
335
+ )
336
+
337
+ def _predict_xstart_from_xprev(self, x_t, t, xprev):
338
+ assert x_t.shape == xprev.shape
339
+ return (
340
+ _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
341
+ - _extract_into_tensor(
342
+ self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
343
+ )
344
+ * x_t
345
+ )
346
+
347
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
348
+ return (
349
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
350
+ - pred_xstart
351
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
352
+
353
+ def _scale_timesteps(self, t):
354
+ if self.rescale_timesteps:
355
+ return t.float() * (1000.0 / self.num_timesteps)
356
+ return t
357
+
358
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
359
+ """
360
+ Compute the mean for the previous step, given a function cond_fn that
361
+ computes the gradient of a conditional log probability with respect to
362
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
363
+ condition on y.
364
+
365
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
366
+ """
367
+ gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
368
+ new_mean = (
369
+ p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
370
+ )
371
+ return new_mean
372
+
373
+ def condition_mean_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
374
+ """
375
+ Compute the mean for the previous step, given a function cond_fn that
376
+ computes the gradient of a conditional log probability with respect to
377
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
378
+ condition on y.
379
+
380
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
381
+ """
382
+ gradient = cond_fn(x, t, p_mean_var, **model_kwargs)
383
+ new_mean = (
384
+ p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
385
+ )
386
+ return new_mean
387
+
388
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
389
+ """
390
+ Compute what the p_mean_variance output would have been, should the
391
+ model's score function be conditioned by cond_fn.
392
+
393
+ See condition_mean() for details on cond_fn.
394
+
395
+ Unlike condition_mean(), this instead uses the conditioning strategy
396
+ from Song et al (2020).
397
+ """
398
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
399
+
400
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
401
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
402
+ x, self._scale_timesteps(t), **model_kwargs
403
+ )
404
+
405
+ out = p_mean_var.copy()
406
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
407
+ out["mean"], _, _ = self.q_posterior_mean_variance(
408
+ x_start=out["pred_xstart"], x_t=x, t=t
409
+ )
410
+ return out
411
+
412
+ def condition_score_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
413
+ """
414
+ Compute what the p_mean_variance output would have been, should the
415
+ model's score function be conditioned by cond_fn.
416
+
417
+ See condition_mean() for details on cond_fn.
418
+
419
+ Unlike condition_mean(), this instead uses the conditioning strategy
420
+ from Song et al (2020).
421
+ """
422
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
423
+
424
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
425
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, p_mean_var, **model_kwargs)
426
+
427
+ out = p_mean_var.copy()
428
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
429
+ out["mean"], _, _ = self.q_posterior_mean_variance(
430
+ x_start=out["pred_xstart"], x_t=x, t=t
431
+ )
432
+ return out
433
+
434
+ def p_sample(
435
+ self,
436
+ model,
437
+ x,
438
+ t,
439
+ clip_denoised=True,
440
+ denoised_fn=None,
441
+ cond_fn=None,
442
+ model_kwargs=None,
443
+ const_noise=False,
444
+ ):
445
+ """
446
+ Sample x_{t-1} from the model at the given timestep.
447
+
448
+ :param model: the model to sample from.
449
+ :param x: the current tensor at x_{t-1}.
450
+ :param t: the value of t, starting at 0 for the first diffusion step.
451
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
452
+ :param denoised_fn: if not None, a function which applies to the
453
+ x_start prediction before it is used to sample.
454
+ :param cond_fn: if not None, this is a gradient function that acts
455
+ similarly to the model.
456
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
457
+ pass to the model. This can be used for conditioning.
458
+ :return: a dict containing the following keys:
459
+ - 'sample': a random sample from the model.
460
+ - 'pred_xstart': a prediction of x_0.
461
+ """
462
+ out = self.p_mean_variance(
463
+ model,
464
+ x,
465
+ t,
466
+ clip_denoised=clip_denoised,
467
+ denoised_fn=denoised_fn,
468
+ model_kwargs=model_kwargs,
469
+ )
470
+
471
+ noise = th.randn_like(x)
+ if const_noise:
+ # share one noise draw across the batch so every sample is perturbed identically
+ noise = noise[[0]].repeat(x.shape[0], *([1] * (len(x.shape) - 1)))
+ nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
472
+ if cond_fn is not None:
473
+ out["mean"] = self.condition_mean(
474
+ cond_fn, out, x, t, model_kwargs=model_kwargs
475
+ )
476
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
477
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
478
+
479
+ def p_sample_with_grad(
480
+ self,
481
+ model,
482
+ x,
483
+ t,
484
+ clip_denoised=True,
485
+ denoised_fn=None,
486
+ cond_fn=None,
487
+ model_kwargs=None,
488
+ ):
489
+ """
490
+ Sample x_{t-1} from the model at the given timestep.
491
+
492
+ :param model: the model to sample from.
493
+ :param x: the current tensor at x_{t-1}.
494
+ :param t: the value of t, starting at 0 for the first diffusion step.
495
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
496
+ :param denoised_fn: if not None, a function which applies to the
497
+ x_start prediction before it is used to sample.
498
+ :param cond_fn: if not None, this is a gradient function that acts
499
+ similarly to the model.
500
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
501
+ pass to the model. This can be used for conditioning.
502
+ :return: a dict containing the following keys:
503
+ - 'sample': a random sample from the model.
504
+ - 'pred_xstart': a prediction of x_0.
505
+ """
506
+ with th.enable_grad():
507
+ x = x.detach().requires_grad_()
508
+ out = self.p_mean_variance(
509
+ model,
510
+ x,
511
+ t,
512
+ clip_denoised=clip_denoised,
513
+ denoised_fn=denoised_fn,
514
+ model_kwargs=model_kwargs,
515
+ )
516
+ noise = th.randn_like(x)
517
+ nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
518
+ if cond_fn is not None:
519
+ out["mean"] = self.condition_mean_with_grad(
520
+ cond_fn, out, x, t, model_kwargs=model_kwargs
521
+ )
522
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
523
+ return {"sample": sample, "pred_xstart": out["pred_xstart"].detach()}
524
+
525
+ def p_sample_loop(
526
+ self,
527
+ model,
528
+ shape,
529
+ noise=None,
530
+ clip_denoised=True,
531
+ denoised_fn=None,
532
+ cond_fn=None,
533
+ model_kwargs=None,
534
+ device=None,
535
+ progress=False,
536
+ skip_timesteps=0,
537
+ init_image=None,
538
+ randomize_class=False,
539
+ cond_fn_with_grad=False,
540
+ dump_steps=None,
541
+ const_noise=False,
542
+ ):
543
+ """
544
+ Generate samples from the model.
545
+
546
+ :param model: the model module.
547
+ :param shape: the shape of the samples, (N, C, H, W).
548
+ :param noise: if specified, the noise from the encoder to sample.
549
+ Should be of the same shape as `shape`.
550
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
551
+ :param denoised_fn: if not None, a function which applies to the
552
+ x_start prediction before it is used to sample.
553
+ :param cond_fn: if not None, this is a gradient function that acts
554
+ similarly to the model.
555
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
556
+ pass to the model. This can be used for conditioning.
557
+ :param device: if specified, the device to create the samples on.
558
+ If not specified, use a model parameter's device.
559
+ :param progress: if True, show a tqdm progress bar.
560
+ :param const_noise: If True, will noise all samples with the same noise throughout sampling
561
+ :return: a non-differentiable batch of samples.
562
+ """
563
+ final = None
564
+ if dump_steps is not None:
565
+ dump = []
566
+
567
+ for i, sample in enumerate(
568
+ self.p_sample_loop_progressive(
569
+ model,
570
+ shape,
571
+ noise=noise,
572
+ clip_denoised=clip_denoised,
573
+ denoised_fn=denoised_fn,
574
+ cond_fn=cond_fn,
575
+ model_kwargs=model_kwargs,
576
+ device=device,
577
+ progress=progress,
578
+ skip_timesteps=skip_timesteps,
579
+ init_image=init_image,
580
+ randomize_class=randomize_class,
581
+ cond_fn_with_grad=cond_fn_with_grad,
582
+ const_noise=const_noise,
583
+ )
584
+ ):
585
+ if dump_steps is not None and i in dump_steps:
586
+ dump.append(deepcopy(sample["sample"]))
587
+ final = sample
588
+ if dump_steps is not None:
589
+ return dump
590
+ return final["sample"]
591
+
592
+ def p_sample_loop_progressive(
593
+ self,
594
+ model,
595
+ shape,
596
+ noise=None,
597
+ clip_denoised=True,
598
+ denoised_fn=None,
599
+ cond_fn=None,
600
+ model_kwargs=None,
601
+ device=None,
602
+ progress=False,
603
+ skip_timesteps=0,
604
+ init_image=None,
605
+ randomize_class=False,
606
+ cond_fn_with_grad=False,
607
+ const_noise=False,
608
+ ):
609
+ """
610
+ Generate samples from the model and yield intermediate samples from
611
+ each timestep of diffusion.
612
+
613
+ Arguments are the same as p_sample_loop().
614
+ Returns a generator over dicts, where each dict is the return value of
615
+ p_sample().
616
+ """
617
+ if device is None:
618
+ device = next(model.parameters()).device
619
+ assert isinstance(shape, (tuple, list))
620
+ if noise is not None:
621
+ img = noise
622
+ else:
623
+ img = th.randn(*shape, device=device)
624
+
625
+ if skip_timesteps and init_image is None:
626
+ init_image = th.zeros_like(img)
627
+
628
+ indices = list(range(self.num_timesteps - skip_timesteps))[::-1]
629
+
630
+ if init_image is not None:
631
+ my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0]
632
+ img = self.q_sample(init_image, my_t, img)
633
+
634
+ if progress:
635
+ # Lazy import so that we don't depend on tqdm.
636
+ from tqdm.auto import tqdm
637
+
638
+ indices = tqdm(indices)
639
+
640
+ # number of timestamps to diffuse
641
+ for i in indices:
642
+ t = th.tensor([i] * shape[0], device=device)
643
+ if randomize_class and "y" in model_kwargs:
644
+ model_kwargs["y"] = th.randint(
645
+ low=0,
646
+ high=model.num_classes,
647
+ size=model_kwargs["y"].shape,
648
+ device=model_kwargs["y"].device,
649
+ )
650
+ with th.no_grad():
651
+ sample_fn = (
652
+ self.p_sample_with_grad if cond_fn_with_grad else self.p_sample
653
+ )
654
+ out = sample_fn(
655
+ model,
656
+ img,
657
+ t,
658
+ clip_denoised=clip_denoised,
659
+ denoised_fn=denoised_fn,
660
+ cond_fn=cond_fn,
661
+ model_kwargs=model_kwargs,
662
+ const_noise=const_noise,
663
+ )
664
+ yield out
665
+ img = out["sample"]
666
+
667
+ def ddim_sample(
668
+ self,
669
+ model,
670
+ x,
671
+ t,
672
+ clip_denoised=True,
673
+ denoised_fn=None,
674
+ cond_fn=None,
675
+ model_kwargs=None,
676
+ eta=0.0,
677
+ ):
678
+ """
679
+ Sample x_{t-1} from the model using DDIM.
680
+
681
+ Same usage as p_sample().
682
+ """
683
+ out_orig = self.p_mean_variance(
684
+ model,
685
+ x,
686
+ t,
687
+ clip_denoised=clip_denoised,
688
+ denoised_fn=denoised_fn,
689
+ model_kwargs=model_kwargs,
690
+ )
691
+ if cond_fn is not None:
692
+ out = self.condition_score(
693
+ cond_fn, out_orig, x, t, model_kwargs=model_kwargs
694
+ )
695
+ else:
696
+ out = out_orig
697
+ # Usually our model outputs epsilon, but we re-derive it
698
+ # in case we used x_start or x_prev prediction.
699
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
700
+
701
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
702
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
703
+ sigma = (
704
+ eta
705
+ * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
706
+ * th.sqrt(1 - alpha_bar / alpha_bar_prev)
707
+ )
708
+ noise = th.randn_like(x)
709
+
710
+ mean_pred = (
711
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
712
+ + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps
713
+ )
714
+ nonzero_mask = (
715
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
716
+ ) # no noise when t == 0
717
+ sample = mean_pred + nonzero_mask * sigma * noise
718
+ return {"sample": sample, "pred_xstart": out_orig["pred_xstart"]}
719
+
720
+ def ddim_sample_with_grad(
721
+ self,
722
+ model,
723
+ x,
724
+ t,
725
+ clip_denoised=True,
726
+ denoised_fn=None,
727
+ cond_fn=None,
728
+ model_kwargs=None,
729
+ eta=0.0,
730
+ ):
731
+ """
732
+ Sample x_{t-1} from the model using DDIM.
733
+
734
+ Same usage as p_sample().
735
+ """
736
+ with th.enable_grad():
737
+ x = x.detach().requires_grad_()
738
+ out_orig = self.p_mean_variance(
739
+ model,
740
+ x,
741
+ t,
742
+ clip_denoised=clip_denoised,
743
+ denoised_fn=denoised_fn,
744
+ model_kwargs=model_kwargs,
745
+ )
746
+ if cond_fn is not None:
747
+ out = self.condition_score_with_grad(
748
+ cond_fn, out_orig, x, t, model_kwargs=model_kwargs
749
+ )
750
+ else:
751
+ out = out_orig
752
+
753
+ out["pred_xstart"] = out["pred_xstart"].detach()
754
+ # Usually our model outputs epsilon, but we re-derive it
755
+ # in case we used x_start or x_prev prediction.
756
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
757
+
758
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
759
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
760
+ sigma = (
761
+ eta
762
+ * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
763
+ * th.sqrt(1 - alpha_bar / alpha_bar_prev)
764
+ )
765
+ # Equation 12.
766
+ noise = th.randn_like(x)
767
+ mean_pred = (
768
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
769
+ + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps
770
+ )
771
+ nonzero_mask = (
772
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
773
+ ) # no noise when t == 0
774
+ sample = mean_pred + nonzero_mask * sigma * noise
775
+ return {"sample": sample, "pred_xstart": out_orig["pred_xstart"].detach()}
776
+
777
+ def ddim_reverse_sample(
778
+ self,
779
+ model,
780
+ x,
781
+ t,
782
+ clip_denoised=True,
783
+ denoised_fn=None,
784
+ model_kwargs=None,
785
+ eta=0.0,
786
+ ):
787
+ """
788
+ Sample x_{t+1} from the model using DDIM reverse ODE.
789
+ """
790
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
791
+ out = self.p_mean_variance(
792
+ model,
793
+ x,
794
+ t,
795
+ clip_denoised=clip_denoised,
796
+ denoised_fn=denoised_fn,
797
+ model_kwargs=model_kwargs,
798
+ )
799
+ # Usually our model outputs epsilon, but we re-derive it
800
+ # in case we used x_start or x_prev prediction.
801
+ eps = (
802
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
803
+ - out["pred_xstart"]
804
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
805
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
806
+
807
+ # Equation 12. reversed
808
+ mean_pred = (
809
+ out["pred_xstart"] * th.sqrt(alpha_bar_next)
810
+ + th.sqrt(1 - alpha_bar_next) * eps
811
+ )
812
+
813
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
814
+
815
+ def ddim_sample_loop(
816
+ self,
817
+ model,
818
+ shape,
819
+ noise=None,
820
+ clip_denoised=True,
821
+ denoised_fn=None,
822
+ cond_fn=None,
823
+ model_kwargs=None,
824
+ device=None,
825
+ progress=False,
826
+ eta=0.0,
827
+ skip_timesteps=0,
828
+ init_image=None,
829
+ randomize_class=False,
830
+ cond_fn_with_grad=False,
831
+ dump_steps=None,
832
+ const_noise=False,
833
+ ):
834
+ """
835
+ Generate samples from the model using DDIM.
836
+
837
+ Same usage as p_sample_loop().
838
+ """
839
+ if dump_steps is not None:
840
+ raise NotImplementedError()
841
+ if const_noise == True:
842
+ raise NotImplementedError()
843
+
844
+ final = None
845
+ for sample in self.ddim_sample_loop_progressive(
846
+ model,
847
+ shape,
848
+ noise=noise,
849
+ clip_denoised=clip_denoised,
850
+ denoised_fn=denoised_fn,
851
+ cond_fn=cond_fn,
852
+ model_kwargs=model_kwargs,
853
+ device=device,
854
+ progress=progress,
855
+ eta=eta,
856
+ skip_timesteps=skip_timesteps,
857
+ init_image=init_image,
858
+ randomize_class=randomize_class,
859
+ cond_fn_with_grad=cond_fn_with_grad,
860
+ ):
861
+ final = sample
862
+ return final["pred_xstart"]
863
+
864
+ def ddim_sample_loop_progressive(
865
+ self,
866
+ model,
867
+ shape,
868
+ noise=None,
869
+ clip_denoised=True,
870
+ denoised_fn=None,
871
+ cond_fn=None,
872
+ model_kwargs=None,
873
+ device=None,
874
+ progress=False,
875
+ eta=0.0,
876
+ skip_timesteps=0,
877
+ init_image=None,
878
+ randomize_class=False,
879
+ cond_fn_with_grad=False,
880
+ ):
881
+ """
882
+ Use DDIM to sample from the model and yield intermediate samples from
883
+ each timestep of DDIM.
884
+
885
+ Same usage as p_sample_loop_progressive().
886
+ """
887
+ if device is None:
888
+ device = next(model.parameters()).device
889
+ assert isinstance(shape, (tuple, list))
890
+ if noise is not None:
891
+ img = noise
892
+ else:
893
+ img = th.randn(*shape, device=device)
894
+
895
+ if skip_timesteps and init_image is None:
896
+ init_image = th.zeros_like(img)
897
+
898
+ indices = list(range(self.num_timesteps - skip_timesteps))[::-1]
899
+
900
+ if init_image is not None:
901
+ my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0]
902
+ img = self.q_sample(init_image, my_t, img)
903
+
904
+ if progress:
905
+ # Lazy import so that we don't depend on tqdm.
906
+ from tqdm.auto import tqdm
907
+
908
+ indices = tqdm(indices)
909
+
910
+ for i in indices:
911
+ t = th.tensor([i] * shape[0], device=device)
912
+ if randomize_class and "y" in model_kwargs:
913
+ model_kwargs["y"] = th.randint(
914
+ low=0,
915
+ high=model.num_classes,
916
+ size=model_kwargs["y"].shape,
917
+ device=model_kwargs["y"].device,
918
+ )
919
+ with th.no_grad():
920
+ sample_fn = (
921
+ self.ddim_sample_with_grad
922
+ if cond_fn_with_grad
923
+ else self.ddim_sample
924
+ )
925
+ out = sample_fn(
926
+ model,
927
+ img,
928
+ t,
929
+ clip_denoised=clip_denoised,
930
+ denoised_fn=denoised_fn,
931
+ cond_fn=cond_fn,
932
+ model_kwargs=model_kwargs,
933
+ eta=eta,
934
+ )
935
+ yield out
936
+ img = out["sample"]
937
+
938
+ def plms_sample(
939
+ self,
940
+ model,
941
+ x,
942
+ t,
943
+ clip_denoised=True,
944
+ denoised_fn=None,
945
+ cond_fn=None,
946
+ model_kwargs=None,
947
+ cond_fn_with_grad=False,
948
+ order=2,
949
+ old_out=None,
950
+ ):
951
+ """
952
+ Sample x_{t-1} from the model using Pseudo Linear Multistep.
953
+
954
+ Same usage as p_sample().
955
+ """
956
+ if not int(order) or not 1 <= order <= 4:
957
+ raise ValueError("order is invalid (should be int from 1-4).")
958
+
959
+ def get_model_output(x, t):
960
+ with th.set_grad_enabled(cond_fn_with_grad and cond_fn is not None):
961
+ x = x.detach().requires_grad_() if cond_fn_with_grad else x
962
+ out_orig = self.p_mean_variance(
963
+ model,
964
+ x,
965
+ t,
966
+ clip_denoised=clip_denoised,
967
+ denoised_fn=denoised_fn,
968
+ model_kwargs=model_kwargs,
969
+ )
970
+ if cond_fn is not None:
971
+ if cond_fn_with_grad:
972
+ out = self.condition_score_with_grad(
973
+ cond_fn, out_orig, x, t, model_kwargs=model_kwargs
974
+ )
975
+ x = x.detach()
976
+ else:
977
+ out = self.condition_score(
978
+ cond_fn, out_orig, x, t, model_kwargs=model_kwargs
979
+ )
980
+ else:
981
+ out = out_orig
982
+
983
+ # Usually our model outputs epsilon, but we re-derive it
984
+ # in case we used x_start or x_prev prediction.
985
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
986
+ return eps, out, out_orig
987
+
988
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
989
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
990
+ eps, out, out_orig = get_model_output(x, t)
991
+
992
+ if order > 1 and old_out is None:
993
+ # Pseudo Improved Euler
994
+ old_eps = [eps]
995
+ mean_pred = (
996
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
997
+ + th.sqrt(1 - alpha_bar_prev) * eps
998
+ )
999
+ eps_2, _, _ = get_model_output(mean_pred, t - 1)
1000
+ eps_prime = (eps + eps_2) / 2
1001
+ pred_prime = self._predict_xstart_from_eps(x, t, eps_prime)
1002
+ mean_pred = (
1003
+ pred_prime * th.sqrt(alpha_bar_prev)
1004
+ + th.sqrt(1 - alpha_bar_prev) * eps_prime
1005
+ )
1006
+ else:
1007
+ # Pseudo Linear Multistep (Adams-Bashforth)
1008
+ old_eps = old_out["old_eps"]
1009
+ old_eps.append(eps)
1010
+ cur_order = min(order, len(old_eps))
1011
+ if cur_order == 1:
1012
+ eps_prime = old_eps[-1]
1013
+ elif cur_order == 2:
1014
+ eps_prime = (3 * old_eps[-1] - old_eps[-2]) / 2
1015
+ elif cur_order == 3:
1016
+ eps_prime = (23 * old_eps[-1] - 16 * old_eps[-2] + 5 * old_eps[-3]) / 12
1017
+ elif cur_order == 4:
1018
+ eps_prime = (
1019
+ 55 * old_eps[-1]
1020
+ - 59 * old_eps[-2]
1021
+ + 37 * old_eps[-3]
1022
+ - 9 * old_eps[-4]
1023
+ ) / 24
1024
+ else:
1025
+ raise RuntimeError("cur_order is invalid.")
1026
+ pred_prime = self._predict_xstart_from_eps(x, t, eps_prime)
1027
+ mean_pred = (
1028
+ pred_prime * th.sqrt(alpha_bar_prev)
1029
+ + th.sqrt(1 - alpha_bar_prev) * eps_prime
1030
+ )
1031
+
1032
+ if len(old_eps) >= order:
1033
+ old_eps.pop(0)
1034
+
1035
+ nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
1036
+ sample = mean_pred * nonzero_mask + out["pred_xstart"] * (1 - nonzero_mask)
1037
+
1038
+ return {
1039
+ "sample": sample,
1040
+ "pred_xstart": out_orig["pred_xstart"],
1041
+ "old_eps": old_eps,
1042
+ }
1043
+
1044
+ def plms_sample_loop(
1045
+ self,
1046
+ model,
1047
+ shape,
1048
+ noise=None,
1049
+ clip_denoised=True,
1050
+ denoised_fn=None,
1051
+ cond_fn=None,
1052
+ model_kwargs=None,
1053
+ device=None,
1054
+ progress=False,
1055
+ skip_timesteps=0,
1056
+ init_image=None,
1057
+ randomize_class=False,
1058
+ cond_fn_with_grad=False,
1059
+ order=2,
1060
+ ):
1061
+ """
1062
+ Generate samples from the model using Pseudo Linear Multistep.
1063
+
1064
+ Same usage as p_sample_loop().
1065
+ """
1066
+ final = None
1067
+ for sample in self.plms_sample_loop_progressive(
1068
+ model,
1069
+ shape,
1070
+ noise=noise,
1071
+ clip_denoised=clip_denoised,
1072
+ denoised_fn=denoised_fn,
1073
+ cond_fn=cond_fn,
1074
+ model_kwargs=model_kwargs,
1075
+ device=device,
1076
+ progress=progress,
1077
+ skip_timesteps=skip_timesteps,
1078
+ init_image=init_image,
1079
+ randomize_class=randomize_class,
1080
+ cond_fn_with_grad=cond_fn_with_grad,
1081
+ order=order,
1082
+ ):
1083
+ final = sample
1084
+ return final["sample"]
1085
+
1086
+ def plms_sample_loop_progressive(
1087
+ self,
1088
+ model,
1089
+ shape,
1090
+ noise=None,
1091
+ clip_denoised=True,
1092
+ denoised_fn=None,
1093
+ cond_fn=None,
1094
+ model_kwargs=None,
1095
+ device=None,
1096
+ progress=False,
1097
+ skip_timesteps=0,
1098
+ init_image=None,
1099
+ randomize_class=False,
1100
+ cond_fn_with_grad=False,
1101
+ order=2,
1102
+ ):
1103
+ """
1104
+ Use PLMS to sample from the model and yield intermediate samples from each
1105
+ timestep of PLMS.
1106
+
1107
+ Same usage as p_sample_loop_progressive().
1108
+ """
1109
+ if device is None:
1110
+ device = next(model.parameters()).device
1111
+ assert isinstance(shape, (tuple, list))
1112
+ if noise is not None:
1113
+ img = noise
1114
+ else:
1115
+ img = th.randn(*shape, device=device)
1116
+
1117
+ if skip_timesteps and init_image is None:
1118
+ init_image = th.zeros_like(img)
1119
+
1120
+ indices = list(range(self.num_timesteps - skip_timesteps))[::-1]
1121
+
1122
+ if init_image is not None:
1123
+ my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0]
1124
+ img = self.q_sample(init_image, my_t, img)
1125
+
1126
+ if progress:
1127
+ # Lazy import so that we don't depend on tqdm.
1128
+ from tqdm.auto import tqdm
1129
+
1130
+ indices = tqdm(indices)
1131
+
1132
+ old_out = None
1133
+
1134
+ for i in indices:
1135
+ t = th.tensor([i] * shape[0], device=device)
1136
+ if randomize_class and "y" in model_kwargs:
1137
+ model_kwargs["y"] = th.randint(
1138
+ low=0,
1139
+ high=model.num_classes,
1140
+ size=model_kwargs["y"].shape,
1141
+ device=model_kwargs["y"].device,
1142
+ )
1143
+ with th.no_grad():
1144
+ out = self.plms_sample(
1145
+ model,
1146
+ img,
1147
+ t,
1148
+ clip_denoised=clip_denoised,
1149
+ denoised_fn=denoised_fn,
1150
+ cond_fn=cond_fn,
1151
+ model_kwargs=model_kwargs,
1152
+ cond_fn_with_grad=cond_fn_with_grad,
1153
+ order=order,
1154
+ old_out=old_out,
1155
+ )
1156
+ yield out
1157
+ old_out = out
1158
+ img = out["sample"]
1159
+
1160
+ def _vb_terms_bpd(
1161
+ self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
1162
+ ):
1163
+ """
1164
+ Get a term for the variational lower-bound.
1165
+
1166
+ The resulting units are bits (rather than nats, as one might expect).
1167
+ This allows for comparison to other papers.
1168
+
1169
+ :return: a dict with the following keys:
1170
+ - 'output': a shape [N] tensor of NLLs or KLs.
1171
+ - 'pred_xstart': the x_0 predictions.
1172
+ """
1173
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
1174
+ x_start=x_start, x_t=x_t, t=t
1175
+ )
1176
+ out = self.p_mean_variance(
1177
+ model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
1178
+ )
1179
+ kl = normal_kl(
1180
+ true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
1181
+ )
1182
+ kl = mean_flat(kl) / np.log(2.0)
1183
+
1184
+ decoder_nll = -discretized_gaussian_log_likelihood(
1185
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
1186
+ )
1187
+ assert decoder_nll.shape == x_start.shape
1188
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
1189
+
1190
+ # At the first timestep return the decoder NLL,
1191
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
1192
+ output = th.where((t == 0), decoder_nll, kl)
1193
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
1194
+
1195
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
1196
+ """
1197
+ Compute training losses for a single timestep.
1198
+
1199
+ :param model: the model to evaluate loss on.
1200
+ :param x_start: the [N x C x ...] tensor of inputs.
1201
+ :param t: a batch of timestep indices.
1202
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
1203
+ pass to the model. This can be used for conditioning.
1204
+ :param noise: if specified, the specific Gaussian noise to try to remove.
1205
+ :return: a dict with the key "loss" containing a tensor of shape [N].
1206
+ Some mean or variance settings may also have other keys.
1207
+ """
1208
+ mask = model_kwargs["y"]["mask"]
1209
+ if model_kwargs is None:
1210
+ model_kwargs = {}
1211
+ if noise is None:
1212
+ noise = th.randn_like(x_start)
1213
+ x_t = self.q_sample(
1214
+ x_start, t, noise=noise
1215
+ ) # use the formula to diffuse the starting tensor by t steps
1216
+ terms = {}
1217
+
1218
+ # set random dropout for conditioning in training
1219
+ model_kwargs["cond_drop_prob"] = 0.2
1220
+ model_output = model(x_t, self._scale_timesteps(t), **model_kwargs)
1221
+ target = {
1222
+ ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
1223
+ x_start=x_start, x_t=x_t, t=t
1224
+ )[0],
1225
+ ModelMeanType.START_X: x_start,
1226
+ ModelMeanType.EPSILON: noise,
1227
+ }[self.model_mean_type]
1228
+
1229
+ model_output = model_output.permute(0, 2, 1).unsqueeze(2)
1230
+ assert model_output.shape == target.shape == x_start.shape
1231
+
1232
+ missing_mask = model_kwargs["y"]["missing"][..., 0]
1233
+ missing_mask = missing_mask.unsqueeze(1).unsqueeze(1)
1234
+ missing_mask = mask * missing_mask
1235
+ terms["rot_mse"] = self.masked_l2(target, model_output, missing_mask)
1236
+ if self.lambda_vel > 0.0:
1237
+ target_vel = target[..., 1:] - target[..., :-1]
1238
+ model_output_vel = model_output[..., 1:] - model_output[..., :-1]
1239
+ terms["vel_mse"] = self.masked_l2(
1240
+ target_vel,
1241
+ model_output_vel,
1242
+ mask[:, :, :, 1:],
1243
+ )
1244
+
1245
+ terms["loss"] = terms["rot_mse"] + (self.lambda_vel * terms.get("vel_mse", 0.0))
1246
+
1247
+ with th.no_grad():
1248
+ terms["vb"] = self._vb_terms_bpd(
1249
+ model,
1250
+ x_start,
1251
+ x_t,
1252
+ t,
1253
+ clip_denoised=False,
1254
+ model_kwargs=model_kwargs,
1255
+ )["output"]
1256
+
1257
+ return terms
1258
+
1259
+
1260
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
1261
+ """
1262
+ Extract values from a 1-D numpy array for a batch of indices.
1263
+
1264
+ :param arr: the 1-D numpy array.
1265
+ :param timesteps: a tensor of indices into the array to extract.
1266
+ :param broadcast_shape: a larger shape of K dimensions with the batch
1267
+ dimension equal to the length of timesteps.
1268
+ :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
1269
+ """
1270
+ res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
1271
+ while len(res.shape) < len(broadcast_shape):
1272
+ res = res[..., None]
1273
+ return res.expand(broadcast_shape)
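For reference, a minimal sketch of driving the forward process above, assuming the module is importable as diffusion.gaussian_diffusion (run from the repo root), a simple linear beta schedule, and a hypothetical [N, C, 1, T] pose tensor:

    import numpy as np
    import torch as th
    from diffusion.gaussian_diffusion import (
        GaussianDiffusion, LossType, ModelMeanType, ModelVarType,
    )

    # Hypothetical linear schedule; the real training script may build betas differently.
    betas = np.linspace(1e-4, 0.02, 1000)
    diffusion = GaussianDiffusion(
        betas=betas,
        model_mean_type=ModelMeanType.START_X,   # the model predicts x_0 directly
        model_var_type=ModelVarType.FIXED_SMALL,
        loss_type=LossType.MSE,
    )

    x_start = th.randn(4, 104, 1, 240)            # hypothetical [N, C, 1, T] pose batch
    t = th.randint(0, diffusion.num_timesteps, (4,))
    x_t = diffusion.q_sample(x_start, t)          # sample from q(x_t | x_0); same shape as x_start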
diffusion/losses.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ """
9
+ Helpers for various likelihood-based losses. These are ported from the original
10
+ Ho et al. diffusion models codebase:
11
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py
12
+ """
13
+
14
+ import numpy as np
15
+ import torch as th
16
+
17
+
18
+ def normal_kl(mean1, logvar1, mean2, logvar2):
19
+ """
20
+ Compute the KL divergence between two gaussians.
21
+
22
+ Shapes are automatically broadcasted, so batches can be compared to
23
+ scalars, among other use cases.
24
+ """
25
+ tensor = None
26
+ for obj in (mean1, logvar1, mean2, logvar2):
27
+ if isinstance(obj, th.Tensor):
28
+ tensor = obj
29
+ break
30
+ assert tensor is not None, "at least one argument must be a Tensor"
31
+
32
+ # Force variances to be Tensors. Broadcasting helps convert scalars to
33
+ # Tensors, but it does not work for th.exp().
34
+ logvar1, logvar2 = [
35
+ x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
36
+ for x in (logvar1, logvar2)
37
+ ]
38
+
39
+ return 0.5 * (
40
+ -1.0
41
+ + logvar2
42
+ - logvar1
43
+ + th.exp(logvar1 - logvar2)
44
+ + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
45
+ )
46
+
47
+
48
+ def approx_standard_normal_cdf(x):
49
+ """
50
+ A fast approximation of the cumulative distribution function of the
51
+ standard normal.
52
+ """
53
+ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
54
+
55
+
56
+ def discretized_gaussian_log_likelihood(x, *, means, log_scales):
57
+ """
58
+ Compute the log-likelihood of a Gaussian distribution discretizing to a
59
+ given image.
60
+
61
+ :param x: the target images. It is assumed that this was uint8 values,
62
+ rescaled to the range [-1, 1].
63
+ :param means: the Gaussian mean Tensor.
64
+ :param log_scales: the Gaussian log stddev Tensor.
65
+ :return: a tensor like x of log probabilities (in nats).
66
+ """
67
+ assert x.shape == means.shape == log_scales.shape
68
+ centered_x = x - means
69
+ inv_stdv = th.exp(-log_scales)
70
+ plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
71
+ cdf_plus = approx_standard_normal_cdf(plus_in)
72
+ min_in = inv_stdv * (centered_x - 1.0 / 255.0)
73
+ cdf_min = approx_standard_normal_cdf(min_in)
74
+ log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
75
+ log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
76
+ cdf_delta = cdf_plus - cdf_min
77
+ log_probs = th.where(
78
+ x < -0.999,
79
+ log_cdf_plus,
80
+ th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
81
+ )
82
+ assert log_probs.shape == x.shape
83
+ return log_probs
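A short sketch of these helpers in isolation, assuming the file is importable as diffusion.losses; the division by log(2) mirrors the bits-per-dim conversion used in _vb_terms_bpd:

    import numpy as np
    import torch as th
    from diffusion.losses import discretized_gaussian_log_likelihood, normal_kl

    mean1, logvar1 = th.zeros(8), th.zeros(8)
    mean2, logvar2 = 0.1 * th.ones(8), th.zeros(8)
    kl_bits = normal_kl(mean1, logvar1, mean2, logvar2) / np.log(2.0)

    x = th.zeros(8)  # targets are assumed to be rescaled to [-1, 1]
    nll = -discretized_gaussian_log_likelihood(x, means=mean2, log_scales=0.5 * logvar2)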
diffusion/nn.py ADDED
@@ -0,0 +1,213 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ """
9
+ original code from
10
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/diffusion/gaussian_diffusion.py
11
+ under an MIT license
12
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE
13
+ """
14
+
15
+ """
16
+ Various utilities for neural networks.
17
+ """
18
+
19
+ import math
20
+
21
+ import torch as th
22
+ import torch.nn as nn
23
+
24
+
25
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
26
+ class SiLU(nn.Module):
27
+ def forward(self, x):
28
+ return x * th.sigmoid(x)
29
+
30
+
31
+ class GroupNorm32(nn.GroupNorm):
32
+ def forward(self, x):
33
+ return super().forward(x.float()).type(x.dtype)
34
+
35
+
36
+ def conv_nd(dims, *args, **kwargs):
37
+ """
38
+ Create a 1D, 2D, or 3D convolution module.
39
+ """
40
+ if dims == 1:
41
+ return nn.Conv1d(*args, **kwargs)
42
+ elif dims == 2:
43
+ return nn.Conv2d(*args, **kwargs)
44
+ elif dims == 3:
45
+ return nn.Conv3d(*args, **kwargs)
46
+ raise ValueError(f"unsupported dimensions: {dims}")
47
+
48
+
49
+ def linear(*args, **kwargs):
50
+ """
51
+ Create a linear module.
52
+ """
53
+ return nn.Linear(*args, **kwargs)
54
+
55
+
56
+ def avg_pool_nd(dims, *args, **kwargs):
57
+ """
58
+ Create a 1D, 2D, or 3D average pooling module.
59
+ """
60
+ if dims == 1:
61
+ return nn.AvgPool1d(*args, **kwargs)
62
+ elif dims == 2:
63
+ return nn.AvgPool2d(*args, **kwargs)
64
+ elif dims == 3:
65
+ return nn.AvgPool3d(*args, **kwargs)
66
+ raise ValueError(f"unsupported dimensions: {dims}")
67
+
68
+
69
+ def update_ema(target_params, source_params, rate=0.99):
70
+ """
71
+ Update target parameters to be closer to those of source parameters using
72
+ an exponential moving average.
73
+
74
+ :param target_params: the target parameter sequence.
75
+ :param source_params: the source parameter sequence.
76
+ :param rate: the EMA rate (closer to 1 means slower).
77
+ """
78
+ for targ, src in zip(target_params, source_params):
79
+ targ.detach().mul_(rate).add_(src, alpha=1 - rate)
80
+
81
+
82
+ def zero_module(module):
83
+ """
84
+ Zero out the parameters of a module and return it.
85
+ """
86
+ for p in module.parameters():
87
+ p.detach().zero_()
88
+ return module
89
+
90
+
91
+ def scale_module(module, scale):
92
+ """
93
+ Scale the parameters of a module and return it.
94
+ """
95
+ for p in module.parameters():
96
+ p.detach().mul_(scale)
97
+ return module
98
+
99
+
100
+ def mean_flat(tensor):
101
+ """
102
+ Take the mean over all non-batch dimensions.
103
+ """
104
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
105
+
106
+
107
+ def sum_flat(tensor):
108
+ """
109
+ Take the sum over all non-batch dimensions.
110
+ """
111
+ return tensor.sum(dim=list(range(1, len(tensor.shape))))
112
+
113
+
114
+ def normalization(channels):
115
+ """
116
+ Make a standard normalization layer.
117
+
118
+ :param channels: number of input channels.
119
+ :return: an nn.Module for normalization.
120
+ """
121
+ return GroupNorm32(32, channels)
122
+
123
+
124
+ def timestep_embedding(timesteps, dim, max_period=10000):
125
+ """
126
+ Create sinusoidal timestep embeddings.
127
+
128
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
129
+ These may be fractional.
130
+ :param dim: the dimension of the output.
131
+ :param max_period: controls the minimum frequency of the embeddings.
132
+ :return: an [N x dim] Tensor of positional embeddings.
133
+ """
134
+ half = dim // 2
135
+ freqs = th.exp(
136
+ -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half
137
+ ).to(device=timesteps.device)
138
+ args = timesteps[:, None].float() * freqs[None]
139
+ embedding = th.cat([th.cos(args), th.sin(args)], dim=-1)
140
+ if dim % 2:
141
+ embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1)
142
+ return embedding
143
+
144
+
145
+ def checkpoint(func, inputs, params, flag):
146
+ """
147
+ Evaluate a function without caching intermediate activations, allowing for
148
+ reduced memory at the expense of extra compute in the backward pass.
149
+ :param func: the function to evaluate.
150
+ :param inputs: the argument sequence to pass to `func`.
151
+ :param params: a sequence of parameters `func` depends on but does not
152
+ explicitly take as arguments.
153
+ :param flag: if False, disable gradient checkpointing.
154
+ """
155
+ if flag:
156
+ args = tuple(inputs) + tuple(params)
157
+ return CheckpointFunction.apply(func, len(inputs), *args)
158
+ else:
159
+ return func(*inputs)
160
+
161
+
162
+ class CheckpointFunction(th.autograd.Function):
163
+ @staticmethod
164
+ @th.cuda.amp.custom_fwd
165
+ def forward(ctx, run_function, length, *args):
166
+ ctx.run_function = run_function
167
+ ctx.input_length = length
168
+ ctx.save_for_backward(*args)
169
+ with th.no_grad():
170
+ output_tensors = ctx.run_function(*args[:length])
171
+ return output_tensors
172
+
173
+ @staticmethod
174
+ @th.cuda.amp.custom_bwd
175
+ def backward(ctx, *output_grads):
176
+ args = list(ctx.saved_tensors)
177
+
178
+ # Filter for inputs that require grad. If none, exit early.
179
+ input_indices = [i for (i, x) in enumerate(args) if x.requires_grad]
180
+ if not input_indices:
181
+ return (None, None) + tuple(None for _ in args)
182
+
183
+ with th.enable_grad():
184
+ for i in input_indices:
185
+ if i < ctx.input_length:
186
+ # Not sure why the OAI code does this little
187
+ # dance. It might not be necessary.
188
+ args[i] = args[i].detach().requires_grad_()
189
+ args[i] = args[i].view_as(args[i])
190
+ output_tensors = ctx.run_function(*args[: ctx.input_length])
191
+
192
+ if isinstance(output_tensors, th.Tensor):
193
+ output_tensors = [output_tensors]
194
+
195
+ # Filter for outputs that require grad. If none, exit early.
196
+ out_and_grads = [
197
+ (o, g) for (o, g) in zip(output_tensors, output_grads) if o.requires_grad
198
+ ]
199
+ if not out_and_grads:
200
+ return (None, None) + tuple(None for _ in args)
201
+
202
+ # Compute gradients on the filtered tensors.
203
+ computed_grads = th.autograd.grad(
204
+ [o for (o, g) in out_and_grads],
205
+ [args[i] for i in input_indices],
206
+ [g for (o, g) in out_and_grads],
207
+ )
208
+
209
+ # Reassemble the complete gradient tuple.
210
+ input_grads = [None for _ in args]
211
+ for i, g in zip(input_indices, computed_grads):
212
+ input_grads[i] = g
213
+ return (None, None) + tuple(input_grads)
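A small sketch of the embedding helper above, assuming imports from diffusion.nn:

    import torch as th
    from diffusion.nn import mean_flat, timestep_embedding

    t = th.arange(16)                      # one timestep index per batch element
    emb = timestep_embedding(t, dim=128)   # -> [16, 128] sinusoidal features
    pooled = mean_flat(emb)                # -> [16], mean over the non-batch dimension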
diffusion/resample.py ADDED
@@ -0,0 +1,168 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ """
9
+ original code from
10
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/diffusion/gaussian_diffusion.py
11
+ under an MIT license
12
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE
13
+ """
14
+
15
+ from abc import ABC, abstractmethod
16
+
17
+ import numpy as np
18
+ import torch as th
19
+ import torch.distributed as dist
20
+
21
+
22
+ def create_named_schedule_sampler(name, diffusion):
23
+ """
24
+ Create a ScheduleSampler from a library of pre-defined samplers.
25
+
26
+ :param name: the name of the sampler.
27
+ :param diffusion: the diffusion object to sample for.
28
+ """
29
+ if name == "uniform":
30
+ return UniformSampler(diffusion)
31
+ elif name == "loss-second-moment":
32
+ return LossSecondMomentResampler(diffusion)
33
+ else:
34
+ raise NotImplementedError(f"unknown schedule sampler: {name}")
35
+
36
+
37
+ class ScheduleSampler(ABC):
38
+ """
39
+ A distribution over timesteps in the diffusion process, intended to reduce
40
+ variance of the objective.
41
+
42
+ By default, samplers perform unbiased importance sampling, in which the
43
+ objective's mean is unchanged.
44
+ However, subclasses may override sample() to change how the resampled
45
+ terms are reweighted, allowing for actual changes in the objective.
46
+ """
47
+
48
+ @abstractmethod
49
+ def weights(self):
50
+ """
51
+ Get a numpy array of weights, one per diffusion step.
52
+
53
+ The weights needn't be normalized, but must be positive.
54
+ """
55
+
56
+ def sample(self, batch_size, device):
57
+ """
58
+ Importance-sample timesteps for a batch.
59
+
60
+ :param batch_size: the number of timesteps.
61
+ :param device: the torch device to save to.
62
+ :return: a tuple (timesteps, weights):
63
+ - timesteps: a tensor of timestep indices.
64
+ - weights: a tensor of weights to scale the resulting losses.
65
+ """
66
+ w = self.weights()
67
+ p = w / np.sum(w)
68
+ indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
69
+ indices = th.from_numpy(indices_np).long().to(device)
70
+ weights_np = 1 / (len(p) * p[indices_np])
71
+ weights = th.from_numpy(weights_np).float().to(device)
72
+ return indices, weights
73
+
74
+
75
+ class UniformSampler(ScheduleSampler):
76
+ def __init__(self, diffusion):
77
+ self.diffusion = diffusion
78
+ self._weights = np.ones([diffusion.num_timesteps])
79
+
80
+ def weights(self):
81
+ return self._weights
82
+
83
+
84
+ class LossAwareSampler(ScheduleSampler):
85
+ def update_with_local_losses(self, local_ts, local_losses):
86
+ """
87
+ Update the reweighting using losses from a model.
88
+
89
+ Call this method from each rank with a batch of timesteps and the
90
+ corresponding losses for each of those timesteps.
91
+ This method will perform synchronization to make sure all of the ranks
92
+ maintain the exact same reweighting.
93
+
94
+ :param local_ts: an integer Tensor of timesteps.
95
+ :param local_losses: a 1D Tensor of losses.
96
+ """
97
+ batch_sizes = [
98
+ th.tensor([0], dtype=th.int32, device=local_ts.device)
99
+ for _ in range(dist.get_world_size())
100
+ ]
101
+ dist.all_gather(
102
+ batch_sizes,
103
+ th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
104
+ )
105
+
106
+ # Pad all_gather batches to be the maximum batch size.
107
+ batch_sizes = [x.item() for x in batch_sizes]
108
+ max_bs = max(batch_sizes)
109
+
110
+ timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
111
+ loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
112
+ dist.all_gather(timestep_batches, local_ts)
113
+ dist.all_gather(loss_batches, local_losses)
114
+ timesteps = [
115
+ x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
116
+ ]
117
+ losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
118
+ self.update_with_all_losses(timesteps, losses)
119
+
120
+ @abstractmethod
121
+ def update_with_all_losses(self, ts, losses):
122
+ """
123
+ Update the reweighting using losses from a model.
124
+
125
+ Sub-classes should override this method to update the reweighting
126
+ using losses from the model.
127
+
128
+ This method directly updates the reweighting without synchronizing
129
+ between workers. It is called by update_with_local_losses from all
130
+ ranks with identical arguments. Thus, it should have deterministic
131
+ behavior to maintain state across workers.
132
+
133
+ :param ts: a list of int timesteps.
134
+ :param losses: a list of float losses, one per timestep.
135
+ """
136
+
137
+
138
+ class LossSecondMomentResampler(LossAwareSampler):
139
+ def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
140
+ self.diffusion = diffusion
141
+ self.history_per_term = history_per_term
142
+ self.uniform_prob = uniform_prob
143
+ self._loss_history = np.zeros(
144
+ [diffusion.num_timesteps, history_per_term], dtype=np.float64
145
+ )
146
+ self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)
147
+
148
+ def weights(self):
149
+ if not self._warmed_up():
150
+ return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
151
+ weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
152
+ weights /= np.sum(weights)
153
+ weights *= 1 - self.uniform_prob
154
+ weights += self.uniform_prob / len(weights)
155
+ return weights
156
+
157
+ def update_with_all_losses(self, ts, losses):
158
+ for t, loss in zip(ts, losses):
159
+ if self._loss_counts[t] == self.history_per_term:
160
+ # Shift out the oldest loss term.
161
+ self._loss_history[t, :-1] = self._loss_history[t, 1:]
162
+ self._loss_history[t, -1] = loss
163
+ else:
164
+ self._loss_history[t, self._loss_counts[t]] = loss
165
+ self._loss_counts[t] += 1
166
+
167
+ def _warmed_up(self):
168
+ return (self._loss_counts == self.history_per_term).all()
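A minimal sketch of drawing training timesteps with the uniform sampler, assuming the same hypothetical linear schedule and import paths as the earlier sketches; the loss-second-moment variant additionally needs torch.distributed to be initialized:

    import numpy as np
    import torch as th
    from diffusion.gaussian_diffusion import (
        GaussianDiffusion, LossType, ModelMeanType, ModelVarType,
    )
    from diffusion.resample import create_named_schedule_sampler

    diffusion = GaussianDiffusion(
        betas=np.linspace(1e-4, 0.02, 1000),
        model_mean_type=ModelMeanType.START_X,
        model_var_type=ModelVarType.FIXED_SMALL,
        loss_type=LossType.MSE,
    )
    sampler = create_named_schedule_sampler("uniform", diffusion)
    t, weights = sampler.sample(batch_size=32, device=th.device("cpu"))
    # t picks a diffusion step per batch element; weights rescale the per-sample training loss.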
diffusion/respace.py ADDED
@@ -0,0 +1,145 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ """
9
+ original code from
10
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/diffusion/gaussian_diffusion.py
11
+ under an MIT license
12
+ https://github.com/GuyTevet/motion-diffusion-model/blob/main/LICENSE
13
+ """
14
+
15
+ import numpy as np
16
+ import torch as th
17
+
18
+ from .gaussian_diffusion import GaussianDiffusion
19
+
20
+
21
+ def space_timesteps(num_timesteps, section_counts):
22
+ """
23
+ Create a list of timesteps to use from an original diffusion process,
24
+ given the number of timesteps we want to take from equally-sized portions
25
+ of the original process.
26
+
27
+ For example, if there's 300 timesteps and the section counts are [10,15,20]
28
+ then the first 100 timesteps are strided to be 10 timesteps, the second 100
29
+ are strided to be 15 timesteps, and the final 100 are strided to be 20.
30
+
31
+ If the stride is a string starting with "ddim", then the fixed striding
32
+ from the DDIM paper is used, and only one section is allowed.
33
+
34
+ :param num_timesteps: the number of diffusion steps in the original
35
+ process to divide up.
36
+ :param section_counts: either a list of numbers, or a string containing
37
+ comma-separated numbers, indicating the step count
38
+ per section. As a special case, use "ddimN" where N
39
+ is a number of steps to use the striding from the
40
+ DDIM paper.
41
+ :return: a set of diffusion steps from the original process to use.
42
+ """
43
+ if isinstance(section_counts, str):
44
+ if section_counts.startswith("ddim"):
45
+ desired_count = int(section_counts[len("ddim") :])
46
+ for i in range(1, num_timesteps):
47
+ if len(range(0, num_timesteps, i)) == desired_count:
48
+ return set(range(0, num_timesteps, i))
49
+ raise ValueError(
50
+ f"cannot create exactly {num_timesteps} steps with an integer stride"
51
+ )
52
+ section_counts = [int(x) for x in section_counts.split(",")]
53
+ size_per = num_timesteps // len(section_counts)
54
+ extra = num_timesteps % len(section_counts)
55
+ start_idx = 0
56
+ all_steps = []
57
+ for i, section_count in enumerate(section_counts):
58
+ size = size_per + (1 if i < extra else 0)
59
+ if size < section_count:
60
+ raise ValueError(
61
+ f"cannot divide section of {size} steps into {section_count}"
62
+ )
63
+ if section_count <= 1:
64
+ frac_stride = 1
65
+ else:
66
+ frac_stride = (size - 1) / (section_count - 1)
67
+ cur_idx = 0.0
68
+ taken_steps = []
69
+ for _ in range(section_count):
70
+ taken_steps.append(start_idx + round(cur_idx))
71
+ cur_idx += frac_stride
72
+ all_steps += taken_steps
73
+ start_idx += size
74
+ return set(all_steps)
75
+
76
+
77
+ class SpacedDiffusion(GaussianDiffusion):
78
+ """
79
+ A diffusion process which can skip steps in a base diffusion process.
80
+
81
+ :param use_timesteps: a collection (sequence or set) of timesteps from the
82
+ original diffusion process to retain.
83
+ :param kwargs: the kwargs to create the base diffusion process.
84
+ """
85
+
86
+ def __init__(self, use_timesteps, **kwargs):
87
+ self.use_timesteps = set(use_timesteps)
88
+ self.timestep_map = []
89
+ self.original_num_steps = len(kwargs["betas"])
90
+
91
+ base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
92
+ last_alpha_cumprod = 1.0
93
+ new_betas = []
94
+ for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
95
+ if i in self.use_timesteps:
96
+ new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
97
+ last_alpha_cumprod = alpha_cumprod
98
+ self.timestep_map.append(i)
99
+ kwargs["betas"] = np.array(new_betas)
100
+ super().__init__(**kwargs)
101
+
102
+ def p_mean_variance(
103
+ self, model, *args, **kwargs
104
+ ): # pylint: disable=signature-differs
105
+ return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
106
+
107
+ def training_losses(
108
+ self, model, *args, **kwargs
109
+ ): # pylint: disable=signature-differs
110
+ return super().training_losses(self._wrap_model(model), *args, **kwargs)
111
+
112
+ def condition_mean(self, cond_fn, *args, **kwargs):
113
+ return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
114
+
115
+ def condition_score(self, cond_fn, *args, **kwargs):
116
+ return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
117
+
118
+ def _wrap_model(self, model):
119
+ if isinstance(model, _WrappedModel):
120
+ return model
121
+ return _WrappedModel(
122
+ model, self.timestep_map, self.rescale_timesteps, self.original_num_steps
123
+ )
124
+
125
+ def _scale_timesteps(self, t):
126
+ # Scaling is done by the wrapped model.
127
+ return t
128
+
129
+
130
+ class _WrappedModel:
131
+ def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
132
+ self.model = model
133
+ if hasattr(model, "step"):
134
+ self.step = model.step
135
+ self.add_frame_cond = model.add_frame_cond
136
+ self.timestep_map = timestep_map
137
+ self.rescale_timesteps = rescale_timesteps
138
+ self.original_num_steps = original_num_steps
139
+
140
+ def __call__(self, x, ts, **kwargs):
141
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
142
+ new_ts = map_tensor[ts]
143
+ if self.rescale_timesteps:
144
+ new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
145
+ return self.model(x, new_ts, **kwargs)
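The spacing utility above is easiest to see on concrete inputs. A minimal sketch, assuming only the diffusion.respace module added in this commit; the step counts here are illustrative, not the repo's defaults:

from diffusion.respace import space_timesteps

# 100-step base process, evenly re-spaced to 10 steps (a single section)
print(sorted(space_timesteps(100, [10])))      # [0, 11, 22, ..., 99]

# fixed DDIM striding: exactly 25 evenly strided steps
print(sorted(space_timesteps(100, "ddim25")))  # [0, 4, 8, ..., 96]

A SpacedDiffusion built from such a set keeps only the retained betas and remaps timesteps through _WrappedModel, so sampling can run with far fewer steps than the base process was trained with.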
flagged/audio/b90d90dbca93f47e8d01/audio.wav ADDED
Binary file (696 kB). View file
 
flagged/audio/d8e03e2e6deae2f981b1/audio.wav ADDED
Binary file (696 kB). View file
 
flagged/log.csv ADDED
@@ -0,0 +1,4 @@
1
+ audio,Number of Samples (default = 3),Sample Diversity (default = 0.97),output 0,output 1,output 2,output 3,output 4,output 5,output 6,output 7,output 8,output 9,flag,username,timestamp
2
+ ,1,0.69,,,,,,,,,,,,,2024-07-15 05:46:49.672259
3
+ flagged/audio/d8e03e2e6deae2f981b1/audio.wav,1,0.69,,,,,,,,,,,,,2024-07-15 06:28:21.003877
4
+ flagged/audio/b90d90dbca93f47e8d01/audio.wav,1,0.69,,,,,,,,,,,,,2024-07-15 06:28:24.442449
model/cfg_sampler.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ from copy import deepcopy
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+
15
+ # A wrapper model for Classifier-free guidance **SAMPLING** only
16
+ # https://arxiv.org/abs/2207.12598
17
+ class ClassifierFreeSampleModel(nn.Module):
18
+ def __init__(self, model):
19
+ super().__init__()
20
+ self.model = model # model is the actual model to run
21
+ self.nfeats = self.model.nfeats
22
+ self.cond_mode = self.model.cond_mode
23
+ self.add_frame_cond = self.model.add_frame_cond
24
+ if self.add_frame_cond is not None:
25
+ if self.model.resume_trans is not None:
26
+ self.transformer = self.model.transformer
27
+ self.tokenizer = self.model.tokenizer
28
+ self.step = self.model.step
29
+
30
+ def forward(self, x, timesteps, y=None):
31
+ out = self.model(x, timesteps, y, cond_drop_prob=0.0)
32
+ out_uncond = self.model(x, timesteps, y, cond_drop_prob=1.0)
33
+ return out_uncond + (y["scale"].view(-1, 1, 1) * (out - out_uncond))
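For reference, a self-contained sketch of how this wrapper mixes the two passes; DummyModel below is hypothetical and only mimics the attributes the wrapper reads, it is not the repo's FiLMTransformer:

import torch
import torch.nn as nn
from model.cfg_sampler import ClassifierFreeSampleModel

class DummyModel(nn.Module):
    nfeats = 8
    cond_mode = "audio"
    add_frame_cond = None   # face-style model: no keyframe conditioning
    resume_trans = None     # defensive: attributes the wrapper may touch
    step = 30
    def forward(self, x, timesteps, y=None, cond_drop_prob=0.0):
        # pretend the unconditional pass (cond_drop_prob=1.0) predicts zeros
        return torch.zeros_like(x) if cond_drop_prob == 1.0 else x

sampler = ClassifierFreeSampleModel(DummyModel())
x = torch.randn(2, 16, 8)
y = {"scale": torch.full((2,), 2.0)}
guided = sampler(x, torch.zeros(2), y)  # = out_uncond + 2.0 * (out - out_uncond) = 2 * x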
model/diffusion.py ADDED
@@ -0,0 +1,403 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import json
9
+ from typing import Callable, Optional
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from einops import rearrange
14
+ from einops.layers.torch import Rearrange
15
+
16
+ from model.guide import GuideTransformer
17
+ from model.modules.audio_encoder import Wav2VecEncoder
18
+ from model.modules.rotary_embedding_torch import RotaryEmbedding
19
+ from model.modules.transformer_modules import (
20
+ DecoderLayerStack,
21
+ FiLMTransformerDecoderLayer,
22
+ RegressionTransformer,
23
+ TransformerEncoderLayerRotary,
24
+ )
25
+ from model.utils import (
26
+ init_weight,
27
+ PositionalEncoding,
28
+ prob_mask_like,
29
+ setup_lip_regressor,
30
+ SinusoidalPosEmb,
31
+ )
32
+ from model.vqvae import setup_tokenizer
33
+ from torch.nn import functional as F
34
+ from utils.misc import prGreen, prRed
35
+
36
+
37
+ class Audio2LipRegressionTransformer(torch.nn.Module):
38
+ def __init__(
39
+ self,
40
+ n_vertices: int = 338,
41
+ causal: bool = False,
42
+ train_wav2vec: bool = False,
43
+ transformer_encoder_layers: int = 2,
44
+ transformer_decoder_layers: int = 4,
45
+ ):
46
+ super().__init__()
47
+ self.n_vertices = n_vertices
48
+
49
+ self.audio_encoder = Wav2VecEncoder()
50
+ if not train_wav2vec:
51
+ self.audio_encoder.eval()
52
+ for param in self.audio_encoder.parameters():
53
+ param.requires_grad = False
54
+
55
+ self.regression_model = RegressionTransformer(
56
+ transformer_encoder_layers=transformer_encoder_layers,
57
+ transformer_decoder_layers=transformer_decoder_layers,
58
+ d_model=512,
59
+ d_cond=512,
60
+ num_heads=4,
61
+ causal=causal,
62
+ )
63
+ self.project_output = torch.nn.Linear(512, self.n_vertices * 3)
64
+
65
+ def forward(self, audio):
66
+ """
67
+ :param audio: tensor of shape B x T x 1600
68
+ :return: tensor of shape B x T x n_vertices x 3 containing reconstructed lip geometry
69
+ """
70
+ B, T = audio.shape[0], audio.shape[1]
71
+
72
+ cond = self.audio_encoder(audio)
73
+
74
+ x = torch.zeros(B, T, 512, device=audio.device)
75
+ x = self.regression_model(x, cond)
76
+ x = self.project_output(x)
77
+
78
+ verts = x.view(B, T, self.n_vertices, 3)
79
+ return verts
80
+
81
+
82
+ class FiLMTransformer(nn.Module):
83
+ def __init__(
84
+ self,
85
+ args,
86
+ nfeats: int,
87
+ latent_dim: int = 512,
88
+ ff_size: int = 1024,
89
+ num_layers: int = 4,
90
+ num_heads: int = 4,
91
+ dropout: float = 0.1,
92
+ cond_feature_dim: int = 4800,
93
+ activation: Callable[[torch.Tensor], torch.Tensor] = F.gelu,
94
+ use_rotary: bool = True,
95
+ cond_mode: str = "audio",
96
+ split_type: str = "train",
97
+ device: str = "cuda",
98
+ **kwargs,
99
+ ) -> None:
100
+ super().__init__()
101
+ self.nfeats = nfeats
102
+ self.cond_mode = cond_mode
103
+ self.cond_feature_dim = cond_feature_dim
104
+ self.add_frame_cond = args.add_frame_cond
105
+ self.data_format = args.data_format
106
+ self.split_type = split_type
107
+ self.device = device
108
+
109
+ # positional embeddings
110
+ self.rotary = None
111
+ self.abs_pos_encoding = nn.Identity()
112
+ # if rotary, replace absolute embedding with a rotary embedding instance (absolute becomes an identity)
113
+ if use_rotary:
114
+ self.rotary = RotaryEmbedding(dim=latent_dim)
115
+ else:
116
+ self.abs_pos_encoding = PositionalEncoding(
117
+ latent_dim, dropout, batch_first=True
118
+ )
119
+
120
+ # time embedding processing
121
+ self.time_mlp = nn.Sequential(
122
+ SinusoidalPosEmb(latent_dim),
123
+ nn.Linear(latent_dim, latent_dim * 4),
124
+ nn.Mish(),
125
+ )
126
+ self.to_time_cond = nn.Sequential(
127
+ nn.Linear(latent_dim * 4, latent_dim),
128
+ )
129
+ self.to_time_tokens = nn.Sequential(
130
+ nn.Linear(latent_dim * 4, latent_dim * 2),
131
+ Rearrange("b (r d) -> b r d", r=2),
132
+ )
133
+
134
+ # null embeddings for guidance dropout
135
+ self.seq_len = args.max_seq_length
136
+ emb_len = 1998 # hardcoded for now
137
+ self.null_cond_embed = nn.Parameter(torch.randn(1, emb_len, latent_dim))
138
+ self.null_cond_hidden = nn.Parameter(torch.randn(1, latent_dim))
139
+ self.norm_cond = nn.LayerNorm(latent_dim)
140
+ self.setup_audio_models()
141
+
142
+ # set up pose/face specific parts of the model
143
+ self.input_projection = nn.Linear(self.nfeats, latent_dim)
144
+ if self.data_format == "pose":
145
+ cond_feature_dim = 1024
146
+ key_feature_dim = 104
147
+ self.step = 30
148
+ self.use_cm = True
149
+ self.setup_guide_models(args, latent_dim, key_feature_dim)
150
+ self.post_pose_layers = self._build_single_pose_conv(self.nfeats)
151
+ self.post_pose_layers.apply(init_weight)
152
+ self.final_conv = torch.nn.Conv1d(self.nfeats, self.nfeats, kernel_size=1)
153
+ self.receptive_field = 25
154
+ elif self.data_format == "face":
155
+ self.use_cm = False
156
+ cond_feature_dim = 1024 + 1014
157
+ self.setup_lip_models()
158
+ self.cond_encoder = nn.Sequential()
159
+ for _ in range(2):
160
+ self.cond_encoder.append(
161
+ TransformerEncoderLayerRotary(
162
+ d_model=latent_dim,
163
+ nhead=num_heads,
164
+ dim_feedforward=ff_size,
165
+ dropout=dropout,
166
+ activation=activation,
167
+ batch_first=True,
168
+ rotary=self.rotary,
169
+ )
170
+ )
171
+ self.cond_encoder.apply(init_weight)
172
+
173
+ self.cond_projection = nn.Linear(cond_feature_dim, latent_dim)
174
+ self.non_attn_cond_projection = nn.Sequential(
175
+ nn.LayerNorm(latent_dim),
176
+ nn.Linear(latent_dim, latent_dim),
177
+ nn.SiLU(),
178
+ nn.Linear(latent_dim, latent_dim),
179
+ )
180
+
181
+ # decoder
182
+ decoderstack = nn.ModuleList([])
183
+ for _ in range(num_layers):
184
+ decoderstack.append(
185
+ FiLMTransformerDecoderLayer(
186
+ latent_dim,
187
+ num_heads,
188
+ dim_feedforward=ff_size,
189
+ dropout=dropout,
190
+ activation=activation,
191
+ batch_first=True,
192
+ rotary=self.rotary,
193
+ use_cm=self.use_cm,
194
+ )
195
+ )
196
+ self.seqTransDecoder = DecoderLayerStack(decoderstack)
197
+ self.seqTransDecoder.apply(init_weight)
198
+ self.final_layer = nn.Linear(latent_dim, self.nfeats)
199
+ self.final_layer.apply(init_weight)
200
+
201
+ def _build_single_pose_conv(self, nfeats: int) -> nn.ModuleList:
202
+ post_pose_layers = torch.nn.ModuleList(
203
+ [
204
+ torch.nn.Conv1d(nfeats, max(256, nfeats), kernel_size=3, dilation=1),
205
+ torch.nn.Conv1d(max(256, nfeats), nfeats, kernel_size=3, dilation=2),
206
+ torch.nn.Conv1d(nfeats, nfeats, kernel_size=3, dilation=3),
207
+ torch.nn.Conv1d(nfeats, nfeats, kernel_size=3, dilation=1),
208
+ torch.nn.Conv1d(nfeats, nfeats, kernel_size=3, dilation=2),
209
+ torch.nn.Conv1d(nfeats, nfeats, kernel_size=3, dilation=3),
210
+ ]
211
+ )
212
+ return post_pose_layers
213
+
214
+ def _run_single_pose_conv(self, output: torch.Tensor) -> torch.Tensor:
215
+ output = torch.nn.functional.pad(output, pad=[self.receptive_field - 1, 0])
216
+ for _, layer in enumerate(self.post_pose_layers):
217
+ y = torch.nn.functional.leaky_relu(layer(output), negative_slope=0.2)
218
+ if self.split_type == "train":
219
+ y = torch.nn.functional.dropout(y, 0.2)
220
+ if output.shape[1] == y.shape[1]:
221
+ output = (output[:, :, -y.shape[-1] :] + y) / 2.0 # skip connection
222
+ else:
223
+ output = y
224
+ return output
225
+
226
+ def setup_guide_models(self, args, latent_dim: int, key_feature_dim: int) -> None:
227
+ # set up conditioning info
228
+ max_keyframe_len = len(list(range(self.seq_len))[:: self.step])
229
+ self.null_pose_embed = nn.Parameter(
230
+ torch.randn(1, max_keyframe_len, latent_dim)
231
+ )
232
+ prGreen(f"using keyframes: {self.null_pose_embed.shape}")
233
+ self.frame_cond_projection = nn.Linear(key_feature_dim, latent_dim)
234
+ self.frame_norm_cond = nn.LayerNorm(latent_dim)
235
+ # for test time set up keyframe transformer
236
+ self.resume_trans = None
237
+ if self.split_type == "test":
238
+ if hasattr(args, "resume_trans") and args.resume_trans is not None:
239
+ self.resume_trans = args.resume_trans
240
+ self.setup_guide_predictor(args.resume_trans)
241
+ else:
242
+ prRed("not using transformer, just using ground truth")
243
+
244
+ def setup_guide_predictor(self, cp_path: str) -> None:
245
+ cp_dir = cp_path.split("checkpoints/iter-")[0]
246
+ with open(f"{cp_dir}/args.json") as f:
247
+ trans_args = json.load(f)
248
+
249
+ # set up tokenizer based on trans_arg load point
250
+ self.tokenizer = setup_tokenizer(trans_args["resume_pth"])
251
+
252
+ # set up transformer
253
+ self.transformer = GuideTransformer(
254
+ tokens=self.tokenizer.n_clusters,
255
+ num_layers=trans_args["layers"],
256
+ dim=trans_args["dim"],
257
+ emb_len=1998,
258
+ num_audio_layers=trans_args["num_audio_layers"],
259
+ )
260
+ for param in self.transformer.parameters():
261
+ param.requires_grad = False
262
+ prGreen("loading TRANSFORMER checkpoint from {}".format(cp_path))
263
+ cp = torch.load(cp_path)
264
+ missing_keys, unexpected_keys = self.transformer.load_state_dict(
265
+ cp["model_state_dict"], strict=False
266
+ )
267
+ assert len(missing_keys) == 0, missing_keys
268
+ assert len(unexpected_keys) == 0, unexpected_keys
269
+
270
+ def setup_audio_models(self) -> None:
271
+ self.audio_model, self.audio_resampler = setup_lip_regressor()
272
+
273
+ def setup_lip_models(self) -> None:
274
+ self.lip_model = Audio2LipRegressionTransformer()
275
+ cp_path = "./assets/iter-0200000.pt"
276
+ cp = torch.load(cp_path, map_location=torch.device(self.device))
277
+ self.lip_model.load_state_dict(cp["model_state_dict"])
278
+ for param in self.lip_model.parameters():
279
+ param.requires_grad = False
280
+ prGreen(f"adding lip conditioning {cp_path}")
281
+
282
+ def parameters_w_grad(self):
283
+ return [p for p in self.parameters() if p.requires_grad]
284
+
285
+ def encode_audio(self, raw_audio: torch.Tensor) -> torch.Tensor:
286
+ device = next(self.parameters()).device
287
+ a0 = self.audio_resampler(raw_audio[:, :, 0].to(device))
288
+ a1 = self.audio_resampler(raw_audio[:, :, 1].to(device))
289
+ with torch.no_grad():
290
+ z0 = self.audio_model.feature_extractor(a0)
291
+ z1 = self.audio_model.feature_extractor(a1)
292
+ emb = torch.cat((z0, z1), axis=1).permute(0, 2, 1)
293
+ return emb
294
+
295
+ def encode_lip(self, audio: torch.Tensor, cond_embed: torch.Tensor) -> torch.Tensor:
296
+ reshaped_audio = audio.reshape((audio.shape[0], -1, 1600, 2))[..., 0]
297
+ # processes 4 seconds at a time
298
+ B, T, _ = reshaped_audio.shape
299
+ lip_cond = torch.zeros(
300
+ (audio.shape[0], T, 338, 3),
301
+ device=audio.device,
302
+ dtype=audio.dtype,
303
+ )
304
+ for i in range(0, T, 120):
305
+ lip_cond[:, i : i + 120, ...] = self.lip_model(
306
+ reshaped_audio[:, i : i + 120, ...]
307
+ )
308
+ lip_cond = lip_cond.permute(0, 2, 3, 1).reshape((B, 338 * 3, -1))
309
+ lip_cond = torch.nn.functional.interpolate(
310
+ lip_cond, size=cond_embed.shape[1], mode="nearest-exact"
311
+ ).permute(0, 2, 1)
312
+ cond_embed = torch.cat((cond_embed, lip_cond), dim=-1)
313
+ return cond_embed
314
+
315
+ def encode_keyframes(
316
+ self, y: torch.Tensor, cond_drop_prob: float, batch_size: int
317
+ ) -> torch.Tensor:
318
+ pred = y["keyframes"]
319
+ new_mask = y["mask"][..., :: self.step].squeeze((1, 2))
320
+ pred[~new_mask] = 0.0 # pad the unknown
321
+ pose_hidden = self.frame_cond_projection(pred.detach().clone().cuda())
322
+ pose_embed = self.abs_pos_encoding(pose_hidden)
323
+ pose_tokens = self.frame_norm_cond(pose_embed)
324
+ # do conditional dropout for guide poses
325
+ key_cond_drop_prob = cond_drop_prob
326
+ keep_mask_pose = prob_mask_like(
327
+ (batch_size,), 1 - key_cond_drop_prob, device=pose_tokens.device
328
+ )
329
+ keep_mask_pose_embed = rearrange(keep_mask_pose, "b -> b 1 1")
330
+ null_pose_embed = self.null_pose_embed.to(pose_tokens.dtype)
331
+ pose_tokens = torch.where(
332
+ keep_mask_pose_embed,
333
+ pose_tokens,
334
+ null_pose_embed[:, : pose_tokens.shape[1], :],
335
+ )
336
+ return pose_tokens
337
+
338
+ def forward(
339
+ self,
340
+ x: torch.Tensor,
341
+ times: torch.Tensor,
342
+ y: Optional[dict] = None,
343
+ cond_drop_prob: float = 0.0,
344
+ ) -> torch.Tensor:
345
+ if x.dim() == 4:
346
+ x = x.permute(0, 3, 1, 2).squeeze(-1)
347
+ batch_size, device = x.shape[0], x.device
348
+ if self.cond_mode == "uncond":
349
+ cond_embed = torch.zeros(
350
+ (x.shape[0], x.shape[1], self.cond_feature_dim),
351
+ dtype=x.dtype,
352
+ device=x.device,
353
+ )
354
+ else:
355
+ cond_embed = y["audio"]
356
+ cond_embed = self.encode_audio(cond_embed)
357
+ if self.data_format == "face":
358
+ cond_embed = self.encode_lip(y["audio"], cond_embed)
359
+ pose_tokens = None
360
+ if self.data_format == "pose":
361
+ pose_tokens = self.encode_keyframes(y, cond_drop_prob, batch_size)
362
+ assert cond_embed is not None, "cond emb should not be none"
363
+ # process conditioning information
364
+ x = self.input_projection(x)
365
+ x = self.abs_pos_encoding(x)
366
+ audio_cond_drop_prob = cond_drop_prob
367
+ keep_mask = prob_mask_like(
368
+ (batch_size,), 1 - audio_cond_drop_prob, device=device
369
+ )
370
+ keep_mask_embed = rearrange(keep_mask, "b -> b 1 1")
371
+ keep_mask_hidden = rearrange(keep_mask, "b -> b 1")
372
+ cond_tokens = self.cond_projection(cond_embed)
373
+ cond_tokens = self.abs_pos_encoding(cond_tokens)
374
+ if self.data_format == "face":
375
+ cond_tokens = self.cond_encoder(cond_tokens)
376
+ null_cond_embed = self.null_cond_embed.to(cond_tokens.dtype)
377
+ cond_tokens = torch.where(
378
+ keep_mask_embed, cond_tokens, null_cond_embed[:, : cond_tokens.shape[1], :]
379
+ )
380
+ mean_pooled_cond_tokens = cond_tokens.mean(dim=-2)
381
+ cond_hidden = self.non_attn_cond_projection(mean_pooled_cond_tokens)
382
+
383
+ # create t conditioning
384
+ t_hidden = self.time_mlp(times)
385
+ t = self.to_time_cond(t_hidden)
386
+ t_tokens = self.to_time_tokens(t_hidden)
387
+ null_cond_hidden = self.null_cond_hidden.to(t.dtype)
388
+ cond_hidden = torch.where(keep_mask_hidden, cond_hidden, null_cond_hidden)
389
+ t += cond_hidden
390
+
391
+ # cross-attention conditioning
392
+ c = torch.cat((cond_tokens, t_tokens), dim=-2)
393
+ cond_tokens = self.norm_cond(c)
394
+
395
+ # Pass through the transformer decoder
396
+ output = self.seqTransDecoder(x, cond_tokens, t, memory2=pose_tokens)
397
+ output = self.final_layer(output)
398
+ if self.data_format == "pose":
399
+ output = output.permute(0, 2, 1)
400
+ output = self._run_single_pose_conv(output)
401
+ output = self.final_conv(output)
402
+ output = output.permute(0, 2, 1)
403
+ return output
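The null-embedding logic in forward() is what makes classifier-free guidance possible at sampling time. A self-contained sketch of that pattern with hypothetical tensors (not the repo's modules or shapes):

import torch
from einops import rearrange

batch, seq_len, dim = 4, 1998, 512
cond_tokens = torch.randn(batch, seq_len, dim)   # projected audio tokens
null_cond_embed = torch.randn(1, seq_len, dim)   # learned null embedding
cond_drop_prob = 0.2

keep_mask = torch.rand(batch) > cond_drop_prob   # True = keep the conditioning
keep_mask_embed = rearrange(keep_mask, "b -> b 1 1")
cond_tokens = torch.where(keep_mask_embed, cond_tokens, null_cond_embed)

During training some batch elements therefore see the null embedding instead of audio, and at test time ClassifierFreeSampleModel exploits that by running the conditional and unconditional branches explicitly.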
model/guide.py ADDED
@@ -0,0 +1,222 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ from typing import Callable, List
9
+
10
+ import torch
11
+ import torch as th
12
+ import torch.nn as nn
13
+ from einops import rearrange
14
+ from model.modules.rotary_embedding_torch import RotaryEmbedding
15
+
16
+ from model.modules.transformer_modules import (
17
+ DecoderLayerStack,
18
+ FiLMTransformerDecoderLayer,
19
+ PositionalEncoding,
20
+ )
21
+ from model.utils import prob_mask_like, setup_lip_regressor
22
+ from torch.distributions import Categorical
23
+ from torch.nn import functional as F
24
+
25
+
26
+ class GuideTransformer(nn.Module):
27
+ def __init__(
28
+ self,
29
+ tokens: int,
30
+ num_heads: int = 4,
31
+ num_layers: int = 4,
32
+ dim: int = 512,
33
+ ff_size: int = 1024,
34
+ dropout: float = 0.1,
35
+ activation: Callable = F.gelu,
36
+ use_rotary: bool = True,
37
+ cond_feature_dim: int = 1024,
38
+ emb_len: int = 798,
39
+ num_audio_layers: int = 2,
40
+ ):
41
+ super().__init__()
42
+ self.tokens = tokens
43
+ self.token_embedding = th.nn.Embedding(
44
+ num_embeddings=tokens + 1, # account for sequence start and end tokens
45
+ embedding_dim=dim,
46
+ )
47
+ self.abs_pos_encoding = nn.Identity()
48
+ # if rotary, replace absolute embedding with a rotary embedding instance (absolute becomes an identity)
49
+ if use_rotary:
50
+ self.rotary = RotaryEmbedding(dim=dim)
51
+ else:
52
+ self.abs_pos_encoding = PositionalEncoding(dim, dropout, batch_first=True)
53
+ self.setup_audio_models(cond_feature_dim, num_audio_layers)
54
+
55
+ self.null_cond_embed = nn.Parameter(torch.randn(1, emb_len, dim))
56
+ self.null_cond_hidden = nn.Parameter(torch.randn(1, dim))
57
+ self.norm_cond = nn.LayerNorm(dim)
58
+
59
+ self.cond_projection = nn.Linear(cond_feature_dim, dim)
60
+ self.non_attn_cond_projection = nn.Sequential(
61
+ nn.LayerNorm(dim),
62
+ nn.Linear(dim, dim),
63
+ nn.SiLU(),
64
+ nn.Linear(dim, dim),
65
+ )
66
+ # decoder
67
+ decoderstack = nn.ModuleList([])
68
+ for _ in range(num_layers):
69
+ decoderstack.append(
70
+ FiLMTransformerDecoderLayer(
71
+ dim,
72
+ num_heads,
73
+ dim_feedforward=ff_size,
74
+ dropout=dropout,
75
+ activation=activation,
76
+ batch_first=True,
77
+ rotary=self.rotary,
78
+ )
79
+ )
80
+ self.seqTransDecoder = DecoderLayerStack(decoderstack)
81
+ self.final_layer = nn.Linear(dim, tokens)
82
+
83
+ def _build_single_audio_conv(self, c: int) -> List[nn.Module]:
84
+ return [
85
+ torch.nn.Conv1d(c, max(256, c), kernel_size=3, dilation=1),
86
+ torch.nn.LeakyReLU(negative_slope=0.2),
87
+ torch.nn.Dropout(0.2),
88
+ #
89
+ torch.nn.Conv1d(max(256, c), max(256, c), kernel_size=3, dilation=2),
90
+ torch.nn.LeakyReLU(negative_slope=0.2),
91
+ torch.nn.Dropout(0.2),
92
+ #
93
+ torch.nn.Conv1d(max(128, c), max(128, c), kernel_size=3, dilation=3),
94
+ torch.nn.LeakyReLU(negative_slope=0.2),
95
+ torch.nn.Dropout(0.2),
96
+ #
97
+ torch.nn.Conv1d(max(128, c), c, kernel_size=3, dilation=1),
98
+ torch.nn.LeakyReLU(negative_slope=0.2),
99
+ torch.nn.Dropout(0.2),
100
+ #
101
+ torch.nn.Conv1d(c, c, kernel_size=3, dilation=2),
102
+ torch.nn.LeakyReLU(negative_slope=0.2),
103
+ torch.nn.Dropout(0.2),
104
+ #
105
+ torch.nn.Conv1d(c, c, kernel_size=3, dilation=3),
106
+ torch.nn.LeakyReLU(negative_slope=0.2),
107
+ torch.nn.Dropout(0.2),
108
+ ]
109
+
110
+ def setup_audio_models(self, cond_feature_dim: int, num_audio_layers: int) -> None:
111
+ pre_layers = []
112
+ for _ in range(num_audio_layers):
113
+ pre_layers += self._build_single_audio_conv(cond_feature_dim)
114
+ pre_layers += [
115
+ torch.nn.Conv1d(cond_feature_dim, cond_feature_dim, kernel_size=1)
116
+ ]
117
+ pre_layers = torch.nn.ModuleList(pre_layers)
118
+ self.pre_audio = nn.Sequential(*pre_layers)
119
+ self.audio_model, self.audio_resampler = setup_lip_regressor()
120
+
121
+ def encode_audio(self, raw_audio: torch.Tensor) -> torch.Tensor:
122
+ device = next(self.parameters()).device
123
+ a0 = self.audio_resampler(raw_audio[:, :, 0].to(device)) # B x T
124
+ a1 = self.audio_resampler(raw_audio[:, :, 1].to(device)) # B x T
125
+ with torch.no_grad():
126
+ z0 = self.audio_model.feature_extractor(a0)
127
+ z1 = self.audio_model.feature_extractor(a1)
128
+ emb = torch.cat((z0, z1), axis=1).permute(0, 2, 1)
129
+ return emb
130
+
131
+ def get_tgt_mask(self, size: int, device: str) -> torch.Tensor:
132
+ mask = torch.tril(
133
+ torch.ones((size, size), device=device) == 1
134
+ ) # Lower triangular matrix
135
+ mask = mask.float()
136
+ mask = mask.masked_fill(mask == 0, float("-inf")) # Convert zeros to -inf
137
+ mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0
138
+ return mask
139
+
140
+ def forward(
141
+ self, tokens: th.Tensor, condition: th.Tensor, cond_drop_prob: float = 0.0
142
+ ) -> torch.Tensor:
143
+ batch_size, device = tokens.shape[0], tokens.device
144
+
145
+ x = self.token_embedding(tokens)
146
+ x = self.abs_pos_encoding(x)
147
+ tgt_mask = self.get_tgt_mask(x.shape[1], x.device)
148
+
149
+ cond_embed = self.encode_audio(condition)
150
+ keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device=device)
151
+ keep_mask_embed = rearrange(keep_mask, "b -> b 1 1")
152
+ keep_mask_hidden = rearrange(keep_mask, "b -> b 1")
153
+ cond_tokens = self.pre_audio(cond_embed.permute(0, 2, 1)).permute(0, 2, 1)
154
+ #
155
+ cond_tokens = self.cond_projection(cond_tokens)
156
+ cond_tokens = self.abs_pos_encoding(cond_tokens)
157
+
158
+ null_cond_embed = self.null_cond_embed.to(cond_tokens.dtype)
159
+ cond_tokens = torch.where(
160
+ keep_mask_embed, cond_tokens, null_cond_embed[:, : cond_tokens.shape[1], :]
161
+ )
162
+ mean_pooled_cond_tokens = cond_tokens.mean(dim=-2)
163
+ cond_hidden = self.non_attn_cond_projection(mean_pooled_cond_tokens)
164
+
165
+ # FiLM conditioning
166
+ null_cond_hidden = self.null_cond_hidden.to(cond_tokens.dtype)
167
+ cond_hidden = torch.where(keep_mask_hidden, cond_hidden, null_cond_hidden)
168
+ cond_tokens = self.norm_cond(cond_tokens)
169
+
170
+ output = self.seqTransDecoder(x, cond_tokens, cond_hidden, tgt_mask=tgt_mask)
171
+ output = self.final_layer(output)
172
+ return output
173
+
174
+ def generate(
175
+ self,
176
+ condition: th.Tensor,
177
+ sequence_length: int,
178
+ layers: int,
179
+ n_sequences: int = 1,
180
+ max_key_len: int = 8,
181
+ max_seq_len: int = 240,
182
+ top_p: float = 0.94,
183
+ ) -> torch.Tensor:
184
+ """
185
+ :param sequence_length: number of tokens to generate in autoregressive fashion
186
+ :param n_sequences: number of sequences to generate simultaneously
187
+ :param top_p: top-p (nucleus) threshold used when sampling from the output logits
188
+ :return: n_sequences x sequence_length LongTensor containing generated tokens
189
+ """
190
+ assert max_key_len == int(max_seq_len / 30), "currently only running for 1fps"
191
+ max_key_len *= layers
192
+ with th.no_grad():
193
+ input_tokens = (
194
+ th.zeros(n_sequences, 1, dtype=th.int64).to(condition.device)
195
+ + self.tokens
196
+ )
197
+ for _ in range(sequence_length * layers):
198
+ curr_input_tokens = input_tokens
199
+ curr_condition = condition
200
+ logits = self.forward(curr_input_tokens, curr_condition)
201
+ logits = logits[:, -1, :] # only most recent time step is relevant
202
+ one_hot = th.nn.functional.softmax(logits, dim=-1)
203
+ sorted_probs, indices = torch.sort(one_hot, dim=-1, descending=True)
204
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
205
+ nucleus = cumulative_probs < top_p
206
+ nucleus = torch.cat(
207
+ [
208
+ nucleus.new_ones(nucleus.shape[:-1] + (1,)),
209
+ nucleus[..., :-1],
210
+ ],
211
+ dim=-1,
212
+ )
213
+ sorted_probs[~nucleus] = 0
214
+ sorted_probs /= sorted_probs.sum(-1, keepdim=True)
215
+ dist = Categorical(sorted_probs)
216
+ idx = dist.sample()
217
+ tokens = indices.gather(-1, idx.unsqueeze(-1))
218
+ input_tokens = th.cat([input_tokens, tokens], dim=-1)
219
+
220
+ # return generated tokens except for sequence start token
221
+ tokens = input_tokens[:, 1:].contiguous()
222
+ return tokens
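The sampling loop in generate() uses nucleus (top-p) filtering. A minimal, self-contained sketch of that step on dummy logits rather than the transformer's output:

import torch
from torch.distributions import Categorical

logits = torch.randn(2, 1024)                  # (n_sequences, vocab)
probs = torch.softmax(logits, dim=-1)
sorted_probs, indices = torch.sort(probs, dim=-1, descending=True)
cumulative = torch.cumsum(sorted_probs, dim=-1)
nucleus = cumulative < 0.94                    # top_p = 0.94
# shift right by one so the most likely token is always kept
nucleus = torch.cat([nucleus.new_ones(nucleus.shape[:-1] + (1,)), nucleus[..., :-1]], dim=-1)
sorted_probs[~nucleus] = 0
sorted_probs /= sorted_probs.sum(-1, keepdim=True)
idx = Categorical(sorted_probs).sample()       # position in the sorted order
tokens = indices.gather(-1, idx.unsqueeze(-1)) # map back to vocabulary ids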
model/modules/audio_encoder.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import fairseq
9
+ import torch as th
10
+ import torchaudio as ta
11
+
12
+ wav2vec_model_path = "./assets/wav2vec_large.pt"
13
+
14
+
15
+ def weights_init(m):
16
+ if isinstance(m, th.nn.Conv1d):
17
+ th.nn.init.xavier_uniform_(m.weight)
18
+ try:
19
+ th.nn.init.constant_(m.bias, 0.01)
20
+ except:
21
+ pass
22
+
23
+
24
+ class Wav2VecEncoder(th.nn.Module):
25
+ def __init__(self):
26
+ super().__init__()
27
+ self.resampler = ta.transforms.Resample(orig_freq=48000, new_freq=16000)
28
+ model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
29
+ [wav2vec_model_path]
30
+ )
31
+ self.wav2vec_model = model[0]
32
+
33
+ def forward(self, audio: th.Tensor):
34
+ """
35
+ :param audio: B x T x 1600
36
+ :return: B x T_wav2vec x 512
37
+ """
38
+ audio = audio.view(audio.shape[0], audio.shape[1] * 1600)
39
+ audio = self.resampler(audio)
40
+ audio = th.cat(
41
+ [th.zeros(audio.shape[0], 320, device=audio.device), audio], dim=-1
42
+ ) # zero padding on the left
43
+ x = self.wav2vec_model.feature_extractor(audio)
44
+ x = self.wav2vec_model.feature_aggregator(x)
45
+ x = x.permute(0, 2, 1).contiguous()
46
+ return x
47
+
48
+
49
+ class Wav2VecDownsampler(th.nn.Module):
50
+ def __init__(self):
51
+ super().__init__()
52
+ self.conv1 = th.nn.Conv1d(512, 512, kernel_size=3)
53
+ self.conv2 = th.nn.Conv1d(512, 512, kernel_size=3)
54
+ self.norm = th.nn.LayerNorm(512)
55
+
56
+ def forward(self, x: th.Tensor, target_length: int):
57
+ """
58
+ :param x: B x T x 512 tensor containing wav2vec features at 100Hz
59
+ :return: B x target_length x 512 tensor containing downsampled wav2vec features at 30Hz
60
+ """
61
+ x = x.permute(0, 2, 1).contiguous()
62
+ # first conv
63
+ x = th.nn.functional.pad(x, pad=(2, 0))
64
+ x = th.nn.functional.relu(self.conv1(x))
65
+ # first downsampling
66
+ x = th.nn.functional.interpolate(x, size=(x.shape[-1] + target_length) // 2)
67
+ # second conv
68
+ x = th.nn.functional.pad(x, pad=(2, 0))
69
+ x = self.conv2(x)
70
+ # second downsampling
71
+ x = th.nn.functional.interpolate(x, size=target_length)
72
+ # layer norm
73
+ x = x.permute(0, 2, 1).contiguous()
74
+ x = self.norm(x)
75
+ return x
76
+
77
+
78
+ class AudioTcn(th.nn.Module):
79
+ def __init__(
80
+ self,
81
+ encoding_dim: int = 128,
82
+ use_melspec: bool = True,
83
+ use_wav2vec: bool = True,
84
+ ):
85
+ """
86
+ :param encoding_dim: size of encoding
87
+ :param use_melspec: extract mel spectrogram features as input
88
+ :param use_wav2vec: extract wav2vec features as input
89
+ """
90
+ super().__init__()
91
+ self.encoding_dim = encoding_dim
92
+ self.use_melspec = use_melspec
93
+ self.use_wav2vec = use_wav2vec
94
+
95
+ if use_melspec:
96
+ # hop_length=400 -> two feature vectors per visual frame (downsampling to 24kHz -> 800 samples per frame)
97
+ self.melspec = th.nn.Sequential(
98
+ ta.transforms.Resample(orig_freq=48000, new_freq=24000),
99
+ ta.transforms.MelSpectrogram(
100
+ sample_rate=24000,
101
+ n_fft=1024,
102
+ win_length=800,
103
+ hop_length=400,
104
+ n_mels=80,
105
+ ),
106
+ )
107
+
108
+ if use_wav2vec:
109
+ model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
110
+ [wav2vec_model_path]
111
+ )
112
+ self.wav2vec_model = model[0]
113
+ self.wav2vec_model.eval()
114
+ self.wav2vec_postprocess = th.nn.Conv1d(512, 256, kernel_size=3)
115
+ self.wav2vec_postprocess.apply(lambda x: weights_init(x))
116
+
117
+ # temporal model
118
+ input_dim = 0 + (160 if use_melspec else 0) + (256 if use_wav2vec else 0)
119
+ self.layers = th.nn.ModuleList(
120
+ [
121
+ th.nn.Conv1d(
122
+ input_dim, max(256, encoding_dim), kernel_size=3, dilation=1
123
+ ), # 2 (+1)
124
+ th.nn.Conv1d(
125
+ max(256, encoding_dim), encoding_dim, kernel_size=3, dilation=2
126
+ ), # 4 (+1)
127
+ th.nn.Conv1d(
128
+ encoding_dim, encoding_dim, kernel_size=3, dilation=3
129
+ ), # 6 (+1)
130
+ th.nn.Conv1d(
131
+ encoding_dim, encoding_dim, kernel_size=3, dilation=1
132
+ ), # 2 (+1)
133
+ th.nn.Conv1d(
134
+ encoding_dim, encoding_dim, kernel_size=3, dilation=2
135
+ ), # 4 (+1)
136
+ th.nn.Conv1d(
137
+ encoding_dim, encoding_dim, kernel_size=3, dilation=3
138
+ ), # 6 (+1)
139
+ ]
140
+ )
141
+ self.layers.apply(lambda x: weights_init(x))
142
+ self.receptive_field = 25
143
+
144
+ self.final = th.nn.Conv1d(encoding_dim, encoding_dim, kernel_size=1)
145
+ self.final.apply(lambda x: weights_init(x))
146
+
147
+ def forward(self, audio):
148
+ """
149
+ :param audio: B x T x 1600 tensor containing audio samples for each frame
150
+ :return: B x T x encoding_dim tensor containing audio encodings for each frame
151
+ """
152
+ B, T = audio.shape[0], audio.shape[1]
153
+
154
+ # preprocess raw audio signal to extract feature vectors
155
+ audio = audio.view(B, T * 1600)
156
+ x_mel, x_w2v = th.zeros(B, 0, T).to(audio.device), th.zeros(B, 0, T).to(
157
+ audio.device
158
+ )
159
+ if self.use_melspec:
160
+ x_mel = self.melspec(audio)[:, :, 1:].contiguous()
161
+ x_mel = th.log(x_mel.clamp(min=1e-10, max=None))
162
+ x_mel = (
163
+ x_mel.permute(0, 2, 1)
164
+ .contiguous()
165
+ .view(x_mel.shape[0], T, 160)
166
+ .permute(0, 2, 1)
167
+ .contiguous()
168
+ )
169
+ if self.use_wav2vec:
170
+ with th.no_grad():
171
+ x_w2v = ta.functional.resample(audio, 48000, 16000)
172
+ x_w2v = self.wav2vec_model.feature_extractor(x_w2v)
173
+ x_w2v = self.wav2vec_model.feature_aggregator(x_w2v)
174
+ x_w2v = self.wav2vec_postprocess(th.nn.functional.pad(x_w2v, pad=[2, 0]))
175
+ x_w2v = th.nn.functional.interpolate(
176
+ x_w2v, size=T, align_corners=True, mode="linear"
177
+ )
178
+ x = th.cat([x_mel, x_w2v], dim=1)
179
+
180
+ # process signal with TCN
181
+ x = th.nn.functional.pad(x, pad=[self.receptive_field - 1, 0])
182
+ for layer_idx, layer in enumerate(self.layers):
183
+ y = th.nn.functional.leaky_relu(layer(x), negative_slope=0.2)
184
+ if self.training:
185
+ y = th.nn.functional.dropout(y, 0.2)
186
+ if x.shape[1] == y.shape[1]:
187
+ x = (x[:, :, -y.shape[-1] :] + y) / 2.0 # skip connection
188
+ else:
189
+ x = y
190
+
191
+ x = self.final(x)
192
+ x = x.permute(0, 2, 1).contiguous()
193
+
194
+ return x
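The AudioTcn above relies on a fixed receptive field of 25 frames: with kernel size 3 and dilations (1, 2, 3, 1, 2, 3) the receptive field is 1 + 2*(1+2+3+1+2+3) = 25, so left-padding by 24 keeps the output length equal to T while staying causal. A self-contained sketch of just that length arithmetic (skip connections and channel changes omitted):

import torch as th

dilations = [1, 2, 3, 1, 2, 3]
receptive_field = 1 + sum(2 * d for d in dilations)   # = 25
layers = [th.nn.Conv1d(16, 16, kernel_size=3, dilation=d) for d in dilations]

x = th.randn(1, 16, 120)                              # B x C x T
x = th.nn.functional.pad(x, pad=[receptive_field - 1, 0])
for layer in layers:
    x = layer(x)
print(x.shape)                                        # torch.Size([1, 16, 120])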
model/modules/rotary_embedding_torch.py ADDED
@@ -0,0 +1,139 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ from inspect import isfunction
9
+ from math import log, pi
10
+
11
+ import torch
12
+ from einops import rearrange, repeat
13
+ from torch import einsum, nn
14
+
15
+ # helper functions
16
+
17
+
18
+ def exists(val):
19
+ return val is not None
20
+
21
+
22
+ def broadcat(tensors, dim=-1):
23
+ num_tensors = len(tensors)
24
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
25
+ assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
26
+ shape_len = list(shape_lens)[0]
27
+
28
+ dim = (dim + shape_len) if dim < 0 else dim
29
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
30
+
31
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
32
+ assert all(
33
+ [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]
34
+ ), "invalid dimensions for broadcastable concatentation"
35
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
36
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
37
+ expanded_dims.insert(dim, (dim, dims[dim]))
38
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
39
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
40
+ return torch.cat(tensors, dim=dim)
41
+
42
+
43
+ # rotary embedding helper functions
44
+
45
+
46
+ def rotate_half(x):
47
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
48
+ x1, x2 = x.unbind(dim=-1)
49
+ x = torch.stack((-x2, x1), dim=-1)
50
+ return rearrange(x, "... d r -> ... (d r)")
51
+
52
+
53
+ def apply_rotary_emb(freqs, t, start_index=0):
54
+ freqs = freqs.to(t)
55
+ rot_dim = freqs.shape[-1]
56
+ end_index = start_index + rot_dim
57
+ assert (
58
+ rot_dim <= t.shape[-1]
59
+ ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
60
+ t_left, t, t_right = (
61
+ t[..., :start_index],
62
+ t[..., start_index:end_index],
63
+ t[..., end_index:],
64
+ )
65
+ t = (t * freqs.cos()) + (rotate_half(t) * freqs.sin())
66
+ return torch.cat((t_left, t, t_right), dim=-1)
67
+
68
+
69
+ # learned rotation helpers
70
+
71
+
72
+ def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
73
+ if exists(freq_ranges):
74
+ rotations = einsum("..., f -> ... f", rotations, freq_ranges)
75
+ rotations = rearrange(rotations, "... r f -> ... (r f)")
76
+
77
+ rotations = repeat(rotations, "... n -> ... (n r)", r=2)
78
+ return apply_rotary_emb(rotations, t, start_index=start_index)
79
+
80
+
81
+ # classes
82
+
83
+
84
+ class RotaryEmbedding(nn.Module):
85
+ def __init__(
86
+ self,
87
+ dim,
88
+ custom_freqs=None,
89
+ freqs_for="lang",
90
+ theta=10000,
91
+ max_freq=10,
92
+ num_freqs=1,
93
+ learned_freq=False,
94
+ ):
95
+ super().__init__()
96
+ if exists(custom_freqs):
97
+ freqs = custom_freqs
98
+ elif freqs_for == "lang":
99
+ freqs = 1.0 / (
100
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
101
+ )
102
+ elif freqs_for == "pixel":
103
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
104
+ elif freqs_for == "constant":
105
+ freqs = torch.ones(num_freqs).float()
106
+ else:
107
+ raise ValueError(f"unknown modality {freqs_for}")
108
+
109
+ self.cache = dict()
110
+
111
+ if learned_freq:
112
+ self.freqs = nn.Parameter(freqs)
113
+ else:
114
+ self.register_buffer("freqs", freqs)
115
+
116
+ def rotate_queries_or_keys(self, t, seq_dim=-2):
117
+ device = t.device
118
+ seq_len = t.shape[seq_dim]
119
+ freqs = self.forward(
120
+ lambda: torch.arange(seq_len, device=device), cache_key=seq_len
121
+ )
122
+ return apply_rotary_emb(freqs, t)
123
+
124
+ def forward(self, t, cache_key=None):
125
+ if exists(cache_key) and cache_key in self.cache:
126
+ return self.cache[cache_key]
127
+
128
+ if isfunction(t):
129
+ t = t()
130
+
131
+ freqs = self.freqs
132
+
133
+ freqs = torch.einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
134
+ freqs = repeat(freqs, "... n -> ... (n r)", r=2)
135
+
136
+ if exists(cache_key):
137
+ self.cache[cache_key] = freqs
138
+
139
+ return freqs
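A short usage sketch for the rotary embedding above, assuming the module path model.modules.rotary_embedding_torch from this commit; queries/keys of shape (batch, seq_len, dim) are rotated before attention, as the transformer layers in this repo do:

import torch
from model.modules.rotary_embedding_torch import RotaryEmbedding

rotary = RotaryEmbedding(dim=64)
q = torch.randn(2, 100, 64)                 # batch x seq_len x dim
q_rot = rotary.rotate_queries_or_keys(q)    # same shape, position-dependent rotation
print(q_rot.shape)                          # torch.Size([2, 100, 64])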
model/modules/transformer_modules.py ADDED
@@ -0,0 +1,702 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import math
9
+ from typing import Any, Callable, List, Optional, Union
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from einops import rearrange
14
+ from torch import Tensor
15
+ from torch.nn import functional as F
16
+
17
+
18
+ def generate_causal_mask(source_length, target_length, device="cpu"):
19
+ if source_length == target_length:
20
+ mask = (
21
+ torch.triu(torch.ones(target_length, source_length, device=device)) == 1
22
+ ).transpose(0, 1)
23
+ else:
24
+ mask = torch.zeros(target_length, source_length, device=device)
25
+ idx = torch.linspace(0, source_length, target_length + 1)[1:].round().long()
26
+ for i in range(target_length):
27
+ mask[i, 0 : idx[i]] = 1
28
+
29
+ return (
30
+ mask.float()
31
+ .masked_fill(mask == 0, float("-inf"))
32
+ .masked_fill(mask == 1, float(0.0))
33
+ )
34
+
35
+
36
+ class TransformerEncoderLayerRotary(nn.Module):
37
+ def __init__(
38
+ self,
39
+ d_model: int,
40
+ nhead: int,
41
+ dim_feedforward: int = 2048,
42
+ dropout: float = 0.1,
43
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
44
+ layer_norm_eps: float = 1e-5,
45
+ batch_first: bool = False,
46
+ norm_first: bool = True,
47
+ rotary=None,
48
+ ) -> None:
49
+ super().__init__()
50
+ self.self_attn = nn.MultiheadAttention(
51
+ d_model, nhead, dropout=dropout, batch_first=batch_first
52
+ )
53
+ # Implementation of Feedforward model
54
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
55
+ self.dropout = nn.Dropout(dropout)
56
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
57
+
58
+ self.norm_first = norm_first
59
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
60
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
61
+ self.dropout1 = nn.Dropout(dropout)
62
+ self.dropout2 = nn.Dropout(dropout)
63
+ self.activation = activation
64
+
65
+ self.rotary = rotary
66
+ self.use_rotary = rotary is not None
67
+
68
+ def forward(
69
+ self,
70
+ src: Tensor,
71
+ src_mask: Optional[Tensor] = None,
72
+ src_key_padding_mask: Optional[Tensor] = None,
73
+ ) -> Tensor:
74
+ x = src
75
+ if self.norm_first:
76
+ x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
77
+ x = x + self._ff_block(self.norm2(x))
78
+ else:
79
+ x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask))
80
+ x = self.norm2(x + self._ff_block(x))
81
+
82
+ return x
83
+
84
+ # self-attention block
85
+ def _sa_block(
86
+ self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]
87
+ ) -> Tensor:
88
+ qk = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
89
+ x = self.self_attn(
90
+ qk,
91
+ qk,
92
+ x,
93
+ attn_mask=attn_mask,
94
+ key_padding_mask=key_padding_mask,
95
+ need_weights=False,
96
+ )[0]
97
+ return self.dropout1(x)
98
+
99
+ # feed forward block
100
+ def _ff_block(self, x: Tensor) -> Tensor:
101
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
102
+ return self.dropout2(x)
103
+
104
+
105
+ class DenseFiLM(nn.Module):
106
+ """Feature-wise linear modulation (FiLM) generator."""
107
+
108
+ def __init__(self, embed_channels):
109
+ super().__init__()
110
+ self.embed_channels = embed_channels
111
+ self.block = nn.Sequential(
112
+ nn.Mish(), nn.Linear(embed_channels, embed_channels * 2)
113
+ )
114
+
115
+ def forward(self, position):
116
+ pos_encoding = self.block(position)
117
+ pos_encoding = rearrange(pos_encoding, "b c -> b 1 c")
118
+ scale_shift = pos_encoding.chunk(2, dim=-1)
119
+ return scale_shift
120
+
121
+
122
+ def featurewise_affine(x, scale_shift):
123
+ scale, shift = scale_shift
124
+ return (scale + 1) * x + shift
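DenseFiLM and featurewise_affine together implement the time conditioning used by the decoder layers below. A minimal usage sketch with illustrative shapes:

import torch
from model.modules.transformer_modules import DenseFiLM, featurewise_affine

film = DenseFiLM(embed_channels=512)
t_embed = torch.randn(8, 512)            # one time embedding per batch element
x = torch.randn(8, 240, 512)             # B x T x d_model features
scale_shift = film(t_embed)              # pair of B x 1 x 512 tensors
x = featurewise_affine(x, scale_shift)   # (scale + 1) * x + shift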
125
+
126
+
127
+ class FiLMTransformerDecoderLayer(nn.Module):
128
+ def __init__(
129
+ self,
130
+ d_model: int,
131
+ nhead: int,
132
+ dim_feedforward=2048,
133
+ dropout=0.1,
134
+ activation=F.relu,
135
+ layer_norm_eps=1e-5,
136
+ batch_first=False,
137
+ norm_first=True,
138
+ rotary=None,
139
+ use_cm=False,
140
+ ):
141
+ super().__init__()
142
+ self.self_attn = nn.MultiheadAttention(
143
+ d_model, nhead, dropout=dropout, batch_first=batch_first
144
+ )
145
+ self.multihead_attn = nn.MultiheadAttention(
146
+ d_model, nhead, dropout=dropout, batch_first=batch_first
147
+ )
148
+ # Feedforward
149
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
150
+ self.dropout = nn.Dropout(dropout)
151
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
152
+
153
+ self.norm_first = norm_first
154
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
155
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
156
+ self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
157
+ self.dropout1 = nn.Dropout(dropout)
158
+ self.dropout2 = nn.Dropout(dropout)
159
+ self.dropout3 = nn.Dropout(dropout)
160
+ self.activation = activation
161
+
162
+ self.film1 = DenseFiLM(d_model)
163
+ self.film2 = DenseFiLM(d_model)
164
+ self.film3 = DenseFiLM(d_model)
165
+
166
+ if use_cm:
167
+ self.multihead_attn2 = nn.MultiheadAttention( # 2
168
+ d_model, nhead, dropout=dropout, batch_first=batch_first
169
+ )
170
+ self.norm2a = nn.LayerNorm(d_model, eps=layer_norm_eps) # 2
171
+ self.dropout2a = nn.Dropout(dropout) # 2
172
+ self.film2a = DenseFiLM(d_model) # 2
173
+
174
+ self.rotary = rotary
175
+ self.use_rotary = rotary is not None
176
+
177
+ # x, cond, t
178
+ def forward(
179
+ self,
180
+ tgt,
181
+ memory,
182
+ t,
183
+ tgt_mask=None,
184
+ memory_mask=None,
185
+ tgt_key_padding_mask=None,
186
+ memory_key_padding_mask=None,
187
+ memory2=None,
188
+ ):
189
+ x = tgt
190
+ if self.norm_first:
191
+ # self-attention -> film -> residual
192
+ x_1 = self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask)
193
+ x = x + featurewise_affine(x_1, self.film1(t))
194
+ # cross-attention -> film -> residual
195
+ x_2 = self._mha_block(
196
+ self.norm2(x),
197
+ memory,
198
+ memory_mask,
199
+ memory_key_padding_mask,
200
+ self.multihead_attn,
201
+ self.dropout2,
202
+ )
203
+ x = x + featurewise_affine(x_2, self.film2(t))
204
+ if memory2 is not None:
205
+ # cross-attention x2 -> film -> residual
206
+ x_2a = self._mha_block(
207
+ self.norm2a(x),
208
+ memory2,
209
+ memory_mask,
210
+ memory_key_padding_mask,
211
+ self.multihead_attn2,
212
+ self.dropout2a,
213
+ )
214
+ x = x + featurewise_affine(x_2a, self.film2a(t))
215
+ # feedforward -> film -> residual
216
+ x_3 = self._ff_block(self.norm3(x))
217
+ x = x + featurewise_affine(x_3, self.film3(t))
218
+ else:
219
+ x = self.norm1(
220
+ x
221
+ + featurewise_affine(
222
+ self._sa_block(x, tgt_mask, tgt_key_padding_mask), self.film1(t)
223
+ )
224
+ )
225
+ x = self.norm2(
226
+ x
227
+ + featurewise_affine(
228
+ self._mha_block(x, memory, memory_mask, memory_key_padding_mask),
229
+ self.film2(t),
230
+ )
231
+ )
232
+ x = self.norm3(x + featurewise_affine(self._ff_block(x), self.film3(t)))
233
+ return x
234
+
235
+ # self-attention block
236
+ # qkv
237
+ def _sa_block(self, x, attn_mask, key_padding_mask):
238
+ qk = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
239
+ x = self.self_attn(
240
+ qk,
241
+ qk,
242
+ x,
243
+ attn_mask=attn_mask,
244
+ key_padding_mask=key_padding_mask,
245
+ need_weights=False,
246
+ )[0]
247
+ return self.dropout1(x)
248
+
249
+ # multihead attention block
250
+ # qkv
251
+ def _mha_block(self, x, mem, attn_mask, key_padding_mask, mha, dropout):
252
+ q = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
253
+ k = self.rotary.rotate_queries_or_keys(mem) if self.use_rotary else mem
254
+ x = mha(
255
+ q,
256
+ k,
257
+ mem,
258
+ attn_mask=attn_mask,
259
+ key_padding_mask=key_padding_mask,
260
+ need_weights=False,
261
+ )[0]
262
+ return dropout(x)
263
+
264
+ # feed forward block
265
+ def _ff_block(self, x):
266
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
267
+ return self.dropout3(x)
268
+
269
+
270
+ class DecoderLayerStack(nn.Module):
271
+ def __init__(self, stack):
272
+ super().__init__()
273
+ self.stack = stack
274
+
275
+ def forward(self, x, cond, t, tgt_mask=None, memory2=None):
276
+ for layer in self.stack:
277
+ x = layer(x, cond, t, tgt_mask=tgt_mask, memory2=memory2)
278
+ return x
279
+
280
+
281
+ class PositionalEncoding(nn.Module):
282
+ def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 1024):
283
+ super().__init__()
284
+ pe = torch.zeros(max_len, d_model)
285
+ position = torch.arange(0, max_len).unsqueeze(1)
286
+ div_term = torch.exp(
287
+ torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
288
+ )
289
+ pe[:, 0::2] = torch.sin(position * div_term)
290
+ pe[:, 1::2] = torch.cos(position * div_term)
291
+
292
+ self.register_buffer("pe", pe)
293
+ self.dropout = nn.Dropout(p=dropout)
294
+
295
+ def forward(self, x: torch.Tensor):
296
+ """
297
+ :param x: B x T x d_model tensor
298
+ :return: B x T x d_model tensor
299
+ """
300
+ x = x + self.pe[None, : x.shape[1], :]
301
+ x = self.dropout(x)
302
+ return x
303
+
304
+
305
+ class TimestepEncoding(nn.Module):
306
+ def __init__(self, embedding_dim: int):
307
+ super().__init__()
308
+
309
+ # Fourier embedding
310
+ half_dim = embedding_dim // 2
311
+ emb = math.log(10000) / (half_dim - 1)
312
+ emb = torch.exp(torch.arange(half_dim) * -emb)
313
+ self.register_buffer("emb", emb)
314
+
315
+ # encoding
316
+ self.encoding = nn.Sequential(
317
+ nn.Linear(embedding_dim, 4 * embedding_dim),
318
+ nn.Mish(),
319
+ nn.Linear(4 * embedding_dim, embedding_dim),
320
+ )
321
+
322
+ def forward(self, t: torch.Tensor):
323
+ """
324
+ :param t: B-dimensional tensor containing timesteps in range [0, 1]
325
+ :return: B x embedding_dim tensor containing timestep encodings
326
+ """
327
+ x = t[:, None] * self.emb[None, :]
328
+ x = torch.cat([torch.sin(x), torch.cos(x)], dim=-1)
329
+ x = self.encoding(x)
330
+ return x
331
+
332
+
333
+ class FiLM(nn.Module):
334
+ def __init__(self, dim: int):
335
+ super().__init__()
336
+ self.dim = dim
337
+ self.film = nn.Sequential(nn.Mish(), nn.Linear(dim, dim * 2))
338
+
339
+ def forward(self, x: torch.Tensor, cond: torch.Tensor):
340
+ """
341
+ :param x: ... x dim tensor
342
+ :param cond: ... x dim tensor
343
+ :return: ... x dim tensor as scale(cond) * x + bias(cond)
344
+ """
345
+ cond = self.film(cond)
346
+ scale, bias = torch.chunk(cond, chunks=2, dim=-1)
347
+ x = (scale + 1) * x + bias
348
+ return x
349
+
350
+
351
+ class FeedforwardBlock(nn.Module):
352
+ def __init__(self, d_model: int, d_feedforward: int = 1024, dropout: float = 0.1):
353
+ super().__init__()
354
+ self.ff = nn.Sequential(
355
+ nn.Linear(d_model, d_feedforward),
356
+ nn.ReLU(),
357
+ nn.Dropout(p=dropout),
358
+ nn.Linear(d_feedforward, d_model),
359
+ nn.Dropout(p=dropout),
360
+ )
361
+
362
+ def forward(self, x: torch.Tensor):
363
+ """
364
+ :param x: ... x d_model tensor
365
+ :return: ... x d_model tensor
366
+ """
367
+ return self.ff(x)
368
+
369
+
370
+ class SelfAttention(nn.Module):
371
+ def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
372
+ super().__init__()
373
+ self.self_attn = nn.MultiheadAttention(
374
+ d_model, num_heads, dropout=dropout, batch_first=True
375
+ )
376
+ self.dropout = nn.Dropout(p=dropout)
377
+
378
+ def forward(
379
+ self,
380
+ x: torch.Tensor,
381
+ attn_mask: torch.Tensor = None,
382
+ key_padding_mask: torch.Tensor = None,
383
+ ):
384
+ """
385
+ :param x: B x T x d_model input tensor
386
+ :param attn_mask: B * num_heads x L x S mask with L=target sequence length, S=source sequence length
387
+ for a float mask: values will be added to attention weight
388
+ for a binary mask: True indicates that the element is not allowed to attend
389
+ :param key_padding_mask: B x S mask
390
+ for a float mask: values will be added directly to the corresponding key values
391
+ for a binary mask: True indicates that the corresponding key value will be ignored
392
+ :return: B x T x d_model output tensor
393
+ """
394
+ x = self.self_attn(
395
+ x,
396
+ x,
397
+ x,
398
+ attn_mask=attn_mask,
399
+ key_padding_mask=key_padding_mask,
400
+ need_weights=False,
401
+ )[0]
402
+ x = self.dropout(x)
403
+ return x
404
+
405
+
406
+ class CrossAttention(nn.Module):
407
+ def __init__(self, d_model: int, d_cond: int, num_heads: int, dropout: float = 0.1):
408
+ super().__init__()
409
+ self.cross_attn = nn.MultiheadAttention(
410
+ d_model,
411
+ num_heads,
412
+ dropout=dropout,
413
+ batch_first=True,
414
+ kdim=d_cond,
415
+ vdim=d_cond,
416
+ )
417
+ self.dropout = nn.Dropout(p=dropout)
418
+
419
+ def forward(
420
+ self,
421
+ x: torch.Tensor,
422
+ cond: torch.Tensor,
423
+ attn_mask: torch.Tensor = None,
424
+ key_padding_mask: torch.Tensor = None,
425
+ ):
426
+ """
427
+ :param x: B x T_target x d_model input tensor
428
+ :param cond: B x T_cond x d_cond condition tensor
429
+ :param attn_mask: B * num_heads x L x S mask with L=target sequence length, S=source sequence length
430
+ for a float mask: values will be added to attention weight
431
+ for a binary mask: True indicates that the element is not allowed to attend
432
+ :param key_padding_mask: B x S mask
433
+ for a float mask: values will be added directly to the corresponding key values
434
+ for a binary mask: True indicates that the corresponding key value will be ignored
435
+ :return: B x T x d_model output tensor
436
+ """
437
+ x = self.cross_attn(
438
+ x,
439
+ cond,
440
+ cond,
441
+ attn_mask=attn_mask,
442
+ key_padding_mask=key_padding_mask,
443
+ need_weights=False,
444
+ )[0]
445
+ x = self.dropout(x)
446
+ return x
447
+
448
+
449
+ class TransformerEncoderLayer(nn.Module):
450
+ def __init__(
451
+ self,
452
+ d_model: int,
453
+ num_heads: int,
454
+ d_feedforward: int = 1024,
455
+ dropout: float = 0.1,
456
+ ):
457
+ super().__init__()
458
+ self.norm1 = nn.LayerNorm(d_model)
459
+ self.self_attn = SelfAttention(d_model, num_heads, dropout)
460
+ self.norm2 = nn.LayerNorm(d_model)
461
+ self.feedforward = FeedforwardBlock(d_model, d_feedforward, dropout)
462
+
463
+ def forward(
464
+ self,
465
+ x: torch.Tensor,
466
+ mask: torch.Tensor = None,
467
+ key_padding_mask: torch.Tensor = None,
468
+ ):
469
+ x = x + self.self_attn(self.norm1(x), mask, key_padding_mask)
470
+ x = x + self.feedforward(self.norm2(x))
471
+ return x
472
+
473
+
474
+ class TransformerDecoderLayer(nn.Module):
475
+ def __init__(
476
+ self,
477
+ d_model: int,
478
+ d_cond: int,
479
+ num_heads: int,
480
+ d_feedforward: int = 1024,
481
+ dropout: float = 0.1,
482
+ ):
483
+ super().__init__()
484
+ self.norm1 = nn.LayerNorm(d_model)
485
+ self.self_attn = SelfAttention(d_model, num_heads, dropout)
486
+ self.norm2 = nn.LayerNorm(d_model)
487
+ self.cross_attn = CrossAttention(d_model, d_cond, num_heads, dropout)
488
+ self.norm3 = nn.LayerNorm(d_model)
489
+ self.feedforward = FeedforwardBlock(d_model, d_feedforward, dropout)
490
+
491
+ def forward(
492
+ self,
493
+ x: torch.Tensor,
494
+ cross_cond: torch.Tensor,
495
+ target_mask: torch.Tensor = None,
496
+ target_key_padding_mask: torch.Tensor = None,
497
+ cross_cond_mask: torch.Tensor = None,
498
+ cross_cond_key_padding_mask: torch.Tensor = None,
499
+ ):
500
+ """
501
+ :param x: B x T x d_model tensor
502
+ :param cross_cond: B x T x d_cond tensor containing the conditioning input to cross attention layers
503
+ :return: B x T x d_model tensor
504
+ """
505
+ x = x + self.self_attn(self.norm1(x), target_mask, target_key_padding_mask)
506
+ x = x + self.cross_attn(
507
+ self.norm2(x), cross_cond, cross_cond_mask, cross_cond_key_padding_mask
508
+ )
509
+ x = x + self.feedforward(self.norm3(x))
510
+ return x
511
+
512
+
513
+ class FilmTransformerDecoderLayer(nn.Module):
514
+ def __init__(
515
+ self,
516
+ d_model: int,
517
+ d_cond: int,
518
+ num_heads: int,
519
+ d_feedforward: int = 1024,
520
+ dropout: float = 0.1,
521
+ ):
522
+ super().__init__()
523
+ self.norm1 = nn.LayerNorm(d_model)
524
+ self.self_attn = SelfAttention(d_model, num_heads, dropout)
525
+ self.film1 = FiLM(d_model)
526
+ self.norm2 = nn.LayerNorm(d_model)
527
+ self.cross_attn = CrossAttention(d_model, d_cond, num_heads, dropout)
528
+ self.film2 = FiLM(d_model)
529
+ self.norm3 = nn.LayerNorm(d_model)
530
+ self.feedforward = FeedforwardBlock(d_model, d_feedforward, dropout)
531
+ self.film3 = FiLM(d_model)
532
+
533
+ def forward(
534
+ self,
535
+ x: torch.Tensor,
536
+ cross_cond: torch.Tensor,
537
+ film_cond: torch.Tensor,
538
+ target_mask: torch.Tensor = None,
539
+ target_key_padding_mask: torch.Tensor = None,
540
+ cross_cond_mask: torch.Tensor = None,
541
+ cross_cond_key_padding_mask: torch.Tensor = None,
542
+ ):
543
+ """
544
+ :param x: B x T x d_model tensor
545
+ :param cross_cond: B x T x d_cond tensor containing the conditioning input to cross attention layers
546
+ :param film_cond: B x [1 or T] x film_cond tensor containing the conditioning input to FiLM layers
547
+ :return: B x T x d_model tensor
548
+ """
549
+ x1 = self.self_attn(self.norm1(x), target_mask, target_key_padding_mask)
550
+ x = x + self.film1(x1, film_cond)
551
+ x2 = self.cross_attn(
552
+ self.norm2(x), cross_cond, cross_cond_mask, cross_cond_key_padding_mask
553
+ )
554
+ x = x + self.film2(x2, film_cond)
555
+ x3 = self.feedforward(self.norm3(x))
556
+ x = x + self.film3(x3, film_cond)
557
+ return x
558
+
559
+
560
+ class RegressionTransformer(nn.Module):
561
+ def __init__(
562
+ self,
563
+ transformer_encoder_layers: int = 2,
564
+ transformer_decoder_layers: int = 4,
565
+ d_model: int = 512,
566
+ d_cond: int = 512,
567
+ num_heads: int = 4,
568
+ d_feedforward: int = 1024,
569
+ dropout: float = 0.1,
570
+ causal: bool = False,
571
+ ):
572
+ super().__init__()
573
+ self.causal = causal
574
+
575
+ self.cond_positional_encoding = PositionalEncoding(d_cond, dropout)
576
+ self.target_positional_encoding = PositionalEncoding(d_model, dropout)
577
+
578
+ self.transformer_encoder = nn.ModuleList(
579
+ [
580
+ TransformerEncoderLayer(d_cond, num_heads, d_feedforward, dropout)
581
+ for _ in range(transformer_encoder_layers)
582
+ ]
583
+ )
584
+
585
+ self.transformer_decoder = nn.ModuleList(
586
+ [
587
+ TransformerDecoderLayer(
588
+ d_model, d_cond, num_heads, d_feedforward, dropout
589
+ )
590
+ for _ in range(transformer_decoder_layers)
591
+ ]
592
+ )
593
+
594
+ def forward(self, x: torch.Tensor, cond: torch.Tensor):
595
+ """
596
+ :param x: B x T x d_model input tensor
597
+ :param cond: B x T x d_cond conditional tensor
598
+ :return: B x T x d_model output tensor
599
+ """
600
+ x = self.target_positional_encoding(x)
601
+ cond = self.cond_positional_encoding(cond)
602
+
603
+ if self.causal:
604
+ encoder_mask = generate_causal_mask(
605
+ cond.shape[1], cond.shape[1], device=cond.device
606
+ )
607
+ decoder_self_attn_mask = generate_causal_mask(
608
+ x.shape[1], x.shape[1], device=x.device
609
+ )
610
+ decoder_cross_attn_mask = generate_causal_mask(
611
+ cond.shape[1], x.shape[1], device=x.device
612
+ )
613
+ else:
614
+ encoder_mask = None
615
+ decoder_self_attn_mask = None
616
+ decoder_cross_attn_mask = None
617
+
618
+ for encoder_layer in self.transformer_encoder:
619
+ cond = encoder_layer(cond, mask=encoder_mask)
620
+ for decoder_layer in self.transformer_decoder:
621
+ x = decoder_layer(
622
+ x,
623
+ cond,
624
+ target_mask=decoder_self_attn_mask,
625
+ cross_cond_mask=decoder_cross_attn_mask,
626
+ )
627
+ return x
628
+
629
+
630
+ class DiffusionTransformer(nn.Module):
631
+ def __init__(
632
+ self,
633
+ transformer_encoder_layers: int = 2,
634
+ transformer_decoder_layers: int = 4,
635
+ d_model: int = 512,
636
+ d_cond: int = 512,
637
+ num_heads: int = 4,
638
+ d_feedforward: int = 1024,
639
+ dropout: float = 0.1,
640
+ causal: bool = False,
641
+ ):
642
+ super().__init__()
643
+ self.causal = causal
644
+
645
+ self.timestep_encoder = TimestepEncoding(d_model)
646
+ self.cond_positional_encoding = PositionalEncoding(d_cond, dropout)
647
+ self.target_positional_encoding = PositionalEncoding(d_model, dropout)
648
+
649
+ self.transformer_encoder = nn.ModuleList(
650
+ [
651
+ TransformerEncoderLayer(d_cond, num_heads, d_feedforward, dropout)
652
+ for _ in range(transformer_encoder_layers)
653
+ ]
654
+ )
655
+
656
+ self.transformer_decoder = nn.ModuleList(
657
+ [
658
+ FilmTransformerDecoderLayer(
659
+ d_model, d_cond, num_heads, d_feedforward, dropout
660
+ )
661
+ for _ in range(transformer_decoder_layers)
662
+ ]
663
+ )
664
+
665
+ def forward(self, x: torch.Tensor, cond: torch.Tensor, t: torch.Tensor):
666
+ """
667
+ :param x: B x T x d_model input tensor
668
+ :param cond: B x T x d_cond conditional tensor
669
+ :param t: B-dimensional tensor containing diffusion timesteps in range [0, 1]
670
+ :return: B x T x d_model output tensor
671
+ """
672
+ t = self.timestep_encoder(t).unsqueeze(1) # B x 1 x d_model
673
+ x = self.target_positional_encoding(x)
674
+ cond = self.cond_positional_encoding(cond)
675
+
676
+ if self.causal:
677
+ encoder_mask = generate_causal_mask(
678
+ cond.shape[1], cond.shape[1], device=cond.device
679
+ )
680
+ decoder_self_attn_mask = generate_causal_mask(
681
+ x.shape[1], x.shape[1], device=x.device
682
+ )
683
+ decoder_cross_attn_mask = generate_causal_mask(
684
+ cond.shape[1], x.shape[1], device=x.device
685
+ )
686
+ else:
687
+ encoder_mask = None
688
+ decoder_self_attn_mask = None
689
+ decoder_cross_attn_mask = None
690
+
691
+ for encoder_layer in self.transformer_encoder:
692
+ cond = encoder_layer(cond, mask=encoder_mask)
693
+ for decoder_layer in self.transformer_decoder:
694
+ x = decoder_layer(
695
+ x,
696
+ cond,
697
+ t,
698
+ target_mask=decoder_self_attn_mask,
699
+ cross_cond_mask=decoder_cross_attn_mask,
700
+ )
701
+
702
+ return x
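For reference, a minimal shape check of the diffusion denoiser added above; this is a hypothetical sketch, not part of the commit, and it assumes the building blocks defined earlier in transformer_modules.py (PositionalEncoding, TimestepEncoding, FiLM) behave as their call sites suggest and that the repository root is on PYTHONPATH:

```python
# Hypothetical smoke test: DiffusionTransformer maps (x, cond, t) back to x's shape.
import torch
from model.modules.transformer_modules import DiffusionTransformer  # path as added in this commit

net = DiffusionTransformer(d_model=512, d_cond=512, num_heads=4)  # causal=False by default
x = torch.randn(2, 30, 512)     # B x T x d_model noisy motion features
cond = torch.randn(2, 30, 512)  # B x T x d_cond conditioning features
t = torch.rand(2)               # per-sample diffusion timesteps in [0, 1]
out = net(x, cond, t)
assert out.shape == x.shape     # B x T x d_model, per the forward() docstring
```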
model/utils.py ADDED
@@ -0,0 +1,130 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import math
9
+
10
+ import fairseq
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torchaudio.transforms as T
15
+ from torch import nn
16
+
17
+
18
+ def setup_lip_regressor() -> ("Audio2LipRegressionTransformer", T.Resample):
19
+ cp_path = "./assets/vq-wav2vec.pt"
20
+ audio_model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path])
21
+ audio_model = audio_model[0]
22
+ for param in audio_model.parameters():
23
+ param.requires_grad = False
24
+ audio_model.eval()
25
+ audio_resampler = T.Resample(48000, 16000)
26
+ return audio_model, audio_resampler
27
+
28
+
29
+ def init_weight(m):
30
+ if (
31
+ isinstance(m, nn.Conv1d)
32
+ or isinstance(m, nn.Linear)
33
+ or isinstance(m, nn.ConvTranspose1d)
34
+ ):
35
+ nn.init.xavier_normal_(m.weight)
36
+ # m.bias.data.fill_(0.01)
37
+ if m.bias is not None:
38
+ nn.init.constant_(m.bias, 0)
39
+
40
+
41
+ # absolute positional embedding used for vanilla transformer sequential data
42
+ class PositionalEncoding(nn.Module):
43
+ def __init__(self, d_model, dropout=0.1, max_len=800, batch_first=False):
44
+ super().__init__()
45
+ self.batch_first = batch_first
46
+
47
+ self.dropout = nn.Dropout(p=dropout)
48
+
49
+ pe = torch.zeros(max_len, d_model)
50
+ position = torch.arange(0, max_len).unsqueeze(1)
51
+ div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
52
+ pe[:, 0::2] = torch.sin(position * div_term)
53
+ pe[:, 1::2] = torch.cos(position * div_term)
54
+ pe = pe.unsqueeze(0).transpose(0, 1)
55
+
56
+ self.register_buffer("pe", pe)
57
+
58
+ def forward(self, x):
59
+ if self.batch_first:
60
+ x = x + self.pe.permute(1, 0, 2)[:, : x.shape[1], :]
61
+ else:
62
+ x = x + self.pe[: x.shape[0], :]
63
+ return self.dropout(x)
64
+
65
+
66
+ # very similar positional embedding used for diffusion timesteps
67
+ class SinusoidalPosEmb(nn.Module):
68
+ def __init__(self, dim):
69
+ super().__init__()
70
+ self.dim = dim
71
+
72
+ def forward(self, x):
73
+ device = x.device
74
+ half_dim = self.dim // 2
75
+ emb = math.log(10000) / (half_dim - 1)
76
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
77
+ emb = x[:, None] * emb[None, :]
78
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
79
+ return emb
80
+
81
+
82
+ # dropout mask
83
+ def prob_mask_like(shape, prob, device):
84
+ if prob == 1:
85
+ return torch.ones(shape, device=device, dtype=torch.bool)
86
+ elif prob == 0:
87
+ return torch.zeros(shape, device=device, dtype=torch.bool)
88
+ else:
89
+ return torch.zeros(shape, device=device).float().uniform_(0, 1) < prob
90
+
91
+
92
+ def extract(a, t, x_shape):
93
+ b, *_ = t.shape
94
+ out = a.gather(-1, t)
95
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
96
+
97
+
98
+ def make_beta_schedule(
99
+ schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
100
+ ):
101
+ if schedule == "linear":
102
+ betas = (
103
+ torch.linspace(
104
+ linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64
105
+ )
106
+ ** 2
107
+ )
108
+
109
+ elif schedule == "cosine":
110
+ timesteps = (
111
+ torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
112
+ )
113
+ alphas = timesteps / (1 + cosine_s) * np.pi / 2
114
+ alphas = torch.cos(alphas).pow(2)
115
+ alphas = alphas / alphas[0]
116
+ betas = 1 - alphas[1:] / alphas[:-1]
117
+ betas = np.clip(betas, a_min=0, a_max=0.999)
118
+
119
+ elif schedule == "sqrt_linear":
120
+ betas = torch.linspace(
121
+ linear_start, linear_end, n_timestep, dtype=torch.float64
122
+ )
123
+ elif schedule == "sqrt":
124
+ betas = (
125
+ torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
126
+ ** 0.5
127
+ )
128
+ else:
129
+ raise ValueError(f"schedule '{schedule}' unknown.")
130
+ return betas.numpy()
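As a quick illustration of how the diffusion helpers above fit together, here is a hypothetical sketch (assuming model.utils is importable; the cumulative-product bookkeeping shown is the standard DDPM recipe, not code from this commit):

```python
# Hypothetical sketch: build a beta schedule and gather per-timestep coefficients.
import torch
from model.utils import extract, make_beta_schedule  # path as added in this commit

betas = torch.from_numpy(make_beta_schedule("linear", n_timestep=1000))
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)       # standard DDPM cumulative product
t = torch.tensor([0, 499, 999])                          # one timestep index per batch element
coef = extract(alphas_cumprod, t, x_shape=(3, 30, 512))  # -> (3, 1, 1), broadcastable over x
print(betas.shape, coef.shape)
```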
model/vqvae.py ADDED
@@ -0,0 +1,550 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import json
9
+ import os
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from einops import rearrange, repeat
15
+ from utils.misc import broadcast_tensors
16
+
17
+
18
+ def setup_tokenizer(resume_pth: str) -> "TemporalVertexCodec":
19
+ args_path = os.path.dirname(resume_pth)
20
+ with open(os.path.join(args_path, "args.json")) as f:
21
+ trans_args = json.load(f)
22
+ tokenizer = TemporalVertexCodec(
23
+ n_vertices=trans_args["nb_joints"],
24
+ latent_dim=trans_args["output_emb_width"],
25
+ categories=trans_args["code_dim"],
26
+ residual_depth=trans_args["depth"],
27
+ )
28
+ print("loading checkpoint from {}".format(resume_pth))
29
+ ckpt = torch.load(resume_pth, map_location="cpu")
30
+ tokenizer.load_state_dict(ckpt["net"], strict=True)
31
+ for p in tokenizer.parameters():
32
+ p.requires_grad = False
33
+ tokenizer.cuda()
34
+ return tokenizer
35
+
36
+
37
+ def default(val, d):
38
+ return val if val is not None else d
39
+
40
+
41
+ def ema_inplace(moving_avg, new, decay: float):
42
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
43
+
44
+
45
+ def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
46
+ return (x + epsilon) / (x.sum() + n_categories * epsilon)
47
+
48
+
49
+ def uniform_init(*shape: int):
50
+ t = torch.empty(shape)
51
+ nn.init.kaiming_uniform_(t)
52
+ return t
53
+
54
+
55
+ def sum_flat(tensor):
56
+ """
57
+ Take the sum over all non-batch dimensions.
58
+ """
59
+ return tensor.sum(dim=list(range(1, len(tensor.shape))))
60
+
61
+
62
+ def sample_vectors(samples, num: int):
63
+ num_samples, device = samples.shape[0], samples.device
64
+
65
+ if num_samples >= num:
66
+ indices = torch.randperm(num_samples, device=device)[:num]
67
+ else:
68
+ indices = torch.randint(0, num_samples, (num,), device=device)
69
+
70
+ return samples[indices]
71
+
72
+
73
+ def kmeans(samples, num_clusters: int, num_iters: int = 10):
74
+ dim, dtype = samples.shape[-1], samples.dtype
75
+
76
+ means = sample_vectors(samples, num_clusters)
77
+
78
+ for _ in range(num_iters):
79
+ diffs = rearrange(samples, "n d -> n () d") - rearrange(means, "c d -> () c d")
80
+ dists = -(diffs**2).sum(dim=-1)
81
+
82
+ buckets = dists.max(dim=-1).indices
83
+ bins = torch.bincount(buckets, minlength=num_clusters)
84
+ zero_mask = bins == 0
85
+ bins_min_clamped = bins.masked_fill(zero_mask, 1)
86
+
87
+ new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
88
+ new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
89
+ new_means = new_means / bins_min_clamped[..., None]
90
+
91
+ means = torch.where(zero_mask[..., None], means, new_means)
92
+
93
+ return means, bins
94
+
95
+
96
+ class EuclideanCodebook(nn.Module):
97
+ """Codebook with Euclidean distance.
98
+ Args:
99
+ dim (int): Dimension.
100
+ codebook_size (int): Codebook size.
101
+ kmeans_init (bool): Whether to use k-means to initialize the codebooks.
102
+ If set to true, run the k-means algorithm on the first training batch and use
103
+ the learned centroids as initialization.
104
+ kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
105
+ decay (float): Decay for exponential moving average over the codebooks.
106
+ epsilon (float): Epsilon value for numerical stability.
107
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
108
+ that have an exponential moving average cluster size less than the specified threshold with
109
+ a randomly selected vector from the current batch.
110
+ """
111
+
112
+ def __init__(
113
+ self,
114
+ dim: int,
115
+ codebook_size: int,
116
+ kmeans_init: int = False,
117
+ kmeans_iters: int = 10,
118
+ decay: float = 0.99,
119
+ epsilon: float = 1e-5,
120
+ threshold_ema_dead_code: int = 2,
121
+ ):
122
+ super().__init__()
123
+ self.decay = decay
124
+ init_fn = uniform_init if not kmeans_init else torch.zeros
125
+ embed = init_fn(codebook_size, dim)
126
+
127
+ self.codebook_size = codebook_size
128
+
129
+ self.kmeans_iters = kmeans_iters
130
+ self.epsilon = epsilon
131
+ self.threshold_ema_dead_code = threshold_ema_dead_code
132
+
133
+ self.register_buffer("inited", torch.Tensor([not kmeans_init]))
134
+ self.register_buffer("cluster_size", torch.zeros(codebook_size))
135
+ self.register_buffer("embed", embed)
136
+ self.register_buffer("embed_avg", embed.clone())
137
+
138
+ @torch.jit.ignore
139
+ def init_embed_(self, data):
140
+ if self.inited:
141
+ return
142
+
143
+ embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
144
+ self.embed.data.copy_(embed)
145
+ self.embed_avg.data.copy_(embed.clone())
146
+ self.cluster_size.data.copy_(cluster_size)
147
+ self.inited.data.copy_(torch.Tensor([True]))
148
+ # Make sure all buffers across workers are in sync after initialization
149
+ broadcast_tensors(self.buffers())
150
+
151
+ def replace_(self, samples, mask):
152
+ modified_codebook = torch.where(
153
+ mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
154
+ )
155
+ self.embed.data.copy_(modified_codebook)
156
+
157
+ def expire_codes_(self, batch_samples):
158
+ if self.threshold_ema_dead_code == 0:
159
+ return
160
+
161
+ expired_codes = self.cluster_size < self.threshold_ema_dead_code
162
+ if not torch.any(expired_codes):
163
+ return
164
+
165
+ batch_samples = rearrange(batch_samples, "... d -> (...) d")
166
+ self.replace_(batch_samples, mask=expired_codes)
167
+ broadcast_tensors(self.buffers())
168
+
169
+ def preprocess(self, x):
170
+ x = rearrange(x, "... d -> (...) d")
171
+ return x
172
+
173
+ def quantize(self, x):
174
+ embed = self.embed.t()
175
+ dist = -(
176
+ x.pow(2).sum(1, keepdim=True)
177
+ - 2 * x @ embed
178
+ + embed.pow(2).sum(0, keepdim=True)
179
+ )
180
+ embed_ind = dist.max(dim=-1).indices
181
+ return embed_ind
182
+
183
+ def postprocess_emb(self, embed_ind, shape):
184
+ return embed_ind.view(*shape[:-1])
185
+
186
+ def dequantize(self, embed_ind):
187
+ quantize = F.embedding(embed_ind, self.embed)
188
+ return quantize
189
+
190
+ def encode(self, x):
191
+ shape = x.shape
192
+ x = self.preprocess(x)
193
+ embed_ind = self.quantize(x)
194
+ embed_ind = self.postprocess_emb(embed_ind, shape)
195
+ return embed_ind
196
+
197
+ def decode(self, embed_ind):
198
+ quantize = self.dequantize(embed_ind)
199
+ return quantize
200
+
201
+ def forward(self, x):
202
+ shape, dtype = x.shape, x.dtype
203
+ x = self.preprocess(x)
204
+
205
+ self.init_embed_(x)
206
+
207
+ embed_ind = self.quantize(x)
208
+ embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
209
+ embed_ind = self.postprocess_emb(embed_ind, shape)
210
+ quantize = self.dequantize(embed_ind)
211
+
212
+ if self.training:
213
+ # We do the expiry of code at that point as buffers are in sync
214
+ # and all the workers will take the same decision.
215
+ self.expire_codes_(x)
216
+ ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
217
+ embed_sum = x.t() @ embed_onehot
218
+ ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
219
+ cluster_size = (
220
+ laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
221
+ * self.cluster_size.sum()
222
+ )
223
+ embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
224
+ self.embed.data.copy_(embed_normalized)
225
+
226
+ return quantize, embed_ind
227
+
228
+
229
+ class VectorQuantization(nn.Module):
230
+ """Vector quantization implementation.
231
+ Currently supports only euclidean distance.
232
+ Args:
233
+ dim (int): Dimension
234
+ codebook_size (int): Codebook size
235
+ codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
236
+ decay (float): Decay for exponential moving average over the codebooks.
237
+ epsilon (float): Epsilon value for numerical stability.
238
+ kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
239
+ kmeans_iters (int): Number of iterations used for kmeans initialization.
240
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
241
+ that have an exponential moving average cluster size less than the specified threshold with
242
+ a randomly selected vector from the current batch.
243
+ commitment_weight (float): Weight for commitment loss.
244
+ """
245
+
246
+ def __init__(
247
+ self,
248
+ dim: int,
249
+ codebook_size: int,
250
+ codebook_dim=None,
251
+ decay: float = 0.99,
252
+ epsilon: float = 1e-5,
253
+ kmeans_init: bool = True,
254
+ kmeans_iters: int = 50,
255
+ threshold_ema_dead_code: int = 2,
256
+ commitment_weight: float = 1.0,
257
+ ):
258
+ super().__init__()
259
+ _codebook_dim: int = default(codebook_dim, dim)
260
+
261
+ requires_projection = _codebook_dim != dim
262
+ self.project_in = (
263
+ nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
264
+ )
265
+ self.project_out = (
266
+ nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
267
+ )
268
+
269
+ self.epsilon = epsilon
270
+ self.commitment_weight = commitment_weight
271
+
272
+ self._codebook = EuclideanCodebook(
273
+ dim=_codebook_dim,
274
+ codebook_size=codebook_size,
275
+ kmeans_init=kmeans_init,
276
+ kmeans_iters=kmeans_iters,
277
+ decay=decay,
278
+ epsilon=epsilon,
279
+ threshold_ema_dead_code=threshold_ema_dead_code,
280
+ )
281
+ self.codebook_size = codebook_size
282
+ self.l2_loss = lambda a, b: (a - b) ** 2
283
+
284
+ @property
285
+ def codebook(self):
286
+ return self._codebook.embed
287
+
288
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
289
+ x = self.project_in(x)
290
+ embed_in = self._codebook.encode(x)
291
+ return embed_in
292
+
293
+ def decode(self, embed_ind: torch.Tensor) -> torch.Tensor:
294
+ quantize = self._codebook.decode(embed_ind)
295
+ quantize = self.project_out(quantize)
296
+ return quantize
297
+
298
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
299
+ """
300
+ :param x: B x dim input tensor
301
+ :return: quantize: B x dim tensor containing reconstruction after quantization
302
+ embed_ind: B-dimensional tensor containing embedding indices
303
+ loss: scalar tensor containing commitment loss
304
+ """
305
+ device = x.device
306
+ x = self.project_in(x)
307
+
308
+ quantize, embed_ind = self._codebook(x)
309
+
310
+ if self.training:
311
+ quantize = x + (quantize - x).detach()
312
+
313
+ loss = torch.tensor([0.0], device=device, requires_grad=self.training)
314
+
315
+ if self.training:
316
+ if self.commitment_weight > 0:
317
+ commit_loss = F.mse_loss(quantize.detach(), x)
318
+ loss = loss + commit_loss * self.commitment_weight
319
+
320
+ quantize = self.project_out(quantize)
321
+ return quantize, embed_ind, loss
322
+
323
+
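To make the straight-through behavior above concrete: in training mode the quantized output is rewritten as x + (quantize - x).detach(), so gradients reach the encoder input even though the codebook lookup itself is not differentiable, and the commitment loss is returned as the third value (in eval mode it stays at zero). A hypothetical single-process check, assuming utils.misc.broadcast_tensors is a no-op outside distributed runs:

```python
# Hypothetical sketch: gradients flow through the straight-through estimator.
import torch
from model.vqvae import VectorQuantization  # path as added in this commit

vq = VectorQuantization(dim=128, codebook_size=256, kmeans_init=False)
vq.train()
x = torch.randn(64, 128, requires_grad=True)
quantized, indices, commit_loss = vq(x)      # (64, 128), (64,), (1,)
(quantized.sum() + commit_loss).backward()
assert x.grad is not None                    # straight-through passes gradients to x
```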
324
+ class ResidualVectorQuantization(nn.Module):
325
+ """Residual vector quantization implementation.
326
+ Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
327
+ """
328
+
329
+ def __init__(self, *, num_quantizers: int, **kwargs):
330
+ super().__init__()
331
+ self.layers = nn.ModuleList(
332
+ [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
333
+ )
334
+
335
+ def forward(self, x, B, T, mask, n_q=None):
336
+ """
337
+ :param x: B x dim tensor
338
+ :return: quantized_out: B x dim tensor
339
+ out_indices: B x n_q LongTensor containing indices for each quantizer
340
+ out_losses: scalar tensor containing commitment loss
341
+ """
342
+ quantized_out = 0.0
343
+ residual = x
344
+
345
+ all_losses = []
346
+ all_indices = []
347
+
348
+ n_q = n_q or len(self.layers)
349
+
350
+ for layer in self.layers[:n_q]:
351
+ quantized, indices, loss = layer(residual)
352
+ residual = (
353
+ residual - quantized
354
+ ) # would need quantized.detach() to have commitment gradients beyond the first quantizer, but this seems to harm performance
355
+ quantized_out = quantized_out + quantized
356
+
357
+ all_indices.append(indices)
358
+ all_losses.append(loss)
359
+
360
+ out_indices = torch.stack(all_indices, dim=-1)
361
+ out_losses = torch.mean(torch.stack(all_losses))
362
+ return quantized_out, out_indices, out_losses
363
+
364
+ def encode(self, x: torch.Tensor, n_q=None) -> torch.Tensor:
365
+ """
366
+ :param x: B x dim input tensor
367
+ :return: B x n_q LongTensor containing indices for each quantizer
368
+ """
369
+ residual = x
370
+ all_indices = []
371
+ n_q = n_q or len(self.layers)
372
+ for layer in self.layers[:n_q]:
373
+ indices = layer.encode(residual)  # per-level codebook indices, shaped like residual without the feature dim (e.g. B x T)
374
+ # print(indices.shape, residual.shape, x.shape)
375
+ quantized = layer.decode(indices)
376
+ residual = residual - quantized
377
+ all_indices.append(indices)
378
+ out_indices = torch.stack(all_indices, dim=-1)
379
+ return out_indices
380
+
381
+ def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
382
+ """
383
+ :param q_indices: B x n_q LongTensor containing indices for each quantizer
384
+ :return: B x dim tensor containing reconstruction after quantization
385
+ """
386
+ quantized_out = torch.tensor(0.0, device=q_indices.device)
387
+ q_indices = q_indices.permute(1, 0).contiguous()
388
+ for i, indices in enumerate(q_indices):
389
+ layer = self.layers[i]
390
+ quantized = layer.decode(indices)
391
+ quantized_out = quantized_out + quantized
392
+ return quantized_out
393
+
394
+
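A small hypothetical round trip through the residual quantizer above (again assuming model.vqvae imports cleanly): encode stacks one codebook index per quantizer level, and decode sums the per-level codewords back into a latent vector.

```python
# Hypothetical sketch: residual VQ round trip over 4 quantizer levels.
import torch
from model.vqvae import ResidualVectorQuantization  # path as added in this commit

rvq = ResidualVectorQuantization(num_quantizers=4, dim=128, codebook_size=256, kmeans_init=False)
rvq.eval()
x = torch.randn(16 * 8, 128)   # (B*T) x dim latent vectors
codes = rvq.encode(x)          # -> (B*T) x 4 LongTensor, one index per level
recon = rvq.decode(codes)      # -> (B*T) x dim, sum of the per-level codewords
print(codes.shape, recon.shape)
```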
395
+ class TemporalVertexEncoder(nn.Module):
396
+ def __init__(
397
+ self,
398
+ n_vertices: int = 338,
399
+ latent_dim: int = 128,
400
+ ):
401
+ super().__init__()
402
+ self.input_dim = n_vertices
403
+ self.enc = nn.Sequential(
404
+ nn.Conv1d(self.input_dim, latent_dim, kernel_size=1),
405
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
406
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=1),
407
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
408
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=2),
409
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
410
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=3),
411
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
412
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=1),
413
+ )
414
+ self.receptive_field = 8
415
+
416
+ def forward(self, verts):
417
+ """
418
+ :param verts: B x T x n_vertices x 3 tensor containing batched sequences of vertices
419
+ :return: B x T x latent_dim tensor containing the latent representation
420
+ """
421
+ if verts.dim() == 4:
422
+ verts = verts.permute(0, 2, 3, 1).contiguous()
423
+ verts = verts.view(verts.shape[0], self.input_dim, verts.shape[3])
424
+ else:
425
+ verts = verts.permute(0, 2, 1)
426
+ verts = nn.functional.pad(verts, pad=[self.receptive_field - 1, 0])
427
+ x = self.enc(verts)
428
+ x = x.permute(0, 2, 1).contiguous()
429
+ return x
430
+
431
+
432
+ class TemporalVertexDecoder(nn.Module):
433
+ def __init__(
434
+ self,
435
+ n_vertices: int = 338,
436
+ latent_dim: int = 128,
437
+ ):
438
+ super().__init__()
439
+ self.output_dim = n_vertices
440
+ self.project_mean_shape = nn.Linear(self.output_dim, latent_dim)
441
+ self.dec = nn.Sequential(
442
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=1),
443
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
444
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=2),
445
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
446
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=3),
447
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
448
+ nn.Conv1d(latent_dim, latent_dim, kernel_size=2, dilation=1),
449
+ nn.LeakyReLU(negative_slope=0.2, inplace=True),
450
+ nn.Conv1d(latent_dim, self.output_dim, kernel_size=1),
451
+ )
452
+ self.receptive_field = 8
453
+
454
+ def forward(self, x):
455
+ """
456
+ :param x: B x T x latent_dim tensor containing batched sequences of vertex encodings
457
+ :return: B x T x n_vertices x 3 tensor containing batched sequences of vertices
458
+ """
459
+ x = x.permute(0, 2, 1).contiguous()
460
+ x = nn.functional.pad(x, pad=[self.receptive_field - 1, 0])
461
+ verts = self.dec(x)
462
+ verts = verts.permute(0, 2, 1)
463
+ return verts
464
+
465
+
466
+ class TemporalVertexCodec(nn.Module):
467
+ def __init__(
468
+ self,
469
+ n_vertices: int = 338,
470
+ latent_dim: int = 128,
471
+ categories: int = 128,
472
+ residual_depth: int = 4,
473
+ ):
474
+ super().__init__()
475
+ self.latent_dim = latent_dim
476
+ self.categories = categories
477
+ self.residual_depth = residual_depth
478
+ self.n_clusters = categories
479
+ self.encoder = TemporalVertexEncoder(
480
+ n_vertices=n_vertices, latent_dim=latent_dim
481
+ )
482
+ self.decoder = TemporalVertexDecoder(
483
+ n_vertices=n_vertices, latent_dim=latent_dim
484
+ )
485
+ self.quantizer = ResidualVectorQuantization(
486
+ dim=latent_dim,
487
+ codebook_size=categories,
488
+ num_quantizers=residual_depth,
489
+ decay=0.99,
490
+ kmeans_init=True,
491
+ kmeans_iters=10,
492
+ threshold_ema_dead_code=2,
493
+ )
494
+
495
+ def predict(self, verts):
496
+ """wrapper to provide compatibility with kmeans"""
497
+ return self.encode(verts)
498
+
499
+ def encode(self, verts):
500
+ """
501
+ :param verts: B x T x n_vertices x 3 tensor containing batched sequences of vertices
502
+ :return: B x T x residual_depth LongTensor containing quantized encodings
503
+ """
504
+ enc = self.encoder(verts)
505
+ q = self.quantizer.encode(enc)
506
+ return q
507
+
508
+ def decode(self, q):
509
+ """
510
+ :param q: B x T x residual_depth LongTensor containing quantized encodings
511
+ :return: B x T x n_vertices x 3 tensor containing decoded vertices
512
+ """
513
+ reformat = q.dim() > 2
514
+ if reformat:
515
+ B, T, _ = q.shape
516
+ q = q.reshape((-1, self.residual_depth))
517
+ enc = self.quantizer.decode(q)
518
+ if reformat:
519
+ enc = enc.reshape((B, T, -1))
520
+ verts = self.decoder(enc)
521
+ return verts
522
+
523
+ @torch.no_grad()
524
+ def compute_perplexity(self, code_idx):
525
+ # Calculate new centres
526
+ code_onehot = torch.zeros(
527
+ self.categories, code_idx.shape[0], device=code_idx.device
528
+ ) # categories, N * L
529
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
530
+
531
+ code_count = code_onehot.sum(dim=-1) # categories
532
+ prob = code_count / torch.sum(code_count)
533
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
534
+ return perplexity
535
+
536
+ def forward(self, verts, mask=None):
537
+ """
538
+ :param verts: B x T x n_vertices x 3 tensor containing mesh sequences
539
+ :return: verts: B x T x n_vertices x 3 tensor containing reconstructed mesh sequences
540
+ vq_loss: scalar tensor for vq commitment loss
541
+ """
542
+ B, T = verts.shape[0], verts.shape[1]
543
+ x = self.encoder(verts)
544
+ x, code_idx, vq_loss = self.quantizer(
545
+ x.view(B * T, self.latent_dim), B, T, mask
546
+ )
547
+ perplexity = self.compute_perplexity(code_idx[:, -1].view((-1)))
548
+ verts = self.decoder(x.view(B, T, self.latent_dim))
549
+ verts = verts.reshape((verts.shape[0], verts.shape[1], -1))
550
+ return verts, vq_loss, perplexity
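For orientation, a hypothetical end-to-end pass through the codec with its default sizes (untrained here, so only the shapes are meaningful; the real entry point is setup_tokenizer above, which also loads a trained checkpoint):

```python
# Hypothetical sketch: encode poses to discrete tokens and decode them back.
import torch
from model.vqvae import TemporalVertexCodec  # path as added in this commit

codec = TemporalVertexCodec(n_vertices=338, latent_dim=128, categories=128, residual_depth=4)
codec.eval()
poses = torch.randn(2, 40, 338)   # B x T x flattened pose features
tokens = codec.encode(poses)      # -> B x T x residual_depth LongTensor
recon = codec.decode(tokens)      # -> B x T x 338 reconstruction
print(tokens.shape, recon.shape)
```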
sample/generate.py ADDED
@@ -0,0 +1,316 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import os
9
+
10
+ from typing import Callable, Dict, Union
11
+
12
+ import numpy as np
13
+ import torch
14
+ from data_loaders.get_data import get_dataset_loader, load_local_data
15
+ from diffusion.respace import SpacedDiffusion
16
+ from model.cfg_sampler import ClassifierFreeSampleModel
17
+ from model.diffusion import FiLMTransformer
18
+
19
+ from torch.utils.data import DataLoader
20
+ from utils.diff_parser_utils import generate_args
21
+ from utils.misc import fixseed, prGreen
22
+ from utils.model_util import create_model_and_diffusion, get_person_num, load_model
23
+
24
+
25
+ def _construct_template_variables(unconstrained: bool) -> (str,):
26
+ row_file_template = "sample{:02d}.mp4"
27
+ all_file_template = "samples_{:02d}_to_{:02d}.mp4"
28
+ if unconstrained:
29
+ sample_file_template = "row{:02d}_col{:02d}.mp4"
30
+ sample_print_template = "[{} row #{:02d} column #{:02d} | -> {}]"
31
+ row_file_template = row_file_template.replace("sample", "row")
32
+ row_print_template = "[{} row #{:02d} | all columns | -> {}]"
33
+ all_file_template = all_file_template.replace("samples", "rows")
34
+ all_print_template = "[rows {:02d} to {:02d} | -> {}]"
35
+ else:
36
+ sample_file_template = "sample{:02d}_rep{:02d}.mp4"
37
+ sample_print_template = '["{}" ({:02d}) | Rep #{:02d} | -> {}]'
38
+ row_print_template = '[ "{}" ({:02d}) | all repetitions | -> {}]'
39
+ all_print_template = "[samples {:02d} to {:02d} | all repetitions | -> {}]"
40
+
41
+ return (
42
+ sample_print_template,
43
+ row_print_template,
44
+ all_print_template,
45
+ sample_file_template,
46
+ row_file_template,
47
+ all_file_template,
48
+ )
49
+
50
+
51
+ def _replace_keyframes(
52
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
53
+ model: Union[FiLMTransformer, ClassifierFreeSampleModel],
54
+ ) -> torch.Tensor:
55
+ B, T = (
56
+ model_kwargs["y"]["keyframes"].shape[0],
57
+ model_kwargs["y"]["keyframes"].shape[1],
58
+ )
59
+ with torch.no_grad():
60
+ tokens = model.transformer.generate(
61
+ model_kwargs["y"]["audio"],
62
+ T,
63
+ layers=model.tokenizer.residual_depth,
64
+ n_sequences=B,
65
+ )
66
+ tokens = tokens.reshape((B, -1, model.tokenizer.residual_depth))
67
+ pred = model.tokenizer.decode(tokens).detach().cpu()
68
+ assert (
69
+ model_kwargs["y"]["keyframes"].shape == pred.shape
70
+ ), f"{model_kwargs['y']['keyframes'].shape} vs {pred.shape}"
71
+ return pred
72
+
73
+
74
+ def _run_single_diffusion(
75
+ args,
76
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
77
+ diffusion: SpacedDiffusion,
78
+ model: Union[FiLMTransformer, ClassifierFreeSampleModel],
79
+ inv_transform: Callable,
80
+ gt: torch.Tensor,
81
+ ) -> (torch.Tensor,):
82
+ if args.data_format == "pose" and args.resume_trans is not None:
83
+ model_kwargs["y"]["keyframes"] = _replace_keyframes(model_kwargs, model)
84
+
85
+ sample_fn = diffusion.ddim_sample_loop
86
+ with torch.no_grad():
87
+ sample = sample_fn(
88
+ model,
89
+ (args.batch_size, model.nfeats, 1, args.curr_seq_length),
90
+ clip_denoised=False,
91
+ model_kwargs=model_kwargs,
92
+ init_image=None,
93
+ progress=True,
94
+ dump_steps=None,
95
+ noise=None,
96
+ const_noise=False,
97
+ )
98
+ sample = inv_transform(sample.cpu().permute(0, 2, 3, 1), args.data_format).permute(
99
+ 0, 3, 1, 2
100
+ )
101
+ curr_audio = inv_transform(model_kwargs["y"]["audio"].cpu().numpy(), "audio")
102
+ keyframes = inv_transform(model_kwargs["y"]["keyframes"], args.data_format)
103
+ gt_seq = inv_transform(gt.cpu().permute(0, 2, 3, 1), args.data_format).permute(
104
+ 0, 3, 1, 2
105
+ )
106
+
107
+ return sample, curr_audio, keyframes, gt_seq
108
+
109
+
110
+ def _generate_sequences(
111
+ args,
112
+ model_kwargs: Dict[str, Dict[str, torch.Tensor]],
113
+ diffusion: SpacedDiffusion,
114
+ model: Union[FiLMTransformer, ClassifierFreeSampleModel],
115
+ test_data: torch.Tensor,
116
+ gt: torch.Tensor,
117
+ ) -> Dict[str, np.ndarray]:
118
+ all_motions = []
119
+ all_lengths = []
120
+ all_audio = []
121
+ all_gt = []
122
+ all_keyframes = []
123
+
124
+ for rep_i in range(args.num_repetitions):
125
+ print(f"### Sampling [repetitions #{rep_i}]")
126
+ # add CFG scale to batch
127
+ if args.guidance_param != 1:
128
+ model_kwargs["y"]["scale"] = (
129
+ torch.ones(args.batch_size, device=args.device) * args.guidance_param
130
+ )
131
+ model_kwargs["y"] = {
132
+ key: val.to(args.device) if torch.is_tensor(val) else val
133
+ for key, val in model_kwargs["y"].items()
134
+ }
135
+ sample, curr_audio, keyframes, gt_seq = _run_single_diffusion(
136
+ args, model_kwargs, diffusion, model, test_data.dataset.inv_transform, gt
137
+ )
138
+ all_motions.append(sample.cpu().numpy())
139
+ all_audio.append(curr_audio)
140
+ all_keyframes.append(keyframes.cpu().numpy())
141
+ all_gt.append(gt_seq.cpu().numpy())
142
+ all_lengths.append(model_kwargs["y"]["lengths"].cpu().numpy())
143
+
144
+ print(f"created {len(all_motions) * args.batch_size} samples")
145
+
146
+ return {
147
+ "motions": np.concatenate(all_motions, axis=0),
148
+ "audio": np.concatenate(all_audio, axis=0),
149
+ "gt": np.concatenate(all_gt, axis=0),
150
+ "lengths": np.concatenate(all_lengths, axis=0),
151
+ "keyframes": np.concatenate(all_keyframes, axis=0),
152
+ }
153
+
154
+
155
+ def _render_pred(
156
+ args,
157
+ data_block: Dict[str, torch.Tensor],
158
+ sample_file_template: str,
159
+ audio_per_frame: int,
160
+ ) -> None:
161
+ from visualize.render_codes import BodyRenderer
162
+
163
+ face_codes = None
164
+ if args.face_codes is not None:
165
+ face_codes = np.load(args.face_codes, allow_pickle=True).item()
166
+ face_motions = face_codes["motions"]
167
+ face_gts = face_codes["gt"]
168
+ face_audio = face_codes["audio"]
169
+
170
+ config_base = f"./checkpoints/ca_body/data/{get_person_num(args.data_root)}"
171
+ body_renderer = BodyRenderer(
172
+ config_base=config_base,
173
+ render_rgb=True,
174
+ )
175
+
176
+ for sample_i in range(args.num_samples):
177
+ for rep_i in range(args.num_repetitions):
178
+ idx = rep_i * args.batch_size + sample_i
179
+ save_file = sample_file_template.format(sample_i, rep_i)
180
+ animation_save_path = os.path.join(args.output_dir, save_file)
181
+ # format data
182
+ length = data_block["lengths"][idx]
183
+ body_motion = (
184
+ data_block["motions"][idx].transpose(2, 0, 1)[:length].squeeze(-1)
185
+ )
186
+ face_motion = face_motions[idx].transpose(2, 0, 1)[:length].squeeze(-1)
187
+ assert np.array_equal(
188
+ data_block["audio"][idx], face_audio[idx]
189
+ ), "face audio is not the same"
190
+ audio = data_block["audio"][idx, : length * audio_per_frame, :].T
191
+ # set up render data block to pass into renderer
192
+ render_data_block = {
193
+ "audio": audio,
194
+ "body_motion": body_motion,
195
+ "face_motion": face_motion,
196
+ }
197
+ if args.render_gt:
198
+ gt_body = data_block["gt"][idx].transpose(2, 0, 1)[:length].squeeze(-1)
199
+ gt_face = face_gts[idx].transpose(2, 0, 1)[:length].squeeze(-1)
200
+ render_data_block["gt_body"] = gt_body
201
+ render_data_block["gt_face"] = gt_face
202
+ body_renderer.render_full_video(
203
+ render_data_block,
204
+ animation_save_path,
205
+ audio_sr=audio_per_frame * 30,
206
+ render_gt=args.render_gt,
207
+ )
208
+
209
+
210
+ def _reset_sample_args(args) -> None:
211
+ # set the sequence length to match the one specified by user
212
+ name = os.path.basename(os.path.dirname(args.model_path))
213
+ niter = os.path.basename(args.model_path).replace("model", "").replace(".pt", "")
214
+ args.curr_seq_length = (
215
+ args.curr_seq_length
216
+ if args.curr_seq_length is not None
217
+ else args.max_seq_length
218
+ )
219
+ # add the resume predictor model path
220
+ resume_trans_name = ""
221
+ if args.data_format == "pose" and args.resume_trans is not None:
222
+ resume_trans_parts = args.resume_trans.split("/")
223
+ resume_trans_name = f"{resume_trans_parts[1]}_{resume_trans_parts[-1]}"
224
+ # reformat the output directory
225
+ args.output_dir = os.path.join(
226
+ os.path.dirname(args.model_path),
227
+ "samples_{}_{}_seed{}_{}".format(name, niter, args.seed, resume_trans_name),
228
+ )
229
+ assert (
230
+ args.num_samples <= args.batch_size
231
+ ), f"Please either increase batch_size({args.batch_size}) or reduce num_samples({args.num_samples})"
232
+ # set the batch size to match the number of samples to generate
233
+ args.batch_size = args.num_samples
234
+
235
+
236
+ def _setup_dataset(args) -> DataLoader:
237
+ data_root = args.data_root
238
+ data_dict = load_local_data(
239
+ data_root,
240
+ audio_per_frame=1600,
241
+ flip_person=args.flip_person,
242
+ )
243
+ test_data = get_dataset_loader(
244
+ args=args,
245
+ data_dict=data_dict,
246
+ split="test",
247
+ chunk=True,
248
+ )
249
+ return test_data
250
+
251
+
252
+ def _setup_model(
253
+ args,
254
+ ) -> (Union[FiLMTransformer, ClassifierFreeSampleModel], SpacedDiffusion):
255
+ model, diffusion = create_model_and_diffusion(args, split_type="test")
256
+ print(f"Loading checkpoints from [{args.model_path}]...")
257
+ state_dict = torch.load(args.model_path, map_location="cpu")
258
+ load_model(model, state_dict)
259
+
260
+ if not args.unconstrained:
261
+ assert args.guidance_param != 1
262
+
263
+ if args.guidance_param != 1:
264
+ prGreen("[CFS] wrapping model in classifier free sample")
265
+ model = ClassifierFreeSampleModel(model)
266
+ model.to(args.device)
267
+ model.eval()
268
+ return model, diffusion
269
+
270
+
271
+ def main():
272
+ args = generate_args()
273
+ fixseed(args.seed)
274
+ _reset_sample_args(args)
275
+
276
+ print("Loading dataset...")
277
+ test_data = _setup_dataset(args)
278
+ iterator = iter(test_data)
279
+
280
+ print("Creating model and diffusion...")
281
+ model, diffusion = _setup_model(args)
282
+
283
+ if args.pose_codes is None:
284
+ # generate sequences
285
+ gt, model_kwargs = next(iterator)
286
+ data_block = _generate_sequences(
287
+ args, model_kwargs, diffusion, model, test_data, gt
288
+ )
289
+ os.makedirs(args.output_dir, exist_ok=True)
290
+ npy_path = os.path.join(args.output_dir, "results.npy")
291
+ print(f"saving results file to [{npy_path}]")
292
+ np.save(npy_path, data_block)
293
+ else:
294
+ # load the pre generated results
295
+ data_block = np.load(args.pose_codes, allow_pickle=True).item()
296
+
297
+ # plot function only if face_codes exist and we are on pose prediction
298
+ if args.plot:
299
+ assert args.face_codes is not None, "need body and faces"
300
+ assert (
301
+ args.data_format == "pose"
302
+ ), "currently only supporting plot on pose stuff"
303
+ print(f"saving visualizations to [{args.output_dir}]...")
304
+ _, _, _, sample_file_template, _, _ = _construct_template_variables(
305
+ args.unconstrained
306
+ )
307
+ _render_pred(
308
+ args,
309
+ data_block,
310
+ sample_file_template,
311
+ test_data.dataset.audio_per_frame,
312
+ )
313
+
314
+
315
+ if __name__ == "__main__":
316
+ main()
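Once generation finishes, results.npy holds the dictionary produced by _generate_sequences above; a short hypothetical snippet for inspecting it offline (key names taken directly from that function):

```python
# Hypothetical sketch: inspect a saved results.npy from sample/generate.py.
import numpy as np

data_block = np.load("results.npy", allow_pickle=True).item()
for key in ("motions", "audio", "gt", "lengths", "keyframes"):
    print(key, data_block[key].shape)
```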
scripts/download_alldatasets.sh ADDED
@@ -0,0 +1,6 @@
1
+ for i in "PXB184" "RLW104" "TXB805" "GQS883"
2
+ do
3
+ curl -L https://github.com/facebookresearch/audio2photoreal/releases/download/v1.0/${i}.zip -o ${i}.zip || { echo 'downloading dataset failed' ; exit 1; }
4
+ unzip ${i}.zip -d dataset/
5
+ rm ${i}.zip
6
+ done
scripts/download_allmodels.sh ADDED
@@ -0,0 +1,13 @@
1
+ for i in "PXB184" "RLW104" "TXB805" "GQS883"
2
+ do
3
+ # download motion models
4
+ wget http://audio2photoreal_models.berkeleyvision.org/${i}_models.tar || { echo 'downloading model failed' ; exit 1; }
5
+ tar xvf ${i}_models.tar
6
+ rm ${i}_models.tar
7
+
8
+ # download ca body rendering checkpoints and assets
9
+ mkdir -p checkpoints/ca_body/data/
10
+ wget https://github.com/facebookresearch/ca_body/releases/download/v0.0.1-alpha/${i}.tar.gz || { echo 'downloading ca body model failed' ; exit 1; }
11
+ tar xvf ${i}.tar.gz --directory checkpoints/ca_body/data/
12
+ rm ${i}.tar.gz
13
+ done
scripts/download_prereq.sh ADDED
@@ -0,0 +1,9 @@
1
+
2
+ # download the prerequisite asset models (lip regressor and wav2vec)
3
+ wget http://audio2photoreal_models.berkeleyvision.org/asset_models.tar
4
+ tar xvf asset_models.tar
5
+ rm asset_models.tar
6
+
7
+ # we obtained the wav2vec models via these links:
8
+ # wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt -P ./assets/
9
+ # wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/vq-wav2vec.pt -P ./assets/
scripts/installation.sh ADDED
@@ -0,0 +1,4 @@
1
+ # download the prerequisite asset models (lip regressor and wav2vec)
2
+ wget http://audio2photoreal_models.berkeleyvision.org/asset_models.tar
3
+ tar xvf asset_models.tar
4
+ rm asset_models.tar
scripts/requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ attrdict
2
+ blobfile
3
+ einops
4
+ fairseq
5
+ gradio
6
+ matplotlib
7
+ mediapy
8
+ numpy==1.23.0
9
+ opencv-python
10
+ packaging
11
+ scikit-learn
12
+ tensorboard
13
+ tensorboardX
14
+ torch==2.0.1
15
+ torchaudio==2.0.2
16
+ torchvision==0.15.2
17
+ tqdm
train/train_diffusion.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+ This source code is licensed under the license found in the
5
+ LICENSE file in the root directory of this source tree.
6
+ """
7
+
8
+ import json
9
+ import os
10
+
11
+ import torch
12
+ import torch.multiprocessing as mp
13
+
14
+ from data_loaders.get_data import get_dataset_loader, load_local_data
15
+ from torch.nn.parallel import DistributedDataParallel as DDP
16
+ from torch.utils.tensorboard import SummaryWriter
17
+ from train.train_platforms import ClearmlPlatform, NoPlatform, TensorboardPlatform
18
+ from train.training_loop import TrainLoop
19
+ from utils.diff_parser_utils import train_args
20
+ from utils.misc import cleanup, fixseed, setup_dist
21
+ from utils.model_util import create_model_and_diffusion
22
+
23
+
24
+ def main(rank: int, world_size: int):
25
+ args = train_args()
26
+ fixseed(args.seed)
27
+ train_platform_type = eval(args.train_platform_type)
28
+ train_platform = train_platform_type(args.save_dir)
29
+ train_platform.report_args(args, name="Args")
30
+ setup_dist(args.device)
31
+
32
+ if rank == 0:
33
+ if args.save_dir is None:
34
+ raise FileNotFoundError("save_dir was not specified.")
35
+ elif os.path.exists(args.save_dir) and not args.overwrite:
36
+ raise FileExistsError("save_dir [{}] already exists.".format(args.save_dir))
37
+ elif not os.path.exists(args.save_dir):
38
+ os.makedirs(args.save_dir)
39
+ args_path = os.path.join(args.save_dir, "args.json")
40
+ with open(args_path, "w") as fw:
41
+ json.dump(vars(args), fw, indent=4, sort_keys=True)
42
+
43
+ if not os.path.exists(args.data_root):
44
+ args.data_root = args.data_root.replace("/home/", "/derived/")
45
+
46
+ data_dict = load_local_data(args.data_root, audio_per_frame=1600)
47
+ print("creating data loader...")
48
+ data = get_dataset_loader(args=args, data_dict=data_dict)
49
+
50
+ print("creating logger...")
51
+ writer = SummaryWriter(args.save_dir)
52
+
53
+ print("creating model and diffusion...")
54
+ model, diffusion = create_model_and_diffusion(args, split_type="train")
55
+ model.to(rank)
56
+
57
+ if world_size > 1:
58
+ model = DDP(
59
+ model, device_ids=[rank], output_device=rank, find_unused_parameters=True
60
+ )
61
+
62
+ params = (
63
+ model.module.parameters_w_grad()
64
+ if world_size > 1
65
+ else model.parameters_w_grad()
66
+ )
67
+ print("Total params: %.2fM" % (sum(p.numel() for p in params) / 1000000.0))
68
+ print("Training...")
69
+
70
+ TrainLoop(
71
+ args, train_platform, model, diffusion, data, writer, rank, world_size
72
+ ).run_loop()
73
+ train_platform.close()
74
+ cleanup()
75
+
76
+
77
+ if __name__ == "__main__":
78
+ world_size = torch.cuda.device_count()
79
+ print(f"using {world_size} gpus")
80
+ if world_size > 1:
81
+ mp.spawn(main, args=(world_size,), nprocs=world_size, join=True)
82
+ else:
83
+ main(rank=0, world_size=1)