Spaces:

bytedance-research
/

USO

Running on Zero

App Files Files Community

fenfan committed 2 days ago

Commit

0f74281

1 Parent(s): 44458a9

init commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +3 -0
README copy.md +17 -0
app.py +242 -0
assets/gradio_examples/1subject/config.json +6 -0
assets/gradio_examples/1subject/ref.jpg +3 -0
assets/gradio_examples/2identity/config.json +6 -0
assets/gradio_examples/2identity/ref.webp +3 -0
assets/gradio_examples/3identity/config.json +6 -0
assets/gradio_examples/3identity/ref.jpg +3 -0
assets/gradio_examples/4identity/config.json +6 -0
assets/gradio_examples/4identity/ref.webp +3 -0
assets/gradio_examples/5style/config.json +6 -0
assets/gradio_examples/5style/ref.webp +3 -0
assets/gradio_examples/6style/config.json +6 -0
assets/gradio_examples/6style/ref.webp +3 -0
assets/gradio_examples/7style_subject/config.json +7 -0
assets/gradio_examples/7style_subject/ref1.webp +3 -0
assets/gradio_examples/7style_subject/ref2.webp +3 -0
assets/gradio_examples/8style_subject/config.json +7 -0
assets/gradio_examples/8style_subject/ref1.webp +3 -0
assets/gradio_examples/8style_subject/ref2.webp +3 -0
assets/gradio_examples/9mix_style/config.json +7 -0
assets/gradio_examples/9mix_style/ref1.webp +3 -0
assets/gradio_examples/9mix_style/ref2.webp +3 -0
assets/gradio_examples/identity1.jpg +3 -0
assets/gradio_examples/identity1_result.png +3 -0
assets/gradio_examples/identity2.webp +3 -0
assets/gradio_examples/identity2_style2_result.webp +3 -0
assets/gradio_examples/style1.webp +3 -0
assets/gradio_examples/style1_result.webp +3 -0
assets/gradio_examples/style2.webp +3 -0
assets/gradio_examples/style3.webp +3 -0
assets/gradio_examples/style3_style4_result.webp +3 -0
assets/gradio_examples/style4.webp +3 -0
assets/gradio_examples/z_mix_style/config.json +7 -0
assets/gradio_examples/z_mix_style/ref1.png +3 -0
assets/gradio_examples/z_mix_style/ref2.png +3 -0
assets/gradio_examples/zz_t2i/config.json +5 -0
assets/teaser.webp +3 -0
assets/uso.webp +3 -0
assets/uso_logo.svg +0 -0
assets/uso_text.svg +0 -0
requirements.txt +19 -0
uso/flux/math.py +45 -0
uso/flux/model.py +258 -0
uso/flux/modules/__pycache__/autoencoder.cpython-311.pyc +0 -0
uso/flux/modules/__pycache__/conditioner.cpython-311.pyc +0 -0
uso/flux/modules/__pycache__/layers.cpython-311.pyc +0 -0
uso/flux/modules/autoencoder.py +327 -0
uso/flux/modules/conditioner.py +53 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text

README copy.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+title: USO
+emoji: 💻
+colorFrom: indigo
+colorTo: purple
+sdk: gradio
+sdk_version: 5.23.3
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: Freely Combining Any Subjects with Any Styles Across All Scenarios.
+models:
+  - black-forest-labs/FLUX.1-dev
+  - bytedance-research/UNO
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,242 @@

+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import dataclasses
+import json
+import os
+from pathlib import Path
+import gradio as gr
+import torch
+import spaces
+from uso.flux.pipeline import USOPipeline
+from transformers import SiglipVisionModel, SiglipImageProcessor
+with open("assets/uso_text.svg", "r", encoding="utf-8") as svg_file:  # SVG markup read as text; interpolated into `title` below
+    text_content = svg_file.read()
+with open("assets/uso_logo.svg", "r", encoding="utf-8") as svg_file:  # second SVG, also interpolated into `title`
+    logo_content = svg_file.read()
+title = f"""
+<div style="display: flex; align-items: center; justify-content: center;">
+    <span style="transform: scale(0.7);margin-right: -5px;">{text_content}</span>
+    <span style="font-size: 1.8em;margin-left: -10px;font-weight: bold; font-family: Gill Sans;">by UXO Team</span>
+    <span style="margin-left: 0px; transform: scale(0.85); display: inline-block;">{logo_content}</span>
+</div>
+""".strip()  # HTML header block; rendered with gr.Markdown(title) inside create_demo
+badges_text = r"""
+<div style="text-align: center; display: flex; justify-content: center; gap: 5px;">
+<a href="https://github.com/bytedance/USO"><img src="https://img.shields.io/static/v1?label=GitHub&message=Code&color=green&logo=github"></a>
+<a href="https://bytedance.github.io/USO/"><img alt="Build" src="https://img.shields.io/badge/Project%20Page-USO-yellow"></a>
+<a href="https://arxiv.org/abs/2504.02160"><img alt="Build" src="https://img.shields.io/badge/arXiv%20paper-USO-b31b1b.svg"></a>
+<a href="https://huggingface.co/bytedance-research/USO"><img src="https://img.shields.io/static/v1?label=%F0%9F%A4%97%20Hugging%20Face&message=Model&color=orange"></a>
+</div>
+""".strip()  # row of badge links; rendered with gr.Markdown(badges_text) inside create_demo
+tips = """
+ 📌 **What is USO?**
+USO is a unified style-subject optimized customization model and the latest addition to the UXO family (<a href='https://github.com/bytedance/USO' target='_blank'> USO</a> and <a href='https://github.com/bytedance/UNO' target='_blank'> UNO</a>).
+It can freely combine arbitrary subjects with arbitrary styles in any scenarios.
+ 💡 **How to use?**
+We provide step-by-step instructions in our <a href='https://github.com/bytedance/USO' target='_blank'> Github Repo</a>.
+Additionally, try the examples provided below the demo to quickly get familiar with USO and spark your creativity!
+ ⚡️ The model is trained on 1024x1024 resolution and supports 3 types of usage:
+* **Only content img**: support following types:
+  * Subject/Identity-driven (supports natural prompt, e.g., *A clock on the table.* *The woman near the sea.*, excels in producing **photorealistic portraits**)
+  * Style edit (layout-preserved): *Transform the image into Ghibli style/Pixel style/Retro comic style/Watercolor painting style...*.
+  * Style edit (layout-shift): *Ghibli style, the man on the beach.*.
+* **Only style img**: Reference input style and generate anything following prompt. Excelling in this and further support multiple style references (in beta).
+* **Content img + style img**: Place the content into the desired style.
+  * Layout-preserved: set prompt to **empty**.
+  * Layout-shift: using natural prompt."""  # usage notes; rendered with gr.Markdown(tips) inside create_demo
+star = r"""
+If USO is helpful, please help to ⭐ our <a href='https://github.com/bytedance/USO' target='_blank'> Github Repo</a>. Thanks a lot!"""  # footer text; rendered with gr.Markdown(star) right after the Generate button
+def get_examples(examples_dir: str = "assets/examples") -> list:  # Build one example row per non-empty sub-directory of `examples_dir`, read from its config.json.
+    examples = Path(examples_dir)
+    ans = []  # each row: [usage, prompt, ref1 path or None, ref2 path or None, ref3 path or None, seed]
+    for example in examples.iterdir():  # NOTE(review): iterdir() yields entries in arbitrary order, so row order is OS-dependent; consider sorted() if the prefixed directory names are meant to fix it
+        if not example.is_dir() or len(os.listdir(example)) == 0:  # skip plain files and empty directories
+            continue
+        with open(example / "config.json") as f:  # a non-empty directory without config.json raises FileNotFoundError here
+            example_dict = json.load(f)
+        example_list = []
+        example_list.append(example_dict["usage"])  # usage label of the example (the "Case For:" column in the UI)
+        example_list.append(example_dict["prompt"])  # prompt
+        for key in ["image_ref1", "image_ref2", "image_ref3"]:  # always emit three image slots so every row has the same length
+            if key in example_dict:
+                example_list.append(str(example / example_dict[key]))  # config paths are relative to the example's own directory
+            else:
+                example_list.append(None)
+        example_list.append(example_dict["seed"])  # "usage", "prompt" and "seed" are required keys (KeyError if missing)
+        ans.append(example_list)
+    return ans
+def create_demo(  # Load USOPipeline and the SigLIP vision model, build the Gradio Blocks UI and return it (launching is left to the caller).
+    model_type: str,
+    device: str = "cuda" if torch.cuda.is_available() else "cpu",  # default is evaluated once, when the function is defined
+    offload: bool = False,
+):
+    pipeline = USOPipeline(
+        model_type, device, offload, only_lora=True, lora_rank=128, hf_download=True
+    )
+    print("USOPipeline loaded successfully")
+    siglip_processor = SiglipImageProcessor.from_pretrained(
+        "google/siglip-so400m-patch14-384"
+    )
+    siglip_model = SiglipVisionModel.from_pretrained(
+        "google/siglip-so400m-patch14-384"
+    )
+    siglip_model.eval()
+    siglip_model.to(device)
+    pipeline.model.vision_encoder = siglip_model  # attach the SigLIP encoder and its processor to the pipeline's model
+    pipeline.model.vision_encoder_processor = siglip_processor
+    print("SigLIP model loaded successfully")
+    pipeline.gradio_generate = spaces.GPU(duration=120)(pipeline.gradio_generate)  # wrap the handler with the spaces.GPU decorator (duration=120)
+    with gr.Blocks() as demo:
+        gr.Markdown(title)
+        gr.Markdown(badges_text)
+        gr.Markdown(tips)
+        with gr.Row():
+            with gr.Column():  # left column: all inputs and the Generate button
+                prompt = gr.Textbox(label="Prompt", value="A beautiful woman.")
+                with gr.Row():
+                    image_prompt1 = gr.Image(
+                        label="Content Reference Img", visible=True, interactive=True, type="pil"
+                    )
+                    image_prompt2 = gr.Image(
+                        label="Style Reference Img", visible=True, interactive=True, type="pil"
+                    )
+                    image_prompt3 = gr.Image(
+                        label="Extra Style Reference Img (Beta)", visible=True, interactive=True, type="pil"
+                    )
+                with gr.Row():
+                    with gr.Row():
+                        width = gr.Slider(
+                            512, 1536, 1024, step=16, label="Generation Width"
+                        )
+                        height = gr.Slider(
+                            512, 1536, 1024, step=16, label="Generation Height"
+                        )
+                with gr.Row():
+                    with gr.Row():
+                        keep_size = gr.Checkbox(
+                            label="Keep input size",
+                            value=False,
+                            interactive=True
+                        )
+                    with gr.Column():
+                        gr.Markdown("Set it to True if you only need style editing or want to keep the layout.")
+                with gr.Accordion("Advanced Options", open=True):
+                    with gr.Row():
+                        num_steps = gr.Slider(
+                            1, 50, 25, step=1, label="Number of steps"
+                        )
+                        guidance = gr.Slider(
+                            1.0, 5.0, 4.0, step=0.1, label="Guidance", interactive=True
+                        )
+                        content_long_size = gr.Slider(
+                            0, 1024, 512, step=16, label="Content reference size"
+                        )
+                        seed = gr.Number(-1, label="Seed (-1 for random)")
+                generate_btn = gr.Button("Generate")
+                gr.Markdown(star)
+            with gr.Column():  # right column: generated image and downloadable file
+                output_image = gr.Image(label="Generated Image")
+                download_btn = gr.File(
+                    label="Download full-resolution", type="filepath", interactive=False
+                )
+            inputs = [  # component order here is the positional argument order passed to pipeline.gradio_generate
+                prompt,
+                image_prompt1,
+                image_prompt2,
+                image_prompt3,
+                seed,
+                width,
+                height,
+                guidance,
+                num_steps,
+                keep_size,
+                content_long_size,
+            ]
+            generate_btn.click(
+                fn=pipeline.gradio_generate,
+                inputs=inputs,
+                outputs=[output_image, download_btn],
+            )
+        example_text = gr.Text("", visible=False, label="Case For:")  # hidden component that receives each example's "usage" value
+        examples = get_examples("./assets/gradio_examples")
+        gr.Examples(  # NOTE(review): only 6 input components are listed here versus 11 for the button; confirm gradio_generate tolerates this before enabling caching or run-on-click
+            examples=examples,
+            inputs=[
+                example_text,
+                prompt,
+                image_prompt1,
+                image_prompt2,
+                image_prompt3,
+                seed,
+            ],
+            # cache_examples='lazy',
+            outputs=[output_image, download_btn],
+            fn=pipeline.gradio_generate,
+        )
+    return demo
+if __name__ == "__main__":  # script entry point: parse AppArgs from the command line, build the demo and launch it
+    from typing import Literal
+    from transformers import HfArgumentParser
+    @dataclasses.dataclass
+    class AppArgs:  # command-line options; HfArgumentParser derives the flags from these fields
+        name: Literal["flux-dev", "flux-dev-fp8", "flux-schnell", "flux-krea-dev"] = "flux-dev"  # passed to create_demo as model_type
+        device: Literal["cuda", "cpu"] = "cuda" if torch.cuda.is_available() else "cpu"
+        offload: bool = dataclasses.field(
+            default=False,
+            metadata={
+                "help": "If True, sequantial offload the models(ae, dit, text encoder) to CPU if not used."
+            },
+        )
+        port: int = 7860  # forwarded to demo.launch(server_port=...)
+    parser = HfArgumentParser([AppArgs])
+    args_tuple = parser.parse_args_into_dataclasses()  # type: tuple[AppArgs]
+    args = args_tuple[0]  # one dataclass type was registered, so the first element is the AppArgs instance
+    demo = create_demo(args.name, args.device, args.offload)
+    demo.launch(server_port=args.port)

assets/gradio_examples/1subject/config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "prompt": "Wool felt style, a clock in the jungle.",
+    "seed": 3407,
+    "usage": "Subject-driven",
+    "image_ref1": "./ref.jpg"
+}

assets/gradio_examples/1subject/ref.jpg ADDED Viewed

Git LFS Details

SHA256: 0e1eb6ca2c944f3bfaed3ace56f5f186ed073a477e0333e0237253d98f0c9267
Pointer size: 131 Bytes
Size of remote file: 139 kB

assets/gradio_examples/2identity/config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "prompt": "The girl is riding a bike in a street.",
+    "seed": 3407,
+    "usage": "Identity-driven",
+    "image_ref1": "./ref.webp"
+}

assets/gradio_examples/2identity/ref.webp ADDED Viewed

Git LFS Details

SHA256: 4e97502bd7eebd6692604f891f836f25c7c30dcac8d15c4d42cc874efc51fcc5
Pointer size: 130 Bytes
Size of remote file: 85.8 kB

assets/gradio_examples/3identity/config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "prompt": "The man in flower shops carefully match bouquets, conveying beautiful emotions and blessings with flowers.",
+    "seed": 3407,
+    "usage": "Identity-driven",
+    "image_ref1": "./ref.jpg"
+}

assets/gradio_examples/3identity/ref.jpg ADDED Viewed

Git LFS Details

SHA256: 2730103b6b9ebaf47b44ef9a9d7fbb722de7878a101af09f0b85f8dfadb4c8a4
Pointer size: 130 Bytes
Size of remote file: 30.6 kB

assets/gradio_examples/4identity/config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "prompt": "Transform the image into Ghibli style.",
+    "seed": 3407,
+    "usage": "Identity-driven",
+    "image_ref1": "./ref.webp"
+}

assets/gradio_examples/4identity/ref.webp ADDED Viewed

Git LFS Details

SHA256: f8ed8aa1c0714c939392e2c033735d6266e53266079bb300cbf05a6824a49f9f
Pointer size: 130 Bytes
Size of remote file: 38.8 kB

assets/gradio_examples/5style/config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "prompt": "A cat sleeping on a chair.",
+    "seed": 3407,
+    "usage": "Style-driven",
+    "image_ref2": "./ref.webp"
+}

assets/gradio_examples/5style/ref.webp ADDED Viewed

Git LFS Details

SHA256: 9ebf56d2d20ae5c49a582ff6bfef64b13022d0c624d9de25ed91047380fdfcfe
Pointer size: 130 Bytes
Size of remote file: 52.3 kB

assets/gradio_examples/6style/config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "prompt": "A beautiful woman.",
+    "seed": 3407,
+    "usage": "Style-driven",
+    "image_ref2": "./ref.webp"
+}

assets/gradio_examples/6style/ref.webp ADDED Viewed

Git LFS Details

SHA256: 40c013341c8708b53094e3eaa377b3dfccdc9e77e215ad15d2ac2e875b4c494a
Pointer size: 130 Bytes
Size of remote file: 58.2 kB

assets/gradio_examples/7style_subject/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "prompt": "",
+    "seed": 321,
+    "usage": "Style-subject-driven (layout-preserved)",
+    "image_ref1": "./ref1.webp",
+    "image_ref2": "./ref2.webp"
+}

assets/gradio_examples/7style_subject/ref1.webp ADDED Viewed

Git LFS Details

SHA256: f8ed8aa1c0714c939392e2c033735d6266e53266079bb300cbf05a6824a49f9f
Pointer size: 130 Bytes
Size of remote file: 38.8 kB

assets/gradio_examples/7style_subject/ref2.webp ADDED Viewed

Git LFS Details

SHA256: 175d6e5b975b4d494950250740c0fe371a7e9b2c93c59a3ae82b82be72ccc0f6
Pointer size: 130 Bytes
Size of remote file: 14.2 kB

assets/gradio_examples/8style_subject/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "prompt": "The woman gave an impassioned speech on the podium.",
+    "seed": 321,
+    "usage": "Style-subject-driven (layout-shifted)",
+    "image_ref1": "./ref1.webp",
+    "image_ref2": "./ref2.webp"
+}

assets/gradio_examples/8style_subject/ref1.webp ADDED Viewed

Git LFS Details

SHA256: f8ed8aa1c0714c939392e2c033735d6266e53266079bb300cbf05a6824a49f9f
Pointer size: 130 Bytes
Size of remote file: 38.8 kB

assets/gradio_examples/8style_subject/ref2.webp ADDED Viewed

Git LFS Details

SHA256: 0235262d9bd1070155536352ccf195f9875ead0d3379dee7285c0aaae79f6464
Pointer size: 130 Bytes
Size of remote file: 39.1 kB

assets/gradio_examples/9mix_style/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "prompt": "A man.",
+    "seed": 321,
+    "usage": "Multi-style-driven",
+    "image_ref2": "./ref1.webp",
+    "image_ref3": "./ref2.webp"
+}

assets/gradio_examples/9mix_style/ref1.webp ADDED Viewed

Git LFS Details

SHA256: a1d272a0ecb03126503446b00a2152deab2045f89ac2c01f948e1099589d2862
Pointer size: 131 Bytes
Size of remote file: 142 kB

assets/gradio_examples/9mix_style/ref2.webp ADDED Viewed

Git LFS Details

SHA256: b1ce04559726509672ce859d617a08d8dff8b2fe28f503fecbca7a5f66082882
Pointer size: 131 Bytes
Size of remote file: 290 kB

assets/gradio_examples/identity1.jpg ADDED Viewed

Git LFS Details

SHA256: 2730103b6b9ebaf47b44ef9a9d7fbb722de7878a101af09f0b85f8dfadb4c8a4
Pointer size: 130 Bytes
Size of remote file: 30.6 kB

assets/gradio_examples/identity1_result.png ADDED Viewed

Git LFS Details

SHA256: 7684256e44ce1bd4ada1e77a12674432eddd95b07fb388673899139afc56d864
Pointer size: 132 Bytes
Size of remote file: 1.54 MB

assets/gradio_examples/identity2.webp ADDED Viewed

Git LFS Details

SHA256: f8ed8aa1c0714c939392e2c033735d6266e53266079bb300cbf05a6824a49f9f
Pointer size: 130 Bytes
Size of remote file: 38.8 kB

assets/gradio_examples/identity2_style2_result.webp ADDED Viewed

Git LFS Details

SHA256: 8376b6dc02d304616c09ecf09c7dbabb16c7c9142fb4db21f576a15a1ec24062
Pointer size: 130 Bytes
Size of remote file: 43.9 kB

assets/gradio_examples/style1.webp ADDED Viewed

Git LFS Details

SHA256: 9ebf56d2d20ae5c49a582ff6bfef64b13022d0c624d9de25ed91047380fdfcfe
Pointer size: 130 Bytes
Size of remote file: 52.3 kB

assets/gradio_examples/style1_result.webp ADDED Viewed

Git LFS Details

SHA256: 16a4353dd83b1c48499e222d6f77904e1fda23c1649ea5f6cca6b00b0fca3069
Pointer size: 130 Bytes
Size of remote file: 61.1 kB

assets/gradio_examples/style2.webp ADDED Viewed

Git LFS Details

SHA256: 0235262d9bd1070155536352ccf195f9875ead0d3379dee7285c0aaae79f6464
Pointer size: 130 Bytes
Size of remote file: 39.1 kB

assets/gradio_examples/style3.webp ADDED Viewed

Git LFS Details

SHA256: a1d272a0ecb03126503446b00a2152deab2045f89ac2c01f948e1099589d2862
Pointer size: 131 Bytes
Size of remote file: 142 kB

assets/gradio_examples/style3_style4_result.webp ADDED Viewed

Git LFS Details

SHA256: d09a5e429cc1d059aecd041e061868cd8e5b59f4718bb0f926fd84364f3794b0
Pointer size: 131 Bytes
Size of remote file: 173 kB

assets/gradio_examples/style4.webp ADDED Viewed

Git LFS Details

SHA256: b1ce04559726509672ce859d617a08d8dff8b2fe28f503fecbca7a5f66082882
Pointer size: 131 Bytes
Size of remote file: 290 kB

assets/gradio_examples/z_mix_style/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "prompt": "Boat on water.",
+    "seed": 321,
+    "usage": "Multi-style-driven",
+    "image_ref2": "./ref1.png",
+    "image_ref3": "./ref2.png"
+}

assets/gradio_examples/z_mix_style/ref1.png ADDED Viewed

Git LFS Details

SHA256: 5c31ba662c85f4032abf079dfeb9cba08d797b7b63f1d661c5270b373b00d095
Pointer size: 130 Bytes
Size of remote file: 26.1 kB

assets/gradio_examples/z_mix_style/ref2.png ADDED Viewed

Git LFS Details

SHA256: c47d23d5ffdbf30b4a8f6c1bc5d07a730825eaac8363c13bdac8e3bb8c330aed
Pointer size: 130 Bytes
Size of remote file: 14.7 kB

assets/gradio_examples/zz_t2i/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "prompt": "A beautiful woman.",
+    "seed": -1,
+    "usage": "Text-to-image"
+}

assets/teaser.webp ADDED Viewed

Git LFS Details

SHA256: 543c724f6b929303046ae481672567fe4a9620f0af5ca1dfff215dc7a2cbff5f
Pointer size: 132 Bytes
Size of remote file: 1.67 MB

assets/uso.webp ADDED Viewed

Git LFS Details

SHA256: 772957e867da33550437fa547202d0f995011353ef9a24036d23596dae1a1632
Pointer size: 130 Bytes
Size of remote file: 58.2 kB

assets/uso_logo.svg ADDED Viewed

assets/uso_text.svg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+accelerate==1.1.1
+deepspeed==0.14.4
+einops==0.8.0
+transformers==4.43.3
+huggingface-hub
+diffusers==0.30.1
+sentencepiece==0.2.0
+gradio==5.22.0
+opencv-python
+matplotlib
+safetensors==0.4.5
+scipy==1.10.1
+numpy==1.24.4
+onnxruntime-gpu
+# httpx==0.23.3
+git+https://github.com/openai/CLIP.git
+--extra-index-url https://download.pytorch.org/whl/cu124
+torch==2.4.0
+torchvision==0.19.0

uso/flux/math.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from einops import rearrange
+from torch import Tensor
+def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:  # Rotary-embed q/k with `pe`, run scaled dot-product attention, then merge the head axis into the feature axis.
+    q, k = apply_rope(q, k, pe)  # v is left untouched; only q and k receive the rotary embedding
+    x = torch.nn.functional.scaled_dot_product_attention(q, k, v)  # called with default options (no mask argument is passed)
+    x = rearrange(x, "B H L D -> B L (H D)")  # fold heads into the last dimension
+    return x
+def rope(pos: Tensor, dim: int, theta: int) -> Tensor:  # Build per-position 2x2 rotation matrices for rotary embeddings, with dim/2 frequencies.
+    assert dim % 2 == 0  # features are rotated in pairs, so dim must be even
+    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim  # computed in float64
+    omega = 1.0 / (theta**scale)  # one frequency per feature pair
+    out = torch.einsum("...n,d->...nd", pos, omega)  # outer product: angle = position * frequency
+    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)  # flattened [[cos, -sin], [sin, cos]]
+    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)  # unflatten the last axis into 2x2 matrices
+    return out.float()  # result is cast to float32
+def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:  # Apply the 2x2 matrices in `freqs_cis` to consecutive feature pairs of xq and xk.
+    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)  # split the last dim into pairs; computation is done in float32
+    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]  # matrix-vector product written as column0 * x0 + column1 * x1
+    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)  # restore the original shapes and dtypes

uso/flux/model.py ADDED Viewed

	@@ -0,0 +1,258 @@

+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+import torch
+from torch import Tensor, nn
+from .modules.layers import (
+    DoubleStreamBlock,
+    EmbedND,
+    LastLayer,
+    MLPEmbedder,
+    SingleStreamBlock,
+    timestep_embedding,
+    SigLIPMultiFeatProjModel,
+)
+import os
+@dataclass
+class FluxParams:  # Hyper-parameters consumed by Flux.__init__.
+    in_channels: int  # input width of the img_in projection; also used as the model's out_channels
+    vec_in_dim: int  # input width of the vector_in MLPEmbedder (applied to `y` in forward)
+    context_in_dim: int  # input width of the txt_in projection
+    hidden_size: int  # model width; must be divisible by num_heads
+    mlp_ratio: float  # forwarded to every DoubleStreamBlock and SingleStreamBlock
+    num_heads: int
+    depth: int  # number of DoubleStreamBlock layers
+    depth_single_blocks: int  # number of SingleStreamBlock layers
+    axes_dim: list[int]  # must sum to hidden_size // num_heads
+    theta: int  # forwarded to the EmbedND positional embedder
+    qkv_bias: bool  # forwarded to DoubleStreamBlock
+    guidance_embed: bool  # when True, forward() requires a `guidance` tensor and embeds it
+class Flux(nn.Module):
+    """
+    Transformer model for flow matching on sequences.
+    """
+    _supports_gradient_checkpointing = True
+    def __init__(self, params: FluxParams):  # Validate the head/positional dimensions, then build embedders, transformer blocks and the SigLIP feature projector.
+        super().__init__()
+        self.params = params
+        self.in_channels = params.in_channels
+        self.out_channels = self.in_channels
+        if params.hidden_size % params.num_heads != 0:
+            raise ValueError(
+                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
+            )
+        pe_dim = params.hidden_size // params.num_heads  # per-head width; axes_dim must add up to it
+        if sum(params.axes_dim) != pe_dim:
+            raise ValueError(
+                f"Got {params.axes_dim} but expected positional dim {pe_dim}"
+            )
+        self.hidden_size = params.hidden_size
+        self.num_heads = params.num_heads
+        self.pe_embedder = EmbedND(
+            dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim
+        )
+        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)  # 256 matches timestep_embedding(timesteps, 256) in forward
+        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
+            if params.guidance_embed
+            else nn.Identity()
+        )
+        self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=params.mlp_ratio,
+                    qkv_bias=params.qkv_bias,
+                )
+                for _ in range(params.depth)
+            ]
+        )
+        self.single_blocks = nn.ModuleList(
+            [
+                SingleStreamBlock(
+                    self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio
+                )
+                for _ in range(params.depth_single_blocks)
+            ]
+        )
+        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
+        self.gradient_checkpointing = False
+        # feature embedder for siglip multi-feat inputs
+        self.feature_embedder = SigLIPMultiFeatProjModel(
+            siglip_token_nums=729,
+            style_token_nums=64,
+            siglip_token_dims=1152,
+            hidden_size=self.hidden_size,
+            context_layer_norm=True,
+        )
+        print("use semantic encoder siglip multi-feat to encode style image")
+        self.vision_encoder = None  # not created here; forward() skips the SigLIP branch while this is None
+    def _set_gradient_checkpointing(self, module, value=False):  # Set `gradient_checkpointing` on `module` if it has that attribute.
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+    @property
+    def attn_processors(self):  # Return {"<module path>.processor": processor} for every sub-module exposing `set_processor`.
+        # set recursively
+        processors = {}  # type: dict[str, nn.Module]
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
+            if hasattr(module, "set_processor"):
+                processors[f"{name}.processor"] = module.processor
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    def set_attn_processor(self, processor):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))  # NOTE(review): pop() consumes entries from the caller's dict
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    def forward(  # Run the model on image tokens, optionally conditioned on reference-image tokens and SigLIP features; returns outputs for the target image tokens only.
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        txt: Tensor,
+        txt_ids: Tensor,
+        timesteps: Tensor,
+        y: Tensor,
+        guidance: Tensor | None = None,
+        ref_img: Tensor | None = None,
+        ref_img_ids: Tensor | None = None,
+        siglip_inputs: list[Tensor] | None = None,
+    ) -> Tensor:
+        if img.ndim != 3 or txt.ndim != 3:
+            raise ValueError("Input img and txt tensors must have 3 dimensions.")
+        # running on sequences img
+        img = self.img_in(img)
+        vec = self.time_in(timestep_embedding(timesteps, 256))  # conditioning vector: timestep (+ guidance) + projected y
+        if self.params.guidance_embed:
+            if guidance is None:
+                raise ValueError(
+                    "Didn't get guidance strength for guidance distilled model."
+                )
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
+        vec = vec + self.vector_in(y)
+        txt = self.txt_in(txt)
+        if self.feature_embedder is not None and siglip_inputs is not None and len(siglip_inputs) > 0 and self.vision_encoder is not None:  # SigLIP branch needs both inputs and an attached vision encoder
+            # processing style feat into textural hidden space
+            siglip_embedding = [self.vision_encoder(**emb, output_hidden_states=True) for emb in siglip_inputs]  # each entry is unpacked as keyword arguments
+            # siglip_embedding = [self.vision_encoder(**(emb.to(torch.bfloat16)), output_hidden_states=True) for emb in siglip_inputs]
+            siglip_embedding = torch.cat([self.feature_embedder(emb) for emb in siglip_embedding], dim=1)
+            txt = torch.cat((siglip_embedding, txt), dim=1)  # projected SigLIP tokens are prepended to the text tokens
+            siglip_embedding_ids = torch.zeros(
+                siglip_embedding.shape[0], siglip_embedding.shape[1], 3
+            ).to(txt_ids.device)  # all-zero position ids for the prepended tokens
+            txt_ids = torch.cat((siglip_embedding_ids, txt_ids), dim=1)
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        # concat ref_img/img
+        img_end = img.shape[1]  # token count of the target image, recorded before reference tokens are appended
+        if ref_img is not None:
+            if isinstance(ref_img, tuple) or isinstance(ref_img, list):
+                img_in = [img] + [self.img_in(ref) for ref in ref_img]
+                img_ids = [ids] + [ref_ids for ref_ids in ref_img_ids]  # NOTE(review): rebinds img_ids to a list whose first item is the combined txt+img ids
+                img = torch.cat(img_in, dim=1)
+                ids = torch.cat(img_ids, dim=1)
+            else:
+                img = torch.cat((img, self.img_in(ref_img)), dim=1)
+                ids = torch.cat((ids, ref_img_ids), dim=1)
+        pe = self.pe_embedder(ids)
+        for index_block, block in enumerate(self.double_blocks):  # index_block is not used in the loop body
+            if self.training and self.gradient_checkpointing:
+                img, txt = torch.utils.checkpoint.checkpoint(
+                    block,
+                    img=img,
+                    txt=txt,
+                    vec=vec,
+                    pe=pe,
+                    use_reentrant=False,
+                )
+            else:
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
+        img = torch.cat((txt, img), 1)  # single-stream blocks operate on the joined [txt, img] sequence
+        for block in self.single_blocks:
+            if self.training and self.gradient_checkpointing:
+                img = torch.utils.checkpoint.checkpoint(
+                    block, img, vec=vec, pe=pe, use_reentrant=False
+                )
+            else:
+                img = block(img, vec=vec, pe=pe)
+        img = img[:, txt.shape[1] :, ...]  # drop the leading text (and SigLIP) tokens
+        # index img
+        img = img[:, :img_end, ...]  # keep only the target-image tokens, discarding appended reference tokens
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        return img

uso/flux/modules/__pycache__/autoencoder.cpython-311.pyc ADDED Viewed

Binary file (18.9 kB). View file

uso/flux/modules/__pycache__/conditioner.cpython-311.pyc ADDED Viewed

Binary file (2.6 kB). View file

uso/flux/modules/__pycache__/layers.cpython-311.pyc ADDED Viewed

Binary file (37.3 kB). View file

uso/flux/modules/autoencoder.py ADDED Viewed

	@@ -0,0 +1,327 @@

+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+import torch
+from einops import rearrange
+from torch import Tensor, nn
+@dataclass
+class AutoEncoderParams:
+    # Hyper-parameters consumed by AutoEncoder to build its Encoder and Decoder.
+    # Input resolution; the Decoder divides it by 2 ** (len(ch_mult) - 1) to
+    # derive the latent spatial size stored in its z_shape.
+    resolution: int
+    # Channel count of the tensor fed to the Encoder's first convolution.
+    in_channels: int
+    # Base channel width; each level uses ch * ch_mult[level] channels.
+    ch: int
+    # Channel count produced by the Decoder's final convolution.
+    out_ch: int
+    # Per-level channel multipliers; its length is the number of resolutions.
+    ch_mult: list[int]
+    # Residual blocks per level (the Decoder builds one extra per level).
+    num_res_blocks: int
+    # Latent channels; the Encoder emits 2 * z_channels (mean and log-variance).
+    z_channels: int
+    # encode() returns scale_factor * (z - shift_factor); decode() inverts it.
+    scale_factor: float
+    shift_factor: float
+def swish(x: Tensor) -> Tensor:
+    return x * torch.sigmoid(x)
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+    def attention(self, h_: Tensor) -> Tensor:
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        b, c, h, w = q.shape
+        q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
+        k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
+        v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
+        h_ = nn.functional.scaled_dot_product_attention(q, k, v)
+        return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.proj_out(self.attention(x))
+class ResnetBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = swish(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = swish(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            x = self.nin_shortcut(x)
+        return x + h
+class Downsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        # no asymmetric padding in torch conv, must do it ourselves
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+    def forward(self, x: Tensor):
+        pad = (0, 1, 0, 1)
+        x = nn.functional.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+class Upsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: Tensor):
+        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        x = self.conv(x)
+        return x
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+        curr_res = resolution
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        block_in = self.ch
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: Tensor) -> Tensor:
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        ch: int,
+        out_ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        in_channels: int,
+        resolution: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.ffactor = 2 ** (self.num_resolutions - 1)
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        # z to block_in
+        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+    def forward(self, z: Tensor) -> Tensor:
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+class DiagonalGaussian(nn.Module):
+    def __init__(self, sample: bool = True, chunk_dim: int = 1):
+        super().__init__()
+        self.sample = sample
+        self.chunk_dim = chunk_dim
+    def forward(self, z: Tensor) -> Tensor:
+        mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
+        if self.sample:
+            std = torch.exp(0.5 * logvar)
+            return mean + std * torch.randn_like(mean)
+        else:
+            return mean
+class AutoEncoder(nn.Module):
+    def __init__(self, params: AutoEncoderParams):
+        super().__init__()
+        self.encoder = Encoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.decoder = Decoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            out_ch=params.out_ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.reg = DiagonalGaussian()
+        self.scale_factor = params.scale_factor
+        self.shift_factor = params.shift_factor
+    def encode(self, x: Tensor) -> Tensor:
+        z = self.reg(self.encoder(x))
+        z = self.scale_factor * (z - self.shift_factor)
+        return z
+    def decode(self, z: Tensor) -> Tensor:
+        z = z / self.scale_factor + self.shift_factor
+        return self.decoder(z)
+    def forward(self, x: Tensor) -> Tensor:
+        return self.decode(self.encode(x))

uso/flux/modules/conditioner.py ADDED Viewed

	@@ -0,0 +1,53 @@

+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from torch import Tensor, nn
+from transformers import (CLIPTextModel, CLIPTokenizer, T5EncoderModel,
+                          T5Tokenizer)
+class HFEmbedder(nn.Module):
+    def __init__(self, version: str, max_length: int, **hf_kwargs):
+        super().__init__()
+        self.is_clip = "clip" in version.lower()
+        self.max_length = max_length
+        self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
+        if self.is_clip:
+            self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(version, max_length=max_length)
+            self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(version, **hf_kwargs)
+        else:
+            self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(version, max_length=max_length)
+            self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(version, **hf_kwargs)
+        self.hf_module = self.hf_module.eval().requires_grad_(False)
+    def forward(self, text: list[str]) -> Tensor:
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=False,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        outputs = self.hf_module(
+            input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
+            attention_mask=None,
+            output_hidden_states=False,
+        )
+        return outputs[self.output_key]