tonyshark committed
Commit cc69848 · verified · Parent(s): 1b6dc48

Upload 132 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +7 -0
  2. Dockerfile +53 -0
  3. Dockerfile.cuda12.4 +53 -0
  4. LICENSE +7 -0
  5. README.md +54 -0
  6. __init__.py +12 -0
  7. __pycache__/train_network.cpython-310.pyc +0 -0
  8. advanced.png +3 -0
  9. app-launch.sh +5 -0
  10. app.py +1119 -0
  11. datasets/1 +0 -0
  12. docker-compose.yml +28 -0
  13. fine_tune.py +560 -0
  14. flags.png +0 -0
  15. flow.gif +3 -0
  16. flux_extract_lora.py +221 -0
  17. flux_train_comfy.py +806 -0
  18. flux_train_network_comfy.py +500 -0
  19. hf_token.json +3 -0
  20. icon.png +0 -0
  21. install.js +96 -0
  22. library/__init__.py +0 -0
  23. library/__pycache__/__init__.cpython-310.pyc +0 -0
  24. library/__pycache__/config_util.cpython-310.pyc +0 -0
  25. library/__pycache__/custom_offloading_utils.cpython-310.pyc +0 -0
  26. library/__pycache__/custom_train_functions.cpython-310.pyc +0 -0
  27. library/__pycache__/deepspeed_utils.cpython-310.pyc +0 -0
  28. library/__pycache__/device_utils.cpython-310.pyc +0 -0
  29. library/__pycache__/flux_models.cpython-310.pyc +0 -0
  30. library/__pycache__/flux_train_utils.cpython-310.pyc +0 -0
  31. library/__pycache__/flux_utils.cpython-310.pyc +0 -0
  32. library/__pycache__/huggingface_util.cpython-310.pyc +0 -0
  33. library/__pycache__/model_util.cpython-310.pyc +0 -0
  34. library/__pycache__/original_unet.cpython-310.pyc +0 -0
  35. library/__pycache__/sai_model_spec.cpython-310.pyc +0 -0
  36. library/__pycache__/sd3_models.cpython-310.pyc +0 -0
  37. library/__pycache__/sd3_utils.cpython-310.pyc +0 -0
  38. library/__pycache__/strategy_base.cpython-310.pyc +0 -0
  39. library/__pycache__/strategy_sd.cpython-310.pyc +0 -0
  40. library/__pycache__/train_util.cpython-310.pyc +3 -0
  41. library/__pycache__/utils.cpython-310.pyc +0 -0
  42. library/adafactor_fused.py +138 -0
  43. library/attention_processors.py +227 -0
  44. library/config_util.py +717 -0
  45. library/custom_offloading_utils.py +227 -0
  46. library/custom_train_functions.py +556 -0
  47. library/deepspeed_utils.py +139 -0
  48. library/device_utils.py +84 -0
  49. library/flux_models.py +1060 -0
  50. library/flux_train_utils.py +585 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ advanced.png filter=lfs diff=lfs merge=lfs -text
+ flow.gif filter=lfs diff=lfs merge=lfs -text
+ library/__pycache__/train_util.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+ publish_to_hf.png filter=lfs diff=lfs merge=lfs -text
+ sample.png filter=lfs diff=lfs merge=lfs -text
+ screenshot.png filter=lfs diff=lfs merge=lfs -text
+ seed.gif filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,53 @@
+ # Base image with CUDA 12.2
+ FROM nvidia/cuda:12.2.2-base-ubuntu22.04
+
+ # Install pip if not already installed
+ RUN apt-get update -y && apt-get install -y \
+     python3-pip \
+     python3-dev \
+     git \
+     build-essential # Install dependencies for building extensions
+
+ # Define environment variables for UID and GID and local timezone
+ # ENV PUID=${PUID:-1000}
+ # ENV PGID=${PGID:-1000}
+
+ # Create a group with the specified GID
+ # RUN groupadd -g "${PGID}" appuser
+ # Create a user with the specified UID and GID
+ # RUN useradd -m -s /bin/sh -u "${PUID}" -g "${PGID}" appuser
+
+ WORKDIR /app
+
+ # Get sd-scripts from kohya-ss and install them
+ RUN git clone -b sd3 https://github.com/kohya-ss/sd-scripts && \
+     cd sd-scripts && \
+     pip install --no-cache-dir -r ./requirements.txt
+
+ # Install main application dependencies
+ COPY ./requirements.txt ./requirements.txt
+ RUN pip install --no-cache-dir -r ./requirements.txt
+
+ # Install Torch, Torchvision, and Torchaudio for CUDA 12.2
+ RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu122/torch_stable.html
+
+ # The appuser account creation is commented out above, so the ownership change is skipped as well
+ # (an active "RUN chown -R appuser:appuser /app" would fail the build without that user)
+ # RUN chown -R appuser:appuser /app
+
+ # Delete redundant requirements.txt and sd-scripts directory within the container
+ RUN rm -r ./sd-scripts
+ RUN rm ./requirements.txt
+ RUN pip install --force-reinstall -v "triton==3.1.0"
+ # Run application as non-root
+ # USER appuser
+
+ # Copy fluxgym application code
+ COPY . ./fluxgym
+
+ EXPOSE 7860
+
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ WORKDIR /app/fluxgym
+
+ # Run fluxgym Python application
+ CMD ["python3", "./app.py"]
Dockerfile.cuda12.4 ADDED
@@ -0,0 +1,53 @@
+ # Base image with CUDA 12.4
+ FROM nvidia/cuda:12.4.1-base-ubuntu22.04
+
+ # Install pip if not already installed
+ RUN apt-get update -y && apt-get install -y \
+     python3-pip \
+     python3-dev \
+     git \
+     build-essential # Install dependencies for building extensions
+
+ # Define environment variables for UID and GID and local timezone
+ ENV PUID=${PUID:-1000}
+ ENV PGID=${PGID:-1000}
+
+ # Create a group with the specified GID
+ RUN groupadd -g "${PGID}" appuser
+ # Create a user with the specified UID and GID
+ RUN useradd -m -s /bin/sh -u "${PUID}" -g "${PGID}" appuser
+
+ WORKDIR /app
+
+ # Get sd-scripts from kohya-ss and install them
+ RUN git clone -b sd3 https://github.com/kohya-ss/sd-scripts && \
+     cd sd-scripts && \
+     pip install --no-cache-dir -r ./requirements.txt
+
+ # Install main application dependencies
+ COPY ./requirements.txt ./requirements.txt
+ RUN pip install --no-cache-dir -r ./requirements.txt
+
+ # Install Torch, Torchvision, and Torchaudio for CUDA 12.4
+ RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+
+ RUN chown -R appuser:appuser /app
+
+ # Delete redundant requirements.txt and sd-scripts directory within the container
+ RUN rm -r ./sd-scripts
+ RUN rm ./requirements.txt
+
+ # Run application as non-root
+ USER appuser
+
+ # Copy fluxgym application code
+ COPY . ./fluxgym
+
+ EXPOSE 7860
+
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ WORKDIR /app/fluxgym
+
+ # Run fluxgym Python application
+ CMD ["python3", "./app.py"]
LICENSE ADDED
@@ -0,0 +1,7 @@
+ Copyright 2024 cocktailpeanut
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md ADDED
@@ -0,0 +1,54 @@
+ # ComfyUI Flux Trainer
+
+ A wrapper for slightly modified versions of kohya's training scripts: https://github.com/kohya-ss/sd-scripts
+
+ Including code from: https://github.com/KohakuBlueleaf/Lycoris
+
+ And from: https://github.com/LoganBooker/prodigy-plus-schedule-free
+
+ ## DISCLAIMER
+ I have **very** little previous experience in training anything; Flux is basically the first model I've been inspired to learn. Previously I've only trained AnimateDiff Motion LoRAs, and built similar training nodes for it.
+
+ ## DO NOT ASK ME FOR TRAINING ADVICE
+ I cannot emphasize this enough: this repository is not the place for questions about training itself; those are better directed to kohya's repo. Even so, keep in mind that my implementation may have mistakes.
+
+ The default settings aren't necessarily any good; they are just the last (out of many) I tried that worked for my dataset.
+
+ # THIS IS EXPERIMENTAL
+ Both these nodes and the underlying implementation by kohya are works in progress and expected to change.
+
+ # Installation
+ 1. Clone this repo into the `custom_nodes` folder.
+ 2. Install dependencies: `pip install -r requirements.txt`
+ or, if you use the portable install, run this in the ComfyUI_windows_portable folder:
+
+ `python_embeded\python.exe -m pip install -r ComfyUI\custom_nodes\ComfyUI-FluxTrainer\requirements.txt`
+
+ In addition, torch version 2.4.0 or higher is highly recommended.
+
+ An example workflow for LoRA training can be found in the examples folder; it utilizes additional nodes from:
+
+ https://github.com/kijai/ComfyUI-KJNodes
+
+ And some (optional) debugging nodes from:
+
+ https://github.com/rgthree/rgthree-comfy
+
+ For LoRA training, the models need to be the normal fp8 or fp16 versions; also make sure the VAE is the non-diffusers version:
+
+ https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+
+ For full model training, the fp16 version of the main model needs to be used.
+
+ ## Why train in ComfyUI?
+ - Familiar UI (obviously only if you are a Comfy user already)
+ - You can use the same models you use for inference
+ - You can use the same Python environment; I faced no incompatibilities
+ - You can build workflows to compare settings etc.
+
+ Currently supports LoRA training, as well as (untested) full fine-tuning, using code from kohya's scripts: https://github.com/kohya-ss/sd-scripts
+
+ Experimental support for LyCORIS training has been added as well, using code from: https://github.com/KohakuBlueleaf/Lycoris
+
+ ![Screenshot 2024-08-21 020207](https://github.com/user-attachments/assets/1686b180-90c8-41d0-8c96-63e76ebc2475)
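The README above recommends torch 2.4.0 or higher, so a quick sanity check of the environment ComfyUI actually runs can save a failed install. This is a minimal sketch, assuming it is executed with the same Python interpreter ComfyUI uses (for the portable install, `python_embeded\python.exe`); the file name `check_torch.py` is invented and not part of this commit:

    # check_torch.py -- hypothetical helper, not part of the upload
    import torch

    print("torch version:", torch.__version__)
    # "2.4.0+cu121" -> (2, 4); compare against the recommended minimum
    major, minor = (int(x) for x in torch.__version__.split("+")[0].split(".")[:2])
    if (major, minor) < (2, 4):
        print("torch is older than the recommended 2.4.0; consider upgrading")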
__init__.py ADDED
@@ -0,0 +1,12 @@
+ from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+ from .nodes_sd3 import NODE_CLASS_MAPPINGS as NODE_CLASS_MAPPINGS_SD3
+ from .nodes_sd3 import NODE_DISPLAY_NAME_MAPPINGS as NODE_DISPLAY_NAME_MAPPINGS_SD3
+ from .nodes_sdxl import NODE_CLASS_MAPPINGS as NODE_CLASS_MAPPINGS_SDXL
+ from .nodes_sdxl import NODE_DISPLAY_NAME_MAPPINGS as NODE_DISPLAY_NAME_MAPPINGS_SDXL
+
+ NODE_CLASS_MAPPINGS.update(NODE_CLASS_MAPPINGS_SD3)
+ NODE_CLASS_MAPPINGS.update(NODE_CLASS_MAPPINGS_SDXL)
+ NODE_DISPLAY_NAME_MAPPINGS.update(NODE_DISPLAY_NAME_MAPPINGS_SD3)
+ NODE_DISPLAY_NAME_MAPPINGS.update(NODE_DISPLAY_NAME_MAPPINGS_SDXL)
+
+ __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
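This `__init__.py` relies on ComfyUI's loader convention: each nodes module exports a `NODE_CLASS_MAPPINGS` dict (node name to class) and a `NODE_DISPLAY_NAME_MAPPINGS` dict (node name to UI label), and they are merged here with plain `dict.update` calls, so later updates win on any name collision. The repository's actual `nodes.py` is not shown in this 50-file view; the sketch below only illustrates the general shape such a module takes, and `ExampleNode` is invented for illustration:

    # hypothetical nodes module, for illustration only (not the repo's nodes.py)
    class ExampleNode:
        """A minimal ComfyUI-style node that upper-cases a string."""

        @classmethod
        def INPUT_TYPES(cls):
            return {"required": {"text": ("STRING", {"default": ""})}}

        RETURN_TYPES = ("STRING",)
        FUNCTION = "run"
        CATEGORY = "FluxTrainer/examples"

        def run(self, text):
            return (text.upper(),)

    NODE_CLASS_MAPPINGS = {"ExampleNode": ExampleNode}
    NODE_DISPLAY_NAME_MAPPINGS = {"ExampleNode": "Example Node"}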
__pycache__/train_network.cpython-310.pyc ADDED
Binary file (38.9 kB).
 
advanced.png ADDED

Git LFS Details

  • SHA256: 15077625eb185463cc0dd383157879fe3b73ebb7305a40f5ed2af14a49bca41d
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
app-launch.sh ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env bash
+
+ cd "`dirname "$0"`" || exit 1
+ . env/bin/activate
+ python app.py
app.py ADDED
@@ -0,0 +1,1119 @@
1
+ import os
2
+ import sys
3
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
4
+ os.environ['GRADIO_ANALYTICS_ENABLED'] = '0'
5
+ sys.path.insert(0, os.getcwd())
6
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'sd-scripts'))
7
+ import subprocess
8
+ import gradio as gr
9
+ from PIL import Image
10
+ import torch
11
+ import uuid
12
+ import shutil
13
+ import json
14
+ import yaml
15
+ from slugify import slugify
16
+ from transformers import AutoProcessor, AutoModelForCausalLM
17
+ from gradio_logsview import LogsView, LogsViewRunner
18
+ from huggingface_hub import hf_hub_download, HfApi
19
+ from library import flux_train_utils, huggingface_util
20
+ from argparse import Namespace
21
+ import train_network
22
+ import toml
23
+ import re
24
+ MAX_IMAGES = 150
25
+
26
+ with open('models.yaml', 'r') as file:
27
+ models = yaml.safe_load(file)
28
+
29
+ def readme(base_model, lora_name, instance_prompt, sample_prompts):
30
+
31
+ # model license
32
+ model_config = models[base_model]
33
+ model_file = model_config["file"]
34
+ base_model_name = model_config["base"]
35
+ license = None
36
+ license_name = None
37
+ license_link = None
38
+ license_items = []
39
+ if "license" in model_config:
40
+ license = model_config["license"]
41
+ license_items.append(f"license: {license}")
42
+ if "license_name" in model_config:
43
+ license_name = model_config["license_name"]
44
+ license_items.append(f"license_name: {license_name}")
45
+ if "license_link" in model_config:
46
+ license_link = model_config["license_link"]
47
+ license_items.append(f"license_link: {license_link}")
48
+ license_str = "\n".join(license_items)
49
+ print(f"license_items={license_items}")
50
+ print(f"license_str = {license_str}")
51
+
52
+ # tags
53
+ tags = [ "text-to-image", "flux", "lora", "diffusers", "template:sd-lora", "fluxgym" ]
54
+
55
+ # widgets
56
+ widgets = []
57
+ sample_image_paths = []
58
+ output_name = slugify(lora_name)
59
+ samples_dir = resolve_path_without_quotes(f"outputs/{output_name}/sample")
60
+ try:
61
+ for filename in os.listdir(samples_dir):
62
+ # Filename Schema: [name]_[steps]_[index]_[timestamp].png
63
+ match = re.search(r"_(\d+)_(\d+)_(\d+)\.png$", filename)
64
+ if match:
65
+ steps, index, timestamp = int(match.group(1)), int(match.group(2)), int(match.group(3))
66
+ sample_image_paths.append((steps, index, f"sample/{filename}"))
67
+
68
+ # Sort by numeric index
69
+ sample_image_paths.sort(key=lambda x: x[0], reverse=True)
70
+
71
+ final_sample_image_paths = sample_image_paths[:len(sample_prompts)]
72
+ final_sample_image_paths.sort(key=lambda x: x[1])
73
+ for i, prompt in enumerate(sample_prompts):
74
+ _, _, image_path = final_sample_image_paths[i]
75
+ widgets.append(
76
+ {
77
+ "text": prompt,
78
+ "output": {
79
+ "url": image_path
80
+ },
81
+ }
82
+ )
83
+ except:
84
+ print(f"no samples")
85
+ dtype = "torch.bfloat16"
86
+ # Construct the README content
87
+ readme_content = f"""---
88
+ tags:
89
+ {yaml.dump(tags, indent=4).strip()}
90
+ {"widget:" if os.path.isdir(samples_dir) else ""}
91
+ {yaml.dump(widgets, indent=4).strip() if widgets else ""}
92
+ base_model: {base_model_name}
93
+ {"instance_prompt: " + instance_prompt if instance_prompt else ""}
94
+ {license_str}
95
+ ---
96
+
97
+ # {lora_name}
98
+
99
+ A Flux LoRA trained on a local computer with [Fluxgym](https://github.com/cocktailpeanut/fluxgym)
100
+
101
+ <Gallery />
102
+
103
+ ## Trigger words
104
+
105
+ {"You should use `" + instance_prompt + "` to trigger the image generation." if instance_prompt else "No trigger words defined."}
106
+
107
+ ## Download model and use it with ComfyUI, AUTOMATIC1111, SD.Next, Invoke AI, Forge, etc.
108
+
109
+ Weights for this model are available in Safetensors format.
110
+
111
+ """
112
+ return readme_content
113
+
114
+ def account_hf():
115
+ try:
116
+ with open("HF_TOKEN", "r") as file:
117
+ token = file.read()
118
+ api = HfApi(token=token)
119
+ try:
120
+ account = api.whoami()
121
+ return { "token": token, "account": account['name'] }
122
+ except:
123
+ return None
124
+ except:
125
+ return None
126
+
127
+ """
128
+ hf_logout.click(fn=logout_hf, outputs=[hf_token, hf_login, hf_logout, repo_owner])
129
+ """
130
+ def logout_hf():
131
+ os.remove("HF_TOKEN")
132
+ global current_account
133
+ current_account = account_hf()
134
+ print(f"current_account={current_account}")
135
+ return gr.update(value=""), gr.update(visible=True), gr.update(visible=False), gr.update(value="", visible=False)
136
+
137
+
138
+ """
139
+ hf_login.click(fn=login_hf, inputs=[hf_token], outputs=[hf_token, hf_login, hf_logout, repo_owner])
140
+ """
141
+ def login_hf(hf_token):
142
+ api = HfApi(token=hf_token)
143
+ try:
144
+ account = api.whoami()
145
+ if account != None:
146
+ if "name" in account:
147
+ with open("HF_TOKEN", "w") as file:
148
+ file.write(hf_token)
149
+ global current_account
150
+ current_account = account_hf()
151
+ return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(value=current_account["account"], visible=True)
152
+ return gr.update(), gr.update(), gr.update(), gr.update()
153
+ except:
154
+ print(f"incorrect hf_token")
155
+ return gr.update(), gr.update(), gr.update(), gr.update()
156
+
157
+ def upload_hf(base_model, lora_rows, repo_owner, repo_name, repo_visibility, hf_token):
158
+ src = lora_rows
159
+ repo_id = f"{repo_owner}/{repo_name}"
160
+ gr.Info(f"Uploading to Huggingface. Please Stand by...", duration=None)
161
+ args = Namespace(
162
+ huggingface_repo_id=repo_id,
163
+ huggingface_repo_type="model",
164
+ huggingface_repo_visibility=repo_visibility,
165
+ huggingface_path_in_repo="",
166
+ huggingface_token=hf_token,
167
+ async_upload=False
168
+ )
169
+ print(f"upload_hf args={args}")
170
+ huggingface_util.upload(args=args, src=src)
171
+ gr.Info(f"[Upload Complete] https://huggingface.co/{repo_id}", duration=None)
172
+
173
+ def load_captioning(uploaded_files, concept_sentence):
174
+ uploaded_images = [file for file in uploaded_files if not file.endswith('.txt')]
175
+ txt_files = [file for file in uploaded_files if file.endswith('.txt')]
176
+ txt_files_dict = {os.path.splitext(os.path.basename(txt_file))[0]: txt_file for txt_file in txt_files}
177
+ updates = []
178
+ if len(uploaded_images) <= 1:
179
+ raise gr.Error(
180
+ "Please upload at least 2 images to train your model (the ideal number with default settings is between 4-30)"
181
+ )
182
+ elif len(uploaded_images) > MAX_IMAGES:
183
+ raise gr.Error(f"For now, only {MAX_IMAGES} or less images are allowed for training")
184
+ # Update for the captioning_area
185
+ # for _ in range(3):
186
+ updates.append(gr.update(visible=True))
187
+ # Update visibility and image for each captioning row and image
188
+ for i in range(1, MAX_IMAGES + 1):
189
+ # Determine if the current row and image should be visible
190
+ visible = i <= len(uploaded_images)
191
+
192
+ # Update visibility of the captioning row
193
+ updates.append(gr.update(visible=visible))
194
+
195
+ # Update for image component - display image if available, otherwise hide
196
+ image_value = uploaded_images[i - 1] if visible else None
197
+ updates.append(gr.update(value=image_value, visible=visible))
198
+
199
+ corresponding_caption = False
200
+ if(image_value):
201
+ base_name = os.path.splitext(os.path.basename(image_value))[0]
202
+ if base_name in txt_files_dict:
203
+ with open(txt_files_dict[base_name], 'r') as file:
204
+ corresponding_caption = file.read()
205
+
206
+ # Update value of captioning area
207
+ text_value = corresponding_caption if visible and corresponding_caption else concept_sentence if visible and concept_sentence else None
208
+ updates.append(gr.update(value=text_value, visible=visible))
209
+
210
+ # Update for the sample caption area
211
+ updates.append(gr.update(visible=True))
212
+ updates.append(gr.update(visible=True))
213
+
214
+ return updates
215
+
216
+ def hide_captioning():
217
+ return gr.update(visible=False), gr.update(visible=False)
218
+
219
+ def resize_image(image_path, output_path, size):
220
+ with Image.open(image_path) as img:
221
+ width, height = img.size
222
+ if width < height:
223
+ new_width = size
224
+ new_height = int((size/width) * height)
225
+ else:
226
+ new_height = size
227
+ new_width = int((size/height) * width)
228
+ print(f"resize {image_path} : {new_width}x{new_height}")
229
+ img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
230
+ img_resized.save(output_path)
231
+
232
+ def create_dataset(destination_folder, size, *inputs):
233
+ print("Creating dataset")
234
+ images = inputs[0]
235
+ if not os.path.exists(destination_folder):
236
+ os.makedirs(destination_folder)
237
+
238
+ for index, image in enumerate(images):
239
+ # copy the images to the datasets folder
240
+ new_image_path = shutil.copy(image, destination_folder)
241
+
242
+ # if it's a caption text file skip the next bit
243
+ ext = os.path.splitext(new_image_path)[-1].lower()
244
+ if ext == '.txt':
245
+ continue
246
+
247
+ # resize the images
248
+ resize_image(new_image_path, new_image_path, size)
249
+
250
+ # copy the captions
251
+
252
+ original_caption = inputs[index + 1]
253
+
254
+ image_file_name = os.path.basename(new_image_path)
255
+ caption_file_name = os.path.splitext(image_file_name)[0] + ".txt"
256
+ caption_path = resolve_path_without_quotes(os.path.join(destination_folder, caption_file_name))
257
+ print(f"image_path={new_image_path}, caption_path = {caption_path}, original_caption={original_caption}")
258
+ # if caption_path exists, do not write
259
+ if os.path.exists(caption_path):
260
+ print(f"{caption_path} already exists. use the existing .txt file")
261
+ else:
262
+ print(f"{caption_path} create a .txt caption file")
263
+ with open(caption_path, 'w') as file:
264
+ file.write(original_caption)
265
+
266
+ print(f"destination_folder {destination_folder}")
267
+ return destination_folder
268
+
269
+
270
+ def run_captioning(images, concept_sentence, *captions):
271
+ print(f"run_captioning")
272
+ print(f"concept sentence {concept_sentence}")
273
+ print(f"captions {captions}")
274
+ #Load internally to not consume resources for training
275
+ device = "cuda" if torch.cuda.is_available() else "cpu"
276
+ print(f"device={device}")
277
+ torch_dtype = torch.float16
278
+ model = AutoModelForCausalLM.from_pretrained(
279
+ "multimodalart/Florence-2-large-no-flash-attn", torch_dtype=torch_dtype, trust_remote_code=True
280
+ ).to(device)
281
+ processor = AutoProcessor.from_pretrained("multimodalart/Florence-2-large-no-flash-attn", trust_remote_code=True)
282
+
283
+ captions = list(captions)
284
+ for i, image_path in enumerate(images):
285
+ print(captions[i])
286
+ if isinstance(image_path, str): # If image is a file path
287
+ image = Image.open(image_path).convert("RGB")
288
+
289
+ prompt = "<DETAILED_CAPTION>"
290
+ inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
291
+ print(f"inputs {inputs}")
292
+
293
+ generated_ids = model.generate(
294
+ input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3
295
+ )
296
+ print(f"generated_ids {generated_ids}")
297
+
298
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
299
+ print(f"generated_text: {generated_text}")
300
+ parsed_answer = processor.post_process_generation(
301
+ generated_text, task=prompt, image_size=(image.width, image.height)
302
+ )
303
+ print(f"parsed_answer = {parsed_answer}")
304
+ caption_text = parsed_answer["<DETAILED_CAPTION>"].replace("The image shows ", "")
305
+ print(f"caption_text = {caption_text}, concept_sentence={concept_sentence}")
306
+ if concept_sentence:
307
+ caption_text = f"{concept_sentence} {caption_text}"
308
+ captions[i] = caption_text
309
+
310
+ yield captions
311
+ model.to("cpu")
312
+ del model
313
+ del processor
314
+ if torch.cuda.is_available():
315
+ torch.cuda.empty_cache()
316
+
317
+ def recursive_update(d, u):
318
+ for k, v in u.items():
319
+ if isinstance(v, dict) and v:
320
+ d[k] = recursive_update(d.get(k, {}), v)
321
+ else:
322
+ d[k] = v
323
+ return d
324
+
325
+ def download(base_model):
326
+ model = models[base_model]
327
+ model_file = model["file"]
328
+ repo = model["repo"]
329
+
330
+ # download unet
331
+ if base_model == "flux-dev" or base_model == "flux-schnell":
332
+ unet_folder = "models/unet"
333
+ else:
334
+ unet_folder = f"models/unet/{repo}"
335
+ unet_path = os.path.join(unet_folder, model_file)
336
+ if not os.path.exists(unet_path):
337
+ os.makedirs(unet_folder, exist_ok=True)
338
+ gr.Info(f"Downloading base model: {base_model}. Please wait. (You can check the terminal for the download progress)", duration=None)
339
+ print(f"download {base_model}")
340
+ hf_hub_download(repo_id=repo, local_dir=unet_folder, filename=model_file)
341
+
342
+ # download vae
343
+ vae_folder = "models/vae"
344
+ vae_path = os.path.join(vae_folder, "ae.sft")
345
+ if not os.path.exists(vae_path):
346
+ os.makedirs(vae_folder, exist_ok=True)
347
+ gr.Info(f"Downloading vae")
348
+ print(f"downloading ae.sft...")
349
+ hf_hub_download(repo_id="cocktailpeanut/xulf-dev", local_dir=vae_folder, filename="ae.sft")
350
+
351
+ # download clip
352
+ clip_folder = "models/clip"
353
+ clip_l_path = os.path.join(clip_folder, "clip_l.safetensors")
354
+ if not os.path.exists(clip_l_path):
355
+ os.makedirs(clip_folder, exist_ok=True)
356
+ gr.Info(f"Downloading clip...")
357
+ print(f"download clip_l.safetensors")
358
+ hf_hub_download(repo_id="comfyanonymous/flux_text_encoders", local_dir=clip_folder, filename="clip_l.safetensors")
359
+
360
+ # download t5xxl
361
+ t5xxl_path = os.path.join(clip_folder, "t5xxl_fp16.safetensors")
362
+ if not os.path.exists(t5xxl_path):
363
+ print(f"download t5xxl_fp16.safetensors")
364
+ gr.Info(f"Downloading t5xxl...")
365
+ hf_hub_download(repo_id="comfyanonymous/flux_text_encoders", local_dir=clip_folder, filename="t5xxl_fp16.safetensors")
366
+
367
+
368
+ def resolve_path(p):
369
+ current_dir = os.path.dirname(os.path.abspath(__file__))
370
+ norm_path = os.path.normpath(os.path.join(current_dir, p))
371
+ return f"\"{norm_path}\""
372
+ def resolve_path_without_quotes(p):
373
+ current_dir = os.path.dirname(os.path.abspath(__file__))
374
+ norm_path = os.path.normpath(os.path.join(current_dir, p))
375
+ return norm_path
376
+
377
+ def gen_sh(
378
+ base_model,
379
+ output_name,
380
+ resolution,
381
+ seed,
382
+ workers,
383
+ learning_rate,
384
+ network_dim,
385
+ max_train_epochs,
386
+ save_every_n_epochs,
387
+ timestep_sampling,
388
+ guidance_scale,
389
+ vram,
390
+ sample_prompts,
391
+ sample_every_n_steps,
392
+ *advanced_components
393
+ ):
394
+
395
+ print(f"gen_sh: network_dim:{network_dim}, max_train_epochs={max_train_epochs}, save_every_n_epochs={save_every_n_epochs}, timestep_sampling={timestep_sampling}, guidance_scale={guidance_scale}, vram={vram}, sample_prompts={sample_prompts}, sample_every_n_steps={sample_every_n_steps}")
396
+
397
+ output_dir = resolve_path(f"outputs/{output_name}")
398
+ sample_prompts_path = resolve_path(f"outputs/{output_name}/sample_prompts.txt")
399
+
400
+ line_break = "\\"
401
+ file_type = "sh"
402
+ if sys.platform == "win32":
403
+ line_break = "^"
404
+ file_type = "bat"
405
+
406
+ ############# Sample args ########################
407
+ sample = ""
408
+ if len(sample_prompts) > 0 and sample_every_n_steps > 0:
409
+ sample = f"""--sample_prompts={sample_prompts_path} --sample_every_n_steps="{sample_every_n_steps}" {line_break}"""
410
+
411
+
412
+ ############# Optimizer args ########################
413
+ # if vram == "8G":
414
+ # optimizer = f"""--optimizer_type adafactor {line_break}
415
+ # --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" {line_break}
416
+ # --split_mode {line_break}
417
+ # --network_args "train_blocks=single" {line_break}
418
+ # --lr_scheduler constant_with_warmup {line_break}
419
+ # --max_grad_norm 0.0 {line_break}"""
420
+ if vram == "16G":
421
+ # 16G VRAM
422
+ optimizer = f"""--optimizer_type adafactor {line_break}
423
+ --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" {line_break}
424
+ --lr_scheduler constant_with_warmup {line_break}
425
+ --max_grad_norm 0.0 {line_break}"""
426
+ elif vram == "12G":
427
+ # 12G VRAM
428
+ optimizer = f"""--optimizer_type adafactor {line_break}
429
+ --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" {line_break}
430
+ --split_mode {line_break}
431
+ --network_args "train_blocks=single" {line_break}
432
+ --lr_scheduler constant_with_warmup {line_break}
433
+ --max_grad_norm 0.0 {line_break}"""
434
+ else:
435
+ # 20G+ VRAM
436
+ optimizer = f"--optimizer_type adamw8bit {line_break}"
437
+
438
+
439
+ #######################################################
440
+ model_config = models[base_model]
441
+ model_file = model_config["file"]
442
+ repo = model_config["repo"]
443
+ if base_model == "flux-dev" or base_model == "flux-schnell":
444
+ model_folder = "models/unet"
445
+ else:
446
+ model_folder = f"models/unet/{repo}"
447
+ model_path = os.path.join(model_folder, model_file)
448
+ pretrained_model_path = resolve_path(model_path)
449
+
450
+ clip_path = resolve_path("models/clip/clip_l.safetensors")
451
+ t5_path = resolve_path("models/clip/t5xxl_fp16.safetensors")
452
+ ae_path = resolve_path("models/vae/ae.sft")
453
+ sh = f"""accelerate launch {line_break}
454
+ --mixed_precision bf16 {line_break}
455
+ --num_cpu_threads_per_process 1 {line_break}
456
+ sd-scripts/flux_train_network.py {line_break}
457
+ --pretrained_model_name_or_path {pretrained_model_path} {line_break}
458
+ --clip_l {clip_path} {line_break}
459
+ --t5xxl {t5_path} {line_break}
460
+ --ae {ae_path} {line_break}
461
+ --cache_latents_to_disk {line_break}
462
+ --save_model_as safetensors {line_break}
463
+ --sdpa --persistent_data_loader_workers {line_break}
464
+ --max_data_loader_n_workers {workers} {line_break}
465
+ --seed {seed} {line_break}
466
+ --gradient_checkpointing {line_break}
467
+ --mixed_precision bf16 {line_break}
468
+ --save_precision bf16 {line_break}
469
+ --network_module networks.lora_flux {line_break}
470
+ --network_dim {network_dim} {line_break}
471
+ {optimizer}{sample}
472
+ --learning_rate {learning_rate} {line_break}
473
+ --cache_text_encoder_outputs {line_break}
474
+ --cache_text_encoder_outputs_to_disk {line_break}
475
+ --fp8_base {line_break}
476
+ --highvram {line_break}
477
+ --max_train_epochs {max_train_epochs} {line_break}
478
+ --save_every_n_epochs {save_every_n_epochs} {line_break}
479
+ --dataset_config {resolve_path(f"outputs/{output_name}/dataset.toml")} {line_break}
480
+ --output_dir {output_dir} {line_break}
481
+ --output_name {output_name} {line_break}
482
+ --timestep_sampling {timestep_sampling} {line_break}
483
+ --discrete_flow_shift 3.1582 {line_break}
484
+ --model_prediction_type raw {line_break}
485
+ --guidance_scale {guidance_scale} {line_break}
486
+ --loss_type l2 {line_break}"""
487
+
488
+
489
+
490
+ ############# Advanced args ########################
491
+ global advanced_component_ids
492
+ global original_advanced_component_values
493
+
494
+ # check dirty
495
+ print(f"original_advanced_component_values = {original_advanced_component_values}")
496
+ advanced_flags = []
497
+ for i, current_value in enumerate(advanced_components):
498
+ # print(f"compare {advanced_component_ids[i]}: old={original_advanced_component_values[i]}, new={current_value}")
499
+ if original_advanced_component_values[i] != current_value:
500
+ # dirty
501
+ if current_value == True:
502
+ # Boolean
503
+ advanced_flags.append(advanced_component_ids[i])
504
+ else:
505
+ # string
506
+ advanced_flags.append(f"{advanced_component_ids[i]} {current_value}")
507
+
508
+ if len(advanced_flags) > 0:
509
+ advanced_flags_str = f" {line_break}\n ".join(advanced_flags)
510
+ sh = sh + "\n " + advanced_flags_str
511
+
512
+ return sh
513
+
514
+ def gen_toml(
515
+ dataset_folder,
516
+ resolution,
517
+ class_tokens,
518
+ num_repeats
519
+ ):
520
+ toml = f"""[general]
521
+ shuffle_caption = false
522
+ caption_extension = '.txt'
523
+ keep_tokens = 1
524
+
525
+ [[datasets]]
526
+ resolution = {resolution}
527
+ batch_size = 1
528
+ keep_tokens = 1
529
+
530
+ [[datasets.subsets]]
531
+ image_dir = '{resolve_path_without_quotes(dataset_folder)}'
532
+ class_tokens = '{class_tokens}'
533
+ num_repeats = {num_repeats}"""
534
+ return toml
535
+
536
+ def update_total_steps(max_train_epochs, num_repeats, images):
537
+ try:
538
+ num_images = len(images)
539
+ total_steps = max_train_epochs * num_images * num_repeats
540
+ print(f"max_train_epochs={max_train_epochs} num_images={num_images}, num_repeats={num_repeats}, total_steps={total_steps}")
541
+ return gr.update(value = total_steps)
542
+ except:
543
+ print("")
544
+
545
+ def set_repo(lora_rows):
546
+ selected_name = os.path.basename(lora_rows)
547
+ return gr.update(value=selected_name)
548
+
549
+ def get_loras():
550
+ try:
551
+ outputs_path = resolve_path_without_quotes(f"outputs")
552
+ files = os.listdir(outputs_path)
553
+ folders = [os.path.join(outputs_path, item) for item in files if os.path.isdir(os.path.join(outputs_path, item)) and item != "sample"]
554
+ folders.sort(key=lambda file: os.path.getctime(file), reverse=True)
555
+ return folders
556
+ except Exception as e:
557
+ return []
558
+
559
+ def get_samples(lora_name):
560
+ output_name = slugify(lora_name)
561
+ try:
562
+ samples_path = resolve_path_without_quotes(f"outputs/{output_name}/sample")
563
+ files = [os.path.join(samples_path, file) for file in os.listdir(samples_path)]
564
+ files.sort(key=lambda file: os.path.getctime(file), reverse=True)
565
+ return files
566
+ except:
567
+ return []
568
+
569
+ def start_training(
570
+ base_model,
571
+ lora_name,
572
+ train_script,
573
+ train_config,
574
+ sample_prompts,
575
+ ):
576
+ # write custom script and toml
577
+ if not os.path.exists("models"):
578
+ os.makedirs("models", exist_ok=True)
579
+ if not os.path.exists("outputs"):
580
+ os.makedirs("outputs", exist_ok=True)
581
+ output_name = slugify(lora_name)
582
+ output_dir = resolve_path_without_quotes(f"outputs/{output_name}")
583
+ if not os.path.exists(output_dir):
584
+ os.makedirs(output_dir, exist_ok=True)
585
+
586
+ download(base_model)
587
+
588
+ file_type = "sh"
589
+ if sys.platform == "win32":
590
+ file_type = "bat"
591
+
592
+ sh_filename = f"train.{file_type}"
593
+ sh_filepath = resolve_path_without_quotes(f"outputs/{output_name}/{sh_filename}")
594
+ with open(sh_filepath, 'w', encoding="utf-8") as file:
595
+ file.write(train_script)
596
+ gr.Info(f"Generated train script at {sh_filename}")
597
+
598
+
599
+ dataset_path = resolve_path_without_quotes(f"outputs/{output_name}/dataset.toml")
600
+ with open(dataset_path, 'w', encoding="utf-8") as file:
601
+ file.write(train_config)
602
+ gr.Info(f"Generated dataset.toml")
603
+
604
+ sample_prompts_path = resolve_path_without_quotes(f"outputs/{output_name}/sample_prompts.txt")
605
+ with open(sample_prompts_path, 'w', encoding='utf-8') as file:
606
+ file.write(sample_prompts)
607
+ gr.Info(f"Generated sample_prompts.txt")
608
+
609
+ # Train
610
+ if sys.platform == "win32":
611
+ command = sh_filepath
612
+ else:
613
+ command = f"bash \"{sh_filepath}\""
614
+
615
+ # Use Popen to run the command and capture output in real-time
616
+ env = os.environ.copy()
617
+ env['PYTHONIOENCODING'] = 'utf-8'
618
+ env['LOG_LEVEL'] = 'DEBUG'
619
+ runner = LogsViewRunner()
620
+ cwd = os.path.dirname(os.path.abspath(__file__))
621
+ gr.Info(f"Started training")
622
+ yield from runner.run_command([command], cwd=cwd)
623
+ yield runner.log(f"Runner: {runner}")
624
+
625
+ # Generate Readme
626
+ config = toml.loads(train_config)
627
+ concept_sentence = config['datasets'][0]['subsets'][0]['class_tokens']
628
+ print(f"concept_sentence={concept_sentence}")
629
+ print(f"lora_name {lora_name}, concept_sentence={concept_sentence}, output_name={output_name}")
630
+ sample_prompts_path = resolve_path_without_quotes(f"outputs/{output_name}/sample_prompts.txt")
631
+ with open(sample_prompts_path, "r", encoding="utf-8") as f:
632
+ lines = f.readlines()
633
+ sample_prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"]
634
+ md = readme(base_model, lora_name, concept_sentence, sample_prompts)
635
+ readme_path = resolve_path_without_quotes(f"outputs/{output_name}/README.md")
636
+ with open(readme_path, "w", encoding="utf-8") as f:
637
+ f.write(md)
638
+
639
+ gr.Info(f"Training Complete. Check the outputs folder for the LoRA files.", duration=None)
640
+
641
+
642
+ def update(
643
+ base_model,
644
+ lora_name,
645
+ resolution,
646
+ seed,
647
+ workers,
648
+ class_tokens,
649
+ learning_rate,
650
+ network_dim,
651
+ max_train_epochs,
652
+ save_every_n_epochs,
653
+ timestep_sampling,
654
+ guidance_scale,
655
+ vram,
656
+ num_repeats,
657
+ sample_prompts,
658
+ sample_every_n_steps,
659
+ *advanced_components,
660
+ ):
661
+ output_name = slugify(lora_name)
662
+ dataset_folder = str(f"datasets/{output_name}")
663
+ sh = gen_sh(
664
+ base_model,
665
+ output_name,
666
+ resolution,
667
+ seed,
668
+ workers,
669
+ learning_rate,
670
+ network_dim,
671
+ max_train_epochs,
672
+ save_every_n_epochs,
673
+ timestep_sampling,
674
+ guidance_scale,
675
+ vram,
676
+ sample_prompts,
677
+ sample_every_n_steps,
678
+ *advanced_components,
679
+ )
680
+ toml = gen_toml(
681
+ dataset_folder,
682
+ resolution,
683
+ class_tokens,
684
+ num_repeats
685
+ )
686
+ return gr.update(value=sh), gr.update(value=toml), dataset_folder
687
+
688
+ """
689
+ demo.load(fn=loaded, js=js, outputs=[hf_token, hf_login, hf_logout, hf_account])
690
+ """
691
+ def loaded():
692
+ global current_account
693
+ current_account = account_hf()
694
+ print(f"current_account={current_account}")
695
+ if current_account != None:
696
+ return gr.update(value=current_account["token"]), gr.update(visible=False), gr.update(visible=True), gr.update(value=current_account["account"], visible=True)
697
+ else:
698
+ return gr.update(value=""), gr.update(visible=True), gr.update(visible=False), gr.update(value="", visible=False)
699
+
700
+ def update_sample(concept_sentence):
701
+ return gr.update(value=concept_sentence)
702
+
703
+ def refresh_publish_tab():
704
+ loras = get_loras()
705
+ return gr.Dropdown(label="Trained LoRAs", choices=loras)
706
+
707
+ def init_advanced():
708
+ # if basic_args
709
+ basic_args = {
710
+ 'pretrained_model_name_or_path',
711
+ 'clip_l',
712
+ 't5xxl',
713
+ 'ae',
714
+ 'cache_latents_to_disk',
715
+ 'save_model_as',
716
+ 'sdpa',
717
+ 'persistent_data_loader_workers',
718
+ 'max_data_loader_n_workers',
719
+ 'seed',
720
+ 'gradient_checkpointing',
721
+ 'mixed_precision',
722
+ 'save_precision',
723
+ 'network_module',
724
+ 'network_dim',
725
+ 'learning_rate',
726
+ 'cache_text_encoder_outputs',
727
+ 'cache_text_encoder_outputs_to_disk',
728
+ 'fp8_base',
729
+ 'highvram',
730
+ 'max_train_epochs',
731
+ 'save_every_n_epochs',
732
+ 'dataset_config',
733
+ 'output_dir',
734
+ 'output_name',
735
+ 'timestep_sampling',
736
+ 'discrete_flow_shift',
737
+ 'model_prediction_type',
738
+ 'guidance_scale',
739
+ 'loss_type',
740
+ 'optimizer_type',
741
+ 'optimizer_args',
742
+ 'lr_scheduler',
743
+ 'sample_prompts',
744
+ 'sample_every_n_steps',
745
+ 'max_grad_norm',
746
+ 'split_mode',
747
+ 'network_args'
748
+ }
749
+
750
+ # generate a UI config
751
+ # if not in basic_args, create a simple form
752
+ parser = train_network.setup_parser()
753
+ flux_train_utils.add_flux_train_arguments(parser)
754
+ args_info = {}
755
+ for action in parser._actions:
756
+ if action.dest != 'help': # Skip the default help argument
757
+ # if the dest is included in basic_args
758
+ args_info[action.dest] = {
759
+ "action": action.option_strings, # Option strings like '--use_8bit_adam'
760
+ "type": action.type, # Type of the argument
761
+ "help": action.help, # Help message
762
+ "default": action.default, # Default value, if any
763
+ "required": action.required # Whether the argument is required
764
+ }
765
+ temp = []
766
+ for key in args_info:
767
+ temp.append({ 'key': key, 'action': args_info[key] })
768
+ temp.sort(key=lambda x: x['key'])
769
+ advanced_component_ids = []
770
+ advanced_components = []
771
+ for item in temp:
772
+ key = item['key']
773
+ action = item['action']
774
+ if key in basic_args:
775
+ print("")
776
+ else:
777
+ action_type = str(action['type'])
778
+ component = None
779
+ with gr.Column(min_width=300):
780
+ if action_type == "None":
781
+ # radio
782
+ component = gr.Checkbox()
783
+ # elif action_type == "<class 'str'>":
784
+ # component = gr.Textbox()
785
+ # elif action_type == "<class 'int'>":
786
+ # component = gr.Number(precision=0)
787
+ # elif action_type == "<class 'float'>":
788
+ # component = gr.Number()
789
+ # elif "int_or_float" in action_type:
790
+ # component = gr.Number()
791
+ else:
792
+ component = gr.Textbox(value="")
793
+ if component != None:
794
+ component.interactive = True
795
+ component.elem_id = action['action'][0]
796
+ component.label = component.elem_id
797
+ component.elem_classes = ["advanced"]
798
+ if action['help'] != None:
799
+ component.info = action['help']
800
+ advanced_components.append(component)
801
+ advanced_component_ids.append(component.elem_id)
802
+ return advanced_components, advanced_component_ids
803
+
804
+
805
+ theme = gr.themes.Monochrome(
806
+ text_size=gr.themes.Size(lg="18px", md="15px", sm="13px", xl="22px", xs="12px", xxl="24px", xxs="9px"),
807
+ font=[gr.themes.GoogleFont("Source Sans Pro"), "ui-sans-serif", "system-ui", "sans-serif"],
808
+ )
809
+ css = """
810
+ @keyframes rotate {
811
+ 0% {
812
+ transform: rotate(0deg);
813
+ }
814
+ 100% {
815
+ transform: rotate(360deg);
816
+ }
817
+ }
818
+ #advanced_options .advanced:nth-child(even) { background: rgba(0,0,100,0.04) !important; }
819
+ h1{font-family: georgia; font-style: italic; font-weight: bold; font-size: 30px; letter-spacing: -1px;}
820
+ h3{margin-top: 0}
821
+ .tabitem{border: 0px}
822
+ .group_padding{}
823
+ nav{position: fixed; top: 0; left: 0; right: 0; z-index: 1000; text-align: center; padding: 10px; box-sizing: border-box; display: flex; align-items: center; backdrop-filter: blur(10px); }
824
+ nav button { background: none; color: firebrick; font-weight: bold; border: 2px solid firebrick; padding: 5px 10px; border-radius: 5px; font-size: 14px; }
825
+ nav img { height: 40px; width: 40px; border-radius: 40px; }
826
+ nav img.rotate { animation: rotate 2s linear infinite; }
827
+ .flexible { flex-grow: 1; }
828
+ .tast-details { margin: 10px 0 !important; }
829
+ .toast-wrap { bottom: var(--size-4) !important; top: auto !important; border: none !important; backdrop-filter: blur(10px); }
830
+ .toast-title, .toast-text, .toast-icon, .toast-close { color: black !important; font-size: 14px; }
831
+ .toast-body { border: none !important; }
832
+ #terminal { box-shadow: none !important; margin-bottom: 25px; background: rgba(0,0,0,0.03); }
833
+ #terminal .generating { border: none !important; }
834
+ #terminal label { position: absolute !important; }
835
+ .tabs { margin-top: 50px; }
836
+ .hidden { display: none !important; }
837
+ .codemirror-wrapper .cm-line { font-size: 12px !important; }
838
+ label { font-weight: bold !important; }
839
+ #start_training.clicked { background: silver; color: black; }
840
+ """
841
+
842
+ js = """
843
+ function() {
844
+ let autoscroll = document.querySelector("#autoscroll")
845
+ if (window.iidxx) {
846
+ window.clearInterval(window.iidxx);
847
+ }
848
+ window.iidxx = window.setInterval(function() {
849
+ let text=document.querySelector(".codemirror-wrapper .cm-line").innerText.trim()
850
+ let img = document.querySelector("#logo")
851
+ if (text.length > 0) {
852
+ autoscroll.classList.remove("hidden")
853
+ if (autoscroll.classList.contains("on")) {
854
+ autoscroll.textContent = "Autoscroll ON"
855
+ window.scrollTo(0, document.body.scrollHeight, { behavior: "smooth" });
856
+ img.classList.add("rotate")
857
+ } else {
858
+ autoscroll.textContent = "Autoscroll OFF"
859
+ img.classList.remove("rotate")
860
+ }
861
+ }
862
+ }, 500);
863
+ console.log("autoscroll", autoscroll)
864
+ autoscroll.addEventListener("click", (e) => {
865
+ autoscroll.classList.toggle("on")
866
+ })
867
+ function debounce(fn, delay) {
868
+ let timeoutId;
869
+ return function(...args) {
870
+ clearTimeout(timeoutId);
871
+ timeoutId = setTimeout(() => fn(...args), delay);
872
+ };
873
+ }
874
+
875
+ function handleClick() {
876
+ console.log("refresh")
877
+ document.querySelector("#refresh").click();
878
+ }
879
+ const debouncedClick = debounce(handleClick, 1000);
880
+ document.addEventListener("input", debouncedClick);
881
+
882
+ document.querySelector("#start_training").addEventListener("click", (e) => {
883
+ e.target.classList.add("clicked")
884
+ e.target.innerHTML = "Training..."
885
+ })
886
+
887
+ }
888
+ """
889
+
890
+ current_account = account_hf()
891
+ print(f"current_account={current_account}")
892
+
893
+ with gr.Blocks(elem_id="app", theme=theme, css=css, fill_width=True) as demo:
894
+ with gr.Tabs() as tabs:
895
+ with gr.TabItem("Gym"):
896
+ output_components = []
897
+ with gr.Row():
898
+ gr.HTML("""<nav>
899
+ <img id='logo' src='/file=icon.png' width='80' height='80'>
900
+ <div class='flexible'></div>
901
+ <button id='autoscroll' class='on hidden'></button>
902
+ </nav>
903
+ """)
904
+ with gr.Row(elem_id='container'):
905
+ with gr.Column():
906
+ gr.Markdown(
907
+ """# Step 1. LoRA Info
908
+ <p style="margin-top:0">Configure your LoRA train settings.</p>
909
+ """, elem_classes="group_padding")
910
+ lora_name = gr.Textbox(
911
+ label="The name of your LoRA",
912
+ info="This has to be a unique name",
913
+ placeholder="e.g.: Persian Miniature Painting style, Cat Toy",
914
+ )
915
+ concept_sentence = gr.Textbox(
916
+ elem_id="--concept_sentence",
917
+ label="Trigger word/sentence",
918
+ info="Trigger word or sentence to be used",
919
+ placeholder="uncommon word like p3rs0n or trtcrd, or sentence like 'in the style of CNSTLL'",
920
+ interactive=True,
921
+ )
922
+ model_names = list(models.keys())
923
+ print(f"model_names={model_names}")
924
+ base_model = gr.Dropdown(label="Base model (edit the models.yaml file to add more to this list)", choices=model_names, value=model_names[0])
925
+ vram = gr.Radio(["20G", "16G", "12G" ], value="20G", label="VRAM", interactive=True)
926
+ num_repeats = gr.Number(value=10, precision=0, label="Repeat trains per image", interactive=True)
927
+ max_train_epochs = gr.Number(label="Max Train Epochs", value=16, interactive=True)
928
+ total_steps = gr.Number(0, interactive=False, label="Expected training steps")
929
+ sample_prompts = gr.Textbox("", lines=5, label="Sample Image Prompts (Separate with new lines)", interactive=True)
930
+ sample_every_n_steps = gr.Number(0, precision=0, label="Sample Image Every N Steps", interactive=True)
931
+ resolution = gr.Number(value=512, precision=0, label="Resize dataset images")
932
+ with gr.Column():
933
+ gr.Markdown(
934
+ """# Step 2. Dataset
935
+ <p style="margin-top:0">Make sure the captions include the trigger word.</p>
936
+ """, elem_classes="group_padding")
937
+ with gr.Group():
938
+ images = gr.File(
939
+ file_types=["image", ".txt"],
940
+ label="Upload your images",
941
+ #info="If you want, you can also manually upload caption files that match the image names (example: img0.png => img0.txt)",
942
+ file_count="multiple",
943
+ interactive=True,
944
+ visible=True,
945
+ scale=1,
946
+ )
947
+ with gr.Group(visible=False) as captioning_area:
948
+ do_captioning = gr.Button("Add AI captions with Florence-2")
949
+ output_components.append(captioning_area)
950
+ #output_components = [captioning_area]
951
+ caption_list = []
952
+ for i in range(1, MAX_IMAGES + 1):
953
+ locals()[f"captioning_row_{i}"] = gr.Row(visible=False)
954
+ with locals()[f"captioning_row_{i}"]:
955
+ locals()[f"image_{i}"] = gr.Image(
956
+ type="filepath",
957
+ width=111,
958
+ height=111,
959
+ min_width=111,
960
+ interactive=False,
961
+ scale=2,
962
+ show_label=False,
963
+ show_share_button=False,
964
+ show_download_button=False,
965
+ )
966
+ locals()[f"caption_{i}"] = gr.Textbox(
967
+ label=f"Caption {i}", scale=15, interactive=True
968
+ )
969
+
970
+ output_components.append(locals()[f"captioning_row_{i}"])
971
+ output_components.append(locals()[f"image_{i}"])
972
+ output_components.append(locals()[f"caption_{i}"])
973
+ caption_list.append(locals()[f"caption_{i}"])
974
+ with gr.Column():
975
+ gr.Markdown(
976
+ """# Step 3. Train
977
+ <p style="margin-top:0">Press start to start training.</p>
978
+ """, elem_classes="group_padding")
979
+ refresh = gr.Button("Refresh", elem_id="refresh", visible=False)
980
+ start = gr.Button("Start training", visible=False, elem_id="start_training")
981
+ output_components.append(start)
982
+ train_script = gr.Textbox(label="Train script", max_lines=100, interactive=True)
983
+ train_config = gr.Textbox(label="Train config", max_lines=100, interactive=True)
984
+ with gr.Accordion("Advanced options", elem_id='advanced_options', open=False):
985
+ with gr.Row():
986
+ with gr.Column(min_width=300):
987
+ seed = gr.Number(label="--seed", info="Seed", value=42, interactive=True)
988
+ with gr.Column(min_width=300):
989
+ workers = gr.Number(label="--max_data_loader_n_workers", info="Number of Workers", value=2, interactive=True)
990
+ with gr.Column(min_width=300):
991
+ learning_rate = gr.Textbox(label="--learning_rate", info="Learning Rate", value="8e-4", interactive=True)
992
+ with gr.Column(min_width=300):
993
+ save_every_n_epochs = gr.Number(label="--save_every_n_epochs", info="Save every N epochs", value=4, interactive=True)
994
+ with gr.Column(min_width=300):
995
+ guidance_scale = gr.Number(label="--guidance_scale", info="Guidance Scale", value=1.0, interactive=True)
996
+ with gr.Column(min_width=300):
997
+ timestep_sampling = gr.Textbox(label="--timestep_sampling", info="Timestep Sampling", value="shift", interactive=True)
998
+ with gr.Column(min_width=300):
999
+ network_dim = gr.Number(label="--network_dim", info="LoRA Rank", value=4, minimum=4, maximum=128, step=4, interactive=True)
1000
+ advanced_components, advanced_component_ids = init_advanced()
1001
+ with gr.Row():
1002
+ terminal = LogsView(label="Train log", elem_id="terminal")
1003
+ with gr.Row():
1004
+ gallery = gr.Gallery(get_samples, inputs=[lora_name], label="Samples", every=10, columns=6)
1005
+
1006
+ with gr.TabItem("Publish") as publish_tab:
1007
+ hf_token = gr.Textbox(label="Huggingface Token")
1008
+ hf_login = gr.Button("Login")
1009
+ hf_logout = gr.Button("Logout")
1010
+ with gr.Row() as row:
1011
+ gr.Markdown("**LoRA**")
1012
+ gr.Markdown("**Upload**")
1013
+ loras = get_loras()
1014
+ with gr.Row():
1015
+ lora_rows = refresh_publish_tab()
1016
+ with gr.Column():
1017
+ with gr.Row():
1018
+ repo_owner = gr.Textbox(label="Account", interactive=False)
1019
+ repo_name = gr.Textbox(label="Repository Name")
1020
+ repo_visibility = gr.Textbox(label="Repository Visibility ('public' or 'private')", value="public")
1021
+ upload_button = gr.Button("Upload to HuggingFace")
1022
+ upload_button.click(
1023
+ fn=upload_hf,
1024
+ inputs=[
1025
+ base_model,
1026
+ lora_rows,
1027
+ repo_owner,
1028
+ repo_name,
1029
+ repo_visibility,
1030
+ hf_token,
1031
+ ]
1032
+ )
1033
+ hf_login.click(fn=login_hf, inputs=[hf_token], outputs=[hf_token, hf_login, hf_logout, repo_owner])
1034
+ hf_logout.click(fn=logout_hf, outputs=[hf_token, hf_login, hf_logout, repo_owner])
1035
+
1036
+
1037
+ publish_tab.select(refresh_publish_tab, outputs=lora_rows)
1038
+ lora_rows.select(fn=set_repo, inputs=[lora_rows], outputs=[repo_name])
1039
+
1040
+ dataset_folder = gr.State()
1041
+
1042
+ listeners = [
1043
+ base_model,
1044
+ lora_name,
1045
+ resolution,
1046
+ seed,
1047
+ workers,
1048
+ concept_sentence,
1049
+ learning_rate,
1050
+ network_dim,
1051
+ max_train_epochs,
1052
+ save_every_n_epochs,
1053
+ timestep_sampling,
1054
+ guidance_scale,
1055
+ vram,
1056
+ num_repeats,
1057
+ sample_prompts,
1058
+ sample_every_n_steps,
1059
+ *advanced_components
1060
+ ]
1061
+ advanced_component_ids = [x.elem_id for x in advanced_components]
1062
+ original_advanced_component_values = [comp.value for comp in advanced_components]
1063
+ images.upload(
1064
+ load_captioning,
1065
+ inputs=[images, concept_sentence],
1066
+ outputs=output_components
1067
+ )
1068
+ images.delete(
1069
+ load_captioning,
1070
+ inputs=[images, concept_sentence],
1071
+ outputs=output_components
1072
+ )
1073
+ images.clear(
1074
+ hide_captioning,
1075
+ outputs=[captioning_area, start]
1076
+ )
1077
+ max_train_epochs.change(
1078
+ fn=update_total_steps,
1079
+ inputs=[max_train_epochs, num_repeats, images],
1080
+ outputs=[total_steps]
1081
+ )
1082
+ num_repeats.change(
1083
+ fn=update_total_steps,
1084
+ inputs=[max_train_epochs, num_repeats, images],
1085
+ outputs=[total_steps]
1086
+ )
1087
+ images.upload(
1088
+ fn=update_total_steps,
1089
+ inputs=[max_train_epochs, num_repeats, images],
1090
+ outputs=[total_steps]
1091
+ )
1092
+ images.delete(
1093
+ fn=update_total_steps,
1094
+ inputs=[max_train_epochs, num_repeats, images],
1095
+ outputs=[total_steps]
1096
+ )
1097
+ images.clear(
1098
+ fn=update_total_steps,
1099
+ inputs=[max_train_epochs, num_repeats, images],
1100
+ outputs=[total_steps]
1101
+ )
1102
+ concept_sentence.change(fn=update_sample, inputs=[concept_sentence], outputs=sample_prompts)
1103
+ start.click(fn=create_dataset, inputs=[dataset_folder, resolution, images] + caption_list, outputs=dataset_folder).then(
1104
+ fn=start_training,
1105
+ inputs=[
1106
+ base_model,
1107
+ lora_name,
1108
+ train_script,
1109
+ train_config,
1110
+ sample_prompts,
1111
+ ],
1112
+ outputs=terminal,
1113
+ )
1114
+ do_captioning.click(fn=run_captioning, inputs=[images, concept_sentence] + caption_list, outputs=caption_list)
1115
+ demo.load(fn=loaded, js=js, outputs=[hf_token, hf_login, hf_logout, repo_owner])
1116
+ refresh.click(update, inputs=listeners, outputs=[train_script, train_config, dataset_folder])
1117
+ if __name__ == "__main__":
1118
+ cwd = os.path.dirname(os.path.abspath(__file__))
1119
+ demo.launch(debug=True, show_error=True, allowed_paths=[cwd])
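The app.py hunk above wires the UI together: image upload/delete/clear events refresh the captioning area and recompute total steps, start.click(...).then(...) chains dataset creation into training, and the Publish tab handles Hugging Face login and upload. Below is a minimal, hedged sketch of the same click-then chaining pattern; the component and function names are illustrative placeholders, not the ones defined in app.py.

# Minimal sketch of Gradio's click -> then chaining as used above.
# Names here are illustrative placeholders, not taken from app.py.
import gradio as gr

def prepare(text):
    return f"prepared: {text}"

def run(prepared):
    return f"trained with {prepared}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    mid = gr.Textbox(label="Prepared")
    out = gr.Textbox(label="Result")
    btn = gr.Button("Start")
    # the first handler writes to `mid`; the chained .then() consumes it
    btn.click(fn=prepare, inputs=inp, outputs=mid).then(fn=run, inputs=mid, outputs=out)

if __name__ == "__main__":
    demo.launch()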
datasets/1 ADDED
File without changes
docker-compose.yml ADDED
@@ -0,0 +1,28 @@
1
+ services:
2
+
3
+ fluxgym:
4
+ build:
5
+ context: .
6
+ # change the dockerfile to Dockerfile.cuda12.4 if you are running CUDA 12.4 drivers; otherwise leave as is
7
+ dockerfile: Dockerfile
8
+ image: fluxgym
9
+ container_name: fluxgym
10
+ ports:
11
+ - 7860:7860
12
+ environment:
13
+ - PUID=${PUID:-1000}
14
+ - PGID=${PGID:-1000}
15
+ volumes:
16
+ - /etc/localtime:/etc/localtime:ro
17
+ - /etc/timezone:/etc/timezone:ro
18
+ - ./:/app/fluxgym
19
+ stop_signal: SIGKILL
20
+ tty: true
21
+ deploy:
22
+ resources:
23
+ reservations:
24
+ devices:
25
+ - driver: nvidia
26
+ count: all
27
+ capabilities: [gpu]
28
+ restart: unless-stopped
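The compose service above exposes the UI on port 7860, bind-mounts the repository into /app/fluxgym, and reserves all NVIDIA GPUs through the deploy.resources.reservations.devices block. A hedged sanity check, assuming PyTorch is installed inside the image, to confirm that reservation is visible from within the running container:

# Hedged sanity check: run inside the fluxgym container after `docker compose up`
# to verify that the GPU reservation in docker-compose.yml is visible to PyTorch.
# Assumes PyTorch is installed in the image.
import torch

if torch.cuda.is_available():
    print(f"CUDA available, {torch.cuda.device_count()} device(s):")
    for i in range(torch.cuda.device_count()):
        print(f"  {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA not available - check the nvidia device reservation in docker-compose.yml")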
fine_tune.py ADDED
@@ -0,0 +1,560 @@
1
+ # training with captions
2
+ # XXX dropped option: hypernetwork training
3
+
4
+ import argparse
5
+ import math
6
+ import os
7
+ from multiprocessing import Value
8
+ import toml
9
+
10
+ from tqdm import tqdm
11
+
12
+ import torch
13
+ from library import deepspeed_utils, strategy_base
14
+ from library.device_utils import init_ipex, clean_memory_on_device
15
+
16
+ init_ipex()
17
+
18
+ from accelerate.utils import set_seed
19
+ from diffusers import DDPMScheduler
20
+
21
+ from library.utils import setup_logging, add_logging_arguments
22
+
23
+ setup_logging()
24
+ import logging
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ import library.train_util as train_util
29
+ import library.config_util as config_util
30
+ from library.config_util import (
31
+ ConfigSanitizer,
32
+ BlueprintGenerator,
33
+ )
34
+ import library.custom_train_functions as custom_train_functions
35
+ from library.custom_train_functions import (
36
+ apply_snr_weight,
37
+ get_weighted_text_embeddings,
38
+ prepare_scheduler_for_custom_training,
39
+ scale_v_prediction_loss_like_noise_prediction,
40
+ apply_debiased_estimation,
41
+ )
42
+ import library.strategy_sd as strategy_sd
43
+
44
+
45
+ def train(args):
46
+ train_util.verify_training_args(args)
47
+ train_util.prepare_dataset_args(args, True)
48
+ deepspeed_utils.prepare_deepspeed_args(args)
49
+ setup_logging(args, reset=True)
50
+
51
+ cache_latents = args.cache_latents
52
+
53
+ if args.seed is not None:
54
+ set_seed(args.seed) # initialize the random seed
55
+
56
+ tokenize_strategy = strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir)
57
+ strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy)
58
+
59
+ # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization.
60
+ if cache_latents:
61
+ latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy(
62
+ False, args.cache_latents_to_disk, args.vae_batch_size, False
63
+ )
64
+ strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy)
65
+
66
+ # prepare the dataset
67
+ if args.dataset_class is None:
68
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, True, False, True))
69
+ if args.dataset_config is not None:
70
+ logger.info(f"Load dataset config from {args.dataset_config}")
71
+ user_config = config_util.load_user_config(args.dataset_config)
72
+ ignored = ["train_data_dir", "in_json"]
73
+ if any(getattr(args, attr) is not None for attr in ignored):
74
+ logger.warning(
75
+ "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format(
76
+ ", ".join(ignored)
77
+ )
78
+ )
79
+ else:
80
+ user_config = {
81
+ "datasets": [
82
+ {
83
+ "subsets": [
84
+ {
85
+ "image_dir": args.train_data_dir,
86
+ "metadata_file": args.in_json,
87
+ }
88
+ ]
89
+ }
90
+ ]
91
+ }
92
+
93
+ blueprint = blueprint_generator.generate(user_config, args)
94
+ train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
95
+ else:
96
+ train_dataset_group = train_util.load_arbitrary_dataset(args)
97
+
98
+ current_epoch = Value("i", 0)
99
+ current_step = Value("i", 0)
100
+ ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
101
+ collator = train_util.collator_class(current_epoch, current_step, ds_for_collator)
102
+
103
+ if args.debug_dataset:
104
+ train_util.debug_dataset(train_dataset_group)
105
+ return
106
+ if len(train_dataset_group) == 0:
107
+ logger.error(
108
+ "No data found. Please verify the metadata file and train_data_dir option. / 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。"
109
+ )
110
+ return
111
+
112
+ if cache_latents:
113
+ assert (
114
+ train_dataset_group.is_latent_cacheable()
115
+ ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません"
116
+
117
+ # prepare the accelerator
118
+ logger.info("prepare accelerator")
119
+ accelerator = train_util.prepare_accelerator(args)
120
+
121
+ # prepare dtypes for mixed precision and cast as appropriate
122
+ weight_dtype, save_dtype = train_util.prepare_dtype(args)
123
+ vae_dtype = torch.float32 if args.no_half_vae else weight_dtype
124
+
125
+ # load the model
126
+ text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype, accelerator)
127
+
128
+ # verify load/save model formats
129
+ if load_stable_diffusion_format:
130
+ src_stable_diffusion_ckpt = args.pretrained_model_name_or_path
131
+ src_diffusers_model_path = None
132
+ else:
133
+ src_stable_diffusion_ckpt = None
134
+ src_diffusers_model_path = args.pretrained_model_name_or_path
135
+
136
+ if args.save_model_as is None:
137
+ save_stable_diffusion_format = load_stable_diffusion_format
138
+ use_safetensors = args.use_safetensors
139
+ else:
140
+ save_stable_diffusion_format = args.save_model_as.lower() == "ckpt" or args.save_model_as.lower() == "safetensors"
141
+ use_safetensors = args.use_safetensors or ("safetensors" in args.save_model_as.lower())
142
+
143
+ # function to set the xformers flag for the Diffusers model
144
+ def set_diffusers_xformers_flag(model, valid):
145
+ # model.set_use_memory_efficient_attention_xformers(valid) # likely to be removed in an upcoming release
146
+ # the pipeline normally searches its modules recursively for set_use_memory_efficient_attention_xformers
147
+ # when only the U-Net is used we have to replicate that recursive walk ourselves
148
+ # as of 0.10.2 the flag has to be set per module again
149
+
150
+ # Recursively walk through all the children.
151
+ # Any children which exposes the set_use_memory_efficient_attention_xformers method
152
+ # gets the message
153
+ def fn_recursive_set_mem_eff(module: torch.nn.Module):
154
+ if hasattr(module, "set_use_memory_efficient_attention_xformers"):
155
+ module.set_use_memory_efficient_attention_xformers(valid)
156
+
157
+ for child in module.children():
158
+ fn_recursive_set_mem_eff(child)
159
+
160
+ fn_recursive_set_mem_eff(model)
161
+
162
+ # enable xformers or memory-efficient attention on the model
163
+ if args.diffusers_xformers:
164
+ accelerator.print("Use xformers by Diffusers")
165
+ set_diffusers_xformers_flag(unet, True)
166
+ else:
167
+ # the Windows build of xformers cannot train in float, so it must remain possible to disable xformers
168
+ accelerator.print("Disable Diffusers' xformers")
169
+ set_diffusers_xformers_flag(unet, False)
170
+ train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa)
171
+
172
+ # prepare for training
173
+ if cache_latents:
174
+ vae.to(accelerator.device, dtype=vae_dtype)
175
+ vae.requires_grad_(False)
176
+ vae.eval()
177
+
178
+ train_dataset_group.new_cache_latents(vae, accelerator)
179
+
180
+ vae.to("cpu")
181
+ clean_memory_on_device(accelerator.device)
182
+
183
+ accelerator.wait_for_everyone()
184
+
185
+ # prepare for training: put the models into the proper state
186
+ training_models = []
187
+ if args.gradient_checkpointing:
188
+ unet.enable_gradient_checkpointing()
189
+ training_models.append(unet)
190
+
191
+ if args.train_text_encoder:
192
+ accelerator.print("enable text encoder training")
193
+ if args.gradient_checkpointing:
194
+ text_encoder.gradient_checkpointing_enable()
195
+ training_models.append(text_encoder)
196
+ else:
197
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
198
+ text_encoder.requires_grad_(False) # the text encoder is not trained
199
+ if args.gradient_checkpointing:
200
+ text_encoder.gradient_checkpointing_enable()
201
+ text_encoder.train() # required for gradient_checkpointing
202
+ else:
203
+ text_encoder.eval()
204
+
205
+ text_encoding_strategy = strategy_sd.SdTextEncodingStrategy(args.clip_skip)
206
+ strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy)
207
+
208
+ if not cache_latents:
209
+ vae.requires_grad_(False)
210
+ vae.eval()
211
+ vae.to(accelerator.device, dtype=vae_dtype)
212
+
213
+ for m in training_models:
214
+ m.requires_grad_(True)
215
+
216
+ trainable_params = []
217
+ if args.learning_rate_te is None or not args.train_text_encoder:
218
+ for m in training_models:
219
+ trainable_params.extend(m.parameters())
220
+ else:
221
+ trainable_params = [
222
+ {"params": list(unet.parameters()), "lr": args.learning_rate},
223
+ {"params": list(text_encoder.parameters()), "lr": args.learning_rate_te},
224
+ ]
225
+
226
+ # prepare the classes needed for training
227
+ accelerator.print("prepare optimizer, data loader etc.")
228
+ _, _, optimizer = train_util.get_optimizer(args, trainable_params=trainable_params)
229
+
230
+ # prepare dataloader
231
+ # strategies are set here because they cannot be referenced in another process. Copy them with the dataset
232
+ # some strategies can be None
233
+ train_dataset_group.set_current_strategies()
234
+
235
+ # number of DataLoader workers: note that persistent_workers cannot be used with 0 workers
236
+ n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers
237
+ train_dataloader = torch.utils.data.DataLoader(
238
+ train_dataset_group,
239
+ batch_size=1,
240
+ shuffle=True,
241
+ collate_fn=collator,
242
+ num_workers=n_workers,
243
+ persistent_workers=args.persistent_data_loader_workers,
244
+ )
245
+
246
+ # calculate the number of training steps
247
+ if args.max_train_epochs is not None:
248
+ args.max_train_steps = args.max_train_epochs * math.ceil(
249
+ len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
250
+ )
251
+ accelerator.print(
252
+ f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
253
+ )
254
+
255
+ # also pass the number of training steps to the dataset
256
+ train_dataset_group.set_max_train_steps(args.max_train_steps)
257
+
258
+ # prepare the lr scheduler
259
+ lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes)
260
+
261
+ # experimental feature: train fully in fp16 including gradients; cast the whole model to fp16
262
+ if args.full_fp16:
263
+ assert (
264
+ args.mixed_precision == "fp16"
265
+ ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。"
266
+ accelerator.print("enable full fp16 training.")
267
+ unet.to(weight_dtype)
268
+ text_encoder.to(weight_dtype)
269
+
270
+ if args.deepspeed:
271
+ if args.train_text_encoder:
272
+ ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder)
273
+ else:
274
+ ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet)
275
+ ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
276
+ ds_model, optimizer, train_dataloader, lr_scheduler
277
+ )
278
+ training_models = [ds_model]
279
+ else:
280
+ # accelerator handles wrapping and device placement for us
281
+ if args.train_text_encoder:
282
+ unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
283
+ unet, text_encoder, optimizer, train_dataloader, lr_scheduler
284
+ )
285
+ else:
286
+ unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
287
+
288
+ # experimental feature: train fully in fp16 including gradients; patch PyTorch to enable grad scaling in fp16
289
+ if args.full_fp16:
290
+ train_util.patch_accelerator_for_fp16_training(accelerator)
291
+
292
+ # resume training if specified
293
+ train_util.resume_from_local_or_hf_if_specified(accelerator, args)
294
+
295
+ # calculate the number of epochs
296
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
297
+ num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
298
+ if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0):
299
+ args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1
300
+
301
+ # start training
302
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
303
+ accelerator.print("running training / 学習開始")
304
+ accelerator.print(f" num examples / サンプル数: {train_dataset_group.num_train_images}")
305
+ accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
306
+ accelerator.print(f" num epochs / epoch数: {num_train_epochs}")
307
+ accelerator.print(f" batch size per device / バッチサイズ: {args.train_batch_size}")
308
+ accelerator.print(
309
+ f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}"
310
+ )
311
+ accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}")
312
+ accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}")
313
+
314
+ progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
315
+ global_step = 0
316
+
317
+ noise_scheduler = DDPMScheduler(
318
+ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False
319
+ )
320
+ prepare_scheduler_for_custom_training(noise_scheduler, accelerator.device)
321
+ if args.zero_terminal_snr:
322
+ custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler)
323
+
324
+ if accelerator.is_main_process:
325
+ init_kwargs = {}
326
+ if args.wandb_run_name:
327
+ init_kwargs["wandb"] = {"name": args.wandb_run_name}
328
+ if args.log_tracker_config is not None:
329
+ init_kwargs = toml.load(args.log_tracker_config)
330
+ accelerator.init_trackers(
331
+ "finetuning" if args.log_tracker_name is None else args.log_tracker_name,
332
+ config=train_util.get_sanitized_config_or_none(args),
333
+ init_kwargs=init_kwargs,
334
+ )
335
+
336
+ # For --sample_at_first
337
+ train_util.sample_images(
338
+ accelerator, args, 0, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet
339
+ )
340
+
341
+ loss_recorder = train_util.LossRecorder()
342
+ for epoch in range(num_train_epochs):
343
+ accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}")
344
+ current_epoch.value = epoch + 1
345
+
346
+ for m in training_models:
347
+ m.train()
348
+
349
+ for step, batch in enumerate(train_dataloader):
350
+ current_step.value = global_step
351
+ with accelerator.accumulate(*training_models):
352
+ with torch.no_grad():
353
+ if "latents" in batch and batch["latents"] is not None:
354
+ latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype)
355
+ else:
356
+ # convert images to latents
357
+ latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(weight_dtype)
358
+ latents = latents * 0.18215
359
+ b_size = latents.shape[0]
360
+
361
+ with torch.set_grad_enabled(args.train_text_encoder):
362
+ # Get the text embedding for conditioning
363
+ if args.weighted_captions:
364
+ # TODO move to strategy_sd.py
365
+ encoder_hidden_states = get_weighted_text_embeddings(
366
+ tokenize_strategy.tokenizer,
367
+ text_encoder,
368
+ batch["captions"],
369
+ accelerator.device,
370
+ args.max_token_length // 75 if args.max_token_length else 1,
371
+ clip_skip=args.clip_skip,
372
+ )
373
+ else:
374
+ input_ids = batch["input_ids_list"][0].to(accelerator.device)
375
+ encoder_hidden_states = text_encoding_strategy.encode_tokens(
376
+ tokenize_strategy, [text_encoder], [input_ids]
377
+ )[0]
378
+ if args.full_fp16:
379
+ encoder_hidden_states = encoder_hidden_states.to(weight_dtype)
380
+
381
+ # Sample noise, sample a random timestep for each image, and add noise to the latents,
382
+ # with noise offset and/or multires noise if specified
383
+ noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(
384
+ args, noise_scheduler, latents
385
+ )
386
+
387
+ # Predict the noise residual
388
+ with accelerator.autocast():
389
+ noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
390
+
391
+ if args.v_parameterization:
392
+ # v-parameterization training
393
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
394
+ else:
395
+ target = noise
396
+
397
+ if args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred or args.debiased_estimation_loss:
398
+ # do not mean over batch dimension for snr weight or scale v-pred loss
399
+ loss = train_util.conditional_loss(
400
+ noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
401
+ )
402
+ loss = loss.mean([1, 2, 3])
403
+
404
+ if args.min_snr_gamma:
405
+ loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma, args.v_parameterization)
406
+ if args.scale_v_pred_loss_like_noise_pred:
407
+ loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler)
408
+ if args.debiased_estimation_loss:
409
+ loss = apply_debiased_estimation(loss, timesteps, noise_scheduler)
410
+
411
+ loss = loss.mean() # mean over batch dimension
412
+ else:
413
+ loss = train_util.conditional_loss(
414
+ noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
415
+ )
416
+
417
+ accelerator.backward(loss)
418
+ if accelerator.sync_gradients and args.max_grad_norm != 0.0:
419
+ params_to_clip = []
420
+ for m in training_models:
421
+ params_to_clip.extend(m.parameters())
422
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
423
+
424
+ optimizer.step()
425
+ lr_scheduler.step()
426
+ optimizer.zero_grad(set_to_none=True)
427
+
428
+ # Checks if the accelerator has performed an optimization step behind the scenes
429
+ if accelerator.sync_gradients:
430
+ progress_bar.update(1)
431
+ global_step += 1
432
+
433
+ train_util.sample_images(
434
+ accelerator, args, None, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet
435
+ )
436
+
437
+ # save the model every specified number of steps
438
+ if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0:
439
+ accelerator.wait_for_everyone()
440
+ if accelerator.is_main_process:
441
+ src_path = src_stable_diffusion_ckpt if save_stable_diffusion_format else src_diffusers_model_path
442
+ train_util.save_sd_model_on_epoch_end_or_stepwise(
443
+ args,
444
+ False,
445
+ accelerator,
446
+ src_path,
447
+ save_stable_diffusion_format,
448
+ use_safetensors,
449
+ save_dtype,
450
+ epoch,
451
+ num_train_epochs,
452
+ global_step,
453
+ accelerator.unwrap_model(text_encoder),
454
+ accelerator.unwrap_model(unet),
455
+ vae,
456
+ )
457
+
458
+ current_loss = loss.detach().item() # this is a mean, so batch size should not matter
459
+ if args.logging_dir is not None:
460
+ logs = {"loss": current_loss}
461
+ train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True)
462
+ accelerator.log(logs, step=global_step)
463
+
464
+ loss_recorder.add(epoch=epoch, step=step, loss=current_loss)
465
+ avr_loss: float = loss_recorder.moving_average
466
+ logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]}
467
+ progress_bar.set_postfix(**logs)
468
+
469
+ if global_step >= args.max_train_steps:
470
+ break
471
+
472
+ if args.logging_dir is not None:
473
+ logs = {"loss/epoch": loss_recorder.moving_average}
474
+ accelerator.log(logs, step=epoch + 1)
475
+
476
+ accelerator.wait_for_everyone()
477
+
478
+ if args.save_every_n_epochs is not None:
479
+ if accelerator.is_main_process:
480
+ src_path = src_stable_diffusion_ckpt if save_stable_diffusion_format else src_diffusers_model_path
481
+ train_util.save_sd_model_on_epoch_end_or_stepwise(
482
+ args,
483
+ True,
484
+ accelerator,
485
+ src_path,
486
+ save_stable_diffusion_format,
487
+ use_safetensors,
488
+ save_dtype,
489
+ epoch,
490
+ num_train_epochs,
491
+ global_step,
492
+ accelerator.unwrap_model(text_encoder),
493
+ accelerator.unwrap_model(unet),
494
+ vae,
495
+ )
496
+
497
+ train_util.sample_images(
498
+ accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet
499
+ )
500
+
501
+ is_main_process = accelerator.is_main_process
502
+ if is_main_process:
503
+ unet = accelerator.unwrap_model(unet)
504
+ text_encoder = accelerator.unwrap_model(text_encoder)
505
+
506
+ accelerator.end_training()
507
+
508
+ if is_main_process and (args.save_state or args.save_state_on_train_end):
509
+ train_util.save_state_on_train_end(args, accelerator)
510
+
511
+ del accelerator # delete this because memory is needed afterwards
512
+
513
+ if is_main_process:
514
+ src_path = src_stable_diffusion_ckpt if save_stable_diffusion_format else src_diffusers_model_path
515
+ train_util.save_sd_model_on_train_end(
516
+ args, src_path, save_stable_diffusion_format, use_safetensors, save_dtype, epoch, global_step, text_encoder, unet, vae
517
+ )
518
+ logger.info("model saved.")
519
+
520
+
521
+ def setup_parser() -> argparse.ArgumentParser:
522
+ parser = argparse.ArgumentParser()
523
+
524
+ add_logging_arguments(parser)
525
+ train_util.add_sd_models_arguments(parser)
526
+ train_util.add_dataset_arguments(parser, False, True, True)
527
+ train_util.add_training_arguments(parser, False)
528
+ deepspeed_utils.add_deepspeed_arguments(parser)
529
+ train_util.add_sd_saving_arguments(parser)
530
+ train_util.add_optimizer_arguments(parser)
531
+ config_util.add_config_arguments(parser)
532
+ custom_train_functions.add_custom_train_arguments(parser)
533
+
534
+ parser.add_argument(
535
+ "--diffusers_xformers", action="store_true", help="use xformers by diffusers / Diffusersでxformersを使用する"
536
+ )
537
+ parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する")
538
+ parser.add_argument(
539
+ "--learning_rate_te",
540
+ type=float,
541
+ default=None,
542
+ help="learning rate for text encoder, default is same as unet / Text Encoderの学習率、デフォルトはunetと同じ",
543
+ )
544
+ parser.add_argument(
545
+ "--no_half_vae",
546
+ action="store_true",
547
+ help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う",
548
+ )
549
+
550
+ return parser
551
+
552
+
553
+ if __name__ == "__main__":
554
+ parser = setup_parser()
555
+
556
+ args = parser.parse_args()
557
+ train_util.verify_command_line_training_args(args)
558
+ args = train_util.read_config_from_file(args, parser)
559
+
560
+ train(args)
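The training loop in fine_tune.py above picks the regression target per step (raw noise for epsilon-prediction, noise_scheduler.get_velocity(...) for v-parameterization) and can re-weight the per-sample loss with min-SNR-gamma before averaging. Below is a hedged, standalone sketch of that weighting; the actual helper is apply_snr_weight in library.custom_train_functions and its exact signature may differ, but the formula follows the commonly used min(SNR, gamma)/SNR (epsilon) and min(SNR, gamma)/(SNR+1) (v-prediction) scheme.

# Hedged sketch of min-SNR-gamma loss weighting as applied in the loop above.
# The real implementation lives in library.custom_train_functions; names and
# conventions here are illustrative only.
import torch

def min_snr_weight(timesteps, alphas_cumprod, gamma, v_parameterization=False):
    # SNR(t) = alpha_bar_t / (1 - alpha_bar_t) for a DDPM-style scheduler
    alpha_bar = alphas_cumprod[timesteps]
    snr = alpha_bar / (1.0 - alpha_bar)
    clipped = torch.minimum(snr, torch.full_like(snr, gamma))
    if v_parameterization:
        return clipped / (snr + 1.0)   # v-prediction target
    return clipped / snr               # epsilon-prediction target

# usage: per-sample loss of shape (B,) scaled before the final mean, e.g.
# loss = loss * min_snr_weight(timesteps, noise_scheduler.alphas_cumprod.to(loss.device), gamma=5.0)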
flags.png ADDED
flow.gif ADDED

Git LFS Details

  • SHA256: e502e5bcbfd25f5d7bad10e0b57a88c8f3b24006792d3a273d7bd964634a8fd9
  • Pointer size: 133 Bytes
  • Size of remote file: 11.3 MB
flux_extract_lora.py ADDED
@@ -0,0 +1,221 @@
1
+ # extract approximating LoRA by svd from two FLUX models
2
+ # The code is based on https://github.com/cloneofsimo/lora/blob/develop/lora_diffusion/cli_svd.py
3
+ # Thanks to cloneofsimo!
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ import time
9
+ import torch
10
+ from safetensors.torch import load_file, save_file
11
+ from safetensors import safe_open
12
+ from tqdm import tqdm
13
+ from .library import flux_utils, sai_model_spec
14
+ from .library.utils import MemoryEfficientSafeOpen
15
+ from .library.utils import setup_logging
16
+ from .networks import lora_flux
17
+
18
+ setup_logging()
19
+ import logging
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ from comfy.utils import ProgressBar
24
+ # CLAMP_QUANTILE = 0.99
25
+ # MIN_DIFF = 1e-1
26
+
27
+
28
+ def save_to_file(file_name, state_dict, metadata, dtype):
29
+ if dtype is not None:
30
+ for key in list(state_dict.keys()):
31
+ if type(state_dict[key]) == torch.Tensor:
32
+ state_dict[key] = state_dict[key].to(dtype)
33
+
34
+ save_file(state_dict, file_name, metadata=metadata)
35
+
36
+
37
+ def svd(
38
+ model_org=None,
39
+ model_tuned=None,
40
+ save_to=None,
41
+ dim=4,
42
+ device=None,
43
+ store_device='cpu',
44
+ save_precision=None,
45
+ clamp_quantile=0.99,
46
+ min_diff=0.01,
47
+ no_metadata=False,
48
+ mem_eff_safe_open=False,
49
+ ):
50
+ def str_to_dtype(p):
51
+ if p == "float":
52
+ return torch.float
53
+ if p == "fp16":
54
+ return torch.float16
55
+ if p == "bf16":
56
+ return torch.bfloat16
57
+ return None
58
+
59
+ calc_dtype = torch.float
60
+ save_dtype = str_to_dtype(save_precision)
61
+
62
+ # open models
63
+ lora_weights = {}
64
+ if not mem_eff_safe_open:
65
+ # use original safetensors.safe_open
66
+ open_fn = lambda fn: safe_open(fn, framework="pt")
67
+ else:
68
+ logger.info("Using memory efficient safe_open")
69
+ open_fn = lambda fn: MemoryEfficientSafeOpen(fn)
70
+
71
+ with open_fn(model_org) as fo:
72
+ # filter keys
73
+ keys = []
74
+ for key in fo.keys():
75
+ if not ("single_block" in key or "double_block" in key):
76
+ continue
77
+ if ".bias" in key:
78
+ continue
79
+ if "norm" in key:
80
+ continue
81
+ keys.append(key)
82
+ comfy_pbar = ProgressBar(len(keys))
83
+ with open_fn(model_tuned) as ft:
84
+ for key in tqdm(keys):
85
+ # get tensors and calculate difference
86
+ value_o = fo.get_tensor(key)
87
+ value_t = ft.get_tensor(key)
88
+ mat = value_t.to(calc_dtype) - value_o.to(calc_dtype)
89
+ del value_o, value_t
90
+
91
+ # extract LoRA weights
92
+ if device:
93
+ mat = mat.to(device)
94
+ out_dim, in_dim = mat.size()[0:2]
95
+ rank = min(dim, in_dim, out_dim) # LoRA rank cannot exceed the original dim
96
+
97
+ mat = mat.squeeze()
98
+
99
+ U, S, Vh = torch.linalg.svd(mat)
100
+
101
+ U = U[:, :rank]
102
+ S = S[:rank]
103
+ U = U @ torch.diag(S)
104
+
105
+ Vh = Vh[:rank, :]
106
+
107
+ dist = torch.cat([U.flatten(), Vh.flatten()])
108
+ hi_val = torch.quantile(dist, clamp_quantile)
109
+ low_val = -hi_val
110
+
111
+ U = U.clamp(low_val, hi_val)
112
+ Vh = Vh.clamp(low_val, hi_val)
113
+
114
+ U = U.to(store_device, dtype=save_dtype).contiguous()
115
+ Vh = Vh.to(store_device, dtype=save_dtype).contiguous()
116
+
117
+ print(f"key: {key}, U: {U.size()}, Vh: {Vh.size()}")
118
+ comfy_pbar.update(1)
119
+ lora_weights[key] = (U, Vh)
120
+ del mat, U, S, Vh
121
+
122
+ # make state dict for LoRA
123
+ lora_sd = {}
124
+ for key, (up_weight, down_weight) in lora_weights.items():
125
+ lora_name = key.replace(".weight", "").replace(".", "_")
126
+ lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + lora_name
127
+ lora_sd[lora_name + ".lora_up.weight"] = up_weight
128
+ lora_sd[lora_name + ".lora_down.weight"] = down_weight
129
+ lora_sd[lora_name + ".alpha"] = torch.tensor(down_weight.size()[0]) # same as rank
130
+
131
+ # minimum metadata
132
+ net_kwargs = {}
133
+ metadata = {
134
+ "ss_v2": str(False),
135
+ "ss_base_model_version": flux_utils.MODEL_VERSION_FLUX_V1,
136
+ "ss_network_module": "networks.lora_flux",
137
+ "ss_network_dim": str(dim),
138
+ "ss_network_alpha": str(float(dim)),
139
+ "ss_network_args": json.dumps(net_kwargs),
140
+ }
141
+
142
+ if not no_metadata:
143
+ title = os.path.splitext(os.path.basename(save_to))[0]
144
+ sai_metadata = sai_model_spec.build_metadata(lora_sd, False, False, False, True, False, time.time(), title, flux="dev")
145
+ metadata.update(sai_metadata)
146
+
147
+ save_to_file(save_to, lora_sd, metadata, save_dtype)
148
+
149
+ logger.info(f"LoRA weights saved to {save_to}")
150
+ return save_to
151
+
152
+
153
+ def setup_parser() -> argparse.ArgumentParser:
154
+ parser = argparse.ArgumentParser()
155
+ parser.add_argument(
156
+ "--save_precision",
157
+ type=str,
158
+ default=None,
159
+ choices=[None, "float", "fp16", "bf16"],
160
+ help="precision in saving, same to merging if omitted / 保存時に精度を変更して保存する、省略時はfloat",
161
+ )
162
+ parser.add_argument(
163
+ "--model_org",
164
+ type=str,
165
+ default=None,
166
+ required=True,
167
+ help="Original model: safetensors file / 元モデル、safetensors",
168
+ )
169
+ parser.add_argument(
170
+ "--model_tuned",
171
+ type=str,
172
+ default=None,
173
+ required=True,
174
+ help="Tuned model, LoRA is difference of `original to tuned`: safetensors file / 派生モデル(生成されるLoRAは元→派生の差分になります)、ckptまたはsafetensors",
175
+ )
176
+ parser.add_argument(
177
+ "--mem_eff_safe_open",
178
+ action="store_true",
179
+ help="use memory efficient safe_open. This is an experimental feature, use only when memory is not enough."
180
+ " / メモリ効率の良いsafe_openを使用する。実装は実験的なものなので、メモリが足りない場合のみ使用してください。",
181
+ )
182
+ parser.add_argument(
183
+ "--save_to",
184
+ type=str,
185
+ default=None,
186
+ required=True,
187
+ help="destination file name: safetensors file / 保存先のファイル名、safetensors",
188
+ )
189
+ parser.add_argument(
190
+ "--dim", type=int, default=4, help="dimension (rank) of LoRA (default 4) / LoRAの次元数(rank)(デフォルト4)"
191
+ )
192
+ parser.add_argument(
193
+ "--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う"
194
+ )
195
+ parser.add_argument(
196
+ "--clamp_quantile",
197
+ type=float,
198
+ default=0.99,
199
+ help="Quantile clamping value, float, (0-1). Default = 0.99 / 値をクランプするための分位点、float、(0-1)。デフォルトは0.99",
200
+ )
201
+ # parser.add_argument(
202
+ # "--min_diff",
203
+ # type=float,
204
+ # default=0.01,
205
+ # help="Minimum difference between finetuned model and base to consider them different enough to extract, float, (0-1). Default = 0.01 /"
206
+ # + "LoRAを抽出するために元モデルと派生モデルの差分の最小値、float、(0-1)。デフォルトは0.01",
207
+ # )
208
+ parser.add_argument(
209
+ "--no_metadata",
210
+ action="store_true",
211
+ help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / "
212
+ + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)",
213
+ )
214
+ return parser
215
+
216
+
217
+ if __name__ == "__main__":
218
+ parser = setup_parser()
219
+
220
+ args = parser.parse_args()
221
+ svd(**vars(args))
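At its core, svd() above factors the per-layer weight difference between the tuned and original FLUX models into a rank-limited pair of LoRA matrices via truncated SVD, folding the singular values into the up weight and clamping outliers at a quantile. A hedged, self-contained sketch of that factorization for a single 2D weight is shown below; it mirrors the defaults above (rank 4, 0.99 quantile) but is an illustration, not the script itself.

# Hedged sketch of the per-weight LoRA extraction performed in svd() above:
# factor the weight delta into (up @ down) of rank r via truncated SVD.
import torch

def extract_lora_pair(w_tuned, w_org, rank=4, clamp_quantile=0.99):
    mat = (w_tuned.float() - w_org.float()).squeeze()   # weight delta
    out_dim, in_dim = mat.shape
    r = min(rank, in_dim, out_dim)                       # rank cannot exceed the matrix dims

    U, S, Vh = torch.linalg.svd(mat)
    up = U[:, :r] @ torch.diag(S[:r])                    # fold singular values into the up weight
    down = Vh[:r, :]

    # clamp outliers symmetrically at the given quantile, as the script does
    hi = torch.quantile(torch.cat([up.flatten(), down.flatten()]), clamp_quantile)
    return up.clamp(-hi, hi).contiguous(), down.clamp(-hi, hi).contiguous()

# usage with hypothetical tensors:
# up, down = extract_lora_pair(torch.randn(3072, 3072), torch.randn(3072, 3072), rank=8)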
flux_train_comfy.py ADDED
@@ -0,0 +1,806 @@
1
+ # training with captions
2
+
3
+ # Swap blocks between CPU and GPU:
4
+ # This implementation is inspired by and based on the work of 2kpr.
5
+ # Many thanks to 2kpr for the original concept and implementation of memory-efficient offloading.
6
+ # The original idea has been adapted and extended to fit the current project's needs.
7
+
8
+ # Key features:
9
+ # - CPU offloading during forward and backward passes
10
+ # - Use of fused optimizer and grad_hook for efficient gradient processing
11
+ # - Per-block fused optimizer instances
12
+
13
+ import argparse
14
+ import copy
15
+ import math
16
+ import os
17
+ from multiprocessing import Value
18
+ import toml
19
+
20
+ from tqdm import tqdm
21
+
22
+ import torch
23
+ from .library.device_utils import init_ipex, clean_memory_on_device
24
+
25
+ init_ipex()
26
+
27
+ from accelerate.utils import set_seed
28
+ from .library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux
29
+ from .library.sd3_train_utils import FlowMatchEulerDiscreteScheduler
30
+
31
+ from .library import train_util as train_util
32
+
33
+ from .library.utils import setup_logging, add_logging_arguments
34
+
35
+ setup_logging()
36
+ import logging
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ from .library import config_util as config_util
41
+
42
+ from .library.config_util import (
43
+ ConfigSanitizer,
44
+ BlueprintGenerator,
45
+ )
46
+ from .library.custom_train_functions import apply_masked_loss, add_custom_train_arguments
47
+
48
+
49
+ class FluxTrainer:
50
+ def __init__(self):
51
+ self.sample_prompts_te_outputs = None
52
+
53
+ def sample_images(self, epoch, global_step, validation_settings):
54
+ image_tensors = flux_train_utils.sample_images(
55
+ self.accelerator, self.args, epoch, global_step, self.unet, self.vae, self.text_encoder, self.sample_prompts_te_outputs, validation_settings)
56
+ return image_tensors
57
+
58
+ def init_train(self, args):
59
+ train_util.verify_training_args(args)
60
+ train_util.prepare_dataset_args(args, True)
61
+ # sdxl_train_util.verify_sdxl_training_args(args)
62
+ deepspeed_utils.prepare_deepspeed_args(args)
63
+ setup_logging(args, reset=True)
64
+
65
+ # temporary: backward compatibility for deprecated options. remove in the future
66
+ if not args.skip_cache_check:
67
+ args.skip_cache_check = args.skip_latents_validity_check
68
+
69
+ if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs:
70
+ logger.warning(
71
+ "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります"
72
+ )
73
+ args.cache_text_encoder_outputs = True
74
+
75
+ if args.cpu_offload_checkpointing and not args.gradient_checkpointing:
76
+ logger.warning(
77
+ "cpu_offload_checkpointing is enabled, so gradient_checkpointing is also enabled / cpu_offload_checkpointingが有効になっているため、gradient_checkpointingも有効になります"
78
+ )
79
+ args.gradient_checkpointing = True
80
+
81
+ assert (
82
+ args.blocks_to_swap is None or args.blocks_to_swap == 0
83
+ ) or not args.cpu_offload_checkpointing, (
84
+ "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません"
85
+ )
86
+
87
+ cache_latents = args.cache_latents
88
+ use_dreambooth_method = args.in_json is None
89
+
90
+ if args.seed is not None:
91
+ set_seed(args.seed)
92
+
93
+ # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization.
94
+ if args.cache_latents:
95
+ latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy(
96
+ args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check
97
+ )
98
+ strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy)
99
+
100
+ # Prepare the dataset
101
+ if args.dataset_class is None:
102
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True))
103
+ if args.dataset_config is not None:
104
+ logger.info(f"Load dataset config from {args.dataset_config}")
105
+ user_config = config_util.load_user_config(args.dataset_config)
106
+ ignored = ["train_data_dir", "in_json"]
107
+ if any(getattr(args, attr) is not None for attr in ignored):
108
+ logger.warning(
109
+ "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format(
110
+ ", ".join(ignored)
111
+ )
112
+ )
113
+ else:
114
+ if use_dreambooth_method:
115
+ logger.info("Using DreamBooth method.")
116
+ user_config = {
117
+ "datasets": [
118
+ {
119
+ "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs(
120
+ args.train_data_dir, args.reg_data_dir
121
+ )
122
+ }
123
+ ]
124
+ }
125
+ else:
126
+ logger.info("Training with captions.")
127
+ user_config = {
128
+ "datasets": [
129
+ {
130
+ "subsets": [
131
+ {
132
+ "image_dir": args.train_data_dir,
133
+ "metadata_file": args.in_json,
134
+ }
135
+ ]
136
+ }
137
+ ]
138
+ }
139
+
140
+ blueprint = blueprint_generator.generate(user_config, args)
141
+ train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
142
+ else:
143
+ train_dataset_group = train_util.load_arbitrary_dataset(args)
144
+
145
+ current_epoch = Value("i", 0)
146
+ current_step = Value("i", 0)
147
+ ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
148
+ collator = train_util.collator_class(current_epoch, current_step, ds_for_collator)
149
+
150
+ train_dataset_group.verify_bucket_reso_steps(16) # TODO confirm this value is appropriate
151
+
152
+ _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path)
153
+ if args.debug_dataset:
154
+ if args.cache_text_encoder_outputs:
155
+ strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(
156
+ strategy_flux.FluxTextEncoderOutputsCachingStrategy(
157
+ args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, args.skip_cache_check, False
158
+ )
159
+ )
160
+ t5xxl_max_token_length = (
161
+ args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else (256 if is_schnell else 512)
162
+ )
163
+ strategy_base.TokenizeStrategy.set_strategy(strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length))
164
+
165
+ train_dataset_group.set_current_strategies()
166
+ train_util.debug_dataset(train_dataset_group, True)
167
+ return
168
+ if len(train_dataset_group) == 0:
169
+ logger.error(
170
+ "No data found. Please verify the metadata file and train_data_dir option. / 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。"
171
+ )
172
+ return
173
+
174
+ if cache_latents:
175
+ assert (
176
+ train_dataset_group.is_latent_cacheable()
177
+ ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません"
178
+
179
+ if args.cache_text_encoder_outputs:
180
+ assert (
181
+ train_dataset_group.is_text_encoder_output_cacheable()
182
+ ), "when caching text encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / text encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません"
183
+
184
+ # prepare the accelerator
185
+ logger.info("prepare accelerator")
186
+ accelerator = train_util.prepare_accelerator(args)
187
+
188
+ # prepare dtypes for mixed precision and cast as appropriate
189
+ weight_dtype, save_dtype = train_util.prepare_dtype(args)
190
+
191
+ # load VAE for caching latents
192
+ ae = None
193
+ if cache_latents:
194
+ ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors)
195
+ ae.to(accelerator.device, dtype=weight_dtype)
196
+ ae.requires_grad_(False)
197
+ ae.eval()
198
+
199
+ train_dataset_group.new_cache_latents(ae, accelerator)
200
+
201
+ ae.to("cpu") # if no sampling, vae can be deleted
202
+ clean_memory_on_device(accelerator.device)
203
+
204
+ accelerator.wait_for_everyone()
205
+
206
+ # prepare tokenize strategy
207
+ if args.t5xxl_max_token_length is None:
208
+ if is_schnell:
209
+ t5xxl_max_token_length = 256
210
+ else:
211
+ t5xxl_max_token_length = 512
212
+ else:
213
+ t5xxl_max_token_length = args.t5xxl_max_token_length
214
+
215
+ flux_tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length)
216
+ strategy_base.TokenizeStrategy.set_strategy(flux_tokenize_strategy)
217
+
218
+ # load clip_l, t5xxl for caching text encoder outputs
219
+ clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", args.disable_mmap_load_safetensors)
220
+ t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu", args.disable_mmap_load_safetensors)
221
+ clip_l.eval()
222
+ t5xxl.eval()
223
+ clip_l.requires_grad_(False)
224
+ t5xxl.requires_grad_(False)
225
+
226
+ text_encoding_strategy = strategy_flux.FluxTextEncodingStrategy(args.apply_t5_attn_mask)
227
+ strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy)
228
+
229
+ # cache text encoder outputs
230
+ sample_prompts_te_outputs = None
231
+ if args.cache_text_encoder_outputs:
232
+ # Text Encodes are eval and no grad here
233
+ clip_l.to(accelerator.device)
234
+ t5xxl.to(accelerator.device)
235
+
236
+ text_encoder_caching_strategy = strategy_flux.FluxTextEncoderOutputsCachingStrategy(
237
+ args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False, args.apply_t5_attn_mask
238
+ )
239
+ strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy)
240
+
241
+ with accelerator.autocast():
242
+ train_dataset_group.new_cache_text_encoder_outputs([clip_l, t5xxl], accelerator)
243
+
244
+ # cache sample prompt's embeddings to free text encoder's memory
245
+ if args.sample_prompts is not None:
246
+ logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}")
247
+
248
+ text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy()
249
+
250
+ prompts = []
251
+ for line in args.sample_prompts:
252
+ line = line.strip()
253
+ if len(line) > 0 and line[0] != "#":
254
+ prompts.append(line)
255
+
256
+ # preprocess prompts
257
+ for i in range(len(prompts)):
258
+ prompt_dict = prompts[i]
259
+ if isinstance(prompt_dict, str):
260
+ from .library.train_util import line_to_prompt_dict
261
+
262
+ prompt_dict = line_to_prompt_dict(prompt_dict)
263
+ prompts[i] = prompt_dict
264
+ assert isinstance(prompt_dict, dict)
265
+
266
+ # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict.
267
+ prompt_dict["enum"] = i
268
+ prompt_dict.pop("subset", None)
269
+
270
+ sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs
271
+ with accelerator.autocast(), torch.no_grad():
272
+ for prompt_dict in prompts:
273
+ for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
274
+ if p not in sample_prompts_te_outputs:
275
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
276
+ tokens_and_masks = flux_tokenize_strategy.tokenize(p)
277
+ sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens(
278
+ flux_tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask
279
+ )
280
+ self.sample_prompts_te_outputs = sample_prompts_te_outputs
281
+ accelerator.wait_for_everyone()
282
+
283
+ # now we can delete Text Encoders to free memory
284
+ clip_l = None
285
+ t5xxl = None
286
+ clean_memory_on_device(accelerator.device)
287
+
288
+ # load FLUX
289
+ _, flux = flux_utils.load_flow_model(
290
+ args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors
291
+ )
292
+
293
+ if args.gradient_checkpointing:
294
+ flux.enable_gradient_checkpointing(cpu_offload=args.cpu_offload_checkpointing)
295
+
296
+ flux.requires_grad_(True)
297
+
298
+ # block swap
299
+
300
+ # backward compatibility
301
+ if args.blocks_to_swap is None:
302
+ blocks_to_swap = args.double_blocks_to_swap or 0
303
+ if args.single_blocks_to_swap is not None:
304
+ blocks_to_swap += args.single_blocks_to_swap // 2
305
+ if blocks_to_swap > 0:
306
+ logger.warning(
307
+ "double_blocks_to_swap and single_blocks_to_swap are deprecated. Use blocks_to_swap instead."
308
+ " / double_blocks_to_swapとsingle_blocks_to_swapは非推奨です。blocks_to_swapを使ってください。"
309
+ )
310
+ logger.info(
311
+ f"double_blocks_to_swap={args.double_blocks_to_swap} and single_blocks_to_swap={args.single_blocks_to_swap} are converted to blocks_to_swap={blocks_to_swap}."
312
+ )
313
+ args.blocks_to_swap = blocks_to_swap
314
+ del blocks_to_swap
315
+
316
+ self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0
317
+ if self.is_swapping_blocks:
318
+ # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes.
319
+ # This idea is based on 2kpr's great work. Thank you!
320
+ logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}")
321
+ flux.enable_block_swap(args.blocks_to_swap, accelerator.device)
322
+
323
+ if not cache_latents:
324
+ # load VAE here if not cached
325
+ ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu")
326
+ ae.requires_grad_(False)
327
+ ae.eval()
328
+ ae.to(accelerator.device, dtype=weight_dtype)
329
+
330
+ training_models = []
331
+ params_to_optimize = []
332
+ training_models.append(flux)
333
+ name_and_params = list(flux.named_parameters())
334
+ # single param group for now
335
+ params_to_optimize.append({"params": [p for _, p in name_and_params], "lr": args.learning_rate})
336
+ param_names = [[n for n, _ in name_and_params]]
337
+
338
+ # calculate number of trainable parameters
339
+ n_params = 0
340
+ for group in params_to_optimize:
341
+ for p in group["params"]:
342
+ n_params += p.numel()
343
+
344
+ accelerator.print(f"number of trainable parameters: {n_params}")
345
+
346
+ # prepare the classes needed for training
347
+ accelerator.print("prepare optimizer, data loader etc.")
348
+
349
+ if args.blockwise_fused_optimizers:
350
+ # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html
351
+ # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each block of parameters.
352
+ # This balances memory usage and management complexity.
353
+
354
+ # split params into groups. currently different learning rates are not supported
355
+ grouped_params = []
356
+ param_group = {}
357
+ for group in params_to_optimize:
358
+ named_parameters = list(flux.named_parameters())
359
+ assert len(named_parameters) == len(group["params"]), "number of parameters does not match"
360
+ for p, np in zip(group["params"], named_parameters):
361
+ # determine target layer and block index for each parameter
362
+ block_type = "other" # double, single or other
363
+ if np[0].startswith("double_blocks"):
364
+ block_index = int(np[0].split(".")[1])
365
+ block_type = "double"
366
+ elif np[0].startswith("single_blocks"):
367
+ block_index = int(np[0].split(".")[1])
368
+ block_type = "single"
369
+ else:
370
+ block_index = -1
371
+
372
+ param_group_key = (block_type, block_index)
373
+ if param_group_key not in param_group:
374
+ param_group[param_group_key] = []
375
+ param_group[param_group_key].append(p)
376
+
377
+ block_types_and_indices = []
378
+ for param_group_key, param_group in param_group.items():
379
+ block_types_and_indices.append(param_group_key)
380
+ grouped_params.append({"params": param_group, "lr": args.learning_rate})
381
+
382
+ num_params = 0
383
+ for p in param_group:
384
+ num_params += p.numel()
385
+ accelerator.print(f"block {param_group_key}: {num_params} parameters")
386
+
387
+ # prepare optimizers for each group
388
+ optimizers = []
389
+ for group in grouped_params:
390
+ _, _, optimizer = train_util.get_optimizer(args, trainable_params=[group])
391
+ optimizers.append(optimizer)
392
+ optimizer = optimizers[0] # avoid error in the following code
393
+
394
+ logger.info(f"using {len(optimizers)} optimizers for blockwise fused optimizers")
395
+
396
+ if train_util.is_schedulefree_optimizer(optimizers[0], args):
397
+ raise ValueError("Schedule-free optimizer is not supported with blockwise fused optimizers")
398
+ self.optimizer_train_fn = lambda: None # dummy function
399
+ self.optimizer_eval_fn = lambda: None # dummy function
400
+ else:
401
+ _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize)
402
+ self.optimizer_train_fn, self.optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args)
403
+
404
+ # prepare dataloader
405
+ # strategies are set here because they cannot be referenced in another process. Copy them with the dataset
406
+ # some strategies can be None
407
+ train_dataset_group.set_current_strategies()
408
+
409
+ # number of DataLoader workers: note that persistent_workers cannot be used with 0 workers
410
+ n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers
411
+ train_dataloader = torch.utils.data.DataLoader(
412
+ train_dataset_group,
413
+ batch_size=1,
414
+ shuffle=True,
415
+ collate_fn=collator,
416
+ num_workers=n_workers,
417
+ persistent_workers=args.persistent_data_loader_workers,
418
+ )
419
+
420
+ # calculate the number of training steps
421
+ if args.max_train_epochs is not None:
422
+ args.max_train_steps = args.max_train_epochs * math.ceil(
423
+ len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
424
+ )
425
+ accelerator.print(
426
+ f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
427
+ )
428
+
429
+ # also pass the number of training steps to the dataset
430
+ train_dataset_group.set_max_train_steps(args.max_train_steps)
431
+
432
+ # prepare the lr scheduler
433
+ if args.blockwise_fused_optimizers:
434
+ # prepare lr schedulers for each optimizer
435
+ lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers]
436
+ lr_scheduler = lr_schedulers[0] # avoid error in the following code
437
+ else:
438
+ lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes)
439
+
440
+ # experimental feature: train fully in fp16/bf16 including gradients; cast the whole model to fp16/bf16
441
+ if args.full_fp16:
442
+ assert (
443
+ args.mixed_precision == "fp16"
444
+ ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。"
445
+ accelerator.print("enable full fp16 training.")
446
+ flux.to(weight_dtype)
447
+ if clip_l is not None:
448
+ clip_l.to(weight_dtype)
449
+ t5xxl.to(weight_dtype) # TODO check works with fp16 or not
450
+ elif args.full_bf16:
451
+ assert (
452
+ args.mixed_precision == "bf16"
453
+ ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。"
454
+ accelerator.print("enable full bf16 training.")
455
+ flux.to(weight_dtype)
456
+ if clip_l is not None:
457
+ clip_l.to(weight_dtype)
458
+ t5xxl.to(weight_dtype)
459
+
460
+ # if we don't cache text encoder outputs, move them to device
461
+ if not args.cache_text_encoder_outputs:
462
+ clip_l.to(accelerator.device)
463
+ t5xxl.to(accelerator.device)
464
+
465
+ clean_memory_on_device(accelerator.device)
466
+
467
+ if args.deepspeed:
468
+ ds_model = deepspeed_utils.prepare_deepspeed_model(args, mmdit=flux)
469
+ # most ZeRO stages use optimizer partitioning, so we have to prepare the optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007
470
+ ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
471
+ ds_model, optimizer, train_dataloader, lr_scheduler
472
+ )
473
+ training_models = [ds_model]
474
+
475
+ else:
476
+ # accelerator does some magic
477
+ # if we don't swap blocks, we can move the model to the device
478
+ flux = accelerator.prepare(flux, device_placement=[not self.is_swapping_blocks])
479
+ if self.is_swapping_blocks:
480
+ accelerator.unwrap_model(flux).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage
481
+ optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
482
+
483
+ # experimental feature: train fully in fp16 including gradients; patch PyTorch to enable grad scaling in fp16
484
+ if args.full_fp16:
485
+ # During DeepSpeed training, accelerate does not handle fp16/bf16 mixed precision directly via the scaler; the DeepSpeed engine does.
486
+ # -> But we think it's ok to patch accelerator even if deepspeed is enabled.
487
+ train_util.patch_accelerator_for_fp16_training(accelerator)
488
+
489
+ # resume training if specified
490
+ train_util.resume_from_local_or_hf_if_specified(accelerator, args)
491
+
492
+ if args.fused_backward_pass:
493
+ # use fused optimizer for backward pass: other optimizers will be supported in the future
494
+ from .library import adafactor_fused
495
+
496
+ adafactor_fused.patch_adafactor_fused(optimizer)
497
+
498
+ for param_group, param_name_group in zip(optimizer.param_groups, param_names):
499
+ for parameter, param_name in zip(param_group["params"], param_name_group):
500
+ if parameter.requires_grad:
501
+
502
+ def create_grad_hook(p_name, p_group):
503
+ def grad_hook(tensor: torch.Tensor):
504
+ if accelerator.sync_gradients and args.max_grad_norm != 0.0:
505
+ accelerator.clip_grad_norm_(tensor, args.max_grad_norm)
506
+ optimizer.step_param(tensor, p_group)
507
+ tensor.grad = None
508
+
509
+ return grad_hook
510
+
511
+ parameter.register_post_accumulate_grad_hook(create_grad_hook(param_name, param_group))
512
+
513
+ elif args.blockwise_fused_optimizers:
514
+ # prepare for additional optimizers and lr schedulers
515
+ for i in range(1, len(optimizers)):
516
+ optimizers[i] = accelerator.prepare(optimizers[i])
517
+ lr_schedulers[i] = accelerator.prepare(lr_schedulers[i])
518
+
519
+ # counters are used to determine when to step the optimizer
520
+ global optimizer_hooked_count
521
+ global num_parameters_per_group
522
+ global parameter_optimizer_map
523
+
524
+ optimizer_hooked_count = {}
525
+ num_parameters_per_group = [0] * len(optimizers)
526
+ parameter_optimizer_map = {}
527
+
528
+ for opt_idx, optimizer in enumerate(optimizers):
529
+ for param_group in optimizer.param_groups:
530
+ for parameter in param_group["params"]:
531
+ if parameter.requires_grad:
532
+
533
+ def grad_hook(parameter: torch.Tensor):
534
+ if accelerator.sync_gradients and args.max_grad_norm != 0.0:
535
+ accelerator.clip_grad_norm_(parameter, args.max_grad_norm)
536
+
537
+ i = parameter_optimizer_map[parameter]
538
+ optimizer_hooked_count[i] += 1
539
+ if optimizer_hooked_count[i] == num_parameters_per_group[i]:
540
+ optimizers[i].step()
541
+ optimizers[i].zero_grad(set_to_none=True)
542
+
543
+ parameter.register_post_accumulate_grad_hook(grad_hook)
544
+ parameter_optimizer_map[parameter] = opt_idx
545
+ num_parameters_per_group[opt_idx] += 1
546
+
547
+ # calculate the number of epochs
548
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
549
+ num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
550
+ if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0):
551
+ args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1
552
+
553
+ # start training
554
+ # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
555
+ accelerator.print("running training")
556
+ accelerator.print(f" num examples: {train_dataset_group.num_train_images}")
557
+ accelerator.print(f" num batches per epoch: {len(train_dataloader)}")
558
+ accelerator.print(f" num epochs: {num_train_epochs}")
559
+ accelerator.print(
560
+ f" batch size per device: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}"
561
+ )
562
+ # accelerator.print(
563
+ # f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}"
564
+ # )
565
+ accelerator.print(f" gradient accumulation steps = {args.gradient_accumulation_steps}")
566
+ accelerator.print(f" total optimization steps: {args.max_train_steps}")
567
+
568
+ progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
569
+ self.global_step = 0
570
+
571
+ noise_scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift)
572
+ noise_scheduler_copy = copy.deepcopy(noise_scheduler)
573
+
574
+ if accelerator.is_main_process:
575
+ init_kwargs = {}
576
+ if args.wandb_run_name:
577
+ init_kwargs["wandb"] = {"name": args.wandb_run_name}
578
+ if args.log_tracker_config is not None:
579
+ init_kwargs = toml.load(args.log_tracker_config)
580
+ accelerator.init_trackers(
581
+ "finetuning" if args.log_tracker_name is None else args.log_tracker_name,
582
+ config=train_util.get_sanitized_config_or_none(args),
583
+ init_kwargs=init_kwargs,
584
+ )
585
+
586
+ if self.is_swapping_blocks:
587
+ accelerator.unwrap_model(flux).prepare_block_swap_before_forward()
588
+
589
+ # For --sample_at_first
590
+ #flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs)
591
+
592
+ self.loss_recorder = train_util.LossRecorder()
593
+ epoch = 0 # avoid error when max_train_steps is 0
594
+
595
+ self.tokens_and_masks = tokens_and_masks
596
+ self.num_train_epochs = num_train_epochs
597
+ self.current_epoch = current_epoch
598
+ self.args = args
599
+ self.accelerator = accelerator
600
+ self.unet = flux
601
+ self.vae = ae
602
+ self.text_encoder = [clip_l, t5xxl]
603
+ self.save_dtype = save_dtype
604
+
605
+ def training_loop(break_at_steps, epoch):
606
+ global optimizer_hooked_count
607
+ steps_done = 0
608
+ #accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}")
609
+ progress_bar.set_description(f"Epoch {epoch + 1}/{num_train_epochs} - steps")
610
+ current_epoch.value = epoch + 1
611
+
612
+ for m in training_models:
613
+ m.train()
614
+
615
+ for step, batch in enumerate(train_dataloader):
616
+ current_step.value = self.global_step
617
+
618
+ if args.blockwise_fused_optimizers:
619
+ optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset counter for each step
620
+
621
+ with accelerator.accumulate(*training_models):
622
+ if "latents" in batch and batch["latents"] is not None:
623
+ latents = batch["latents"].to(accelerator.device, dtype=weight_dtype)
624
+ else:
625
+ with torch.no_grad():
626
+ # encode images to latents. images are [-1, 1]
627
+ latents = ae.encode(batch["images"].to(ae.dtype)).to(accelerator.device, dtype=weight_dtype)
628
+
629
+ # if NaN is found in latents, show a warning and replace it with zeros
630
+ if torch.any(torch.isnan(latents)):
631
+ accelerator.print("NaN found in latents, replacing with zeros")
632
+ latents = torch.nan_to_num(latents, 0, out=latents)
633
+
634
+ text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None)
635
+ if text_encoder_outputs_list is not None:
636
+ text_encoder_conds = text_encoder_outputs_list
637
+ else:
638
+ # not cached or training, so get from text encoders
639
+ tokens_and_masks = batch["input_ids_list"]
640
+ with torch.no_grad():
641
+ input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]]
642
+ text_encoder_conds = text_encoding_strategy.encode_tokens(
643
+ flux_tokenize_strategy, [clip_l, t5xxl], input_ids, args.apply_t5_attn_mask
644
+ )
645
+ if args.full_fp16:
646
+ text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds]
647
+
648
+ # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps
649
+
650
+ # Sample noise that we'll add to the latents
651
+ noise = torch.randn_like(latents)
652
+ bsz = latents.shape[0]
653
+
654
+ # get noisy model input and timesteps
655
+ noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps(
656
+ args, noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype
657
+ )
658
+
659
+ # pack latents and get img_ids
660
+ packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4
661
+ packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2
662
+ img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device)
663
+
664
+ # get guidance: ensure args.guidance_scale is float
665
+ guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device)
666
+
667
+ # call model
668
+ l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds
669
+ if not args.apply_t5_attn_mask:
670
+ t5_attn_mask = None
671
+
672
+ if args.bypass_flux_guidance:
673
+ flux_utils.bypass_flux_guidance(flux)
674
+
675
+ with accelerator.autocast():
676
+ # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
677
+ model_pred = flux(
678
+ img=packed_noisy_model_input,
679
+ img_ids=img_ids,
680
+ txt=t5_out,
681
+ txt_ids=txt_ids,
682
+ y=l_pooled,
683
+ timesteps=timesteps / 1000,
684
+ guidance=guidance_vec,
685
+ txt_attention_mask=t5_attn_mask,
686
+ )
687
+
688
+ # unpack latents
689
+ model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width)
690
+
691
+ if args.bypass_flux_guidance:
692
+ flux_utils.restore_flux_guidance(flux)
693
+
694
+ # apply model prediction type
695
+ model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas)
696
+
697
+ # flow matching loss: this is different from SD3
698
+ target = noise - latents
699
+
700
+ # calculate loss
701
+ huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler)
702
+ loss = train_util.conditional_loss(model_pred.float(), target.float(), args.loss_type, "none", huber_c)
703
+ if weighting is not None:
704
+ loss = loss * weighting
705
+ if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None):
706
+ loss = apply_masked_loss(loss, batch)
707
+ loss = loss.mean([1, 2, 3])
708
+
709
+ loss_weights = batch["loss_weights"] # per-sample loss weights
710
+ loss = loss * loss_weights
711
+ loss = loss.mean()
712
+
713
+ # backward
714
+ accelerator.backward(loss)
715
+
716
+ if not (args.fused_backward_pass or args.blockwise_fused_optimizers):
717
+ if accelerator.sync_gradients and args.max_grad_norm != 0.0:
718
+ params_to_clip = []
719
+ for m in training_models:
720
+ params_to_clip.extend(m.parameters())
721
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
722
+
723
+ optimizer.step()
724
+ lr_scheduler.step()
725
+ optimizer.zero_grad(set_to_none=True)
726
+ else:
727
+ # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook
728
+ lr_scheduler.step()
729
+ if args.blockwise_fused_optimizers:
730
+ for i in range(1, len(optimizers)):
731
+ lr_schedulers[i].step()
732
+
733
+ # Checks if the accelerator has performed an optimization step behind the scenes
734
+ if accelerator.sync_gradients:
735
+ progress_bar.update(1)
736
+ self.global_step += 1
737
+
738
+
739
+ current_loss = loss.detach().item() # this is the mean, so batch size should not matter
740
+ if len(accelerator.trackers) > 0:
741
+ logs = {"loss": current_loss}
742
+ train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True)
743
+
744
+ accelerator.log(logs, step=self.global_step)
745
+
746
+ self.loss_recorder.add(epoch=epoch, step=step, loss=current_loss, global_step=self.global_step)
747
+ avr_loss: float = self.loss_recorder.moving_average
748
+ logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]}
749
+ progress_bar.set_postfix(**logs)
750
+
751
+ if self.global_step >= break_at_steps:
752
+ break
753
+ steps_done += 1
754
+
755
+ if len(accelerator.trackers) > 0:
756
+ logs = {"loss/epoch": self.loss_recorder.moving_average}
757
+ accelerator.log(logs, step=epoch + 1)
758
+ return steps_done
759
+
760
+ return training_loop
761
+
762
+ def setup_parser() -> argparse.ArgumentParser:
763
+ parser = argparse.ArgumentParser()
764
+
765
+ add_logging_arguments(parser)
766
+ train_util.add_sd_models_arguments(parser) # TODO split this
767
+ train_util.add_dataset_arguments(parser, True, True, True)
768
+ train_util.add_training_arguments(parser, False)
769
+ train_util.add_masked_loss_arguments(parser)
770
+ deepspeed_utils.add_deepspeed_arguments(parser)
771
+ train_util.add_sd_saving_arguments(parser)
772
+ train_util.add_optimizer_arguments(parser)
773
+ config_util.add_config_arguments(parser)
774
+ add_custom_train_arguments(parser) # TODO remove this from here
775
+ train_util.add_dit_training_arguments(parser)
776
+ flux_train_utils.add_flux_train_arguments(parser)
777
+
778
+ parser.add_argument(
779
+ "--mem_eff_save",
780
+ action="store_true",
781
+ help="[EXPERIMENTAL] use memory efficient custom model saving method / メモリ効率の良い独自のモデル保存方法を使う",
782
+ )
783
+
784
+ parser.add_argument(
785
+ "--fused_optimizer_groups",
786
+ type=int,
787
+ default=None,
788
+ help="**this option is not working** will be removed in the future / このオプションは動作しません。将来削除されます",
789
+ )
790
+ parser.add_argument(
791
+ "--blockwise_fused_optimizers",
792
+ action="store_true",
793
+ help="enable blockwise optimizers for fused backward pass and optimizer step / fused backward passとoptimizer step のためブロック単位のoptimizerを有効にする",
794
+ )
795
+ parser.add_argument(
796
+ "--skip_latents_validity_check",
797
+ action="store_true",
798
+ help="skip latents validity check / latentsの正当性チェックをスキップする",
799
+ )
800
+
801
+ parser.add_argument(
802
+ "--cpu_offload_checkpointing",
803
+ action="store_true",
804
+ help="[EXPERIMENTAL] enable offloading of tensors to CPU during checkpointing / チェックポイント時にテンソルをCPUにオフロードする",
805
+ )
806
+ return parser
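Note on the fused backward pass above: registering a post-accumulate-grad hook per parameter lets the optimizer step and free each gradient as soon as it is ready, instead of keeping every gradient alive until a single global optimizer.step(). A minimal sketch of the same pattern outside this trainer (the toy model, the per-parameter SGD optimizers, and the clipping value are illustrative assumptions; the script above uses Adafactor's patched step_param instead):

import torch

# minimal sketch (PyTorch >= 2.1): give each parameter its own optimizer and step it
# from a post-accumulate-grad hook so the gradient can be released immediately
model = torch.nn.Linear(8, 8)
optimizers = {p: torch.optim.SGD([p], lr=1e-2) for p in model.parameters()}

def step_and_release(p: torch.Tensor):
    torch.nn.utils.clip_grad_norm_(p, max_norm=1.0)  # optional, mirrors max_grad_norm above
    optimizers[p].step()
    p.grad = None  # free the gradient right away to lower peak memory

for p in model.parameters():
    if p.requires_grad:
        p.register_post_accumulate_grad_hook(step_and_release)

model(torch.randn(4, 8)).sum().backward()  # hooks fire as each gradient is accumulated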
flux_train_network_comfy.py ADDED
@@ -0,0 +1,500 @@
1
+ import torch
2
+ import copy
3
+ import math
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+ import argparse
6
+ from .library import flux_models, flux_train_utils, flux_utils, sd3_train_utils, strategy_base, strategy_flux, train_util
7
+ from .train_network import NetworkTrainer, clean_memory_on_device, setup_parser as train_network_setup_parser
8
+
9
+ from accelerate import Accelerator
10
+
11
+
12
+ import logging
13
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
+ logger = logging.getLogger(__name__)
15
+
16
+ class FluxNetworkTrainer(NetworkTrainer):
17
+ def __init__(self):
18
+ super().__init__()
19
+ self.sample_prompts_te_outputs = None
20
+ self.is_schnell: Optional[bool] = None
21
+ self.is_swapping_blocks: bool = False
22
+
23
+ def assert_extra_args(self, args, train_dataset_group):
24
+ super().assert_extra_args(args, train_dataset_group)
25
+ # sdxl_train_util.verify_sdxl_training_args(args)
26
+
27
+ if args.fp8_base_unet:
28
+ args.fp8_base = True # if fp8_base_unet is enabled, fp8_base is also enabled for FLUX.1
29
+
30
+ if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs:
31
+ logger.warning(
32
+ "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります"
33
+ )
34
+ args.cache_text_encoder_outputs = True
35
+
36
+ if args.cache_text_encoder_outputs:
37
+ assert (
38
+ train_dataset_group.is_text_encoder_output_cacheable()
39
+ ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません"
40
+
41
+ # prepare CLIP-L/T5XXL training flags
42
+ self.train_clip_l = not args.network_train_unet_only
43
+ self.train_t5xxl = False # default is False even if args.network_train_unet_only is False
44
+
45
+ if args.max_token_length is not None:
46
+ logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません")
47
+
48
+ assert (
49
+ args.blocks_to_swap is None or args.blocks_to_swap == 0
50
+ ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません"
51
+
52
+ train_dataset_group.verify_bucket_reso_steps(32) # TODO check this
53
+
54
+ def load_target_model(self, args, weight_dtype, accelerator):
55
+ # currently offload to cpu for some models
56
+
57
+ # if the file is fp8 and we are using fp8_base, we can load it as is (fp8)
58
+ loading_dtype = None if args.fp8_base else weight_dtype
59
+
60
+ # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future
61
+ self.is_schnell, model = flux_utils.load_flow_model(
62
+ args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors
63
+ )
64
+ if args.fp8_base:
65
+ # check dtype of model
66
+ if model.dtype == torch.float8_e4m3fnuz or model.dtype == torch.float8_e5m2fnuz:
67
+ raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}")
68
+ elif model.dtype == torch.float8_e4m3fn or model.dtype == torch.float8_e5m2:
69
+ logger.info(f"Loaded {model.dtype} FLUX model")
70
+
71
+ self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0
72
+ if self.is_swapping_blocks:
73
+ # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes.
74
+ logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}")
75
+ model.enable_block_swap(args.blocks_to_swap, accelerator.device)
76
+
77
+ clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors)
78
+ clip_l.eval()
79
+
80
+ # if the file is fp8 and we are using fp8_base (not unet), we can load it as is (fp8)
81
+ if args.fp8_base and not args.fp8_base_unet:
82
+ loading_dtype = None # as is
83
+ else:
84
+ loading_dtype = weight_dtype
85
+
86
+ # loading t5xxl to cpu takes a long time, so we should load to gpu in future
87
+ t5xxl = flux_utils.load_t5xxl(args.t5xxl, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors)
88
+ t5xxl.eval()
89
+ if args.fp8_base and not args.fp8_base_unet:
90
+ # check dtype of model
91
+ if t5xxl.dtype == torch.float8_e4m3fnuz or t5xxl.dtype == torch.float8_e5m2 or t5xxl.dtype == torch.float8_e5m2fnuz:
92
+ raise ValueError(f"Unsupported fp8 model dtype: {t5xxl.dtype}")
93
+ elif t5xxl.dtype == torch.float8_e4m3fn:
94
+ logger.info("Loaded fp8 T5XXL model")
95
+
96
+ ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors)
97
+
98
+ return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model
99
+
100
+ def get_tokenize_strategy(self, args):
101
+ _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path)
102
+
103
+ if args.t5xxl_max_token_length is None:
104
+ if is_schnell:
105
+ t5xxl_max_token_length = 256
106
+ else:
107
+ t5xxl_max_token_length = 512
108
+ else:
109
+ t5xxl_max_token_length = args.t5xxl_max_token_length
110
+
111
+ logger.info(f"t5xxl_max_token_length: {t5xxl_max_token_length}")
112
+ return strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length, args.tokenizer_cache_dir)
113
+
114
+ def get_tokenizers(self, tokenize_strategy: strategy_flux.FluxTokenizeStrategy):
115
+ return [tokenize_strategy.clip_l, tokenize_strategy.t5xxl]
116
+
117
+ def get_latents_caching_strategy(self, args):
118
+ latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy(args.cache_latents_to_disk, args.vae_batch_size, False)
119
+ return latents_caching_strategy
120
+
121
+ def get_text_encoding_strategy(self, args):
122
+ return strategy_flux.FluxTextEncodingStrategy(apply_t5_attn_mask=args.apply_t5_attn_mask)
123
+
124
+ def post_process_network(self, args, accelerator, network, text_encoders, unet):
125
+ # check t5xxl is trained or not
126
+ self.train_t5xxl = network.train_t5xxl
127
+
128
+ if self.train_t5xxl and args.cache_text_encoder_outputs:
129
+ raise ValueError(
130
+ "T5XXL is trained, so cache_text_encoder_outputs cannot be used / T5XXL学習時はcache_text_encoder_outputsは使用できません"
131
+ )
132
+
133
+ def get_models_for_text_encoding(self, args, accelerator, text_encoders):
134
+ if args.cache_text_encoder_outputs:
135
+ if self.train_clip_l and not self.train_t5xxl:
136
+ return text_encoders[0:1] # only CLIP-L is needed for encoding because T5XXL is cached
137
+ else:
138
+ return None # no text encoders are needed for encoding because both are cached
139
+ else:
140
+ return text_encoders # both CLIP-L and T5XXL are needed for encoding
141
+
142
+ def get_text_encoders_train_flags(self, args, text_encoders):
143
+ return [self.train_clip_l, self.train_t5xxl]
144
+
145
+ def get_text_encoder_outputs_caching_strategy(self, args):
146
+ if args.cache_text_encoder_outputs:
147
+ # if either text encoder is trained, tokenization is still needed, so is_partial is True
148
+ return strategy_flux.FluxTextEncoderOutputsCachingStrategy(
149
+ args.cache_text_encoder_outputs_to_disk,
150
+ args.text_encoder_batch_size,
151
+ args.skip_cache_check,
152
+ is_partial=self.train_clip_l or self.train_t5xxl,
153
+ apply_t5_attn_mask=args.apply_t5_attn_mask,
154
+ )
155
+ else:
156
+ return None
157
+
158
+ def cache_text_encoder_outputs_if_needed(
159
+ self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype
160
+ ):
161
+ if args.cache_text_encoder_outputs:
162
+ if not args.lowram:
163
+ # reduce memory consumption
164
+ logger.info("move vae and unet to cpu to save memory")
165
+ org_vae_device = vae.device
166
+ org_unet_device = unet.device
167
+ vae.to("cpu")
168
+ unet.to("cpu")
169
+ clean_memory_on_device(accelerator.device)
170
+
171
+ # When the TE is not trained, it is not prepared by accelerate, so we need to use explicit autocast
172
+ logger.info("move text encoders to gpu")
173
+ text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8
174
+ text_encoders[1].to(accelerator.device)
175
+
176
+ if text_encoders[1].dtype == torch.float8_e4m3fn:
177
+ # if we load fp8 weights, the model is already fp8, so we use it as is
178
+ self.prepare_text_encoder_fp8(1, text_encoders[1], text_encoders[1].dtype, weight_dtype)
179
+ else:
180
+ # otherwise, we need to convert it to target dtype
181
+ text_encoders[1].to(weight_dtype)
182
+
183
+ with accelerator.autocast():
184
+ dataset.new_cache_text_encoder_outputs(text_encoders, accelerator)
185
+
186
+ # cache sample prompts
187
+ if args.sample_prompts is not None:
188
+ logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}")
189
+
190
+ tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy()
191
+ text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy()
192
+
193
+ prompts = []
194
+ for line in args.sample_prompts:
195
+ line = line.strip()
196
+ if len(line) > 0 and line[0] != "#":
197
+ prompts.append(line)
198
+
199
+ # preprocess prompts
200
+ for i in range(len(prompts)):
201
+ prompt_dict = prompts[i]
202
+ if isinstance(prompt_dict, str):
203
+ from .library.train_util import line_to_prompt_dict
204
+
205
+ prompt_dict = line_to_prompt_dict(prompt_dict)
206
+ prompts[i] = prompt_dict
207
+ assert isinstance(prompt_dict, dict)
208
+
209
+ # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict.
210
+ prompt_dict["enum"] = i
211
+ prompt_dict.pop("subset", None)
212
+
213
+ sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs
214
+ with accelerator.autocast(), torch.no_grad():
215
+ for prompt_dict in prompts:
216
+ for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
217
+ if p not in sample_prompts_te_outputs:
218
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
219
+ tokens_and_masks = tokenize_strategy.tokenize(p)
220
+ sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens(
221
+ tokenize_strategy, text_encoders, tokens_and_masks, args.apply_t5_attn_mask
222
+ )
223
+ self.sample_prompts_te_outputs = sample_prompts_te_outputs
224
+
225
+ accelerator.wait_for_everyone()
226
+
227
+ # move back to cpu
228
+ if not self.is_train_text_encoder(args):
229
+ logger.info("move CLIP-L back to cpu")
230
+ text_encoders[0].to("cpu")
231
+ logger.info("move t5XXL back to cpu")
232
+ text_encoders[1].to("cpu")
233
+ clean_memory_on_device(accelerator.device)
234
+
235
+ if not args.lowram:
236
+ logger.info("move vae and unet back to original device")
237
+ vae.to(org_vae_device)
238
+ unet.to(org_unet_device)
239
+ else:
240
+ # Text Encoder
241
+ text_encoders[0].to(accelerator.device, dtype=weight_dtype)
242
+ text_encoders[1].to(accelerator.device)
243
+
244
+ def sample_images(self, epoch, global_step, validation_settings):
245
+ text_encoders = self.get_models_for_text_encoding(self.args, self.accelerator, self.text_encoder)
246
+
247
+ image_tensors = flux_train_utils.sample_images(
248
+ self.accelerator, self.args, epoch, global_step, self.unet, self.vae, text_encoders, self.sample_prompts_te_outputs, validation_settings)
249
+ clean_memory_on_device(self.accelerator.device)
250
+ return image_tensors
251
+
252
+ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any:
253
+ noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift)
254
+ self.noise_scheduler_copy = copy.deepcopy(noise_scheduler)
255
+ return noise_scheduler
256
+
257
+ def encode_images_to_latents(self, args, accelerator, vae, images):
258
+ return vae.encode(images)
259
+
260
+ def shift_scale_latents(self, args, latents):
261
+ return latents
262
+
263
+ def get_noise_pred_and_target(
264
+ self,
265
+ args,
266
+ accelerator,
267
+ noise_scheduler,
268
+ latents,
269
+ batch,
270
+ text_encoder_conds,
271
+ unet: flux_models.Flux,
272
+ network,
273
+ weight_dtype,
274
+ train_unet,
275
+ ):
276
+ # Sample noise that we'll add to the latents
277
+ noise = torch.randn_like(latents)
278
+ bsz = latents.shape[0]
279
+
280
+ # get noisy model input and timesteps
281
+ noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps(
282
+ args, noise_scheduler, latents, noise, accelerator.device, weight_dtype
283
+ )
284
+
285
+ # pack latents and get img_ids
286
+ packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4
287
+ packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2
288
+ img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device)
289
+
290
+ # get guidance
291
+ # ensure guidance_scale in args is float
292
+ guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device)
293
+
294
+ # ensure the hidden state will require grad
295
+ if args.gradient_checkpointing:
296
+ noisy_model_input.requires_grad_(True)
297
+ for t in text_encoder_conds:
298
+ if t is not None and t.dtype.is_floating_point:
299
+ t.requires_grad_(True)
300
+ img_ids.requires_grad_(True)
301
+ guidance_vec.requires_grad_(True)
302
+
303
+ # Predict the noise residual
304
+ l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds
305
+ if not args.apply_t5_attn_mask:
306
+ t5_attn_mask = None
307
+
308
+ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask):
309
+ # normal forward
310
+ with accelerator.autocast():
311
+ # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
312
+ model_pred = unet(
313
+ img=img,
314
+ img_ids=img_ids,
315
+ txt=t5_out,
316
+ txt_ids=txt_ids,
317
+ y=l_pooled,
318
+ timesteps=timesteps / 1000,
319
+ guidance=guidance_vec,
320
+ txt_attention_mask=t5_attn_mask,
321
+ )
322
+ """
323
+ else:
324
+ # split forward to reduce memory usage
325
+ assert network.train_blocks == "single", "train_blocks must be single for split mode"
326
+ with accelerator.autocast():
327
+ # move flux lower to cpu, and then move flux upper to gpu
328
+ unet.to("cpu")
329
+ clean_memory_on_device(accelerator.device)
330
+ self.flux_upper.to(accelerator.device)
331
+
332
+ # upper model does not require grad
333
+ with torch.no_grad():
334
+ intermediate_img, intermediate_txt, vec, pe = self.flux_upper(
335
+ img=packed_noisy_model_input,
336
+ img_ids=img_ids,
337
+ txt=t5_out,
338
+ txt_ids=txt_ids,
339
+ y=l_pooled,
340
+ timesteps=timesteps / 1000,
341
+ guidance=guidance_vec,
342
+ txt_attention_mask=t5_attn_mask,
343
+ )
344
+
345
+ # move flux upper back to cpu, and then move flux lower to gpu
346
+ self.flux_upper.to("cpu")
347
+ clean_memory_on_device(accelerator.device)
348
+ unet.to(accelerator.device)
349
+
350
+ # lower model requires grad
351
+ intermediate_img.requires_grad_(True)
352
+ intermediate_txt.requires_grad_(True)
353
+ vec.requires_grad_(True)
354
+ pe.requires_grad_(True)
355
+ model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask)
356
+ """
357
+
358
+ return model_pred
359
+
360
+ if args.bypass_flux_guidance:
361
+ flux_utils.bypass_flux_guidance(unet)
362
+
363
+ model_pred = call_dit(
364
+ img=packed_noisy_model_input,
365
+ img_ids=img_ids,
366
+ t5_out=t5_out,
367
+ txt_ids=txt_ids,
368
+ l_pooled=l_pooled,
369
+ timesteps=timesteps,
370
+ guidance_vec=guidance_vec,
371
+ t5_attn_mask=t5_attn_mask,
372
+ )
373
+
374
+ # unpack latents
375
+ model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width)
376
+
377
+ if args.bypass_flux_guidance:  # for Flex
378
+ flux_utils.restore_flux_guidance(unet)
379
+
380
+ # apply model prediction type
381
+ model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas)
382
+
383
+ # flow matching loss: this is different from SD3
384
+ target = noise - latents
385
+
386
+ # differential output preservation
387
+ if "custom_attributes" in batch:
388
+ diff_output_pr_indices = []
389
+ for i, custom_attributes in enumerate(batch["custom_attributes"]):
390
+ if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]:
391
+ diff_output_pr_indices.append(i)
392
+
393
+ if len(diff_output_pr_indices) > 0:
394
+ network.set_multiplier(0.0)
395
+ unet.prepare_block_swap_before_forward()
396
+ with torch.no_grad():
397
+ model_pred_prior = call_dit(
398
+ img=packed_noisy_model_input[diff_output_pr_indices],
399
+ img_ids=img_ids[diff_output_pr_indices],
400
+ t5_out=t5_out[diff_output_pr_indices],
401
+ txt_ids=txt_ids[diff_output_pr_indices],
402
+ l_pooled=l_pooled[diff_output_pr_indices],
403
+ timesteps=timesteps[diff_output_pr_indices],
404
+ guidance_vec=guidance_vec[diff_output_pr_indices] if guidance_vec is not None else None,
405
+ t5_attn_mask=t5_attn_mask[diff_output_pr_indices] if t5_attn_mask is not None else None,
406
+ )
407
+ network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step
408
+
409
+ model_pred_prior = flux_utils.unpack_latents(model_pred_prior, packed_latent_height, packed_latent_width)
410
+ model_pred_prior, _ = flux_train_utils.apply_model_prediction_type(
411
+ args,
412
+ model_pred_prior,
413
+ noisy_model_input[diff_output_pr_indices],
414
+ sigmas[diff_output_pr_indices] if sigmas is not None else None,
415
+ )
416
+ target[diff_output_pr_indices] = model_pred_prior.to(target.dtype)
417
+
418
+ return model_pred, target, timesteps, weighting
419
+
420
+ def post_process_loss(self, loss, args, timesteps, noise_scheduler):
421
+ return loss
422
+
423
+ def get_sai_model_spec(self, args):
424
+ return train_util.get_sai_model_spec(None, args, False, True, False, flux="dev")
425
+
426
+ def update_metadata(self, metadata, args):
427
+ metadata["ss_apply_t5_attn_mask"] = args.apply_t5_attn_mask
428
+ metadata["ss_weighting_scheme"] = args.weighting_scheme
429
+ metadata["ss_logit_mean"] = args.logit_mean
430
+ metadata["ss_logit_std"] = args.logit_std
431
+ metadata["ss_mode_scale"] = args.mode_scale
432
+ metadata["ss_guidance_scale"] = args.guidance_scale
433
+ metadata["ss_timestep_sampling"] = args.timestep_sampling
434
+ metadata["ss_sigmoid_scale"] = args.sigmoid_scale
435
+ metadata["ss_model_prediction_type"] = args.model_prediction_type
436
+ metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift
437
+
438
+ def is_text_encoder_not_needed_for_training(self, args):
439
+ return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args)
440
+
441
+ def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder):
442
+ if index == 0: # CLIP-L
443
+ return super().prepare_text_encoder_grad_ckpt_workaround(index, text_encoder)
444
+ else: # T5XXL
445
+ text_encoder.encoder.embed_tokens.requires_grad_(True)
446
+
447
+ def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype):
448
+ if index == 0: # CLIP-L
449
+ logger.info(f"prepare CLIP-L for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}")
450
+ text_encoder.to(te_weight_dtype) # fp8
451
+ text_encoder.text_model.embeddings.to(dtype=weight_dtype)
452
+ else: # T5XXL
453
+
454
+ def prepare_fp8(text_encoder, target_dtype):
455
+ def forward_hook(module):
456
+ def forward(hidden_states):
457
+ hidden_gelu = module.act(module.wi_0(hidden_states))
458
+ hidden_linear = module.wi_1(hidden_states)
459
+ hidden_states = hidden_gelu * hidden_linear
460
+ hidden_states = module.dropout(hidden_states)
461
+
462
+ hidden_states = module.wo(hidden_states)
463
+ return hidden_states
464
+
465
+ return forward
466
+
467
+ for module in text_encoder.modules():
468
+ if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]:
469
+ # print("set", module.__class__.__name__, "to", target_dtype)
470
+ module.to(target_dtype)
471
+ if module.__class__.__name__ in ["T5DenseGatedActDense"]:
472
+ # print("set", module.__class__.__name__, "hooks")
473
+ module.forward = forward_hook(module)
474
+
475
+ if flux_utils.get_t5xxl_actual_dtype(text_encoder) == torch.float8_e4m3fn and text_encoder.dtype == weight_dtype:
476
+ logger.info(f"T5XXL already prepared for fp8")
477
+ else:
478
+ logger.info(f"prepare T5XXL for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}, add hooks")
479
+ text_encoder.to(te_weight_dtype) # fp8
480
+ prepare_fp8(text_encoder, weight_dtype)
481
+
482
+ def prepare_unet_with_accelerator(
483
+ self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module
484
+ ) -> torch.nn.Module:
485
+ if not self.is_swapping_blocks:
486
+ return super().prepare_unet_with_accelerator(args, accelerator, unet)
487
+
488
+ # block swapping is enabled: move the model to the device except for the swapped blocks
489
+ flux: flux_models.Flux = unet
490
+ flux = accelerator.prepare(flux, device_placement=[not self.is_swapping_blocks])
491
+ accelerator.unwrap_model(flux).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage
492
+ accelerator.unwrap_model(flux).prepare_block_swap_before_forward()
493
+
494
+ return flux
495
+
496
+
497
+ def setup_parser() -> argparse.ArgumentParser:
498
+ parser = train_network_setup_parser() # base parser from train_network; avoid recursing into this function
499
+ train_util.add_dit_training_arguments(parser)
500
+ flux_train_utils.add_flux_train_arguments(parser)
+ return parser
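For reference, the flow-matching objective used in get_noise_pred_and_target interpolates the clean latents toward noise and regresses the model output against the velocity noise - latents. A self-contained sketch of that target construction (shapes, the uniform sigma sampling, and the stand-in prediction are illustrative assumptions, not the exact sampling logic of flux_train_utils):

import torch

# illustrative shapes only: (batch, channels, height, width) latents
latents = torch.randn(2, 16, 64, 64)
noise = torch.randn_like(latents)
sigmas = torch.rand(latents.shape[0]).view(-1, 1, 1, 1)   # one noise level per sample

# rectified-flow interpolation between data and noise
noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise

# the network predicts the velocity; the training target is noise - latents
model_pred = torch.randn_like(latents)                     # stand-in for the FLUX transformer output
target = noise - latents
loss = torch.nn.functional.mse_loss(model_pred.float(), target.float())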
hf_token.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "hf_token": "your_token_here"
3
+ }
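This placeholder file is presumably read by the Gradio app to authenticate Hugging Face uploads; a minimal way to load the token (the consuming code in app.py is not shown here):

import json

with open("hf_token.json") as f:
    hf_token = json.load(f)["hf_token"]  # replace "your_token_here" with a real token first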
icon.png ADDED
install.js ADDED
@@ -0,0 +1,96 @@
1
+ module.exports = {
2
+ run: [
3
+ {
4
+ method: "shell.run",
5
+ params: {
6
+ venv: "env",
7
+ message: [
8
+ "git config --global --add safe.directory '*'",
9
+ "git clone -b sd3 https://github.com/kohya-ss/sd-scripts"
10
+ ]
11
+ }
12
+ },
13
+ {
14
+ method: "shell.run",
15
+ params: {
16
+ path: "sd-scripts",
17
+ venv: "../env",
18
+ message: [
19
+ "uv pip install -r requirements.txt",
20
+ ]
21
+ }
22
+ },
23
+ {
24
+ method: "shell.run",
25
+ params: {
26
+ venv: "env",
27
+ message: [
28
+ "pip uninstall -y diffusers[torch] torch torchaudio torchvision",
29
+ "uv pip install -r requirements.txt",
30
+ ]
31
+ }
32
+ },
33
+ {
34
+ method: "script.start",
35
+ params: {
36
+ uri: "torch.js",
37
+ params: {
38
+ venv: "env",
39
+ // xformers: true // uncomment this line if your project requires xformers
40
+ }
41
+ }
42
+ },
43
+ {
44
+ method: "fs.link",
45
+ params: {
46
+ drive: {
47
+ vae: "models/vae",
48
+ clip: "models/clip",
49
+ unet: "models/unet",
50
+ loras: "outputs",
51
+ },
52
+ peers: [
53
+ "https://github.com/pinokiofactory/stable-diffusion-webui-forge.git",
54
+ "https://github.com/pinokiofactory/comfy.git",
55
+ "https://github.com/cocktailpeanutlabs/comfyui.git",
56
+ "https://github.com/cocktailpeanutlabs/fooocus.git",
57
+ "https://github.com/cocktailpeanutlabs/automatic1111.git",
58
+ ]
59
+ }
60
+ },
61
+ // {
62
+ // method: "fs.download",
63
+ // params: {
64
+ // uri: [
65
+ // "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors?download=true",
66
+ // "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors?download=true",
67
+ // ],
68
+ // dir: "models/clip"
69
+ // }
70
+ // },
71
+ // {
72
+ // method: "fs.download",
73
+ // params: {
74
+ // uri: [
75
+ // "https://huggingface.co/cocktailpeanut/xulf-dev/resolve/main/ae.sft?download=true",
76
+ // ],
77
+ // dir: "models/vae"
78
+ // }
79
+ // },
80
+ // {
81
+ // method: "fs.download",
82
+ // params: {
83
+ // uri: [
84
+ // "https://huggingface.co/cocktailpeanut/xulf-dev/resolve/main/flux1-dev.sft?download=true",
85
+ // ],
86
+ // dir: "models/unet"
87
+ // }
88
+ // },
89
+ {
90
+ method: "fs.link",
91
+ params: {
92
+ venv: "env"
93
+ }
94
+ }
95
+ ]
96
+ }
library/__init__.py ADDED
File without changes
library/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (163 Bytes). View file
 
library/__pycache__/config_util.cpython-310.pyc ADDED
Binary file (20.2 kB). View file
 
library/__pycache__/custom_offloading_utils.cpython-310.pyc ADDED
Binary file (6.98 kB). View file
 
library/__pycache__/custom_train_functions.cpython-310.pyc ADDED
Binary file (13.5 kB). View file
 
library/__pycache__/deepspeed_utils.cpython-310.pyc ADDED
Binary file (4.79 kB). View file
 
library/__pycache__/device_utils.cpython-310.pyc ADDED
Binary file (2.07 kB). View file
 
library/__pycache__/flux_models.cpython-310.pyc ADDED
Binary file (30.6 kB). View file
 
library/__pycache__/flux_train_utils.cpython-310.pyc ADDED
Binary file (14.8 kB). View file
 
library/__pycache__/flux_utils.cpython-310.pyc ADDED
Binary file (16.5 kB). View file
 
library/__pycache__/huggingface_util.cpython-310.pyc ADDED
Binary file (2.79 kB). View file
 
library/__pycache__/model_util.cpython-310.pyc ADDED
Binary file (32.8 kB). View file
 
library/__pycache__/original_unet.cpython-310.pyc ADDED
Binary file (44.1 kB). View file
 
library/__pycache__/sai_model_spec.cpython-310.pyc ADDED
Binary file (5.68 kB). View file
 
library/__pycache__/sd3_models.cpython-310.pyc ADDED
Binary file (38.8 kB). View file
 
library/__pycache__/sd3_utils.cpython-310.pyc ADDED
Binary file (8.48 kB). View file
 
library/__pycache__/strategy_base.cpython-310.pyc ADDED
Binary file (17.4 kB). View file
 
library/__pycache__/strategy_sd.cpython-310.pyc ADDED
Binary file (6.7 kB). View file
 
library/__pycache__/train_util.cpython-310.pyc ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa71a44895d0a006e41ba9fadbd0177a9ad5499cc89aeb2266aa1c7a9597e82e
3
+ size 164434
library/__pycache__/utils.cpython-310.pyc ADDED
Binary file (15.9 kB). View file
 
library/adafactor_fused.py ADDED
@@ -0,0 +1,138 @@
1
+ import math
2
+ import torch
3
+ from transformers import Adafactor
4
+
5
+ # stochastic rounding for bfloat16
6
+ # The implementation was provided by 2kpr. Thank you very much!
7
+
8
+ def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
9
+ """
10
+ copies source into target using stochastic rounding
11
+
12
+ Args:
13
+ target: the target tensor with dtype=bfloat16
14
+ source: the source tensor with dtype=float32
15
+ """
16
+ # create a random 16 bit integer
17
+ result = torch.randint_like(source, dtype=torch.int32, low=0, high=(1 << 16))
18
+
19
+ # add the random number to the lower 16 bit of the mantissa
20
+ result.add_(source.view(dtype=torch.int32))
21
+
22
+ # mask off the lower 16 bit of the mantissa
23
+ result.bitwise_and_(-65536) # -65536 = FFFF0000 as a signed int32
24
+
25
+ # copy the higher 16 bit into the target tensor
26
+ target.copy_(result.view(dtype=torch.float32))
27
+
28
+ del result
29
+
30
+
31
+ @torch.no_grad()
32
+ def adafactor_step_param(self, p, group):
33
+ if p.grad is None:
34
+ return
35
+ grad = p.grad
36
+ if grad.dtype in {torch.float16, torch.bfloat16}:
37
+ grad = grad.float()
38
+ if grad.is_sparse:
39
+ raise RuntimeError("Adafactor does not support sparse gradients.")
40
+
41
+ state = self.state[p]
42
+ grad_shape = grad.shape
43
+
44
+ factored, use_first_moment = Adafactor._get_options(group, grad_shape)
45
+ # State Initialization
46
+ if len(state) == 0:
47
+ state["step"] = 0
48
+
49
+ if use_first_moment:
50
+ # Exponential moving average of gradient values
51
+ state["exp_avg"] = torch.zeros_like(grad)
52
+ if factored:
53
+ state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad)
54
+ state["exp_avg_sq_col"] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad)
55
+ else:
56
+ state["exp_avg_sq"] = torch.zeros_like(grad)
57
+
58
+ state["RMS"] = 0
59
+ else:
60
+ if use_first_moment:
61
+ state["exp_avg"] = state["exp_avg"].to(grad)
62
+ if factored:
63
+ state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad)
64
+ state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad)
65
+ else:
66
+ state["exp_avg_sq"] = state["exp_avg_sq"].to(grad)
67
+
68
+ p_data_fp32 = p
69
+ if p.dtype in {torch.float16, torch.bfloat16}:
70
+ p_data_fp32 = p_data_fp32.float()
71
+
72
+ state["step"] += 1
73
+ state["RMS"] = Adafactor._rms(p_data_fp32)
74
+ lr = Adafactor._get_lr(group, state)
75
+
76
+ beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
77
+ update = (grad**2) + group["eps"][0]
78
+ if factored:
79
+ exp_avg_sq_row = state["exp_avg_sq_row"]
80
+ exp_avg_sq_col = state["exp_avg_sq_col"]
81
+
82
+ exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=(1.0 - beta2t))
83
+ exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=(1.0 - beta2t))
84
+
85
+ # Approximation of exponential moving average of square of gradient
86
+ update = Adafactor._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
87
+ update.mul_(grad)
88
+ else:
89
+ exp_avg_sq = state["exp_avg_sq"]
90
+
91
+ exp_avg_sq.mul_(beta2t).add_(update, alpha=(1.0 - beta2t))
92
+ update = exp_avg_sq.rsqrt().mul_(grad)
93
+
94
+ update.div_((Adafactor._rms(update) / group["clip_threshold"]).clamp_(min=1.0))
95
+ update.mul_(lr)
96
+
97
+ if use_first_moment:
98
+ exp_avg = state["exp_avg"]
99
+ exp_avg.mul_(group["beta1"]).add_(update, alpha=(1 - group["beta1"]))
100
+ update = exp_avg
101
+
102
+ if group["weight_decay"] != 0:
103
+ p_data_fp32.add_(p_data_fp32, alpha=(-group["weight_decay"] * lr))
104
+
105
+ p_data_fp32.add_(-update)
106
+
107
+ # if p.dtype in {torch.float16, torch.bfloat16}:
108
+ # p.copy_(p_data_fp32)
109
+
110
+ if p.dtype == torch.bfloat16:
111
+ copy_stochastic_(p, p_data_fp32)
112
+ elif p.dtype == torch.float16:
113
+ p.copy_(p_data_fp32)
114
+
115
+
116
+ @torch.no_grad()
117
+ def adafactor_step(self, closure=None):
118
+ """
119
+ Performs a single optimization step
120
+
121
+ Arguments:
122
+ closure (callable, optional): A closure that reevaluates the model
123
+ and returns the loss.
124
+ """
125
+ loss = None
126
+ if closure is not None:
127
+ loss = closure()
128
+
129
+ for group in self.param_groups:
130
+ for p in group["params"]:
131
+ adafactor_step_param(self, p, group)
132
+
133
+ return loss
134
+
135
+
136
+ def patch_adafactor_fused(optimizer: Adafactor):
137
+ optimizer.step_param = adafactor_step_param.__get__(optimizer)
138
+ optimizer.step = adafactor_step.__get__(optimizer)
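The copy_stochastic_ helper above performs stochastic rounding when writing fp32 Adafactor updates back into bf16 parameters: a random value is added below the bf16 mantissa before truncation, so rounding is unbiased in expectation. A quick demonstration of the effect (import path and sample values are illustrative assumptions):

import torch
from library.adafactor_fused import copy_stochastic_  # assumes the repo root is on sys.path

# 1.0001 is not representable in bf16; a plain cast rounds every element to 1.0,
# while stochastic rounding occasionally rounds up, so the mean stays near 1.0001
source = torch.full((10_000,), 1.0001, dtype=torch.float32)
target = torch.empty_like(source, dtype=torch.bfloat16)
copy_stochastic_(target, source)

print(source.to(torch.bfloat16).float().mean())  # exactly 1.0
print(target.float().mean())                     # close to 1.0001 on average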
library/attention_processors.py ADDED
@@ -0,0 +1,227 @@
1
+ import math
2
+ from typing import Any
3
+ from einops import rearrange
4
+ import torch
5
+ from diffusers.models.attention_processor import Attention
6
+
7
+
8
+ # flash attention forwards and backwards
9
+
10
+ # https://arxiv.org/abs/2205.14135
11
+
12
+ EPSILON = 1e-6
13
+
14
+
15
+ class FlashAttentionFunction(torch.autograd.function.Function):
16
+ @staticmethod
17
+ @torch.no_grad()
18
+ def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
19
+ """Algorithm 2 in the paper"""
20
+
21
+ device = q.device
22
+ dtype = q.dtype
23
+ max_neg_value = -torch.finfo(q.dtype).max
24
+ qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
25
+
26
+ o = torch.zeros_like(q)
27
+ all_row_sums = torch.zeros((*q.shape[:-1], 1), dtype=dtype, device=device)
28
+ all_row_maxes = torch.full(
29
+ (*q.shape[:-1], 1), max_neg_value, dtype=dtype, device=device
30
+ )
31
+
32
+ scale = q.shape[-1] ** -0.5
33
+
34
+ if mask is None:
35
+ mask = (None,) * math.ceil(q.shape[-2] / q_bucket_size)
36
+ else:
37
+ mask = rearrange(mask, "b n -> b 1 1 n")
38
+ mask = mask.split(q_bucket_size, dim=-1)
39
+
40
+ row_splits = zip(
41
+ q.split(q_bucket_size, dim=-2),
42
+ o.split(q_bucket_size, dim=-2),
43
+ mask,
44
+ all_row_sums.split(q_bucket_size, dim=-2),
45
+ all_row_maxes.split(q_bucket_size, dim=-2),
46
+ )
47
+
48
+ for ind, (qc, oc, row_mask, row_sums, row_maxes) in enumerate(row_splits):
49
+ q_start_index = ind * q_bucket_size - qk_len_diff
50
+
51
+ col_splits = zip(
52
+ k.split(k_bucket_size, dim=-2),
53
+ v.split(k_bucket_size, dim=-2),
54
+ )
55
+
56
+ for k_ind, (kc, vc) in enumerate(col_splits):
57
+ k_start_index = k_ind * k_bucket_size
58
+
59
+ attn_weights = (
60
+ torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
61
+ )
62
+
63
+ if row_mask is not None:
64
+ attn_weights.masked_fill_(~row_mask, max_neg_value)
65
+
66
+ if causal and q_start_index < (k_start_index + k_bucket_size - 1):
67
+ causal_mask = torch.ones(
68
+ (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
69
+ ).triu(q_start_index - k_start_index + 1)
70
+ attn_weights.masked_fill_(causal_mask, max_neg_value)
71
+
72
+ block_row_maxes = attn_weights.amax(dim=-1, keepdims=True)
73
+ attn_weights -= block_row_maxes
74
+ exp_weights = torch.exp(attn_weights)
75
+
76
+ if row_mask is not None:
77
+ exp_weights.masked_fill_(~row_mask, 0.0)
78
+
79
+ block_row_sums = exp_weights.sum(dim=-1, keepdims=True).clamp(
80
+ min=EPSILON
81
+ )
82
+
83
+ new_row_maxes = torch.maximum(block_row_maxes, row_maxes)
84
+
85
+ exp_values = torch.einsum(
86
+ "... i j, ... j d -> ... i d", exp_weights, vc
87
+ )
88
+
89
+ exp_row_max_diff = torch.exp(row_maxes - new_row_maxes)
90
+ exp_block_row_max_diff = torch.exp(block_row_maxes - new_row_maxes)
91
+
92
+ new_row_sums = (
93
+ exp_row_max_diff * row_sums
94
+ + exp_block_row_max_diff * block_row_sums
95
+ )
96
+
97
+ oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_(
98
+ (exp_block_row_max_diff / new_row_sums) * exp_values
99
+ )
100
+
101
+ row_maxes.copy_(new_row_maxes)
102
+ row_sums.copy_(new_row_sums)
103
+
104
+ ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size)
105
+ ctx.save_for_backward(q, k, v, o, all_row_sums, all_row_maxes)
106
+
107
+ return o
108
+
109
+ @staticmethod
110
+ @torch.no_grad()
111
+ def backward(ctx, do):
112
+ """Algorithm 4 in the paper"""
113
+
114
+ causal, scale, mask, q_bucket_size, k_bucket_size = ctx.args
115
+ q, k, v, o, l, m = ctx.saved_tensors
116
+
117
+ device = q.device
118
+
119
+ max_neg_value = -torch.finfo(q.dtype).max
120
+ qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
121
+
122
+ dq = torch.zeros_like(q)
123
+ dk = torch.zeros_like(k)
124
+ dv = torch.zeros_like(v)
125
+
126
+ row_splits = zip(
127
+ q.split(q_bucket_size, dim=-2),
128
+ o.split(q_bucket_size, dim=-2),
129
+ do.split(q_bucket_size, dim=-2),
130
+ mask,
131
+ l.split(q_bucket_size, dim=-2),
132
+ m.split(q_bucket_size, dim=-2),
133
+ dq.split(q_bucket_size, dim=-2),
134
+ )
135
+
136
+ for ind, (qc, oc, doc, row_mask, lc, mc, dqc) in enumerate(row_splits):
137
+ q_start_index = ind * q_bucket_size - qk_len_diff
138
+
139
+ col_splits = zip(
140
+ k.split(k_bucket_size, dim=-2),
141
+ v.split(k_bucket_size, dim=-2),
142
+ dk.split(k_bucket_size, dim=-2),
143
+ dv.split(k_bucket_size, dim=-2),
144
+ )
145
+
146
+ for k_ind, (kc, vc, dkc, dvc) in enumerate(col_splits):
147
+ k_start_index = k_ind * k_bucket_size
148
+
149
+ attn_weights = (
150
+ torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
151
+ )
152
+
153
+ if causal and q_start_index < (k_start_index + k_bucket_size - 1):
154
+ causal_mask = torch.ones(
155
+ (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
156
+ ).triu(q_start_index - k_start_index + 1)
157
+ attn_weights.masked_fill_(causal_mask, max_neg_value)
158
+
159
+ exp_attn_weights = torch.exp(attn_weights - mc)
160
+
161
+ if row_mask is not None:
162
+ exp_attn_weights.masked_fill_(~row_mask, 0.0)
163
+
164
+ p = exp_attn_weights / lc
165
+
166
+ dv_chunk = torch.einsum("... i j, ... i d -> ... j d", p, doc)
167
+ dp = torch.einsum("... i d, ... j d -> ... i j", doc, vc)
168
+
169
+ D = (doc * oc).sum(dim=-1, keepdims=True)
170
+ ds = p * scale * (dp - D)
171
+
172
+ dq_chunk = torch.einsum("... i j, ... j d -> ... i d", ds, kc)
173
+ dk_chunk = torch.einsum("... i j, ... i d -> ... j d", ds, qc)
174
+
175
+ dqc.add_(dq_chunk)
176
+ dkc.add_(dk_chunk)
177
+ dvc.add_(dv_chunk)
178
+
179
+ return dq, dk, dv, None, None, None, None
180
+
181
+
182
+ class FlashAttnProcessor:
183
+ def __call__(
184
+ self,
185
+ attn: Attention,
186
+ hidden_states,
187
+ encoder_hidden_states=None,
188
+ attention_mask=None,
189
+ ) -> Any:
190
+ q_bucket_size = 512
191
+ k_bucket_size = 1024
192
+
193
+ h = attn.heads
194
+ q = attn.to_q(hidden_states)
195
+
196
+ encoder_hidden_states = (
197
+ encoder_hidden_states
198
+ if encoder_hidden_states is not None
199
+ else hidden_states
200
+ )
201
+ encoder_hidden_states = encoder_hidden_states.to(hidden_states.dtype)
202
+
203
+ if hasattr(attn, "hypernetwork") and attn.hypernetwork is not None:
204
+ context_k, context_v = attn.hypernetwork.forward(
205
+ hidden_states, encoder_hidden_states
206
+ )
207
+ context_k = context_k.to(hidden_states.dtype)
208
+ context_v = context_v.to(hidden_states.dtype)
209
+ else:
210
+ context_k = encoder_hidden_states
211
+ context_v = encoder_hidden_states
212
+
213
+ k = attn.to_k(context_k)
214
+ v = attn.to_v(context_v)
215
+ del encoder_hidden_states, hidden_states
216
+
217
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
218
+
219
+ out = FlashAttentionFunction.apply(
220
+ q, k, v, attention_mask, False, q_bucket_size, k_bucket_size
221
+ )
222
+
223
+ out = rearrange(out, "b h n d -> b n (h d)")
224
+
225
+ out = attn.to_out[0](out)
226
+ out = attn.to_out[1](out)
227
+ return out
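FlashAttentionFunction above computes attention in query/key buckets so the full attention matrix is never materialized. A small smoke test comparing it against PyTorch's scaled_dot_product_attention (shapes are arbitrary; with a 256-token sequence the bucket sizes used by FlashAttnProcessor cover it in one tile):

import torch
from library.attention_processors import FlashAttentionFunction  # assumes the repo root is on sys.path

# (batch, heads, sequence, head_dim); mask=None, causal=False,
# q_bucket_size=512 and k_bucket_size=1024 as used by FlashAttnProcessor
q, k, v = (torch.randn(1, 8, 256, 64) for _ in range(3))
out = FlashAttentionFunction.apply(q, k, v, None, False, 512, 1024)

ref = torch.nn.functional.scaled_dot_product_attention(q, k, v)
print(out.shape)                            # torch.Size([1, 8, 256, 64])
print(torch.allclose(out, ref, atol=1e-4))  # should print True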
library/config_util.py ADDED
@@ -0,0 +1,717 @@
1
+ import argparse
2
+ from dataclasses import (
3
+ asdict,
4
+ dataclass,
5
+ )
6
+ import functools
7
+ import random
8
+ from textwrap import dedent, indent
9
+ import json
10
+ from pathlib import Path
11
+
12
+ # from toolz import curry
13
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
14
+
15
+ import toml
16
+ import voluptuous
17
+ from voluptuous import (
18
+ Any,
19
+ ExactSequence,
20
+ MultipleInvalid,
21
+ Object,
22
+ Required,
23
+ Schema,
24
+ )
25
+
26
+
27
+ from . import train_util
28
+ from .train_util import (
29
+ DreamBoothSubset,
30
+ FineTuningSubset,
31
+ ControlNetSubset,
32
+ DreamBoothDataset,
33
+ FineTuningDataset,
34
+ ControlNetDataset,
35
+ DatasetGroup,
36
+ )
37
+ from .utils import setup_logging
38
+
39
+ setup_logging()
40
+ import logging
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ def add_config_arguments(parser: argparse.ArgumentParser):
46
+ parser.add_argument(
47
+ "--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル"
48
+ )
49
+
50
+
51
+ # TODO: inherit Params class in Subset, Dataset
52
+
53
+
54
+ @dataclass
55
+ class BaseSubsetParams:
56
+ image_dir: Optional[str] = None
57
+ num_repeats: int = 1
58
+ shuffle_caption: bool = False
59
+ caption_separator: str = (",",)
60
+ keep_tokens: int = 0
61
+ keep_tokens_separator: str = (None,)
62
+ secondary_separator: Optional[str] = None
63
+ enable_wildcard: bool = False
64
+ color_aug: bool = False
65
+ flip_aug: bool = False
66
+ face_crop_aug_range: Optional[Tuple[float, float]] = None
67
+ random_crop: bool = False
68
+ caption_prefix: Optional[str] = None
69
+ caption_suffix: Optional[str] = None
70
+ caption_dropout_rate: float = 0.0
71
+ caption_dropout_every_n_epochs: int = 0
72
+ caption_tag_dropout_rate: float = 0.0
73
+ token_warmup_min: int = 1
74
+ token_warmup_step: float = 0
75
+ custom_attributes: Optional[Dict[str, Any]] = None
76
+
77
+
78
+ @dataclass
79
+ class DreamBoothSubsetParams(BaseSubsetParams):
80
+ is_reg: bool = False
81
+ class_tokens: Optional[str] = None
82
+ caption_extension: str = ".caption"
83
+ cache_info: bool = False
84
+ alpha_mask: bool = False
85
+
86
+
87
+ @dataclass
88
+ class FineTuningSubsetParams(BaseSubsetParams):
89
+ metadata_file: Optional[str] = None
90
+ alpha_mask: bool = False
91
+
92
+
93
+ @dataclass
94
+ class ControlNetSubsetParams(BaseSubsetParams):
95
+ conditioning_data_dir: str = None
96
+ caption_extension: str = ".caption"
97
+ cache_info: bool = False
98
+
99
+
100
+ @dataclass
101
+ class BaseDatasetParams:
102
+ resolution: Optional[Tuple[int, int]] = None
103
+ network_multiplier: float = 1.0
104
+ debug_dataset: bool = False
105
+
106
+
107
+ @dataclass
108
+ class DreamBoothDatasetParams(BaseDatasetParams):
109
+ batch_size: int = 1
110
+ enable_bucket: bool = False
111
+ min_bucket_reso: int = 256
112
+ max_bucket_reso: int = 1024
113
+ bucket_reso_steps: int = 64
114
+ bucket_no_upscale: bool = False
115
+ prior_loss_weight: float = 1.0
116
+
117
+
118
+ @dataclass
119
+ class FineTuningDatasetParams(BaseDatasetParams):
120
+ batch_size: int = 1
121
+ enable_bucket: bool = False
122
+ min_bucket_reso: int = 256
123
+ max_bucket_reso: int = 1024
124
+ bucket_reso_steps: int = 64
125
+ bucket_no_upscale: bool = False
126
+
127
+
128
+ @dataclass
129
+ class ControlNetDatasetParams(BaseDatasetParams):
130
+ batch_size: int = 1
131
+ enable_bucket: bool = False
132
+ min_bucket_reso: int = 256
133
+ max_bucket_reso: int = 1024
134
+ bucket_reso_steps: int = 64
135
+ bucket_no_upscale: bool = False
136
+
137
+
138
+ @dataclass
139
+ class SubsetBlueprint:
140
+ params: Union[DreamBoothSubsetParams, FineTuningSubsetParams]
141
+
142
+
143
+ @dataclass
144
+ class DatasetBlueprint:
145
+ is_dreambooth: bool
146
+ is_controlnet: bool
147
+ params: Union[DreamBoothDatasetParams, FineTuningDatasetParams]
148
+ subsets: Sequence[SubsetBlueprint]
149
+
150
+
151
+ @dataclass
152
+ class DatasetGroupBlueprint:
153
+ datasets: Sequence[DatasetBlueprint]
154
+
155
+
156
+ @dataclass
157
+ class Blueprint:
158
+ dataset_group: DatasetGroupBlueprint
159
+
160
+
161
+ class ConfigSanitizer:
162
+ # @curry
163
+ @staticmethod
164
+ def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
165
+ Schema(ExactSequence([klass, klass]))(value)
166
+ return tuple(value)
167
+
168
+ # @curry
169
+ @staticmethod
170
+ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
171
+ Schema(Any(klass, ExactSequence([klass, klass])))(value)
172
+ try:
173
+ Schema(klass)(value)
174
+ return (value, value)
175
+ except:
176
+ return ConfigSanitizer.__validate_and_convert_twodim(klass, value)
177
+
178
+ # subset schema
179
+ SUBSET_ASCENDABLE_SCHEMA = {
180
+ "color_aug": bool,
181
+ "face_crop_aug_range": functools.partial(__validate_and_convert_twodim.__func__, float),
182
+ "flip_aug": bool,
183
+ "num_repeats": int,
184
+ "random_crop": bool,
185
+ "shuffle_caption": bool,
186
+ "keep_tokens": int,
187
+ "keep_tokens_separator": str,
188
+ "secondary_separator": str,
189
+ "caption_separator": str,
190
+ "enable_wildcard": bool,
191
+ "token_warmup_min": int,
192
+ "token_warmup_step": Any(float, int),
193
+ "caption_prefix": str,
194
+ "caption_suffix": str,
195
+ "custom_attributes": dict,
196
+ }
197
+ # DO means DropOut
198
+ DO_SUBSET_ASCENDABLE_SCHEMA = {
199
+ "caption_dropout_every_n_epochs": int,
200
+ "caption_dropout_rate": Any(float, int),
201
+ "caption_tag_dropout_rate": Any(float, int),
202
+ }
203
+ # DB means DreamBooth
204
+ DB_SUBSET_ASCENDABLE_SCHEMA = {
205
+ "caption_extension": str,
206
+ "class_tokens": str,
207
+ "cache_info": bool,
208
+ }
209
+ DB_SUBSET_DISTINCT_SCHEMA = {
210
+ Required("image_dir"): str,
211
+ "is_reg": bool,
212
+ "alpha_mask": bool,
213
+ }
214
+ # FT means FineTuning
215
+ FT_SUBSET_DISTINCT_SCHEMA = {
216
+ Required("metadata_file"): str,
217
+ "image_dir": str,
218
+ "alpha_mask": bool,
219
+ }
220
+ CN_SUBSET_ASCENDABLE_SCHEMA = {
221
+ "caption_extension": str,
222
+ "cache_info": bool,
223
+ }
224
+ CN_SUBSET_DISTINCT_SCHEMA = {
225
+ Required("image_dir"): str,
226
+ Required("conditioning_data_dir"): str,
227
+ }
228
+
229
+ # datasets schema
230
+ DATASET_ASCENDABLE_SCHEMA = {
231
+ "batch_size": int,
232
+ "bucket_no_upscale": bool,
233
+ "bucket_reso_steps": int,
234
+ "enable_bucket": bool,
235
+ "max_bucket_reso": int,
236
+ "min_bucket_reso": int,
237
+ "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
238
+ "network_multiplier": float,
239
+ }
240
+
241
+ # options handled by argparse but not handled by user config
242
+ ARGPARSE_SPECIFIC_SCHEMA = {
243
+ "debug_dataset": bool,
244
+ "max_token_length": Any(None, int),
245
+ "prior_loss_weight": Any(float, int),
246
+ }
247
+ # for handling default None value of argparse
248
+ ARGPARSE_NULLABLE_OPTNAMES = [
249
+ "face_crop_aug_range",
250
+ "resolution",
251
+ ]
252
+ # prepare map because option name may differ among argparse and user config
253
+ ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME = {
254
+ "train_batch_size": "batch_size",
255
+ "dataset_repeats": "num_repeats",
256
+ }
257
+
258
+ def __init__(self, support_dreambooth: bool, support_finetuning: bool, support_controlnet: bool, support_dropout: bool) -> None:
259
+ assert support_dreambooth or support_finetuning or support_controlnet, (
260
+ "Neither DreamBooth mode nor fine tuning mode nor controlnet mode specified. Please specify one mode or more."
261
+ + " / DreamBooth モードか fine tuning モードか controlnet モードのどれも指定されていません。1つ以上指定してください。"
262
+ )
263
+
264
+ self.db_subset_schema = self.__merge_dict(
265
+ self.SUBSET_ASCENDABLE_SCHEMA,
266
+ self.DB_SUBSET_DISTINCT_SCHEMA,
267
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
268
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
269
+ )
270
+
271
+ self.ft_subset_schema = self.__merge_dict(
272
+ self.SUBSET_ASCENDABLE_SCHEMA,
273
+ self.FT_SUBSET_DISTINCT_SCHEMA,
274
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
275
+ )
276
+
277
+ self.cn_subset_schema = self.__merge_dict(
278
+ self.SUBSET_ASCENDABLE_SCHEMA,
279
+ self.CN_SUBSET_DISTINCT_SCHEMA,
280
+ self.CN_SUBSET_ASCENDABLE_SCHEMA,
281
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
282
+ )
283
+
284
+ self.db_dataset_schema = self.__merge_dict(
285
+ self.DATASET_ASCENDABLE_SCHEMA,
286
+ self.SUBSET_ASCENDABLE_SCHEMA,
287
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
288
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
289
+ {"subsets": [self.db_subset_schema]},
290
+ )
291
+
292
+ self.ft_dataset_schema = self.__merge_dict(
293
+ self.DATASET_ASCENDABLE_SCHEMA,
294
+ self.SUBSET_ASCENDABLE_SCHEMA,
295
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
296
+ {"subsets": [self.ft_subset_schema]},
297
+ )
298
+
299
+ self.cn_dataset_schema = self.__merge_dict(
300
+ self.DATASET_ASCENDABLE_SCHEMA,
301
+ self.SUBSET_ASCENDABLE_SCHEMA,
302
+ self.CN_SUBSET_ASCENDABLE_SCHEMA,
303
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
304
+ {"subsets": [self.cn_subset_schema]},
305
+ )
306
+
307
+ if support_dreambooth and support_finetuning:
308
+
309
+ def validate_flex_dataset(dataset_config: dict):
310
+ subsets_config = dataset_config.get("subsets", [])
311
+
312
+ if support_controlnet and all(["conditioning_data_dir" in subset for subset in subsets_config]):
313
+ return Schema(self.cn_dataset_schema)(dataset_config)
314
+ # check dataset meets FT style
315
+ # NOTE: all FT subsets should have "metadata_file"
316
+ elif all(["metadata_file" in subset for subset in subsets_config]):
317
+ return Schema(self.ft_dataset_schema)(dataset_config)
318
+ # check dataset meets DB style
319
+ # NOTE: all DB subsets should have no "metadata_file"
320
+ elif all(["metadata_file" not in subset for subset in subsets_config]):
321
+ return Schema(self.db_dataset_schema)(dataset_config)
322
+ else:
323
+ raise voluptuous.Invalid(
324
+ "DreamBooth subset and fine tuning subset cannot be mixed in the same dataset. Please split them into separate datasets. / DreamBoothのサブセットとfine tuninのサブセットを同一のデータセットに混在させることはできません。別々のデータセットに分割してください。"
325
+ )
326
+
327
+ self.dataset_schema = validate_flex_dataset
328
+ elif support_dreambooth:
329
+ if support_controlnet:
330
+ self.dataset_schema = self.cn_dataset_schema
331
+ else:
332
+ self.dataset_schema = self.db_dataset_schema
333
+ elif support_finetuning:
334
+ self.dataset_schema = self.ft_dataset_schema
335
+ elif support_controlnet:
336
+ self.dataset_schema = self.cn_dataset_schema
337
+
338
+ self.general_schema = self.__merge_dict(
339
+ self.DATASET_ASCENDABLE_SCHEMA,
340
+ self.SUBSET_ASCENDABLE_SCHEMA,
341
+ self.DB_SUBSET_ASCENDABLE_SCHEMA if support_dreambooth else {},
342
+ self.CN_SUBSET_ASCENDABLE_SCHEMA if support_controlnet else {},
343
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
344
+ )
345
+
346
+ self.user_config_validator = Schema(
347
+ {
348
+ "general": self.general_schema,
349
+ "datasets": [self.dataset_schema],
350
+ }
351
+ )
352
+
353
+ self.argparse_schema = self.__merge_dict(
354
+ self.general_schema,
355
+ self.ARGPARSE_SPECIFIC_SCHEMA,
356
+ {optname: Any(None, self.general_schema[optname]) for optname in self.ARGPARSE_NULLABLE_OPTNAMES},
357
+ {a_name: self.general_schema[c_name] for a_name, c_name in self.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME.items()},
358
+ )
359
+
360
+ self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)
361
+
362
+ def sanitize_user_config(self, user_config: dict) -> dict:
363
+ try:
364
+ return self.user_config_validator(user_config)
365
+ except MultipleInvalid:
366
+ # TODO: make the error message shown on failure easier to understand
367
+ logger.error("Invalid user config / ユーザ設定の形式が正しくないようです")
368
+ raise
369
+
370
+ # NOTE: The argparse result does not strictly need to be sanitized,
371
+ # but doing so helps us detect program bugs
372
+ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
373
+ try:
374
+ return self.argparse_config_validator(argparse_namespace)
375
+ except MultipleInvalid:
376
+ # XXX: this should be a bug
377
+ logger.error(
378
+ "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。"
379
+ )
380
+ raise
381
+
382
+ # NOTE: values are overwritten by later dicts when the same key appears more than once
383
+ @staticmethod
384
+ def __merge_dict(*dict_list: dict) -> dict:
385
+ merged = {}
386
+ for schema in dict_list:
387
+ # merged |= schema
388
+ for k, v in schema.items():
389
+ merged[k] = v
390
+ return merged
391
+
392
+
393
+ class BlueprintGenerator:
394
+ BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {}
395
+
396
+ def __init__(self, sanitizer: ConfigSanitizer):
397
+ self.sanitizer = sanitizer
398
+
399
+ # runtime_params is for parameters that are only configurable at runtime, such as the tokenizer
400
+ def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
401
+ sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
402
+ sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)
403
+
404
+ # convert argparse namespace to dict like config
405
+ # NOTE: it is ok to have extra entries in dict
406
+ optname_map = self.sanitizer.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME
407
+ argparse_config = {
408
+ optname_map.get(optname, optname): value for optname, value in vars(sanitized_argparse_namespace).items()
409
+ }
410
+
411
+ general_config = sanitized_user_config.get("general", {})
412
+
413
+ dataset_blueprints = []
414
+ for dataset_config in sanitized_user_config.get("datasets", []):
415
+ # NOTE: if subsets have no "metadata_file", these are DreamBooth datasets/subsets
416
+ subsets = dataset_config.get("subsets", [])
417
+ is_dreambooth = all(["metadata_file" not in subset for subset in subsets])
418
+ is_controlnet = all(["conditioning_data_dir" in subset for subset in subsets])
419
+ if is_controlnet:
420
+ subset_params_klass = ControlNetSubsetParams
421
+ dataset_params_klass = ControlNetDatasetParams
422
+ elif is_dreambooth:
423
+ subset_params_klass = DreamBoothSubsetParams
424
+ dataset_params_klass = DreamBoothDatasetParams
425
+ else:
426
+ subset_params_klass = FineTuningSubsetParams
427
+ dataset_params_klass = FineTuningDatasetParams
428
+
429
+ subset_blueprints = []
430
+ for subset_config in subsets:
431
+ params = self.generate_params_by_fallbacks(
432
+ subset_params_klass, [subset_config, dataset_config, general_config, argparse_config, runtime_params]
433
+ )
434
+ subset_blueprints.append(SubsetBlueprint(params))
435
+
436
+ params = self.generate_params_by_fallbacks(
437
+ dataset_params_klass, [dataset_config, general_config, argparse_config, runtime_params]
438
+ )
439
+ dataset_blueprints.append(DatasetBlueprint(is_dreambooth, is_controlnet, params, subset_blueprints))
440
+
441
+ dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)
442
+
443
+ return Blueprint(dataset_group_blueprint)
444
+
445
+ @staticmethod
446
+ def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
447
+ name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
448
+ search_value = BlueprintGenerator.search_value
449
+ default_params = asdict(param_klass())
450
+ param_names = default_params.keys()
451
+
452
+ params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}
453
+
454
+ return param_klass(**params)
455
+
456
+ @staticmethod
457
+ def search_value(key: str, fallbacks: Sequence[dict], default_value=None):
458
+ for cand in fallbacks:
459
+ value = cand.get(key)
460
+ if value is not None:
461
+ return value
462
+
463
+ return default_value
464
+
465
+
466
+ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint):
467
+ datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = []
468
+
469
+ for dataset_blueprint in dataset_group_blueprint.datasets:
470
+ if dataset_blueprint.is_controlnet:
471
+ subset_klass = ControlNetSubset
472
+ dataset_klass = ControlNetDataset
473
+ elif dataset_blueprint.is_dreambooth:
474
+ subset_klass = DreamBoothSubset
475
+ dataset_klass = DreamBoothDataset
476
+ else:
477
+ subset_klass = FineTuningSubset
478
+ dataset_klass = FineTuningDataset
479
+
480
+ subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets]
481
+ dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params))
482
+ datasets.append(dataset)
483
+
484
+ # print info
485
+ info = ""
486
+ for i, dataset in enumerate(datasets):
487
+ is_dreambooth = isinstance(dataset, DreamBoothDataset)
488
+ is_controlnet = isinstance(dataset, ControlNetDataset)
489
+ info += dedent(
490
+ f"""\
491
+ [Dataset {i}]
492
+ batch_size: {dataset.batch_size}
493
+ resolution: {(dataset.width, dataset.height)}
494
+ enable_bucket: {dataset.enable_bucket}
495
+ network_multiplier: {dataset.network_multiplier}
496
+ """
497
+ )
498
+
499
+ if dataset.enable_bucket:
500
+ info += indent(
501
+ dedent(
502
+ f"""\
503
+ min_bucket_reso: {dataset.min_bucket_reso}
504
+ max_bucket_reso: {dataset.max_bucket_reso}
505
+ bucket_reso_steps: {dataset.bucket_reso_steps}
506
+ bucket_no_upscale: {dataset.bucket_no_upscale}
507
+ \n"""
508
+ ),
509
+ " ",
510
+ )
511
+ else:
512
+ info += "\n"
513
+
514
+ for j, subset in enumerate(dataset.subsets):
515
+ info += indent(
516
+ dedent(
517
+ f"""\
518
+ [Subset {j} of Dataset {i}]
519
+ image_dir: "{subset.image_dir}"
520
+ image_count: {subset.img_count}
521
+ num_repeats: {subset.num_repeats}
522
+ shuffle_caption: {subset.shuffle_caption}
523
+ keep_tokens: {subset.keep_tokens}
524
+ keep_tokens_separator: {subset.keep_tokens_separator}
525
+ caption_separator: {subset.caption_separator}
526
+ secondary_separator: {subset.secondary_separator}
527
+ enable_wildcard: {subset.enable_wildcard}
528
+ caption_dropout_rate: {subset.caption_dropout_rate}
529
+ caption_dropout_every_n_epochs: {subset.caption_dropout_every_n_epochs}
530
+ caption_tag_dropout_rate: {subset.caption_tag_dropout_rate}
531
+ caption_prefix: {subset.caption_prefix}
532
+ caption_suffix: {subset.caption_suffix}
533
+ color_aug: {subset.color_aug}
534
+ flip_aug: {subset.flip_aug}
535
+ face_crop_aug_range: {subset.face_crop_aug_range}
536
+ random_crop: {subset.random_crop}
537
+ token_warmup_min: {subset.token_warmup_min}
538
+ token_warmup_step: {subset.token_warmup_step}
539
+ alpha_mask: {subset.alpha_mask}
540
+ custom_attributes: {subset.custom_attributes}
541
+ """
542
+ ),
543
+ " ",
544
+ )
545
+
546
+ if is_dreambooth:
547
+ info += indent(
548
+ dedent(
549
+ f"""\
550
+ is_reg: {subset.is_reg}
551
+ class_tokens: {subset.class_tokens}
552
+ caption_extension: {subset.caption_extension}
553
+ \n"""
554
+ ),
555
+ " ",
556
+ )
557
+ elif not is_controlnet:
558
+ info += indent(
559
+ dedent(
560
+ f"""\
561
+ metadata_file: {subset.metadata_file}
562
+ \n"""
563
+ ),
564
+ " ",
565
+ )
566
+
567
+ logger.info(f"{info}")
568
+
569
+ # make buckets first because it determines the length of dataset
570
+ # and set the same seed for all datasets
571
+ seed = random.randint(0, 2**31) # actual seed is seed + epoch_no
572
+ for i, dataset in enumerate(datasets):
573
+ logger.info(f"[Dataset {i}]")
574
+ dataset.make_buckets()
575
+ dataset.set_seed(seed)
576
+
577
+ return DatasetGroup(datasets)
578
+
579
+
580
+ def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None):
581
+ def extract_dreambooth_params(name: str) -> Tuple[int, str]:
582
+ tokens = name.split("_")
583
+ try:
584
+ n_repeats = int(tokens[0])
585
+ except ValueError as e:
586
+ logger.warning(f"ignore directory without repeats / 繰り返し回数のないディレクトリを無視します: {name}")
587
+ return 0, ""
588
+ caption_by_folder = "_".join(tokens[1:])
589
+ return n_repeats, caption_by_folder
590
+
591
+ def generate(base_dir: Optional[str], is_reg: bool):
592
+ if base_dir is None:
593
+ return []
594
+
595
+ base_dir: Path = Path(base_dir)
596
+ if not base_dir.is_dir():
597
+ return []
598
+
599
+ subsets_config = []
600
+ for subdir in base_dir.iterdir():
601
+ if not subdir.is_dir():
602
+ continue
603
+
604
+ num_repeats, class_tokens = extract_dreambooth_params(subdir.name)
605
+ if num_repeats < 1:
606
+ continue
607
+
608
+ subset_config = {"image_dir": str(subdir), "num_repeats": num_repeats, "is_reg": is_reg, "class_tokens": class_tokens}
609
+ subsets_config.append(subset_config)
610
+
611
+ return subsets_config
612
+
613
+ subsets_config = []
614
+ subsets_config += generate(train_data_dir, False)
615
+ subsets_config += generate(reg_data_dir, True)
616
+
617
+ return subsets_config
618
+
619
+
620
+ def generate_controlnet_subsets_config_by_subdirs(
621
+ train_data_dir: Optional[str] = None, conditioning_data_dir: Optional[str] = None, caption_extension: str = ".txt"
622
+ ):
623
+ def generate(base_dir: Optional[str]):
624
+ if base_dir is None:
625
+ return []
626
+
627
+ base_dir: Path = Path(base_dir)
628
+ if not base_dir.is_dir():
629
+ return []
630
+
631
+ subsets_config = []
632
+ subset_config = {
633
+ "image_dir": train_data_dir,
634
+ "conditioning_data_dir": conditioning_data_dir,
635
+ "caption_extension": caption_extension,
636
+ "num_repeats": 1,
637
+ }
638
+ subsets_config.append(subset_config)
639
+
640
+ return subsets_config
641
+
642
+ subsets_config = []
643
+ subsets_config += generate(train_data_dir)
644
+
645
+ return subsets_config
646
+
647
+
648
+ def load_user_config(file: str) -> dict:
649
+ file_path: Path = Path(file)
650
+ if not file_path.is_file():
651
+ #raise ValueError(f"file not found / ファイルが見つかりません: {file}")
652
+ return toml.loads(file)
653
+
654
+ if file_path.name.lower().endswith(".json"):
655
+ try:
656
+ with open(file, "r") as f:
657
+ config = json.load(f)
658
+ except Exception:
659
+ logger.error(
660
+ f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
661
+ )
662
+ raise
663
+ elif file_path.name.lower().endswith(".toml"):
664
+ try:
665
+ config = toml.load(file_path)
666
+ except Exception:
667
+ logger.error(
668
+ f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
669
+ )
670
+ raise
671
+ else:
672
+ raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file_path}")
673
+
674
+ return config
675
+
676
+
677
+ # for config test
678
+ if __name__ == "__main__":
679
+ parser = argparse.ArgumentParser()
680
+ parser.add_argument("--support_dreambooth", action="store_true")
681
+ parser.add_argument("--support_finetuning", action="store_true")
682
+ parser.add_argument("--support_controlnet", action="store_true")
683
+ parser.add_argument("--support_dropout", action="store_true")
684
+ parser.add_argument("dataset_config")
685
+ config_args, remain = parser.parse_known_args()
686
+
687
+ parser = argparse.ArgumentParser()
688
+ train_util.add_dataset_arguments(
689
+ parser, config_args.support_dreambooth, config_args.support_finetuning, config_args.support_dropout
690
+ )
691
+ train_util.add_training_arguments(parser, config_args.support_dreambooth)
692
+ argparse_namespace = parser.parse_args(remain)
693
+ train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning)
694
+
695
+ logger.info("[argparse_namespace]")
696
+ logger.info(f"{vars(argparse_namespace)}")
697
+
698
+ user_config = load_user_config(config_args.dataset_config)
699
+
700
+ logger.info("")
701
+ logger.info("[user_config]")
702
+ logger.info(f"{user_config}")
703
+
704
+ sanitizer = ConfigSanitizer(
705
+ config_args.support_dreambooth, config_args.support_finetuning, config_args.support_controlnet, config_args.support_dropout
706
+ )
707
+ sanitized_user_config = sanitizer.sanitize_user_config(user_config)
708
+
709
+ logger.info("")
710
+ logger.info("[sanitized_user_config]")
711
+ logger.info(f"{sanitized_user_config}")
712
+
713
+ blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)
714
+
715
+ logger.info("")
716
+ logger.info("[blueprint]")
717
+ logger.info(f"{blueprint}")
library/custom_offloading_utils.py ADDED
@@ -0,0 +1,227 @@
1
+ from concurrent.futures import ThreadPoolExecutor
2
+ import time
3
+ from typing import Optional
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from .device_utils import clean_memory_on_device
8
+
9
+
10
+ def synchronize_device(device: torch.device):
11
+ if device.type == "cuda":
12
+ torch.cuda.synchronize()
13
+ elif device.type == "xpu":
14
+ torch.xpu.synchronize()
15
+ elif device.type == "mps":
16
+ torch.mps.synchronize()
17
+
18
+
19
+ def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module):
20
+ assert layer_to_cpu.__class__ == layer_to_cuda.__class__
21
+
22
+ weight_swap_jobs = []
23
+
24
+ # This is not working for all cases (e.g. SD3), so we need to find the corresponding modules
25
+ # for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
26
+ # print(module_to_cpu.__class__, module_to_cuda.__class__)
27
+ # if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
28
+ # weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
29
+
30
+ modules_to_cpu = {k: v for k, v in layer_to_cpu.named_modules()}
31
+ for module_to_cuda_name, module_to_cuda in layer_to_cuda.named_modules():
32
+ if hasattr(module_to_cuda, "weight") and module_to_cuda.weight is not None:
33
+ module_to_cpu = modules_to_cpu.get(module_to_cuda_name, None)
34
+ if module_to_cpu is not None and module_to_cpu.weight.shape == module_to_cuda.weight.shape:
35
+ weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
36
+ else:
37
+ if module_to_cuda.weight.data.device.type != device.type:
38
+ # print(
39
+ # f"Module {module_to_cuda_name} not found in CPU model or shape mismatch, so not swapping and moving to device"
40
+ # )
41
+ module_to_cuda.weight.data = module_to_cuda.weight.data.to(device)
42
+
43
+ torch.cuda.current_stream().synchronize() # this prevents the illegal loss value
44
+
45
+ stream = torch.cuda.Stream()
46
+ with torch.cuda.stream(stream):
47
+ # cuda to cpu
48
+ for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
49
+ cuda_data_view.record_stream(stream)
50
+ module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)
51
+
52
+ stream.synchronize()
53
+
54
+ # cpu to cuda
55
+ for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
56
+ cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
57
+ module_to_cuda.weight.data = cuda_data_view
58
+
59
+ stream.synchronize()
60
+ torch.cuda.current_stream().synchronize() # this prevents the illegal loss value
61
+
62
+
63
+ def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module):
64
+ """
65
+ not tested
66
+ """
67
+ assert layer_to_cpu.__class__ == layer_to_cuda.__class__
68
+
69
+ weight_swap_jobs = []
70
+ for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
71
+ if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
72
+ weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
73
+
74
+ # device to cpu
75
+ for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
76
+ module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)
77
+
78
+ synchronize_device(device)
79
+
80
+ # cpu to device
81
+ for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
82
+ cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
83
+ module_to_cuda.weight.data = cuda_data_view
84
+
85
+ synchronize_device(device)
86
+
87
+
88
+ def weighs_to_device(layer: nn.Module, device: torch.device):
89
+ for module in layer.modules():
90
+ if hasattr(module, "weight") and module.weight is not None:
91
+ module.weight.data = module.weight.data.to(device, non_blocking=True)
92
+
93
+
94
+ class Offloader:
95
+ """
96
+ common offloading class
97
+ """
98
+
99
+ def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False):
100
+ self.num_blocks = num_blocks
101
+ self.blocks_to_swap = blocks_to_swap
102
+ self.device = device
103
+ self.debug = debug
104
+
105
+ self.thread_pool = ThreadPoolExecutor(max_workers=1)
106
+ self.futures = {}
107
+ self.cuda_available = device.type == "cuda"
108
+
109
+ def swap_weight_devices(self, block_to_cpu: nn.Module, block_to_cuda: nn.Module):
110
+ if self.cuda_available:
111
+ swap_weight_devices_cuda(self.device, block_to_cpu, block_to_cuda)
112
+ else:
113
+ swap_weight_devices_no_cuda(self.device, block_to_cpu, block_to_cuda)
114
+
115
+ def _submit_move_blocks(self, blocks, block_idx_to_cpu, block_idx_to_cuda):
116
+ def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda):
117
+ if self.debug:
118
+ start_time = time.perf_counter()
119
+ print(f"Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}")
120
+
121
+ self.swap_weight_devices(block_to_cpu, block_to_cuda)
122
+
123
+ if self.debug:
124
+ print(f"Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter()-start_time:.2f}s")
125
+ return bidx_to_cpu, bidx_to_cuda # , event
126
+
127
+ block_to_cpu = blocks[block_idx_to_cpu]
128
+ block_to_cuda = blocks[block_idx_to_cuda]
129
+
130
+ self.futures[block_idx_to_cuda] = self.thread_pool.submit(
131
+ move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda
132
+ )
133
+
134
+ def _wait_blocks_move(self, block_idx):
135
+ if block_idx not in self.futures:
136
+ return
137
+
138
+ if self.debug:
139
+ print(f"Wait for block {block_idx}")
140
+ start_time = time.perf_counter()
141
+
142
+ future = self.futures.pop(block_idx)
143
+ _, bidx_to_cuda = future.result()
144
+
145
+ assert block_idx == bidx_to_cuda, f"Block index mismatch: {block_idx} != {bidx_to_cuda}"
146
+
147
+ if self.debug:
148
+ print(f"Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s")
149
+
150
+
151
+ class ModelOffloader(Offloader):
152
+ """
153
+ supports forward offloading
154
+ """
155
+
156
+ def __init__(self, blocks: list[nn.Module], num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False):
157
+ super().__init__(num_blocks, blocks_to_swap, device, debug)
158
+
159
+ # register backward hooks
160
+ self.remove_handles = []
161
+ for i, block in enumerate(blocks):
162
+ hook = self.create_backward_hook(blocks, i)
163
+ if hook is not None:
164
+ handle = block.register_full_backward_hook(hook)
165
+ self.remove_handles.append(handle)
166
+
167
+ def __del__(self):
168
+ for handle in self.remove_handles:
169
+ handle.remove()
170
+
171
+ def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]:
172
+ # -1 for 0-based index
173
+ num_blocks_propagated = self.num_blocks - block_index - 1
174
+ swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap
175
+ waiting = block_index > 0 and block_index <= self.blocks_to_swap
176
+
177
+ if not swapping and not waiting:
178
+ return None
179
+
180
+ # create hook
181
+ block_idx_to_cpu = self.num_blocks - num_blocks_propagated
182
+ block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated
183
+ block_idx_to_wait = block_index - 1
184
+
185
+ def backward_hook(module, grad_input, grad_output):
186
+ if self.debug:
187
+ print(f"Backward hook for block {block_index}")
188
+
189
+ if swapping:
190
+ self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda)
191
+ if waiting:
192
+ self._wait_blocks_move(block_idx_to_wait)
193
+ return None
194
+
195
+ return backward_hook
196
+
197
+ def prepare_block_devices_before_forward(self, blocks: list[nn.Module]):
198
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
199
+ return
200
+
201
+ if self.debug:
202
+ print("Prepare block devices before forward")
203
+
204
+ for b in blocks[0 : self.num_blocks - self.blocks_to_swap]:
205
+ b.to(self.device)
206
+ weighs_to_device(b, self.device) # make sure weights are on device
207
+
208
+ for b in blocks[self.num_blocks - self.blocks_to_swap :]:
209
+ b.to(self.device) # move block to device first
210
+ weighs_to_device(b, "cpu") # make sure weights are on cpu
211
+
212
+ synchronize_device(self.device)
213
+ clean_memory_on_device(self.device)
214
+
215
+ def wait_for_block(self, block_idx: int):
216
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
217
+ return
218
+ self._wait_blocks_move(block_idx)
219
+
220
+ def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int):
221
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
222
+ return
223
+ if block_idx >= self.blocks_to_swap:
224
+ return
225
+ block_idx_to_cpu = block_idx
226
+ block_idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx
227
+ self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda)
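For reference, a minimal sketch of the calling protocol ModelOffloader expects during a forward pass, using a toy stack of Linear layers as a stand-in for the FLUX transformer blocks. A CUDA device is assumed, since the fast swap path relies on CUDA streams; the block count and swap count are illustrative.

import torch
import torch.nn as nn
from library.custom_offloading_utils import ModelOffloader

device = torch.device("cuda")
blocks = [nn.Linear(64, 64) for _ in range(8)]           # stand-in for transformer blocks
offloader = ModelOffloader(blocks, num_blocks=8, blocks_to_swap=4, device=device)

offloader.prepare_block_devices_before_forward(blocks)   # first 4 on GPU, last 4 weights kept on CPU
x = torch.randn(1, 64, device=device)
for i, block in enumerate(blocks):
    offloader.wait_for_block(i)                          # ensure this block's weights are on the GPU
    x = block(x)
    offloader.submit_move_blocks(blocks, i)              # swap it back out while later blocks run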
library/custom_train_functions.py ADDED
@@ -0,0 +1,556 @@
1
+ import torch
2
+ import argparse
3
+ import random
4
+ import re
5
+ from typing import List, Optional, Union
6
+ from .utils import setup_logging
7
+
8
+ setup_logging()
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def prepare_scheduler_for_custom_training(noise_scheduler, device):
15
+ if hasattr(noise_scheduler, "all_snr"):
16
+ return
17
+
18
+ alphas_cumprod = noise_scheduler.alphas_cumprod
19
+ sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
20
+ sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
21
+ alpha = sqrt_alphas_cumprod
22
+ sigma = sqrt_one_minus_alphas_cumprod
23
+ all_snr = (alpha / sigma) ** 2
24
+
25
+ noise_scheduler.all_snr = all_snr.to(device)
26
+
27
+
28
+ def fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler):
29
+ # fix beta: zero terminal SNR
30
+ logger.info(f"fix noise scheduler betas: https://arxiv.org/abs/2305.08891")
31
+
32
+ def enforce_zero_terminal_snr(betas):
33
+ # Convert betas to alphas_bar_sqrt
34
+ alphas = 1 - betas
35
+ alphas_bar = alphas.cumprod(0)
36
+ alphas_bar_sqrt = alphas_bar.sqrt()
37
+
38
+ # Store old values.
39
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
40
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
41
+ # Shift so last timestep is zero.
42
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
43
+ # Scale so first timestep is back to old value.
44
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
45
+
46
+ # Convert alphas_bar_sqrt to betas
47
+ alphas_bar = alphas_bar_sqrt**2
48
+ alphas = alphas_bar[1:] / alphas_bar[:-1]
49
+ alphas = torch.cat([alphas_bar[0:1], alphas])
50
+ betas = 1 - alphas
51
+ return betas
52
+
53
+ betas = noise_scheduler.betas
54
+ betas = enforce_zero_terminal_snr(betas)
55
+ alphas = 1.0 - betas
56
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
57
+
58
+ # logger.info(f"original: {noise_scheduler.betas}")
59
+ # logger.info(f"fixed: {betas}")
60
+
61
+ noise_scheduler.betas = betas
62
+ noise_scheduler.alphas = alphas
63
+ noise_scheduler.alphas_cumprod = alphas_cumprod
64
+
65
+
66
+ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma, v_prediction=False):
67
+ snr = torch.stack([noise_scheduler.all_snr[t] for t in timesteps])
68
+ min_snr_gamma = torch.minimum(snr, torch.full_like(snr, gamma))
69
+ if v_prediction:
70
+ snr_weight = torch.div(min_snr_gamma, snr + 1).float().to(loss.device)
71
+ else:
72
+ snr_weight = torch.div(min_snr_gamma, snr).float().to(loss.device)
73
+ loss = loss * snr_weight
74
+ return loss
75
+
76
+
77
+ def scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler):
78
+ scale = get_snr_scale(timesteps, noise_scheduler)
79
+ loss = loss * scale
80
+ return loss
81
+
82
+
83
+ def get_snr_scale(timesteps, noise_scheduler):
84
+ snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size
85
+ snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000
86
+ scale = snr_t / (snr_t + 1)
87
+ # # show debug info
88
+ # logger.info(f"timesteps: {timesteps}, snr_t: {snr_t}, scale: {scale}")
89
+ return scale
90
+
91
+
92
+ def add_v_prediction_like_loss(loss, timesteps, noise_scheduler, v_pred_like_loss):
93
+ scale = get_snr_scale(timesteps, noise_scheduler)
94
+ # logger.info(f"add v-prediction like loss: {v_pred_like_loss}, scale: {scale}, loss: {loss}, time: {timesteps}")
95
+ loss = loss + loss / scale * v_pred_like_loss
96
+ return loss
97
+
98
+
99
+ def apply_debiased_estimation(loss, timesteps, noise_scheduler):
100
+ snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size
101
+ snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000
102
+ weight = 1 / torch.sqrt(snr_t)
103
+ loss = weight * loss
104
+ return loss
105
+
106
+
107
+ # TODO this logic is split between here and train_util; consolidate it in one place
108
+
109
+
110
+ def add_custom_train_arguments(parser: argparse.ArgumentParser, support_weighted_captions: bool = True):
111
+ parser.add_argument(
112
+ "--min_snr_gamma",
113
+ type=float,
114
+ default=None,
115
+ help="gamma for reducing the weight of high loss timesteps. Lower numbers have stronger effect. 5 is recommended by paper. / 低いタイムステップでの高いlossに対して重みを減らすためのgamma値、低いほど効果が強く、論文では5が推奨",
116
+ )
117
+ parser.add_argument(
118
+ "--scale_v_pred_loss_like_noise_pred",
119
+ action="store_true",
120
+ help="scale v-prediction loss like noise prediction loss / v-prediction lossをnoise prediction lossと同じようにスケーリングする",
121
+ )
122
+ parser.add_argument(
123
+ "--v_pred_like_loss",
124
+ type=float,
125
+ default=None,
126
+ help="add v-prediction like loss multiplied by this value / v-prediction lossをこの値をかけたものをlossに加算する",
127
+ )
128
+ parser.add_argument(
129
+ "--debiased_estimation_loss",
130
+ action="store_true",
131
+ help="debiased estimation loss / debiased estimation loss",
132
+ )
133
+ if support_weighted_captions:
134
+ parser.add_argument(
135
+ "--weighted_captions",
136
+ action="store_true",
137
+ default=False,
138
+ help="Enable weighted captions in the standard style (token:1.3). No commas inside parens, or shuffle/dropout may break the decoder. / 「[token]」、「(token)」「(token:1.3)」のような重み付きキャプションを有効にする。カンマを括弧内に入れるとシャッフルやdropoutで重みづけがおかしくなるので注意",
139
+ )
140
+
141
+
142
+ re_attention = re.compile(
143
+ r"""
144
+ \\\(|
145
+ \\\)|
146
+ \\\[|
147
+ \\]|
148
+ \\\\|
149
+ \\|
150
+ \(|
151
+ \[|
152
+ :([+-]?[.\d]+)\)|
153
+ \)|
154
+ ]|
155
+ [^\\()\[\]:]+|
156
+ :
157
+ """,
158
+ re.X,
159
+ )
160
+
161
+
162
+ def parse_prompt_attention(text):
163
+ """
164
+ Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
165
+ Accepted tokens are:
166
+ (abc) - increases attention to abc by a multiplier of 1.1
167
+ (abc:3.12) - increases attention to abc by a multiplier of 3.12
168
+ [abc] - decreases attention to abc by a multiplier of 1.1
169
+ \( - literal character '('
170
+ \[ - literal character '['
171
+ \) - literal character ')'
172
+ \] - literal character ']'
173
+ \\ - literal character '\'
174
+ anything else - just text
175
+ >>> parse_prompt_attention('normal text')
176
+ [['normal text', 1.0]]
177
+ >>> parse_prompt_attention('an (important) word')
178
+ [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
179
+ >>> parse_prompt_attention('(unbalanced')
180
+ [['unbalanced', 1.1]]
181
+ >>> parse_prompt_attention('\(literal\]')
182
+ [['(literal]', 1.0]]
183
+ >>> parse_prompt_attention('(unnecessary)(parens)')
184
+ [['unnecessaryparens', 1.1]]
185
+ >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
186
+ [['a ', 1.0],
187
+ ['house', 1.5730000000000004],
188
+ [' ', 1.1],
189
+ ['on', 1.0],
190
+ [' a ', 1.1],
191
+ ['hill', 0.55],
192
+ [', sun, ', 1.1],
193
+ ['sky', 1.4641000000000006],
194
+ ['.', 1.1]]
195
+ """
196
+
197
+ res = []
198
+ round_brackets = []
199
+ square_brackets = []
200
+
201
+ round_bracket_multiplier = 1.1
202
+ square_bracket_multiplier = 1 / 1.1
203
+
204
+ def multiply_range(start_position, multiplier):
205
+ for p in range(start_position, len(res)):
206
+ res[p][1] *= multiplier
207
+
208
+ for m in re_attention.finditer(text):
209
+ text = m.group(0)
210
+ weight = m.group(1)
211
+
212
+ if text.startswith("\\"):
213
+ res.append([text[1:], 1.0])
214
+ elif text == "(":
215
+ round_brackets.append(len(res))
216
+ elif text == "[":
217
+ square_brackets.append(len(res))
218
+ elif weight is not None and len(round_brackets) > 0:
219
+ multiply_range(round_brackets.pop(), float(weight))
220
+ elif text == ")" and len(round_brackets) > 0:
221
+ multiply_range(round_brackets.pop(), round_bracket_multiplier)
222
+ elif text == "]" and len(square_brackets) > 0:
223
+ multiply_range(square_brackets.pop(), square_bracket_multiplier)
224
+ else:
225
+ res.append([text, 1.0])
226
+
227
+ for pos in round_brackets:
228
+ multiply_range(pos, round_bracket_multiplier)
229
+
230
+ for pos in square_brackets:
231
+ multiply_range(pos, square_bracket_multiplier)
232
+
233
+ if len(res) == 0:
234
+ res = [["", 1.0]]
235
+
236
+ # merge runs of identical weights
237
+ i = 0
238
+ while i + 1 < len(res):
239
+ if res[i][1] == res[i + 1][1]:
240
+ res[i][0] += res[i + 1][0]
241
+ res.pop(i + 1)
242
+ else:
243
+ i += 1
244
+
245
+ return res
246
+
247
+
248
+ def get_prompts_with_weights(tokenizer, prompt: List[str], max_length: int):
249
+ r"""
250
+ Tokenize a list of prompts and return its tokens with weights of each token.
251
+
252
+ No padding, starting or ending token is included.
253
+ """
254
+ tokens = []
255
+ weights = []
256
+ truncated = False
257
+ for text in prompt:
258
+ texts_and_weights = parse_prompt_attention(text)
259
+ text_token = []
260
+ text_weight = []
261
+ for word, weight in texts_and_weights:
262
+ # tokenize and discard the starting and the ending token
263
+ token = tokenizer(word).input_ids[1:-1]
264
+ text_token += token
265
+ # copy the weight by length of token
266
+ text_weight += [weight] * len(token)
267
+ # stop if the text is too long (longer than truncation limit)
268
+ if len(text_token) > max_length:
269
+ truncated = True
270
+ break
271
+ # truncate
272
+ if len(text_token) > max_length:
273
+ truncated = True
274
+ text_token = text_token[:max_length]
275
+ text_weight = text_weight[:max_length]
276
+ tokens.append(text_token)
277
+ weights.append(text_weight)
278
+ if truncated:
279
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
280
+ return tokens, weights
281
+
282
+
283
+ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
284
+ r"""
285
+ Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
286
+ """
287
+ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
288
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
289
+ for i in range(len(tokens)):
290
+ tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
291
+ if no_boseos_middle:
292
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
293
+ else:
294
+ w = []
295
+ if len(weights[i]) == 0:
296
+ w = [1.0] * weights_length
297
+ else:
298
+ for j in range(max_embeddings_multiples):
299
+ w.append(1.0) # weight for starting token in this chunk
300
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
301
+ w.append(1.0) # weight for ending token in this chunk
302
+ w += [1.0] * (weights_length - len(w))
303
+ weights[i] = w[:]
304
+
305
+ return tokens, weights
306
+
307
+
308
+ def get_unweighted_text_embeddings(
309
+ tokenizer,
310
+ text_encoder,
311
+ text_input: torch.Tensor,
312
+ chunk_length: int,
313
+ clip_skip: int,
314
+ eos: int,
315
+ pad: int,
316
+ no_boseos_middle: Optional[bool] = True,
317
+ ):
318
+ """
319
+ When the length of tokens is a multiple of the capacity of the text encoder,
320
+ it should be split into chunks and sent to the text encoder individually.
321
+ """
322
+ max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
323
+ if max_embeddings_multiples > 1:
324
+ text_embeddings = []
325
+ for i in range(max_embeddings_multiples):
326
+ # extract the i-th chunk
327
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
328
+
329
+ # cover the head and the tail by the starting and the ending tokens
330
+ text_input_chunk[:, 0] = text_input[0, 0]
331
+ if pad == eos: # v1
332
+ text_input_chunk[:, -1] = text_input[0, -1]
333
+ else: # v2
334
+ for j in range(len(text_input_chunk)):
335
+ if text_input_chunk[j, -1] != eos and text_input_chunk[j, -1] != pad: # 最後に普通の文字がある
336
+ text_input_chunk[j, -1] = eos
337
+ if text_input_chunk[j, 1] == pad: # BOSだけであとはPAD
338
+ text_input_chunk[j, 1] = eos
339
+
340
+ if clip_skip is None or clip_skip == 1:
341
+ text_embedding = text_encoder(text_input_chunk)[0]
342
+ else:
343
+ enc_out = text_encoder(text_input_chunk, output_hidden_states=True, return_dict=True)
344
+ text_embedding = enc_out["hidden_states"][-clip_skip]
345
+ text_embedding = text_encoder.text_model.final_layer_norm(text_embedding)
346
+
347
+ if no_boseos_middle:
348
+ if i == 0:
349
+ # discard the ending token
350
+ text_embedding = text_embedding[:, :-1]
351
+ elif i == max_embeddings_multiples - 1:
352
+ # discard the starting token
353
+ text_embedding = text_embedding[:, 1:]
354
+ else:
355
+ # discard both starting and ending tokens
356
+ text_embedding = text_embedding[:, 1:-1]
357
+
358
+ text_embeddings.append(text_embedding)
359
+ text_embeddings = torch.concat(text_embeddings, axis=1)
360
+ else:
361
+ if clip_skip is None or clip_skip == 1:
362
+ text_embeddings = text_encoder(text_input)[0]
363
+ else:
364
+ enc_out = text_encoder(text_input, output_hidden_states=True, return_dict=True)
365
+ text_embeddings = enc_out["hidden_states"][-clip_skip]
366
+ text_embeddings = text_encoder.text_model.final_layer_norm(text_embeddings)
367
+ return text_embeddings
368
+
369
+
370
+ def get_weighted_text_embeddings(
371
+ tokenizer,
372
+ text_encoder,
373
+ prompt: Union[str, List[str]],
374
+ device,
375
+ max_embeddings_multiples: Optional[int] = 3,
376
+ no_boseos_middle: Optional[bool] = False,
377
+ clip_skip=None,
378
+ ):
379
+ r"""
380
+ Prompts can be assigned with local weights using brackets. For example,
381
+ prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
382
+ and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
383
+
384
+ Also, to regularize the embedding, the weighted embedding is scaled to preserve the original mean.
385
+
386
+ Args:
387
+ prompt (`str` or `List[str]`):
388
+ The prompt or prompts to guide the image generation.
389
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
390
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
391
+ no_boseos_middle (`bool`, *optional*, defaults to `False`):
392
+ If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
393
+ ending token in each of the chunk in the middle.
394
+ skip_parsing (`bool`, *optional*, defaults to `False`):
395
+ Skip the parsing of brackets.
396
+ skip_weighting (`bool`, *optional*, defaults to `False`):
397
+ Skip the weighting. When the parsing is skipped, it is forced True.
398
+ """
399
+ max_length = (tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
400
+ if isinstance(prompt, str):
401
+ prompt = [prompt]
402
+
403
+ prompt_tokens, prompt_weights = get_prompts_with_weights(tokenizer, prompt, max_length - 2)
404
+
405
+ # round up the longest length of tokens to a multiple of (model_max_length - 2)
406
+ max_length = max([len(token) for token in prompt_tokens])
407
+
408
+ max_embeddings_multiples = min(
409
+ max_embeddings_multiples,
410
+ (max_length - 1) // (tokenizer.model_max_length - 2) + 1,
411
+ )
412
+ max_embeddings_multiples = max(1, max_embeddings_multiples)
413
+ max_length = (tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
414
+
415
+ # pad the length of tokens and weights
416
+ bos = tokenizer.bos_token_id
417
+ eos = tokenizer.eos_token_id
418
+ pad = tokenizer.pad_token_id
419
+ prompt_tokens, prompt_weights = pad_tokens_and_weights(
420
+ prompt_tokens,
421
+ prompt_weights,
422
+ max_length,
423
+ bos,
424
+ eos,
425
+ no_boseos_middle=no_boseos_middle,
426
+ chunk_length=tokenizer.model_max_length,
427
+ )
428
+ prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=device)
429
+
430
+ # get the embeddings
431
+ text_embeddings = get_unweighted_text_embeddings(
432
+ tokenizer,
433
+ text_encoder,
434
+ prompt_tokens,
435
+ tokenizer.model_max_length,
436
+ clip_skip,
437
+ eos,
438
+ pad,
439
+ no_boseos_middle=no_boseos_middle,
440
+ )
441
+ prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=device)
442
+
443
+ # assign weights to the prompts and normalize in the sense of mean
444
+ previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
445
+ text_embeddings = text_embeddings * prompt_weights.unsqueeze(-1)
446
+ current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
447
+ text_embeddings = text_embeddings * (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
448
+
449
+ return text_embeddings
450
+
451
+
452
+ # https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2
453
+ def pyramid_noise_like(noise, device, iterations=6, discount=0.4):
454
+ b, c, w, h = noise.shape # EDIT: w and h get over-written, rename for a different variant!
455
+ u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device)
456
+ for i in range(iterations):
457
+ r = random.random() * 2 + 2 # Rather than always going 2x,
458
+ wn, hn = max(1, int(w / (r**i))), max(1, int(h / (r**i)))
459
+ noise += u(torch.randn(b, c, wn, hn).to(device)) * discount**i
460
+ if wn == 1 or hn == 1:
461
+ break # Lowest resolution is 1x1
462
+ return noise / noise.std() # Scaled back to roughly unit variance
463
+
464
+
465
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
466
+ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale):
467
+ if noise_offset is None:
468
+ return noise
469
+ if adaptive_noise_scale is not None:
470
+ # latent shape: (batch_size, channels, height, width)
471
+ # abs mean value for each channel
472
+ latent_mean = torch.abs(latents.mean(dim=(2, 3), keepdim=True))
473
+
474
+ # multiply adaptive noise scale to the mean value and add it to the noise offset
475
+ noise_offset = noise_offset + adaptive_noise_scale * latent_mean
476
+ noise_offset = torch.clamp(noise_offset, 0.0, None) # in case of adaptive noise scale is negative
477
+
478
+ noise = noise + noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device)
479
+ return noise
480
+
481
+
482
+ def apply_masked_loss(loss, batch):
483
+ if "conditioning_images" in batch:
484
+ # conditioning image is -1 to 1. we need to convert it to 0 to 1
485
+ mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel
486
+ mask_image = mask_image / 2 + 0.5
487
+ # print(f"conditioning_image: {mask_image.shape}")
488
+ elif "alpha_masks" in batch and batch["alpha_masks"] is not None:
489
+ # alpha mask is 0 to 1
490
+ mask_image = batch["alpha_masks"].to(dtype=loss.dtype).unsqueeze(1) # add channel dimension
491
+ # print(f"mask_image: {mask_image.shape}, {mask_image.mean()}")
492
+ else:
493
+ return loss
494
+
495
+ # resize to the same size as the loss
496
+ mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area")
497
+ loss = loss * mask_image
498
+ return loss
499
+
500
+
501
+ """
502
+ ##########################################
503
+ # Perlin Noise
504
+ def rand_perlin_2d(device, shape, res, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3):
505
+ delta = (res[0] / shape[0], res[1] / shape[1])
506
+ d = (shape[0] // res[0], shape[1] // res[1])
507
+
508
+ grid = (
509
+ torch.stack(
510
+ torch.meshgrid(torch.arange(0, res[0], delta[0], device=device), torch.arange(0, res[1], delta[1], device=device)),
511
+ dim=-1,
512
+ )
513
+ % 1
514
+ )
515
+ angles = 2 * torch.pi * torch.rand(res[0] + 1, res[1] + 1, device=device)
516
+ gradients = torch.stack((torch.cos(angles), torch.sin(angles)), dim=-1)
517
+
518
+ tile_grads = (
519
+ lambda slice1, slice2: gradients[slice1[0] : slice1[1], slice2[0] : slice2[1]]
520
+ .repeat_interleave(d[0], 0)
521
+ .repeat_interleave(d[1], 1)
522
+ )
523
+ dot = lambda grad, shift: (
524
+ torch.stack((grid[: shape[0], : shape[1], 0] + shift[0], grid[: shape[0], : shape[1], 1] + shift[1]), dim=-1)
525
+ * grad[: shape[0], : shape[1]]
526
+ ).sum(dim=-1)
527
+
528
+ n00 = dot(tile_grads([0, -1], [0, -1]), [0, 0])
529
+ n10 = dot(tile_grads([1, None], [0, -1]), [-1, 0])
530
+ n01 = dot(tile_grads([0, -1], [1, None]), [0, -1])
531
+ n11 = dot(tile_grads([1, None], [1, None]), [-1, -1])
532
+ t = fade(grid[: shape[0], : shape[1]])
533
+ return 1.414 * torch.lerp(torch.lerp(n00, n10, t[..., 0]), torch.lerp(n01, n11, t[..., 0]), t[..., 1])
534
+
535
+
536
+ def rand_perlin_2d_octaves(device, shape, res, octaves=1, persistence=0.5):
537
+ noise = torch.zeros(shape, device=device)
538
+ frequency = 1
539
+ amplitude = 1
540
+ for _ in range(octaves):
541
+ noise += amplitude * rand_perlin_2d(device, shape, (frequency * res[0], frequency * res[1]))
542
+ frequency *= 2
543
+ amplitude *= persistence
544
+ return noise
545
+
546
+
547
+ def perlin_noise(noise, device, octaves):
548
+ _, c, w, h = noise.shape
549
+ perlin = lambda: rand_perlin_2d_octaves(device, (w, h), (4, 4), octaves)
550
+ noise_perlin = []
551
+ for _ in range(c):
552
+ noise_perlin.append(perlin())
553
+ noise_perlin = torch.stack(noise_perlin).unsqueeze(0) # (1, c, w, h)
554
+ noise += noise_perlin # broadcast for each batch
555
+ return noise / noise.std() # Scaled back to roughly unit variance
556
+ """
library/deepspeed_utils.py ADDED
@@ -0,0 +1,139 @@
1
+ import os
2
+ import argparse
3
+ import torch
4
+ from accelerate import DeepSpeedPlugin, Accelerator
5
+
6
+ from .utils import setup_logging
7
+
8
+ setup_logging()
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def add_deepspeed_arguments(parser: argparse.ArgumentParser):
15
+ # DeepSpeed Arguments. https://huggingface.co/docs/accelerate/usage_guides/deepspeed
16
+ parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training")
17
+ parser.add_argument("--zero_stage", type=int, default=2, choices=[0, 1, 2, 3], help="Possible options are 0,1,2,3.")
18
+ parser.add_argument(
19
+ "--offload_optimizer_device",
20
+ type=str,
21
+ default=None,
22
+ choices=[None, "cpu", "nvme"],
23
+ help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.",
24
+ )
25
+ parser.add_argument(
26
+ "--offload_optimizer_nvme_path",
27
+ type=str,
28
+ default=None,
29
+ help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.",
30
+ )
31
+ parser.add_argument(
32
+ "--offload_param_device",
33
+ type=str,
34
+ default=None,
35
+ choices=[None, "cpu", "nvme"],
36
+ help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.",
37
+ )
38
+ parser.add_argument(
39
+ "--offload_param_nvme_path",
40
+ type=str,
41
+ default=None,
42
+ help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.",
43
+ )
44
+ parser.add_argument(
45
+ "--zero3_init_flag",
46
+ action="store_true",
47
+ help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models."
48
+ "Only applicable with ZeRO Stage-3.",
49
+ )
50
+ parser.add_argument(
51
+ "--zero3_save_16bit_model",
52
+ action="store_true",
53
+ help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.",
54
+ )
55
+ parser.add_argument(
56
+ "--fp16_master_weights_and_gradients",
57
+ action="store_true",
58
+ help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32.",
59
+ )
60
+
61
+
62
+ def prepare_deepspeed_args(args: argparse.Namespace):
63
+ if not args.deepspeed:
64
+ return
65
+
66
+ # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1.
67
+ args.max_data_loader_n_workers = 1
68
+
69
+
70
+ def prepare_deepspeed_plugin(args: argparse.Namespace):
71
+ if not args.deepspeed:
72
+ return None
73
+
74
+ try:
75
+ import deepspeed
76
+ except ImportError as e:
77
+ logger.error(
78
+ "deepspeed is not installed. please install deepspeed in your environment with following command. DS_BUILD_OPS=0 pip install deepspeed"
79
+ )
80
+ exit(1)
81
+
82
+ deepspeed_plugin = DeepSpeedPlugin(
83
+ zero_stage=args.zero_stage,
84
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
85
+ gradient_clipping=args.max_grad_norm,
86
+ offload_optimizer_device=args.offload_optimizer_device,
87
+ offload_optimizer_nvme_path=args.offload_optimizer_nvme_path,
88
+ offload_param_device=args.offload_param_device,
89
+ offload_param_nvme_path=args.offload_param_nvme_path,
90
+ zero3_init_flag=args.zero3_init_flag,
91
+ zero3_save_16bit_model=args.zero3_save_16bit_model,
92
+ )
93
+ deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
94
+ deepspeed_plugin.deepspeed_config["train_batch_size"] = 1#(
95
+ # args.train_batch_size * args.gradient_accumulation_steps * int(os.environ["WORLD_SIZE"])
96
+ #)
97
+ deepspeed_plugin.set_mixed_precision(args.mixed_precision)
98
+ if args.mixed_precision.lower() == "fp16":
99
+ deepspeed_plugin.deepspeed_config["fp16"]["initial_scale_power"] = 0 # preventing overflow.
100
+ if args.full_fp16 or args.fp16_master_weights_and_gradients:
101
+ if args.offload_optimizer_device == "cpu" and args.zero_stage == 2:
102
+ deepspeed_plugin.deepspeed_config["fp16"]["fp16_master_weights_and_grads"] = True
103
+ logger.info("[DeepSpeed] full fp16 enable.")
104
+ else:
105
+ logger.info(
106
+ "[DeepSpeed]full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam on ZeRO-2 stage."
107
+ )
108
+
109
+ if args.offload_optimizer_device is not None:
110
+ logger.info("[DeepSpeed] start to manually build cpu_adam.")
111
+ deepspeed.ops.op_builder.CPUAdamBuilder().load()
112
+ logger.info("[DeepSpeed] building cpu_adam done.")
113
+
114
+ return deepspeed_plugin
115
+
116
+
117
+ # Accelerate library does not support multiple models for deepspeed. So, we need to wrap multiple models into a single model.
118
+ def prepare_deepspeed_model(args: argparse.Namespace, **models):
119
+ # remove None from models
120
+ models = {k: v for k, v in models.items() if v is not None}
121
+
122
+ class DeepSpeedWrapper(torch.nn.Module):
123
+ def __init__(self, **kw_models) -> None:
124
+ super().__init__()
125
+ self.models = torch.nn.ModuleDict()
126
+
127
+ for key, model in kw_models.items():
128
+ if isinstance(model, list):
129
+ model = torch.nn.ModuleList(model)
130
+ assert isinstance(
131
+ model, torch.nn.Module
132
+ ), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}"
133
+ self.models.update(torch.nn.ModuleDict({key: model}))
134
+
135
+ def get_models(self):
136
+ return self.models
137
+
138
+ ds_model = DeepSpeedWrapper(**models)
139
+ return ds_model
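
As a side note on the helpers defined above (prepare_deepspeed_args, prepare_deepspeed_plugin, prepare_deepspeed_model): a minimal sketch, not part of the upload, of how they are typically wired into a training script. It assumes the helpers are importable from library.deepspeed_utils and that `args`, `unet`, `text_encoder`, and `optimizer` are placeholders provided by the caller.

    import torch
    from accelerate import Accelerator
    from library.deepspeed_utils import (
        prepare_deepspeed_args,
        prepare_deepspeed_plugin,
        prepare_deepspeed_model,
    )

    def setup_with_deepspeed(args, unet: torch.nn.Module, text_encoder: torch.nn.Module, optimizer):
        prepare_deepspeed_args(args)                       # clamps max_data_loader_n_workers to 1
        deepspeed_plugin = prepare_deepspeed_plugin(args)  # None unless --deepspeed was given

        accelerator = Accelerator(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            mixed_precision=args.mixed_precision,
            deepspeed_plugin=deepspeed_plugin,
        )

        # DeepSpeed via accelerate accepts a single model, so wrap both networks and
        # run the training forward pass through the wrapper.
        ds_model = prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder)
        ds_model, optimizer = accelerator.prepare(ds_model, optimizer)
        return accelerator, ds_model, optimizer

The wrapped sub-modules should remain reachable after preparation, e.g. via accelerator.unwrap_model(ds_model).get_models()["unet"].
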
library/device_utils.py ADDED
@@ -0,0 +1,84 @@
1
+ import functools
2
+ import gc
3
+
4
+ import torch
5
+
6
+ try:
7
+ HAS_CUDA = torch.cuda.is_available()
8
+ except Exception:
9
+ HAS_CUDA = False
10
+
11
+ try:
12
+ HAS_MPS = torch.backends.mps.is_available()
13
+ except Exception:
14
+ HAS_MPS = False
15
+
16
+ try:
17
+ import intel_extension_for_pytorch as ipex # noqa
18
+
19
+ HAS_XPU = torch.xpu.is_available()
20
+ except Exception:
21
+ HAS_XPU = False
22
+
23
+
24
+ def clean_memory():
25
+ gc.collect()
26
+ if HAS_CUDA:
27
+ torch.cuda.empty_cache()
28
+ if HAS_XPU:
29
+ torch.xpu.empty_cache()
30
+ if HAS_MPS:
31
+ torch.mps.empty_cache()
32
+
33
+
34
+ def clean_memory_on_device(device: torch.device):
35
+ r"""
36
+ Clean memory on the specified device, will be called from training scripts.
37
+ """
38
+ gc.collect()
39
+
40
+ # device may be "cuda" or "cuda:0", so check device.type rather than the device itself
41
+ if device.type == "cuda":
42
+ torch.cuda.empty_cache()
43
+ if device.type == "xpu":
44
+ torch.xpu.empty_cache()
45
+ if device.type == "mps":
46
+ torch.mps.empty_cache()
47
+
48
+
49
+ @functools.lru_cache(maxsize=None)
50
+ def get_preferred_device() -> torch.device:
51
+ r"""
52
+ Do not call this function from training scripts. Use accelerator.device instead.
53
+ """
54
+ if HAS_CUDA:
55
+ device = torch.device("cuda")
56
+ elif HAS_XPU:
57
+ device = torch.device("xpu")
58
+ elif HAS_MPS:
59
+ device = torch.device("mps")
60
+ else:
61
+ device = torch.device("cpu")
62
+ print(f"get_preferred_device() -> {device}")
63
+ return device
64
+
65
+
66
+ def init_ipex():
67
+ """
68
+ Apply IPEX's CUDA hijacks using `library.ipex.ipex_init`.
69
+
70
+ This function should run right after importing torch and before doing anything else.
71
+
72
+ If IPEX is not available, this function does nothing.
73
+ """
74
+ try:
75
+ if HAS_XPU:
76
+ from .ipex import ipex_init
77
+
78
+ is_initialized, error_message = ipex_init()
79
+ if not is_initialized:
80
+ print("failed to initialize ipex:", error_message)
81
+ else:
82
+ return
83
+ except Exception as e:
84
+ print("failed to initialize ipex:", e)
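
For context, a minimal sketch (not part of the upload) of how these device helpers are typically used from a script; it assumes the repo root is on the Python path so the module can be imported as library.device_utils, and the tensor shape is an arbitrary example.

    import torch
    from library.device_utils import init_ipex, get_preferred_device, clean_memory_on_device

    init_ipex()  # no-op unless intel_extension_for_pytorch is importable

    device = get_preferred_device()            # prefers cuda, then xpu, then mps, then cpu
    x = torch.randn(4, 3, 512, 512, device=device)
    # ... do some work with x ...
    del x
    clean_memory_on_device(device)             # gc.collect() plus the backend's empty_cache()
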
library/flux_models.py ADDED
@@ -0,0 +1,1060 @@
1
+ # copy from FLUX repo: https://github.com/black-forest-labs/flux
2
+ # license: Apache-2.0 License
3
+
4
+ from dataclasses import dataclass
5
+ import math
6
+ from typing import Dict, List, Optional, Union
7
+
8
+ from .device_utils import init_ipex
9
+ from .custom_offloading_utils import ModelOffloader
10
+ init_ipex()
11
+
12
+ import torch
13
+ from einops import rearrange
14
+ from torch import Tensor, nn
15
+ from torch.utils.checkpoint import checkpoint
16
+
17
+ # USE_REENTRANT = True
18
+
19
+
20
+ @dataclass
21
+ class FluxParams:
22
+ in_channels: int
23
+ vec_in_dim: int
24
+ context_in_dim: int
25
+ hidden_size: int
26
+ mlp_ratio: float
27
+ num_heads: int
28
+ depth: int
29
+ depth_single_blocks: int
30
+ axes_dim: list[int]
31
+ theta: int
32
+ qkv_bias: bool
33
+ guidance_embed: bool
34
+
35
+
36
+ # region autoencoder
37
+
38
+
39
+ @dataclass
40
+ class AutoEncoderParams:
41
+ resolution: int
42
+ in_channels: int
43
+ ch: int
44
+ out_ch: int
45
+ ch_mult: list[int]
46
+ num_res_blocks: int
47
+ z_channels: int
48
+ scale_factor: float
49
+ shift_factor: float
50
+
51
+
52
+ def swish(x: Tensor) -> Tensor:
53
+ return x * torch.sigmoid(x)
54
+
55
+
56
+ class AttnBlock(nn.Module):
57
+ def __init__(self, in_channels: int):
58
+ super().__init__()
59
+ self.in_channels = in_channels
60
+
61
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
62
+
63
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
64
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
65
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
66
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
67
+
68
+ def attention(self, h_: Tensor) -> Tensor:
69
+ h_ = self.norm(h_)
70
+ q = self.q(h_)
71
+ k = self.k(h_)
72
+ v = self.v(h_)
73
+
74
+ b, c, h, w = q.shape
75
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
76
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
77
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
78
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
79
+
80
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ return x + self.proj_out(self.attention(x))
84
+
85
+
86
+ class ResnetBlock(nn.Module):
87
+ def __init__(self, in_channels: int, out_channels: int):
88
+ super().__init__()
89
+ self.in_channels = in_channels
90
+ out_channels = in_channels if out_channels is None else out_channels
91
+ self.out_channels = out_channels
92
+
93
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
94
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
95
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
96
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
97
+ if self.in_channels != self.out_channels:
98
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
99
+
100
+ def forward(self, x):
101
+ h = x
102
+ h = self.norm1(h)
103
+ h = swish(h)
104
+ h = self.conv1(h)
105
+
106
+ h = self.norm2(h)
107
+ h = swish(h)
108
+ h = self.conv2(h)
109
+
110
+ if self.in_channels != self.out_channels:
111
+ x = self.nin_shortcut(x)
112
+
113
+ return x + h
114
+
115
+
116
+ class Downsample(nn.Module):
117
+ def __init__(self, in_channels: int):
118
+ super().__init__()
119
+ # no asymmetric padding in torch conv, must do it ourselves
120
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
121
+
122
+ def forward(self, x: Tensor):
123
+ pad = (0, 1, 0, 1)
124
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
125
+ x = self.conv(x)
126
+ return x
127
+
128
+
129
+ class Upsample(nn.Module):
130
+ def __init__(self, in_channels: int):
131
+ super().__init__()
132
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
133
+
134
+ def forward(self, x: Tensor):
135
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
136
+ x = self.conv(x)
137
+ return x
138
+
139
+
140
+ class Encoder(nn.Module):
141
+ def __init__(
142
+ self,
143
+ resolution: int,
144
+ in_channels: int,
145
+ ch: int,
146
+ ch_mult: list[int],
147
+ num_res_blocks: int,
148
+ z_channels: int,
149
+ ):
150
+ super().__init__()
151
+ self.ch = ch
152
+ self.num_resolutions = len(ch_mult)
153
+ self.num_res_blocks = num_res_blocks
154
+ self.resolution = resolution
155
+ self.in_channels = in_channels
156
+ # downsampling
157
+ self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
158
+
159
+ curr_res = resolution
160
+ in_ch_mult = (1,) + tuple(ch_mult)
161
+ self.in_ch_mult = in_ch_mult
162
+ self.down = nn.ModuleList()
163
+ block_in = self.ch
164
+ for i_level in range(self.num_resolutions):
165
+ block = nn.ModuleList()
166
+ attn = nn.ModuleList()
167
+ block_in = ch * in_ch_mult[i_level]
168
+ block_out = ch * ch_mult[i_level]
169
+ for _ in range(self.num_res_blocks):
170
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
171
+ block_in = block_out
172
+ down = nn.Module()
173
+ down.block = block
174
+ down.attn = attn
175
+ if i_level != self.num_resolutions - 1:
176
+ down.downsample = Downsample(block_in)
177
+ curr_res = curr_res // 2
178
+ self.down.append(down)
179
+
180
+ # middle
181
+ self.mid = nn.Module()
182
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
183
+ self.mid.attn_1 = AttnBlock(block_in)
184
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
185
+
186
+ # end
187
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
188
+ self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
189
+
190
+ def forward(self, x: Tensor) -> Tensor:
191
+ # downsampling
192
+ hs = [self.conv_in(x)]
193
+ for i_level in range(self.num_resolutions):
194
+ for i_block in range(self.num_res_blocks):
195
+ h = self.down[i_level].block[i_block](hs[-1])
196
+ if len(self.down[i_level].attn) > 0:
197
+ h = self.down[i_level].attn[i_block](h)
198
+ hs.append(h)
199
+ if i_level != self.num_resolutions - 1:
200
+ hs.append(self.down[i_level].downsample(hs[-1]))
201
+
202
+ # middle
203
+ h = hs[-1]
204
+ h = self.mid.block_1(h)
205
+ h = self.mid.attn_1(h)
206
+ h = self.mid.block_2(h)
207
+ # end
208
+ h = self.norm_out(h)
209
+ h = swish(h)
210
+ h = self.conv_out(h)
211
+ return h
212
+
213
+
214
+ class Decoder(nn.Module):
215
+ def __init__(
216
+ self,
217
+ ch: int,
218
+ out_ch: int,
219
+ ch_mult: list[int],
220
+ num_res_blocks: int,
221
+ in_channels: int,
222
+ resolution: int,
223
+ z_channels: int,
224
+ ):
225
+ super().__init__()
226
+ self.ch = ch
227
+ self.num_resolutions = len(ch_mult)
228
+ self.num_res_blocks = num_res_blocks
229
+ self.resolution = resolution
230
+ self.in_channels = in_channels
231
+ self.ffactor = 2 ** (self.num_resolutions - 1)
232
+
233
+ # compute in_ch_mult, block_in and curr_res at lowest res
234
+ block_in = ch * ch_mult[self.num_resolutions - 1]
235
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
236
+ self.z_shape = (1, z_channels, curr_res, curr_res)
237
+
238
+ # z to block_in
239
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
240
+
241
+ # middle
242
+ self.mid = nn.Module()
243
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
244
+ self.mid.attn_1 = AttnBlock(block_in)
245
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
246
+
247
+ # upsampling
248
+ self.up = nn.ModuleList()
249
+ for i_level in reversed(range(self.num_resolutions)):
250
+ block = nn.ModuleList()
251
+ attn = nn.ModuleList()
252
+ block_out = ch * ch_mult[i_level]
253
+ for _ in range(self.num_res_blocks + 1):
254
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
255
+ block_in = block_out
256
+ up = nn.Module()
257
+ up.block = block
258
+ up.attn = attn
259
+ if i_level != 0:
260
+ up.upsample = Upsample(block_in)
261
+ curr_res = curr_res * 2
262
+ self.up.insert(0, up) # prepend to get consistent order
263
+
264
+ # end
265
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
266
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
267
+
268
+ def forward(self, z: Tensor) -> Tensor:
269
+ # z to block_in
270
+ h = self.conv_in(z)
271
+
272
+ # middle
273
+ h = self.mid.block_1(h)
274
+ h = self.mid.attn_1(h)
275
+ h = self.mid.block_2(h)
276
+
277
+ # upsampling
278
+ for i_level in reversed(range(self.num_resolutions)):
279
+ for i_block in range(self.num_res_blocks + 1):
280
+ h = self.up[i_level].block[i_block](h)
281
+ if len(self.up[i_level].attn) > 0:
282
+ h = self.up[i_level].attn[i_block](h)
283
+ if i_level != 0:
284
+ h = self.up[i_level].upsample(h)
285
+
286
+ # end
287
+ h = self.norm_out(h)
288
+ h = swish(h)
289
+ h = self.conv_out(h)
290
+ return h
291
+
292
+
293
+ class DiagonalGaussian(nn.Module):
294
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
295
+ super().__init__()
296
+ self.sample = sample
297
+ self.chunk_dim = chunk_dim
298
+
299
+ def forward(self, z: Tensor) -> Tensor:
300
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
301
+ if self.sample:
302
+ std = torch.exp(0.5 * logvar)
303
+ return mean + std * torch.randn_like(mean)
304
+ else:
305
+ return mean
306
+
307
+
308
+ class AutoEncoder(nn.Module):
309
+ def __init__(self, params: AutoEncoderParams):
310
+ super().__init__()
311
+ self.encoder = Encoder(
312
+ resolution=params.resolution,
313
+ in_channels=params.in_channels,
314
+ ch=params.ch,
315
+ ch_mult=params.ch_mult,
316
+ num_res_blocks=params.num_res_blocks,
317
+ z_channels=params.z_channels,
318
+ )
319
+ self.decoder = Decoder(
320
+ resolution=params.resolution,
321
+ in_channels=params.in_channels,
322
+ ch=params.ch,
323
+ out_ch=params.out_ch,
324
+ ch_mult=params.ch_mult,
325
+ num_res_blocks=params.num_res_blocks,
326
+ z_channels=params.z_channels,
327
+ )
328
+ self.reg = DiagonalGaussian()
329
+
330
+ self.scale_factor = params.scale_factor
331
+ self.shift_factor = params.shift_factor
332
+
333
+ @property
334
+ def device(self) -> torch.device:
335
+ return next(self.parameters()).device
336
+
337
+ @property
338
+ def dtype(self) -> torch.dtype:
339
+ return next(self.parameters()).dtype
340
+
341
+ def encode(self, x: Tensor) -> Tensor:
342
+ z = self.reg(self.encoder(x))
343
+ z = self.scale_factor * (z - self.shift_factor)
344
+ return z
345
+
346
+ def decode(self, z: Tensor) -> Tensor:
347
+ z = z / self.scale_factor + self.shift_factor
348
+ return self.decoder(z)
349
+
350
+ def forward(self, x: Tensor) -> Tensor:
351
+ return self.decode(self.encode(x))
352
+
353
+
354
+ # endregion
355
+ # region config
356
+
357
+
358
+ @dataclass
359
+ class ModelSpec:
360
+ params: FluxParams
361
+ ae_params: AutoEncoderParams
362
+ ckpt_path: str | None
363
+ ae_path: str | None
364
+ # repo_id: str | None
365
+ # repo_flow: str | None
366
+ # repo_ae: str | None
367
+
368
+
369
+ configs = {
370
+ "dev": ModelSpec(
371
+ # repo_id="black-forest-labs/FLUX.1-dev",
372
+ # repo_flow="flux1-dev.sft",
373
+ # repo_ae="ae.sft",
374
+ ckpt_path=None, # os.getenv("FLUX_DEV"),
375
+ params=FluxParams(
376
+ in_channels=64,
377
+ vec_in_dim=768,
378
+ context_in_dim=4096,
379
+ hidden_size=3072,
380
+ mlp_ratio=4.0,
381
+ num_heads=24,
382
+ depth=19,
383
+ depth_single_blocks=38,
384
+ axes_dim=[16, 56, 56],
385
+ theta=10_000,
386
+ qkv_bias=True,
387
+ guidance_embed=True,
388
+ ),
389
+ ae_path=None, # os.getenv("AE"),
390
+ ae_params=AutoEncoderParams(
391
+ resolution=256,
392
+ in_channels=3,
393
+ ch=128,
394
+ out_ch=3,
395
+ ch_mult=[1, 2, 4, 4],
396
+ num_res_blocks=2,
397
+ z_channels=16,
398
+ scale_factor=0.3611,
399
+ shift_factor=0.1159,
400
+ ),
401
+ ),
402
+ "schnell": ModelSpec(
403
+ # repo_id="black-forest-labs/FLUX.1-schnell",
404
+ # repo_flow="flux1-schnell.sft",
405
+ # repo_ae="ae.sft",
406
+ ckpt_path=None, # os.getenv("FLUX_SCHNELL"),
407
+ params=FluxParams(
408
+ in_channels=64,
409
+ vec_in_dim=768,
410
+ context_in_dim=4096,
411
+ hidden_size=3072,
412
+ mlp_ratio=4.0,
413
+ num_heads=24,
414
+ depth=19,
415
+ depth_single_blocks=38,
416
+ axes_dim=[16, 56, 56],
417
+ theta=10_000,
418
+ qkv_bias=True,
419
+ guidance_embed=False,
420
+ ),
421
+ ae_path=None, # os.getenv("AE"),
422
+ ae_params=AutoEncoderParams(
423
+ resolution=256,
424
+ in_channels=3,
425
+ ch=128,
426
+ out_ch=3,
427
+ ch_mult=[1, 2, 4, 4],
428
+ num_res_blocks=2,
429
+ z_channels=16,
430
+ scale_factor=0.3611,
431
+ shift_factor=0.1159,
432
+ ),
433
+ ),
434
+ }
435
+
436
+
437
+ # endregion
438
+
439
+ # region math
440
+
441
+
442
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, attn_mask: Optional[Tensor] = None) -> Tensor:
443
+ q, k = apply_rope(q, k, pe)
444
+
445
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
446
+ x = rearrange(x, "B H L D -> B L (H D)")
447
+
448
+ return x
449
+
450
+
451
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
452
+ assert dim % 2 == 0
453
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
454
+ omega = 1.0 / (theta**scale)
455
+ out = torch.einsum("...n,d->...nd", pos, omega)
456
+ out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
457
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
458
+ return out.float()
459
+
460
+
461
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
462
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
463
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
464
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
465
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
466
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
467
+
468
+
469
+ # endregion
470
+
471
+
472
+ # region layers
473
+
474
+
475
+ # for cpu_offload_checkpointing
476
+
477
+
478
+ def to_cuda(x):
479
+ if isinstance(x, torch.Tensor):
480
+ return x.cuda()
481
+ elif isinstance(x, (list, tuple)):
482
+ return [to_cuda(elem) for elem in x]
483
+ elif isinstance(x, dict):
484
+ return {k: to_cuda(v) for k, v in x.items()}
485
+ else:
486
+ return x
487
+
488
+
489
+ def to_cpu(x):
490
+ if isinstance(x, torch.Tensor):
491
+ return x.cpu()
492
+ elif isinstance(x, (list, tuple)):
493
+ return [to_cpu(elem) for elem in x]
494
+ elif isinstance(x, dict):
495
+ return {k: to_cpu(v) for k, v in x.items()}
496
+ else:
497
+ return x
498
+
499
+
500
+ class EmbedND(nn.Module):
501
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
502
+ super().__init__()
503
+ self.dim = dim
504
+ self.theta = theta
505
+ self.axes_dim = axes_dim
506
+
507
+ def forward(self, ids: Tensor) -> Tensor:
508
+ n_axes = ids.shape[-1]
509
+ emb = torch.cat(
510
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
511
+ dim=-3,
512
+ )
513
+
514
+ return emb.unsqueeze(1)
515
+
516
+
517
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
518
+ """
519
+ Create sinusoidal timestep embeddings.
520
+ :param t: a 1-D Tensor of N indices, one per batch element.
521
+ These may be fractional.
522
+ :param dim: the dimension of the output.
523
+ :param max_period: controls the minimum frequency of the embeddings.
524
+ :return: an (N, D) Tensor of positional embeddings.
525
+ """
526
+ t = time_factor * t
527
+ half = dim // 2
528
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device)
529
+
530
+ args = t[:, None].float() * freqs[None]
531
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
532
+ if dim % 2:
533
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
534
+ if torch.is_floating_point(t):
535
+ embedding = embedding.to(t)
536
+ return embedding
537
+
538
+
539
+ class MLPEmbedder(nn.Module):
540
+ def __init__(self, in_dim: int, hidden_dim: int):
541
+ super().__init__()
542
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
543
+ self.silu = nn.SiLU()
544
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
545
+
546
+ self.gradient_checkpointing = False
547
+
548
+ def enable_gradient_checkpointing(self):
549
+ self.gradient_checkpointing = True
550
+
551
+ def disable_gradient_checkpointing(self):
552
+ self.gradient_checkpointing = False
553
+
554
+ def _forward(self, x: Tensor) -> Tensor:
555
+ return self.out_layer(self.silu(self.in_layer(x)))
556
+
557
+ def forward(self, *args, **kwargs):
558
+ if self.training and self.gradient_checkpointing:
559
+ return checkpoint(self._forward, *args, use_reentrant=False, **kwargs)
560
+ else:
561
+ return self._forward(*args, **kwargs)
562
+
563
+ # def forward(self, x):
564
+ # if self.training and self.gradient_checkpointing:
565
+ # def create_custom_forward(func):
566
+ # def custom_forward(*inputs):
567
+ # return func(*inputs)
568
+ # return custom_forward
569
+ # return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, use_reentrant=USE_REENTRANT)
570
+ # else:
571
+ # return self._forward(x)
572
+
573
+
574
+ class RMSNorm(torch.nn.Module):
575
+ def __init__(self, dim: int):
576
+ super().__init__()
577
+ self.scale = nn.Parameter(torch.ones(dim))
578
+
579
+ def forward(self, x: Tensor):
580
+ x_dtype = x.dtype
581
+ x = x.float()
582
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
583
+ # return (x * rrms).to(dtype=x_dtype) * self.scale
584
+ return ((x * rrms) * self.scale.float()).to(dtype=x_dtype)
585
+
586
+
587
+ class QKNorm(torch.nn.Module):
588
+ def __init__(self, dim: int):
589
+ super().__init__()
590
+ self.query_norm = RMSNorm(dim)
591
+ self.key_norm = RMSNorm(dim)
592
+
593
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
594
+ q = self.query_norm(q)
595
+ k = self.key_norm(k)
596
+ return q.to(v), k.to(v)
597
+
598
+
599
+ class SelfAttention(nn.Module):
600
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
601
+ super().__init__()
602
+ self.num_heads = num_heads
603
+ head_dim = dim // num_heads
604
+
605
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
606
+ self.norm = QKNorm(head_dim)
607
+ self.proj = nn.Linear(dim, dim)
608
+
609
+ # this is not called from DoubleStreamBlock/SingleStreamBlock because they use the attention function directly
610
+ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
611
+ qkv = self.qkv(x)
612
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
613
+ q, k = self.norm(q, k, v)
614
+ x = attention(q, k, v, pe=pe)
615
+ x = self.proj(x)
616
+ return x
617
+
618
+
619
+ @dataclass
620
+ class ModulationOut:
621
+ shift: Tensor
622
+ scale: Tensor
623
+ gate: Tensor
624
+
625
+
626
+ class Modulation(nn.Module):
627
+ def __init__(self, dim: int, double: bool):
628
+ super().__init__()
629
+ self.is_double = double
630
+ self.multiplier = 6 if double else 3
631
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
632
+
633
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
634
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
635
+
636
+ return (
637
+ ModulationOut(*out[:3]),
638
+ ModulationOut(*out[3:]) if self.is_double else None,
639
+ )
640
+
641
+
642
+ class DoubleStreamBlock(nn.Module):
643
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
644
+ super().__init__()
645
+
646
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
647
+ self.num_heads = num_heads
648
+ self.hidden_size = hidden_size
649
+ self.img_mod = Modulation(hidden_size, double=True)
650
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
651
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
652
+
653
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
654
+ self.img_mlp = nn.Sequential(
655
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
656
+ nn.GELU(approximate="tanh"),
657
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
658
+ )
659
+
660
+ self.txt_mod = Modulation(hidden_size, double=True)
661
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
662
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
663
+
664
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
665
+ self.txt_mlp = nn.Sequential(
666
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
667
+ nn.GELU(approximate="tanh"),
668
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
669
+ )
670
+
671
+ self.gradient_checkpointing = False
672
+ self.cpu_offload_checkpointing = False
673
+
674
+ def enable_gradient_checkpointing(self, cpu_offload: bool = False):
675
+ self.gradient_checkpointing = True
676
+ self.cpu_offload_checkpointing = cpu_offload
677
+
678
+ def disable_gradient_checkpointing(self):
679
+ self.gradient_checkpointing = False
680
+ self.cpu_offload_checkpointing = False
681
+
682
+ def _forward(
683
+ self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None
684
+ ) -> tuple[Tensor, Tensor]:
685
+ img_mod1, img_mod2 = self.img_mod(vec)
686
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
687
+
688
+ # prepare image for attention
689
+ img_modulated = self.img_norm1(img)
690
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
691
+ img_qkv = self.img_attn.qkv(img_modulated)
692
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
693
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
694
+
695
+ # prepare txt for attention
696
+ txt_modulated = self.txt_norm1(txt)
697
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
698
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
699
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
700
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
701
+
702
+ # run actual attention
703
+ q = torch.cat((txt_q, img_q), dim=2)
704
+ k = torch.cat((txt_k, img_k), dim=2)
705
+ v = torch.cat((txt_v, img_v), dim=2)
706
+
707
+ # make attention mask if not None
708
+ attn_mask = None
709
+ if txt_attention_mask is not None:
710
+ # F.scaled_dot_product_attention expects attn_mask to be bool for binary mask
711
+ attn_mask = txt_attention_mask.to(torch.bool) # b, seq_len
712
+ attn_mask = torch.cat(
713
+ (attn_mask, torch.ones(attn_mask.shape[0], img.shape[1], device=attn_mask.device, dtype=torch.bool)), dim=1
714
+ ) # b, seq_len + img_len
715
+
716
+ # broadcast attn_mask to all heads
717
+ attn_mask = attn_mask[:, None, None, :].expand(-1, q.shape[1], q.shape[2], -1)
718
+
719
+ attn = attention(q, k, v, pe=pe, attn_mask=attn_mask)
720
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
721
+
722
+ # calculate the img blocks
723
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
724
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
725
+
726
+ # calculate the txt blocks
727
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
728
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
729
+ return img, txt
730
+
731
+ def forward(
732
+ self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None
733
+ ) -> tuple[Tensor, Tensor]:
734
+ if self.training and self.gradient_checkpointing:
735
+ if not self.cpu_offload_checkpointing:
736
+ return checkpoint(self._forward, img, txt, vec, pe, txt_attention_mask, use_reentrant=False)
737
+ # cpu offload checkpointing
738
+
739
+ def create_custom_forward(func):
740
+ def custom_forward(*inputs):
741
+ cuda_inputs = to_cuda(inputs)
742
+ outputs = func(*cuda_inputs)
743
+ return to_cpu(outputs)
744
+
745
+ return custom_forward
746
+
747
+ return torch.utils.checkpoint.checkpoint(
748
+ create_custom_forward(self._forward), img, txt, vec, pe, txt_attention_mask, use_reentrant=False
749
+ )
750
+
751
+ else:
752
+ return self._forward(img, txt, vec, pe, txt_attention_mask)
753
+
754
+
755
+ class SingleStreamBlock(nn.Module):
756
+ """
757
+ A DiT block with parallel linear layers as described in
758
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
759
+ """
760
+
761
+ def __init__(
762
+ self,
763
+ hidden_size: int,
764
+ num_heads: int,
765
+ mlp_ratio: float = 4.0,
766
+ qk_scale: float | None = None,
767
+ ):
768
+ super().__init__()
769
+ self.hidden_dim = hidden_size
770
+ self.num_heads = num_heads
771
+ head_dim = hidden_size // num_heads
772
+ self.scale = qk_scale or head_dim**-0.5
773
+
774
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
775
+ # qkv and mlp_in
776
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
777
+ # proj and mlp_out
778
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
779
+
780
+ self.norm = QKNorm(head_dim)
781
+
782
+ self.hidden_size = hidden_size
783
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
784
+
785
+ self.mlp_act = nn.GELU(approximate="tanh")
786
+ self.modulation = Modulation(hidden_size, double=False)
787
+
788
+ self.gradient_checkpointing = False
789
+ self.cpu_offload_checkpointing = False
790
+
791
+ def enable_gradient_checkpointing(self, cpu_offload: bool = False):
792
+ self.gradient_checkpointing = True
793
+ self.cpu_offload_checkpointing = cpu_offload
794
+
795
+ def disable_gradient_checkpointing(self):
796
+ self.gradient_checkpointing = False
797
+ self.cpu_offload_checkpointing = False
798
+
799
+ def _forward(self, x: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None) -> Tensor:
800
+ mod, _ = self.modulation(vec)
801
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
802
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
803
+
804
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
805
+ q, k = self.norm(q, k, v)
806
+
807
+ # make attention mask if not None
808
+ attn_mask = None
809
+ if txt_attention_mask is not None:
810
+ # F.scaled_dot_product_attention expects attn_mask to be bool for binary mask
811
+ attn_mask = txt_attention_mask.to(torch.bool) # b, seq_len
812
+ attn_mask = torch.cat(
813
+ (
814
+ attn_mask,
815
+ torch.ones(
816
+ attn_mask.shape[0], x.shape[1] - txt_attention_mask.shape[1], device=attn_mask.device, dtype=torch.bool
817
+ ),
818
+ ),
819
+ dim=1,
820
+ ) # b, seq_len + img_len = x_len
821
+
822
+ # broadcast attn_mask to all heads
823
+ attn_mask = attn_mask[:, None, None, :].expand(-1, q.shape[1], q.shape[2], -1)
824
+
825
+ # compute attention
826
+ attn = attention(q, k, v, pe=pe, attn_mask=attn_mask)
827
+
828
+ # compute activation in mlp stream, cat again and run second linear layer
829
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
830
+ return x + mod.gate * output
831
+
832
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None) -> Tensor:
833
+ if self.training and self.gradient_checkpointing:
834
+ if not self.cpu_offload_checkpointing:
835
+ return checkpoint(self._forward, x, vec, pe, txt_attention_mask, use_reentrant=False)
836
+
837
+ # cpu offload checkpointing
838
+
839
+ def create_custom_forward(func):
840
+ def custom_forward(*inputs):
841
+ cuda_inputs = to_cuda(inputs)
842
+ outputs = func(*cuda_inputs)
843
+ return to_cpu(outputs)
844
+
845
+ return custom_forward
846
+
847
+ return torch.utils.checkpoint.checkpoint(
848
+ create_custom_forward(self._forward), x, vec, pe, txt_attention_mask, use_reentrant=False
849
+ )
850
+ else:
851
+ return self._forward(x, vec, pe, txt_attention_mask)
852
+
853
+
854
+ class LastLayer(nn.Module):
855
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
856
+ super().__init__()
857
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
858
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
859
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
860
+
861
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
862
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
863
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
864
+ x = self.linear(x)
865
+ return x
866
+
867
+
868
+ # endregion
869
+
870
+
871
+ class Flux(nn.Module):
872
+ """
873
+ Transformer model for flow matching on sequences.
874
+ """
875
+
876
+ def __init__(self, params: FluxParams):
877
+ super().__init__()
878
+
879
+ self.params = params
880
+ self.in_channels = params.in_channels
881
+ self.out_channels = self.in_channels
882
+ if params.hidden_size % params.num_heads != 0:
883
+ raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}")
884
+ pe_dim = params.hidden_size // params.num_heads
885
+ if sum(params.axes_dim) != pe_dim:
886
+ raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
887
+ self.hidden_size = params.hidden_size
888
+ self.num_heads = params.num_heads
889
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
890
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
891
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
892
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
893
+ self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
894
+ self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
895
+
896
+ self.double_blocks = nn.ModuleList(
897
+ [
898
+ DoubleStreamBlock(
899
+ self.hidden_size,
900
+ self.num_heads,
901
+ mlp_ratio=params.mlp_ratio,
902
+ qkv_bias=params.qkv_bias,
903
+ )
904
+ for _ in range(params.depth)
905
+ ]
906
+ )
907
+
908
+ self.single_blocks = nn.ModuleList(
909
+ [
910
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
911
+ for _ in range(params.depth_single_blocks)
912
+ ]
913
+ )
914
+
915
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
916
+
917
+ self.gradient_checkpointing = False
918
+ self.cpu_offload_checkpointing = False
919
+ self.blocks_to_swap = None
920
+
921
+ self.offloader_double = None
922
+ self.offloader_single = None
923
+ self.num_double_blocks = len(self.double_blocks)
924
+ self.num_single_blocks = len(self.single_blocks)
925
+
926
+ @property
927
+ def device(self):
928
+ return next(self.parameters()).device
929
+
930
+ @property
931
+ def dtype(self):
932
+ return next(self.parameters()).dtype
933
+
934
+ def enable_gradient_checkpointing(self, cpu_offload: bool = False):
935
+ self.gradient_checkpointing = True
936
+ self.cpu_offload_checkpointing = cpu_offload
937
+
938
+ self.time_in.enable_gradient_checkpointing()
939
+ self.vector_in.enable_gradient_checkpointing()
940
+ if self.guidance_in.__class__ != nn.Identity:
941
+ self.guidance_in.enable_gradient_checkpointing()
942
+
943
+ for block in self.double_blocks + self.single_blocks:
944
+ block.enable_gradient_checkpointing(cpu_offload=cpu_offload)
945
+
946
+ print(f"FLUX: Gradient checkpointing enabled. CPU offload: {cpu_offload}")
947
+
948
+ def disable_gradient_checkpointing(self):
949
+ self.gradient_checkpointing = False
950
+ self.cpu_offload_checkpointing = False
951
+
952
+ self.time_in.disable_gradient_checkpointing()
953
+ self.vector_in.disable_gradient_checkpointing()
954
+ if self.guidance_in.__class__ != nn.Identity:
955
+ self.guidance_in.disable_gradient_checkpointing()
956
+
957
+ for block in self.double_blocks + self.single_blocks:
958
+ block.disable_gradient_checkpointing()
959
+
960
+ print("FLUX: Gradient checkpointing disabled.")
961
+
962
+ def enable_block_swap(self, num_blocks: int, device: torch.device):
963
+ self.blocks_to_swap = num_blocks
964
+ double_blocks_to_swap = num_blocks // 2
965
+ single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2
966
+
967
+ assert double_blocks_to_swap <= self.num_double_blocks - 2 and single_blocks_to_swap <= self.num_single_blocks - 2, (
968
+ f"Cannot swap more than {self.num_double_blocks - 2} double blocks and {self.num_single_blocks - 2} single blocks. "
969
+ f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
970
+ )
971
+
972
+ self.offloader_double = ModelOffloader(
973
+ self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True
974
+ )
975
+ self.offloader_single = ModelOffloader(
976
+ self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True
977
+ )
978
+ print(
979
+ f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}."
980
+ )
981
+
982
+ def move_to_device_except_swap_blocks(self, device: torch.device):
983
+ # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
984
+ if self.blocks_to_swap:
985
+ save_double_blocks = self.double_blocks
986
+ save_single_blocks = self.single_blocks
987
+ self.double_blocks = None
988
+ self.single_blocks = None
989
+
990
+ self.to(device)
991
+
992
+ if self.blocks_to_swap:
993
+ self.double_blocks = save_double_blocks
994
+ self.single_blocks = save_single_blocks
995
+
996
+ def prepare_block_swap_before_forward(self):
997
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
998
+ return
999
+ self.offloader_double.prepare_block_devices_before_forward(self.double_blocks)
1000
+ self.offloader_single.prepare_block_devices_before_forward(self.single_blocks)
1001
+
1002
+ def forward(
1003
+ self,
1004
+ img: Tensor,
1005
+ img_ids: Tensor,
1006
+ txt: Tensor,
1007
+ txt_ids: Tensor,
1008
+ timesteps: Tensor,
1009
+ y: Tensor,
1010
+ guidance: Tensor | None = None,
1011
+ txt_attention_mask: Tensor | None = None,
1012
+ ) -> Tensor:
1013
+ if img.ndim != 3 or txt.ndim != 3:
1014
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
1015
+
1016
+ # running on sequences img
1017
+ img = self.img_in(img)
1018
+ vec = self.time_in(timestep_embedding(timesteps, 256))
1019
+ if self.params.guidance_embed:
1020
+ if guidance is None:
1021
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
1022
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
1023
+ vec = vec + self.vector_in(y)
1024
+ txt = self.txt_in(txt)
1025
+
1026
+ ids = torch.cat((txt_ids, img_ids), dim=1)
1027
+ pe = self.pe_embedder(ids)
1028
+
1029
+ if not self.blocks_to_swap:
1030
+ for block in self.double_blocks:
1031
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1032
+ img = torch.cat((txt, img), 1)
1033
+ for block in self.single_blocks:
1034
+ img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1035
+ else:
1036
+ for block_idx, block in enumerate(self.double_blocks):
1037
+ self.offloader_double.wait_for_block(block_idx)
1038
+
1039
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1040
+
1041
+ self.offloader_double.submit_move_blocks(self.double_blocks, block_idx)
1042
+
1043
+ img = torch.cat((txt, img), 1)
1044
+
1045
+ for block_idx, block in enumerate(self.single_blocks):
1046
+ self.offloader_single.wait_for_block(block_idx)
1047
+
1048
+ img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask)
1049
+
1050
+ self.offloader_single.submit_move_blocks(self.single_blocks, block_idx)
1051
+
1052
+ img = img[:, txt.shape[1] :, ...]
1053
+
1054
+ if self.training and self.cpu_offload_checkpointing:
1055
+ img = img.to(self.device)
1056
+ vec = vec.to(self.device)
1057
+
1058
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
1059
+
1060
+ return img
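
A minimal sketch (not part of the upload) of how the Flux class above can be instantiated from the bundled config table and put into its memory-saving modes; weight loading and the actual forward pass are omitted, and a CUDA device is assumed for block swapping.

    import torch
    from library.flux_models import Flux, configs

    model = Flux(configs["dev"].params)           # FLUX.1 dev geometry: 19 double / 38 single blocks

    model.enable_gradient_checkpointing(cpu_offload=False)

    # swap 8 blocks between CPU and GPU to cut VRAM: 4 double blocks, 8 single blocks
    model.enable_block_swap(num_blocks=8, device=torch.device("cuda"))
    model.move_to_device_except_swap_blocks(torch.device("cuda"))
    model.prepare_block_swap_before_forward()     # called before each forward pass when swapping
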
library/flux_train_utils.py ADDED
@@ -0,0 +1,585 @@
1
+ import argparse
2
+ import math
3
+ import os
4
+ import numpy as np
5
+ import toml
6
+ import json
7
+ import time
8
+ from typing import Callable, Dict, List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ from accelerate import Accelerator, PartialState
12
+ from transformers import CLIPTextModel
13
+ from tqdm import tqdm
14
+ from PIL import Image
15
+
16
+ from safetensors.torch import save_file
17
+ from . import flux_models, flux_utils, strategy_base, train_util
18
+ from .device_utils import init_ipex, clean_memory_on_device
19
+
20
+ init_ipex()
21
+
22
+ from .utils import setup_logging, mem_eff_save_file
23
+
24
+ setup_logging()
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+ # from comfy.utils import ProgressBar
29
+
30
+ def sample_images(
31
+ accelerator: Accelerator,
32
+ args: argparse.Namespace,
33
+ epoch,
34
+ steps,
35
+ flux,
36
+ ae,
37
+ text_encoders,
38
+ sample_prompts_te_outputs,
39
+ validation_settings=None,
40
+ prompt_replacement=None,
41
+ ):
42
+
43
+ logger.info("")
44
+ logger.info(f"generating sample images at step: {steps}")
45
+
46
+ #distributed_state = PartialState() # for multi gpu distributed inference. this is a singleton, so it's safe to use it here
47
+
48
+ # unwrap unet and text_encoder(s)
49
+ flux = accelerator.unwrap_model(flux)
50
+ if text_encoders is not None:
51
+ text_encoders = [accelerator.unwrap_model(te) for te in text_encoders]
52
+ # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders])
53
+
54
+ prompts = []
55
+ for line in args.sample_prompts:
56
+ line = line.strip()
57
+ if len(line) > 0 and line[0] != "#":
58
+ prompts.append(line)
59
+
60
+ # preprocess prompts
61
+ for i in range(len(prompts)):
62
+ prompt_dict = prompts[i]
63
+ if isinstance(prompt_dict, str):
64
+ from .train_util import line_to_prompt_dict
65
+
66
+ prompt_dict = line_to_prompt_dict(prompt_dict)
67
+ prompts[i] = prompt_dict
68
+ assert isinstance(prompt_dict, dict)
69
+
70
+ # Add an index to the dict based on prompt position; it is used later to name image files. Also clean up extra data in the original prompt dict.
71
+ prompt_dict["enum"] = i
72
+ prompt_dict.pop("subset", None)
73
+
74
+ save_dir = args.output_dir + "/sample"
75
+ os.makedirs(save_dir, exist_ok=True)
76
+
77
+ # save random state to restore later
78
+ rng_state = torch.get_rng_state()
79
+ cuda_rng_state = None
80
+ try:
81
+ cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
82
+ except Exception:
83
+ pass
84
+
85
+ with torch.no_grad(), accelerator.autocast():
86
+ image_tensor_list = []
87
+ for prompt_dict in prompts:
88
+ image_tensor = sample_image_inference(
89
+ accelerator,
90
+ args,
91
+ flux,
92
+ text_encoders,
93
+ ae,
94
+ save_dir,
95
+ prompt_dict,
96
+ epoch,
97
+ steps,
98
+ sample_prompts_te_outputs,
99
+ prompt_replacement,
100
+ validation_settings
101
+ )
102
+ image_tensor_list.append(image_tensor)
103
+
104
+ torch.set_rng_state(rng_state)
105
+ if cuda_rng_state is not None:
106
+ torch.cuda.set_rng_state(cuda_rng_state)
107
+
108
+ clean_memory_on_device(accelerator.device)
109
+ return torch.cat(image_tensor_list, dim=0)
110
+
111
+
112
+ def sample_image_inference(
113
+ accelerator: Accelerator,
114
+ args: argparse.Namespace,
115
+ flux: flux_models.Flux,
116
+ text_encoders: Optional[List[CLIPTextModel]],
117
+ ae: flux_models.AutoEncoder,
118
+ save_dir,
119
+ prompt_dict,
120
+ epoch,
121
+ steps,
122
+ sample_prompts_te_outputs,
123
+ prompt_replacement,
124
+ validation_settings=None
125
+ ):
126
+ assert isinstance(prompt_dict, dict)
127
+ # negative_prompt = prompt_dict.get("negative_prompt")
128
+ if validation_settings is not None:
129
+ sample_steps = validation_settings["steps"]
130
+ width = validation_settings["width"]
131
+ height = validation_settings["height"]
132
+ scale = validation_settings["guidance_scale"]
133
+ seed = validation_settings["seed"]
134
+ base_shift = validation_settings["base_shift"]
135
+ max_shift = validation_settings["max_shift"]
136
+ shift = validation_settings["shift"]
137
+ else:
138
+ sample_steps = prompt_dict.get("sample_steps", 20)
139
+ width = prompt_dict.get("width", 512)
140
+ height = prompt_dict.get("height", 512)
141
+ scale = prompt_dict.get("scale", 3.5)
142
+ seed = prompt_dict.get("seed")
143
+ base_shift = 0.5
144
+ max_shift = 1.15
145
+ shift = True
146
+ # controlnet_image = prompt_dict.get("controlnet_image")
147
+ prompt: str = prompt_dict.get("prompt", "")
148
+ # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler)
149
+
150
+ if prompt_replacement is not None:
151
+ prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1])
152
+ # if negative_prompt is not None:
153
+ # negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1])
154
+
155
+ if seed is not None:
156
+ torch.manual_seed(seed)
157
+ torch.cuda.manual_seed(seed)
158
+ else:
159
+ # True random sample image generation
160
+ torch.seed()
161
+ torch.cuda.seed()
162
+
163
+ # if negative_prompt is None:
164
+ # negative_prompt = ""
165
+
166
+ height = max(64, height - height % 16) # round to divisible by 16
167
+ width = max(64, width - width % 16) # round to divisible by 16
168
+ logger.info(f"prompt: {prompt}")
169
+ # logger.info(f"negative_prompt: {negative_prompt}")
170
+ logger.info(f"height: {height}")
171
+ logger.info(f"width: {width}")
172
+ logger.info(f"sample_steps: {sample_steps}")
173
+ logger.info(f"scale: {scale}")
174
+ # logger.info(f"sample_sampler: {sampler_name}")
175
+ if seed is not None:
176
+ logger.info(f"seed: {seed}")
177
+
178
+ # encode prompts
179
+ tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy()
180
+ encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy()
181
+
182
+ text_encoder_conds = []
183
+ if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs:
184
+ text_encoder_conds = sample_prompts_te_outputs[prompt]
185
+ print(f"Using cached text encoder outputs for prompt: {prompt}")
186
+ if text_encoders is not None:
187
+ print(f"Encoding prompt: {prompt}")
188
+ tokens_and_masks = tokenize_strategy.tokenize(prompt)
189
+ # strategy has apply_t5_attn_mask option
190
+ encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks)
191
+
192
+ # if text_encoder_conds is not cached, use encoded_text_encoder_conds
193
+ if len(text_encoder_conds) == 0:
194
+ text_encoder_conds = encoded_text_encoder_conds
195
+ else:
196
+ # if encoded_text_encoder_conds is not None, update cached text_encoder_conds
197
+ for i in range(len(encoded_text_encoder_conds)):
198
+ if encoded_text_encoder_conds[i] is not None:
199
+ text_encoder_conds[i] = encoded_text_encoder_conds[i]
200
+
201
+ l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds
202
+
203
+ # sample image
204
+ weight_dtype = ae.dtype # TODO: pass dtype as an argument
205
+ packed_latent_height = height // 16
206
+ packed_latent_width = width // 16
207
+ noise = torch.randn(
208
+ 1,
209
+ packed_latent_height * packed_latent_width,
210
+ 16 * 2 * 2,
211
+ device=accelerator.device,
212
+ dtype=weight_dtype,
213
+ generator=torch.Generator(device=accelerator.device).manual_seed(seed) if seed is not None else None,
214
+ )
215
+ timesteps = get_schedule(sample_steps, noise.shape[1], base_shift=base_shift, max_shift=max_shift, shift=shift) # FLUX.1 dev -> shift=True
216
+ #print("TIMESTEPS: ", timesteps)
217
+ img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype)
218
+ t5_attn_mask = t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask else None
219
+
220
+ with accelerator.autocast(), torch.no_grad():
221
+ x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask)
222
+
223
+ x = x.float()
224
+ x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width)
225
+
226
+ # latent to image
227
+ clean_memory_on_device(accelerator.device)
228
+ org_vae_device = ae.device # will be on cpu
229
+ ae.to(accelerator.device) # distributed_state.device is same as accelerator.device
230
+ with accelerator.autocast(), torch.no_grad():
231
+ x = ae.decode(x)
232
+ ae.to(org_vae_device)
233
+ clean_memory_on_device(accelerator.device)
234
+
235
+ x = x.clamp(-1, 1)
236
+ x = x.permute(0, 2, 3, 1)
237
+ image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0])
238
+
239
+ # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list
240
+ # but adding 'enum' to the filename should be enough
241
+
242
+ ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
243
+ num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
244
+ seed_suffix = "" if seed is None else f"_{seed}"
245
+ i: int = prompt_dict["enum"]
246
+ img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png"
247
+ image.save(os.path.join(save_dir, img_filename))
248
+ return x
249
+
250
+ # send logs only when wandb is enabled
251
+ # try:
252
+ # wandb_tracker = accelerator.get_tracker("wandb")
253
+ # try:
254
+ # import wandb
255
+ # except ImportError: # already checked earlier, so this should not raise here
256
+ # raise ImportError("No wandb / wandb does not appear to be installed")
257
+
258
+ # wandb_tracker.log({f"sample_{i}": wandb.Image(image)})
259
+ # except: # when wandb is disabled
260
+ # pass
261
+
262
+
263
+ def time_shift(mu: float, sigma: float, t: torch.Tensor):
264
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
265
+
266
+
267
+ def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]:
268
+ m = (y2 - y1) / (x2 - x1)
269
+ b = y1 - m * x1
270
+ return lambda x: m * x + b
271
+
272
+
273
+ def get_schedule(
274
+ num_steps: int,
275
+ image_seq_len: int,
276
+ base_shift: float = 0.5,
277
+ max_shift: float = 1.15,
278
+ shift: bool = True,
279
+ ) -> list[float]:
280
+ # extra step for zero
281
+ timesteps = torch.linspace(1, 0, num_steps + 1)
282
+
283
+ # shifting the schedule to favor high timesteps for higher signal images
284
+ if shift:
285
+ # estimate mu by linear interpolation between two points
286
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
287
+ timesteps = time_shift(mu, 1.0, timesteps)
288
+
289
+ return timesteps.tolist()
290
+
291
+
292
+ def denoise(
293
+ model: flux_models.Flux,
294
+ img: torch.Tensor,
295
+ img_ids: torch.Tensor,
296
+ txt: torch.Tensor,
297
+ txt_ids: torch.Tensor,
298
+ vec: torch.Tensor,
299
+ timesteps: list[float],
300
+ guidance: float = 4.0,
301
+ t5_attn_mask: Optional[torch.Tensor] = None,
302
+ ):
303
+ # this is ignored for schnell
304
+ guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
305
+ # comfy_pbar = ProgressBar(total=len(timesteps))
306
+ for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]):
307
+ t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
308
+ model.prepare_block_swap_before_forward()
309
+ pred = model(
310
+ img=img,
311
+ img_ids=img_ids,
312
+ txt=txt,
313
+ txt_ids=txt_ids,
314
+ y=vec,
315
+ timesteps=t_vec,
316
+ guidance=guidance_vec,
317
+ txt_attention_mask=t5_attn_mask,
318
+ )
319
+
320
+ img = img + (t_prev - t_curr) * pred
321
+ # comfy_pbar.update(1)
322
+ model.prepare_block_swap_before_forward()
323
+ return img
324
+
325
+ # endregion
326
+
327
+
328
+ # region train
329
+ def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
330
+ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
331
+ schedule_timesteps = noise_scheduler.timesteps.to(device)
332
+ timesteps = timesteps.to(device)
333
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
334
+
335
+ sigma = sigmas[step_indices].flatten()
336
+ while len(sigma.shape) < n_dim:
337
+ sigma = sigma.unsqueeze(-1)
338
+ return sigma
339
+
340
+
341
+ def compute_density_for_timestep_sampling(
342
+ weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
343
+ ):
344
+ """Compute the density for sampling the timesteps when doing SD3 training.
345
+
346
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
347
+
348
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
349
+ """
350
+ if weighting_scheme == "logit_normal":
351
+ # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
352
+ u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
353
+ u = torch.nn.functional.sigmoid(u)
354
+ elif weighting_scheme == "mode":
355
+ u = torch.rand(size=(batch_size,), device="cpu")
356
+ u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
357
+ else:
358
+ u = torch.rand(size=(batch_size,), device="cpu")
359
+ return u
360
+
361
+
362
+ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
363
+ """Computes loss weighting scheme for SD3 training.
364
+
365
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
366
+
367
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
368
+ """
369
+ if weighting_scheme == "sigma_sqrt":
370
+ weighting = (sigmas**-2.0).float()
371
+ elif weighting_scheme == "cosmap":
372
+ bot = 1 - 2 * sigmas + 2 * sigmas**2
373
+ weighting = 2 / (math.pi * bot)
374
+ else:
375
+ weighting = torch.ones_like(sigmas)
376
+ return weighting
377
+
378
+
379
+ def get_noisy_model_input_and_timesteps(
380
+ args, noise_scheduler, latents, noise, device, dtype
381
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
382
+ bsz, _, H, W = latents.shape
383
+ sigmas = None
384
+
385
+ if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid":
386
+ # Simple random t-based noise sampling
387
+ if args.timestep_sampling == "sigmoid":
388
+ # https://github.com/XLabs-AI/x-flux/tree/main
389
+ t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device))
390
+ else:
391
+ t = torch.rand((bsz,), device=device)
392
+
393
+ timesteps = t * 1000.0
394
+ t = t.view(-1, 1, 1, 1)
395
+ noisy_model_input = (1 - t) * latents + t * noise
396
+ elif args.timestep_sampling == "shift":
397
+ shift = args.discrete_flow_shift
398
+ logits_norm = torch.randn(bsz, device=device)
399
+ logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling
400
+ timesteps = logits_norm.sigmoid()
401
+ timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps)
402
+
403
+ t = timesteps.view(-1, 1, 1, 1)
404
+ timesteps = timesteps * 1000.0
405
+ noisy_model_input = (1 - t) * latents + t * noise
406
+ elif args.timestep_sampling == "flux_shift":
407
+ logits_norm = torch.randn(bsz, device=device)
408
+ logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling
409
+ timesteps = logits_norm.sigmoid()
410
+ mu = get_lin_function(y1=0.5, y2=1.15)((H // 2) * (W // 2))
411
+ timesteps = time_shift(mu, 1.0, timesteps)
412
+
413
+ t = timesteps.view(-1, 1, 1, 1)
414
+ timesteps = timesteps * 1000.0
415
+ noisy_model_input = (1 - t) * latents + t * noise
416
+ else:
417
+ # Sample a random timestep for each image
418
+ # for weighting schemes where we sample timesteps non-uniformly
419
+ u = compute_density_for_timestep_sampling(
420
+ weighting_scheme=args.weighting_scheme,
421
+ batch_size=bsz,
422
+ logit_mean=args.logit_mean,
423
+ logit_std=args.logit_std,
424
+ mode_scale=args.mode_scale,
425
+ )
426
+ indices = (u * noise_scheduler.config.num_train_timesteps).long()
427
+ timesteps = noise_scheduler.timesteps[indices].to(device=device)
428
+
429
+ # Add noise according to flow matching.
430
+ sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
431
+ noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
432
+
433
+ return noisy_model_input, timesteps, sigmas
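In the "shift" branch, the sigmoid-distributed t is remapped by t -> shift*t / (1 + (shift - 1)*t), which biases sampling toward higher noise levels; the "flux_shift" branch instead derives its shift from the latent resolution via get_lin_function/time_shift. A quick numeric check with the default shift of 3.0 (toy value):

    shift, t = 3.0, 0.5
    t_shifted = (t * shift) / (1 + (shift - 1) * t)  # 1.5 / 2.0 = 0.75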
434
+
435
+
436
+ def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas):
437
+ weighting = None
438
+ if args.model_prediction_type == "raw":
439
+ pass
440
+ elif args.model_prediction_type == "additive":
441
+ # add the model_pred to the noisy_model_input
442
+ model_pred = model_pred + noisy_model_input
443
+ elif args.model_prediction_type == "sigma_scaled":
444
+ # apply sigma scaling
445
+ model_pred = model_pred * (-sigmas) + noisy_model_input
446
+
447
+ # these weighting schemes use a uniform timestep sampling
448
+ # and instead post-weight the loss
449
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
450
+
451
+ return model_pred, weighting
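A sanity check on the "sigma_scaled" branch: with noisy = (1 - sigma) * latents + sigma * noise and an ideal velocity prediction of (noise - latents), model_pred * (-sigmas) + noisy_model_input recovers the clean latents (toy tensors, not from this module):

    import torch

    latents = torch.randn(2, 4, 8, 8)
    noise = torch.randn_like(latents)
    sigmas = torch.rand(2, 1, 1, 1)
    noisy = (1 - sigmas) * latents + sigmas * noise
    ideal_pred = noise - latents
    recovered = ideal_pred * (-sigmas) + noisy
    assert torch.allclose(recovered, latents, atol=1e-5)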
452
+
453
+
454
+ def save_models(
455
+ ckpt_path: str,
456
+ flux: flux_models.Flux,
457
+ sai_metadata: Optional[dict],
458
+ save_dtype: Optional[torch.dtype] = None,
459
+ use_mem_eff_save: bool = False,
460
+ ):
461
+ state_dict = {}
462
+
463
+ def update_sd(prefix, sd):
464
+ for k, v in sd.items():
465
+ key = prefix + k
466
+ if save_dtype is not None and v.dtype != save_dtype:
467
+ v = v.detach().clone().to("cpu").to(save_dtype)
468
+ state_dict[key] = v
469
+
470
+ update_sd("", flux.state_dict())
471
+
472
+ if not use_mem_eff_save:
473
+ save_file(state_dict, ckpt_path, metadata=sai_metadata)
474
+ else:
475
+ mem_eff_save_file(state_dict, ckpt_path, metadata=sai_metadata)
476
+
477
+
478
+ def save_flux_model_on_train_end(
479
+ args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, flux: flux_models.Flux
480
+ ):
481
+ def sd_saver(ckpt_file, epoch_no, global_step):
482
+ sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev")
483
+ save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save)
484
+
485
+ train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None)
486
+
487
+
488
+ # Saving at epoch end and at step intervals is unified here, since the metadata includes epoch/step and the arguments are the same
489
+ # on_epoch_end: True when called at the end of an epoch, False when called at a step interval
490
+ def save_flux_model_on_epoch_end_or_stepwise(
491
+ args: argparse.Namespace,
492
+ on_epoch_end: bool,
493
+ accelerator,
494
+ save_dtype: torch.dtype,
495
+ epoch: int,
496
+ num_train_epochs: int,
497
+ global_step: int,
498
+ flux: flux_models.Flux,
499
+ ):
500
+ def sd_saver(ckpt_file, epoch_no, global_step):
501
+ sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev")
502
+ save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save)
503
+
504
+ train_util.save_sd_model_on_epoch_end_or_stepwise_common(
505
+ args,
506
+ on_epoch_end,
507
+ accelerator,
508
+ True,
509
+ True,
510
+ epoch,
511
+ num_train_epochs,
512
+ global_step,
513
+ sd_saver,
514
+ None,
515
+ )
516
+
517
+
518
+ # endregion
519
+
520
+
521
+ def add_flux_train_arguments(parser: argparse.ArgumentParser):
522
+ parser.add_argument(
523
+ "--clip_l",
524
+ type=str,
525
+ help="path to clip_l (*.sft or *.safetensors), should be float16 / clip_lのパス(*.sftまたは*.safetensors)、float16が前提",
526
+ )
527
+ parser.add_argument(
528
+ "--t5xxl",
529
+ type=str,
530
+ help="path to t5xxl (*.sft or *.safetensors), should be float16 / t5xxlのパス(*.sftまたは*.safetensors)、float16が前提",
531
+ )
532
+ parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)")
533
+ parser.add_argument(
534
+ "--t5xxl_max_token_length",
535
+ type=int,
536
+ default=None,
537
+ help="maximum token length for T5-XXL. if omitted, 256 for schnell and 512 for dev"
538
+ " / T5-XXLの最大トークン長。省略された場合、schnellの場合は256、devの場合は512",
539
+ )
540
+ parser.add_argument(
541
+ "--apply_t5_attn_mask",
542
+ action="store_true",
543
+ help="apply attention mask to T5-XXL encode and FLUX double blocks / T5-XXLエンコードとFLUXダブルブロックにアテンションマスクを適用する",
544
+ )
545
+
546
+ parser.add_argument(
547
+ "--guidance_scale",
548
+ type=float,
549
+ default=3.5,
550
+ help="the FLUX.1 dev variant is a guidance distilled model",
551
+ )
552
+
553
+ parser.add_argument(
554
+ "--timestep_sampling",
555
+ choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"],
556
+ default="sigma",
557
+ help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and FLUX.1 shifting."
558
+ " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、FLUX.1のシフト。",
559
+ )
560
+ parser.add_argument(
561
+ "--sigmoid_scale",
562
+ type=float,
563
+ default=1.0,
564
+ help='Scale factor for sigmoid timestep sampling (used when timestep-sampling is "sigmoid", "shift", or "flux_shift"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"、"shift"、"flux_shift"の場合に有効)。',
565
+ )
566
+ parser.add_argument(
567
+ "--model_prediction_type",
568
+ choices=["raw", "additive", "sigma_scaled"],
569
+ default="sigma_scaled",
570
+ help="How to interpret and process the model prediction: "
571
+ "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)."
572
+ " / モデル予測の解釈と処理方法:"
573
+ "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。",
574
+ )
575
+ parser.add_argument(
576
+ "--discrete_flow_shift",
577
+ type=float,
578
+ default=3.0,
579
+ help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。",
580
+ )
581
+ parser.add_argument(
582
+ "--bypass_flux_guidance",
583
+ action="store_true",
584
+ help="bypass the flux guidance module for Flex.1-Alpha training",
585
+ )
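A minimal usage sketch for these flags (assuming the repository root is on PYTHONPATH so this module imports as library.flux_train_utils; the argument values are illustrative):

    import argparse
    from library import flux_train_utils

    parser = argparse.ArgumentParser()
    flux_train_utils.add_flux_train_arguments(parser)
    args = parser.parse_args(
        ["--timestep_sampling", "shift", "--discrete_flow_shift", "3.0", "--guidance_scale", "1.0"]
    )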