from diffusers_helper.hf_login import login

import gc
import time
import os
import subprocess
import glob
import tempfile  # 1フレーム推論のための設定
import shutil    # ディレクトリ削除用
import cv2       # 画像処理用
import numpy as np
from PIL import Image

# Detect whether we are running inside a Hugging Face Space: true when the
# SPACE_ID environment variable is present (presumably set by the Spaces runtime).
IN_HF_SPACE = os.environ.get('SPACE_ID') is not None

# Point HF_HOME at ./hf_download next to this file. This is assigned before the
# gradio/torch imports that follow — presumably so that libraries reading
# HF_HOME at import time pick it up (TODO confirm).
os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))

import gradio as gr
import torch
import traceback
import einops
import safetensors.torch as sf
import numpy as np

# Module-level state tracking GPU availability.
GPU_AVAILABLE = False
GPU_INITIALIZED = False
last_update_time = time.time()
cpu_fallback_mode = False  # True once the app has decided to run on CPU only

# Tracks whether the model dictionary has been populated.
MODELS_INITIALIZED = False

# Extend client/server timeouts when running inside a Space.
if IN_HF_SPACE:
    # Gradio server binding and extended timeouts.
    os.environ["GRADIO_SERVER_PORT"] = "7860"
    os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
    os.environ["GRADIO_UPLOAD_TIMEOUT"] = "600"  # 10-minute upload timeout
    os.environ["GRADIO_REQUEST_TIMEOUT"] = "900"  # 15-minute request timeout
    # Try to set the address-space limit to 1 TiB. setrlimit raises ValueError
    # when the requested limit exceeds the current hard limit (and OSError on
    # other failures); that must not abort the import of this module, so the
    # existing limit is kept in that case.
    import resource
    try:
        resource.setrlimit(resource.RLIMIT_AS, (1<<40, 1<<40))
    except (ValueError, OSError) as e:
        print(f"メモリ制限の緩和に失敗しました（既存の制限のまま続行します）: {e}")

# Probe the GPU once at import time.
# Inside a Space the `spaces` module is imported first. Outside a Space CUDA is
# probed as well: GPU_AVAILABLE is initialised to False above and the rest of
# the module treats `not GPU_AVAILABLE` as "run on CPU", so skipping the probe
# would force CPU mode even on a machine with a working GPU.
if IN_HF_SPACE:
    try:
        import spaces
        print("Hugging Face Space環境内で実行中、spacesモジュールをインポートしました")

        # Check whether CUDA reports a usable device.
        try:
            GPU_AVAILABLE = torch.cuda.is_available()
            print(f"GPU利用可能: {GPU_AVAILABLE}")
            if GPU_AVAILABLE:
                print(f"GPUデバイス名: {torch.cuda.get_device_name(0)}")
                print(f"GPUメモリ: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")

                # Run a tiny GPU operation to confirm the device really works.
                try:
                    test_tensor = torch.zeros(1, device='cuda')
                    test_tensor = test_tensor + 1
                    del test_tensor
                    print("GPUテスト操作に成功しました")
                except Exception as e:
                    print(f"GPUテスト操作でエラーが発生しました: {e}")
                    GPU_AVAILABLE = False
                    cpu_fallback_mode = True
                    print("CPUフォールバックモードに設定します")
            else:
                # This branch runs when CUDA is reported as NOT available.
                print("警告: CUDAが利用できません。CPUフォールバックモードに設定します")
                cpu_fallback_mode = True
        except Exception as e:
            GPU_AVAILABLE = False
            cpu_fallback_mode = True
            print(f"GPU確認中にエラーが発生しました: {e}")
            print("CPUモードで実行します")
    except ImportError:
        print("spacesモジュールのインポートに失敗しました。Hugging Face Space環境外かもしれません")
        GPU_AVAILABLE = torch.cuda.is_available()
        if not GPU_AVAILABLE:
            cpu_fallback_mode = True
else:
    # Outside a Space: plain CUDA availability check.
    try:
        GPU_AVAILABLE = torch.cuda.is_available()
    except Exception as e:
        GPU_AVAILABLE = False
        print(f"GPU確認中にエラーが発生しました: {e}")
    print(f"GPU利用可能: {GPU_AVAILABLE}")
    if not GPU_AVAILABLE:
        cpu_fallback_mode = True

# 初回ロード時のチェック関数
def is_first_time_load():
    """Return True on the first call only, flipping GPU_INITIALIZED to True.

    Every later call (or any call made after GPU_INITIALIZED was set elsewhere)
    returns False.
    """
    global GPU_INITIALIZED
    if GPU_INITIALIZED:
        return False
    # First call: record it so later calls report False.
    GPU_INITIALIZED = True
    return True


# GPU制限を超えたかどうかを確認する関数
def check_gpu_quota_exceeded():
    """Report whether the GPU should be treated as unavailable because of quota.

    Returns:
        True when CPU fallback is already active, when no GPU is available, or
        when the usage endpoint reports ``gpu.quota_exceeded``; in the last case
        ``cpu_fallback_mode`` is switched on as a side effect. False outside a
        Space and whenever the endpoint cannot be queried or parsed.
    """
    global cpu_fallback_mode

    # Already on CPU (or no GPU at all): nothing to query.
    if cpu_fallback_mode or not GPU_AVAILABLE:
        return True

    # The usage endpoint is only queried inside a Space.
    if not IN_HF_SPACE:
        return False

    try:
        import requests
        try:
            usage = requests.get("http://localhost:7860/api/v1/spaces/usage", timeout=1)
            if usage.status_code != 200:
                print(f"APIエンドポイントから不正なステータスコード: {usage.status_code}")
                return False
            try:
                payload = usage.json()
                if payload.get("gpu", {}).get("quota_exceeded", False):
                    print("GPU使用制限に達しています。")
                    cpu_fallback_mode = True
                    return True
            except ValueError as json_err:
                # An unparsable body is reported as "not exceeded"; the mode is left unchanged.
                print(f"JSON解析エラー: {json_err}")
                return False
        except requests.exceptions.RequestException as req_err:
            print(f"APIリクエスト中にエラー: {req_err}")
    except Exception as e:
        # Covers a failed `import requests` and any other unexpected error.
        print(f"GPU使用制限確認中にエラー: {e}")

    return False

# Conditional imports: if any of them raises ImportError, a few minimal
# stand-ins are defined in the except branch so the module can still be imported.
# Names imported before the failing line stay bound; later ones do not.
try:
    from diffusers import AutoencoderKLHunyuanVideo
    from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
    from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
    from diffusers_helper.utils import (
        save_bcthw_as_mp4,
        crop_or_pad_yield_mask,
        soft_append_bcthw,
        resize_and_center_crop,
        state_dict_weighted_merge,
        state_dict_offset_merge,
        generate_timestamp,
    )
    from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
    from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
    from diffusers_helper.memory import (
        cpu,
        gpu,
        get_cuda_free_memory_gb,
        move_model_to_device_with_memory_preservation,
        offload_model_from_device_for_memory_preservation,
        fake_diffusers_current_device,
        DynamicSwapInstaller,
        unload_complete_models,
        load_model_as_complete,
        IN_HF_SPACE as MEMORY_IN_HF_SPACE
    )
    from diffusers_helper.thread_utils import AsyncStream, async_run
    from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
    from transformers import SiglipImageProcessor, SiglipVisionModel
    from diffusers_helper.clip_vision import hf_clip_vision_encode
    from diffusers_helper.bucket_tools import find_nearest_bucket
    print("基本的なディフューザーモジュールを正常にインポートしました")
except ImportError as e:
    print(f"一部の基本モジュールのインポートに失敗しました: {e}")
    # Dummy stand-ins for the thread and progress-bar helpers only; the model
    # classes and the other helpers imported above get no fallback here.
    class AsyncStream:
        """Fallback stream that holds an input and an output MockQueue."""
        def __init__(self):
            # MockQueue is defined just below; the name is resolved when an
            # instance is created, not when this class body runs.
            self.input_queue = MockQueue()
            self.output_queue = MockQueue()
    
    class MockQueue:
        """Minimal list-backed queue used by the fallback AsyncStream."""
        def __init__(self):
            self.items = []
        
        def push(self, item):
            # Append to the end of the list.
            self.items.append(item)
        
        def top(self):
            # Most recently pushed item, or None when empty (nothing is removed).
            return self.items[-1] if self.items else None
        
        def next(self):
            # Oldest item is removed and returned; ("end", None) when empty.
            return self.items.pop(0) if self.items else ("end", None)
    
    def async_run(*args, **kwargs):
        # Fallback: accepts any arguments and does nothing.
        pass
    
    def make_progress_bar_css():
        # Fallback: no CSS.
        return ""
    
    def make_progress_bar_html(percentage, hint):
        # Fallback: plain <div> with the percentage and the hint text.
        return f"<div>{percentage}% - {hint}</div>"

# Try to import the LoRA and FP8 helpers; when they are missing, no-op
# replacements with the same call signatures are defined instead.
try:
    from utils.lora_utils import merge_lora_to_state_dict
    from utils.fp8_optimization_utils import optimize_state_dict_with_fp8, apply_fp8_monkey_patch
    print("LoRAとFP8最適化モジュールを正常にインポートしました")
except ImportError as e:
    print(f"一部のモジュールのインポートに失敗しました: {e}")
    # Dummy replacements: each prints a warning and leaves its input untouched.
    def merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=None):
        # Fallback: returns the state dict unchanged (no LoRA is merged).
        print("Warning: LoRA適用機能が利用できません")
        return state_dict
        
    def optimize_state_dict_with_fp8(state_dict, device, target_keys, exclude_keys, move_to_device=False):
        # Fallback: returns the state dict unchanged (no FP8 conversion).
        print("Warning: FP8最適化機能が利用できません")
        return state_dict
        
    def apply_fp8_monkey_patch(model, state_dict, use_scaled_mm=False):
        # Fallback: does not modify the model and returns None.
        print("Warning: FP8 monkey patch機能が利用できません")
        pass

# Output directory, created at import time if it does not exist yet.
outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Candidate target resolutions. find_nearest_resolution unpacks each pair as
# (height, width); the list contains every pair in both orientations plus one
# square entry.
NEW_RESOLUTIONS = [
    (416, 960), (448, 864), (480, 832), (512, 768), (544, 704), 
    (576, 672), (608, 640), (640, 608), (672, 576), (704, 544), 
    (768, 512), (832, 480), (864, 448), (960, 416), (640, 640),
]

# VRAMを安全に確認する関数
def get_safe_vram_size():
    """Return the free VRAM in GB, or 6.0 when it cannot be determined.

    The 6.0 GB default is returned when CUDA is unavailable, when CPU fallback
    mode is active, or when querying the free memory raises any exception.
    """
    fallback_gb = 6.0  # default value
    try:
        cuda_usable = torch.cuda.is_available() and not cpu_fallback_mode
        if not cuda_usable:
            print("CUDAが利用できないか、CPUフォールバックモードです。デフォルトのメモリ設定を使用します")
            return fallback_gb
        vram_gb = get_cuda_free_memory_gb(gpu)
        print(f'空きVRAM {vram_gb} GB')
        return vram_gb
    except Exception as e:
        print(f"CUDAメモリ取得中にエラーが発生しました: {e}、デフォルトのメモリ設定を使用します")
        return fallback_gb

# Initialise the module-level memory settings (free_mem_gb, high_vram).
if IN_HF_SPACE:
    # Inside a Space: derive the budget from the device's total memory.
    print("Spaces環境でデフォルトのメモリ設定を使用します")
    # Defaults used when no GPU is usable or the query fails.
    free_mem_gb = 6.0
    high_vram = False
    try:
        if GPU_AVAILABLE and not cpu_fallback_mode:
            # Use 90% of the device's total memory as the budget.
            free_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 * 0.9
            high_vram = free_mem_gb > 10  # lower threshold than outside a Space
    except Exception as e:
        print(f"GPUメモリ取得中にエラーが発生しました: {e}")
        free_mem_gb = 6.0
        high_vram = False

    print(f'GPUメモリ: {free_mem_gb:.2f} GB, 高VRAMモード: {high_vram}')
else:
    # Outside a Space: measure the free VRAM directly.
    free_mem_gb = get_safe_vram_size()
    high_vram = free_mem_gb > 60
    print(f'高VRAM モード: {high_vram}')

# Module-level cache of the loaded models, filled in by load_models /
# load_models_cpu and keyed by component name ('transformer', 'vae', ...).
models = {}
# Shared stream object. It is None here; presumably assigned before a worker
# runs, since the worker pushes to stream.output_queue (assignment not visible
# in this section).
stream = None

# 市松模様を作成する関数
def create_checkerboard(width, height, cell_size):
    """Create a purple/yellow checkerboard image.

    Args:
        width: Output width in pixels.
        height: Output height in pixels.
        cell_size: Side length of one square cell in pixels; must be positive.

    Returns:
        A ``(height, width, 3)`` uint8 RGB array. The top-left cell is purple
        and cells alternate with yellow. Cells on the right and bottom edges
        are cut off when the size is not a multiple of ``cell_size``.

    Raises:
        ValueError: If ``cell_size`` is not positive.
    """
    if cell_size <= 0:
        raise ValueError(f"cell_size must be positive, got {cell_size}")

    # Number of whole cells needed to cover the requested size.
    rows = int(np.ceil(height / cell_size))
    cols = int(np.ceil(width / cell_size))

    # Index 0 = purple, index 1 = yellow (RGB).
    palette = np.array([(128, 0, 128), (255, 255, 0)], dtype=np.uint8)

    # Cell row/column index of every pixel; the parity of their sum picks the colour.
    cell_row = np.arange(rows * cell_size) // cell_size
    cell_col = np.arange(cols * cell_size) // cell_size
    parity = (cell_row[:, None] + cell_col[None, :]) % 2
    checkerboard = palette[parity]

    # Crop (not resize) the cell-aligned board down to the requested size.
    return checkerboard[:height, :width]

# ImageMaskからの画像とマスクを処理する関数
def process_image_mask(image_mask_dict):
    """Convert an ImageMask value into an RGB array with the mask painted as a checkerboard.

    Args:
        image_mask_dict: Dict with a "background" image (PIL image or
            array-like) and an optional "layers" list whose first entry is
            used as the mask. When a PIL background is larger than 1024 px on
            its long side, the downscaled background and layers are written
            back into this dict.

    Returns:
        None when the argument is not a dict or has no background. Otherwise
        an H x W x 3 array: the background unchanged when there are no layers,
        or a uint8 copy in which every masked pixel (mask gray value > 1) is
        replaced by the purple/yellow checkerboard.
    """
    if not isinstance(image_mask_dict, dict):
        return None

    # Gradio ImageMask dict format.
    background = image_mask_dict.get("background")
    layers = image_mask_dict.get("layers")

    if background is None:
        return None

    # Limit the image size: downscale so the long side is at most 1024 px.
    if isinstance(background, Image.Image):
        w, h = background.size
        max_size = 1024
        if w > max_size or h > max_size:
            ratio = max_size / max(w, h)
            # Clamp to 1 px so an extreme aspect ratio cannot produce a
            # zero-sized target, which PIL cannot resize to.
            new_w = max(1, int(w * ratio))
            new_h = max(1, int(h * ratio))
            background = background.resize((new_w, new_h), Image.LANCZOS)
            print(f"画像サイズを制限しました: {w}x{h} → {new_w}x{new_h}")

            # Resize the layers the same way so the mask keeps matching the image.
            if layers:
                layers = [
                    layer.resize((new_w, new_h), Image.LANCZOS)
                    if isinstance(layer, Image.Image) else layer
                    for layer in layers
                ]
                image_mask_dict["layers"] = layers

            image_mask_dict["background"] = background

    # ---- 1) Normalise the background to three channels ----
    # Any non-RGB PIL mode (RGBA, L, P, ...) is converted to RGB.
    if isinstance(background, Image.Image) and background.mode != "RGB":
        background = background.convert("RGB")
    img_array = np.array(background)
    # Safety net for array inputs: drop an alpha channel, expand grayscale.
    if img_array.ndim == 3 and img_array.shape[2] == 4:
        img_array = img_array[..., :3]
    elif img_array.ndim == 2:
        img_array = np.stack([img_array] * 3, axis=-1)

    # ---- 2) Without a mask layer the image is returned as is ----
    if not layers:
        return img_array

    layer = layers[0]
    if isinstance(layer, Image.Image) and layer.mode == "RGBA":
        layer = layer.convert("RGB")
    mask_array = np.array(layer)
    if mask_array.ndim == 3 and mask_array.shape[2] == 4:
        mask_array = mask_array[..., :3]

    # Grayscale, then binary: every pixel with gray value > 1 counts as masked.
    if mask_array.ndim == 3:
        mask_gray = cv2.cvtColor(mask_array, cv2.COLOR_RGB2GRAY)
    else:
        mask_gray = mask_array
    _, binary_mask = cv2.threshold(mask_gray, 1, 255, cv2.THRESH_BINARY)

    # Checkerboard cell size scales with the image: sqrt(pixels) / 20, at least 10 px.
    img_h, img_w = img_array.shape[:2]
    cell_size = max(int(np.sqrt(img_h * img_w) / 20), 10)
    checkerboard = create_checkerboard(img_w, img_h, cell_size)

    # Replace masked pixels with the checkerboard; leave all others untouched.
    painted = binary_mask > 0
    result = img_array.copy()
    result[painted] = checkerboard[painted]

    return result.astype(np.uint8)

# 最も近い解像度を見つける関数
def find_nearest_resolution(width, height):
    """Pick the entry of NEW_RESOLUTIONS that best matches the input size.

    Near-square inputs (aspect ratio within 0.95..1.05) get the square
    resolution when the list contains one. Otherwise each candidate is scored
    by its aspect-ratio difference (weight 10000) plus its pixel-count
    difference (weight 0.01), and the first candidate with the lowest score wins.

    Args:
        width: Input width in pixels.
        height: Input height in pixels.

    Returns:
        A ``(height, width)`` tuple, or None when NEW_RESOLUTIONS is empty.
    """
    aspect_ratio = width / height

    # Square shortcut: hand back the square bucket straight away.
    if 0.95 <= aspect_ratio <= 1.05:
        for res_h, res_w in NEW_RESOLUTIONS:
            if res_w == res_h:
                return (res_h, res_w)

    source_pixels = width * height
    lowest_score = float('inf')
    best_res = None

    for res_h, res_w in NEW_RESOLUTIONS:
        aspect_diff = abs(res_w / res_h - aspect_ratio)
        pixel_diff = abs(res_w * res_h - source_pixels)
        # The aspect ratio dominates; the pixel count acts as a tie-breaker.
        score = aspect_diff * 10000 + pixel_diff * 0.01
        if score < lowest_score:
            lowest_score = score
            best_res = (res_h, res_w)

    return best_res

# 一時ディレクトリ管理関数
def create_temp_directory():
    """Create a fresh temporary directory (prefix ``hunyuan_temp_``) and return its path."""
    new_dir = tempfile.mkdtemp(prefix="hunyuan_temp_")
    print(f"一時ディレクトリを作成しました: {new_dir}")
    return new_dir

def cleanup_temp_files(temp_dir):
    """Remove ``temp_dir`` and everything below it.

    A falsy or non-existent path is ignored. A failed removal is only logged,
    never raised.
    """
    if not temp_dir or not os.path.exists(temp_dir):
        return
    try:
        shutil.rmtree(temp_dir)
        print(f"一時ディレクトリを削除しました: {temp_dir}")
    except Exception as e:
        print(f"一時ディレクトリの削除に失敗しました: {e}")

# mp4からffmpegでPNGフレームを抽出する関数（一時フォルダ使用）
def extract_frames_from_mp4(mp4_path, temp_dir, job_id):
    """Extract PNG frames from an MP4 with ffmpeg into ``<temp_dir>/frames/<job_id>``.

    Args:
        mp4_path: Path of the input video.
        temp_dir: Base temporary directory.
        job_id: Name of the per-job sub-directory below ``frames``.

    Returns:
        Sorted list of the extracted ``frame_*.png`` paths, or an empty list
        when ffmpeg fails or cannot be started.
    """
    # Per-job output directory for the frames.
    frames_dir = os.path.join(temp_dir, "frames", job_id)
    os.makedirs(frames_dir, exist_ok=True)

    # Global options go before the input: placed after the output file ffmpeg
    # treats them as trailing options and may ignore them. -y keeps ffmpeg
    # from blocking on an overwrite prompt.
    cmd = [
        'ffmpeg',
        '-hide_banner',
        '-loglevel', 'error',
        '-y',
        '-i', mp4_path,
        '-vf', 'fps=30',
        f'{frames_dir}/frame_%04d.png',
    ]

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ffmpeg binary.
        print(f"Error extracting frames from {mp4_path}: {e}")
        return []

    # Return the extracted frames in order.
    return sorted(glob.glob(f'{frames_dir}/frame_*.png'))

# 一時ファイルを使用するmp4保存関数
def save_bcthw_as_mp4_with_frames(bcthw, temp_dir, job_id, fps=30, crf=16):
    """Save a tensor as ``<temp_dir>/<job_id>.mp4`` and extract its frames.

    Args:
        bcthw: Tensor handed to save_bcthw_as_mp4 (assumed B, C, T, H, W from
            the name — TODO confirm).
        temp_dir: Temporary directory that receives the MP4 and the frames.
        job_id: Base name of the MP4 and of the frames sub-directory.
        fps: Frame rate forwarded to save_bcthw_as_mp4.
        crf: Quality setting forwarded to save_bcthw_as_mp4.

    Returns:
        ``(mp4_path, frame_paths)``.
    """
    mp4_path = os.path.join(temp_dir, f"{job_id}.mp4")

    # Write the video first, then split it into frames.
    save_bcthw_as_mp4(bcthw, mp4_path, fps, crf)
    frame_paths = extract_frames_from_mp4(mp4_path, temp_dir, job_id)

    return mp4_path, frame_paths

# GPUの状態を確認する関数
def check_gpu_status():
    """Return True when the GPU is available and check_gpu_quota_exceeded() is False.

    In every other case ``cpu_fallback_mode`` is switched on and False is returned.
    """
    global GPU_AVAILABLE, cpu_fallback_mode

    if GPU_AVAILABLE:
        if not check_gpu_quota_exceeded():
            # GPU present and within quota.
            return True
        print("GPU使用制限を超えました。CPUモードに切り替えます。")

    # No GPU, or quota exceeded: stay on / switch to CPU mode.
    cpu_fallback_mode = True
    return False

# モデルロード関数（GPU対応）
def load_models():
    """Load all models, using GPU dtypes and placement when the GPU is usable.

    Returns:
        The populated ``models`` dict (also stored in the module global and
        flagged through MODELS_INITIALIZED). When loading fails with a
        GPU-related error the result of load_models_cpu() is returned instead;
        on any other error an empty dict.
    """
    global models, MODELS_INITIALIZED, cpu_fallback_mode

    # Already initialised: hand back the cached dictionary.
    if MODELS_INITIALIZED and models:
        return models

    print("モデルを読み込みます...")

    hunyuan_repo = "hunyuanvideo-community/HunyuanVideo"
    redux_repo = "lllyasviel/flux_redux_bfl"

    try:
        # Decide once whether this load targets the GPU.
        gpu_ok = check_gpu_status()
        use_gpu = gpu_ok and not cpu_fallback_mode
        device = gpu if use_gpu else cpu
        dtype = torch.float16 if use_gpu else torch.float32

        print(f"モデルをロード: デバイス={device}, データ型={dtype}")

        # Every model is loaded and parked on the CPU first.
        text_encoder = LlamaModel.from_pretrained(
            hunyuan_repo, subfolder="text_encoder", torch_dtype=dtype
        ).cpu()
        text_encoder_2 = CLIPTextModel.from_pretrained(
            hunyuan_repo, subfolder="text_encoder_2", torch_dtype=dtype
        ).cpu()
        tokenizer = LlamaTokenizerFast.from_pretrained(hunyuan_repo, subfolder="tokenizer")
        tokenizer_2 = CLIPTokenizer.from_pretrained(hunyuan_repo, subfolder="tokenizer_2")
        vae = AutoencoderKLHunyuanVideo.from_pretrained(
            hunyuan_repo, subfolder="vae", torch_dtype=dtype
        ).cpu()

        feature_extractor = SiglipImageProcessor.from_pretrained(redux_repo, subfolder="feature_extractor")
        image_encoder = SiglipVisionModel.from_pretrained(
            redux_repo, subfolder="image_encoder", torch_dtype=dtype
        ).cpu()

        print("Transformerモデルを読み込み中...")

        # bfloat16 on the GPU, float32 on the CPU.
        transformer_dtype = torch.bfloat16 if use_gpu else torch.float32
        transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
            "lllyasviel/FramePackI2V_HY", torch_dtype=transformer_dtype
        ).cpu()

        transformer.eval()
        transformer.high_quality_fp32_output_for_inference = True
        print("transformer.high_quality_fp32_output_for_inference = True")

        if use_gpu:
            transformer.to(dtype=torch.bfloat16)
        transformer.requires_grad_(False)

        for module in (vae, text_encoder, text_encoder_2, image_encoder):
            module.eval()

        # Slice and tile the VAE unless plenty of VRAM is available.
        if not high_vram or cpu_fallback_mode:
            vae.enable_slicing()
            vae.enable_tiling()

        # Precision is only reduced on the GPU path.
        if use_gpu:
            for module in (vae, image_encoder, text_encoder, text_encoder_2):
                module.to(dtype=torch.float16)

        for module in (vae, text_encoder, text_encoder_2, image_encoder):
            module.requires_grad_(False)

        # Device placement (GPU path only).
        if use_gpu:
            if high_vram:
                for module in (text_encoder, text_encoder_2, image_encoder, vae):
                    module.to(gpu)
            else:
                # Low VRAM: install dynamic swapping on the text encoder
                # instead of moving the models to the GPU.
                DynamicSwapInstaller.install_model(text_encoder, device=gpu)

        print("すべてのモデルの読み込みが完了しました")

        models = {
            'transformer': transformer,
            'text_encoder': text_encoder,
            'text_encoder_2': text_encoder_2,
            'tokenizer': tokenizer,
            'tokenizer_2': tokenizer_2,
            'vae': vae,
            'feature_extractor': feature_extractor,
            'image_encoder': image_encoder,
        }

        MODELS_INITIALIZED = True
        return models
    except Exception as e:
        message = str(e)
        gpu_related = "CUDA" in message or "GPU" in message or "nvidia" in message.lower()
        if not gpu_related:
            print(f"モデル読み込み中にエラーが発生しました: {e}")
            traceback.print_exc()
            return {}

        # GPU-related failure: switch to CPU mode and retry there.
        print(f"GPU関連のエラーが発生しました: {e}")
        print("CPUモードにフォールバックします")
        cpu_fallback_mode = True
        return load_models_cpu()

# CPUのみを使用したモデルロード関数
def load_models_cpu():
    """Load all models on the CPU in float32.

    Returns:
        The populated ``models`` dict (also stored in the module global and
        flagged through MODELS_INITIALIZED), or an empty dict when anything fails.
    """
    global models, MODELS_INITIALIZED

    print("CPUモードでモデルを読み込みます...")

    hunyuan_repo = "hunyuanvideo-community/HunyuanVideo"
    redux_repo = "lllyasviel/flux_redux_bfl"

    try:
        # CPU settings: full precision everywhere.
        device = cpu
        dtype = torch.float32

        text_encoder = LlamaModel.from_pretrained(
            hunyuan_repo, subfolder="text_encoder", torch_dtype=dtype
        ).cpu()
        text_encoder_2 = CLIPTextModel.from_pretrained(
            hunyuan_repo, subfolder="text_encoder_2", torch_dtype=dtype
        ).cpu()
        tokenizer = LlamaTokenizerFast.from_pretrained(hunyuan_repo, subfolder="tokenizer")
        tokenizer_2 = CLIPTokenizer.from_pretrained(hunyuan_repo, subfolder="tokenizer_2")
        vae = AutoencoderKLHunyuanVideo.from_pretrained(
            hunyuan_repo, subfolder="vae", torch_dtype=dtype
        ).cpu()

        feature_extractor = SiglipImageProcessor.from_pretrained(redux_repo, subfolder="feature_extractor")
        image_encoder = SiglipVisionModel.from_pretrained(
            redux_repo, subfolder="image_encoder", torch_dtype=dtype
        ).cpu()

        print("CPUモードでTransformerモデルを読み込み中...")
        transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
            "lllyasviel/FramePackI2V_HY", torch_dtype=torch.float32
        ).cpu()

        transformer.eval()
        transformer.high_quality_fp32_output_for_inference = True
        transformer.requires_grad_(False)

        for module in (vae, text_encoder, text_encoder_2, image_encoder):
            module.eval()

        # CPU mode always enables VAE slicing and tiling.
        vae.enable_slicing()
        vae.enable_tiling()

        for module in (vae, text_encoder, text_encoder_2, image_encoder):
            module.requires_grad_(False)

        print("CPUモードですべてのモデルの読み込みが完了しました")

        models = {
            'transformer': transformer,
            'text_encoder': text_encoder,
            'text_encoder_2': text_encoder_2,
            'tokenizer': tokenizer,
            'tokenizer_2': tokenizer_2,
            'vae': vae,
            'feature_extractor': feature_extractor,
            'image_encoder': image_encoder,
        }

        MODELS_INITIALIZED = True
        return models
    except Exception as e:
        print(f"CPUモードでのモデル読み込み中にエラーが発生しました: {e}")
        traceback.print_exc()
        return {}

# モデル取得・初期化関数
def get_models():
    """Return the model dictionary, loading the models first when necessary.

    Returns:
        The module-level ``models`` dict. An empty dict is returned when
        loading raises an unexpected error; errors are logged, never re-raised.
    """
    global models, GPU_INITIALIZED, MODELS_INITIALIZED, cpu_fallback_mode

    # Fast path: everything is already loaded.
    if models and MODELS_INITIALIZED:
        return models

    try:
        # Re-check whether the GPU can be used right now.
        if not check_gpu_status():
            print("CPUモードでモデル読み込みを実行します")
            cpu_fallback_mode = True
            models = load_models_cpu()
            return models

        try:
            print("GPU対応モードでモデル読み込みを試みます")
            models = load_models()
            # NOTE(review): this flag is set whenever load_models() returns,
            # including when it returned {} or fell back to CPU internally —
            # confirm that this is intended.
            GPU_INITIALIZED = True
        except Exception as e:
            # NOTE(review): load_models() catches Exception itself, so this
            # handler looks reachable only for errors raised in its own except
            # path — verify.
            message = str(e)
            gpu_related = (
                "CUDA" in message
                or "GPU" in message
                or "ZeroGPU quota exceeded" in message
                or "nvidia" in message.lower()
            )
            if not gpu_related:
                print(f"モデル読み込み中に予期せぬエラーが発生しました: {e}")
                traceback.print_exc()
                # Do not re-raise; report failure as an empty dict.
                return {}
            print(f"GPU対応モデル読み込みに失敗しました: {e}")
            print("CPUモードにフォールバックします")
            cpu_fallback_mode = True
            models = load_models_cpu()
    except Exception as e:
        print(f"モデル取得中にエラーが発生しました: {e}")
        traceback.print_exc()
        # Do not re-raise; report failure as an empty dict.
        return {}

    return models

# 処理ワーカー関数（GPU対応版）
def worker_with_temp_files(
    image_mask_dict,
    prompt,
    n_prompt,
    seed,
    steps,
    cfg,
    gs,
    rs,
    gpu_memory_preservation,
    use_teacache,
    mp4_crf,
    lora_file,
    lora_multiplier,
    fp8_optimization,
):
    global last_update_time, cpu_fallback_mode
    last_update_time = time.time()
    
    # GPU状態を最初にチェック
    check_gpu_status()

    # マスク画像の処理
    input_image = process_image_mask(image_mask_dict)
    if input_image is None:
        error_msg = "マスク画像の処理に失敗しました。画像をアップロードして、マスクを描画してください。"
        print(error_msg)
        stream.output_queue.push(("error", error_msg))
        stream.output_queue.push(("end", None))
        return

    # 一時ディレクトリの作成
    temp_dir = create_temp_directory()
    
    # 一時ディレクトリ内にサブディレクトリを作成
    os.makedirs(os.path.join(temp_dir, "frames"), exist_ok=True)

    # 1フレーム推論用の固定設定
    total_second_length = 1.0
    latent_window_size = 9
    total_latent_sections = 1
    latent_paddings = [0]

    job_id = generate_timestamp()

    stream.output_queue.push(("progress", (None, "", make_progress_bar_html(0, "開始中 ..."))))

    try:
        # モデルを取得
        models = get_models()
        if not models:
            error_msg = "モデルの読み込みに失敗しました。詳細はログを確認してください。"
            print(error_msg)
            stream.output_queue.push(("error", error_msg))
            stream.output_queue.push(("end", None))
            cleanup_temp_files(temp_dir)
            return
            
        transformer = models['transformer']
        text_encoder = models['text_encoder']
        text_encoder_2 = models['text_encoder_2']
        tokenizer = models['tokenizer']
        tokenizer_2 = models['tokenizer_2']
        vae = models['vae']
        feature_extractor = models['feature_extractor']
        image_encoder = models['image_encoder']
        
        # LoRAファイルの適用
        if not cpu_fallback_mode and lora_file is not None and os.path.exists(lora_file):
            try:
                print(f"LoRAファイル {os.path.basename(lora_file)} をマージします...")
                state_dict = transformer.state_dict()
                state_dict = merge_lora_to_state_dict(state_dict, lora_file, lora_multiplier, device=gpu)
                
                if fp8_optimization and not cpu_fallback_mode:
                    TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"]
                    EXCLUDE_KEYS = ["norm"]  # Exclude norm layers from FP8
                    
                    print("FP8最適化を適用します")
                    state_dict = optimize_state_dict_with_fp8(state_dict, gpu, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=False)
                    apply_fp8_monkey_patch(transformer, state_dict, use_scaled_mm=False)
                    gc.collect()
                    
                info = transformer.load_state_dict(state_dict, strict=True, assign=True)
                print(f"LoRAと/またはFP8最適化を適用しました: {info}")
            except Exception as e:
                print(f"LoRA適用中にエラーが発生しました: {e}")
                # エラー発生時も処理を継続
        elif cpu_fallback_mode and lora_file is not None and os.path.exists(lora_file):
            print("CPUモードではLoRAはサポートされていません")

        # Clean GPU (GPU使用時のみ)
        if not high_vram and not cpu_fallback_mode:
            unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)

        # Text encoding
        stream.output_queue.push(("progress", (None, "", make_progress_bar_html(0, "テキストエンコーディング中 ..."))))

        # 途中でGPU制限を超えた場合のチェック
        if check_gpu_quota_exceeded():
            cpu_fallback_mode = True
            print("テキストエンコード中にGPU制限を超えました。CPUモードに切り替えます。")
            # モデルを再度取得
            models = get_models()
            transformer = models['transformer']
            text_encoder = models['text_encoder']
            text_encoder_2 = models['text_encoder_2']
            tokenizer = models['tokenizer']
            tokenizer_2 = models['tokenizer_2']
            vae = models['vae']
            feature_extractor = models['feature_extractor']
            image_encoder = models['image_encoder']

        # GPU/CPU選択ロジック
        if not cpu_fallback_mode:
            target_device = gpu
            
            if not high_vram:
                # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
                fake_diffusers_current_device(text_encoder, target_device)
                load_model_as_complete(text_encoder_2, target_device=target_device)
            
            # テキストエンコーディングを実行
            llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
            
            if cfg == 1:
                llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
            else:
                llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
        else:
            # CPUモードでのテキストエンコーディング
            target_device = cpu
            
            # テキストエンコーダーをCPUに設定
            text_encoder = text_encoder.to(cpu)
            text_encoder_2 = text_encoder_2.to(cpu)
            
            # テキストエンコーディングを実行（CPU上で）
            llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
            
            if cfg == 1:
                llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
            else:
                llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        # Processing input image
        stream.output_queue.push(("progress", (None, "", make_progress_bar_html(0, "画像処理中 ..."))))

        H, W, C = input_image.shape
        
        # 元の画像サイズを保存
        original_height, original_width = H, W
        
        # 最適な解像度を選択
        target_height, target_width = find_nearest_resolution(W, H)
        print(f"オリジナルサイズ: {W}x{H}, 選択された解像度: {target_width}x{target_height}")
        
        input_image_np = resize_and_center_crop(input_image, target_width=target_width, target_height=target_height)

        # 一時ディレクトリに入力画像を保存
        input_image_path = os.path.join(temp_dir, f"{job_id}_input.png")
        Image.fromarray(input_image_np).save(input_image_path)

        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

        # VAE encoding
        stream.output_queue.push(("progress", (None, "", make_progress_bar_html(0, "VAEエンコーディング中 ..."))))

        # 途中でGPU制限を超えた場合のチェック
        if check_gpu_quota_exceeded():
            cpu_fallback_mode = True
            print("VAEエンコード中にGPU制限を超えました。CPUモードに切り替えます。")
            # モデルを再度取得
            models = get_models()
            vae = models['vae']
            
        if not cpu_fallback_mode and not high_vram:
            load_model_as_complete(vae, target_device=target_device)

        # VAEエンコーディング（CPU/GPUモードに応じて）
        if not cpu_fallback_mode:
            start_latent = vae_encode(input_image_pt, vae)
        else:
            # CPUモードでのVAEエンコーディング
            vae = vae.to(cpu)
            # CPU用に精度やバッチサイズを調整
            input_image_pt_cpu = input_image_pt.to(cpu, dtype=torch.float32)
            start_latent = vae_encode(input_image_pt_cpu, vae)

        # CLIP Vision
        stream.output_queue.push(("progress", (None, "", make_progress_bar_html(0, "CLIP Visionエンコーディング中 ..."))))

        # 途中でGPU制限を超えた場合のチェック
        if check_gpu_quota_exceeded():
            cpu_fallback_mode = True
            print("CLIP Vision処理中にGPU制限を超えました。CPUモードに切り替えます。")
            # モデルを再度取得
            models = get_models()
            image_encoder = models['image_encoder']

        if not cpu_fallback_mode and not high_vram:
            load_model_as_complete(image_encoder, target_device=target_device)

        # CLIP Visionエンコーディング（CPU/GPUモードに応じて）
        if not cpu_fallback_mode:
            image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        else:
            # CPUモードでのCLIP Visionエンコーディング
            image_encoder = image_encoder.to(cpu)
            image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
            
        image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

        # データ型の変換（CPU/GPUモードに応じて）
        if not cpu_fallback_mode:
            # GPUモードでの型変換
            llama_vec = llama_vec.to(torch.bfloat16)
            llama_vec_n = llama_vec_n.to(torch.bfloat16)
            clip_l_pooler = clip_l_pooler.to(torch.bfloat16)
            clip_l_pooler_n = clip_l_pooler_n.to(torch.bfloat16)
            image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(torch.bfloat16)
        else:
            # CPUモードではfloat32を維持
            llama_vec = llama_vec.to(torch.float32)
            llama_vec_n = llama_vec_n.to(torch.float32)
            clip_l_pooler = clip_l_pooler.to(torch.float32)
            clip_l_pooler_n = clip_l_pooler_n.to(torch.float32)
            image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(torch.float32)

        # Transformerモデルの準備
        stream.output_queue.push(("progress", (None, "", make_progress_bar_html(0, "Transformerモデル準備中 ..."))))

        # 途中でGPU制限を超えた場合のチェック
        if check_gpu_quota_exceeded():
            cpu_fallback_mode = True
            print("Transformer準備中にGPU制限を超えました。CPUモードに切り替えます。")
            # モデルを再度取得
            models = get_models()
            transformer = models['transformer']

        if not cpu_fallback_mode:
            # GPUモード
            if not high_vram:
                if IN_HF_SPACE:
                    # Hugging Face Space環境でのメモリ管理
                    move_model_to_device_with_memory_preservation(
                        transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation
                    )
                else:
                    # 通常環境でのメモリ管理
                    DynamicSwapInstaller.install_model(transformer, device=gpu) 
            else:
                transformer.to(gpu)
        else:
            # CPUモード
            transformer = transformer.to(cpu)

        # Sampling
        stream.output_queue.push(("progress", (None, "", make_progress_bar_html(0, "サンプリング開始 ..."))))

        rnd = torch.Generator("cpu").manual_seed(seed)
        
        # 1フレーム推論のための設定
        num_frames = 1
        print(f"1フレーム推論モード: num_frames = {num_frames}")

        # CPU/GPUモードに応じた設定
        device_to_use = cpu if cpu_fallback_mode else gpu
        dtype_to_use = torch.float32 if cpu_fallback_mode else torch.bfloat16

        history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, target_height // 8, target_width // 8), dtype=torch.float32).cpu()
        history_pixels = None
        total_generated_latent_frames = 0

        # 1フレーム推論処理
        for latent_padding in latent_paddings:
            is_last_section = latent_padding == 0
            latent_padding_size = latent_padding * latent_window_size

            if stream.input_queue.top() == "end":
                stream.output_queue.push(("end", None))
                cleanup_temp_files(temp_dir)  # 終了時に一時ファイル削除
                return

            print(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")

            indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
            (
                clean_latent_indices_pre,
                blank_indices,
                latent_indices,
                clean_latent_indices_post,
                clean_latent_2x_indices,
                clean_latent_4x_indices,
            ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
            clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)

            clean_latents_pre = start_latent.to(history_latents)
            clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
                [1, 2, 16], dim=2
            )
            clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)

            # 1フレーム推論用の設定
            latent_indices = latent_indices[:, -1:]
            print(f"latent_indices = {latent_indices}")
            
            # 2xと4xは空に設定
            clean_latent_2x_indices = None
            clean_latent_4x_indices = None
            clean_latents_2x = None
            clean_latents_4x = None

            # GPU使用時のメモリ管理の最適化
            if not cpu_fallback_mode and not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(
                    transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation
                )

            if use_teacache and not cpu_fallback_mode:
                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
            else:
                transformer.initialize_teacache(enable_teacache=False)

            def callback(d):
                preview = d["denoised"]
                preview = vae_decode_fake(preview)

                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                preview = einops.rearrange(preview, "b c t h w -> (b h) (t w) c")

                if stream.input_queue.top() == "end":
                    stream.output_queue.push(("end", None))
                    cleanup_temp_files(temp_dir)  # 終了時に一時ファイル削除
                    raise KeyboardInterrupt("ユーザーがタスクを終了しました。")

                current_step = d["i"] + 1
                percentage = int(100.0 * current_step / steps)
                hint = f"サンプリング中 {current_step}/{steps}"
                desc = f"フレーム生成中: {current_step}/{steps} ({percentage}%)"
                stream.output_queue.push(("progress", (preview, desc, make_progress_bar_html(percentage, hint))))
                return

            # 途中でGPU制限を超えた場合のチェック
            if check_gpu_quota_exceeded():
                cpu_fallback_mode = True
                print("サンプリング前にGPU制限を超えました。CPUモードに切り替えます。")
                # モデルを再度取得
                models = get_models()
                transformer = models['transformer']
                transformer = transformer.to(cpu)
                device_to_use = cpu
                dtype_to_use = torch.float32

            # 適切な設定でサンプリング実行
            try:
                # CPUモードでステップ数を減らす（速度向上のため）
                actual_steps = min(steps, 15) if cpu_fallback_mode else steps
                
                generated_latents = sample_hunyuan(
                    transformer=transformer,
                    sampler="unipc",
                    width=target_width,
                    height=target_height,
                    frames=num_frames,
                    real_guidance_scale=cfg,
                    distilled_guidance_scale=gs,
                    guidance_rescale=rs,
                    # shift=3.0,
                    num_inference_steps=actual_steps,
                    generator=rnd,
                    prompt_embeds=llama_vec,
                    prompt_embeds_mask=llama_attention_mask,
                    prompt_poolers=clip_l_pooler,
                    negative_prompt_embeds=llama_vec_n,
                    negative_prompt_embeds_mask=llama_attention_mask_n,
                    negative_prompt_poolers=clip_l_pooler_n,
                    device=device_to_use,
                    dtype=dtype_to_use,
                    image_embeddings=image_encoder_last_hidden_state,
                    latent_indices=latent_indices,
                    clean_latents=clean_latents,
                    clean_latent_indices=clean_latent_indices,
                    clean_latents_2x=clean_latents_2x,
                    clean_latent_2x_indices=clean_latent_2x_indices,
                    clean_latents_4x=clean_latents_4x,
                    clean_latent_4x_indices=clean_latent_4x_indices,
                    callback=callback,
                )
            except Exception as e:
                if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU quota exceeded" in str(e):
                    print(f"サンプリング中にGPU関連エラーが発生: {e}")
                    print("CPUモードに切り替えて再試行します")
                    
                    # CPUモードに切り替え
                    cpu_fallback_mode = True
                    
                    # モデルをCPUに移動
                    transformer = transformer.to(cpu)
                    
                    # CPUモード用のパラメータ設定
                    device_to_use = cpu
                    dtype_to_use = torch.float32
                    
                    # CPUモードでステップ数を減らす
                    actual_steps = min(steps, 15)
                    
                    # CPUモードで再試行
                    generated_latents = sample_hunyuan(
                        transformer=transformer,
                        sampler="unipc",
                        width=target_width,
                        height=target_height,
                        frames=num_frames,
                        real_guidance_scale=cfg,
                        distilled_guidance_scale=gs,
                        guidance_rescale=rs,
                        num_inference_steps=actual_steps,
                        generator=rnd,
                        prompt_embeds=llama_vec.to(torch.float32),
                        prompt_embeds_mask=llama_attention_mask,
                        prompt_poolers=clip_l_pooler.to(torch.float32),
                        negative_prompt_embeds=llama_vec_n.to(torch.float32),
                        negative_prompt_embeds_mask=llama_attention_mask_n,
                        negative_prompt_poolers=clip_l_pooler_n.to(torch.float32),
                        device=device_to_use,
                        dtype=dtype_to_use,
                        image_embeddings=image_encoder_last_hidden_state.to(torch.float32),
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=clean_latents_2x,
                        clean_latent_2x_indices=clean_latent_2x_indices,
                        clean_latents_4x=clean_latents_4x,
                        clean_latent_4x_indices=clean_latent_4x_indices,
                        callback=callback,
                    )
                else:
                    # GPU関連以外のエラーは再スロー
                    raise

            print(f"generated_latents.shape = {generated_latents.shape}")

            total_generated_latent_frames += int(generated_latents.shape[2])
            history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)

            # メモリ管理の最適化（GPUモードのみ）
            if not cpu_fallback_mode and not high_vram:
                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                load_model_as_complete(vae, target_device=gpu)

            real_history_latents =history_latents[:, :, :total_generated_latent_frames, :, :]

            # VAEデコード
            stream.output_queue.push(("progress", (None, "", make_progress_bar_html(70, "VAEデコード中 ..."))))

            # VAEデコード前にGPU制限超過チェック
            if check_gpu_quota_exceeded():
                cpu_fallback_mode = True
                print("VAEデコード前にGPU制限を超えました。CPUモードに切り替えます。")
                # モデルを再度取得
                models = get_models()
                vae = models['vae']
                vae = vae.to(cpu)

            if history_pixels is None:
                if not cpu_fallback_mode:
                    history_pixels = vae_decode(real_history_latents, vae).cpu()
                else:
                    # CPUモードでのVAEデコード
                    vae = vae.to(cpu)
                    history_pixels = vae_decode(real_history_latents.to(cpu, dtype=torch.float32), vae).cpu()
            else:
                section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
                overlapped_frames = latent_window_size * 4 - 3

                if not cpu_fallback_mode:
                    current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
                else:
                    # CPUモードでのVAEデコード
                    vae = vae.to(cpu)
                    current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames].to(cpu, dtype=torch.float32), vae).cpu()
                    
                history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)

            if not cpu_fallback_mode and not high_vram:
                unload_complete_models()

            # 一時フォルダにMP4保存とフレーム抽出
            stream.output_queue.push(("progress", (None, "", make_progress_bar_html(80, "動画保存中 ..."))))
            
            output_path, extracted_frames = save_bcthw_as_mp4_with_frames(
                history_pixels, temp_dir, job_id, fps=30, crf=mp4_crf
            )

            # 生成情報
            resize_info = {
                "original_width": original_width,
                "original_height": original_height,
                "target_width": target_width,
                "target_height": target_height
            }

            print(f"デコード完了。現在の潜在変数形状 {real_history_latents.shape}; ピクセル形状 {history_pixels.shape}")
            print(f"MP4から {len(extracted_frames)} フレームを抽出しました")

            # 元の解像度にリサイズした結果フレームを作成
            stream.output_queue.push(("progress", (None, "", make_progress_bar_html(90, "リサイズ中 ..."))))
            
            if resize_info:
                print(f"原寸サイズに戻します: {resize_info['original_width']}x{resize_info['original_height']}")
                resized_frames = []
                
                for frame_path in extracted_frames:
                    # フレームを読み込み
                    frame = Image.open(frame_path)
                    # 元のサイズにリサイズ
                    resized_frame = frame.resize((resize_info['original_width'], resize_info['original_height']), Image.LANCZOS)
                    # 一時ファイルに保存
                    resized_path = frame_path.replace(".png", "_resized.png")
                    resized_frame.save(resized_path)
                    resized_frames.append(resized_path)
                
                # 元のサイズに戻したフレームのリストを使用
                extracted_frames = resized_frames

            # 最後のフレームのパスを取得
            last_frame_path = extracted_frames[0] if extracted_frames else None
            
            # 結果を送信
            stream.output_queue.push(("file", (output_path, last_frame_path, resize_info, temp_dir)))

            if is_last_section:
                break
    except Exception as e:
        traceback.print_exc()
        print(f"エラーが発生しました: {str(e)}")

        if not cpu_fallback_mode and not high_vram:
            try:
                unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
            except:
                pass

        # エラー発生時も一時ファイルを削除
        cleanup_temp_files(temp_dir)
        
        # エラーメッセージを送信
        stream.output_queue.push(("error", str(e)))
        
    stream.output_queue.push(("end", None))
    return

# 統合版プロセス関数 - GPU/CPU両方に対応（Spaces環境向け改良版）
if IN_HF_SPACE and 'spaces' in globals():
    @spaces.GPU(duration=120)
    def process_with_temp(image_mask_dict, lora_multiplier=1.0):
        """Run the temp-file based generation flow and stream UI updates (Spaces build).

        This is a generator used as a Gradio event handler. Every yielded
        7-tuple maps, in order, to: preview_image, result_frame, progress_desc,
        progress_bar, start_button, end_button, mode_info.

        Args:
            image_mask_dict: Value of the ImageMask input; ``None`` when no
                image has been provided.
            lora_multiplier: LoRA strength forwarded to the worker.

        Yields:
            Tuples of component updates as described above.

        Raises:
            Exception: Anything raised while consuming the worker's output
                queue is logged and re-raised; temporary files are removed
                first.
        """
        global stream, cpu_fallback_mode

        def mode_label(suffix=""):
            # Evaluated at call time so it reflects the current global flag.
            return f"実行モード: {'CPU' if cpu_fallback_mode else 'GPU'}{suffix}"

        def load_frame(path):
            # Return a fully loaded PIL image, or None when there is nothing to show
            # (job cancelled before a frame existed, or the file is already gone).
            if not path or not os.path.exists(path):
                return None
            frame = Image.open(path)
            # Read the pixels now: the temp directory is deleted once the
            # final update has been delivered.
            frame.load()
            return frame

        def result_update(frame):
            # Show the result image when available, otherwise keep it hidden.
            if frame is None:
                return gr.update(visible=False)
            return gr.update(visible=True, value=frame)

        # Check the GPU state up front.
        check_gpu_status()

        # Make sure an input image was provided.
        if image_mask_dict is None:
            # This function is a generator, so the message has to be yielded;
            # a value passed to `return` would never reach the UI.
            yield (
                gr.update(visible=False),   # preview_image
                gr.update(visible=False),   # result_frame
                "画像をアップロードしてマスクを描画してください",  # progress_desc
                "",                         # progress_bar
                gr.update(interactive=True),   # start_button
                gr.update(interactive=False),  # end_button
                mode_label(),               # mode_info
            )
            return

        # Reset the UI when processing starts.
        yield (
            gr.update(visible=False),  # preview_image
            gr.update(visible=False),  # result_frame
            "",                        # progress_desc
            "",                        # progress_bar
            gr.update(interactive=False),  # start_button
            gr.update(interactive=True),   # end_button
            mode_label(),              # mode_info
        )

        # Fixed parameters.
        prompt = "A yellow and purple checkerboard mask fades away, smoothly revealing the background beneath, while the areas not covered by the mask remain completely still."
        n_prompt = None
        seed = 1234
        steps = 25  # the worker caps this at 15 when running in CPU mode
        cfg = 1.0
        gs = 10.0
        rs = 0.0
        gpu_memory_preservation = 6.0
        use_teacache = False
        mp4_crf = 0
        lora_file = "./LoRA/mask_fadeout_V1.safetensors"
        fp8_optimization = False

        # Continue without LoRA when the weights file is missing.
        if not os.path.exists(lora_file):
            print(f"警告: LoRAファイル {lora_file} が見つかりません。LoRAなしで処理を続行します。")
            lora_file = None

        try:
            # Fall back to CPU when the GPU quota is already exhausted.
            if check_gpu_quota_exceeded():
                print("GPU使用制限を超えているため、CPUモードで実行します")
                cpu_fallback_mode = True
        except Exception as e:
            # Best effort: a failing quota check must not block the run.
            print(f"GPU確認中にエラーが発生しました: {e}")

        # Log the execution mode.
        print(f"実行モード: {'CPU' if cpu_fallback_mode else 'GPU'}")

        # Start the asynchronous worker.
        stream = AsyncStream()
        async_run(
            worker_with_temp_files,
            image_mask_dict,
            prompt,
            n_prompt,
            seed,
            steps,
            cfg,
            gs,
            rs,
            gpu_memory_preservation,
            use_teacache,
            mp4_crf,
            lora_file,
            lora_multiplier,
            fp8_optimization,
        )

        temp_dir = None
        last_frame_path = None

        try:
            while True:
                flag, data = stream.output_queue.next()

                # A finished file was delivered.
                if flag == "file":
                    _output_path, last_frame_path, _resize_info, temp_dir = data

                    yield (
                        gr.update(visible=True),                     # preview_image keeps its last preview
                        result_update(load_frame(last_frame_path)),  # result_frame
                        gr.update(),                                 # progress_desc
                        gr.update(),                                 # progress_bar
                        gr.update(interactive=False),                # start_button
                        gr.update(interactive=True),                 # end_button
                        mode_label(),                                # mode_info
                    )

                # A progress update was delivered.
                elif flag == "progress":
                    preview, desc, html = data
                    yield (
                        gr.update(visible=True, value=preview),  # preview_image
                        gr.update(visible=False),                # result_frame stays hidden
                        desc,                                    # progress_desc
                        html,                                    # progress_bar
                        gr.update(interactive=False),            # start_button
                        gr.update(interactive=True),             # end_button
                        mode_label(),                            # mode_info
                    )

                # The worker reported an error.
                elif flag == "error":
                    error_message = data
                    yield (
                        gr.update(visible=False),      # preview_image
                        gr.update(visible=False),      # result_frame
                        error_message,                 # progress_desc shows the error
                        "",                            # progress_bar
                        gr.update(interactive=True),   # start_button
                        gr.update(interactive=False),  # end_button
                        mode_label(" (エラー発生)"),    # mode_info
                    )
                    break

                # The worker finished (or was cancelled).
                elif flag == "end":
                    # last_frame_path is still None when the job ended before
                    # any frame was produced; the buttons must be reset anyway.
                    yield (
                        gr.update(visible=False),                    # preview_image
                        result_update(load_frame(last_frame_path)),  # result_frame
                        "",                                          # progress_desc
                        "",                                          # progress_bar
                        gr.update(interactive=True),                 # start_button
                        gr.update(interactive=False),                # end_button
                        mode_label(" (完了)"),                        # mode_info
                    )
                    break

        except Exception as e:
            print(f"処理中にエラーが発生しました: {e}")
            raise
        finally:
            # Runs on normal completion, on errors and when the generator is
            # closed early, so temporary files never outlive the request.
            if temp_dir and os.path.exists(temp_dir):
                cleanup_temp_files(temp_dir)
else:
    # 非Spaces環境用のプロセス関数
    def process_with_temp(image_mask_dict, lora_multiplier=1.0):
        """Run the temp-file based generation flow and stream UI updates (non-Spaces build).

        This variant always sets the global ``cpu_fallback_mode`` to ``True``.
        It is a generator used as a Gradio event handler. Every yielded
        7-tuple maps, in order, to: preview_image, result_frame, progress_desc,
        progress_bar, start_button, end_button, mode_info.

        Args:
            image_mask_dict: Value of the ImageMask input; ``None`` when no
                image has been provided.
            lora_multiplier: LoRA strength forwarded to the worker.

        Yields:
            Tuples of component updates as described above.

        Raises:
            Exception: Anything raised while consuming the worker's output
                queue is logged and re-raised; temporary files are removed
                first.
        """
        global stream, cpu_fallback_mode

        def load_frame(path):
            # Return a fully loaded PIL image, or None when there is nothing to show
            # (job cancelled before a frame existed, or the file is already gone).
            if not path or not os.path.exists(path):
                return None
            frame = Image.open(path)
            # Read the pixels now: the temp directory is deleted once the
            # final update has been delivered.
            frame.load()
            return frame

        # CPU-only mode.
        cpu_fallback_mode = True

        # Make sure an input image was provided.
        if image_mask_dict is None:
            # This function is a generator, so the message has to be yielded;
            # a value passed to `return` would never reach the UI.
            yield (
                gr.update(visible=False),   # preview_image
                gr.update(visible=False),   # result_frame
                "画像をアップロードしてマスクを描画してください",  # progress_desc
                "",                         # progress_bar
                gr.update(interactive=True),   # start_button
                gr.update(interactive=False),  # end_button
                "実行モード: CPU (通常環境)",  # mode_info
            )
            return

        # Reset the UI when processing starts.
        yield (
            gr.update(visible=False),  # preview_image
            gr.update(visible=False),  # result_frame
            "",                        # progress_desc
            "",                        # progress_bar
            gr.update(interactive=False),  # start_button
            gr.update(interactive=True),   # end_button
            "実行モード: CPU (通常環境)",  # mode_info
        )

        # Fixed parameters.
        prompt = "A yellow and purple checkerboard mask fades away, smoothly revealing the background beneath, while the areas not covered by the mask remain completely still."
        n_prompt = None
        seed = 1234
        steps = 15  # fewer steps for the CPU environment
        cfg = 1.0
        gs = 10.0
        rs = 0.0
        gpu_memory_preservation = 6.0
        use_teacache = False
        mp4_crf = 0
        lora_file = "./LoRA/mask_fadeout_V1.safetensors"
        fp8_optimization = False

        # Continue without LoRA when the weights file is missing.
        if not os.path.exists(lora_file):
            print(f"警告: LoRAファイル {lora_file} が見つかりません。LoRAなしで処理を続行します。")
            lora_file = None

        # Start the asynchronous worker.
        stream = AsyncStream()
        async_run(
            worker_with_temp_files,
            image_mask_dict,
            prompt,
            n_prompt,
            seed,
            steps,
            cfg,
            gs,
            rs,
            gpu_memory_preservation,
            use_teacache,
            mp4_crf,
            lora_file,
            lora_multiplier,
            fp8_optimization,
        )

        temp_dir = None
        last_frame_path = None

        try:
            while True:
                flag, data = stream.output_queue.next()

                # A finished file was delivered.
                if flag == "file":
                    _output_path, last_frame_path, _resize_info, temp_dir = data

                    if last_frame_path:
                        result_update = gr.update(visible=True, value=last_frame_path)
                    else:
                        # No frame was extracted; keep the result hidden.
                        result_update = gr.update(visible=False)

                    yield (
                        gr.update(visible=True),       # preview_image keeps its last preview
                        result_update,                 # result_frame shows the first frame
                        gr.update(),                   # progress_desc
                        gr.update(),                   # progress_bar
                        gr.update(interactive=False),  # start_button
                        gr.update(interactive=True),   # end_button
                        "実行モード: CPU (通常環境)",  # mode_info
                    )

                # A progress update was delivered.
                elif flag == "progress":
                    preview, desc, html = data
                    yield (
                        gr.update(visible=True, value=preview),  # preview_image
                        gr.update(visible=False),                # result_frame stays hidden
                        desc,                                    # progress_desc
                        html,                                    # progress_bar
                        gr.update(interactive=False),            # start_button
                        gr.update(interactive=True),             # end_button
                        "実行モード: CPU (通常環境)",  # mode_info
                    )

                # The worker reported an error.
                elif flag == "error":
                    error_message = data
                    yield (
                        gr.update(visible=False),      # preview_image
                        gr.update(visible=False),      # result_frame
                        error_message,                 # progress_desc shows the error
                        "",                            # progress_bar
                        gr.update(interactive=True),   # start_button
                        gr.update(interactive=False),  # end_button
                        "実行モード: CPU (エラー発生)",  # mode_info
                    )
                    break

                # The worker finished (or was cancelled).
                elif flag == "end":
                    # last_frame_path is still None when the job ended before
                    # any frame was produced; the buttons must be reset anyway.
                    img_end = load_frame(last_frame_path)
                    if img_end is not None:
                        final_update = gr.update(visible=True, value=img_end)
                    else:
                        final_update = gr.update(visible=False)

                    yield (
                        gr.update(visible=False),      # preview_image
                        final_update,                  # result_frame shows the single result
                        "",                            # progress_desc
                        "",                            # progress_bar
                        gr.update(interactive=True),   # start_button
                        gr.update(interactive=False),  # end_button
                        "実行モード: CPU (完了)",  # mode_info
                    )
                    break

        except Exception as e:
            print(f"処理中にエラーが発生しました: {e}")
            raise
        finally:
            # Runs on normal completion, on errors and when the generator is
            # closed early, so temporary files never outlive the request.
            if temp_dir and os.path.exists(temp_dir):
                cleanup_temp_files(temp_dir)


# 処理終了時に明示的にGPUメモリを解放
def cleanup_gpu_resources():
    """Move every entry of the global ``models`` mapping to the CPU, then clear the CUDA cache.

    Failures are printed and otherwise ignored, both per model and for the
    cache-clearing step.
    """
    global models

    # Move all models to the CPU; entries without a callable ``to`` are skipped.
    for label, module in models.items():
        try:
            mover = None if module is None else getattr(module, 'to', None)
            if not callable(mover):
                continue
            mover('cpu')
            print(f"{label}をCPUに移動しました")
        except Exception as err:
            print(f"{label}のCPU移動中にエラー: {err}")

    # Clear the cache.
    try:
        torch.cuda.empty_cache()
        gc.collect()
        print("GPUキャッシュをクリアしました")
    except Exception as err:
        print(f"GPUキャッシュクリア中にエラー: {err}")

# 処理中止・クリーンアップ関数（GPU/CPU共通）
def end_process_with_cleanup():
    """Abort the running job and delete leftover temporary directories.

    Calls ``cleanup_gpu_resources``, pushes ``"end"`` onto the input queue of
    the global ``stream`` (when it is not ``None``), and finally removes every
    ``hunyuan_temp_*`` directory found under the system temp directory.
    Failures while deleting a directory are printed and skipped.
    """
    global stream

    cleanup_gpu_resources()

    if stream is not None:
        stream.input_queue.push("end")
        print("処理を中止しました")

    # Remove every existing temp directory, just in case.
    pattern = os.path.join(tempfile.gettempdir(), "hunyuan_temp_*")
    for dir_path in glob.glob(pattern):
        if not os.path.exists(dir_path):
            continue
        try:
            shutil.rmtree(dir_path)
            print(f"一時ディレクトリを削除しました: {dir_path}")
        except Exception as e:
            print(f"一時ディレクトリの削除中にエラーが発生しました: {e}")

css = make_progress_bar_css()
block = gr.Blocks(css=css).queue()
with block:
    gr.Markdown("# FramePackI2V_HY_mask_fadeout - 画像のマスクした部分を除去")
    with gr.Row():
        with gr.Column():
            # Input image, provided through an ImageMask component.
            image_mask = gr.ImageMask(
                label="画像をアップロードしてマスクを描画",
                type="pil",
                brush=gr.Brush(
                    colors=["#FF00FF", "#FFFFFF", "#000000", "#FF0000", "#00FF00", "#0000FF", "#FFFF00"],
                    default_color="#FF00FF",
                    color_mode="defaults"
                ),
                layers=True,
                height="70vh",
                width="60vh",
            )

            with gr.Row():
                start_button = gr.Button(value="生成開始")
                end_button = gr.Button(value="生成中止", interactive=False)

            with gr.Group():
                lora_multiplier = gr.Slider(label="LoRA倍率", minimum=0.0, maximum=2.0, value=1.0, step=0.1)

            with gr.Group():
                mode_info = gr.Markdown(f"実行モード: {'CPU' if cpu_fallback_mode else 'GPU'}")

        with gr.Column():
            preview_image = gr.Image(label="生成プレビュー", visible=False)
            result_frame = gr.Image(label="生成結果", visible=False, type="pil", height="60vh")

            progress_desc = gr.Markdown("", elem_classes="no-generating-animation")
            progress_bar = gr.HTML("", elem_classes="no-generating-animation")

    ips = [
        image_mask,
        lora_multiplier,
    ]

    # The outputs are the same in Spaces and non-Spaces environments; the
    # order must match the 7-tuples yielded by process_with_temp.
    ops = [
        preview_image,
        result_frame,
        progress_desc,
        progress_bar,
        start_button,
        end_button,
        mode_info,
    ]

    start_button.click(
        fn=process_with_temp, inputs=ips, outputs=ops
    )
    end_button.click(fn=end_process_with_cleanup)

# Application launch function
def launch_app():
    """Launch the Gradio ``block`` on 0.0.0.0 with sharing and browser auto-open disabled."""
    # Standard launch settings.
    launch_options = {
        "server_name": "0.0.0.0",
        "share": False,
        "inbrowser": False,
    }
    block.launch(**launch_options)

# Launch the application as soon as this module is executed.
launch_app()