FLUX-Vision

Running on Zero

File size: 15,523 Bytes

import subprocess
import sys

# Install/upgrade required packages with specific versions
def install_packages():
    """Upgrade each required package with pip, one package at a time.

    Installation is best-effort: a failure for one requirement is printed as
    a warning and the remaining requirements are still attempted.
    """
    requirements = (
        "transformers>=4.46.0",
        "diffusers>=0.31.0",
        "accelerate>=0.26.0",
        "huggingface-hub>=0.23.0",
        "timm",  # Required for Florence-2
    )
    # Invoke pip through the running interpreter so the packages land in the
    # environment this script is executing in.
    pip_upgrade = [sys.executable, "-m", "pip", "install", "--upgrade"]

    for requirement in requirements:
        try:
            subprocess.run(pip_upgrade + [requirement], check=True)
        except Exception as err:
            print(f"Warning: Could not install {requirement}: {err}")

# Run installation before other imports. install_packages() already reports
# per-package failures; this outer guard only keeps startup going if
# something unexpected escapes it.
try:
    install_packages()
except Exception as e:
    print(f"Warning: Could not auto-install packages: {e}")

# Needed for os.environ below; the main import section comes later in the file.
import os

# Try to install flash-attn with a timeout. This is optional: every failure
# path prints a message and lets startup continue.
try:
    print("Attempting to install flash-attn...")
    result = subprocess.run(
        # Use the running interpreter's pip (same as install_packages) and an
        # argument list instead of a shell string, so the timeout terminates
        # pip itself rather than only an intermediate shell.
        [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
        # Extend the inherited environment instead of replacing it; passing
        # only the one variable would drop PATH, HOME, proxy settings, etc.
        env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
        timeout=120,  # 2 minute timeout
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        print("Flash-attn installed successfully")
    else:
        print(f"Flash-attn installation failed: {result.stderr}")
        print("Continuing without flash-attn...")
except subprocess.TimeoutExpired:
    print("Flash-attn installation timed out - continuing without it")
except Exception as e:
    print(f"Flash-attn installation error: {e}")
    print("Continuing without flash-attn...")

import spaces
import argparse
import os
import time
from os import path
import shutil
from datetime import datetime
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download
import gradio as gr
import torch
from diffusers import FluxPipeline
from diffusers.pipelines.stable_diffusion import safety_checker
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Pick an attention backend by probing which optional package imports:
# xformers is preferred, flash_attn is second, otherwise the default is used.
ATTN_METHOD = None
try:
    import xformers
except ImportError:
    try:
        import flash_attn
    except ImportError:
        print("No efficient attention method available, using default")
        ATTN_METHOD = "default"
    else:
        print("Using flash attention")
        ATTN_METHOD = "flash_attn"
else:
    print("Using xformers for efficient attention")
    ATTN_METHOD = "xformers"

# Setup and initialization code
# Model cache directory: a "models" folder next to this file.
cache_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
# Directory later passed to demo.launch(allowed_paths=...); defaults to the CWD.
PERSISTENT_DIR = os.getenv("PERSISTENT_DIR", ".")

# Point the Hugging Face cache variables at the local cache directory.
# NOTE(review): these are assigned after huggingface_hub / diffusers /
# transformers were imported above; those libraries may resolve their cache
# locations at import time - confirm the override actually takes effect.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": cache_path,
        "HF_HUB_CACHE": cache_path,
        "HF_HOME": cache_path,
    }
)

# Allow TF32 for CUDA matmuls.
torch.backends.cuda.matmul.allow_tf32 = True

# Initialize the Florence model registries
print("Initializing Florence models...")
# Both registries are keyed by model repo name and filled by the loading loop.
florence_models = {}
florence_processors = {}

# Make sure timm is importable; install it on the fly if the import fails.
try:
    import timm
except ImportError:
    print("Installing timm...")
    subprocess.run([sys.executable, "-m", "pip", "install", "timm"], check=True)
    import timm
else:
    print("timm library available")
# Initialize Florence models with error handling
model_names = ['gokaygokay/Florence-2-Flux-Large', 'gokaygokay/Florence-2-Flux']

for model_name in model_names:
    try:
        print(f"Loading {model_name}...")
        # Load into locals first so a processor failure cannot leave a model
        # registered without its matching processor.
        # NOTE: trust_remote_code=True executes code shipped in the model repo.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True
        ).eval()
        loaded_processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True
        )
    except Exception as e:
        print(f"Warning: Could not load {model_name}: {e}")
        continue
    florence_models[model_name] = loaded_model
    florence_processors[model_name] = loaded_processor
    print(f"Successfully loaded {model_name}")

# If the large model failed, serve it from the smaller one. The loop above has
# already tried the smaller checkpoint, so alias the loaded objects instead of
# loading that checkpoint a second time.
large_model, fallback_model = model_names
if large_model not in florence_models and fallback_model in florence_models:
    florence_models[large_model] = florence_models[fallback_model]
    florence_processors[large_model] = florence_processors[fallback_model]
    print(f"Using {fallback_model} as fallback")

if not florence_models:
    print("ERROR: No Florence models could be loaded. Caption generation will not work.")
else:
    print(f"Loaded {len(florence_models)} Florence model(s)")

def filter_prompt(prompt):
    """Check a prompt against the blocked-keyword list.

    Matching is a case-insensitive substring test.

    Args:
        prompt: The user-supplied prompt text. ``None`` is treated as an
            empty prompt.

    Returns:
        A ``(is_safe, value)`` tuple. When the prompt is accepted, ``value``
        is the prompt itself (``""`` for ``None``); when it is rejected,
        ``value`` is a rejection message.
    """
    inappropriate_keywords = ("sex",)

    # An empty/cleared input may arrive as None; there is nothing to filter.
    if prompt is None:
        return True, ""

    prompt_lower = prompt.lower()
    if any(keyword in prompt_lower for keyword in inappropriate_keywords):
        return False, "부적절한 내용이 포함된 프롬프트입니다."

    return True, prompt

class timer:
    """Context manager that prints when a block starts and how long it took.

    Usage::

        with timer("inference") as t:
            ...
        # t.elapsed holds the duration in seconds

    Exceptions raised inside the block are not suppressed.
    """

    def __init__(self, method_name="timed process"):
        self.method = method_name
        self.start = None
        self.elapsed = None

    def __enter__(self):
        # perf_counter is monotonic, so the measured interval is not affected
        # by system clock adjustments.
        self.start = time.perf_counter()
        print(f"{self.method} starts")
        # Return the instance so `with timer(...) as t` binds something usable.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.elapsed = time.perf_counter() - self.start
        print(f"{self.method} took {round(self.elapsed, 2)}s")
        return False

# Model initialization
# Create the local model cache directory (cache_path) if it does not exist yet.
if not path.exists(cache_path):
    os.makedirs(cache_path, exist_ok=True)

# Load the FLUX.1-dev pipeline with bfloat16 weights. The resulting `pipe`
# object is configured further below and used by process_and_save_image().
print("Loading FLUX pipeline...")
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", 
    torch_dtype=torch.bfloat16
)

# Configure attention mechanism based on the backend detected at import time.
# Only the xformers case changes the pipeline; the other cases just report.
if ATTN_METHOD == "flash_attn":
    print("Flash attention available")
elif ATTN_METHOD != "xformers":
    print("Using standard attention")
else:
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception as err:
        # Best-effort: keep the default attention if xformers cannot be enabled.
        print(f"Could not enable xformers: {err}")
    else:
        print("Enabled xformers memory efficient attention")

# Download the Hyper-SD LoRA file for FLUX.1-dev from the Hub and load it
# into the pipeline.
# NOTE(review): the file name says "8steps"; presumably this is why the UI's
# default step count is 8 - confirm.
print("Loading LoRA weights...")
pipe.load_lora_weights(
    hf_hub_download(
        "ByteDance/Hyper-SD", 
        "Hyper-FLUX.1-dev-8steps-lora.safetensors"
    )
)
# Fuse the LoRA into the pipeline weights at scale 0.125, then move the
# pipeline to the GPU in bfloat16.
pipe.fuse_lora(lora_scale=0.125)
pipe.to(device="cuda", dtype=torch.bfloat16)

# Safety checker initialization
# Best-effort: attach the Stable Diffusion safety checker to the pipeline
# object; on any failure, warn and set the attribute to None.
# NOTE(review): nothing else in this file reads pipe.safety_checker - confirm
# that FluxPipeline actually consults this attribute during generation.
try:
    pipe.safety_checker = safety_checker.StableDiffusionSafetyChecker.from_pretrained(
        "CompVis/stable-diffusion-safety-checker"
    )
except Exception as e:
    print(f"Warning: Could not load safety checker: {e}")
    pipe.safety_checker = None

@spaces.GPU
def generate_caption(image, model_name='gokaygokay/Florence-2-Flux-Large'):
    """Generate a detailed text description of an image with a Florence model.

    Args:
        image: The uploaded image as a numpy array (the UI image component is
            configured with ``type="numpy"``), or ``None`` if nothing was
            uploaded.
        model_name: Key of the Florence model to use. If that model is not
            available, the first usable loaded model is used instead.

    Returns:
        The generated description string, or a fixed notice string when no
        caption model is available.

    Raises:
        gr.Error: If no image was provided.
    """
    # A model is only usable when its processor was loaded as well.
    usable_models = [name for name in florence_models if name in florence_processors]
    if not usable_models:
        gr.Warning("Caption models are not loaded. Please refresh the page.")
        return "Caption generation unavailable - please describe your image manually"

    # The button can be clicked without an upload; fail with a clear message
    # instead of an opaque error from Image.fromarray(None).
    if image is None:
        raise gr.Error("Please upload an image before generating a caption.")

    # Use fallback model if the requested one isn't available
    if model_name not in usable_models:
        model_name = usable_models[0]
        print(f"Using fallback model: {model_name}")

    image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    task_prompt = "<DESCRIPTION>"
    prompt = task_prompt + "Describe this image in great detail."

    model = florence_models[model_name]
    processor = florence_processors[model_name]

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        repetition_penalty=1.10,
    )
    # Special tokens are kept in the decoded text that is handed to
    # post_process_generation.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    return parsed_answer[task_prompt]

@spaces.GPU
def process_and_save_image(height, width, steps, scales, prompt, seed):
    """Generate one image from a text prompt with the FLUX pipeline.

    The prompt is first passed through filter_prompt(); a rejected prompt
    shows a UI warning and yields no image.

    Args:
        height: Output height; converted with ``int``.
        width: Output width; converted with ``int``.
        steps: Number of inference steps; converted with ``int``.
        scales: Guidance scale; converted with ``float``.
        prompt: Text prompt.
        seed: Seed for the random generator; converted with ``int``.

    Returns:
        The first generated image, or ``None`` if the prompt was rejected or
        generation failed (failures are printed and shown as a UI warning).
    """
    is_safe, filtered_prompt = filter_prompt(prompt)
    if not is_safe:
        gr.Warning("The prompt contains inappropriate content.")
        return None

    with torch.inference_mode():
        with torch.autocast("cuda", dtype=torch.bfloat16):
            with timer("inference"):
                try:
                    rng = torch.Generator().manual_seed(int(seed))
                    result = pipe(
                        prompt=[filtered_prompt],
                        generator=rng,
                        num_inference_steps=int(steps),
                        guidance_scale=float(scales),
                        height=int(height),
                        width=int(width),
                        max_sequence_length=256,
                    )
                    return result.images[0]
                except Exception as err:
                    # Report the failure in the log and the UI instead of raising.
                    message = str(err)
                    print(f"Error in image generation: {message}")
                    gr.Warning(f"Error generating image: {message}")
                    return None

def get_random_seed():
    """Return a random integer seed in the range [0, 1000000)."""
    upper_bound = 1000000  # exclusive
    drawn = torch.randint(0, upper_bound, (1,))
    return drawn.item()

def update_seed():
    """Return a fresh random seed; used as a click handler for the seed field."""
    new_seed = get_random_seed()
    return new_seed

# CSS styles
css = """
footer {display: none !important}
.gradio-container {
    max-width: 1200px;
    margin: auto;
}
.contain {
    background: rgba(255, 255, 255, 0.05);
    border-radius: 12px;
    padding: 20px;
}
.generate-btn {
    background: linear-gradient(90deg, #4B79A1 0%, #283E51 100%) !important;
    border: none !important;
    color: white !important;
}
.generate-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(0,0,0,0.2);
}
.title {
    text-align: center;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 1em;
    background: linear-gradient(90deg, #4B79A1 0%, #283E51 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
.tabs {
    margin-top: 20px;
    border-radius: 10px;
    overflow: hidden;
}
.tab-nav {
    background: linear-gradient(90deg, #4B79A1 0%, #283E51 100%);
    padding: 10px;
}
.tab-nav button {
    color: white;
    border: none;
    padding: 10px 20px;
    margin: 0 5px;
    border-radius: 5px;
    transition: all 0.3s ease;
}
.tab-nav button.selected {
    background: rgba(255, 255, 255, 0.2);
}
.image-upload-container {
    border: 2px dashed #4B79A1;
    border-radius: 10px;
    padding: 20px;
    text-align: center;
    transition: all 0.3s ease;
}
.image-upload-container:hover {
    border-color: #283E51;
    background: rgba(75, 121, 161, 0.1);
}
.primary-btn {
    background: linear-gradient(90deg, #4B79A1 0%, #283E51 100%) !important;
    font-size: 1.2em !important;
    padding: 12px 20px !important;
    margin-top: 20px !important;
}
hr {
    border: none;
    border-top: 1px solid rgba(75, 121, 161, 0.2);
    margin: 20px 0;
}
.input-section {
    background: rgba(255, 255, 255, 0.03);
    border-radius: 12px;
    padding: 20px;
    margin-bottom: 20px;
}
.output-section {
    background: rgba(255, 255, 255, 0.03);
    border-radius: 12px;
    padding: 20px;
}
.example-images {
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 10px;
    margin-bottom: 20px;
}
.example-images img {
    width: 100%;
    height: 150px;
    object-fit: cover;
    border-radius: 8px;
    cursor: pointer;
    transition: transform 0.2s;
}
.example-images img:hover {
    transform: scale(1.05);
}
"""

# Build the Gradio UI: a left column with the image upload, caption button,
# prompt box and generation settings, and a right column with the output image.
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.HTML('<div class="title">FLUX VisionReply</div>')
    gr.HTML('<div style="text-align: center; margin-bottom: 2em;">Upload an image(Image2Text2Image)</div>')
    
    with gr.Row():
        # Left column: input section
        with gr.Column(scale=3):
            # Image upload section
            input_image = gr.Image(
                label="Upload Image (Optional)",
                type="numpy",
                elem_classes=["image-upload-container"]
            )
            
            # Example image gallery
            example_images = [
                "5.jpg",
                "6.jpg",
                "2.jpg",   
                "3.jpg",                
                "1.jpg",
                "4.jpg",
            ]
            gr.Examples(
                examples=example_images,
                inputs=input_image,
                label="Example Images",
                examples_per_page=4
            )
            
            # Florence model selection - hidden from the user; its value is
            # still passed to generate_caption as the model name.
            available_models = list(florence_models.keys()) if florence_models else []
            florence_model = gr.Dropdown(
                choices=available_models,
                label="Caption Model",
                value=available_models[0] if available_models else None,
                visible=False
            )
            
            caption_button = gr.Button(
                "🔍 Generate Caption from Image",
                elem_classes=["generate-btn"]
            )
            
            # Divider
            gr.HTML('<hr style="margin: 20px 0;">')
            
            # Text prompt section
            prompt = gr.Textbox(
                label="Image Description",
                placeholder="Enter text description or use generated caption above...",
                lines=3
            )
            
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    height = gr.Slider(
                        label="Height",
                        minimum=256,
                        maximum=1152,
                        step=64,
                        value=1024
                    )
                    width = gr.Slider(
                        label="Width",
                        minimum=256,
                        maximum=1152,
                        step=64,
                        value=1024
                    )
                
                with gr.Row():
                    steps = gr.Slider(
                        label="Inference Steps",
                        minimum=6,
                        maximum=25,
                        step=1,
                        value=8
                    )
                    scales = gr.Slider(
                        label="Guidance Scale",
                        minimum=0.0,
                        maximum=5.0,
                        step=0.1,
                        value=3.5
                    )
                
                # The initial seed is drawn once, when the UI is built.
                seed = gr.Number(
                    label="Seed",
                    value=get_random_seed(),
                    precision=0
                )
                
                randomize_seed = gr.Button(
                    "🎲 Randomize Seed", 
                    elem_classes=["generate-btn"]
                )
            
            generate_btn = gr.Button(
                "✨ Generate Image",
                elem_classes=["generate-btn", "primary-btn"]
            )

        # Right column: output section
        with gr.Column(scale=4):
            output = gr.Image(
                label="Generated Image",
                elem_classes=["output-image"]
            )

    # Event handlers
    # Caption button: write the generated caption into the prompt textbox.
    caption_button.click(
        generate_caption,
        inputs=[input_image, florence_model],
        outputs=[prompt]
    )
    
    # Generate button: run image generation with the current settings.
    generate_btn.click(
        process_and_save_image,
        inputs=[height, width, steps, scales, prompt, seed],
        outputs=[output]
    )
    
    randomize_seed.click(
        update_seed,
        outputs=[seed]
    )
    
    # Second handler on the generate button: also put a fresh seed into the
    # seed field whenever an image is requested.
    generate_btn.click(
        update_seed,
        outputs=[seed]
    )

# Launch the Gradio app when run as a script; PERSISTENT_DIR is passed to
# launch() as an allowed path.
if __name__ == "__main__":
    demo.launch(allowed_paths=[PERSISTENT_DIR])