Spaces:

anirudh97
/

textual_inversion

Sleeping

App Files Files Community

textual_inversion / app.py

anirudh97

final_app

17b1c77 3 months ago

raw

history blame contribute delete

15 kB

	import os
	import torch
	import gradio as gr
	from PIL import Image
	from diffusers import StableDiffusionPipeline, DiffusionPipeline
	from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
	from tqdm.auto import tqdm
	import torchvision.transforms as T
	import torch.nn.functional as F
	import gc
	import signal
	import time
	import traceback

	# Configure constants - optimized for CPU
	# Output resolution in pixels; generate_image divides each by 8 to size the latents.
	HEIGHT, WIDTH = 384, 384 # Smaller images use less memory
	# Guidance weight: generate_image scales (text - unconditional) noise predictions by this.
	GUIDANCE_SCALE = 7.5
	# Multiplier applied to the value returned by image_loss before gradients are taken.
	LOSS_SCALE = 200
	# Number of scheduler timesteps, i.e. iterations of the denoising loop.
	NUM_INFERENCE_STEPS = 30 # Reduced from 50
	# Latents generated per call; also the number of empty prompts used for the
	# unconditional embedding.
	BATCH_SIZE = 1
	# Initial value of the prompt textbox in the UI.
	DEFAULT_PROMPT = "A deadly witcher slinging a sword with a lion medallion in his neck, casting a fire spell from his hand in a snowy forest"

	# Define the device
	# Preference order: CUDA, then MPS, then CPU. Printed once at import time.
	TORCH_DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
	print(f"Using device: {TORCH_DEVICE}")

	# Initialize the elastic transformer
	# Single shared ElasticTransform instance, used by the 'elastic' branch of image_loss.
	elastic_transformer = T.ElasticTransform(alpha=550.0, sigma=5.0)

	# Timeout handler for CPU processing
	def timeout_handler(signum, frame):
	    """Signal handler that aborts the running operation with ``TimeoutError``.

	    Both arguments are supplied by the ``signal`` machinery and are ignored.
	    """
	    message = "Image generation took too long"
	    raise TimeoutError(message)

	# Load the model
	def load_model():
	    """Load the Stable Diffusion v1-4 pipeline plus its textual-inversion concepts.

	    The pipeline is created in float16 on CUDA and float32 otherwise, with the
	    safety checker disabled, and moved to ``TORCH_DEVICE``. Each concept is
	    loaded independently; one that fails is reported and skipped.

	    On CPU a ``SIGALRM`` watchdog bounds the total load time. It is armed only
	    when the platform provides ``SIGALRM`` *and* the call runs on the main
	    thread, because ``signal.signal`` raises ``ValueError`` from any other
	    thread (for example a UI worker thread).

	    Returns:
	        The loaded pipeline.

	    Raises:
	        Exception: Whatever the load raised (including ``TimeoutError`` from
	            the watchdog); it is logged with a traceback and re-raised.
	    """
	    import threading  # local import: only needed for the main-thread check

	    use_alarm = (
	        TORCH_DEVICE == "cpu"
	        and hasattr(signal, "SIGALRM")
	        and threading.current_thread() is threading.main_thread()
	    )

	    concepts = (
	        "sd-concepts-library/rimworld-art-style",
	        "sd-concepts-library/hk-goldenlantern",
	        "sd-concepts-library/phoenix-01",
	        "sd-concepts-library/fractal-flame",
	        "sd-concepts-library/scarlet-witch",
	    )

	    try:
	        if use_alarm:
	            signal.signal(signal.SIGALRM, timeout_handler)
	            signal.alarm(2100)  # 2100 s = 35 minutes for download + load

	        pipe = DiffusionPipeline.from_pretrained(
	            "CompVis/stable-diffusion-v1-4",
	            torch_dtype=torch.float16 if TORCH_DEVICE == "cuda" else torch.float32,
	            safety_checker=None,  # Disable safety checker for memory
	            low_cpu_mem_usage=True,  # Enable memory optimization
	        ).to(TORCH_DEVICE)

	        # Load concepts one at a time so a single failure does not abort the
	        # rest, reclaiming memory between loads on CPU.
	        for concept in concepts:
	            try:
	                print(f"Loading textual inversion concept: {concept}")
	                pipe.load_textual_inversion(concept, mean_resizing=False)
	                if TORCH_DEVICE == "cpu":
	                    gc.collect()
	            except Exception as e:
	                print(f"Warning: Could not load textual inversion concept {concept}: {e}")

	        return pipe
	    except Exception as e:
	        print(f"Error loading model: {e}")
	        traceback.print_exc()
	        raise
	    finally:
	        # Always disarm the watchdog, on success and on failure alike.
	        if use_alarm:
	            signal.alarm(0)

	# Helper functions
	def image_grid(imgs, rows, cols):
	    """Paste equally sized PIL images into a ``rows`` x ``cols`` grid.

	    Images are placed row by row, left to right. The size of the first image
	    is used as the cell size for every position.

	    Args:
	        imgs: Sequence of PIL images.
	        rows: Number of grid rows.
	        cols: Number of grid columns.

	    Returns:
	        A new RGB image of size ``(cols * w, rows * h)``.

	    Raises:
	        ValueError: If ``imgs`` is empty or ``len(imgs) != rows * cols``.
	    """
	    if not imgs:
	        raise ValueError("image_grid requires at least one image")
	    if len(imgs) != rows * cols:
	        raise ValueError(
	            f"expected {rows * cols} images for a {rows}x{cols} grid, got {len(imgs)}"
	        )

	    w, h = imgs[0].size
	    grid = Image.new('RGB', size=(cols * w, rows * h))

	    for i, img in enumerate(imgs):
	        # Column index selects the x offset, row index the y offset.
	        grid.paste(img, box=(i % cols * w, i // cols * h))
	    return grid

	def image_loss(images, loss_type):
	    """Return the scalar guidance loss selected by ``loss_type``.

	    Supported values:
	        'blue':       mean absolute distance of channel index 2 from 0.9.
	        'elastic':    mean absolute difference to an elastically transformed copy.
	        'symmetry':   MSE between the images and a copy flipped along dim 3.
	        'saturation': mean absolute difference to a copy with saturation factor 10.

	    Any other value prints an error message and yields a zero tensor.

	    Args:
	        images: Image tensor; the indexing used here assumes a
	            (batch, channel, height, width) layout - TODO confirm with callers.
	        loss_type: Name of the loss to compute.
	    """
	    if loss_type == 'blue':
	        return torch.abs(images[:, 2] - 0.9).mean()

	    if loss_type == 'elastic':
	        warped = elastic_transformer(images)
	        return torch.abs(warped - images).mean()

	    if loss_type == 'symmetry':
	        mirrored = torch.flip(images, [3])
	        return F.mse_loss(images, mirrored)

	    if loss_type == 'saturation':
	        boosted = T.functional.adjust_saturation(images, saturation_factor=10)
	        return torch.abs(boosted - images).mean()

	    print("Error. Loss not defined")
	    return torch.tensor(0.0)

	def latents_to_pil(latents, pipe):
	    """Decode a batch of latents with ``pipe.vae`` into a list of PIL images.

	    The latents are multiplied by ``1 / 0.18215`` before decoding; the decoded
	    output is mapped via ``x / 2 + 0.5``, clamped to [0, 1], moved to the CPU
	    with the channel axis last, and converted to 8-bit arrays.
	    """
	    scaled = latents * (1 / 0.18215)
	    with torch.no_grad():
	        decoded = pipe.vae.decode(scaled).sample

	    unit_range = (decoded / 2 + 0.5).clamp(0, 1)
	    channels_last = unit_range.detach().cpu().permute(0, 2, 3, 1).numpy()
	    as_bytes = (channels_last * 255).round().astype("uint8")
	    return list(map(Image.fromarray, as_bytes))

	def generate_image(pipe, seed_no, prompts, loss_type, loss_apply=False, progress=gr.Progress()):
	    """Run the denoising loop for one seed and return the final latents.

	    Args:
	        pipe: Loaded pipeline providing ``tokenizer``, ``text_encoder``,
	            ``unet`` and ``vae``.
	        seed_no: Integer seed for the initial latent noise.
	        prompts: List of prompt strings. The latents and the unconditional
	            embedding are sized by ``BATCH_SIZE``, so callers are expected to
	            pass ``BATCH_SIZE`` prompts.
	        loss_type: Loss name understood by ``image_loss``; ``"N/A"`` disables
	            loss guidance.
	        loss_apply: When true (and ``loss_type`` is not ``"N/A"``), the latents
	            are nudged along the loss gradient on every fifth step.
	        progress: Gradio progress callback.

	    Returns:
	        Latents of shape ``(BATCH_SIZE, in_channels, HEIGHT // 8, WIDTH // 8)``.
	        If anything fails, the error is logged with a traceback and all-zero
	        latents of that shape are returned instead.

	    On CPU a 10-minute ``SIGALRM`` watchdog is armed, but only when the
	    platform has ``SIGALRM`` and the call runs on the main thread
	    (``signal.signal`` raises ``ValueError`` elsewhere). The alarm is always
	    cleared on exit so it cannot fire after this function has returned.
	    """
	    import threading  # local import: only needed for the main-thread check

	    use_alarm = (
	        TORCH_DEVICE == "cpu"
	        and hasattr(signal, "SIGALRM")
	        and threading.current_thread() is threading.main_thread()
	    )

	    try:
	        if use_alarm:
	            signal.signal(signal.SIGALRM, timeout_handler)
	            signal.alarm(600)  # 10 minute timeout

	        # Initialization and setup
	        generator = torch.manual_seed(seed_no)

	        scheduler = LMSDiscreteScheduler(
	            beta_start=0.00085,
	            beta_end=0.012,
	            beta_schedule="scaled_linear",
	            num_train_timesteps=1000,
	        )
	        scheduler.set_timesteps(NUM_INFERENCE_STEPS)
	        scheduler.timesteps = scheduler.timesteps.to(torch.float32)

	        # Text processing: conditional embedding for the prompts
	        text_input = pipe.tokenizer(
	            prompts,
	            padding='max_length',
	            max_length=pipe.tokenizer.model_max_length,
	            truncation=True,
	            return_tensors="pt",
	        )
	        with torch.no_grad():
	            text_embeddings = pipe.text_encoder(text_input.input_ids.to(TORCH_DEVICE))[0]

	        # Unconditional embedding, padded to the same token length
	        max_length = text_input.input_ids.shape[-1]
	        uncond_input = pipe.tokenizer(
	            [""] * BATCH_SIZE,
	            padding="max_length",
	            max_length=max_length,
	            return_tensors="pt",
	        )
	        with torch.no_grad():
	            uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(TORCH_DEVICE))[0]

	        # Order matters: the guidance step below splits as (uncond, text).
	        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

	        # Random initial latents, drawn with the seeded generator
	        latents = torch.randn(
	            (BATCH_SIZE, pipe.unet.config.in_channels, HEIGHT // 8, WIDTH // 8),
	            generator=generator,
	        )
	        if TORCH_DEVICE == "cuda":
	            # The pipeline is loaded in float16 on CUDA.
	            latents = latents.to(torch.float16)
	        latents = latents.to(TORCH_DEVICE)
	        latents = latents * scheduler.init_noise_sigma

	        # Diffusion process
	        timesteps = scheduler.timesteps
	        num_steps = len(timesteps)
	        progress(0, desc="Generating")

	        for i, t in enumerate(timesteps):
	            progress((i + 1) / num_steps, desc=f"Diffusion step {i+1}/{num_steps}")

	            # Duplicate the latents so one UNet pass yields both predictions.
	            latent_model_input = torch.cat([latents] * 2)
	            sigma = scheduler.sigmas[i]
	            latent_model_input = scheduler.scale_model_input(latent_model_input, t)

	            with torch.no_grad():
	                noise_pred = pipe.unet(
	                    latent_model_input,
	                    t,
	                    encoder_hidden_states=text_embeddings,
	                )["sample"]

	            # Combine unconditional and text-conditioned predictions
	            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	            noise_pred = noise_pred_uncond + GUIDANCE_SCALE * (noise_pred_text - noise_pred_uncond)

	            # Apply loss guidance if requested
	            if loss_apply and i % 5 == 0 and loss_type != "N/A":
	                latents = latents.detach().requires_grad_()
	                latents_x0 = latents - sigma * noise_pred

	                # Decode the denoised estimate so the loss is computed on images
	                denoised_images = pipe.vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5

	                loss = image_loss(denoised_images, loss_type) * LOSS_SCALE
	                print(f"Step {i}, Loss: {loss.item()}")

	                # Step the latents against the loss gradient
	                cond_grad = torch.autograd.grad(loss, latents)[0]
	                latents = latents.detach() - cond_grad * sigma**2

	            # Advance the latents by one scheduler step
	            latents = scheduler.step(noise_pred, t, latents).prev_sample

	            # Garbage collect every 5 steps if on CPU
	            if TORCH_DEVICE == "cpu" and i % 5 == 0:
	                gc.collect()

	        return latents

	    except Exception as e:
	        print(f"Error in generate_image: {e}")
	        traceback.print_exc()
	        # Return empty latents as fallback
	        return torch.zeros(
	            (BATCH_SIZE, pipe.unet.config.in_channels, HEIGHT // 8, WIDTH // 8),
	            device=TORCH_DEVICE,
	        )
	    finally:
	        # Disarm the watchdog on every exit path, including failures.
	        if use_alarm:
	            signal.alarm(0)

	def generate_images(prompt, loss_type, apply_loss, seeds, pipe, progress=gr.Progress()):
	    """Generate one image per seed and return the result as a single PIL image.

	    Args:
	        prompt: Prompt string used for every seed.
	        loss_type: Loss name forwarded to ``generate_image``.
	        apply_loss: Whether loss guidance is enabled.
	        seeds: Comma-separated integer seeds; blank entries are ignored and an
	            empty list falls back to seed 1000.
	        pipe: Loaded pipeline forwarded to ``generate_image`` and ``latents_to_pil``.
	        progress: Gradio progress callback.

	    Returns:
	        The single generated image, or a one-row grid when there are several.
	        A seed that fails contributes a solid red placeholder; if the whole
	        call fails (for example on a non-integer seed) a single red image of
	        size ``(WIDTH, HEIGHT)`` is returned.
	    """
	    try:
	        # Convert comma-separated string to a list of ints, skipping blanks
	        seed_values = [int(token) for token in (part.strip() for part in seeds.split(',')) if token]
	        if not seed_values:
	            seed_values = [1000]  # Default seed if none provided

	        images_list = []

	        # Process one seed at a time to save memory
	        for i, seed_no in enumerate(seed_values):
	            progress((i / len(seed_values)) * 0.1, desc=f"Starting seed {seed_no}")

	            # Clear memory
	            if TORCH_DEVICE == "cuda":
	                torch.cuda.empty_cache()
	            gc.collect()

	            try:
	                latents = generate_image(
	                    pipe, seed_no, [prompt], loss_type, loss_apply=apply_loss, progress=progress
	                )
	                images_list.extend(latents_to_pil(latents, pipe))
	            except Exception as e:
	                print(f"Error generating image with seed {seed_no}: {e}")
	                traceback.print_exc()
	                # PIL sizes are (width, height)
	                images_list.append(Image.new('RGB', (WIDTH, HEIGHT), color=(255, 0, 0)))

	            # Force garbage collection
	            gc.collect()

	        # Create image grid
	        if len(images_list) > 1:
	            return image_grid(images_list, 1, len(images_list))
	        return images_list[0]

	    except Exception as e:
	        print(f"Error in generate_images: {e}")
	        traceback.print_exc()
	        # Create an error image
	        return Image.new('RGB', (WIDTH, HEIGHT), color=(255, 0, 0))

	# Gradio Interface
	def create_interface():
	    """Build and return the Gradio Blocks UI.

	    The model is not loaded while the interface is built: it is loaded by the
	    "Load Model" button and kept in a closure variable shared with the
	    generate handler. The "Generate Images" button starts disabled and is
	    enabled only after a successful load.

	    Returns:
	        The ``gr.Blocks`` application (not yet launched).
	    """
	    with gr.Blocks(title="Stable Diffusion Text Inversion with Loss Functions") as app:
	        gr.Markdown("""
	        # Stable Diffusion Text Inversion with Loss Functions

	        Generate images using Stable Diffusion with various loss functions to guide the diffusion process.
	        """)

	        if TORCH_DEVICE == "cpu":
	            gr.Markdown("""
	            ⚠️ Running on CPU: Generation will be slow and memory-intensive.
	            Each image may take several minutes to generate.
	            """)

	        pipe = None  # Initialize to None to avoid loading during interface creation

	        with gr.Row():
	            with gr.Column():
	                prompt = gr.Textbox(
	                    label="Prompt",
	                    value=DEFAULT_PROMPT,
	                    lines=3,
	                )

	                loss_type = gr.Radio(
	                    label="Loss Type",
	                    choices=["N/A", "blue", "elastic", "symmetry", "saturation"],
	                    value="N/A",
	                )

	                apply_loss = gr.Checkbox(
	                    label="Apply Loss Function",
	                    value=False,
	                )

	                if TORCH_DEVICE == "cpu":
	                    seeds = gr.Textbox(
	                        label="Seeds (comma-separated) - Use fewer seeds for CPU",
	                        value="1000",
	                        lines=1,
	                    )
	                else:
	                    seeds = gr.Textbox(
	                        label="Seeds (comma-separated)",
	                        value="3000,2000,1000",
	                        lines=1,
	                    )

	                # Load model button
	                load_model_btn = gr.Button("Load Model")
	                model_status = gr.Textbox(label="Model Status", value="Model not loaded", interactive=False)

	                generate_btn = gr.Button("Generate Images", interactive=False)

	            with gr.Column():
	                output_image = gr.Image(label="Generated Image")

	        def load_model_fn():
	            """Load the pipeline and report status plus the new button state."""
	            nonlocal pipe
	            try:
	                pipe = load_model()
	            except Exception as e:
	                return f"Error loading model: {str(e)}", gr.update(interactive=False)
	            # A bare True would replace the button's value; an update object is
	            # needed to change its ``interactive`` property.
	            return "Model loaded successfully", gr.update(interactive=True)

	        def generate_fn(prompt_text, loss_name, use_loss, seed_text, progress=gr.Progress()):
	            """Forward the UI inputs to generate_images with the loaded pipeline.

	            ``progress`` must be declared with a ``gr.Progress()`` default so
	            Gradio injects the tracker; it is not one of the event inputs.
	            """
	            if pipe is None:
	                raise gr.Error("Model not loaded. Click 'Load Model' first.")
	            return generate_images(prompt_text, loss_name, use_loss, seed_text, pipe, progress)

	        load_model_btn.click(
	            fn=load_model_fn,
	            inputs=[],
	            outputs=[model_status, generate_btn],
	        )

	        generate_btn.click(
	            fn=generate_fn,
	            inputs=[prompt, loss_type, apply_loss, seeds],
	            outputs=output_image,
	        )

	        gr.Markdown("""
	        ## About the Loss Functions

	        - Blue: Encourages more blue tones in the image
	        - Elastic: Creates distortion effects by minimizing differences with elastically transformed versions
	        - Symmetry: Encourages symmetrical images by minimizing differences with horizontally flipped versions
	        - Saturation: Increases color saturation in the image

	        Set "N/A" and uncheck "Apply Loss Function" for normal image generation.
	        """)

	        if TORCH_DEVICE == "cpu":
	            gr.Markdown("""
	            ## CPU Mode Tips
	            - Use smaller prompts
	            - Process one seed at a time
	            - Be patient, generation can take 5-10 minutes per image
	            - If you encounter memory errors, try restarting the app and using even smaller dimensions
	            """)

	    return app

	if __name__ == "__main__":
	    # Script entry point: build the Blocks UI, then start serving it.
	    app = create_interface()
	    app.launch()