comrender committed
Commit b0a9f3e · verified · 1 Parent(s): 93af3e2

Update app.py

Files changed (1):
  1. app.py  +294 -206
app.py CHANGED
@@ -13,7 +13,7 @@ from PIL import Image
 from huggingface_hub import snapshot_download
 import requests

-# For ESRGAN (requires pip install basicsr gfpgan)
+# For ESRGAN (optional - will work without it)
 try:
     from basicsr.archs.rrdbnet_arch import RRDBNet
     from basicsr.utils import img2tensor, tensor2img
@@ -33,9 +33,9 @@ css = """
 }
 """

-# Device setup - Force CPU for startup in ZeroGPU
+# Device setup
 power_device = "ZeroGPU"
-device = "cpu"
+device = "cpu"  # Start on CPU, will move to GPU when needed

 # Get HuggingFace token
 huggingface_token = os.getenv("HF_TOKEN")
@@ -54,7 +54,7 @@ model_path = snapshot_download(
 print("📥 Loading Florence-2 model...")
 florence_model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Florence-2-large",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    torch_dtype=torch.float32,  # Use float32 on CPU to avoid dtype issues
     trust_remote_code=True,
     attn_implementation="eager"
 ).to(device)
@@ -67,7 +67,7 @@ florence_processor = AutoProcessor.from_pretrained(
 print("📥 Loading FLUX Img2Img...")
 pipe = FluxImg2ImgPipeline.from_pretrained(
     model_path,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    torch_dtype=torch.float32  # Start with float32 on CPU
 )
 pipe.enable_vae_tiling()
 pipe.enable_vae_slicing()
@@ -76,27 +76,51 @@ print("✅ All models loaded successfully!")

 # Download ESRGAN model if using
 if USE_ESRGAN:
-    esrgan_path = "4x-UltraSharp.pth"
-    if not os.path.exists(esrgan_path):
-        url = "https://huggingface.co/uwg/upscaler/resolve/main/ESRGAN/4x-UltraSharp.pth"
-        with open(esrgan_path, "wb") as f:
-            f.write(requests.get(url).content)
-    esrgan_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
-    state_dict = torch.load(esrgan_path)['params_ema']
-    esrgan_model.load_state_dict(state_dict)
-    esrgan_model.eval()
+    try:
+        esrgan_path = "4x-UltraSharp.pth"
+        if not os.path.exists(esrgan_path):
+            url = "https://huggingface.co/uwg/upscaler/resolve/main/ESRGAN/4x-UltraSharp.pth"
+            print("📥 Downloading ESRGAN model...")
+            with open(esrgan_path, "wb") as f:
+                f.write(requests.get(url).content)
+        esrgan_model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
+        state_dict = torch.load(esrgan_path, map_location='cpu')['params_ema']
+        esrgan_model.load_state_dict(state_dict)
+        esrgan_model.eval()
+        print("✅ ESRGAN model loaded!")
+    except Exception as e:
+        print(f"Failed to load ESRGAN: {e}")
+        USE_ESRGAN = False

 MAX_SEED = 1000000
-MAX_PIXEL_BUDGET = 8192 * 8192  # Increased for tiling support
+MAX_PIXEL_BUDGET = 8192 * 8192
+
+
+def make_multiple_16(n):
+    """Round up to nearest multiple of 16"""
+    return ((n + 15) // 16) * 16


 def generate_caption(image):
     """Generate detailed caption using Florence-2"""
     try:
+        # Ensure model is on the correct device with correct dtype
+        if florence_model.device.type == "cuda":
+            florence_model.to(torch.float16)
+
         task_prompt = "<MORE_DETAILED_CAPTION>"
         prompt = task_prompt

-        inputs = florence_processor(text=prompt, images=image, return_tensors="pt").to(florence_model.device)  # Fixed: Use model's current device instead of static 'device'
+        inputs = florence_processor(
+            text=prompt,
+            images=image,
+            return_tensors="pt"
+        ).to(florence_model.device)
+
+        # Ensure dtype consistency
+        if florence_model.device.type == "cuda":
+            if hasattr(inputs, "pixel_values"):
+                inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)

         generated_ids = florence_model.generate(
             input_ids=inputs["input_ids"],
@@ -107,7 +131,11 @@ def generate_caption(image):
         )

         generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        parsed_answer = florence_processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
+        parsed_answer = florence_processor.post_process_generation(
+            generated_text,
+            task=task_prompt,
+            image_size=(image.width, image.height)
+        )

         caption = parsed_answer[task_prompt]
         return caption
@@ -120,10 +148,9 @@ def process_input(input_image, upscale_factor):
     """Process input image and handle size constraints"""
     w, h = input_image.size
     w_original, h_original = w, h
-    aspect_ratio = w / h
-
+
     was_resized = False
-
+
     if w * h * upscale_factor**2 > MAX_PIXEL_BUDGET:
         warnings.warn(
             f"Requested output image is too large ({w * upscale_factor}x{h * upscale_factor}). Resizing to fit budget."
@@ -133,11 +160,11 @@
         )
         target_input_pixels = MAX_PIXEL_BUDGET / (upscale_factor ** 2)
         scale = (target_input_pixels / (w * h)) ** 0.5
-        new_w = int(w * scale) - int(w * scale) % 16  # Fixed: Use % 16 for FLUX alignment (was % 8)
-        new_h = int(h * scale) - int(h * scale) % 16  # Fixed: Use % 16 for FLUX alignment (was % 8)
+        new_w = make_multiple_16(int(w * scale))
+        new_h = make_multiple_16(int(h * scale))
         input_image = input_image.resize((new_w, new_h), resample=Image.LANCZOS)
         was_resized = True
-
+
     return input_image, w_original, h_original, was_resized


@@ -152,61 +179,168 @@ def load_image_from_url(url):


 def esrgan_upscale(image, scale=4):
+    """Upscale image using ESRGAN or fallback to LANCZOS"""
     if not USE_ESRGAN:
         return image.resize((image.width * scale, image.height * scale), resample=Image.LANCZOS)
-    img = img2tensor(np.array(image) / 255., bgr2rgb=False, float32=True)
-    with torch.no_grad():
-        output = esrgan_model(img.unsqueeze(0)).squeeze()
-    output_img = tensor2img(output, rgb2bgr=False, min_max=(0, 1))
-    return Image.fromarray(output_img)
+
+    try:
+        img = img2tensor(np.array(image) / 255., bgr2rgb=False, float32=True)
+        with torch.no_grad():
+            # Move model to same device as image tensor
+            if torch.cuda.is_available():
+                esrgan_model.to("cuda")
+                img = img.to("cuda")
+            output = esrgan_model(img.unsqueeze(0)).squeeze()
+        output_img = tensor2img(output, rgb2bgr=False, min_max=(0, 1))
+        return Image.fromarray(output_img)
+    except Exception as e:
+        print(f"ESRGAN upscale failed: {e}, falling back to LANCZOS")
+        return image.resize((image.width * scale, image.height * scale), resample=Image.LANCZOS)


-def tiled_flux_img2img(pipe, prompt, image, strength, steps, guidance, generator, tile_size=1024, overlap=32):
-    """Tiled Img2Img to mimic Ultimate SD Upscaler tiling"""
-    w, h = image.size
-    output = image.copy()  # Start with the control image
+def create_blend_mask(width, height, overlap, edge_x, edge_y):
+    """Create a gradient blend mask for smooth tile transitions"""
+    mask = Image.new('L', (width, height), 255)
+    pixels = mask.load()
+
+    # Horizontal blend (left edge)
+    if edge_x and overlap > 0:
+        for x in range(min(overlap, width)):
+            alpha = x / overlap
+            for y in range(height):
+                pixels[x, y] = int(255 * alpha)
+
+    # Vertical blend (top edge)
+    if edge_y and overlap > 0:
+        for y in range(min(overlap, height)):
+            alpha = y / overlap
+            for x in range(width):
+                # Combine with existing alpha if both edges
+                existing = pixels[x, y] / 255.0
+                combined = min(existing, alpha)
+                pixels[x, y] = int(255 * combined)
+
+    return mask
+

-    for x in range(0, w, tile_size - overlap):
-        for y in range(0, h, tile_size - overlap):
+def tiled_flux_img2img(pipe, prompt, image, strength, steps, guidance, generator, tile_size=1024, overlap=64):
+    """Tiled Img2Img to handle large images"""
+    w, h = image.size
+
+    # Ensure tile_size is divisible by 16
+    tile_size = make_multiple_16(tile_size)
+    overlap = make_multiple_16(overlap)
+
+    # If image is small enough, process without tiling
+    if w <= tile_size and h <= tile_size:
+        # Ensure dimensions are divisible by 16
+        new_w = make_multiple_16(w)
+        new_h = make_multiple_16(h)
+
+        if new_w != w or new_h != h:
+            padded_image = Image.new('RGB', (new_w, new_h))
+            padded_image.paste(image, (0, 0))
+        else:
+            padded_image = image
+
+        result = pipe(
+            prompt=prompt,
+            image=padded_image,
+            strength=strength,
+            num_inference_steps=steps,
+            guidance_scale=guidance,
+            height=new_h,
+            width=new_w,
+            generator=generator,
+        ).images[0]
+
+        # Crop back to original size if padded
+        if new_w != w or new_h != h:
+            result = result.crop((0, 0, w, h))
+
+        return result
+
+    # Process with tiling for large images
+    output = Image.new('RGB', (w, h))
+
+    # Calculate tile positions
+    tiles = []
+    for y in range(0, h, tile_size - overlap):
+        for x in range(0, w, tile_size - overlap):
             tile_w = min(tile_size, w - x)
             tile_h = min(tile_size, h - y)
-            tile = image.crop((x, y, x + tile_w, y + tile_h))
-
-            # Run Flux on tile
+
+            # Ensure tile dimensions are divisible by 16
+            tile_w_padded = make_multiple_16(tile_w)
+            tile_h_padded = make_multiple_16(tile_h)
+
+            tiles.append({
+                'x': x,
+                'y': y,
+                'w': tile_w,
+                'h': tile_h,
+                'w_padded': tile_w_padded,
+                'h_padded': tile_h_padded,
+                'edge_x': x > 0,
+                'edge_y': y > 0
+            })
+
+    # Process each tile
+    for i, tile_info in enumerate(tiles):
+        print(f"Processing tile {i+1}/{len(tiles)}...")
+
+        # Extract tile from image
+        tile = image.crop((
+            tile_info['x'],
+            tile_info['y'],
+            tile_info['x'] + tile_info['w'],
+            tile_info['y'] + tile_info['h']
+        ))
+
+        # Pad if necessary
+        if tile_info['w_padded'] != tile_info['w'] or tile_info['h_padded'] != tile_info['h']:
+            padded_tile = Image.new('RGB', (tile_info['w_padded'], tile_info['h_padded']))
+            padded_tile.paste(tile, (0, 0))
+            tile = padded_tile
+
+        # Process tile with FLUX
+        try:
             gen_tile = pipe(
                 prompt=prompt,
                 image=tile,
                 strength=strength,
                 num_inference_steps=steps,
                 guidance_scale=guidance,
-                height=tile_h,
-                width=tile_w,
+                height=tile_info['h_padded'],
+                width=tile_info['w_padded'],
                 generator=generator,
             ).images[0]
-
-            # Fixed: Resize generated tile back to exact tile dimensions if pipeline auto-resized for multiple-of-16 requirement
-            gen_tile = gen_tile.resize((tile_w, tile_h), resample=Image.LANCZOS)
-
-            # Paste with blending if overlap
-            if overlap > 0:
-                paste_box = (x, y, x + tile_w, y + tile_h)
-                if x > 0 or y > 0:
-                    # Simple linear blend on overlaps
-                    mask = Image.new('L', (tile_w, tile_h), 255)
-                    if x > 0:
-                        for i in range(overlap):
-                            for j in range(tile_h):
-                                mask.putpixel((i, j), int(255 * (i / overlap)))
-                    if y > 0:
-                        for i in range(tile_w):
-                            for j in range(overlap):
-                                mask.putpixel((i, j), int(255 * (j / overlap)))
-                    output.paste(gen_tile, paste_box, mask)
-                else:
-                    output.paste(gen_tile, paste_box)
+
+            # Crop back to original tile size if padded
+            if tile_info['w_padded'] != tile_info['w'] or tile_info['h_padded'] != tile_info['h']:
+                gen_tile = gen_tile.crop((0, 0, tile_info['w'], tile_info['h']))
+
+            # Create blend mask if needed
+            if overlap > 0 and (tile_info['edge_x'] or tile_info['edge_y']):
+                mask = create_blend_mask(
+                    tile_info['w'],
+                    tile_info['h'],
+                    overlap,
+                    tile_info['edge_x'],
+                    tile_info['edge_y']
+                )
+
+                # Composite with blending
+                output.paste(gen_tile, (tile_info['x'], tile_info['y']), mask)
             else:
-                output.paste(gen_tile, (x, y))
-
+                # Direct paste for first tile or no overlap
+                output.paste(gen_tile, (tile_info['x'], tile_info['y']))
+
+        except Exception as e:
+            print(f"Error processing tile: {e}")
+            # Fallback: paste original tile
+            output.paste(tile, (tile_info['x'], tile_info['y']))
+
     return output


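Aside: the new `create_blend_mask` fills the mask pixel-by-pixel in Python, which is O(width·height) interpreter work per tile. Since app.py already uses numpy (`np.array(image)` above), an equivalent vectorized construction is possible; a sketch with the same min-combine semantics (illustrative only, not part of the commit):

    import numpy as np
    from PIL import Image

    def create_blend_mask_np(width, height, overlap, edge_x, edge_y):
        m = np.ones((height, width), dtype=np.float32)  # start fully opaque
        if edge_x and overlap > 0:
            # Linear ramp over the left edge (columns = x, as in pixels[x, y])
            ramp = np.arange(min(overlap, width), dtype=np.float32) / overlap
            m[:, :ramp.size] = np.minimum(m[:, :ramp.size], ramp[None, :])
        if edge_y and overlap > 0:
            # Linear ramp over the top edge, min-combined where both edges overlap
            ramp = np.arange(min(overlap, height), dtype=np.float32) / overlap
            m[:ramp.size, :] = np.minimum(m[:ramp.size, :], ramp[:, None])
        return Image.fromarray((m * 255).astype(np.uint8), mode='L')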
@@ -224,85 +358,106 @@ def enhance_image(
     progress=gr.Progress(track_tqdm=True),
 ):
     """Main enhancement function"""
-    # Move models to GPU inside the function
-    pipe.to("cuda")
-    florence_model.to("cuda")
-
-    # Handle image input
-    if image_input is not None:
-        input_image = image_input
-    elif image_url:
-        input_image = load_image_from_url(image_url)
-    else:
-        raise gr.Error("Please provide an image (upload or URL)")
-
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-
-    true_input_image = input_image
-
-    # Process input image
-    input_image, w_original, h_original, was_resized = process_input(
-        input_image, upscale_factor
-    )
-
-    # Generate caption if requested
-    if use_generated_caption:
-        gr.Info("🔍 Generating image caption...")
-        generated_caption = generate_caption(input_image)
-        prompt = generated_caption
-    else:
-        prompt = custom_prompt if custom_prompt.strip() else ""
-
-    generator = torch.Generator(device="cuda").manual_seed(seed)
-
-    gr.Info("🚀 Upscaling image...")
-
-    # Initial upscale
-    if USE_ESRGAN and upscale_factor == 4:
-        esrgan_model.to("cuda")
-        control_image = esrgan_upscale(input_image, upscale_factor)
-        esrgan_model.to("cpu")
-    else:
-        w, h = input_image.size
-        control_image = input_image.resize((w * upscale_factor, h * upscale_factor), resample=Image.LANCZOS)
-
-    # Tiled Flux Img2Img for refinement
-    image = tiled_flux_img2img(
-        pipe,
-        prompt,
-        control_image,
-        denoising_strength,
-        num_inference_steps,
-        1.0,  # Hardcoded guidance_scale to 1
-        generator,
-        tile_size=1024,
-        overlap=32
-    )
-
-    if was_resized:
-        gr.Info(f"📏 Resizing output to target size: {w_original * upscale_factor}x{h_original * upscale_factor}")
-        image = image.resize((w_original * upscale_factor, h_original * upscale_factor), resample=Image.LANCZOS)
-
-    # Resize input image to match output size for slider alignment
-    resized_input = true_input_image.resize(image.size, resample=Image.LANCZOS)
-
-    # Move back to CPU to release GPU
-    pipe.to("cpu")
-    florence_model.to("cpu")
-
-    return [resized_input, image]
+    try:
+        # Move models to GPU and convert to appropriate dtype
+        pipe.to("cuda")
+        pipe.to(torch.bfloat16)
+
+        florence_model.to("cuda")
+        florence_model.to(torch.float16)
+
+        # Handle image input
+        if image_input is not None:
+            input_image = image_input
+        elif image_url:
+            input_image = load_image_from_url(image_url)
+        else:
+            raise gr.Error("Please provide an image (upload or URL)")
+
+        if randomize_seed:
+            seed = random.randint(0, MAX_SEED)
+
+        true_input_image = input_image
+
+        # Process input image
+        input_image, w_original, h_original, was_resized = process_input(
+            input_image, upscale_factor
+        )
+
+        # Generate caption if requested
+        if use_generated_caption:
+            gr.Info("🔍 Generating image caption...")
+            generated_caption = generate_caption(input_image)
+            prompt = generated_caption
+            print(f"Generated caption: {prompt}")
+        else:
+            prompt = custom_prompt if custom_prompt.strip() else ""
+
+        generator = torch.Generator(device="cuda").manual_seed(seed)
+
+        gr.Info("🚀 Upscaling image...")
+
+        # Initial upscale
+        if USE_ESRGAN and upscale_factor == 4:
+            if torch.cuda.is_available():
+                esrgan_model.to("cuda")
+            control_image = esrgan_upscale(input_image, upscale_factor)
+            if torch.cuda.is_available():
+                esrgan_model.to("cpu")
+        else:
+            w, h = input_image.size
+            control_image = input_image.resize(
+                (w * upscale_factor, h * upscale_factor),
+                resample=Image.LANCZOS
+            )
+
+        # Tiled Flux Img2Img for refinement
+        image = tiled_flux_img2img(
+            pipe,
+            prompt,
+            control_image,
+            denoising_strength,
+            num_inference_steps,
+            1.0,  # guidance_scale fixed to 1.0
+            generator,
+            tile_size=1024,
+            overlap=64
+        )
+
+        if was_resized:
+            gr.Info(f"📏 Resizing output to target size: {w_original * upscale_factor}x{h_original * upscale_factor}")
+            image = image.resize(
+                (w_original * upscale_factor, h_original * upscale_factor),
+                resample=Image.LANCZOS
+            )
+
+        # Resize input image to match output size for slider alignment
+        resized_input = true_input_image.resize(image.size, resample=Image.LANCZOS)
+
+        # Move models back to CPU to release GPU
+        pipe.to("cpu")
+        florence_model.to("cpu")
+        torch.cuda.empty_cache()
+
+        return [resized_input, image]
+
+    except Exception as e:
+        # Ensure models are moved back to CPU even on error
+        pipe.to("cpu")
+        florence_model.to("cpu")
+        torch.cuda.empty_cache()
+        raise gr.Error(f"Enhancement failed: {str(e)}")


 # Create Gradio interface
 with gr.Blocks(css=css, title="🎨 AI Image Upscaler - Florence-2 + FLUX") as demo:
-    gr.HTML("""
+    gr.HTML(f"""
     <div class="main-header">
         <h1>🎨 AI Image Upscaler</h1>
         <p>Upload an image or provide a URL to upscale it using Florence-2 captioning and FLUX upscaling</p>
-        <p>Currently running on <strong>{}</strong></p>
+        <p>Currently running on <strong>{power_device}</strong></p>
     </div>
-    """.format(power_device))
+    """)

     with gr.Row():
         with gr.Column(scale=1):
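Aside: the new try/except duplicates the CPU-offload cleanup in both the success and failure paths. The same "always release the GPU" guarantee is conventionally spelled with try/finally; a standalone sketch (illustrative only, not part of the commit):

    import torch

    def with_gpu(model, fn):
        """Run fn() with model on CUDA, always offloading afterwards."""
        try:
            model.to("cuda")
            return fn()
        finally:
            # Runs on success and on any exception, so cleanup is written once
            model.to("cpu")
            torch.cuda.empty_cache()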
@@ -313,14 +468,14 @@ with gr.Blocks(css=css, title="🎨 AI Image Upscaler - Florence-2 + FLUX") as d
                     input_image = gr.Image(
                         label="Upload Image",
                         type="pil",
-                        height=200  # Made smaller
+                        height=200
                     )

                 with gr.TabItem("🔗 Image URL"):
                     image_url = gr.Textbox(
                         label="Image URL",
                         placeholder="https://example.com/image.jpg",
-                        value="https://upload.wikimedia.org/wikipedia/commons/thumb/a/a7/Example.jpg/800px-Example.jpg"
+                        value=""
                     )

             gr.HTML("<h3>🎛️ Caption Settings</h3>")
@@ -386,15 +541,15 @@ with gr.Blocks(css=css, title="🎨 AI Image Upscaler - Florence-2 + FLUX") as d
                 size="lg"
             )

-        with gr.Column(scale=2):  # Larger scale for results
+        with gr.Column(scale=2):
             gr.HTML("<h3>📊 Results</h3>")

             result_slider = ImageSlider(
                 type="pil",
-                interactive=False,  # Disable interactivity to prevent uploads
-                height=600,  # Made larger
+                interactive=False,
+                height=600,
                 elem_id="result_slider",
-                label=None  # Remove default label
+                label=None
             )

     # Event handler
@@ -419,73 +574,6 @@ with gr.Blocks(css=css, title="🎨 AI Image Upscaler - Florence-2 + FLUX") as d
         <p><strong>Note:</strong> This upscaler uses the Flux dev model. Users are responsible for obtaining commercial rights if used commercially under their license.</p>
     </div>
     """)
-
-    # Custom CSS for slider
-    gr.HTML("""
-    <style>
-    #result_slider .slider {
-        width: 100% !important;
-        max-width: inherit !important;
-    }
-    #result_slider img {
-        object-fit: contain !important;
-        width: 100% !important;
-        height: auto !important;
-    }
-    #result_slider .gr-button-tool {
-        display: none !important;
-    }
-    #result_slider .gr-button-undo {
-        display: none !important;
-    }
-    #result_slider .gr-button-clear {
-        display: none !important;
-    }
-    #result_slider .badge-container .badge {
-        display: none !important;
-    }
-    #result_slider .badge-container::before {
-        content: "Before";
-        position: absolute;
-        top: 10px;
-        left: 10px;
-        background: rgba(0,0,0,0.5);
-        color: white;
-        padding: 5px;
-        border-radius: 5px;
-        z-index: 10;
-    }
-    #result_slider .badge-container::after {
-        content: "After";
-        position: absolute;
-        top: 10px;
-        right: 10px;
-        background: rgba(0,0,0,0.5);
-        color: white;
-        padding: 5px;
-        border-radius: 5px;
-        z-index: 10;
-    }
-    #result_slider .fullscreen img {
-        object-fit: contain !important;
-        width: 100vw !important;
-        height: 100vh !important;
-    }
-    </style>
-    """)
-
-    # JS to set slider default position to middle
-    gr.HTML("""
-    <script>
-    document.addEventListener('DOMContentLoaded', function() {
-        const sliderInput = document.querySelector('#result_slider input[type="range"]');
-        if (sliderInput) {
-            sliderInput.value = 50;
-            sliderInput.dispatchEvent(new Event('input'));
-        }
-    });
-    </script>
-    """)

 if __name__ == "__main__":
     demo.queue().launch(share=True, server_name="0.0.0.0", server_port=7860)