Cicici1109 committed on
Commit f965c67 · 1 Parent(s): 8fb4bb0

Rename utils to utils.py

Files changed (1)
  1. utils +0 -588
utils DELETED
@@ -1,588 +0,0 @@
import torch
import numpy as np
from diffusers.pipelines import FluxPipeline
from src.flux.condition import Condition
from PIL import Image, ImageFilter
import argparse
import os
import json
import base64
import io
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from scipy.ndimage import binary_dilation
import cv2
import openai
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type

from src.flux.generate import generate, seed_everything

try:
    from mmengine.visualization import Visualizer
except ImportError:
    Visualizer = None
    print("Warning: mmengine is not installed, visualization is disabled.")


def encode_image_to_datauri(path, size=(512, 512)):
    # Resize the image and return it as a base64-encoded PNG string.
    with Image.open(path).convert('RGB') as img:
        img = img.resize(size, Image.LANCZOS)
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    return b64
    # return f"data:image/png;base64,{b64}"

@retry(
    reraise=True,
    wait=wait_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    retry=retry_if_exception_type((openai.error.RateLimitError, openai.error.APIError))
)
def cot_with_gpt(image_uri, instruction):
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f'''
Now you are an expert in image editing. Based on the given single image, what atomic image editing instructions should be if the user wants to {instruction}? Let's think step by step.
Atomic instructions include 13 categories as follows:
- Add: e.g.: add a car on the road
- Remove: e.g.: remove the sofa in the image
- Color Change: e.g.: change the color of the shoes to blue
- Material Change: e.g.: change the material of the sign like stone
- Action Change: e.g.: change the action of the boy to raising hands
- Expression Change: e.g.: change the expression to smile
- Replace: e.g.: replace the coffee with an apple
- Background Change: e.g.: change the background into forest
- Appearance Change: e.g.: make the cup have a floral pattern
- Move: e.g.: move the plane to the left
- Resize: e.g.: enlarge the clock
- Tone Transfer: e.g.: change the weather to foggy
- Style Change: e.g.: make the style of the image to cartoon
Respond *only* with a numbered list.
Each line must begin with the category in square brackets, then the instruction. Please strictly follow the atomic categories.
The operation (what) and the target (to what) are crystal clear.
Do not split replace to add and remove.
For example:
"1. [Add] add a car on the road\n
2. [Color Change] change the color of the shoes to blue\n
3. [Move] move the lamp to the left\n"
Do not include any extra text, explanations, JSON or markdown—just the list.
'''},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_uri}"
                        }
                    },
                ],
            }
        ],
        max_tokens=300,
    )
    text = response.choices[0].message.content.strip()
    print(text)

    categories, instructions = extract_instructions(text)
    return categories, instructions

def extract_instructions(text):
    categories = []
    instructions = []

    pattern = r'^\s*\d+\.\s*\[(.*?)\]\s*(.*?)$'

    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue

        match = re.match(pattern, line)
        if match:
            category = match.group(1).strip()
            instruction = match.group(2).strip()

            if category and instruction:
                categories.append(category)
                instructions.append(instruction)

    return categories, instructions

def extract_last_bbox(result):
    # Matches tuples of the form ('object name', [x0, y0, x1, y1]), optionally wrapped in brackets.
    pattern = r'\[?\(\'([^\']+)\',\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]\)\]?'
    matches = re.findall(pattern, result)

    if not matches:
        simple_pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
        simple_matches = re.findall(simple_pattern, result)
        if simple_matches:
            x0, y0, x1, y1 = map(int, simple_matches[-1])
            return [x0, y0, x1, y1]
        else:
            print(f"No bounding boxes found, please try again: {result}")
            return None

    last_match = matches[-1]
    x0, y0, x1, y1 = map(int, last_match[1:])
    return x0, y0, x1, y1

def infer_with_DiT(task, image, instruction, category):
    seed_everything(3407)

    if task == 'RoI Inpainting':
        if category == 'Add' or category == 'Replace':
            lora_path = "weights/add.safetensors"
            added = extract_object_with_gpt(instruction)
            instruction_dit = f"add {added} on the black region"
        elif category == 'Remove' or category == 'Action Change':
            lora_path = "weights/remove.safetensors"
            instruction_dit = "Fill the hole of the image"

        condition = Condition("scene", image, position_delta=(0, 0))
    elif task == 'RoI Editing':
        image = Image.open(image).convert('RGB').resize((512, 512))
        condition = Condition("scene", image, position_delta=(0, -32))
        instruction_dit = instruction
        if category == 'Action Change':
            lora_path = "weights/action.safetensors"
        elif category == 'Expression Change':
            lora_path = "weights/expression.safetensors"
        elif category == 'Add':
            lora_path = "weights/addition.safetensors"
        elif category == 'Material Change':
            lora_path = "weights/material.safetensors"
        elif category == 'Color Change':
            lora_path = "weights/color.safetensors"

    elif task == 'RoI Compositioning':
        lora_path = "weights/fusion.safetensors"
        condition = Condition("scene", image, position_delta=(0, 0))
        instruction_dit = "inpaint the black-bordered region so that the object's edges blend smoothly with the background"

    elif task == 'Global Transformation':
        image = Image.open(image).convert('RGB').resize((512, 512))
        instruction_dit = instruction
        lora_path = "weights/overall.safetensors"

        condition = Condition("scene", image, position_delta=(0, -32))
    else:
        raise ValueError(f"Invalid task: '{task}'")
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        torch_dtype=torch.bfloat16
    )

    pipe = pipe.to("cuda")

    pipe.load_lora_weights(
        "Cicici1109/IEAP",
        weight_name=lora_path,
        adapter_name="scene",
    )
    result_img = generate(
        pipe,
        prompt=instruction_dit,
        conditions=[condition],
        config_path="train/config/scene_512.yaml",
        num_inference_steps=28,
        height=512,
        width=512,
    ).images[0]

    if task == 'RoI Editing' and category == 'Action Change':
        text_roi = extract_object_with_gpt(instruction)
        instruction_loc = f"<image>Please segment {text_roi}."
        img = result_img
        print(f"Instruction: {instruction_loc}")

        model, tokenizer = load_model("ByteDance/Sa2VA-8B")

        result = model.predict_forward(
            image=img,
            text=instruction_loc,
            tokenizer=tokenizer,
        )

        prediction = result['prediction']
        print(f"Model Output: {prediction}")

        if '[SEG]' in prediction and 'prediction_masks' in result:
            pred_mask = result['prediction_masks'][0]
            pred_mask_np = np.squeeze(np.array(pred_mask))

            ## obtain region bbox
            rows = np.any(pred_mask_np, axis=1)
            cols = np.any(pred_mask_np, axis=0)
            if not np.any(rows) or not np.any(cols):
                print("Warning: Mask is empty, cannot compute bounding box")
                return img

            y0, y1 = np.where(rows)[0][[0, -1]]
            x0, x1 = np.where(cols)[0][[0, -1]]

            changed_instance = crop_masked_region(result_img, pred_mask_np)

            return changed_instance, x0, y1, 1

    return result_img

def load_model(model_path):
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    return model, tokenizer

def extract_object_with_gpt(instruction):
    system_prompt = (
        "You are a helpful assistant that extracts the object or target being edited in an image editing instruction. "
        "Only return a concise noun phrase describing the object. "
        "Examples:\n"
        "- Input: 'Remove the dog' → Output: 'the dog'\n"
        "- Input: 'Add a hat on the dog' → Output: 'a hat'\n"
        "- Input: 'Replace the biggest bear with a tiger' → Output: 'the biggest bear'\n"
        "- Input: 'Change the action of the girl to riding' → Output: 'the girl'\n"
        "- Input: 'Move the red car on the lake' → Output: 'the red car'\n"
        "- Input: 'Minify the carrot on the rabbit's hand' → Output: 'the carrot on the rabbit's hand'\n"
        "- Input: 'Swap the location of the dog and the cat' → Output: 'the dog and the cat'\n"
        "Now extract the object for this instruction:"
    )

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": instruction}
            ],
            temperature=0.2,
            max_tokens=20,
        )
        object_phrase = response.choices[0].message['content'].strip().strip('"')
        print(f"Identified object: {object_phrase}")
        return object_phrase
    except Exception as e:
        print(f"GPT extraction failed: {e}")
        return instruction

def extract_region_with_gpt(instruction):
    system_prompt = (
        "You are a helpful assistant that extracts the target region being edited in an image editing instruction. "
        "Only return a concise noun phrase describing the target region. "
        "Examples:\n"
        "- Input: 'Add a red hat to the man on the left' → Output: 'the man on the left'\n"
        "- Input: 'Add a cat beside the dog' → Output: 'the dog'\n"
        "Now extract the target region for this instruction:"
    )

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": instruction}
            ],
            temperature=0.2,
            max_tokens=20,
        )
        object_phrase = response.choices[0].message['content'].strip().strip('"')
        print(f"Identified object: {object_phrase}")
        return object_phrase
    except Exception as e:
        print(f"GPT extraction failed: {e}")
        return instruction

def get_masked(mask, image):
    if mask.shape[:2] != image.size[::-1]:
        raise ValueError(f"Mask size {mask.shape[:2]} does not match image size {image.size}")

    image_array = np.array(image)
    image_array[mask] = [0, 0, 0]

    return Image.fromarray(image_array)

def bbox_to_mask(x0, y0, x1, y1, image_shape=(512, 512), fill_value=True):
    height, width = image_shape

    mask = np.zeros((height, width), dtype=bool)

    x0 = max(0, int(x0))
    y0 = max(0, int(y0))
    x1 = min(width, int(x1))
    y1 = min(height, int(y1))

    if x0 >= x1 or y0 >= y1:
        print("Warning: Invalid bounding box coordinates")
        return mask

    mask[y0:y1, x0:x1] = fill_value

    return mask

def combine_bbox(text, x0, y0, x1, y1):
    bbox = [x0, y0, x1, y1]
    return [(text, bbox)]

def crop_masked_region(image, pred_mask_np):
    if not isinstance(image, Image.Image):
        raise ValueError("The input image is not a PIL Image object")
    if not isinstance(pred_mask_np, np.ndarray) or pred_mask_np.dtype != bool:
        raise ValueError("pred_mask_np must be a NumPy array of boolean type")
    if pred_mask_np.shape[:2] != image.size[::-1]:
        raise ValueError(f"Mask size {pred_mask_np.shape[:2]} does not match image size {image.size}")

    image_rgba = image.convert("RGBA")
    image_array = np.array(image_rgba)

    rows = np.any(pred_mask_np, axis=1)
    cols = np.any(pred_mask_np, axis=0)

    if not np.any(rows) or not np.any(cols):
        print("Warning: Mask is empty, cannot compute bounding box")
        return image_rgba

    y0, y1 = np.where(rows)[0][[0, -1]]
    x0, x1 = np.where(cols)[0][[0, -1]]

    cropped_image = image_array[y0:y1+1, x0:x1+1].copy()
    cropped_mask = pred_mask_np[y0:y1+1, x0:x1+1]

    alpha_channel = np.ones(cropped_mask.shape, dtype=np.uint8) * 255
    alpha_channel[~cropped_mask] = 0

    cropped_image[:, :, 3] = alpha_channel

    return Image.fromarray(cropped_image, mode='RGBA')

def roi_localization(image, instruction, category):  # add, remove, replace, action change, move, resize
    model, tokenizer = load_model("ByteDance/Sa2VA-8B")
    if category == 'Add':
        text_roi = extract_region_with_gpt(instruction)
    else:
        text_roi = extract_object_with_gpt(instruction)
    instruction_loc = f"<image>Please segment {text_roi}."
    img = Image.open(image).convert('RGB').resize((512, 512))
    print(f"Processing image: {os.path.basename(image)}, Instruction: {instruction_loc}")

    result = model.predict_forward(
        image=img,
        text=instruction_loc,
        tokenizer=tokenizer,
    )

    prediction = result['prediction']
    print(f"Model Output: {prediction}")

    if '[SEG]' in prediction and 'prediction_masks' in result:
        pred_mask = result['prediction_masks'][0]
        pred_mask_np = np.squeeze(np.array(pred_mask))
        if category == 'Add':
            ## obtain region bbox
            rows = np.any(pred_mask_np, axis=1)
            cols = np.any(pred_mask_np, axis=0)
            if not np.any(rows) or not np.any(cols):
                print("Warning: Mask is empty, cannot compute bounding box")
                return img

            y0, y1 = np.where(rows)[0][[0, -1]]
            x0, x1 = np.where(cols)[0][[0, -1]]

            ## obtain inpainting bbox
            bbox = combine_bbox(text_roi, x0, y0, x1, y1)  # ? multiple?
            print(bbox)
            x0, y0, x1, y1 = layout_add(bbox, instruction)
            mask = bbox_to_mask(x0, y0, x1, y1)
            ## make it black
            masked_img = get_masked(mask, img)
        elif category == 'Move' or category == 'Resize':
            dilated_original_mask = binary_dilation(pred_mask_np, iterations=3)
            masked_img = get_masked(dilated_original_mask, img)
            ## obtain region bbox
            rows = np.any(pred_mask_np, axis=1)
            cols = np.any(pred_mask_np, axis=0)
            if not np.any(rows) or not np.any(cols):
                print("Warning: Mask is empty, cannot compute bounding box")
                return img

            y0, y1 = np.where(rows)[0][[0, -1]]
            x0, x1 = np.where(cols)[0][[0, -1]]

            ## obtain inpainting bbox
            bbox = combine_bbox(text_roi, x0, y0, x1, y1)  # ? multiple?
            print(bbox)
            x0_new, y0_new, x1_new, y1_new = layout_change(bbox, instruction)
            scale = (y1_new - y0_new) / (y1 - y0)
            print(scale)
            changed_instance = crop_masked_region(img, pred_mask_np)

            return masked_img, changed_instance, x0_new, y1_new, scale
        else:
            dilated_original_mask = binary_dilation(pred_mask_np, iterations=3)
            masked_img = get_masked(dilated_original_mask, img)

        return masked_img

    else:
        print("No valid mask found in the prediction.")
        return None

def fusion(background, foreground, x, y, scale):
    background = background.convert("RGBA")
    bg_width, bg_height = background.size

    fg_width, fg_height = foreground.size
    new_size = (int(fg_width * scale), int(fg_height * scale))
    foreground_resized = foreground.resize(new_size, Image.Resampling.LANCZOS)

    left = x
    top = y - new_size[1]

    canvas = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
    canvas.paste(foreground_resized, (left, top), foreground_resized)
    masked_foreground = process_edge(canvas, left, top, new_size)
    result = Image.alpha_composite(background, masked_foreground)

    return result

def process_edge(canvas, left, top, size):
    width, height = size

    region = canvas.crop((left, top, left + width, top + height))
    alpha = region.getchannel('A')

    dilated_alpha = alpha.filter(ImageFilter.MaxFilter(5))
    eroded_alpha = alpha.filter(ImageFilter.MinFilter(3))

    edge_mask = Image.new('L', (width, height), 0)
    edge_pixels = edge_mask.load()
    dilated_pixels = dilated_alpha.load()
    eroded_pixels = eroded_alpha.load()

    for y in range(height):
        for x in range(width):
            if dilated_pixels[x, y] > 0 and eroded_pixels[x, y] == 0:
                edge_pixels[x, y] = 255

    black_edge = Image.new('RGBA', (width, height), (0, 0, 0, 0))
    black_edge.putalpha(edge_mask)

    canvas.paste(black_edge, (left, top), black_edge)

    return canvas

def combine_text_and_bbox(text_roi, x0, y0, x1, y1):
    return [(text_roi, [x0, y0, x1, y1])]

@retry(
    reraise=True,
    wait=wait_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    retry=retry_if_exception_type((openai.error.RateLimitError, openai.error.APIError))
)
def layout_add(bbox, instruction):
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f'''
You are an intelligent bounding box editor. I will provide you with the current bounding boxes and an add editing instruction.
Your task is to determine the new bounding box of the added object. Let's think step by step.
The images are of size 512x512. The top-left corner has coordinate [0, 0]. The bottom-right corner has coordinate [512, 512].
The bounding boxes should not go beyond the image boundaries. The new box must be large enough to reasonably encompass the added object in a visually appropriate way, allowing for partial overlap with existing objects when it comes to accessories like a hat, necklace, etc.
Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, bottom-right x coordinate, bottom-right y coordinate]).
Only return the bounding box of the newly added object. Do not include the existing bounding boxes.
Please consider the semantic information of the layout, preserve semantic relations.
If needed, you can make reasonable guesses. Please refer to the examples below:
Input bounding boxes: [('a green car', [21, 281, 232, 440])]
Editing instruction: Add a bird on the green car.
Output bounding boxes: [('a bird', [80, 150, 180, 281])]
Input bounding boxes: [('stool', [300, 350, 380, 450])]
Editing instruction: Add a cat to the left of the stool.
Output bounding boxes: [('a cat', [180, 250, 300, 450])]

Here are some examples to illustrate appropriate overlapping for better visual effects:
Input bounding boxes: [('the white cat', [200, 300, 320, 420])]
Editing instruction: Add a hat on the white cat.
Output bounding boxes: [('a hat', [200, 150, 320, 330])]
Now, the current bounding boxes are {bbox}, the instruction is {instruction}.
'''},
                ],
            }
        ],
        max_tokens=1000,
    )

    result = response.choices[0].message.content.strip()

    print(result)
    bbox = extract_last_bbox(result)
    return bbox

@retry(
    reraise=True,
    wait=wait_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    retry=retry_if_exception_type((openai.error.RateLimitError, openai.error.APIError))
)
def layout_change(bbox, instruction):
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f'''
You are an intelligent bounding box editor. I will provide you with the current bounding boxes and the editing instruction.
Your task is to generate the new bounding boxes after editing.
The images are of size 512x512. The top-left corner has coordinate [0, 0]. The bottom-right corner has coordinate [512, 512].
The bounding boxes should not overlap or go beyond the image boundaries.
Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, bottom-right x coordinate, bottom-right y coordinate]).
Do not add new objects or delete any object provided in the bounding boxes. Do not change the size or the shape of any object unless the instruction requires so.
Please consider the semantic information of the layout.
When resizing, keep the bottom-left corner fixed by default. When swapping locations, change according to the center point.
If needed, you can make reasonable guesses. Please refer to the examples below:

Input bounding boxes: [('a car', [21, 281, 232, 440])]
Editing instruction: Move the car to the right.
Output bounding boxes: [('a car', [121, 281, 332, 440])]

Input bounding boxes: [("bed", [50, 300, 450, 450]), ("pillow", [200, 200, 300, 230])]
Editing instruction: Move the pillow to the left side of the bed.
Output bounding boxes: [("bed", [50, 300, 450, 450]), ("pillow", [70, 270, 170, 300])]

Input bounding boxes: [("dog", [150, 250, 250, 300])]
Editing instruction: Enlarge the dog.
Output bounding boxes: [("dog", [150, 225, 300, 300])]

Input bounding boxes: [("chair", [100, 350, 200, 450]), ("lamp", [300, 200, 360, 300])]
Editing instruction: Swap the location of the chair and the lamp.
Output bounding boxes: [("chair", [280, 200, 380, 300]), ("lamp", [120, 350, 180, 450])]


Now, the current bounding boxes are {bbox}, the instruction is {instruction}. Let's think step by step, and output the edited layout.
'''},
                ],
            }
        ],
        max_tokens=1000,
    )
    result = response.choices[0].message.content.strip()
    print(result)
    bbox = extract_last_bbox(result)
    return bbox
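
For context, the following is a minimal, hypothetical sketch of how these helpers might be chained for a single "Move" edit. It is not part of this commit; the module import name (`utils`), the file paths, and the example instruction are all assumptions.

# Hypothetical usage sketch (not from the repository).
from utils import roi_localization, infer_with_DiT, fusion

image_path = "input.png"                      # assumed input path
instruction = "move the red car to the left"  # example instruction

# 1. Localize the region of interest; for Move/Resize this returns the masked
#    background, the cropped instance, its new anchor point, and a scale factor.
masked_img, instance, x_new, y_new, scale = roi_localization(image_path, instruction, "Move")

# 2. Inpaint the hole left behind by the original object.
background = infer_with_DiT("RoI Inpainting", masked_img, instruction, "Remove")

# 3. Paste the instance at its new location, then blend the pasted edges.
composited = fusion(background, instance, x_new, y_new, scale)
result = infer_with_DiT("RoI Compositioning", composited, instruction, "Move")
result.save("output.png")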